diff --git a/build.sh b/build.sh index 6569b933b006e3df36fbf8a80c545cb60ebeef63..5c47c21e440e8402a9874a5606c45ae8e9974ed7 100755 --- a/build.sh +++ b/build.sh @@ -10,7 +10,7 @@ enable_autotuner="1" buildtype=RelWithDebInfo backends="all" build_for_openeuler="0" -enabled_projects="clang;lld;compiler-rt;openmp;clang-tools-extra" +enabled_projects="clang;lld;openmp;clang-tools-extra" embedded_toolchain="0" split_dwarf=on use_ccache="0" @@ -322,7 +322,7 @@ fi if [ $embedded_toolchain == "1" ]; then echo "Build for embedded cross tool chain" - enabled_projects="clang;lld;compiler-rt;" + enabled_projects="clang;lld;" CMAKE_OPTIONS="$CMAKE_OPTIONS \ -DLLVM_BUILD_FOR_EMBEDDED=ON" fi @@ -364,14 +364,14 @@ mkdir -p "$build_prefix" && cd "$build_prefix" cmake $CMAKE_OPTIONS \ -DCOMPILER_RT_BUILD_SANITIZERS=on \ -DLLVM_ENABLE_PROJECTS=$enabled_projects \ - -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \ + -DLLVM_ENABLE_RUNTIMES="compiler-rt;libunwind" \ -DLLVM_USE_LINKER=gold \ -DLLVM_LIT_ARGS="-sv -j$threads" \ -DLLVM_USE_SPLIT_DWARF=$split_dwarf \ -DCMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO="-Wl,--gdb-index -Wl,--compress-debug-sections=zlib" \ -DCMAKE_EXE_LINKER_FLAGS_DEBUG="-Wl,--gdb-index -Wl,--compress-debug-sections=zlib" \ -DBUILD_SHARED_LIBS=OFF \ - -DLLVM_ENABLE_LIBCXX=OFF \ + -DLLVM_STATIC_LINK_CXX_STDLIB=ON \ -DLLVM_ENABLE_ZLIB=ON \ -DLLVM_BUILD_RUNTIME=ON \ -DLLVM_INCLUDE_TOOLS=ON \ @@ -409,6 +409,39 @@ if [ $do_install == "1" ]; then make -j$threads $verbose $install fi +# build libcxx/libcxxabi with the just-built clang/clang++ +c_compiler="$install_prefix/bin/clang" +cxx_compiler="$install_prefix/bin/clang++" +if pushd runtimes > /dev/null 2>&1; then + if [ ! -f "$build_prefix"/projects/libcxx/CMakeCache.txt ]; then + mkdir -p "$build_prefix/projects/libcxx" && cd "$build_prefix/projects/libcxx" + cmake -Wno-dev \ + -DCMAKE_BUILD_TYPE=$buildtype \ + -DCMAKE_INSTALL_PREFIX="$install_prefix" \ + -DCMAKE_C_COMPILER="$c_compiler" \ + -DCMAKE_CXX_COMPILER="$cxx_compiler" \ + -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" \ + -DLLVM_LIT_ARGS="-sv -j$threads" \ + -DBUILD_SHARED_LIBS=OFF \ + -DCMAKE_SKIP_RPATH=ON \ + -DLLVM_USE_LINKER=gold \ + -DLLVM_USE_SPLIT_DWARF=$split_dwarf \ + ../../../runtimes + else + cd "$build_prefix"/projects/libcxx + fi + install_libcxx=${install/\/strip/-stripped} + make -j$threads $verbose \ + ${install_libcxx/install/install-cxx} ${install_libcxx/install/install-cxxabi} ${install_libcxx/install/install-cxxabi-headers} + if [ -n "$unit_test" ]; then + make -j$threads $verbose ${unit_test/all/cxx} ${unit_test/all/cxxabi} + fi + popd > /dev/null 2>&1 +else + echo "$0: directory not found: runtimes" + exit 1 +fi + if [ -n "$unit_test" ]; then make -j$threads $verbose check-all fi diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 05dad41c07faf0cae49dfb7fb0459e130edfb9a3..4f81412bd6f16827ba167659bf45c60a2160d76b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -160,6 +160,9 @@ Resolutions to C++ Defect Reports ``-Wdeprecated-literal-operator`` for the latter, off by default for now. .. code-block:: c++ +- Attributes now expect unevaluated strings in attribute parameters that are string literals. + This is applied to both C++ standard attributes and other attributes supported by Clang. + This completes the implementation of `P2361R6 Unevaluated Strings <https://wg21.link/p2361r6>`_. // What follows is warned by -Wuser-defined-literals // albeit "ill-formed, no diagnostic required".
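A minimal illustration of the unevaluated-string rule described in the release note above (the declarations are hypothetical, not part of this patch):

.. code-block:: c++

  // Attribute string arguments are now parsed as unevaluated strings:
  // they must be plain string literals, with no encoding prefix.
  [[deprecated("use new_api() instead")]] void old_api();  // OK
  [[deprecated(L"wide reason")]] void old_api2();          // now rejected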
@@ -935,6 +938,11 @@ Arm and AArch64 Support and the selected processor lacks floating point registers. (`#55755 <https://github.com/llvm/llvm-project/issues/55755>`_) +- New AArch64 asm constraints have been added for r8-r11 (Uci) and r12-r15 (Ucj). + +Android Support +^^^^^^^^^^^^^^^ + - Clang builtin ``__arithmetic_fence`` and the command line option ``-fprotect-parens`` are now enabled for AArch64. diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 8d20d088bb63c48d02fa88fc46f7a5de2fa44ecb..80cfd50d91df49833cc504e50f43fee7ce3e4210 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -3966,6 +3966,34 @@ public: /// because TrailingObjects cannot handle repeated types. struct ExceptionType { QualType Type; }; + /// The AArch64 SME ACLE (Arm C/C++ Language Extensions) defines a number + /// of function type attributes that can be set on function types, including + /// function pointers. + enum AArch64SMETypeAttributes : unsigned { + SME_NormalFunction = 0, + SME_PStateSMEnabledMask = 1 << 0, + SME_PStateSMCompatibleMask = 1 << 1, + + // Describes the value of the state using ArmStateValue. + SME_ZAShift = 2, + SME_ZAMask = 0b111 << SME_ZAShift, + + SME_AttributeMask = 0b111'111 // We only support a maximum of 6 bits because of + // the bitmask in FunctionTypeExtraBitfields. + }; + + enum ArmStateValue : unsigned { + ARM_None = 0, + ARM_Preserves = 1, + ARM_In = 2, + ARM_Out = 3, + ARM_InOut = 4, + }; + + static ArmStateValue getArmZAState(unsigned AttrBits) { + return (ArmStateValue)((AttrBits & SME_ZAMask) >> SME_ZAShift); + } + /// A simple holder for various uncommon bits which do not fit in /// FunctionTypeBitfields. Aligned to alignof(void *) to maintain the /// alignment of subsequent objects in TrailingObjects. @@ -3973,7 +4001,13 @@ public: /// The number of types in the exception specification. /// A whole unsigned is not needed here and according to /// [implimits] 8 bits would be enough here. - uint16_t NumExceptionType = 0; + unsigned NumExceptionType : 10; + + /// Any AArch64 SME ACLE type attributes that need to be propagated + /// on declarations and function pointers. + unsigned AArch64SMEAttributes : 6; + FunctionTypeExtraBitfields() + : NumExceptionType(0), AArch64SMEAttributes(SME_NormalFunction) {} }; protected: @@ -4152,18 +4186,22 @@ public: /// the various bits of extra information about a function prototype.
struct ExtProtoInfo { FunctionType::ExtInfo ExtInfo; - bool Variadic : 1; - bool HasTrailingReturn : 1; + unsigned Variadic : 1; + unsigned HasTrailingReturn : 1; + unsigned AArch64SMEAttributes : 6; Qualifiers TypeQuals; RefQualifierKind RefQualifier = RQ_None; ExceptionSpecInfo ExceptionSpec; const ExtParameterInfo *ExtParameterInfos = nullptr; SourceLocation EllipsisLoc; - ExtProtoInfo() : Variadic(false), HasTrailingReturn(false) {} + ExtProtoInfo() + : Variadic(false), HasTrailingReturn(false), + AArch64SMEAttributes(SME_NormalFunction) {} ExtProtoInfo(CallingConv CC) - : ExtInfo(CC), Variadic(false), HasTrailingReturn(false) {} + : ExtInfo(CC), Variadic(false), HasTrailingReturn(false), + AArch64SMEAttributes(SME_NormalFunction) {} ExtProtoInfo withExceptionSpec(const ExceptionSpecInfo &ESI) { ExtProtoInfo Result(*this); @@ -4172,7 +4210,15 @@ public: } bool requiresFunctionProtoTypeExtraBitfields() const { - return ExceptionSpec.Type == EST_Dynamic; + return ExceptionSpec.Type == EST_Dynamic || + AArch64SMEAttributes != SME_NormalFunction; + } + + void setArmSMEAttribute(AArch64SMETypeAttributes Kind, bool Enable = true) { + if (Enable) + AArch64SMEAttributes |= Kind; + else + AArch64SMEAttributes &= ~Kind; } }; @@ -4299,6 +4345,7 @@ public: EPI.TypeQuals = getMethodQuals(); EPI.RefQualifier = getRefQualifier(); EPI.ExtParameterInfos = getExtParameterInfosOrNull(); + EPI.AArch64SMEAttributes = getAArch64SMEAttributes(); return EPI; } @@ -4480,6 +4527,15 @@ public: return getTrailingObjects<ExtParameterInfo>(); + /// Return a bitmask describing the SME attributes on the function type, see + /// AArch64SMETypeAttributes for their values. + unsigned getAArch64SMEAttributes() const { + if (!hasExtraBitfields()) + return SME_NormalFunction; + return getTrailingObjects<FunctionTypeExtraBitfields>() + ->AArch64SMEAttributes; + } + ExtParameterInfo getExtParameterInfo(unsigned I) const { assert(I < getNumParams() && "parameter index out of range"); if (hasExtParameterInfos()) diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index 3cc826c1463a9fc241c041eb3c358accefc3c356..682c869b0c5847936a02189aa36a1e6b4ebb8323 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -323,6 +323,9 @@ let Class = FunctionProtoType in { ? node->getExtParameterInfos() : llvm::ArrayRef<FunctionProtoType::ExtParameterInfo>() }]; } + def : Property<"AArch64SMEAttributes", UInt32> { + let Read = [{ node->getAArch64SMEAttributes() }]; + } def : Creator<[{ auto extInfo = FunctionType::ExtInfo(noReturn, hasRegParm, regParm, @@ -338,6 +341,7 @@ let Class = FunctionProtoType in { epi.ExceptionSpec = exceptionSpecifier; epi.ExtParameterInfos = extParameterInfo.empty() ?
nullptr : extParameterInfo.data(); + epi.AArch64SMEAttributes = AArch64SMEAttributes; return ctx.getFunctionType(returnType, parameters, epi); }]>; } diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index d5204b28696672261b253c2d01dc75db8d768b51..ad0ad208a7901ec56f63e6028f632203f3a2ba7d 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -415,7 +415,7 @@ class TargetArch<list<string> arches> : TargetSpec { let Arches = arches; } def TargetARM : TargetArch<["arm", "thumb", "armeb", "thumbeb"]>; -def TargetAArch64 : TargetArch<["aarch64"]>; +def TargetAArch64 : TargetArch<["aarch64", "aarch64_be", "aarch64_32"]>; def TargetAnyArm : TargetArch<!listconcat(TargetARM.Arches, TargetAArch64.Arches)>; def TargetAVR : TargetArch<["avr"]>; def TargetBPF : TargetArch<["bpfel", "bpfeb"]>; @@ -2435,9 +2435,64 @@ def AArch64SVEPcs: DeclOrTypeAttr { def ArmStreaming : TypeAttr, TargetSpecificAttr<TargetAArch64> { let Spellings = [RegularKeyword<"__arm_streaming">]; - let Documentation = [ArmStreamingDocs]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmSmeStreamingDocs]; +} + +def ArmStreamingCompatible : TypeAttr, TargetSpecificAttr<TargetAArch64> { + let Spellings = [RegularKeyword<"__arm_streaming_compatible">]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmSmeStreamingCompatibleDocs]; +} + +def ArmNew : InheritableAttr, TargetSpecificAttr<TargetAArch64> { + let Spellings = [RegularKeyword<"__arm_new">]; + let Args = [VariadicStringArgument<"NewArgs">]; + let Subjects = SubjectList<[Function], ErrorDiag>; + let Documentation = [ArmNewDocs]; + + let AdditionalMembers = [{ + bool isNewZA() const { + return llvm::is_contained(newArgs(), "za"); + } + }]; +} + +def ArmIn : TypeAttr, TargetSpecificAttr<TargetAArch64> { + let Spellings = [RegularKeyword<"__arm_in">]; + let Args = [VariadicStringArgument<"InArgs">]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmInDocs]; } +def ArmOut : TypeAttr, TargetSpecificAttr<TargetAArch64> { + let Spellings = [RegularKeyword<"__arm_out">]; + let Args = [VariadicStringArgument<"OutArgs">]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmOutDocs]; +} + +def ArmInOut : TypeAttr, TargetSpecificAttr<TargetAArch64> { + let Spellings = [RegularKeyword<"__arm_inout">]; + let Args = [VariadicStringArgument<"InOutArgs">]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmInOutDocs]; +} + +def ArmPreserves : TypeAttr, TargetSpecificAttr<TargetAArch64> { + let Spellings = [RegularKeyword<"__arm_preserves">]; + let Args = [VariadicStringArgument<"PreserveArgs">]; + let Subjects = SubjectList<[HasFunctionProto], ErrorDiag>; + let Documentation = [ArmPreservesDocs]; +} + +def ArmLocallyStreaming : InheritableAttr, TargetSpecificAttr<TargetAArch64> { + let Spellings = [RegularKeyword<"__arm_locally_streaming">]; + let Subjects = SubjectList<[Function], ErrorDiag>; + let Documentation = [ArmSmeLocallyStreamingDocs]; +} + + def Pure : InheritableAttr { let Spellings = [GCC<"pure">]; let Documentation = [Undocumented]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 2c950231255d7ffa7a1b8d678adbe7b51aa1afa6..395982c45ecfa0dfd173582352afe7e0497decbb 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -6578,38 +6578,195 @@ Requirements on Development Tools - Engineering Specification Documentation }]; } -def ArmStreamingDocs : Documentation { - let Category = DocCatType; +def DocCatArmSmeAttributes :
DocumentationCategory<"AArch64 SME Attributes"> { let Content = [{ -.. Note:: This attribute has not been implemented yet, but once it is - implemented, it will behave as described below. +Clang supports a number of AArch64-specific attributes to manage state +added by the Scalable Matrix Extension (SME). This state includes the +runtime mode that the processor is in (e.g. non-streaming or streaming) +as well as the state of the ``ZA`` Matrix Storage. -The ``__arm_streaming`` keyword is only available on AArch64 targets. -It applies to function types and specifies that the function has a -"streaming interface". This means that: +The attributes come in the form of type- and declaration attributes: -* the function requires the Scalable Matrix Extension (SME) +* The SME declaration attributes can appear anywhere that a standard + ``[[...]]`` declaration attribute can appear. -* the function must be entered in streaming mode (that is, with PSTATE.SM - set to 1) +* The SME type attributes apply only to prototyped functions and can appear + anywhere that a standard ``[[...]]`` type attribute can appear. The SME + type attributes do not apply to functions having a K&R-style + unprototyped function type. -* the function must return in streaming mode +See `Arm C Language Extensions `_ +for more details about the features related to the SME extension. See `Procedure Call Standard for the Arm® 64-bit Architecture (AArch64) `_ for more details about -streaming-interface functions. +streaming-interface functions and shared/private-ZA interface functions. + }]; +} + +def ArmSmeStreamingDocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_streaming`` keyword applies to prototyped function types and specifies +that the function has a "streaming interface". This means that: + +* the function requires that the processor implements the Scalable Matrix + Extension (SME). + +* the function must be entered in streaming mode (that is, with PSTATE.SM + set to 1) + +* the function must return in streaming mode Clang manages PSTATE.SM automatically; it is not the source code's -responsibility to do this. For example, if a normal non-streaming +responsibility to do this. For example, if a non-streaming function calls an ``__arm_streaming`` function, Clang generates code that switches into streaming mode before calling the function and switches back to non-streaming mode on return. + }]; +} + +def ArmSmeStreamingCompatibleDocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_streaming_compatible`` keyword applies to prototyped function types and +specifies that the function has a “streaming compatible interface”. This +means that: -``__arm_streaming`` can appear anywhere that a standard ``[[...]]`` type -attribute can appear. +* the function may be entered in either non-streaming mode (PSTATE.SM=0) or + in streaming mode (PSTATE.SM=1). -See `Arm C Language Extensions `_ -for more details about this extension, and for other related SME features. +* the function must return in the same mode as it was entered. + +* the code executed in the function is compatible with either mode. + +Clang manages PSTATE.SM automatically; it is not the source code's +responsibility to do this. Clang will ensure that the generated code in +streaming-compatible functions is valid in either mode (PSTATE.SM=0 or +PSTATE.SM=1). 
For example, if an ``__arm_streaming_compatible`` function calls a +non-streaming function, Clang generates code to temporarily switch out of streaming +mode before calling the function and switch back to streaming mode on return if +``PSTATE.SM`` is ``1`` on entry to the caller. If ``PSTATE.SM`` is ``0`` on +entry to the ``__arm_streaming_compatible`` function, the call will be executed +without changing modes. + }]; +} + +def ArmInDocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_in`` keyword applies to prototyped function types and specifies +that the function shares a given state S with its caller. For ``__arm_in``, the +function takes the state S as input and returns with the state S unchanged. + +The attribute takes string arguments to instruct the compiler which state +is shared. The supported states for S are: + +* ``"za"`` for Matrix Storage (requires SME) + +The attributes ``__arm_in(S)``, ``__arm_out(S)``, ``__arm_inout(S)`` and +``__arm_preserves(S)`` are all mutually exclusive for the same state S. + }]; +} + +def ArmOutDocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_out`` keyword applies to prototyped function types and specifies +that the function shares a given state S with its caller. For ``__arm_out``, +the function ignores the incoming state for S and returns new state for S. + +The attribute takes string arguments to instruct the compiler which state +is shared. The supported states for S are: + +* ``"za"`` for Matrix Storage (requires SME) + +The attributes ``__arm_in(S)``, ``__arm_out(S)``, ``__arm_inout(S)`` and +``__arm_preserves(S)`` are all mutually exclusive for the same state S. + }]; +} + +def ArmInOutDocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_inout`` keyword applies to prototyped function types and specifies +that the function shares a given state S with its caller. For ``__arm_inout``, +the function takes the state S as input and returns new state for S. + +The attribute takes string arguments to instruct the compiler which state +is shared. The supported states for S are: + +* ``"za"`` for Matrix Storage (requires SME) + +The attributes ``__arm_in(S)``, ``__arm_out(S)``, ``__arm_inout(S)`` and +``__arm_preserves(S)`` are all mutually exclusive for the same state S. + }]; +} + +def ArmPreservesDocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_preserves`` keyword applies to prototyped function types and +specifies that the function does not read a given state S and returns +with state S unchanged. + +The attribute takes string arguments to instruct the compiler which state +is shared. The supported states for S are: + +* ``"za"`` for Matrix Storage (requires SME) + +The attributes ``__arm_in(S)``, ``__arm_out(S)``, ``__arm_inout(S)`` and +``__arm_preserves(S)`` are all mutually exclusive for the same state S. + }]; +} + +def ArmSmeLocallyStreamingDocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_locally_streaming`` keyword applies to function declarations +and specifies that all the statements in the function are executed in +streaming mode. This means that: + +* the function requires that the target processor implements the Scalable Matrix + Extension (SME). + +* the program automatically puts the machine into streaming mode before + executing the statements and automatically restores the previous mode + afterwards.
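A minimal sketch, with an illustrative function name:

.. code-block:: c++

  // The body executes with PSTATE.SM=1; the caller's mode is restored
  // when the function returns.
  __arm_locally_streaming void scale(float *data, int n) {
    for (int i = 0; i != n; ++i)
      data[i] *= 2.0f;
  }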
+ +Clang manages PSTATE.SM automatically; it is not the source code's +responsibility to do this. For example, Clang will emit code to enable +streaming mode at the start of the function, and disable streaming mode +at the end of the function. + }]; +} + +def ArmNewDocs : Documentation { + let Category = DocCatArmSmeAttributes; + let Content = [{ +The ``__arm_new`` keyword applies to function declarations and specifies +that the function will create a new scope for state S. + +The attribute takes string arguments to instruct the compiler for which state +to create a new scope. The supported states for S are: + +* ``"za"`` for Matrix Storage (requires SME) + +For state ``"za"``, this means that: + +* the function requires that the target processor implements the Scalable Matrix + Extension (SME). + +* the function will commit any lazily saved ZA data. + +* the function will create a new ZA context and enable PSTATE.ZA. + +* the function will disable PSTATE.ZA (by setting it to 0) before returning. + +For ``__arm_new("za")`` functions, Clang will set up the ZA context automatically +on entry to the function and disable it before returning. For example, if ZA is +in a dormant state, Clang will generate the code to commit a lazy-save and set up +a new ZA state before executing user code. }]; } diff --git a/clang/include/clang/Basic/AttributeCommonInfo.h b/clang/include/clang/Basic/AttributeCommonInfo.h index 6396c0dc6ef0299a530210f0dd14f2d791751245..99846f20482cbba9c05fe360ece460efe42df4e4 100644 --- a/clang/include/clang/Basic/AttributeCommonInfo.h +++ b/clang/include/clang/Basic/AttributeCommonInfo.h @@ -235,6 +235,19 @@ protected: return SpellingIndex != SpellingNotCalculated; } }; + +inline bool doesKeywordAttributeTakeArgs(tok::TokenKind Kind) { + switch (Kind) { + default: + return false; +#define KEYWORD_ATTRIBUTE(NAME, HASARG, ...)
\ + case tok::kw_##NAME: \ + return HASARG; +#include "clang/Basic/RegularKeywordAttrInfo.inc" +#undef KEYWORD_ATTRIBUTE + } +} + } // namespace clang #endif // LLVM_CLANG_BASIC_ATTRIBUTECOMMONINFO_H diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def index eaae6c9ad84686864fae49d48f56e2ff53b784d3..33eb1dc2f6d8e2c9e4623df419cee627adeec6cf 100644 --- a/clang/include/clang/Basic/BuiltinsAArch64.def +++ b/clang/include/clang/Basic/BuiltinsAArch64.def @@ -68,6 +68,9 @@ TARGET_BUILTIN(__builtin_arm_ldg, "v*v*", "t", "mte") TARGET_BUILTIN(__builtin_arm_stg, "vv*", "t", "mte") TARGET_BUILTIN(__builtin_arm_subp, "Uiv*v*", "t", "mte") +// SME state function +BUILTIN(__builtin_arm_get_sme_state, "vULi*ULi*", "n") + // Memory Operations TARGET_BUILTIN(__builtin_arm_mops_memset_tag, "v*v*iz", "", "mte,mops") diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt index 691d14f013cfed3f6b473e32ca09ae71582237f0..096a85af4abdcb3887a6802a225b99b1ceb3eb69 100644 --- a/clang/include/clang/Basic/CMakeLists.txt +++ b/clang/include/clang/Basic/CMakeLists.txt @@ -48,10 +48,10 @@ clang_tablegen(AttrSubMatchRulesList.inc -gen-clang-attr-subject-match-rule-list SOURCE Attr.td TARGET ClangAttrSubjectMatchRuleList) -clang_tablegen(AttrTokenKinds.inc -gen-clang-attr-token-kinds +clang_tablegen(RegularKeywordAttrInfo.inc -gen-clang-regular-keyword-attr-info -I ${CMAKE_CURRENT_SOURCE_DIR}/../../ SOURCE Attr.td - TARGET ClangAttrTokenKinds + TARGET ClangRegularKeywordAttrInfo ) clang_tablegen(AttrHasAttributeImpl.inc -gen-clang-attr-has-attribute-impl @@ -96,6 +96,9 @@ clang_tablegen(arm_sve_sema_rangechecks.inc -gen-arm-sve-sema-rangechecks ${CLANG_BASIC_OPTIONS} SOURCE arm_sve.td TARGET ClangARMSveSemaRangeChecks) +clang_tablegen(arm_sve_streaming_attrs.inc -gen-arm-sve-streaming-attrs + SOURCE arm_sve.td + TARGET ClangARMSveStreamingAttrs) clang_tablegen(arm_sme_builtins.inc -gen-arm-sme-builtins SOURCE arm_sme.td TARGET ClangARMSmeBuiltins) @@ -105,6 +108,12 @@ clang_tablegen(arm_sme_builtin_cg.inc -gen-arm-sme-builtin-codegen clang_tablegen(arm_sme_sema_rangechecks.inc -gen-arm-sme-sema-rangechecks SOURCE arm_sme.td TARGET ClangARMSmeSemaRangeChecks) +clang_tablegen(arm_sme_streaming_attrs.inc -gen-arm-sme-streaming-attrs + SOURCE arm_sme.td + TARGET ClangARMSmeStreamingAttrs) +clang_tablegen(arm_sme_builtins_za_state.inc -gen-arm-sme-builtin-za-state + SOURCE arm_sme.td + TARGET ClangARMSmeBuiltinsZAState) clang_tablegen(arm_cde_builtins.inc -gen-arm-cde-builtin-def SOURCE arm_cde.td TARGET ClangARMCdeBuiltinsDef) diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td index d7cc3cff0797d0dcbe9b3534df38059d7d643795..d35445db231eda6fdc447439e8eba30f367de1d6 100644 --- a/clang/include/clang/Basic/DiagnosticCommonKinds.td +++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td @@ -56,7 +56,9 @@ def err_expected_string_literal : Error<"expected string literal " "%select{in %1|for diagnostic message in static_assert|" "for optional message in 'availability' attribute|" "for %select{language name|source container name|USR}1 in " - "'external_source_symbol' attribute}0">; + "'external_source_symbol' attribute|" + "as argument of '%1' attribute}0">; + def err_invalid_string_udl : Error< "string literal with user-defined suffix cannot be used here">; def err_invalid_character_udl : Error< diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td 
b/clang/include/clang/Basic/DiagnosticFrontendKinds.td index 11022962ae9e3893c4c2f05101d35a8aca0a3e58..db2428caa5c8972992feea005aabffb3bc37cf08 100644 --- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td +++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td @@ -273,6 +273,10 @@ def err_builtin_needs_feature : Error<"%0 needs target feature %1">; def err_function_needs_feature : Error< "always_inline function %1 requires target feature '%2', but would " "be inlined into function %0 that is compiled without support for '%2'">; +def err_function_always_inline_attribute_mismatch : Error< + "always_inline function %1 and its caller %0 have mismatching %2 attributes">; +def err_function_always_inline_new_za : Error< + "always_inline function %0 has new za state">; def warn_avx_calling_convention : Warning<"AVX vector %select{return|argument}0 of type %1 without '%2' " diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 0e97620945af4841ab9a3720b5510b57d59a0119..e304ef6aec1cd33ea1b3878552d832f72b0d1ff0 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -2035,6 +2035,10 @@ def err_different_return_type_for_overriding_virtual_function : Error< "than the function it overrides}1,2">; def note_overridden_virtual_function : Note< "overridden virtual function is here">; +def err_conflicting_overriding_attributes : Error< + "virtual function %0 has different attributes " + "%diff{($) than the function it overrides (which has $)|" + "than the function it overrides}1,2">; def err_conflicting_overriding_cc_attributes : Error< "virtual function %0 has different calling convention attributes " "%diff{($) than the function it overrides (which has calling convention $)|" @@ -3097,6 +3101,12 @@ def err_attribute_bad_sve_vector_size : Error< def err_attribute_arm_feature_sve_bits_unsupported : Error< "%0 is only supported when '-msve-vector-bits=' is specified with a " "value of 128, 256, 512, 1024 or 2048.">; +def warn_attribute_arm_sm_incompat_builtin : Warning< + "builtin call has undefined behaviour when called from a %0 function">, + InGroup>; +def warn_attribute_arm_za_builtin_no_za_state : Warning< + "builtin call is not valid when calling from a function without active ZA state">, + InGroup>; def err_sve_vector_in_non_sve_target : Error< "SVE vector type %0 cannot be used in a target without sve">; def err_attribute_riscv_rvv_bits_unsupported : Error< @@ -3621,6 +3631,22 @@ def err_attribute_vecreturn_only_vector_member : Error< "the vecreturn attribute can only be used on a class or structure with one member, which must be a vector">; def err_attribute_vecreturn_only_pod_record : Error< "the vecreturn attribute can only be used on a POD (plain old data) class or structure (i.e. 
no virtual functions)">; +def err_sme_attr_mismatch : Error< + "function declared %0 was previously declared %1, which has different SME function attributes">; +def err_sme_call_in_non_sme_target : Error< + "call to a streaming function requires 'sme'">; +def err_sme_za_call_no_za_state : Error< + "call to a shared ZA function requires the caller to have ZA state">; +def err_sme_definition_using_sm_in_non_sme_target : Error< + "function executed in streaming-SVE mode requires 'sme'">; +def err_sme_definition_using_za_in_non_sme_target : Error< + "function using ZA state requires 'sme'">; +def err_conflicting_attributes_arm_state : Error< + "conflicting attributes for state '%0'">; +def err_unknown_arm_state : Error< + "unknown state '%0'">; +def err_missing_arm_state : Error< + "missing state for %0">; def err_cconv_change : Error< "function declared '%0' here was previously declared " "%select{'%2'|without calling convention}1">; diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index afd101b007b4265d1faf92af0d89f80042285c92..a540a3ee1bf0a2164b5f92aa9417d5c929d5d6f5 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -753,9 +753,9 @@ KEYWORD(__builtin_sycl_unique_stable_name, KEYSYCL) // Keywords defined by Attr.td. #ifndef KEYWORD_ATTRIBUTE -#define KEYWORD_ATTRIBUTE(X, EMPTY) KEYWORD(EMPTY ## X, KEYALL) +#define KEYWORD_ATTRIBUTE(X, HASARG, EMPTY) KEYWORD(EMPTY ## X, KEYALL) #endif -#include "clang/Basic/AttrTokenKinds.inc" +#include "clang/Basic/RegularKeywordAttrInfo.inc" // Clang-specific keywords enabled only in testing. TESTING_KEYWORD(__unknown_anytype , KEYALL) diff --git a/clang/include/clang/Basic/TokenKinds.h b/clang/include/clang/Basic/TokenKinds.h index ff117bd5afc56a4948b8aeefe66674a891e25f21..7529b922619ada3cd7993ca75bc943111ad66d1f 100644 --- a/clang/include/clang/Basic/TokenKinds.h +++ b/clang/include/clang/Basic/TokenKinds.h @@ -110,7 +110,7 @@ bool isPragmaAnnotation(TokenKind K); inline constexpr bool isRegularKeywordAttribute(TokenKind K) { return (false #define KEYWORD_ATTRIBUTE(X, ...) 
|| (K == tok::kw_##X) -#include "clang/Basic/AttrTokenKinds.inc" +#include "clang/Basic/RegularKeywordAttrInfo.inc" ); } diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index b950f5cb8acc2090e0f90dc9ff1bf05ed69c53ba..dc998d1f5668828594d228588aaa32259dfdb45e 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -20,120 +20,130 @@ include "arm_sve_sme_incl.td" multiclass ZALoad ch> { let TargetGuard = "sme" in { - def NAME # _H : MInst<"svld1_hor_" # n_suffix, "vimiPQ", t, - [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA], + def NAME # _H : MInst<"svld1_hor_" # n_suffix, "vimPQ", t, + [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA], MemEltTyDefault, i_prefix # "_horiz", ch>; - def NAME # _H_VNUM : MInst<"svld1_hor_vnum_" # n_suffix, "vimiPQl", t, - [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA], + def NAME # _H_VNUM : MInst<"svld1_hor_vnum_" # n_suffix, "vimPQl", t, + [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA], MemEltTyDefault, i_prefix # "_horiz", ch>; - def NAME # _V : MInst<"svld1_ver_" # n_suffix, "vimiPQ", t, - [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA], + def NAME # _V : MInst<"svld1_ver_" # n_suffix, "vimPQ", t, + [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA], MemEltTyDefault, i_prefix # "_vert", ch>; - def NAME # _V_VNUM : MInst<"svld1_ver_vnum_" # n_suffix, "vimiPQl", t, - [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA], + def NAME # _V_VNUM : MInst<"svld1_ver_vnum_" # n_suffix, "vimPQl", t, + [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA], MemEltTyDefault, i_prefix # "_vert", ch>; } } -defm SVLD1_ZA8 : ZALoad<"za8", "c", "aarch64_sme_ld1b", [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_15>]>; -defm SVLD1_ZA16 : ZALoad<"za16", "s", "aarch64_sme_ld1h", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_7>]>; -defm SVLD1_ZA32 : ZALoad<"za32", "i", "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_3>]>; -defm SVLD1_ZA64 : ZALoad<"za64", "l", "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>; -defm SVLD1_ZA128 : ZALoad<"za128", "q", "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0_0>]>; +defm SVLD1_ZA8 : ZALoad<"za8", "c", "aarch64_sme_ld1b", [ImmCheck<0, ImmCheck0_0>]>; +defm SVLD1_ZA16 : ZALoad<"za16", "s", "aarch64_sme_ld1h", [ImmCheck<0, ImmCheck0_1>]>; +defm SVLD1_ZA32 : ZALoad<"za32", "i", "aarch64_sme_ld1w", [ImmCheck<0, ImmCheck0_3>]>; +defm SVLD1_ZA64 : ZALoad<"za64", "l", "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0_7>]>; +defm SVLD1_ZA128 : ZALoad<"za128", "q", "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>]>; -def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmiQ", "", - [IsOverloadNone, IsStreamingCompatible, IsSharedZA], - MemEltTyDefault, "aarch64_sme_ldr", - [ImmCheck<1, ImmCheck0_15>]>; +let TargetGuard = "sme" in { +def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQl", "", + [IsOverloadNone, IsStreamingCompatible, IsInOutZA], + MemEltTyDefault, "aarch64_sme_ldr">; + +def SVLDR_ZA : MInst<"svldr_za", "vmQ", "", + [IsOverloadNone, IsStreamingCompatible, IsInOutZA], + MemEltTyDefault, "aarch64_sme_ldr", []>; +} //////////////////////////////////////////////////////////////////////////////// // Stores multiclass ZAStore ch> { let TargetGuard = "sme" in { - def NAME # _H : MInst<"svst1_hor_" # n_suffix, "vimiP%", t, - [IsStore, IsOverloadNone, IsStreaming, IsSharedZA, IsPreservesZA], + def NAME # _H : MInst<"svst1_hor_" # n_suffix, "vimP%", t, + [IsStore, IsOverloadNone, IsStreaming, IsInZA], MemEltTyDefault, 
i_prefix # "_horiz", ch>; - def NAME # _H_VNUM : MInst<"svst1_hor_vnum_" # n_suffix, "vimiP%l", t, - [IsStore, IsOverloadNone, IsStreaming, IsSharedZA, IsPreservesZA], + def NAME # _H_VNUM : MInst<"svst1_hor_vnum_" # n_suffix, "vimP%l", t, + [IsStore, IsOverloadNone, IsStreaming, IsInZA], MemEltTyDefault, i_prefix # "_horiz", ch>; - def NAME # _V : MInst<"svst1_ver_" # n_suffix, "vimiP%", t, - [IsStore, IsOverloadNone, IsStreaming, IsSharedZA, IsPreservesZA], + def NAME # _V : MInst<"svst1_ver_" # n_suffix, "vimP%", t, + [IsStore, IsOverloadNone, IsStreaming, IsInZA], MemEltTyDefault, i_prefix # "_vert", ch>; - def NAME # _V_VNUM : MInst<"svst1_ver_vnum_" # n_suffix, "vimiP%l", t, - [IsStore, IsOverloadNone, IsStreaming, IsSharedZA, IsPreservesZA], + def NAME # _V_VNUM : MInst<"svst1_ver_vnum_" # n_suffix, "vimP%l", t, + [IsStore, IsOverloadNone, IsStreaming, IsInZA], MemEltTyDefault, i_prefix # "_vert", ch>; } } -defm SVST1_ZA8 : ZAStore<"za8", "c", "aarch64_sme_st1b", [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_15>]>; -defm SVST1_ZA16 : ZAStore<"za16", "s", "aarch64_sme_st1h", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_7>]>; -defm SVST1_ZA32 : ZAStore<"za32", "i", "aarch64_sme_st1w", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_3>]>; -defm SVST1_ZA64 : ZAStore<"za64", "l", "aarch64_sme_st1d", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>; -defm SVST1_ZA128 : ZAStore<"za128", "q", "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0_0>]>; +defm SVST1_ZA8 : ZAStore<"za8", "c", "aarch64_sme_st1b", [ImmCheck<0, ImmCheck0_0>]>; +defm SVST1_ZA16 : ZAStore<"za16", "s", "aarch64_sme_st1h", [ImmCheck<0, ImmCheck0_1>]>; +defm SVST1_ZA32 : ZAStore<"za32", "i", "aarch64_sme_st1w", [ImmCheck<0, ImmCheck0_3>]>; +defm SVST1_ZA64 : ZAStore<"za64", "l", "aarch64_sme_st1d", [ImmCheck<0, ImmCheck0_7>]>; +defm SVST1_ZA128 : ZAStore<"za128", "q", "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>]>; -def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vmi%", "", - [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], - MemEltTyDefault, "aarch64_sme_str", - [ImmCheck<1, ImmCheck0_15>]>; +let TargetGuard = "sme" in { +def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vm%l", "", + [IsOverloadNone, IsStreamingCompatible, IsInZA], + MemEltTyDefault, "aarch64_sme_str">; + +def SVSTR_ZA : MInst<"svstr_za", "vm%", "", + [IsOverloadNone, IsStreamingCompatible, IsInZA], + MemEltTyDefault, "aarch64_sme_str", []>; +} //////////////////////////////////////////////////////////////////////////////// // Read horizontal/vertical ZA slices multiclass ZARead ch> { let TargetGuard = "sme" in { - def NAME # _H : SInst<"svread_hor_" # n_suffix # "[_{d}]", "ddPimi", t, + def NAME # _H : SInst<"svread_hor_" # n_suffix # "[_{d}]", "ddPim", t, MergeOp1, i_prefix # "_horiz", - [IsReadZA, IsStreaming, IsSharedZA, IsPreservesZA], ch>; + [IsReadZA, IsStreaming, IsInZA], ch>; - def NAME # _V : SInst<"svread_ver_" # n_suffix # "[_{d}]", "ddPimi", t, + def NAME # _V : SInst<"svread_ver_" # n_suffix # "[_{d}]", "ddPim", t, MergeOp1, i_prefix # "_vert", - [IsReadZA, IsStreaming, IsSharedZA, IsPreservesZA], ch>; + [IsReadZA, IsStreaming, IsInZA], ch>; } } -defm SVREAD_ZA8 : ZARead<"za8", "cUc", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_0>, ImmCheck<4, ImmCheck0_15>]>; -defm SVREAD_ZA16 : ZARead<"za16", "sUshb", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_1>, ImmCheck<4, ImmCheck0_7>]>; -defm SVREAD_ZA32 : ZARead<"za32", "iUif", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_3>, ImmCheck<4, ImmCheck0_3>]>; -defm 
SVREAD_ZA64 : ZARead<"za64", "lUld", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_7>, ImmCheck<4, ImmCheck0_1>]>; -defm SVREAD_ZA128 : ZARead<"za128", "csilUcUsUiUlhbfd", "aarch64_sme_readq", [ImmCheck<2, ImmCheck0_15>, ImmCheck<4, ImmCheck0_0>]>; +defm SVREAD_ZA8 : ZARead<"za8", "cUc", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_0>]>; +defm SVREAD_ZA16 : ZARead<"za16", "sUshb", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_1>]>; +defm SVREAD_ZA32 : ZARead<"za32", "iUif", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_3>]>; +defm SVREAD_ZA64 : ZARead<"za64", "lUld", "aarch64_sme_read", [ImmCheck<2, ImmCheck0_7>]>; +defm SVREAD_ZA128 : ZARead<"za128", "csilUcUsUiUlhbfd", "aarch64_sme_readq", [ImmCheck<2, ImmCheck0_15>]>; //////////////////////////////////////////////////////////////////////////////// // Write horizontal/vertical ZA slices multiclass ZAWrite ch> { let TargetGuard = "sme" in { - def NAME # _H : SInst<"svwrite_hor_" # n_suffix # "[_{d}]", "vimiPd", t, + def NAME # _H : SInst<"svwrite_hor_" # n_suffix # "[_{d}]", "vimPd", t, MergeOp1, i_prefix # "_horiz", - [IsWriteZA, IsStreaming, IsSharedZA], ch>; + [IsWriteZA, IsStreaming, IsInOutZA], ch>; - def NAME # _V : SInst<"svwrite_ver_" # n_suffix # "[_{d}]", "vimiPd", t, + def NAME # _V : SInst<"svwrite_ver_" # n_suffix # "[_{d}]", "vimPd", t, MergeOp1, i_prefix # "_vert", - [IsWriteZA, IsStreaming, IsSharedZA], ch>; + [IsWriteZA, IsStreaming, IsInOutZA], ch>; } } -defm SVWRITE_ZA8 : ZAWrite<"za8", "cUc", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_15>]>; -defm SVWRITE_ZA16 : ZAWrite<"za16", "sUshb", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_1>, ImmCheck<2, ImmCheck0_7>]>; -defm SVWRITE_ZA32 : ZAWrite<"za32", "iUif", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_3>, ImmCheck<2, ImmCheck0_3>]>; -defm SVWRITE_ZA64 : ZAWrite<"za64", "lUld", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_7>, ImmCheck<2, ImmCheck0_1>]>; -defm SVWRITE_ZA128 : ZAWrite<"za128", "csilUcUsUiUlhbfd", "aarch64_sme_writeq", [ImmCheck<0, ImmCheck0_15>, ImmCheck<2, ImmCheck0_0>]>; +defm SVWRITE_ZA8 : ZAWrite<"za8", "cUc", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_0>]>; +defm SVWRITE_ZA16 : ZAWrite<"za16", "sUshb", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_1>]>; +defm SVWRITE_ZA32 : ZAWrite<"za32", "iUif", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_3>]>; +defm SVWRITE_ZA64 : ZAWrite<"za64", "lUld", "aarch64_sme_write", [ImmCheck<0, ImmCheck0_7>]>; +defm SVWRITE_ZA128 : ZAWrite<"za128", "csilUcUsUiUlhbfd", "aarch64_sme_writeq", [ImmCheck<0, ImmCheck0_15>]>; //////////////////////////////////////////////////////////////////////////////// // SME - Zero let TargetGuard = "sme" in { def SVZERO_MASK_ZA : SInst<"svzero_mask_za", "vi", "", MergeNone, "aarch64_sme_zero", - [IsOverloadNone, IsStreamingCompatible, IsSharedZA], + [IsOverloadNone, IsStreamingCompatible, IsInOutZA], [ImmCheck<0, ImmCheck0_255>]>; def SVZERO_ZA : SInst<"svzero_za", "v", "", MergeNone, "aarch64_sme_zero", - [IsOverloadNone, IsStreamingCompatible, IsSharedZA]>; + [IsOverloadNone, IsStreamingCompatible, IsOutZA]>; } //////////////////////////////////////////////////////////////////////////////// @@ -143,7 +153,7 @@ multiclass ZACount { let TargetGuard = "sme" in { def NAME : SInst<"sv" # n_suffix, "nv", "", MergeNone, "aarch64_sme_" # n_suffix, - [IsOverloadNone, IsStreamingCompatible, IsPreservesZA]>; + [IsOverloadNone, IsStreamingCompatible]>; } } @@ -158,13 +168,13 @@ defm SVCNTSD : ZACount<"cntsd">; multiclass ZAAdd { let TargetGuard = "sme" in { def NAME # _ZA32: SInst<"sv" # 
n_suffix # "_za32[_{d}]", "viPPd", "iUi", MergeOp1, - "aarch64_sme_" # n_suffix, [IsStreaming, IsSharedZA], + "aarch64_sme_" # n_suffix, [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; } let TargetGuard = "sme-i16i64" in { def NAME # _ZA64: SInst<"sv" # n_suffix # "_za64[_{d}]", "viPPd", "lUl", MergeOp1, - "aarch64_sme_" # n_suffix, [IsStreaming, IsSharedZA], + "aarch64_sme_" # n_suffix, [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_7>]>; } } @@ -180,7 +190,7 @@ multiclass ZAIntOuterProd { def NAME # _ZA32_B: SInst<"sv" # n_suffix2 # "_za32[_{d}]", "viPPdd", !cond(!eq(n_suffix1, "s") : "", true: "U") # "c", MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; } @@ -188,7 +198,7 @@ multiclass ZAIntOuterProd { def NAME # _ZA64_H: SInst<"sv" # n_suffix2 # "_za64[_{d}]", "viPPdd", !cond(!eq(n_suffix1, "s") : "", true: "U") # "s", MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_7>]>; } } @@ -207,7 +217,7 @@ multiclass ZAIntOuterProdMixedSigns { "viPPd" # !cond(!eq(n_suffix1, "su") : "u", true: "x"), !cond(!eq(n_suffix1, "su") : "", true: "U") # "c", MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; } @@ -216,7 +226,7 @@ multiclass ZAIntOuterProdMixedSigns { "viPPd" # !cond(!eq(n_suffix1, "su") : "u", true: "x"), !cond(!eq(n_suffix1, "su") : "", true: "U") # "s", MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_7>]>; } } @@ -233,27 +243,62 @@ multiclass ZAFPOuterProd { let TargetGuard = "sme" in { def NAME # _ZA32_B: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "h", MergeOp1, "aarch64_sme_" # n_suffix # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; def NAME # _ZA32_H: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "b", MergeOp1, "aarch64_sme_" # n_suffix # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; def NAME # _ZA32_S: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "f", MergeOp1, "aarch64_sme_" # n_suffix, - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; } let TargetGuard = "sme-f64f64" in { def NAME # _ZA64_D: SInst<"sv" # n_suffix # "_za64[_{d}]", "viPPdd", "d", MergeOp1, "aarch64_sme_" # n_suffix, - [IsStreaming, IsSharedZA], - [ImmCheck<0, ImmCheck0_3>]>; + [IsStreaming, IsInOutZA], + [ImmCheck<0, ImmCheck0_7>]>; } } defm SVMOPA : ZAFPOuterProd<"mopa">; defm SVMOPS : ZAFPOuterProd<"mops">; + +//////////////////////////////////////////////////////////////////////////////// +// SME2 - ADD, SUB + +multiclass ZAAddSub { + let TargetGuard = "sme2" in { + def NAME # _WRITE_SINGLE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x2", "vm2d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_SINGLE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x4", "vm4d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsInOutZA], []>; + + def NAME # _WRITE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x2", "vm22", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # 
_WRITE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x4", "vm44", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsInOutZA], []>; + + def NAME # _ZA32_VG1x2_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x2", "vm2", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x4", "vm4", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x4", [IsStreaming, IsInOutZA], []>; + } + + let TargetGuard = "sme2,sme-i16i64" in { + def NAME # _WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x2", "vm2d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x4", "vm4d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsInOutZA], []>; + + def NAME # _WRITE_ZA64_VG1x2_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x2", "vm22", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_ZA64_VG1x4_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x4", "vm44", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsInOutZA], []>; + + def NAME # _ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; + } + + let TargetGuard = "sme2,sme-f64f64" in { + def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; + } +} + +defm SVADD : ZAAddSub<"add">; +defm SVSUB : ZAAddSub<"sub">; diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 894a0a1296b047332e61438b2e0c296c647b9316..ef013977ffdc344a7851375a7e1a1ed4cd02e31b 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -19,27 +19,27 @@ include "arm_sve_sme_incl.td" // Loads // Load one vector (scalar base) -def SVLD1 : MInst<"svld1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; -def SVLD1SB : MInst<"svld1sb_{d}", "dPS", "silUsUiUl", [IsLoad], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1UB : MInst<"svld1ub_{d}", "dPW", "silUsUiUl", [IsLoad, IsZExtReturn], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1SH : MInst<"svld1sh_{d}", "dPT", "ilUiUl", [IsLoad], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1UH : MInst<"svld1uh_{d}", "dPX", "ilUiUl", [IsLoad, IsZExtReturn], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1SW : MInst<"svld1sw_{d}", "dPU", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1">; -def SVLD1UW : MInst<"svld1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1 : MInst<"svld1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; +def SVLD1SB : MInst<"svld1sb_{d}", "dPS", "silUsUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt8, 
"aarch64_sve_ld1">; +def SVLD1UB : MInst<"svld1ub_{d}", "dPW", "silUsUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1SH : MInst<"svld1sh_{d}", "dPT", "ilUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1UH : MInst<"svld1uh_{d}", "dPX", "ilUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1SW : MInst<"svld1sw_{d}", "dPU", "lUl", [IsLoad, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1UW : MInst<"svld1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; let TargetGuard = "sve,bf16" in { - def SVLD1_BF : MInst<"svld1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; - def SVLD1_VNUM_BF : MInst<"svld1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; + def SVLD1_BF : MInst<"svld1[_{2}]", "dPc", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; + def SVLD1_VNUM_BF : MInst<"svld1_vnum[_{2}]", "dPcl", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; } // Load one vector (scalar base, VL displacement) -def SVLD1_VNUM : MInst<"svld1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">; -def SVLD1SB_VNUM : MInst<"svld1sb_vnum_{d}", "dPSl", "silUsUiUl", [IsLoad], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1UB_VNUM : MInst<"svld1ub_vnum_{d}", "dPWl", "silUsUiUl", [IsLoad, IsZExtReturn], MemEltTyInt8, "aarch64_sve_ld1">; -def SVLD1SH_VNUM : MInst<"svld1sh_vnum_{d}", "dPTl", "ilUiUl", [IsLoad], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1UH_VNUM : MInst<"svld1uh_vnum_{d}", "dPXl", "ilUiUl", [IsLoad, IsZExtReturn], MemEltTyInt16, "aarch64_sve_ld1">; -def SVLD1SW_VNUM : MInst<"svld1sw_vnum_{d}", "dPUl", "lUl", [IsLoad], MemEltTyInt32, "aarch64_sve_ld1">; -def SVLD1UW_VNUM : MInst<"svld1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1_VNUM : MInst<"svld1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ld1">; +def SVLD1SB_VNUM : MInst<"svld1sb_vnum_{d}", "dPSl", "silUsUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1UB_VNUM : MInst<"svld1ub_vnum_{d}", "dPWl", "silUsUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_ld1">; +def SVLD1SH_VNUM : MInst<"svld1sh_vnum_{d}", "dPTl", "ilUiUl", [IsLoad, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1UH_VNUM : MInst<"svld1uh_vnum_{d}", "dPXl", "ilUiUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_ld1">; +def SVLD1SW_VNUM : MInst<"svld1sw_vnum_{d}", "dPUl", "lUl", [IsLoad, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; +def SVLD1UW_VNUM : MInst<"svld1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_ld1">; // Load one vector (vector base) def SVLD1_GATHER_BASES_U : MInst<"svld1_gather[_{2}base]_{d}", "dPu", "ilUiUlfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1_gather_scalar_offset">; @@ -243,27 +243,27 @@ let TargetGuard = "sve,bf16" in { } // Load one vector, unextended load, non-temporal (scalar base) -def SVLDNT1 : MInst<"svldnt1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; +def SVLDNT1 : MInst<"svldnt1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; // 
Load one vector, unextended load, non-temporal (scalar base, VL displacement) -def SVLDNT1_VNUM : MInst<"svldnt1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; +def SVLDNT1_VNUM : MInst<"svldnt1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; let TargetGuard = "sve,bf16" in { - def SVLDNT1_BF : MInst<"svldnt1[_{2}]", "dPc", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; - def SVLDNT1_VNUM_BF : MInst<"svldnt1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">; + def SVLDNT1_BF : MInst<"svldnt1[_{2}]", "dPc", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; + def SVLDNT1_VNUM_BF : MInst<"svldnt1_vnum[_{2}]", "dPcl", "b", [IsLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_ldnt1">; } // Load one quadword and replicate (scalar base) -def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1rq">; +def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1rq", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVLD1RQ_BF : SInst<"svld1rq[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1rq">; + def SVLD1RQ_BF : SInst<"svld1rq[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1rq", [IsStreamingCompatible]>; } multiclass StructLoad { - def : SInst; + def : SInst; let TargetGuard = "sve,bf16" in { - def: SInst; + def: SInst; } } @@ -286,42 +286,42 @@ let TargetGuard = "sve,f64mm,bf16" in { } let TargetGuard = "sve,bf16" in { - def SVBFDOT : SInst<"svbfdot[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone]>; - def SVBFMLALB : SInst<"svbfmlalb[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone]>; - def SVBFMLALT : SInst<"svbfmlalt[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone]>; - def SVBFMMLA : SInst<"svbfmmla[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmmla", [IsOverloadNone]>; - def SVBFDOT_N : SInst<"svbfdot[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone]>; - def SVBFMLAL_N : SInst<"svbfmlalb[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone]>; - def SVBFMLALT_N : SInst<"svbfmlalt[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone]>; - def SVBFDOT_LANE : SInst<"svbfdot_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfdot_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>; - def SVBFMLALB_LANE : SInst<"svbfmlalb_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalb_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>; - def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalt_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>; + def SVBFDOT : SInst<"svbfdot[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMLALB : SInst<"svbfmlalb[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalb", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMLALT : SInst<"svbfmlalt[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMMLA : SInst<"svbfmmla[_{0}]", "MMdd", "b", MergeNone, "aarch64_sve_bfmmla", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFDOT_N : SInst<"svbfdot[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfdot", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFMLAL_N : SInst<"svbfmlalb[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalb", 
[IsOverloadNone, IsStreamingCompatible]>; + def SVBFMLALT_N : SInst<"svbfmlalt[_n_{0}]", "MMda", "b", MergeNone, "aarch64_sve_bfmlalt", [IsOverloadNone, IsStreamingCompatible]>; + def SVBFDOT_LANE : SInst<"svbfdot_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfdot_lane_v2", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_3>]>; + def SVBFMLALB_LANE : SInst<"svbfmlalb_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalb_lane_v2", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; + def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalt_lane_v2", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<3, ImmCheck0_7>]>; } //////////////////////////////////////////////////////////////////////////////// // Stores // Store one vector (scalar base) -def SVST1 : MInst<"svst1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; -def SVST1B_S : MInst<"svst1b[_{d}]", "vPAd", "sil", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1B_U : MInst<"svst1b[_{d}]", "vPEd", "UsUiUl", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1H_S : MInst<"svst1h[_{d}]", "vPBd", "il", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1H_U : MInst<"svst1h[_{d}]", "vPFd", "UiUl", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1W_S : MInst<"svst1w[_{d}]", "vPCd", "l", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; -def SVST1W_U : MInst<"svst1w[_{d}]", "vPGd", "Ul", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1 : MInst<"svst1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; +def SVST1B_S : MInst<"svst1b[_{d}]", "vPAd", "sil", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1B_U : MInst<"svst1b[_{d}]", "vPEd", "UsUiUl", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1H_S : MInst<"svst1h[_{d}]", "vPBd", "il", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1H_U : MInst<"svst1h[_{d}]", "vPFd", "UiUl", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1W_S : MInst<"svst1w[_{d}]", "vPCd", "l", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1W_U : MInst<"svst1w[_{d}]", "vPGd", "Ul", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; // Store one vector (scalar base, VL displacement) -def SVST1_VNUM : MInst<"svst1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; -def SVST1B_VNUM_S : MInst<"svst1b_vnum[_{d}]", "vPAld", "sil", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1B_VNUM_U : MInst<"svst1b_vnum[_{d}]", "vPEld", "UsUiUl", [IsStore], MemEltTyInt8, "aarch64_sve_st1">; -def SVST1H_VNUM_S : MInst<"svst1h_vnum[_{d}]", "vPBld", "il", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1H_VNUM_U : MInst<"svst1h_vnum[_{d}]", "vPFld", "UiUl", [IsStore], MemEltTyInt16, "aarch64_sve_st1">; -def SVST1W_VNUM_S : MInst<"svst1w_vnum[_{d}]", "vPCld", "l", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; -def SVST1W_VNUM_U : MInst<"svst1w_vnum[_{d}]", "vPGld", "Ul", [IsStore], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1_VNUM : MInst<"svst1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; +def SVST1B_VNUM_S : MInst<"svst1b_vnum[_{d}]", "vPAld", "sil", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1B_VNUM_U : MInst<"svst1b_vnum[_{d}]", 
"vPEld", "UsUiUl", [IsStore, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_st1">; +def SVST1H_VNUM_S : MInst<"svst1h_vnum[_{d}]", "vPBld", "il", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1H_VNUM_U : MInst<"svst1h_vnum[_{d}]", "vPFld", "UiUl", [IsStore, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_st1">; +def SVST1W_VNUM_S : MInst<"svst1w_vnum[_{d}]", "vPCld", "l", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; +def SVST1W_VNUM_U : MInst<"svst1w_vnum[_{d}]", "vPGld", "Ul", [IsStore, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_st1">; let TargetGuard = "sve,bf16" in { - def SVST1_BF : MInst<"svst1[_{d}]", "vPpd", "b", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; - def SVST1_VNUM_BF : MInst<"svst1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_st1">; + def SVST1_BF : MInst<"svst1[_{d}]", "vPpd", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; + def SVST1_VNUM_BF : MInst<"svst1_vnum[_{d}]", "vPpld", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_st1">; } // Store one vector (vector base) @@ -394,9 +394,9 @@ def SVST1H_SCATTER_INDEX_S : MInst<"svst1h_scatter[_{2}base]_index[_{d}]", "v def SVST1W_SCATTER_INDEX_S : MInst<"svst1w_scatter[_{2}base]_index[_{d}]", "vPuld", "lUl", [IsScatterStore], MemEltTyInt32, "aarch64_sve_st1_scatter_scalar_offset">; multiclass StructStore { - def : SInst; + def : SInst; let TargetGuard = "sve,bf16" in { - def: SInst; + def: SInst; } } // Store N vectors into N-element structure (scalar base) @@ -410,30 +410,30 @@ defm SVST3_VNUM : StructStore<"svst3_vnum[_{d}]", "vPpl3", "aarch64_sve_st3">; defm SVST4_VNUM : StructStore<"svst4_vnum[_{d}]", "vPpl4", "aarch64_sve_st4">; // Store one vector, with no truncation, non-temporal (scalar base) -def SVSTNT1 : MInst<"svstnt1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; +def SVSTNT1 : MInst<"svstnt1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; // Store one vector, with no truncation, non-temporal (scalar base, VL displacement) -def SVSTNT1_VNUM : MInst<"svstnt1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; +def SVSTNT1_VNUM : MInst<"svstnt1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; let TargetGuard = "sve,bf16" in { - def SVSTNT1_BF : MInst<"svstnt1[_{d}]", "vPpd", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; - def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">; + def SVSTNT1_BF : MInst<"svstnt1[_{d}]", "vPpd", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; + def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_stnt1">; } //////////////////////////////////////////////////////////////////////////////// // Prefetches // Prefetch (Scalar base) -def SVPRFB : MInst<"svprfb", "vPQJ", "c", [IsPrefetch], MemEltTyInt8, "aarch64_sve_prf">; -def SVPRFH : MInst<"svprfh", "vPQJ", "s", [IsPrefetch], MemEltTyInt16, "aarch64_sve_prf">; -def SVPRFW : MInst<"svprfw", "vPQJ", "i", [IsPrefetch], MemEltTyInt32, "aarch64_sve_prf">; -def SVPRFD : MInst<"svprfd", "vPQJ", "l", [IsPrefetch], MemEltTyInt64, "aarch64_sve_prf">; +def SVPRFB : MInst<"svprfb", "vPQJ", "c", [IsPrefetch, IsStreamingCompatible], 
MemEltTyInt8, "aarch64_sve_prf">; +def SVPRFH : MInst<"svprfh", "vPQJ", "s", [IsPrefetch, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_prf">; +def SVPRFW : MInst<"svprfw", "vPQJ", "i", [IsPrefetch, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_prf">; +def SVPRFD : MInst<"svprfd", "vPQJ", "l", [IsPrefetch, IsStreamingCompatible], MemEltTyInt64, "aarch64_sve_prf">; // Prefetch (Scalar base, VL displacement) -def SVPRFB_VNUM : MInst<"svprfb_vnum", "vPQlJ", "c", [IsPrefetch], MemEltTyInt8, "aarch64_sve_prf">; -def SVPRFH_VNUM : MInst<"svprfh_vnum", "vPQlJ", "s", [IsPrefetch], MemEltTyInt16, "aarch64_sve_prf">; -def SVPRFW_VNUM : MInst<"svprfw_vnum", "vPQlJ", "i", [IsPrefetch], MemEltTyInt32, "aarch64_sve_prf">; -def SVPRFD_VNUM : MInst<"svprfd_vnum", "vPQlJ", "l", [IsPrefetch], MemEltTyInt64, "aarch64_sve_prf">; +def SVPRFB_VNUM : MInst<"svprfb_vnum", "vPQlJ", "c", [IsPrefetch, IsStreamingCompatible], MemEltTyInt8, "aarch64_sve_prf">; +def SVPRFH_VNUM : MInst<"svprfh_vnum", "vPQlJ", "s", [IsPrefetch, IsStreamingCompatible], MemEltTyInt16, "aarch64_sve_prf">; +def SVPRFW_VNUM : MInst<"svprfw_vnum", "vPQlJ", "i", [IsPrefetch, IsStreamingCompatible], MemEltTyInt32, "aarch64_sve_prf">; +def SVPRFD_VNUM : MInst<"svprfd_vnum", "vPQlJ", "l", [IsPrefetch, IsStreamingCompatible], MemEltTyInt64, "aarch64_sve_prf">; // Prefetch (Vector bases) def SVPRFB_GATHER_BASES : MInst<"svprfb_gather[_{2}base]", "vPdJ", "UiUl", [IsGatherPrefetch], MemEltTyInt8, "aarch64_sve_prfb_gather_scalar_offset">; @@ -488,9 +488,9 @@ def SVDUPQ_32 : SInst<"svdupq[_n]_{d}", "dssss", "iUif", MergeNone>; def SVDUPQ_64 : SInst<"svdupq[_n]_{d}", "dss", "lUld", MergeNone>; multiclass svdup_base { - def NAME : SInst; + def NAME : SInst; let TargetGuard = "sve,bf16" in { - def _BF16: SInst; + def _BF16: SInst; } } @@ -499,14 +499,14 @@ defm SVDUP_M : svdup_base<"svdup[_n]_{d}", "ddPs", MergeOp1, "aarch64_sve_du defm SVDUP_X : svdup_base<"svdup[_n]_{d}", "dPs", MergeAnyExp, "aarch64_sve_dup">; defm SVDUP_Z : svdup_base<"svdup[_n]_{d}", "dPs", MergeZeroExp, "aarch64_sve_dup">; -def SVINDEX : SInst<"svindex_{d}", "dss", "csilUcUsUiUl", MergeNone, "aarch64_sve_index">; +def SVINDEX : SInst<"svindex_{d}", "dss", "csilUcUsUiUl", MergeNone, "aarch64_sve_index", [IsStreamingCompatible]>; // Integer arithmetic -multiclass SInstZPZ flags=[]> { - def _M : SInst; - def _X : SInst; - def _Z : SInst; +multiclass SInstZPZ { + def _M : SInst; + def _X : SInst; + def _Z : SInst; } defm SVABS : SInstZPZ<"svabs", "csil", "aarch64_sve_abs">; @@ -515,13 +515,13 @@ defm SVNEG : SInstZPZ<"svneg", "csil", "aarch64_sve_neg">; //------------------------------------------------------------------------------ multiclass SInstZPZZ flags=[]> { - def _M : SInst; - def _X : SInst; - def _Z : SInst; + def _M : SInst; + def _X : SInst; + def _Z : SInst; - def _N_M : SInst; - def _N_X : SInst; - def _N_Z : SInst; + def _N_M : SInst; + def _N_X : SInst; + def _N_Z : SInst; } defm SVABD_S : SInstZPZZ<"svabd", "csil", "aarch64_sve_sabd", "aarch64_sve_sabd_u">; @@ -553,26 +553,26 @@ multiclass SInstZPZZZ; } -defm SVMAD : SInstZPZZZ<"svmad", "csilUcUsUiUl", "aarch64_sve_mad", "aarch64_sve_mla_u", [ReverseMergeAnyAccOp]>; -defm SVMLA : SInstZPZZZ<"svmla", "csilUcUsUiUl", "aarch64_sve_mla", "aarch64_sve_mla_u">; -defm SVMLS : SInstZPZZZ<"svmls", "csilUcUsUiUl", "aarch64_sve_mls", "aarch64_sve_mls_u">; -defm SVMSB : SInstZPZZZ<"svmsb", "csilUcUsUiUl", "aarch64_sve_msb", "aarch64_sve_mls_u", [ReverseMergeAnyAccOp]>; +defm SVMAD : SInstZPZZZ<"svmad", "csilUcUsUiUl", 
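// The _M/_X/_Z records expand to the ACLE _m/_x/_z builtin variants: _m takes
// inactive lanes from an extra "inactive" operand, _z zeroes them, and _x
// leaves them unspecified (cheapest when the caller does not care). A short
// hedged illustration:
//
//   svint32_t abs_m(svbool_t pg, svint32_t inactive, svint32_t x)
//       __arm_streaming_compatible {
//     return svabs_s32_m(inactive, pg, x);  // inactive lanes from 'inactive'
//   }
//   svint32_t abs_z(svbool_t pg, svint32_t x) __arm_streaming_compatible {
//     return svabs_s32_z(pg, x);            // inactive lanes zeroed
//   }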
"aarch64_sve_mad", "aarch64_sve_mla_u", [ReverseMergeAnyAccOp, IsStreamingCompatible]>; +defm SVMLA : SInstZPZZZ<"svmla", "csilUcUsUiUl", "aarch64_sve_mla", "aarch64_sve_mla_u", [IsStreamingCompatible]>; +defm SVMLS : SInstZPZZZ<"svmls", "csilUcUsUiUl", "aarch64_sve_mls", "aarch64_sve_mls_u", [IsStreamingCompatible]>; +defm SVMSB : SInstZPZZZ<"svmsb", "csilUcUsUiUl", "aarch64_sve_msb", "aarch64_sve_mls_u", [ReverseMergeAnyAccOp, IsStreamingCompatible]>; //------------------------------------------------------------------------------ -def SVDOT_S : SInst<"svdot[_{0}]", "ddqq", "il", MergeNone, "aarch64_sve_sdot">; -def SVDOT_U : SInst<"svdot[_{0}]", "ddqq", "UiUl", MergeNone, "aarch64_sve_udot">; -def SVQADD_S : SInst<"svqadd[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqadd_x">; -def SVQADD_U : SInst<"svqadd[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x">; -def SVQSUB_S : SInst<"svqsub[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqsub_x">; -def SVQSUB_U : SInst<"svqsub[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x">; +def SVDOT_S : SInst<"svdot[_{0}]", "ddqq", "il", MergeNone, "aarch64_sve_sdot", [IsStreamingCompatible]>; +def SVDOT_U : SInst<"svdot[_{0}]", "ddqq", "UiUl", MergeNone, "aarch64_sve_udot", [IsStreamingCompatible]>; +def SVQADD_S : SInst<"svqadd[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqadd_x", [IsStreamingCompatible]>; +def SVQADD_U : SInst<"svqadd[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x", [IsStreamingCompatible]>; +def SVQSUB_S : SInst<"svqsub[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqsub_x", [IsStreamingCompatible]>; +def SVQSUB_U : SInst<"svqsub[_{d}]", "ddd", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x", [IsStreamingCompatible]>; -def SVDOT_N_S : SInst<"svdot[_n_{0}]", "ddqr", "il", MergeNone, "aarch64_sve_sdot">; -def SVDOT_N_U : SInst<"svdot[_n_{0}]", "ddqr", "UiUl", MergeNone, "aarch64_sve_udot">; -def SVQADD_N_S : SInst<"svqadd[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqadd_x">; -def SVQADD_N_U : SInst<"svqadd[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x">; -def SVQSUB_N_S : SInst<"svqsub[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqsub_x">; -def SVQSUB_N_U : SInst<"svqsub[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x">; +def SVDOT_N_S : SInst<"svdot[_n_{0}]", "ddqr", "il", MergeNone, "aarch64_sve_sdot", [IsStreamingCompatible]>; +def SVDOT_N_U : SInst<"svdot[_n_{0}]", "ddqr", "UiUl", MergeNone, "aarch64_sve_udot", [IsStreamingCompatible]>; +def SVQADD_N_S : SInst<"svqadd[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqadd_x", [IsStreamingCompatible]>; +def SVQADD_N_U : SInst<"svqadd[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqadd_x", [IsStreamingCompatible]>; +def SVQSUB_N_S : SInst<"svqsub[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqsub_x", [IsStreamingCompatible]>; +def SVQSUB_N_U : SInst<"svqsub[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x", [IsStreamingCompatible]>; def SVDOT_LANE_S : SInst<"svdot_lane[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_sdot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; def SVDOT_LANE_U : SInst<"svdot_lane[_{d}]", "ddqqi", "UiUl", MergeNone, "aarch64_sve_udot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; @@ -592,107 +592,107 @@ defm SVNOT : SInstZPZ<"svnot", "csilUcUsUiUl", "aarch64_sve_not">; // Shifts multiclass SInst_SHIFT { - def _M : SInst; - def _X : SInst; - def _Z : SInst; + def _M : SInst; + def _X : SInst; + def _Z : SInst; - def _N_M : SInst; - def _N_X : SInst; - def 
_N_Z : SInst; + def _N_M : SInst; + def _N_X : SInst; + def _N_Z : SInst; - def _WIDE_M : SInst; - def _WIDE_X : SInst; - def _WIDE_Z : SInst; + def _WIDE_M : SInst; + def _WIDE_X : SInst; + def _WIDE_Z : SInst; - def _WIDE_N_M : SInst; - def _WIDE_N_X : SInst; - def _WIDE_N_Z : SInst; + def _WIDE_N_M : SInst; + def _WIDE_N_X : SInst; + def _WIDE_N_Z : SInst; } defm SVASR : SInst_SHIFT<"svasr", "aarch64_sve_asr", "csil", "csi">; defm SVLSL : SInst_SHIFT<"svlsl", "aarch64_sve_lsl", "csilUcUsUiUl", "csiUcUsUi">; defm SVLSR : SInst_SHIFT<"svlsr", "aarch64_sve_lsr", "UcUsUiUl", "UcUsUi">; -def SVASRD_M : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVASRD_X : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVASRD_Z : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_asrd", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVASRD_M : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_asrd", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVASRD_X : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_asrd", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVASRD_Z : SInst<"svasrd[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_asrd", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVINSR : SInst<"svinsr[_n_{d}]", "dds", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_insr">; +def SVINSR : SInst<"svinsr[_n_{d}]", "dds", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_insr", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVINSR_BF16 : SInst<"svinsr[_n_{d}]", "dds", "b", MergeNone, "aarch64_sve_insr">; + def SVINSR_BF16 : SInst<"svinsr[_n_{d}]", "dds", "b", MergeNone, "aarch64_sve_insr", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // Integer reductions -def SVADDV_S : SInst<"svaddv[_{d}]", "lPd", "csil", MergeNone, "aarch64_sve_saddv">; -def SVADDV_U : SInst<"svaddv[_{d}]", "nPd", "UcUsUiUl", MergeNone, "aarch64_sve_uaddv">; -def SVANDV : SInst<"svandv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_andv">; -def SVEORV : SInst<"sveorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorv">; -def SVMAXV_S : SInst<"svmaxv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_smaxv">; -def SVMAXV_U : SInst<"svmaxv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_umaxv">; -def SVMINV_S : SInst<"svminv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_sminv">; -def SVMINV_U : SInst<"svminv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_uminv">; -def SVORV : SInst<"svorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_orv">; +def SVADDV_S : SInst<"svaddv[_{d}]", "lPd", "csil", MergeNone, "aarch64_sve_saddv", [IsStreamingCompatible]>; +def SVADDV_U : SInst<"svaddv[_{d}]", "nPd", "UcUsUiUl", MergeNone, "aarch64_sve_uaddv", [IsStreamingCompatible]>; +def SVANDV : SInst<"svandv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_andv", [IsStreamingCompatible]>; +def SVEORV : SInst<"sveorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorv", [IsStreamingCompatible]>; +def SVMAXV_S : SInst<"svmaxv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_smaxv", [IsStreamingCompatible]>; +def SVMAXV_U : SInst<"svmaxv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_umaxv", [IsStreamingCompatible]>; +def SVMINV_S : SInst<"svminv[_{d}]", "sPd", "csil", MergeNone, "aarch64_sve_sminv", 
[IsStreamingCompatible]>; +def SVMINV_U : SInst<"svminv[_{d}]", "sPd", "UcUsUiUl", MergeNone, "aarch64_sve_uminv", [IsStreamingCompatible]>; +def SVORV : SInst<"svorv[_{d}]", "sPd", "csilUcUsUiUl", MergeNone, "aarch64_sve_orv", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Integer comparisons -def SVCMPEQ : SInst<"svcmpeq[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq">; -def SVCMPNE : SInst<"svcmpne[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne">; -def SVCMPGE : SInst<"svcmpge[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge">; -def SVCMPGT : SInst<"svcmpgt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt">; -def SVCMPLE : SInst<"svcmple[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare]>; -def SVCMPLT : SInst<"svcmplt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare]>; -def SVCMPHI : SInst<"svcmpgt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi">; -def SVCMPHS : SInst<"svcmpge[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs">; -def SVCMPLO : SInst<"svcmplt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare]>; -def SVCMPLS : SInst<"svcmple[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare]>; - -def SVCMPEQ_N : SInst<"svcmpeq[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq">; -def SVCMPNE_N : SInst<"svcmpne[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne">; -def SVCMPGE_N : SInst<"svcmpge[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge">; -def SVCMPGT_N : SInst<"svcmpgt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt">; -def SVCMPLE_N : SInst<"svcmple[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare]>; -def SVCMPLT_N : SInst<"svcmplt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare]>; -def SVCMPHS_N : SInst<"svcmpge[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs">; -def SVCMPHI_N : SInst<"svcmpgt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi">; -def SVCMPLS_N : SInst<"svcmple[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare]>; -def SVCMPLO_N : SInst<"svcmplt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare]>; - -def SVCMPEQ_WIDE : SInst<"svcmpeq_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpeq_wide">; -def SVCMPNE_WIDE : SInst<"svcmpne_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpne_wide">; -def SVCMPGE_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpge_wide">; -def SVCMPGT_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpgt_wide">; -def SVCMPLE_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmple_wide">; -def SVCMPLT_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmplt_wide">; -def SVCMPHI_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide">; -def SVCMPHS_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide">; -def SVCMPLO_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide">; -def SVCMPLS_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide">; - -def SVCMPEQ_WIDE_N : SInst<"svcmpeq_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpeq_wide">; -def SVCMPNE_WIDE_N : SInst<"svcmpne_wide[_n_{d}]", "PPdj", "csi", 
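// Note how SVCMPLE/SVCMPLT (and the unsigned LO/LS forms) reuse the "greater"
// intrinsics together with [ReverseCompare]: there is no separate LE/LT
// intrinsic, so the emitter swaps the operands. What the user writes is
// unchanged; a hedged example:
//
//   svbool_t le(svbool_t pg, svint32_t a, svint32_t b)
//       __arm_streaming_compatible {
//     return svcmple_s32(pg, a, b);  // emitted as svcmpge with a/b swapped
//   }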
MergeNone, "aarch64_sve_cmpne_wide">; -def SVCMPGE_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpge_wide">; -def SVCMPGT_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpgt_wide">; -def SVCMPLE_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmple_wide">; -def SVCMPLT_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmplt_wide">; -def SVCMPHS_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide">; -def SVCMPHI_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide">; -def SVCMPLO_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide">; -def SVCMPLS_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide">; +def SVCMPEQ : SInst<"svcmpeq[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq", [IsStreamingCompatible]>; +def SVCMPNE : SInst<"svcmpne[_{d}]", "PPdd", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne", [IsStreamingCompatible]>; +def SVCMPGE : SInst<"svcmpge[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge", [IsStreamingCompatible]>; +def SVCMPGT : SInst<"svcmpgt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt", [IsStreamingCompatible]>; +def SVCMPLE : SInst<"svcmple[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT : SInst<"svcmplt[_{d}]", "PPdd", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPHI : SInst<"svcmpgt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [IsStreamingCompatible]>; +def SVCMPHS : SInst<"svcmpge[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [IsStreamingCompatible]>; +def SVCMPLO : SInst<"svcmplt[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLS : SInst<"svcmple[_{d}]", "PPdd", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare, IsStreamingCompatible]>; + +def SVCMPEQ_N : SInst<"svcmpeq[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpeq", [IsStreamingCompatible]>; +def SVCMPNE_N : SInst<"svcmpne[_n_{d}]", "PPda", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmpne", [IsStreamingCompatible]>; +def SVCMPGE_N : SInst<"svcmpge[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge", [IsStreamingCompatible]>; +def SVCMPGT_N : SInst<"svcmpgt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt", [IsStreamingCompatible]>; +def SVCMPLE_N : SInst<"svcmple[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT_N : SInst<"svcmplt[_n_{d}]", "PPda", "csil", MergeNone, "aarch64_sve_cmpgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPHS_N : SInst<"svcmpge[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [IsStreamingCompatible]>; +def SVCMPHI_N : SInst<"svcmpgt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [IsStreamingCompatible]>; +def SVCMPLS_N : SInst<"svcmple[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphs", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLO_N : SInst<"svcmplt[_n_{d}]", "PPda", "UcUsUiUl", MergeNone, "aarch64_sve_cmphi", [ReverseCompare, IsStreamingCompatible]>; + +def SVCMPEQ_WIDE : SInst<"svcmpeq_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpeq_wide", [IsStreamingCompatible]>; +def SVCMPNE_WIDE : SInst<"svcmpne_wide[_{d}]", "PPdw", "csi", 
MergeNone, "aarch64_sve_cmpne_wide", [IsStreamingCompatible]>; +def SVCMPGE_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpge_wide", [IsStreamingCompatible]>; +def SVCMPGT_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmpgt_wide", [IsStreamingCompatible]>; +def SVCMPLE_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmple_wide", [IsStreamingCompatible]>; +def SVCMPLT_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "csi", MergeNone, "aarch64_sve_cmplt_wide", [IsStreamingCompatible]>; +def SVCMPHI_WIDE : SInst<"svcmpgt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide", [IsStreamingCompatible]>; +def SVCMPHS_WIDE : SInst<"svcmpge_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide", [IsStreamingCompatible]>; +def SVCMPLO_WIDE : SInst<"svcmplt_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide", [IsStreamingCompatible]>; +def SVCMPLS_WIDE : SInst<"svcmple_wide[_{d}]", "PPdw", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide", [IsStreamingCompatible]>; + +def SVCMPEQ_WIDE_N : SInst<"svcmpeq_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpeq_wide", [IsStreamingCompatible]>; +def SVCMPNE_WIDE_N : SInst<"svcmpne_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpne_wide", [IsStreamingCompatible]>; +def SVCMPGE_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpge_wide", [IsStreamingCompatible]>; +def SVCMPGT_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmpgt_wide", [IsStreamingCompatible]>; +def SVCMPLE_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmple_wide", [IsStreamingCompatible]>; +def SVCMPLT_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "csi", MergeNone, "aarch64_sve_cmplt_wide", [IsStreamingCompatible]>; +def SVCMPHS_WIDE_N : SInst<"svcmpge_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphs_wide", [IsStreamingCompatible]>; +def SVCMPHI_WIDE_N : SInst<"svcmpgt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmphi_wide", [IsStreamingCompatible]>; +def SVCMPLO_WIDE_N : SInst<"svcmplt_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmplo_wide", [IsStreamingCompatible]>; +def SVCMPLS_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, "aarch64_sve_cmpls_wide", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // While comparisons -def SVWHILELE_S32 : SInst<"svwhilele_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile]>; -def SVWHILELE_S64 : SInst<"svwhilele_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile]>; -def SVWHILELO_U32 : SInst<"svwhilelt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile]>; -def SVWHILELO_U64 : SInst<"svwhilelt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile]>; -def SVWHILELS_U32 : SInst<"svwhilele_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile]>; -def SVWHILELS_U64 : SInst<"svwhilele_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile]>; -def SVWHILELT_S32 : SInst<"svwhilelt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile]>; -def SVWHILELT_S64 : SInst<"svwhilelt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile]>; +def SVWHILELE_S32 : SInst<"svwhilele_{d}[_{1}]", 
"Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELE_S64 : SInst<"svwhilele_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELO_U32 : SInst<"svwhilelt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELO_U64 : SInst<"svwhilelt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELS_U32 : SInst<"svwhilele_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELS_U64 : SInst<"svwhilele_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELT_S32 : SInst<"svwhilelt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELT_S64 : SInst<"svwhilelt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile, IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Counting bit @@ -703,12 +703,12 @@ multiclass SInstCLS def _Z : SInst; } -defm SVCLS : SInstCLS<"svcls", "csil", "aarch64_sve_cls">; -defm SVCLZ : SInstCLS<"svclz", "csilUcUsUiUl", "aarch64_sve_clz">; -defm SVCNT : SInstCLS<"svcnt", "csilUcUsUiUlhfd", "aarch64_sve_cnt">; +defm SVCLS : SInstCLS<"svcls", "csil", "aarch64_sve_cls", [IsStreamingCompatible]>; +defm SVCLZ : SInstCLS<"svclz", "csilUcUsUiUl", "aarch64_sve_clz", [IsStreamingCompatible]>; +defm SVCNT : SInstCLS<"svcnt", "csilUcUsUiUlhfd", "aarch64_sve_cnt", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - defm SVCNT_BF16 : SInstCLS<"svcnt", "b", "aarch64_sve_cnt">; + defm SVCNT_BF16 : SInstCLS<"svcnt", "b", "aarch64_sve_cnt", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -763,13 +763,13 @@ def SVTMAD : SInst<"svtmad[_{d}]", "dddi", "hfd", MergeNone, "aarch64_sve_ftma def SVTSMUL : SInst<"svtsmul[_{d}]", "ddu", "hfd", MergeNone, "aarch64_sve_ftsmul_x">; def SVTSSEL : SInst<"svtssel[_{d}]", "ddu", "hfd", MergeNone, "aarch64_sve_ftssel_x">; -def SVSCALE_M : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeOp1, "aarch64_sve_fscale">; -def SVSCALE_X : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeAny, "aarch64_sve_fscale">; -def SVSCALE_Z : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeZero, "aarch64_sve_fscale">; +def SVSCALE_M : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeOp1, "aarch64_sve_fscale", [IsStreamingCompatible]>; +def SVSCALE_X : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeAny, "aarch64_sve_fscale", [IsStreamingCompatible]>; +def SVSCALE_Z : SInst<"svscale[_{d}]", "dPdx", "hfd", MergeZero, "aarch64_sve_fscale", [IsStreamingCompatible]>; -def SVSCALE_N_M : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeOp1, "aarch64_sve_fscale">; -def SVSCALE_N_X : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeAny, "aarch64_sve_fscale">; -def SVSCALE_N_Z : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeZero, "aarch64_sve_fscale">; +def SVSCALE_N_M : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeOp1, "aarch64_sve_fscale", [IsStreamingCompatible]>; +def SVSCALE_N_X : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeAny, "aarch64_sve_fscale", [IsStreamingCompatible]>; +def SVSCALE_N_Z : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeZero, "aarch64_sve_fscale", [IsStreamingCompatible]>; defm SVMAD_F 
: SInstZPZZZ<"svmad", "hfd", "aarch64_sve_fmad", "aarch64_sve_fmla_u", [ReverseMergeAnyAccOp]>; defm SVMLA_F : SInstZPZZZ<"svmla", "hfd", "aarch64_sve_fmla", "aarch64_sve_fmla_u">; @@ -780,42 +780,42 @@ defm SVNMLA_F : SInstZPZZZ<"svnmla", "hfd", "aarch64_sve_fnmla", "aarch64_sve_fn defm SVNMLS_F : SInstZPZZZ<"svnmls", "hfd", "aarch64_sve_fnmls", "aarch64_sve_fnmls_u">; defm SVNMSB_F : SInstZPZZZ<"svnmsb", "hfd", "aarch64_sve_fnmsb", "aarch64_sve_fnmls_u", [ReverseMergeAnyAccOp]>; -def SVCADD_M : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeOp1, "aarch64_sve_fcadd", [], [ImmCheck<3, ImmCheckComplexRot90_270>]>; -def SVCADD_X : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeAny, "aarch64_sve_fcadd", [], [ImmCheck<3, ImmCheckComplexRot90_270>]>; -def SVCADD_Z : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeZero, "aarch64_sve_fcadd", [], [ImmCheck<3, ImmCheckComplexRot90_270>]>; -def SVCMLA_M : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeOp1, "aarch64_sve_fcmla", [], [ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVCMLA_X : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeAny, "aarch64_sve_fcmla", [], [ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVCMLA_Z : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeZero, "aarch64_sve_fcmla", [], [ImmCheck<4, ImmCheckComplexRotAll90>]>; +def SVCADD_M : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeOp1, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>; +def SVCADD_X : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeAny, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>; +def SVCADD_Z : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeZero, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>; +def SVCMLA_M : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeOp1, "aarch64_sve_fcmla", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>]>; +def SVCMLA_X : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeAny, "aarch64_sve_fcmla", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>]>; +def SVCMLA_Z : SInst<"svcmla[_{d}]", "dPdddi", "hfd", MergeZero, "aarch64_sve_fcmla", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVCMLA_LANE : SInst<"svcmla_lane[_{d}]", "ddddii", "hf", MergeNone, "aarch64_sve_fcmla_lane", [], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, +def SVCMLA_LANE : SInst<"svcmla_lane[_{d}]", "ddddii", "hf", MergeNone, "aarch64_sve_fcmla_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVMLA_LANE : SInst<"svmla_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmla_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLS_LANE : SInst<"svmls_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmls_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMUL_LANE : SInst<"svmul_lane[_{d}]", "dddi", "hfd", MergeNone, "aarch64_sve_fmul_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMLA_LANE : SInst<"svmla_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmla_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLS_LANE : SInst<"svmls_lane[_{d}]", "ddddi", "hfd", MergeNone, "aarch64_sve_fmls_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMUL_LANE : SInst<"svmul_lane[_{d}]", "dddi", "hfd", MergeNone, "aarch64_sve_fmul_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVRECPE : SInst<"svrecpe[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frecpe_x">; -def SVRECPS : 
SInst<"svrecps[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frecps_x">; -def SVRSQRTE : SInst<"svrsqrte[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frsqrte_x">; -def SVRSQRTS : SInst<"svrsqrts[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frsqrts_x">; +def SVRECPE : SInst<"svrecpe[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frecpe_x", [IsStreamingCompatible]>; +def SVRECPS : SInst<"svrecps[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frecps_x", [IsStreamingCompatible]>; +def SVRSQRTE : SInst<"svrsqrte[_{d}]", "dd", "hfd", MergeNone, "aarch64_sve_frsqrte_x", [IsStreamingCompatible]>; +def SVRSQRTS : SInst<"svrsqrts[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frsqrts_x", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Floating-point reductions -def SVFADDA : SInst<"svadda[_{d}]", "sPsd", "hfd", MergeNone, "aarch64_sve_fadda">; -def SVFADDV : SInst<"svaddv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_faddv">; -def SVFMAXV : SInst<"svmaxv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxv">; -def SVFMAXNMV : SInst<"svmaxnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxnmv">; -def SVFMINV : SInst<"svminv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminv">; -def SVFMINNMV : SInst<"svminnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminnmv">; +def SVFADDA : SInst<"svadda[_{d}]", "sPsd", "hfd", MergeNone, "aarch64_sve_fadda", [IsStreamingCompatible]>; +def SVFADDV : SInst<"svaddv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_faddv", [IsStreamingCompatible]>; +def SVFMAXV : SInst<"svmaxv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxv", [IsStreamingCompatible]>; +def SVFMAXNMV : SInst<"svmaxnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxnmv", [IsStreamingCompatible]>; +def SVFMINV : SInst<"svminv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminv", [IsStreamingCompatible]>; +def SVFMINNMV : SInst<"svminnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fminnmv", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Floating-point comparisons -def SVACGE : SInst<"svacge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge">; -def SVACGT : SInst<"svacgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt">; -def SVACLE : SInst<"svacle[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge", [ReverseCompare]>; -def SVACLT : SInst<"svaclt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare]>; -def SVCMPUO : SInst<"svcmpuo[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpuo">; +def SVACGE : SInst<"svacge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge", [IsStreamingCompatible]>; +def SVACGT : SInst<"svacgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt", [IsStreamingCompatible]>; +def SVACLE : SInst<"svacle[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facge", [ReverseCompare, IsStreamingCompatible]>; +def SVACLT : SInst<"svaclt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPUO : SInst<"svcmpuo[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpuo", [IsStreamingCompatible]>; def SVACGE_N : SInst<"svacge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facge">; def SVACGT_N : SInst<"svacgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt">; @@ -823,19 +823,19 @@ def SVACLE_N : SInst<"svacle[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_ def SVACLT_N : SInst<"svaclt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare]>; def SVCMPUO_N : 
SInst<"svcmpuo[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpuo">; -def SVCMPEQ_F : SInst<"svcmpeq[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpeq">; -def SVCMPNE_F : SInst<"svcmpne[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpne">; -def SVCMPGE_F : SInst<"svcmpge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge">; -def SVCMPGT_F : SInst<"svcmpgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt">; -def SVCMPLE_F : SInst<"svcmple[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare]>; -def SVCMPLT_F : SInst<"svcmplt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare]>; +def SVCMPEQ_F : SInst<"svcmpeq[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpeq", [IsStreamingCompatible]>; +def SVCMPNE_F : SInst<"svcmpne[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpne", [IsStreamingCompatible]>; +def SVCMPGE_F : SInst<"svcmpge[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge", [IsStreamingCompatible]>; +def SVCMPGT_F : SInst<"svcmpgt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt", [IsStreamingCompatible]>; +def SVCMPLE_F : SInst<"svcmple[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT_F : SInst<"svcmplt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare, IsStreamingCompatible]>; -def SVCMPEQ_F_N : SInst<"svcmpeq[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpeq">; -def SVCMPNE_F_N : SInst<"svcmpne[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpne">; -def SVCMPGE_F_N : SInst<"svcmpge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge">; -def SVCMPGT_F_N : SInst<"svcmpgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt">; -def SVCMPLE_F_N : SInst<"svcmple[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare]>; -def SVCMPLT_F_N : SInst<"svcmplt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare]>; +def SVCMPEQ_F_N : SInst<"svcmpeq[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpeq", [IsStreamingCompatible]>; +def SVCMPNE_F_N : SInst<"svcmpne[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpne", [IsStreamingCompatible]>; +def SVCMPGE_F_N : SInst<"svcmpge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge", [IsStreamingCompatible]>; +def SVCMPGT_F_N : SInst<"svcmpgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt", [IsStreamingCompatible]>; +def SVCMPLE_F_N : SInst<"svcmple[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpge", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPLT_F_N : SInst<"svcmplt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpgt", [ReverseCompare, IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Floating-point conversions @@ -843,16 +843,16 @@ def SVCMPLT_F_N : SInst<"svcmplt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sv multiclass SInstCvtMXZ< string name, string m_types, string xz_types, string types, string intrinsic, list flags = [IsOverloadNone]> { - def _M : SInst; - def _X : SInst; - def _Z : SInst; + def _M : SInst; + def _X : SInst; + def _Z : SInst; } multiclass SInstCvtMX flags = [IsOverloadNone]> { - def _M : SInst; - def _X : SInst; + def _M : SInst; + def _X : SInst; } // svcvt_s##_f16 @@ -866,7 +866,7 @@ defm SVFCVTZS_S64_F32 : SInstCvtMXZ<"svcvt_s64[_f32]", "ddPM", "dPM", "l", "aar let TargetGuard = "sve,bf16" in { defm SVCVT_BF16_F32 : SInstCvtMXZ<"svcvt_bf16[_f32]", "ddPM", "dPM", "b", "aarch64_sve_fcvt_bf16f32">; - def SVCVTNT_BF16_F32 
: SInst<"svcvtnt_bf16[_f32]", "ddPM", "b", MergeOp1, "aarch64_sve_fcvtnt_bf16f32", [IsOverloadNone]>; + def SVCVTNT_BF16_F32 : SInst<"svcvtnt_bf16[_f32]", "ddPM", "b", MergeOp1, "aarch64_sve_fcvtnt_bf16f32", [IsOverloadNone, IsStreamingCompatible]>; } // svcvt_s##_f64 @@ -930,11 +930,11 @@ defm SVCVTLT_F64 : SInstCvtMX<"svcvtlt_f64[_f32]", "ddPh", "dPh", "d", "aarc defm SVCVTX_F32 : SInstCvtMXZ<"svcvtx_f32[_f64]", "MMPd", "MPd", "d", "aarch64_sve_fcvtx_f32f64">; -def SVCVTNT_F32 : SInst<"svcvtnt_f16[_f32]", "hhPd", "f", MergeOp1, "aarch64_sve_fcvtnt_f16f32", [IsOverloadNone]>; -def SVCVTNT_F64 : SInst<"svcvtnt_f32[_f64]", "hhPd", "d", MergeOp1, "aarch64_sve_fcvtnt_f32f64", [IsOverloadNone]>; +def SVCVTNT_F32 : SInst<"svcvtnt_f16[_f32]", "hhPd", "f", MergeOp1, "aarch64_sve_fcvtnt_f16f32", [IsOverloadNone, IsStreamingCompatible]>; +def SVCVTNT_F64 : SInst<"svcvtnt_f32[_f64]", "hhPd", "d", MergeOp1, "aarch64_sve_fcvtnt_f32f64", [IsOverloadNone, IsStreamingCompatible]>; // SVCVTNT_X : Implemented as macro by SveEmitter.cpp -def SVCVTXNT_F32 : SInst<"svcvtxnt_f32[_f64]", "MMPd", "d", MergeOp1, "aarch64_sve_fcvtxnt_f32f64", [IsOverloadNone]>; +def SVCVTXNT_F32 : SInst<"svcvtxnt_f32[_f64]", "MMPd", "d", MergeOp1, "aarch64_sve_fcvtxnt_f32f64", [IsOverloadNone, IsStreamingCompatible]>; // SVCVTXNT_X_F32 : Implemented as macro by SveEmitter.cpp } @@ -943,9 +943,9 @@ def SVCVTXNT_F32 : SInst<"svcvtxnt_f32[_f64]", "MMPd", "d", MergeOp1, "aarch6 // Permutations and selection multiclass SVEPerm { - def : SInst; + def : SInst; let TargetGuard = "sve,bf16" in { - def: SInst; + def: SInst; } } @@ -969,81 +969,81 @@ def SVDUPQ_LANE : SInst<"svdupq_lane[_{d}]", "ddn", "csilUcUsUiUlhfd", MergeNo let TargetGuard = "sve,bf16" in { def SVDUPQ_LANE_BF16 : SInst<"svdupq_lane[_{d}]", "ddn", "b", MergeNone, "aarch64_sve_dupq_lane">; } -def SVEXT : SInst<"svext[_{d}]", "dddi", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ext", [], [ImmCheck<2, ImmCheckExtract, 1>]>; +def SVEXT : SInst<"svext[_{d}]", "dddi", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ext", [IsStreamingCompatible], [ImmCheck<2, ImmCheckExtract, 1>]>; defm SVLASTA : SVEPerm<"svlasta[_{d}]", "sPd", "aarch64_sve_lasta">; defm SVLASTB : SVEPerm<"svlastb[_{d}]", "sPd", "aarch64_sve_lastb">; -def SVREV : SInst<"svrev[_{d}]", "dd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_rev">; -def SVSEL : SInst<"svsel[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_sel">; -def SVSPLICE : SInst<"svsplice[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_splice">; -def SVTBL : SInst<"svtbl[_{d}]", "ddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl">; +def SVREV : SInst<"svrev[_{d}]", "dd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_rev", [IsStreamingCompatible]>; +def SVSEL : SInst<"svsel[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_sel", [IsStreamingCompatible]>; +def SVSPLICE : SInst<"svsplice[_{d}]", "dPdd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_splice", [IsStreamingCompatible]>; +def SVTBL : SInst<"svtbl[_{d}]", "ddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVTBL_BF16 : SInst<"svtbl[_{d}]", "ddu", "b", MergeNone, "aarch64_sve_tbl">; + def SVTBL_BF16 : SInst<"svtbl[_{d}]", "ddu", "b", MergeNone, "aarch64_sve_tbl", [IsStreamingCompatible]>; } -def SVTRN1 : SInst<"svtrn1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1">; -def SVTRN2 : SInst<"svtrn2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2">; -def SVUNPKHI_S : 
SInst<"svunpkhi[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpkhi">; -def SVUNPKHI_U : SInst<"svunpkhi[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpkhi">; -def SVUNPKLO_S : SInst<"svunpklo[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpklo">; -def SVUNPKLO_U : SInst<"svunpklo[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpklo">; -def SVUZP1 : SInst<"svuzp1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1">; -def SVUZP2 : SInst<"svuzp2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2">; -def SVZIP1 : SInst<"svzip1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1">; -def SVZIP2 : SInst<"svzip2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2">; +def SVTRN1 : SInst<"svtrn1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1", [IsStreamingCompatible]>; +def SVTRN2 : SInst<"svtrn2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2", [IsStreamingCompatible]>; +def SVUNPKHI_S : SInst<"svunpkhi[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpkhi", [IsStreamingCompatible]>; +def SVUNPKHI_U : SInst<"svunpkhi[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpkhi", [IsStreamingCompatible]>; +def SVUNPKLO_S : SInst<"svunpklo[_{d}]", "dh", "sil", MergeNone, "aarch64_sve_sunpklo", [IsStreamingCompatible]>; +def SVUNPKLO_U : SInst<"svunpklo[_{d}]", "dh", "UsUiUl", MergeNone, "aarch64_sve_uunpklo", [IsStreamingCompatible]>; +def SVUZP1 : SInst<"svuzp1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1", [IsStreamingCompatible]>; +def SVUZP2 : SInst<"svuzp2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2", [IsStreamingCompatible]>; +def SVZIP1 : SInst<"svzip1[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1", [IsStreamingCompatible]>; +def SVZIP2 : SInst<"svzip2[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { -def SVEXT_BF16 : SInst<"svext[_{d}]", "dddi", "b", MergeNone, "aarch64_sve_ext", [], [ImmCheck<2, ImmCheckExtract, 1>]>; -def SVREV_BF16 : SInst<"svrev[_{d}]", "dd", "b", MergeNone, "aarch64_sve_rev">; -def SVSEL_BF16 : SInst<"svsel[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_sel">; -def SVSPLICE_BF16 : SInst<"svsplice[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_splice">; -def SVTRN1_BF16 : SInst<"svtrn1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn1">; -def SVTRN2_BF16 : SInst<"svtrn2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn2">; -def SVUZP1_BF16 : SInst<"svuzp1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1">; -def SVUZP2_BF16 : SInst<"svuzp2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2">; -def SVZIP1_BF16 : SInst<"svzip1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1">; -def SVZIP2_BF16 : SInst<"svzip2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2">; -} - -def SVREV_B8 : SInst<"svrev_b8", "PP", "Pc", MergeNone, "aarch64_sve_rev">; -def SVREV_B16 : SInst<"svrev_b16", "PP", "Pc", MergeNone, "aarch64_sve_rev_b16", [IsOverloadNone]>; -def SVREV_B32 : SInst<"svrev_b32", "PP", "Pc", MergeNone, "aarch64_sve_rev_b32", [IsOverloadNone]>; -def SVREV_B64 : SInst<"svrev_b64", "PP", "Pc", MergeNone, "aarch64_sve_rev_b64", [IsOverloadNone]>; -def SVSEL_B : SInst<"svsel[_b]", "PPPP", "Pc", MergeNone, "aarch64_sve_sel">; -def SVTRN1_B8 : SInst<"svtrn1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_trn1">; -def SVTRN1_B16 : SInst<"svtrn1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b16", [IsOverloadNone]>; -def SVTRN1_B32 : SInst<"svtrn1_b32", "PPP", "Pc", MergeNone, 
"aarch64_sve_trn1_b32", [IsOverloadNone]>; -def SVTRN1_B64 : SInst<"svtrn1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b64", [IsOverloadNone]>; -def SVTRN2_B8 : SInst<"svtrn2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_trn2">; -def SVTRN2_B16 : SInst<"svtrn2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b16", [IsOverloadNone]>; -def SVTRN2_B32 : SInst<"svtrn2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b32", [IsOverloadNone]>; -def SVTRN2_B64 : SInst<"svtrn2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b64", [IsOverloadNone]>; -def SVPUNPKHI : SInst<"svunpkhi[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpkhi">; -def SVPUNPKLO : SInst<"svunpklo[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpklo">; -def SVUZP1_B8 : SInst<"svuzp1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1">; -def SVUZP1_B16 : SInst<"svuzp1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b16", [IsOverloadNone]>; -def SVUZP1_B32 : SInst<"svuzp1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b32", [IsOverloadNone]>; -def SVUZP1_B64 : SInst<"svuzp1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b64", [IsOverloadNone]>; -def SVUZP2_B8 : SInst<"svuzp2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2">; -def SVUZP2_B16 : SInst<"svuzp2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b16", [IsOverloadNone]>; -def SVUZP2_B32 : SInst<"svuzp2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b32", [IsOverloadNone]>; -def SVUZP2_B64 : SInst<"svuzp2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b64", [IsOverloadNone]>; -def SVZIP1_B8 : SInst<"svzip1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_zip1">; -def SVZIP1_B16 : SInst<"svzip1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b16", [IsOverloadNone]>; -def SVZIP1_B32 : SInst<"svzip1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b32", [IsOverloadNone]>; -def SVZIP1_B64 : SInst<"svzip1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b64", [IsOverloadNone]>; -def SVZIP2_B : SInst<"svzip2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_zip2">; -def SVZIP2_B16 : SInst<"svzip2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b16", [IsOverloadNone]>; -def SVZIP2_B32 : SInst<"svzip2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b32", [IsOverloadNone]>; -def SVZIP2_B64 : SInst<"svzip2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b64", [IsOverloadNone]>; +def SVEXT_BF16 : SInst<"svext[_{d}]", "dddi", "b", MergeNone, "aarch64_sve_ext", [IsStreamingCompatible], [ImmCheck<2, ImmCheckExtract, 1>]>; +def SVREV_BF16 : SInst<"svrev[_{d}]", "dd", "b", MergeNone, "aarch64_sve_rev", [IsStreamingCompatible]>; +def SVSEL_BF16 : SInst<"svsel[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_sel", [IsStreamingCompatible]>; +def SVSPLICE_BF16 : SInst<"svsplice[_{d}]", "dPdd", "b", MergeNone, "aarch64_sve_splice", [IsStreamingCompatible]>; +def SVTRN1_BF16 : SInst<"svtrn1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn1", [IsStreamingCompatible]>; +def SVTRN2_BF16 : SInst<"svtrn2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn2", [IsStreamingCompatible]>; +def SVUZP1_BF16 : SInst<"svuzp1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1", [IsStreamingCompatible]>; +def SVUZP2_BF16 : SInst<"svuzp2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2", [IsStreamingCompatible]>; +def SVZIP1_BF16 : SInst<"svzip1[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1", [IsStreamingCompatible]>; +def SVZIP2_BF16 : SInst<"svzip2[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2", [IsStreamingCompatible]>; +} + +def SVREV_B8 : SInst<"svrev_b8", "PP", "Pc", MergeNone, "aarch64_sve_rev", [IsStreamingCompatible]>; +def SVREV_B16 
: SInst<"svrev_b16", "PP", "Pc", MergeNone, "aarch64_sve_rev_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVREV_B32 : SInst<"svrev_b32", "PP", "Pc", MergeNone, "aarch64_sve_rev_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVREV_B64 : SInst<"svrev_b64", "PP", "Pc", MergeNone, "aarch64_sve_rev_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVSEL_B : SInst<"svsel[_b]", "PPPP", "Pc", MergeNone, "aarch64_sve_sel", [IsStreamingCompatible]>; +def SVTRN1_B8 : SInst<"svtrn1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_trn1", [IsStreamingCompatible]>; +def SVTRN1_B16 : SInst<"svtrn1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN1_B32 : SInst<"svtrn1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN1_B64 : SInst<"svtrn1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_trn1_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN2_B8 : SInst<"svtrn2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_trn2", [IsStreamingCompatible]>; +def SVTRN2_B16 : SInst<"svtrn2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN2_B32 : SInst<"svtrn2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVTRN2_B64 : SInst<"svtrn2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_trn2_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVPUNPKHI : SInst<"svunpkhi[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpkhi", [IsStreamingCompatible]>; +def SVPUNPKLO : SInst<"svunpklo[_b]", "PP", "Pc", MergeNone, "aarch64_sve_punpklo", [IsStreamingCompatible]>; +def SVUZP1_B8 : SInst<"svuzp1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1", [IsStreamingCompatible]>; +def SVUZP1_B16 : SInst<"svuzp1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP1_B32 : SInst<"svuzp1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP1_B64 : SInst<"svuzp1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_uzp1_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP2_B8 : SInst<"svuzp2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2", [IsStreamingCompatible]>; +def SVUZP2_B16 : SInst<"svuzp2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP2_B32 : SInst<"svuzp2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVUZP2_B64 : SInst<"svuzp2_b64", "PPP", "Pc", MergeNone, "aarch64_sve_uzp2_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP1_B8 : SInst<"svzip1_b8", "PPP", "Pc", MergeNone, "aarch64_sve_zip1", [IsStreamingCompatible]>; +def SVZIP1_B16 : SInst<"svzip1_b16", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP1_B32 : SInst<"svzip1_b32", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP1_B64 : SInst<"svzip1_b64", "PPP", "Pc", MergeNone, "aarch64_sve_zip1_b64", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP2_B : SInst<"svzip2_b8", "PPP", "Pc", MergeNone, "aarch64_sve_zip2", [IsStreamingCompatible]>; +def SVZIP2_B16 : SInst<"svzip2_b16", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b16", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP2_B32 : SInst<"svzip2_b32", "PPP", "Pc", MergeNone, "aarch64_sve_zip2_b32", [IsOverloadNone, IsStreamingCompatible]>; +def SVZIP2_B64 : SInst<"svzip2_b64", "PPP", "Pc", 
MergeNone, "aarch64_sve_zip2_b64", [IsOverloadNone, IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Predicate creation -def SVPFALSE : SInst<"svpfalse[_b]", "Pv", "", MergeNone, "", [IsOverloadNone]>; +def SVPFALSE : SInst<"svpfalse[_b]", "Pv", "", MergeNone, "", [IsOverloadNone, IsStreamingCompatible]>; -def SVPTRUE_PAT : SInst<"svptrue_pat_{d}", "PI", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue">; -def SVPTRUE : SInst<"svptrue_{d}", "Pv", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsAppendSVALL]>; +def SVPTRUE_PAT : SInst<"svptrue_pat_{d}", "PI", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsStreamingCompatible]>; +def SVPTRUE : SInst<"svptrue_{d}", "Pv", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsAppendSVALL, IsStreamingCompatible]>; def SVDUPQ_B8 : SInst<"svdupq[_n]_{d}", "Pssssssssssssssss", "Pc", MergeNone>; def SVDUPQ_B16 : SInst<"svdupq[_n]_{d}", "Pssssssss", "Ps", MergeNone>; @@ -1055,33 +1055,33 @@ def SVDUP_N_B : SInst<"svdup[_n]_{d}", "Ps", "PcPsPiPl", MergeNone>; //////////////////////////////////////////////////////////////////////////////// // Predicate operations -def SVAND_B_Z : SInst<"svand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_and_z">; -def SVBIC_B_Z : SInst<"svbic[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_bic_z">; -def SVEOR_B_Z : SInst<"sveor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_eor_z">; -def SVMOV_B_Z : SInst<"svmov[_b]_z", "PPP", "Pc", MergeNone>; // Uses custom expansion -def SVNAND_B_Z : SInst<"svnand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nand_z">; -def SVNOR_B_Z : SInst<"svnor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nor_z">; -def SVNOT_B_Z : SInst<"svnot[_b]_z", "PPP", "Pc", MergeNone>; // Uses custom expansion -def SVORN_B_Z : SInst<"svorn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orn_z">; -def SVORR_B_Z : SInst<"svorr[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orr_z">; - -def SVBRKA : SInst<"svbrka[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brka">; -def SVBRKA_Z : SInst<"svbrka[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brka_z">; -def SVBRKB : SInst<"svbrkb[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brkb">; -def SVBRKB_Z : SInst<"svbrkb[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brkb_z">; -def SVBRKN_Z : SInst<"svbrkn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkn_z">; -def SVBRKPA_Z : SInst<"svbrkpa[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpa_z">; -def SVBRKPB_Z : SInst<"svbrkpb[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpb_z">; - -def SVPFIRST : SInst<"svpfirst[_b]", "PPP", "Pc", MergeNone, "aarch64_sve_pfirst">; -def SVPNEXT : SInst<"svpnext_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_pnext">; +def SVAND_B_Z : SInst<"svand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_and_z", [IsStreamingCompatible]>; +def SVBIC_B_Z : SInst<"svbic[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_bic_z", [IsStreamingCompatible]>; +def SVEOR_B_Z : SInst<"sveor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_eor_z", [IsStreamingCompatible]>; +def SVMOV_B_Z : SInst<"svmov[_b]_z", "PPP", "Pc", MergeNone, "", [IsStreamingCompatible]>; // Uses custom expansion +def SVNAND_B_Z : SInst<"svnand[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nand_z", [IsStreamingCompatible]>; +def SVNOR_B_Z : SInst<"svnor[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_nor_z", [IsStreamingCompatible]>; +def SVNOT_B_Z : SInst<"svnot[_b]_z", "PPP", "Pc", MergeNone, "", [IsStreamingCompatible]>; // Uses custom expansion +def SVORN_B_Z : SInst<"svorn[_b]_z", "PPPP", "Pc", 
MergeNone, "aarch64_sve_orn_z", [IsStreamingCompatible]>; +def SVORR_B_Z : SInst<"svorr[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_orr_z", [IsStreamingCompatible]>; + +def SVBRKA : SInst<"svbrka[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brka", [IsStreamingCompatible]>; +def SVBRKA_Z : SInst<"svbrka[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brka_z", [IsStreamingCompatible]>; +def SVBRKB : SInst<"svbrkb[_b]_m", "PPPP", "Pc", MergeNone, "aarch64_sve_brkb", [IsStreamingCompatible]>; +def SVBRKB_Z : SInst<"svbrkb[_b]_z", "PPP", "Pc", MergeNone, "aarch64_sve_brkb_z", [IsStreamingCompatible]>; +def SVBRKN_Z : SInst<"svbrkn[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkn_z", [IsStreamingCompatible]>; +def SVBRKPA_Z : SInst<"svbrkpa[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpa_z", [IsStreamingCompatible]>; +def SVBRKPB_Z : SInst<"svbrkpb[_b]_z", "PPPP", "Pc", MergeNone, "aarch64_sve_brkpb_z", [IsStreamingCompatible]>; + +def SVPFIRST : SInst<"svpfirst[_b]", "PPP", "Pc", MergeNone, "aarch64_sve_pfirst", [IsStreamingCompatible]>; +def SVPNEXT : SInst<"svpnext_{d}", "PPP", "PcPsPiPl", MergeNone, "aarch64_sve_pnext", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Testing predicates -def SVPTEST_ANY : SInst<"svptest_any", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_any">; -def SVPTEST_FIRST : SInst<"svptest_first", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_first">; -def SVPTEST_LAST : SInst<"svptest_last", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_last">; +def SVPTEST_ANY : SInst<"svptest_any", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_any", [IsStreamingCompatible]>; +def SVPTEST_FIRST : SInst<"svptest_first", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_first", [IsStreamingCompatible]>; +def SVPTEST_LAST : SInst<"svptest_last", "sPP", "Pc", MergeNone, "aarch64_sve_ptest_last", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // FFR manipulation @@ -1094,21 +1094,21 @@ def SVWRFFR : SInst<"svwrffr", "vP", "Pc", MergeNone, "", [IsOverloadNone]>; //////////////////////////////////////////////////////////////////////////////// // Counting elements -def SVCNTB_PAT : SInst<"svcntb_pat", "nI", "", MergeNone, "aarch64_sve_cntb", [IsOverloadNone]>; -def SVCNTH_PAT : SInst<"svcnth_pat", "nI", "", MergeNone, "aarch64_sve_cnth", [IsOverloadNone]>; -def SVCNTW_PAT : SInst<"svcntw_pat", "nI", "", MergeNone, "aarch64_sve_cntw", [IsOverloadNone]>; -def SVCNTD_PAT : SInst<"svcntd_pat", "nI", "", MergeNone, "aarch64_sve_cntd", [IsOverloadNone]>; +def SVCNTB_PAT : SInst<"svcntb_pat", "nI", "", MergeNone, "aarch64_sve_cntb", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTH_PAT : SInst<"svcnth_pat", "nI", "", MergeNone, "aarch64_sve_cnth", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTW_PAT : SInst<"svcntw_pat", "nI", "", MergeNone, "aarch64_sve_cntw", [IsOverloadNone, IsStreamingCompatible]>; +def SVCNTD_PAT : SInst<"svcntd_pat", "nI", "", MergeNone, "aarch64_sve_cntd", [IsOverloadNone, IsStreamingCompatible]>; -def SVCNTB : SInst<"svcntb", "nv", "", MergeNone, "aarch64_sve_cntb", [IsAppendSVALL, IsOverloadNone]>; -def SVCNTH : SInst<"svcnth", "nv", "", MergeNone, "aarch64_sve_cnth", [IsAppendSVALL, IsOverloadNone]>; -def SVCNTW : SInst<"svcntw", "nv", "", MergeNone, "aarch64_sve_cntw", [IsAppendSVALL, IsOverloadNone]>; -def SVCNTD : SInst<"svcntd", "nv", "", MergeNone, "aarch64_sve_cntd", [IsAppendSVALL, IsOverloadNone]>; +def SVCNTB : SInst<"svcntb", "nv", "", MergeNone, 
"aarch64_sve_cntb", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; +def SVCNTH : SInst<"svcnth", "nv", "", MergeNone, "aarch64_sve_cnth", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; +def SVCNTW : SInst<"svcntw", "nv", "", MergeNone, "aarch64_sve_cntw", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; +def SVCNTD : SInst<"svcntd", "nv", "", MergeNone, "aarch64_sve_cntd", [IsAppendSVALL, IsOverloadNone, IsStreamingCompatible]>; -def SVCNTP : SInst<"svcntp_{d}", "nPP", "PcPsPiPl", MergeNone, "aarch64_sve_cntp">; -def SVLEN : SInst<"svlen[_{d}]", "nd", "csilUcUsUiUlhfd", MergeNone>; +def SVCNTP : SInst<"svcntp_{d}", "nPP", "PcPsPiPl", MergeNone, "aarch64_sve_cntp", [IsStreamingCompatible]>; +def SVLEN : SInst<"svlen[_{d}]", "nd", "csilUcUsUiUlhfd", MergeNone, "", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { -def SVLEN_BF16 : SInst<"svlen[_{d}]", "nd", "b", MergeNone>; +def SVLEN_BF16 : SInst<"svlen[_{d}]", "nd", "b", MergeNone, "", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1125,20 +1125,20 @@ def UnsignedWord : sat_type<"U", "Ui">; def UnsignedDoubleWord : sat_type<"U", "Ul">; multiclass SInst_SAT1 { - def _N32 : SInst]>; - def _N64 : SInst]>; - def _N32_ALL : SInst]>; - def _N64_ALL : SInst]>; + def _N32 : SInst]>; + def _N64 : SInst]>; + def _N32_ALL : SInst]>; + def _N64_ALL : SInst]>; } multiclass SInst_SAT2 { - def "" : SInst]>; - def _ALL : SInst]>; + def "" : SInst]>; + def _ALL : SInst]>; - def _N32 : SInst]>; - def _N64 : SInst]>; - def _N32_ALL : SInst]>; - def _N64_ALL : SInst]>; + def _N32 : SInst]>; + def _N64 : SInst]>; + def _N32_ALL : SInst]>; + def _N64_ALL : SInst]>; } defm SVQDECB_S : SInst_SAT1<"svqdecb", "aarch64_sve_sqdecb", SignedByte>; @@ -1159,32 +1159,32 @@ defm SVQINCW_U : SInst_SAT2<"svqincw", "aarch64_sve_uqincw", UnsignedWord>; defm SVQINCD_S : SInst_SAT2<"svqincd", "aarch64_sve_sqincd", SignedDoubleWord>; defm SVQINCD_U : SInst_SAT2<"svqincd", "aarch64_sve_uqincd", UnsignedDoubleWord>; -def SVQDECP_S : SInst<"svqdecp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqdecp">; -def SVQDECP_U : SInst<"svqdecp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqdecp">; -def SVQINCP_S : SInst<"svqincp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqincp">; -def SVQINCP_U : SInst<"svqincp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqincp">; +def SVQDECP_S : SInst<"svqdecp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqdecp", [IsStreamingCompatible]>; +def SVQDECP_U : SInst<"svqdecp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqdecp", [IsStreamingCompatible]>; +def SVQINCP_S : SInst<"svqincp[_{d}]", "ddP", "sil", MergeNone, "aarch64_sve_sqincp", [IsStreamingCompatible]>; +def SVQINCP_U : SInst<"svqincp[_{d}]", "ddP", "UsUiUl", MergeNone, "aarch64_sve_uqincp", [IsStreamingCompatible]>; -def SVQDECP_N_S32 : SInst<"svqdecp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n32">; -def SVQDECP_N_S64 : SInst<"svqdecp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n64">; -def SVQDECP_N_U32 : SInst<"svqdecp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n32">; -def SVQDECP_N_U64 : SInst<"svqdecp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n64">; -def SVQINCP_N_S32 : SInst<"svqincp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n32">; -def SVQINCP_N_S64 : SInst<"svqincp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n64">; -def SVQINCP_N_U32 : 
SInst<"svqincp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n32">; -def SVQINCP_N_U64 : SInst<"svqincp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n64">; +def SVQDECP_N_S32 : SInst<"svqdecp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n32", [IsStreamingCompatible]>; +def SVQDECP_N_S64 : SInst<"svqdecp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqdecp_n64", [IsStreamingCompatible]>; +def SVQDECP_N_U32 : SInst<"svqdecp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n32", [IsStreamingCompatible]>; +def SVQDECP_N_U64 : SInst<"svqdecp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqdecp_n64", [IsStreamingCompatible]>; +def SVQINCP_N_S32 : SInst<"svqincp[_n_s32]_{d}", "kkP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n32", [IsStreamingCompatible]>; +def SVQINCP_N_S64 : SInst<"svqincp[_n_s64]_{d}", "llP", "PcPsPiPl", MergeNone, "aarch64_sve_sqincp_n64", [IsStreamingCompatible]>; +def SVQINCP_N_U32 : SInst<"svqincp[_n_u32]_{d}", "mmP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n32", [IsStreamingCompatible]>; +def SVQINCP_N_U64 : SInst<"svqincp[_n_u64]_{d}", "nnP", "PcPsPiPl", MergeNone, "aarch64_sve_uqincp_n64", [IsStreamingCompatible]>; let TargetGuard = "sve,i8mm" in { def SVMLLA_S32 : SInst<"svmmla[_s32]", "ddqq","i", MergeNone, "aarch64_sve_smmla">; def SVMLLA_U32 : SInst<"svmmla[_u32]", "ddqq","Ui", MergeNone, "aarch64_sve_ummla">; def SVUSMLLA_S32 : SInst<"svusmmla[_s32]", "ddbq","i", MergeNone, "aarch64_sve_usmmla">; -def SVUSDOT_S : SInst<"svusdot[_s32]", "ddbq", "i", MergeNone, "aarch64_sve_usdot">; -def SVUSDOT_N_S : SInst<"svusdot[_n_s32]", "ddbr", "i", MergeNone, "aarch64_sve_usdot">; -def SVSUDOT_S : SInst<"svsudot[_s32]", "ddqb", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT]>; -def SVSUDOT_N_S : SInst<"svsudot[_n_s32]", "ddq@", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT]>; +def SVUSDOT_S : SInst<"svusdot[_s32]", "ddbq", "i", MergeNone, "aarch64_sve_usdot", [IsStreamingCompatible]>; +def SVUSDOT_N_S : SInst<"svusdot[_n_s32]", "ddbr", "i", MergeNone, "aarch64_sve_usdot", [IsStreamingCompatible]>; +def SVSUDOT_S : SInst<"svsudot[_s32]", "ddqb", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT, IsStreamingCompatible]>; +def SVSUDOT_N_S : SInst<"svsudot[_n_s32]", "ddq@", "i", MergeNone, "aarch64_sve_usdot", [ReverseUSDOT, IsStreamingCompatible]>; -def SVUSDOT_LANE_S : SInst<"svusdot_lane[_s32]", "ddbqi", "i", MergeNone, "aarch64_sve_usdot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; -def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarch64_sve_sudot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; +def SVUSDOT_LANE_S : SInst<"svusdot_lane[_s32]", "ddbqi", "i", MergeNone, "aarch64_sve_usdot_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; +def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarch64_sve_sudot_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; } let TargetGuard = "sve,f32mm" in { @@ -1193,12 +1193,12 @@ def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla let TargetGuard = "sve,f64mm" in { def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd","d", MergeNone, "aarch64_sve_fmmla">; -def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1q">; -def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2q">; -def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfd", 
MergeNone, "aarch64_sve_uzp1q">; -def SVUZP2Q : SInst<"svuzp2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2q">; -def SVZIP1Q : SInst<"svzip1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1q">; -def SVZIP2Q : SInst<"svzip2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2q">; +def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn1q", [IsStreamingCompatible]>; +def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_trn2q", [IsStreamingCompatible]>; +def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp1q", [IsStreamingCompatible]>; +def SVUZP2Q : SInst<"svuzp2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_uzp2q", [IsStreamingCompatible]>; +def SVZIP1Q : SInst<"svzip1q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip1q", [IsStreamingCompatible]>; +def SVZIP2Q : SInst<"svzip2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_zip2q", [IsStreamingCompatible]>; } let TargetGuard = "sve,bf16,f64mm" in { @@ -1212,57 +1212,57 @@ def SVZIP2Q_BF16 : SInst<"svzip2q[_{d}]", "ddd", "b", MergeNone, "aarc //////////////////////////////////////////////////////////////////////////////// // Vector creation -def SVUNDEF_1 : SInst<"svundef_{d}", "dv", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef]>; -def SVUNDEF_2 : SInst<"svundef2_{d}", "2v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef]>; -def SVUNDEF_3 : SInst<"svundef3_{d}", "3v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef]>; -def SVUNDEF_4 : SInst<"svundef4_{d}", "4v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef]>; +def SVUNDEF_1 : SInst<"svundef_{d}", "dv", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_2 : SInst<"svundef2_{d}", "2v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_3 : SInst<"svundef3_{d}", "3v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_4 : SInst<"svundef4_{d}", "4v", "csilUcUsUiUlhfd", MergeNone, "", [IsUndef, IsStreamingCompatible]>; -def SVCREATE_2 : SInst<"svcreate2[_{d}]", "2dd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate]>; -def SVCREATE_3 : SInst<"svcreate3[_{d}]", "3ddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate]>; -def SVCREATE_4 : SInst<"svcreate4[_{d}]", "4dddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate]>; +def SVCREATE_2 : SInst<"svcreate2[_{d}]", "2dd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +def SVCREATE_3 : SInst<"svcreate3[_{d}]", "3ddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +def SVCREATE_4 : SInst<"svcreate4[_{d}]", "4dddd", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { -def SVUNDEF_1_BF16 : SInst<"svundef_{d}", "dv", "b", MergeNone, "", [IsUndef]>; -def SVUNDEF_2_BF16 : SInst<"svundef2_{d}", "2v", "b", MergeNone, "", [IsUndef]>; -def SVUNDEF_3_BF16 : SInst<"svundef3_{d}", "3v", "b", MergeNone, "", [IsUndef]>; -def SVUNDEF_4_BF16 : SInst<"svundef4_{d}", "4v", "b", MergeNone, "", [IsUndef]>; +def SVUNDEF_1_BF16 : SInst<"svundef_{d}", "dv", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_2_BF16 : SInst<"svundef2_{d}", "2v", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_3_BF16 : SInst<"svundef3_{d}", "3v", "b", MergeNone, "", [IsUndef, IsStreamingCompatible]>; +def SVUNDEF_4_BF16 : SInst<"svundef4_{d}", "4v", "b", MergeNone, "", [IsUndef, 
IsStreamingCompatible]>; -def SVCREATE_2_BF16 : SInst<"svcreate2[_{d}]", "2dd", "b", MergeNone, "", [IsTupleCreate]>; -def SVCREATE_3_BF16 : SInst<"svcreate3[_{d}]", "3ddd", "b", MergeNone, "", [IsTupleCreate]>; -def SVCREATE_4_BF16 : SInst<"svcreate4[_{d}]", "4dddd", "b", MergeNone, "", [IsTupleCreate]>; +def SVCREATE_2_BF16 : SInst<"svcreate2[_{d}]", "2dd", "b", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +def SVCREATE_3_BF16 : SInst<"svcreate3[_{d}]", "3ddd", "b", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; +def SVCREATE_4_BF16 : SInst<"svcreate4[_{d}]", "4dddd", "b", MergeNone, "", [IsTupleCreate, IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // Vector insertion and extraction -def SVGET_2 : SInst<"svget2[_{d}]", "d2i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_1>]>; -def SVGET_3 : SInst<"svget3[_{d}]", "d3i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_2>]>; -def SVGET_4 : SInst<"svget4[_{d}]", "d4i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_3>]>; +def SVGET_2 : SInst<"svget2[_{d}]", "d2i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVGET_3 : SInst<"svget3[_{d}]", "d3i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_2>]>; +def SVGET_4 : SInst<"svget4[_{d}]", "d4i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; -def SVSET_2 : SInst<"svset2[_{d}]", "22id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_1>]>; -def SVSET_3 : SInst<"svset3[_{d}]", "33id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_2>]>; -def SVSET_4 : SInst<"svset4[_{d}]", "44id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_3>]>; +def SVSET_2 : SInst<"svset2[_{d}]", "22id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVSET_3 : SInst<"svset3[_{d}]", "33id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_2>]>; +def SVSET_4 : SInst<"svset4[_{d}]", "44id", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; let TargetGuard = "sve,bf16" in { -def SVGET_2_BF16 : SInst<"svget2[_{d}]", "d2i", "b", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_1>]>; -def SVGET_3_BF16 : SInst<"svget3[_{d}]", "d3i", "b", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_2>]>; -def SVGET_4_BF16 : SInst<"svget4[_{d}]", "d4i", "b", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_3>]>; +def SVGET_2_BF16 : SInst<"svget2[_{d}]", "d2i", "b", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVGET_3_BF16 : SInst<"svget3[_{d}]", "d3i", "b", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_2>]>; +def SVGET_4_BF16 : SInst<"svget4[_{d}]", "d4i", "b", MergeNone, "", [IsTupleGet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; -def SVSET_2_BF16 : SInst<"svset2[_{d}]", "22id", "b", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_1>]>; -def SVSET_3_BF16 : SInst<"svset3[_{d}]", "33id", "b", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_2>]>; -def SVSET_4_BF16 : SInst<"svset4[_{d}]", "44id", "b", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_3>]>; +def SVSET_2_BF16 : SInst<"svset2[_{d}]", "22id", "b", MergeNone, 
"", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_1>]>; +def SVSET_3_BF16 : SInst<"svset3[_{d}]", "33id", "b", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_2>]>; +def SVSET_4_BF16 : SInst<"svset4[_{d}]", "44id", "b", MergeNone, "", [IsTupleSet, IsStreamingCompatible], [ImmCheck<1, ImmCheck0_3>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 WhileGE/GT let TargetGuard = "sve2" in { -def SVWHILEGE_S32 : SInst<"svwhilege_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile]>; -def SVWHILEGE_S64 : SInst<"svwhilege_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile]>; -def SVWHILEGT_S32 : SInst<"svwhilegt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile]>; -def SVWHILEGT_S64 : SInst<"svwhilegt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile]>; -def SVWHILEHI_U32 : SInst<"svwhilegt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile]>; -def SVWHILEHI_U64 : SInst<"svwhilegt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile]>; -def SVWHILEHS_U32 : SInst<"svwhilege_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile]>; -def SVWHILEHS_U64 : SInst<"svwhilege_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile]>; +def SVWHILEGE_S32 : SInst<"svwhilege_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEGE_S64 : SInst<"svwhilege_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEGT_S32 : SInst<"svwhilegt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEGT_S64 : SInst<"svwhilegt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHI_U32 : SInst<"svwhilegt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHI_U64 : SInst<"svwhilegt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHS_U32 : SInst<"svwhilege_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEHS_U64 : SInst<"svwhilege_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile, IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1304,49 +1304,49 @@ multiclass SInstZPZxZ; -defm SVQRSHL_U : SInstZPZxZ<"svqrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqrshl">; -defm SVQSHL_S : SInstZPZxZ<"svqshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqshl">; -defm SVQSHL_U : SInstZPZxZ<"svqshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqshl">; -defm SVRSHL_S : SInstZPZxZ<"svrshl", "csil", "dPdx", "dPdK", "aarch64_sve_srshl">; -defm SVRSHL_U : SInstZPZxZ<"svrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_urshl">; -defm SVSQADD : SInstZPZxZ<"svsqadd", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_usqadd">; -defm SVUQADD : SInstZPZxZ<"svuqadd", "csil", "dPdu", "dPdL", "aarch64_sve_suqadd">; - -def SVABA_S : SInst<"svaba[_{d}]", "dddd", "csil" , MergeNone, "aarch64_sve_saba">; -def SVABA_U : SInst<"svaba[_{d}]", "dddd", "UcUsUiUl", MergeNone, 
"aarch64_sve_uaba">; -def SVQDMULH : SInst<"svqdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqdmulh">; -def SVQRDMULH : SInst<"svqrdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqrdmulh">; -def SVQRDMLAH : SInst<"svqrdmlah[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlah">; -def SVQRDMLSH : SInst<"svqrdmlsh[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlsh">; - -def SVABA_S_N : SInst<"svaba[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_saba">; -def SVABA_U_N : SInst<"svaba[_n_{d}]", "ddda", "UcUsUiUl", MergeNone, "aarch64_sve_uaba">; -def SVQDMULH_N : SInst<"svqdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqdmulh">; -def SVQRDMULH_N : SInst<"svqrdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqrdmulh">; -def SVQRDMLAH_N : SInst<"svqrdmlah[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlah">; -def SVQRDMLSH_N : SInst<"svqrdmlsh[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlsh">; - -def SVQDMULH_LANE : SInst<"svqdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqdmulh_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVQRDMULH_LANE : SInst<"svqrdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqrdmulh_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVQRDMLAH_LANE : SInst<"svqrdmlah_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlah_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQRDMLSH_LANE : SInst<"svqrdmlsh_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlsh_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; - -def SVQSHLU_M : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeOp1, "aarch64_sve_sqshlu", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVQSHLU_X : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeAny, "aarch64_sve_sqshlu", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVQSHLU_Z : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeZero, "aarch64_sve_sqshlu", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVRSHR_M_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_srshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_M_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeOp1, "aarch64_sve_urshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_X_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_srshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_X_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeAny, "aarch64_sve_urshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_Z_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_srshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSHR_Z_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeZero, "aarch64_sve_urshr", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSRA_S : SInst<"svrsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_srsra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVRSRA_U : SInst<"svrsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_ursra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVSLI : SInst<"svsli[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sli", [], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; -def SVSRA_S : SInst<"svsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_ssra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVSRA_U : SInst<"svsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_usra", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; -def SVSRI : SInst<"svsri[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sri", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; 
+defm SVQRSHL_S : SInstZPZxZ<"svqrshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqrshl", [IsStreamingCompatible]>; +defm SVQRSHL_U : SInstZPZxZ<"svqrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqrshl", [IsStreamingCompatible]>; +defm SVQSHL_S : SInstZPZxZ<"svqshl", "csil", "dPdx", "dPdK", "aarch64_sve_sqshl", [IsStreamingCompatible]>; +defm SVQSHL_U : SInstZPZxZ<"svqshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_uqshl", [IsStreamingCompatible]>; +defm SVRSHL_S : SInstZPZxZ<"svrshl", "csil", "dPdx", "dPdK", "aarch64_sve_srshl", [IsStreamingCompatible]>; +defm SVRSHL_U : SInstZPZxZ<"svrshl", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_urshl", [IsStreamingCompatible]>; +defm SVSQADD : SInstZPZxZ<"svsqadd", "UcUsUiUl", "dPdx", "dPdK", "aarch64_sve_usqadd", [IsStreamingCompatible]>; +defm SVUQADD : SInstZPZxZ<"svuqadd", "csil", "dPdu", "dPdL", "aarch64_sve_suqadd", [IsStreamingCompatible]>; + +def SVABA_S : SInst<"svaba[_{d}]", "dddd", "csil" , MergeNone, "aarch64_sve_saba", [IsStreamingCompatible]>; +def SVABA_U : SInst<"svaba[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uaba", [IsStreamingCompatible]>; +def SVQDMULH : SInst<"svqdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqdmulh", [IsStreamingCompatible]>; +def SVQRDMULH : SInst<"svqrdmulh[_{d}]", "ddd", "csil", MergeNone, "aarch64_sve_sqrdmulh", [IsStreamingCompatible]>; +def SVQRDMLAH : SInst<"svqrdmlah[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlah", [IsStreamingCompatible]>; +def SVQRDMLSH : SInst<"svqrdmlsh[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sqrdmlsh", [IsStreamingCompatible]>; + +def SVABA_S_N : SInst<"svaba[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_saba", [IsStreamingCompatible]>; +def SVABA_U_N : SInst<"svaba[_n_{d}]", "ddda", "UcUsUiUl", MergeNone, "aarch64_sve_uaba", [IsStreamingCompatible]>; +def SVQDMULH_N : SInst<"svqdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqdmulh", [IsStreamingCompatible]>; +def SVQRDMULH_N : SInst<"svqrdmulh[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqrdmulh", [IsStreamingCompatible]>; +def SVQRDMLAH_N : SInst<"svqrdmlah[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlah", [IsStreamingCompatible]>; +def SVQRDMLSH_N : SInst<"svqrdmlsh[_n_{d}]", "ddda", "csil", MergeNone, "aarch64_sve_sqrdmlsh", [IsStreamingCompatible]>; + +def SVQDMULH_LANE : SInst<"svqdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqdmulh_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVQRDMULH_LANE : SInst<"svqrdmulh_lane[_{d}]", "dddi", "sil", MergeNone, "aarch64_sve_sqrdmulh_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVQRDMLAH_LANE : SInst<"svqrdmlah_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlah_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQRDMLSH_LANE : SInst<"svqrdmlsh_lane[_{d}]", "ddddi", "sil", MergeNone, "aarch64_sve_sqrdmlsh_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; + +def SVQSHLU_M : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeOp1, "aarch64_sve_sqshlu", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; +def SVQSHLU_X : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeAny, "aarch64_sve_sqshlu", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; +def SVQSHLU_Z : SInst<"svqshlu[_n_{d}]", "uPdi", "csil", MergeZero, "aarch64_sve_sqshlu", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; +def SVRSHR_M_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeOp1, "aarch64_sve_srshr", 
[IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_M_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeOp1, "aarch64_sve_urshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_X_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeAny, "aarch64_sve_srshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_X_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeAny, "aarch64_sve_urshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_Z_S : SInst<"svrshr[_n_{d}]", "dPdi", "csil", MergeZero, "aarch64_sve_srshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSHR_Z_U : SInst<"svrshr[_n_{d}]", "dPdi", "UcUsUiUl", MergeZero, "aarch64_sve_urshr", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSRA_S : SInst<"svrsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_srsra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVRSRA_U : SInst<"svrsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_ursra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVSLI : SInst<"svsli[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sli", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftLeft, 1>]>; +def SVSRA_S : SInst<"svsra[_n_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_ssra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVSRA_U : SInst<"svsra[_n_{d}]", "dddi", "UcUsUiUl", MergeNone, "aarch64_sve_usra", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVSRI : SInst<"svsri[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_sri", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1358,29 +1358,29 @@ multiclass SInstPairwise -defm SVADDP : SInstPairwise<"svaddp", "csliUcUsUiUl", "aarch64_sve_addp">; -defm SVADDP_F : SInstPairwise<"svaddp", "hfd", "aarch64_sve_faddp">; -defm SVMAXNMP : SInstPairwise<"svmaxnmp", "hfd", "aarch64_sve_fmaxnmp">; -defm SVMAXP_F : SInstPairwise<"svmaxp", "hfd", "aarch64_sve_fmaxp">; -defm SVMAXP_S : SInstPairwise<"svmaxp", "csli", "aarch64_sve_smaxp">; -defm SVMAXP_U : SInstPairwise<"svmaxp", "UcUsUiUl", "aarch64_sve_umaxp">; -defm SVMINNMP : SInstPairwise<"svminnmp", "hfd", "aarch64_sve_fminnmp">; -defm SVMINP_F : SInstPairwise<"svminp", "hfd", "aarch64_sve_fminp">; -defm SVMINP_S : SInstPairwise<"svminp", "csli", "aarch64_sve_sminp">; -defm SVMINP_U : SInstPairwise<"svminp", "UcUsUiUl", "aarch64_sve_uminp">; +defm SVADDP : SInstPairwise<"svaddp", "csliUcUsUiUl", "aarch64_sve_addp", [IsStreamingCompatible]>; +defm SVADDP_F : SInstPairwise<"svaddp", "hfd", "aarch64_sve_faddp", [IsStreamingCompatible]>; +defm SVMAXNMP : SInstPairwise<"svmaxnmp", "hfd", "aarch64_sve_fmaxnmp", [IsStreamingCompatible]>; +defm SVMAXP_F : SInstPairwise<"svmaxp", "hfd", "aarch64_sve_fmaxp", [IsStreamingCompatible]>; +defm SVMAXP_S : SInstPairwise<"svmaxp", "csli", "aarch64_sve_smaxp", [IsStreamingCompatible]>; +defm SVMAXP_U : SInstPairwise<"svmaxp", "UcUsUiUl", "aarch64_sve_umaxp", [IsStreamingCompatible]>; +defm SVMINNMP : SInstPairwise<"svminnmp", "hfd", "aarch64_sve_fminnmp", [IsStreamingCompatible]>; +defm SVMINP_F : SInstPairwise<"svminp", "hfd", "aarch64_sve_fminp", [IsStreamingCompatible]>; +defm SVMINP_S : SInstPairwise<"svminp", "csli", "aarch64_sve_sminp", [IsStreamingCompatible]>; +defm SVMINP_U : SInstPairwise<"svminp", "UcUsUiUl", "aarch64_sve_uminp", [IsStreamingCompatible]>; }
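For context on the flag being added throughout these hunks: IsStreamingCompatible marks a builtin as remaining valid in SME streaming mode, so it may be called from streaming and streaming-compatible functions. A minimal usage sketch, not part of this patch; the function name is invented, and it assumes an SME-capable toolchain that accepts the SME ACLE __arm_streaming_compatible keyword and the SVE2 pairwise intrinsics flagged above:

// Pairwise add (svaddp, flagged IsStreamingCompatible above) used from a
// streaming-compatible function. Compile with something like:
//   clang -O2 -march=armv9-a+sve2+sme -c pairwise.c
#include <arm_sve.h>

svfloat32_t pairwise_sum(svfloat32_t a, svfloat32_t b)
    __arm_streaming_compatible {
  svbool_t pg = svptrue_b32();   // all-true predicate over 32-bit lanes
  // Merging form: inactive lanes would keep the value of 'a'; with an
  // all-true predicate every result lane is the sum of an adjacent pair.
  return svaddp_f32_m(pg, a, b);
}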
//////////////////////////////////////////////////////////////////////////////// // SVE2 - Widening pairwise arithmetic let TargetGuard = "sve2" in { -def SVADALP_S_M : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeOp1, "aarch64_sve_sadalp">; -def SVADALP_S_X : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeAny, "aarch64_sve_sadalp">; -def SVADALP_S_Z : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeZero, "aarch64_sve_sadalp">; +def SVADALP_S_M : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeOp1, "aarch64_sve_sadalp", [IsStreamingCompatible]>; +def SVADALP_S_X : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeAny, "aarch64_sve_sadalp", [IsStreamingCompatible]>; +def SVADALP_S_Z : SInst<"svadalp[_{d}]", "dPdh", "sil", MergeZero, "aarch64_sve_sadalp", [IsStreamingCompatible]>; -def SVADALP_U_M : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeOp1, "aarch64_sve_uadalp">; -def SVADALP_U_X : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeAny, "aarch64_sve_uadalp">; -def SVADALP_U_Z : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeZero, "aarch64_sve_uadalp">; +def SVADALP_U_M : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeOp1, "aarch64_sve_uadalp", [IsStreamingCompatible]>; +def SVADALP_U_X : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeAny, "aarch64_sve_uadalp", [IsStreamingCompatible]>; +def SVADALP_U_Z : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeZero, "aarch64_sve_uadalp", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1388,56 +1388,56 @@ def SVADALP_U_Z : SInst<"svadalp[_{d}]", "dPdh", "UsUiUl", MergeZero, "aarch64_s // let TargetGuard = "sve2" in { -def SVBCAX : SInst<"svbcax[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax">; -def SVBSL : SInst<"svbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl">; -def SVBSL1N : SInst<"svbsl1n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n">; -def SVBSL2N : SInst<"svbsl2n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n">; -def SVEOR3 : SInst<"sveor3[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3">; -def SVNBSL : SInst<"svnbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl">; - -def SVBCAX_N : SInst<"svbcax[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax">; -def SVBSL_N : SInst<"svbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl">; -def SVBSL1N_N : SInst<"svbsl1n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n">; -def SVBSL2N_N : SInst<"svbsl2n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n">; -def SVEOR3_N : SInst<"sveor3[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3">; -def SVNBSL_N : SInst<"svnbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl">; -def SVXAR_N : SInst<"svxar[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_xar", [], [ImmCheck<2, ImmCheckShiftRight, 1>]>; +def SVBCAX : SInst<"svbcax[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax", [IsStreamingCompatible]>; +def SVBSL : SInst<"svbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl", [IsStreamingCompatible]>; +def SVBSL1N : SInst<"svbsl1n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n", [IsStreamingCompatible]>; +def SVBSL2N : SInst<"svbsl2n[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n", [IsStreamingCompatible]>; +def SVEOR3 : SInst<"sveor3[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3", [IsStreamingCompatible]>; +def SVNBSL : SInst<"svnbsl[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, 
"aarch64_sve_nbsl", [IsStreamingCompatible]>; + +def SVBCAX_N : SInst<"svbcax[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bcax", [IsStreamingCompatible]>; +def SVBSL_N : SInst<"svbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl", [IsStreamingCompatible]>; +def SVBSL1N_N : SInst<"svbsl1n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl1n", [IsStreamingCompatible]>; +def SVBSL2N_N : SInst<"svbsl2n[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_bsl2n", [IsStreamingCompatible]>; +def SVEOR3_N : SInst<"sveor3[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eor3", [IsStreamingCompatible]>; +def SVNBSL_N : SInst<"svnbsl[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_nbsl", [IsStreamingCompatible]>; +def SVXAR_N : SInst<"svxar[_n_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_xar", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRight, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Large integer arithmetic let TargetGuard = "sve2" in { -def SVADCLB : SInst<"svadclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclb">; -def SVADCLT : SInst<"svadclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclt">; -def SVSBCLB : SInst<"svsbclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclb">; -def SVSBCLT : SInst<"svsbclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclt">; +def SVADCLB : SInst<"svadclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclb", [IsStreamingCompatible]>; +def SVADCLT : SInst<"svadclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_adclt", [IsStreamingCompatible]>; +def SVSBCLB : SInst<"svsbclb[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclb", [IsStreamingCompatible]>; +def SVSBCLT : SInst<"svsbclt[_{d}]", "dddd", "UiUl", MergeNone, "aarch64_sve_sbclt", [IsStreamingCompatible]>; -def SVADCLB_N : SInst<"svadclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclb">; -def SVADCLT_N : SInst<"svadclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclt">; -def SVSBCLB_N : SInst<"svsbclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclb">; -def SVSBCLT_N : SInst<"svsbclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclt">; +def SVADCLB_N : SInst<"svadclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclb", [IsStreamingCompatible]>; +def SVADCLT_N : SInst<"svadclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_adclt", [IsStreamingCompatible]>; +def SVSBCLB_N : SInst<"svsbclb[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclb", [IsStreamingCompatible]>; +def SVSBCLT_N : SInst<"svsbclt[_n_{d}]", "ddda", "UiUl", MergeNone, "aarch64_sve_sbclt", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Multiplication by indexed elements let TargetGuard = "sve2" in { -def SVMLA_LANE_2 : SInst<"svmla_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mla_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLS_LANE_2 : SInst<"svmls_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mls_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMUL_LANE_2 : SInst<"svmul_lane[_{d}]", "dddi", "silUsUiUl", MergeNone, "aarch64_sve_mul_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMLA_LANE_2 : SInst<"svmla_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, "aarch64_sve_mla_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLS_LANE_2 : SInst<"svmls_lane[_{d}]", "ddddi", "silUsUiUl", MergeNone, 
"aarch64_sve_mls_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMUL_LANE_2 : SInst<"svmul_lane[_{d}]", "dddi", "silUsUiUl", MergeNone, "aarch64_sve_mul_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Uniform complex integer arithmetic let TargetGuard = "sve2" in { -def SVCADD : SInst<"svcadd[_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cadd_x", [], [ImmCheck<2, ImmCheckComplexRot90_270>]>; -def SVSQCADD : SInst<"svqcadd[_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_sqcadd_x", [], [ImmCheck<2, ImmCheckComplexRot90_270>]>; -def SVCMLA : SInst<"svcmla[_{d}]", "ddddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmla_x", [], [ImmCheck<3, ImmCheckComplexRotAll90>]>; -def SVCMLA_LANE_X : SInst<"svcmla_lane[_{d}]", "ddddii", "siUsUi", MergeNone, "aarch64_sve_cmla_lane_x", [], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, +def SVCADD : SInst<"svcadd[_{d}]", "dddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cadd_x", [IsStreamingCompatible], [ImmCheck<2, ImmCheckComplexRot90_270>]>; +def SVSQCADD : SInst<"svqcadd[_{d}]", "dddi", "csil", MergeNone, "aarch64_sve_sqcadd_x", [IsStreamingCompatible], [ImmCheck<2, ImmCheckComplexRot90_270>]>; +def SVCMLA : SInst<"svcmla[_{d}]", "ddddi", "csilUcUsUiUl", MergeNone, "aarch64_sve_cmla_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRotAll90>]>; +def SVCMLA_LANE_X : SInst<"svcmla_lane[_{d}]", "ddddii", "siUsUi", MergeNone, "aarch64_sve_cmla_lane_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, ImmCheck<4, ImmCheckComplexRotAll90>]>; -def SVSQRDCMLAH_X : SInst<"svqrdcmlah[_{d}]", "ddddi", "csil", MergeNone, "aarch64_sve_sqrdcmlah_x", [], [ImmCheck<3, ImmCheckComplexRotAll90>]>; -def SVSQRDCMLAH_LANE_X : SInst<"svqrdcmlah_lane[_{d}]", "ddddii", "si", MergeNone, "aarch64_sve_sqrdcmlah_lane_x", [], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, +def SVSQRDCMLAH_X : SInst<"svqrdcmlah[_{d}]", "ddddi", "csil", MergeNone, "aarch64_sve_sqrdcmlah_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRotAll90>]>; +def SVSQRDCMLAH_LANE_X : SInst<"svqrdcmlah_lane[_{d}]", "ddddii", "si", MergeNone, "aarch64_sve_sqrdcmlah_lane_x", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexCompRotate, 2>, ImmCheck<4, ImmCheckComplexRotAll90>]>; } @@ -1445,18 +1445,18 @@ def SVSQRDCMLAH_LANE_X : SInst<"svqrdcmlah_lane[_{d}]", "ddddii", "si", // SVE2 - Widening DSP operations multiclass SInstWideDSPAcc { - def : SInst; - def _N : SInst; + def : SInst; + def _N : SInst; } multiclass SInstWideDSPLong { - def : SInst; - def _N : SInst; + def : SInst; + def _N : SInst; } multiclass SInstWideDSPWide { - def : SInst; - def _N : SInst; + def : SInst; + def _N : SInst; } let TargetGuard = "sve2" in { @@ -1505,87 +1505,87 @@ defm SVSUBWB_U : SInstWideDSPWide<"svsubwb", "UsUiUl", "aarch64_sve_usubwb">; defm SVSUBWT_S : SInstWideDSPWide<"svsubwt", "sil", "aarch64_sve_ssubwt">; defm SVSUBWT_U : SInstWideDSPWide<"svsubwt", "UsUiUl", "aarch64_sve_usubwt">; -def SVSHLLB_S_N : SInst<"svshllb[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllb", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; -def SVSHLLB_U_N : SInst<"svshllb[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllb", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; -def SVSHLLT_S_N : SInst<"svshllt[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllt", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; -def SVSHLLT_U_N : SInst<"svshllt[_n_{d}]", "dhi", "UsUiUl", 
MergeNone, "aarch64_sve_ushllt", [], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; - -def SVMOVLB_S_N : SInst<"svmovlb[_{d}]", "dh", "sil", MergeNone>; -def SVMOVLB_U_N : SInst<"svmovlb[_{d}]", "dh", "UsUiUl", MergeNone>; -def SVMOVLT_S_N : SInst<"svmovlt[_{d}]", "dh", "sil", MergeNone>; -def SVMOVLT_U_N : SInst<"svmovlt[_{d}]", "dh", "UsUiUl", MergeNone>; - -def SVMLALB_S_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLALB_U_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLALT_S_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLALT_U_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLB_S_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLB_U_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLT_S_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMLSLT_U_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVMULLB_S_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullb_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVMULLB_U_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullb_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVMULLT_S_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullt_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVMULLT_U_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullt_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVQDMLALB_LANE : SInst<"svqdmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQDMLALT_LANE : SInst<"svqdmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQDMLSLB_LANE : SInst<"svqdmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQDMLSLT_LANE : SInst<"svqdmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; -def SVQDMULLB_LANE : SInst<"svqdmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullb_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; -def SVQDMULLT_LANE : SInst<"svqdmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullt_lane", [], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVSHLLB_S_N : SInst<"svshllb[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; +def SVSHLLB_U_N : SInst<"svshllb[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; +def SVSHLLT_S_N : SInst<"svshllt[_n_{d}]", "dhi", "sil", MergeNone, "aarch64_sve_sshllt", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; +def SVSHLLT_U_N : SInst<"svshllt[_n_{d}]", "dhi", "UsUiUl", MergeNone, "aarch64_sve_ushllt", 
[IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftLeft, 0>]>; + +def SVMOVLB_S_N : SInst<"svmovlb[_{d}]", "dh", "sil", MergeNone, "", [IsStreamingCompatible]>; +def SVMOVLB_U_N : SInst<"svmovlb[_{d}]", "dh", "UsUiUl", MergeNone, "", [IsStreamingCompatible]>; +def SVMOVLT_S_N : SInst<"svmovlt[_{d}]", "dh", "sil", MergeNone, "", [IsStreamingCompatible]>; +def SVMOVLT_U_N : SInst<"svmovlt[_{d}]", "dh", "UsUiUl", MergeNone, "", [IsStreamingCompatible]>; + +def SVMLALB_S_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALB_U_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALT_S_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLALT_U_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLB_S_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLB_U_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLT_S_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_smlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMLSLT_U_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "UiUl", MergeNone, "aarch64_sve_umlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVMULLB_S_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullb_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMULLB_U_LANE : SInst<"svmullb_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullb_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMULLT_S_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_smullt_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVMULLT_U_LANE : SInst<"svmullt_lane[_{d}]", "dhhi", "UiUl", MergeNone, "aarch64_sve_umullt_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVQDMLALB_LANE : SInst<"svqdmlalb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQDMLALT_LANE : SInst<"svqdmlalt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQDMLSLB_LANE : SInst<"svqdmlslb_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQDMLSLT_LANE : SInst<"svqdmlslt_lane[_{d}]", "ddhhi", "il", MergeNone, "aarch64_sve_sqdmlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>; +def SVQDMULLB_LANE : SInst<"svqdmullb_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullb_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; +def SVQDMULLT_LANE : SInst<"svqdmullt_lane[_{d}]", "dhhi", "il", MergeNone, "aarch64_sve_sqdmullt_lane", [IsStreamingCompatible], [ImmCheck<2, ImmCheckLaneIndex, 1>]>; } 
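The widening DSP group above follows the same pattern. As a hedged illustration (again not from the patch, and the helper name is invented), one of the newly streaming-compatible widening multiply-accumulates, including its _lane form whose constant index is what the ImmCheckLaneIndex entries above validate:

#include <arm_sve.h>

// Widen-and-accumulate: products of the even (bottom) 32-bit lanes of a
// and b are widened to 64 bits and added to acc. The lane form instead
// multiplies by one 32-bit element of each 128-bit segment of b.
svint64_t widen_mla(svint64_t acc, svint32_t a, svint32_t b)
    __arm_streaming_compatible {
  acc = svmlalb_s64(acc, a, b);           // vector-by-vector form
  return svmlalb_lane_s64(acc, a, b, 1);  // index must be constant, 0..3 here
}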
//////////////////////////////////////////////////////////////////////////////// // SVE2 - Narrowing DSP operations let TargetGuard = "sve2" in { -def SVADDHNB : SInst<"svaddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnb">; -def SVADDHNT : SInst<"svaddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnt">; -def SVRADDHNB : SInst<"svraddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb">; -def SVRADDHNT : SInst<"svraddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt">; -def SVRSUBHNB : SInst<"svrsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb">; -def SVRSUBHNT : SInst<"svrsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt">; -def SVSUBHNB : SInst<"svsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnb">; -def SVSUBHNT : SInst<"svsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnt">; - -def SVADDHNB_N : SInst<"svaddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_addhnb">; -def SVADDHNT_N : SInst<"svaddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_addhnt">; -def SVRADDHNB_N : SInst<"svraddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb">; -def SVRADDHNT_N : SInst<"svraddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt">; -def SVRSUBHNB_N : SInst<"svrsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb">; -def SVRSUBHNT_N : SInst<"svrsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt">; -def SVSUBHNB_N : SInst<"svsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_subhnb">; -def SVSUBHNT_N : SInst<"svsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_subhnt">; - -def SVSHRNB : SInst<"svshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVRSHRNB : SInst<"svrshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQSHRUNB : SInst<"svqshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqshrunb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQRSHRUNB : SInst<"svqrshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqrshrunb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQSHRNB_S : SInst<"svqshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQSHRNB_U : SInst<"svqshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQRSHRNB_S : SInst<"svqrshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqrshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; -def SVQRSHRNB_U : SInst<"svqrshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnb", [], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; - -def SVSHRNT : SInst<"svshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVRSHRNT : SInst<"svrshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQSHRUNT : SInst<"svqshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqshrunt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQRSHRUNT : SInst<"svqrshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqrshrunt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQSHRNT_S : SInst<"svqshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqshrnt", [], [ImmCheck<2, 
ImmCheckShiftRightNarrow, 1>]>; -def SVQSHRNT_U : SInst<"svqshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQRSHRNT_S : SInst<"svqrshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqrshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; -def SVQRSHRNT_U : SInst<"svqrshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnt", [], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVADDHNB : SInst<"svaddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnb", [IsStreamingCompatible]>; +def SVADDHNT : SInst<"svaddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_addhnt", [IsStreamingCompatible]>; +def SVRADDHNB : SInst<"svraddhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb", [IsStreamingCompatible]>; +def SVRADDHNT : SInst<"svraddhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt", [IsStreamingCompatible]>; +def SVRSUBHNB : SInst<"svrsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb", [IsStreamingCompatible]>; +def SVRSUBHNT : SInst<"svrsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt", [IsStreamingCompatible]>; +def SVSUBHNB : SInst<"svsubhnb[_{d}]", "hdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnb", [IsStreamingCompatible]>; +def SVSUBHNT : SInst<"svsubhnt[_{d}]", "hhdd", "silUsUiUl", MergeNone, "aarch64_sve_subhnt", [IsStreamingCompatible]>; + +def SVADDHNB_N : SInst<"svaddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_addhnb", [IsStreamingCompatible]>; +def SVADDHNT_N : SInst<"svaddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_addhnt", [IsStreamingCompatible]>; +def SVRADDHNB_N : SInst<"svraddhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnb", [IsStreamingCompatible]>; +def SVRADDHNT_N : SInst<"svraddhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_raddhnt", [IsStreamingCompatible]>; +def SVRSUBHNB_N : SInst<"svrsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnb", [IsStreamingCompatible]>; +def SVRSUBHNT_N : SInst<"svrsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_rsubhnt", [IsStreamingCompatible]>; +def SVSUBHNB_N : SInst<"svsubhnb[_n_{d}]", "hda", "silUsUiUl", MergeNone, "aarch64_sve_subhnb", [IsStreamingCompatible]>; +def SVSUBHNT_N : SInst<"svsubhnt[_n_{d}]", "hhda", "silUsUiUl", MergeNone, "aarch64_sve_subhnt", [IsStreamingCompatible]>; + +def SVSHRNB : SInst<"svshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVRSHRNB : SInst<"svrshrnb[_n_{d}]", "hdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQSHRUNB : SInst<"svqshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqshrunb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQRSHRUNB : SInst<"svqrshrunb[_n_{d}]", "edi", "sil", MergeNone, "aarch64_sve_sqrshrunb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQSHRNB_S : SInst<"svqshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQSHRNB_U : SInst<"svqshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQRSHRNB_S : SInst<"svqrshrnb[_n_{d}]", "hdi", "sil", MergeNone, "aarch64_sve_sqrshrnb", [IsStreamingCompatible], 
[ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; +def SVQRSHRNB_U : SInst<"svqrshrnb[_n_{d}]", "hdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnb", [IsStreamingCompatible], [ImmCheck<1, ImmCheckShiftRightNarrow, 0>]>; + +def SVSHRNT : SInst<"svshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_shrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVRSHRNT : SInst<"svrshrnt[_n_{d}]", "hhdi", "silUsUiUl", MergeNone, "aarch64_sve_rshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQSHRUNT : SInst<"svqshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqshrunt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQRSHRUNT : SInst<"svqrshrunt[_n_{d}]", "eedi", "sil", MergeNone, "aarch64_sve_sqrshrunt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQSHRNT_S : SInst<"svqshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQSHRNT_U : SInst<"svqshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQRSHRNT_S : SInst<"svqrshrnt[_n_{d}]", "hhdi", "sil", MergeNone, "aarch64_sve_sqrshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; +def SVQRSHRNT_U : SInst<"svqrshrnt[_n_{d}]", "hhdi", "UsUiUl", MergeNone, "aarch64_sve_uqrshrnt", [IsStreamingCompatible], [ImmCheck<2, ImmCheckShiftRightNarrow, 1>]>; } //////////////////////////////////////////////////////////////////////////////// // SVE2 - Unary narrowing operations let TargetGuard = "sve2" in { -def SVQXTNB_S : SInst<"svqxtnb[_{d}]", "hd", "sil", MergeNone, "aarch64_sve_sqxtnb">; -def SVQXTNB_U : SInst<"svqxtnb[_{d}]", "hd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnb">; -def SVQXTUNB_S : SInst<"svqxtunb[_{d}]", "ed", "sil", MergeNone, "aarch64_sve_sqxtunb">; +def SVQXTNB_S : SInst<"svqxtnb[_{d}]", "hd", "sil", MergeNone, "aarch64_sve_sqxtnb", [IsStreamingCompatible]>; +def SVQXTNB_U : SInst<"svqxtnb[_{d}]", "hd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnb", [IsStreamingCompatible]>; +def SVQXTUNB_S : SInst<"svqxtunb[_{d}]", "ed", "sil", MergeNone, "aarch64_sve_sqxtunb", [IsStreamingCompatible]>; -def SVQXTNT_S : SInst<"svqxtnt[_{d}]", "hhd", "sil", MergeNone, "aarch64_sve_sqxtnt">; -def SVQXTNT_U : SInst<"svqxtnt[_{d}]", "hhd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnt">; -def SVQXTUNT_S : SInst<"svqxtunt[_{d}]", "eed", "sil", MergeNone, "aarch64_sve_sqxtunt">; +def SVQXTNT_S : SInst<"svqxtnt[_{d}]", "hhd", "sil", MergeNone, "aarch64_sve_sqxtnt", [IsStreamingCompatible]>; +def SVQXTNT_U : SInst<"svqxtnt[_{d}]", "hhd", "UsUiUl", MergeNone, "aarch64_sve_uqxtnt", [IsStreamingCompatible]>; +def SVQXTUNT_S : SInst<"svqxtunt[_{d}]", "eed", "sil", MergeNone, "aarch64_sve_sqxtunt", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1726,18 +1726,18 @@ def SVSTNT1W_SCATTER_INDEX_S : MInst<"svstnt1w_scatter[_{2}base]_index[_{d}]", " // SVE2 - Polynomial arithmetic let TargetGuard = "sve2" in { -def SVEORBT : SInst<"sveorbt[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt">; -def SVEORBT_N : SInst<"sveorbt[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt">; -def SVEORTB : SInst<"sveortb[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb">; -def SVEORTB_N : SInst<"sveortb[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb">; 
-def SVPMUL : SInst<"svpmul[_{d}]", "ddd", "Uc", MergeNone, "aarch64_sve_pmul">;
-def SVPMUL_N : SInst<"svpmul[_n_{d}]", "dda", "Uc", MergeNone, "aarch64_sve_pmul">;
-def SVPMULLB : SInst<"svpmullb[_{d}]", "dhh", "UsUl", MergeNone>;
-def SVPMULLB_N : SInst<"svpmullb[_n_{d}]", "dhR", "UsUl", MergeNone>;
-def SVPMULLB_PAIR : SInst<"svpmullb_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullb_pair">;
-def SVPMULLB_PAIR_N : SInst<"svpmullb_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullb_pair">;
-def SVPMULLT : SInst<"svpmullt[_{d}]", "dhh", "UsUl", MergeNone>;
-def SVPMULLT_N : SInst<"svpmullt[_n_{d}]", "dhR", "UsUl", MergeNone>;
+def SVEORBT : SInst<"sveorbt[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt", [IsStreamingCompatible]>;
+def SVEORBT_N : SInst<"sveorbt[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eorbt", [IsStreamingCompatible]>;
+def SVEORTB : SInst<"sveortb[_{d}]", "dddd", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb", [IsStreamingCompatible]>;
+def SVEORTB_N : SInst<"sveortb[_n_{d}]", "ddda", "csilUcUsUiUl", MergeNone, "aarch64_sve_eortb", [IsStreamingCompatible]>;
+def SVPMUL : SInst<"svpmul[_{d}]", "ddd", "Uc", MergeNone, "aarch64_sve_pmul", [IsStreamingCompatible]>;
+def SVPMUL_N : SInst<"svpmul[_n_{d}]", "dda", "Uc", MergeNone, "aarch64_sve_pmul", [IsStreamingCompatible]>;
+def SVPMULLB : SInst<"svpmullb[_{d}]", "dhh", "UsUl", MergeNone, "", [IsStreamingCompatible]>;
+def SVPMULLB_N : SInst<"svpmullb[_n_{d}]", "dhR", "UsUl", MergeNone, "", [IsStreamingCompatible]>;
+def SVPMULLB_PAIR : SInst<"svpmullb_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullb_pair", [IsStreamingCompatible]>;
+def SVPMULLB_PAIR_N : SInst<"svpmullb_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullb_pair", [IsStreamingCompatible]>;
+def SVPMULLT : SInst<"svpmullt[_{d}]", "dhh", "UsUl", MergeNone, "", [IsStreamingCompatible]>;
+def SVPMULLT_N : SInst<"svpmullt[_n_{d}]", "dhR", "UsUl", MergeNone, "", [IsStreamingCompatible]>;
 def SVPMULLT_PAIR : SInst<"svpmullt_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullt_pair">;
 def SVPMULLT_PAIR_N : SInst<"svpmullt_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullt_pair">;
 }
@@ -1746,8 +1746,8 @@ def SVPMULLT_PAIR_N : SInst<"svpmullt_pair[_n_{d}]", "dda", "UcUi", Mer
 // SVE2 - Complex integer dot product
 
 let TargetGuard = "sve2" in {
-def SVCDOT : SInst<"svcdot[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_cdot", [], [ImmCheck<3, ImmCheckComplexRotAll90>]>;
-def SVCDOT_LANE : SInst<"svcdot_lane[_{d}]", "ddqqii", "il", MergeNone, "aarch64_sve_cdot_lane", [], [ImmCheck<4, ImmCheckComplexRotAll90>,
+def SVCDOT : SInst<"svcdot[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_cdot", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRotAll90>]>;
+def SVCDOT_LANE : SInst<"svcdot_lane[_{d}]", "ddqqii", "il", MergeNone, "aarch64_sve_cdot_lane", [IsStreamingCompatible], [ImmCheck<4, ImmCheckComplexRotAll90>,
                                                                          ImmCheck<3, ImmCheckLaneIndexDot, 2>]>;
 }
 
@@ -1755,27 +1755,27 @@ def SVCDOT_LANE : SInst<"svcdot_lane[_{d}]", "ddqqii", "il", MergeNone, "aarch
 // SVE2 - Floating-point widening multiply-accumulate
 
 let TargetGuard = "sve2" in {
-def SVMLALB_F : SInst<"svmlalb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalb">;
-def SVMLALB_F_N : SInst<"svmlalb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalb">;
-def SVMLALB_F_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLALT_F : SInst<"svmlalt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalt">;
-def SVMLALT_F_N : SInst<"svmlalt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalt">;
-def SVMLALT_F_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLSLB_F : SInst<"svmlslb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslb">;
-def SVMLSLB_F_N : SInst<"svmlslb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslb">;
-def SVMLSLB_F_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslb_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
-def SVMLSLT_F : SInst<"svmlslt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslt">;
-def SVMLSLT_F_N : SInst<"svmlslt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslt">;
-def SVMLSLT_F_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslt_lane", [], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLALB_F : SInst<"svmlalb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalb", [IsStreamingCompatible]>;
+def SVMLALB_F_N : SInst<"svmlalb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalb", [IsStreamingCompatible]>;
+def SVMLALB_F_LANE : SInst<"svmlalb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLALT_F : SInst<"svmlalt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlalt", [IsStreamingCompatible]>;
+def SVMLALT_F_N : SInst<"svmlalt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlalt", [IsStreamingCompatible]>;
+def SVMLALT_F_LANE : SInst<"svmlalt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlalt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLSLB_F : SInst<"svmlslb[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslb", [IsStreamingCompatible]>;
+def SVMLSLB_F_N : SInst<"svmlslb[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslb", [IsStreamingCompatible]>;
+def SVMLSLB_F_LANE : SInst<"svmlslb_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslb_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
+def SVMLSLT_F : SInst<"svmlslt[_{d}]", "ddhh", "f", MergeNone, "aarch64_sve_fmlslt", [IsStreamingCompatible]>;
+def SVMLSLT_F_N : SInst<"svmlslt[_n_{d}]", "ddhR", "f", MergeNone, "aarch64_sve_fmlslt", [IsStreamingCompatible]>;
+def SVMLSLT_F_LANE : SInst<"svmlslt_lane[_{d}]", "ddhhi", "f", MergeNone, "aarch64_sve_fmlslt_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndex, 2>]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Floating-point integer binary logarithm
 
 let TargetGuard = "sve2" in {
-def SVLOGB_M : SInst<"svlogb[_{d}]", "xxPd", "hfd", MergeOp1, "aarch64_sve_flogb">;
-def SVLOGB_X : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeAnyExp, "aarch64_sve_flogb">;
-def SVLOGB_Z : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeZeroExp, "aarch64_sve_flogb">;
+def SVLOGB_M : SInst<"svlogb[_{d}]", "xxPd", "hfd", MergeOp1, "aarch64_sve_flogb", [IsStreamingCompatible]>;
+def SVLOGB_X : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeAnyExp, "aarch64_sve_flogb", [IsStreamingCompatible]>;
+def SVLOGB_Z : SInst<"svlogb[_{d}]", "xPd", "hfd", MergeZeroExp, "aarch64_sve_flogb", [IsStreamingCompatible]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -1797,32 +1797,32 @@ def SVNMATCH : SInst<"svnmatch[_{d}]", "PPdd", "csUcUs", MergeNone, "aarch64_sve
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Contiguous conflict detection
 
 let TargetGuard = "sve2" in {
-def SVWHILERW_B : SInst<"svwhilerw[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilerw_b", [IsOverloadWhileRW]>;
-def SVWHILERW_H : SInst<"svwhilerw[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW]>;
-def SVWHILERW_S : SInst<"svwhilerw[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilerw_s", [IsOverloadWhileRW]>;
-def SVWHILERW_D : SInst<"svwhilerw[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilerw_d", [IsOverloadWhileRW]>;
+def SVWHILERW_B : SInst<"svwhilerw[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilerw_b", [IsOverloadWhileRW, IsStreamingCompatible]>;
+def SVWHILERW_H : SInst<"svwhilerw[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW, IsStreamingCompatible]>;
+def SVWHILERW_S : SInst<"svwhilerw[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilerw_s", [IsOverloadWhileRW, IsStreamingCompatible]>;
+def SVWHILERW_D : SInst<"svwhilerw[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilerw_d", [IsOverloadWhileRW, IsStreamingCompatible]>;
 
-def SVWHILEWR_B : SInst<"svwhilewr[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilewr_b", [IsOverloadWhileRW]>;
-def SVWHILEWR_H : SInst<"svwhilewr[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW]>;
-def SVWHILEWR_S : SInst<"svwhilewr[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilewr_s", [IsOverloadWhileRW]>;
-def SVWHILEWR_D : SInst<"svwhilewr[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilewr_d", [IsOverloadWhileRW]>;
+def SVWHILEWR_B : SInst<"svwhilewr[_{1}]", "Pcc", "cUc", MergeNone, "aarch64_sve_whilewr_b", [IsOverloadWhileRW, IsStreamingCompatible]>;
+def SVWHILEWR_H : SInst<"svwhilewr[_{1}]", "Pcc", "sUsh", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW, IsStreamingCompatible]>;
+def SVWHILEWR_S : SInst<"svwhilewr[_{1}]", "Pcc", "iUif", MergeNone, "aarch64_sve_whilewr_s", [IsOverloadWhileRW, IsStreamingCompatible]>;
+def SVWHILEWR_D : SInst<"svwhilewr[_{1}]", "Pcc", "lUld", MergeNone, "aarch64_sve_whilewr_d", [IsOverloadWhileRW, IsStreamingCompatible]>;
 }
 
 let TargetGuard = "sve2,bf16" in {
-def SVWHILERW_H_BF16 : SInst<"svwhilerw[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW]>;
-def SVWHILEWR_H_BF16 : SInst<"svwhilewr[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW]>;
+def SVWHILERW_H_BF16 : SInst<"svwhilerw[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilerw_h", [IsOverloadWhileRW, IsStreamingCompatible]>;
+def SVWHILEWR_H_BF16 : SInst<"svwhilewr[_{1}]", "Pcc", "b", MergeNone, "aarch64_sve_whilewr_h", [IsOverloadWhileRW, IsStreamingCompatible]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // SVE2 - Extended table lookup/permute
 
 let TargetGuard = "sve2" in {
-def SVTBL2 : SInst<"svtbl2[_{d}]", "d2u", "csilUcUsUiUlhfd", MergeNone>;
-def SVTBX : SInst<"svtbx[_{d}]", "dddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbx">;
+def SVTBL2 : SInst<"svtbl2[_{d}]", "d2u", "csilUcUsUiUlhfd", MergeNone, "", [IsStreamingCompatible]>;
+def SVTBX : SInst<"svtbx[_{d}]", "dddu", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbx", [IsStreamingCompatible]>;
 }
 
 let TargetGuard = "sve2,bf16" in {
-def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone>;
-def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx">;
+def SVTBL2_BF16 : SInst<"svtbl2[_{d}]", "d2u", "b", MergeNone, "", [IsStreamingCompatible]>;
+def SVTBX_BF16 : SInst<"svtbx[_{d}]", "dddu", "b", MergeNone, "aarch64_sve_tbx", [IsStreamingCompatible]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -1868,3 +1868,15 @@ let TargetGuard = "sve2p1" in {
 def SVSCLAMP : SInst<"svclamp[_{d}]", "dddd", "csil", MergeNone, "aarch64_sve_sclamp", [], []>;
 def SVUCLAMP : SInst<"svclamp[_{d}]", "dddd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp", [], []>;
 }
+
+////////////////////////////////////////////////////////////////////////////////
+// SME2
+
+// SME intrinsics which operate only on vectors and do not require ZA should be added here,
+// as they could possibly become SVE instructions in the future.
+
+let TargetGuard = "sme2" in {
+// == ADD (vectors) ==
+  def SVADD_SINGLE_X2 : SInst<"svadd[_single_{d}_x2]", "22d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x2", [IsStreaming], []>;
+  def SVADD_SINGLE_X4 : SInst<"svadd[_single_{d}_x4]", "44d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x4", [IsStreaming], []>;
+}
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td
index 74c9b9266771b02920e0a30b0020eae1c9ed5921..75c1f05e1f2ff8b37aa0e15626d6a0059ee73066 100644
--- a/clang/include/clang/Basic/arm_sve_sme_incl.td
+++ b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -218,10 +218,11 @@ def ReverseMergeAnyBinOp : FlagType<0x800000000>; // e.g. Implement SUBR_X
 def ReverseMergeAnyAccOp : FlagType<0x1000000000>; // e.g. Implement MSB_X using MLS_X.
 def IsStreaming : FlagType<0x2000000000>;
 def IsStreamingCompatible : FlagType<0x4000000000>;
-def IsSharedZA : FlagType<0x8000000000>;
-def IsPreservesZA : FlagType<0x10000000000>;
-def IsReadZA : FlagType<0x20000000000>;
-def IsWriteZA : FlagType<0x40000000000>;
+def IsReadZA : FlagType<0x8000000000>;
+def IsWriteZA : FlagType<0x10000000000>;
+def IsInZA : FlagType<0x20000000000>;
+def IsOutZA : FlagType<0x40000000000>;
+def IsInOutZA : FlagType<0x80000000000>;
 
 // These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h
 class ImmCheckType<int val> {
   int Value = val;
 }
@@ -254,7 +255,7 @@ class ImmCheck<int arg, ImmCheckType kind, int eltSizeArg = -1> {
 }
 
 class Inst<string n, string p, string t, MergeType mt, string i,
-           list<FlagType> ft, list<ImmCheck> ch, MemEltType met> {
+           list<FlagType> ft, list<ImmCheck> ch, MemEltType met = MemEltTyDefault> {
   string Name = n;
   string Prototype = p;
   string Types = t;
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 475dfe845528d9902ced70d97a31a49044a952b4..43e3a9d4d3983658dcc1eff3a08a3ba6a7e554c4 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -2790,6 +2790,13 @@ private:
   /// clang accepts as an extension.
   void DiagnoseCXX11AttributeExtension(ParsedAttributes &Attrs);
 
+  ExprResult ParseUnevaluatedStringInAttribute(const IdentifierInfo &AttrName);
+
+  bool
+  ParseAttributeArgumentList(const clang::IdentifierInfo &AttrName,
+                             SmallVectorImpl<Expr *> &Exprs,
+                             ParsedAttributeArgumentsProperties ArgsProperties);
+
   /// Parses syntax-generic attribute arguments for attributes which are
   /// known to the implementation, and adds them to the given ParsedAttributes
   /// list with the given attribute syntax. Returns the number of arguments
diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h
index 592580bccd234fbf447591fd50eae8a9311bd8b0..8c0edca1ebc5eeee613cbfb5e960d4f0e47d3ea2 100644
--- a/clang/include/clang/Sema/ParsedAttr.h
+++ b/clang/include/clang/Sema/ParsedAttr.h
@@ -24,6 +24,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/VersionTuple.h"
+#include <bitset>
 #include <cassert>
 #include <cstddef>
 #include <cstring>
@@ -911,6 +912,20 @@ private:
   VecTy AttrList;
 };
 
+struct ParsedAttributeArgumentsProperties {
+  ParsedAttributeArgumentsProperties(uint32_t StringLiteralBits)
+      : StringLiterals(StringLiteralBits) {}
+  bool isStringLiteralArg(unsigned I) const {
+    // If the last bit is set, assume we have a variadic parameter
+    if (I >= StringLiterals.size())
+      return StringLiterals.test(StringLiterals.size() - 1);
+    return StringLiterals.test(I);
+  }
+
+private:
+  std::bitset<32> StringLiterals;
+};
+
 /// ParsedAttributes - A collection of parsed attributes. Currently
 /// we don't differentiate between the various attribute syntaxes,
 /// which is basically silly.
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index b2ab6d0f8445728489e4c6423440a256d4e2b3f6..86cee3c728dc160fe5024cf54ae778cfee1cad70 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -7076,6 +7076,8 @@ public:
                                     NestedNameSpecInfo &IdInfo,
                                     bool EnteringContext);
 
+  bool IsInvalidSMECallConversion(QualType FromType, QualType ToType);
+
   /// The parser has parsed a nested-name-specifier
   /// 'template[opt] template-name < template-args >::'.
   ///
@@ -13604,6 +13606,9 @@ private:
                                     CallExpr *TheCall);
   bool CheckMVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
   bool CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
+  bool ParseSVEImmChecks(CallExpr *TheCall,
+                         SmallVector<std::tuple<int, int, int>, 3> &ImmChecks);
+  bool CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
   bool CheckCDEBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
                                    CallExpr *TheCall);
   bool CheckARMCoprocessorImmediate(const TargetInfo &TI, const Expr *CoprocArg,
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 99c859034423bbb7d54b318f40e4432984bf9c86..f0ab7b303541c4ddae417fcfa099cefcdc901344 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -3389,12 +3389,19 @@ FunctionProtoType::FunctionProtoType(QualType result, ArrayRef<QualType> params,
     argSlot[i] = params[i];
   }
 
+  // Propagate the SME ACLE attributes.
+  if (epi.AArch64SMEAttributes != SME_NormalFunction) {
+    auto &ExtraBits = *getTrailingObjects<FunctionTypeExtraBitfields>();
+    assert(epi.AArch64SMEAttributes <= SME_AttributeMask &&
+           "Not enough bits to encode SME attributes");
+    ExtraBits.AArch64SMEAttributes = epi.AArch64SMEAttributes;
+  }
+
   // Fill in the exception type array if present.
   if (getExceptionSpecType() == EST_Dynamic) {
     auto &ExtraBits = *getTrailingObjects<FunctionTypeExtraBitfields>();
     size_t NumExceptions = epi.ExceptionSpec.Exceptions.size();
-    assert(NumExceptions <= UINT16_MAX &&
-           "Not enough bits to encode exceptions");
+    assert(NumExceptions <= 1023 && "Not enough bits to encode exceptions");
     ExtraBits.NumExceptionType = NumExceptions;
 
     assert(hasExtraBitfields() && "missing trailing extra bitfields!");
@@ -3551,8 +3558,11 @@ void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID, QualType Result,
   // This is followed by an optional "consumed argument" section of the
   // same length as the first type sequence:
   //      bool*
-  // Finally, we have the ext info and trailing return type flag:
-  //      int bool
+  // This is followed by the ext info:
+  //      int
+  // Finally we have a trailing return type flag (bool)
+  // combined with AArch64 SME Attributes, to save space:
+  //      int
   //
   // There is no ambiguity between the consumed arguments and an empty EH
   // spec because of the leading 'bool' which unambiguously indicates
@@ -3585,8 +3595,9 @@ void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID, QualType Result,
     for (unsigned i = 0; i != NumParams; ++i)
       ID.AddInteger(epi.ExtParameterInfos[i].getOpaqueValue());
   }
+
   epi.ExtInfo.Profile(ID);
-  ID.AddBoolean(epi.HasTrailingReturn);
+  ID.AddInteger((epi.AArch64SMEAttributes << 1) | epi.HasTrailingReturn);
 }
 
 void FunctionProtoType::Profile(llvm::FoldingSetNodeID &ID,
diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp
index 1b62f6630928c18bef1738bd446535e3eeed9750..31e6e56e7f693c99e6d51b21c5aafe0289732ba6 100644
--- a/clang/lib/AST/TypePrinter.cpp
+++ b/clang/lib/AST/TypePrinter.cpp
@@ -937,6 +937,20 @@ void TypePrinter::printFunctionProtoAfter(const FunctionProtoType *T,
   OS << ')';
 
   FunctionType::ExtInfo Info = T->getExtInfo();
+  unsigned SMEBits = T->getAArch64SMEAttributes();
+
+  if (SMEBits & FunctionType::SME_PStateSMCompatibleMask)
+    OS << " __arm_streaming_compatible";
+  if (SMEBits & FunctionType::SME_PStateSMEnabledMask)
+    OS << " __arm_streaming";
+  if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Preserves)
+    OS << " __arm_preserves(\"za\")";
+  if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_In)
+    OS << " __arm_in(\"za\")";
+  if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Out)
+    OS << " __arm_out(\"za\")";
+  if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_InOut)
+    OS << " __arm_inout(\"za\")";
 
   printFunctionAfter(Info, OS);
 
@@ -1772,6 +1786,10 @@ void TypePrinter::printAttributedAfter(const AttributedType *T,
     OS << "__arm_streaming";
     return;
   }
+  if (T->getAttrKind() == attr::ArmStreamingCompatible) {
+    OS << "__arm_streaming_compatible";
+    return;
+  }
 
   OS << " __attribute__((";
   switch (T->getAttrKind()) {
@@ -1814,6 +1832,11 @@ void TypePrinter::printAttributedAfter(const AttributedType *T,
   case attr::AnnotateType:
   case attr::WebAssemblyFuncref:
   case attr::ArmStreaming:
+  case attr::ArmStreamingCompatible:
+  case attr::ArmIn:
+  case attr::ArmOut:
+  case attr::ArmInOut:
+  case attr::ArmPreserves:
     llvm_unreachable("This attribute should have been handled already");
 
   case attr::NSReturnsRetained:
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 0889262b0c9ce839c3c1cfcd958a324c30f288e7..619677c1c1a491bccd191040f53a43140b677dbd 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -374,6 +374,11 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
   Builder.defineMacro("__ARM_ALIGN_MAX_STACK_PWR", "4");
 
+  // These macros are set when Clang can parse declarations with these
+  // attributes.
+  Builder.defineMacro("__ARM_STATE_ZA", "1");
+  Builder.defineMacro("__ARM_STATE_ZT0", "1");
+
   // 0xe implies support for half, single and double precision operations.
   if (FPU & FPUMode)
     Builder.defineMacro("__ARM_FP", "0xE");
@@ -418,6 +423,17 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
   if (HasSVE2 && HasSVE2SM4)
     Builder.defineMacro("__ARM_FEATURE_SVE2_SM4", "1");
 
+  if (HasSME) {
+    Builder.defineMacro("__ARM_FEATURE_SME");
+    Builder.defineMacro("__ARM_FEATURE_LOCALLY_STREAMING", "1");
+  }
+
+  if (HasSME2) {
+    Builder.defineMacro("__ARM_FEATURE_SME");
+    Builder.defineMacro("__ARM_FEATURE_SME2");
+    Builder.defineMacro("__ARM_FEATURE_LOCALLY_STREAMING", "1");
+  }
+
   if (HasCRC)
     Builder.defineMacro("__ARM_FEATURE_CRC32", "1");
 
@@ -670,6 +686,7 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const {
       .Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3)
       .Case("sve2-sm4", FPU & SveMode && HasSVE2SM4)
       .Case("sme", HasSME)
+      .Case("sme2", HasSME2)
      .Case("sme-f64f64", HasSMEF64F64)
      .Case("sme-i16i64", HasSMEI16I64)
      .Case("sme-fa64", HasSMEFA64)
@@ -790,6 +807,12 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasBFloat16 = true;
       HasFullFP16 = true;
     }
+    if (Feature == "+sme2") {
+      HasSME = true;
+      HasSME2 = true;
+      HasBFloat16 = true;
+      HasFullFP16 = true;
+    }
     if (Feature == "+sme-f64f64") {
       HasSME = true;
       HasSMEF64F64 = true;
@@ -1146,6 +1169,8 @@ TargetInfo::BuiltinVaListKind AArch64TargetInfo::getBuiltinVaListKind() const {
 }
 
 const char *const AArch64TargetInfo::GCCRegNames[] = {
+    // clang-format off
+
     // 32-bit Integer registers
     "w0", "w1", "w2", "w3", "w4", "w5", "w6", "w7", "w8", "w9", "w10", "w11",
     "w12", "w13", "w14", "w15", "w16", "w17", "w18", "w19", "w20", "w21", "w22",
@@ -1182,7 +1207,12 @@ const char *const AArch64TargetInfo::GCCRegNames[] = {
 
     // SVE predicate-as-counter registers
     "pn0", "pn1", "pn2", "pn3", "pn4", "pn5", "pn6", "pn7", "pn8",
-    "pn9", "pn10", "pn11", "pn12", "pn13", "pn14", "pn15"
+    "pn9", "pn10", "pn11", "pn12", "pn13", "pn14", "pn15",
+
+    // SME registers
+    "za", "zt0",
+
+    // clang-format on
 };
 
 ArrayRef<const char *> AArch64TargetInfo::getGCCRegNames() const {
@@ -1302,8 +1332,15 @@ bool AArch64TargetInfo::validateAsmConstraint(
       Info.setAllowsRegister();
     return true;
   case 'U':
-    if (Name[1] == 'p' && (Name[2] == 'l' || Name[2] == 'a')) {
-      // SVE predicate registers ("Upa"=P0-15, "Upl"=P0-P7)
+    if (Name[1] == 'p' &&
+        (Name[2] == 'l' || Name[2] == 'a' || Name[2] == 'h')) {
+      // SVE predicate registers ("Upa"=P0-15, "Upl"=P0-P7, "Uph"=P8-P15)
+      Info.setAllowsRegister();
+      Name += 2;
+      return true;
+    }
+    if (Name[1] == 'c' && (Name[2] == 'i' || Name[2] == 'j')) {
+      // Gpr registers ("Uci"=w8-11, "Ucj"=w12-15)
       Info.setAllowsRegister();
       Name += 2;
       return true;
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index 76d71aca76bd2276fd4489f36a7d26ea015ee77a..77040949fcbec24dfbb4bd3c0adec00f674ac250 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -68,6 +68,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
   bool HasCCDP = false;
   bool HasFRInt3264 = false;
   bool HasSME = false;
+  bool HasSME2 = false;
   bool HasSMEF64F64 = false;
   bool HasSMEI16I64 = false;
   bool HasSB = false;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 8f87c4d461090ef96d36b23f5f8caccfc6af48f5..eff67c6ff7cc92024687f8a0460ec5aa4a86a129 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9126,13 +9126,6 @@ Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags,
   auto *OverloadedTy =
       llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), ResultTy);
 
-  // At the ACLE level there's only one predicate type, svbool_t, which is
-  // mapped to <vscale x 16 x i1>. However, this might be incompatible with the
-  // actual type being loaded. For example, when loading doubles (i64) the
-  // predicated should be <vscale x 2 x i1> instead. At the IR level the type of
-  // the predicate and the data being loaded must match. Cast accordingly.
-  Ops[0] = EmitSVEPredicateCast(Ops[0], OverloadedTy);
-
   Function *F = nullptr;
   if (Ops[1]->getType()->isVectorTy())
     // This is the "vector base, scalar offset" case. In order to uniquely
@@ -9146,6 +9139,16 @@ Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags,
     // intrinsic.
     F = CGM.getIntrinsic(IntID, OverloadedTy);
 
+  // At the ACLE level there's only one predicate type, svbool_t, which is
+  // mapped to <vscale x 16 x i1>. However, this might be incompatible with the
+  // actual type being loaded. For example, when loading doubles (i64) the
+  // predicate should be <vscale x 2 x i1> instead. At the IR level the type of
+  // the predicate and the data being loaded must match. Cast to the type
+  // expected by the intrinsic. The intrinsic itself should be defined in
+  // a way that enforces relations between parameter types.
+  Ops[0] = EmitSVEPredicateCast(
+      Ops[0], cast<llvm::ScalableVectorType>(F->getArg(0)->getType()));
+
   // Pass 0 when the offset is missing. This can only be applied when using
   // the "vector base" addressing mode for which ACLE allows no offset. The
   // corresponding LLVM IR always requires an offset.
@@ -9210,8 +9213,11 @@ Value *CodeGenFunction::EmitSVEScatterStore(const SVETypeFlags &TypeFlags,
   // mapped to <vscale x 16 x i1>. However, this might be incompatible with the
   // actual type being stored. For example, when storing doubles (i64) the
   // predicated should be <vscale x 2 x i1> instead. At the IR level the type of
-  // the predicate and the data being stored must match. Cast accordingly.
-  Ops[1] = EmitSVEPredicateCast(Ops[1], OverloadedTy);
+  // the predicate and the data being stored must match. Cast to the type
+  // expected by the intrinsic. The intrinsic itself should be defined in
+  // a way that enforces relations between parameter types.
+  Ops[1] = EmitSVEPredicateCast(
+      Ops[1], cast<llvm::ScalableVectorType>(F->getArg(1)->getType()));
 
   // For "vector base, scalar index" scale the index so that it becomes a
   // scalar offset.
@@ -9327,23 +9333,19 @@ Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags,
   Value *BasePtr = Builder.CreateBitCast(Ops[1], VecPtrTy);
 
   // Does the store have an offset?
-  if (Ops.size() > 3)
+  if (Ops.size() > (2 + N))
     BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]);
 
   BasePtr = Builder.CreateBitCast(BasePtr, EltPtrTy);
-  Value *Val = Ops.back();
-
+
   // The llvm.aarch64.sve.st2/3/4 intrinsics take legal part vectors, so we
   // need to break up the tuple vector.
   SmallVector<llvm::Value *, 5> Operands;
-  unsigned MinElts = VTy->getElementCount().getKnownMinValue();
-  for (unsigned I = 0; I < N; ++I) {
-    Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts);
-    Operands.push_back(Builder.CreateExtractVector(VTy, Val, Idx));
-  }
+  for (unsigned I = Ops.size() - N; I < Ops.size(); ++I)
+    Operands.push_back(Ops[I]);
   Operands.append({Predicate, BasePtr});
-
   Function *F = CGM.getIntrinsic(IntID, { VTy });
+
   return Builder.CreateCall(F, Operands);
 }
 
@@ -9456,59 +9458,49 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
   return Store;
 }
 
-Value *CodeGenFunction::EmitTileslice(Value *Offset, Value *Base) {
-  llvm::Value *CastOffset = Builder.CreateIntCast(Offset, Int32Ty, false);
-  return Builder.CreateAdd(Base, CastOffset, "tileslice");
-}
-
-Value *CodeGenFunction::EmitSMELd1St1(SVETypeFlags TypeFlags,
+Value *CodeGenFunction::EmitSMELd1St1(const SVETypeFlags &TypeFlags,
                                       SmallVectorImpl<Value *> &Ops,
                                       unsigned IntID) {
-  Ops[3] = EmitSVEPredicateCast(
-      Ops[3], getSVEVectorForElementType(SVEBuiltinMemEltTy(TypeFlags)));
+  Ops[2] = EmitSVEPredicateCast(
+      Ops[2], getSVEVectorForElementType(SVEBuiltinMemEltTy(TypeFlags)));
 
   SmallVector<Value *> NewOps;
-  NewOps.push_back(Ops[3]);
+  NewOps.push_back(Ops[2]);
 
-  llvm::Value *BasePtr = Ops[4];
+  llvm::Value *BasePtr = Ops[3];
   // If the intrinsic contains the vnum parameter, multiply it with the vector
   // size in bytes.
-  if (Ops.size() == 6) {
+  if (Ops.size() == 5) {
     Function *StreamingVectorLength =
         CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb);
     llvm::Value *StreamingVectorLengthCall =
         Builder.CreateCall(StreamingVectorLength);
     llvm::Value *Mulvl =
-        Builder.CreateMul(StreamingVectorLengthCall, Ops[5], "mulvl");
+        Builder.CreateMul(StreamingVectorLengthCall, Ops[4], "mulvl");
     // The type of the ptr parameter is void *, so use Int8Ty here.
-    BasePtr = Builder.CreateGEP(Int8Ty, Ops[4], Mulvl);
+    BasePtr = Builder.CreateGEP(Int8Ty, Ops[3], Mulvl);
   }
   NewOps.push_back(BasePtr);
   NewOps.push_back(Ops[0]);
-  NewOps.push_back(EmitTileslice(Ops[2], Ops[1]));
+  NewOps.push_back(Ops[1]);
   Function *F = CGM.getIntrinsic(IntID);
   return Builder.CreateCall(F, NewOps);
 }
 
-Value *CodeGenFunction::EmitSMEReadWrite(SVETypeFlags TypeFlags,
+Value *CodeGenFunction::EmitSMEReadWrite(const SVETypeFlags &TypeFlags,
                                          SmallVectorImpl<Value *> &Ops,
                                          unsigned IntID) {
   auto *VecTy = getSVEType(TypeFlags);
   Function *F = CGM.getIntrinsic(IntID, VecTy);
-  if (TypeFlags.isReadZA()) {
+  if (TypeFlags.isReadZA())
     Ops[1] = EmitSVEPredicateCast(Ops[1], VecTy);
-    Ops[3] = EmitTileslice(Ops[4], Ops[3]);
-    Ops.erase(&Ops[4]);
-  } else if (TypeFlags.isWriteZA()) {
-    Ops[1] = EmitTileslice(Ops[2], Ops[1]);
-    Ops[2] = EmitSVEPredicateCast(Ops[3], VecTy);
-    Ops.erase(&Ops[3]);
-  }
+  else if (TypeFlags.isWriteZA())
+    Ops[2] = EmitSVEPredicateCast(Ops[2], VecTy);
   return Builder.CreateCall(F, Ops);
 }
 
-Value *CodeGenFunction::EmitSMEZero(SVETypeFlags TypeFlags,
+Value *CodeGenFunction::EmitSMEZero(const SVETypeFlags &TypeFlags,
                                     SmallVectorImpl<Value *> &Ops,
                                     unsigned IntID) {
   // svzero_za() intrinsic zeros the entire za tile and has no parameters.
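A brief aside on what the EmitSMELd1St1 and EmitSMEZero paths above lower. This sketch is illustrative only and is not part of the patch; it assumes the svzero_za and svld1_hor_za8 intrinsics from arm_sme.h with their documented ACLE signatures:

    #include <arm_sme.h>

    // Zero all of ZA, then load one horizontal slice of tile 0.
    // svzero_za() maps to llvm.aarch64.sme.zero; svld1_hor_za8 maps to
    // llvm.aarch64.sme.ld1b.horiz. With the EmitTileslice helper removed,
    // the slice index is now passed through to the intrinsic unchanged.
    void kernel(const void *p, svbool_t pg, uint32_t slice)
        __arm_streaming __arm_inout("za") {
      svzero_za();
      svld1_hor_za8(0, slice, pg, p);
    }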
@@ -9518,18 +9510,13 @@ Value *CodeGenFunction::EmitSMEZero(SVETypeFlags TypeFlags,
   return Builder.CreateCall(F, Ops);
 }
 
-Value *CodeGenFunction::EmitSMELdrStr(SVETypeFlags TypeFlags,
+Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags,
                                       SmallVectorImpl<Value *> &Ops,
                                       unsigned IntID) {
-  Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb);
-  llvm::Value *CntsbCall = Builder.CreateCall(Cntsb, {}, "svlb");
-  llvm::Value *MulVL = Builder.CreateMul(
-      CntsbCall,
-      Builder.getInt64(cast<llvm::ConstantInt>(Ops[1])->getZExtValue()),
-      "mulvl");
-  Ops[2] = Builder.CreateGEP(Int8Ty, Ops[2], MulVL);
-  Ops[0] = EmitTileslice(Ops[1], Ops[0]);
-  Ops.erase(&Ops[1]);
+  if (Ops.size() == 2)
+    Ops.push_back(Builder.getInt32(0));
+  else
+    Ops[2] = Builder.CreateIntCast(Ops[2], Int32Ty, true);
   Function *F = CGM.getIntrinsic(IntID, {});
   return Builder.CreateCall(F, Ops);
 }
@@ -9622,26 +9609,24 @@ Value *CodeGenFunction::EmitSVETupleCreate(const SVETypeFlags &TypeFlags,
   return Call;
 }
 
-Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
-                                                  const CallExpr *E) {
+void CodeGenFunction::GetAArch64SVEProcessedOperands(
+    unsigned BuiltinID, const CallExpr *E, SmallVectorImpl<Value *> &Ops,
+    SVETypeFlags TypeFlags) {
   // Find out if any arguments are required to be integer constant expressions.
   unsigned ICEArguments = 0;
   ASTContext::GetBuiltinTypeError Error;
   getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
   assert(Error == ASTContext::GE_None && "Should not codegen an error");
 
-  llvm::Type *Ty = ConvertType(E->getType());
-  if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
-      BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) {
-    Value *Val = EmitScalarExpr(E->getArg(0));
-    return EmitSVEReinterpret(Val, Ty);
-  }
+  // Tuple set/get only requires one insert/extract vector, which is
+  // created by EmitSVETupleSetOrGet.
+  bool IsTupleGetOrSet = TypeFlags.isTupleSet() || TypeFlags.isTupleGet();
 
-  llvm::SmallVector<Value *, 4> Ops;
   for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
-    if ((ICEArguments & (1 << i)) == 0)
-      Ops.push_back(EmitScalarExpr(E->getArg(i)));
-    else {
+    bool IsICE = ICEArguments & (1 << i);
+    Value *Arg = EmitScalarExpr(E->getArg(i));
+
+    if (IsICE) {
       // If this is required to be a constant, constant fold it so that we know
       // that the generated intrinsic gets a ConstantInt.
       std::optional<llvm::APSInt> Result =
@@ -9653,12 +9638,49 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
       // immediate requires more than a handful of bits.
       *Result = Result->extOrTrunc(32);
       Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), *Result));
+      continue;
+    }
+
+    if (IsTupleGetOrSet || !isa<ScalableVectorType>(Arg->getType())) {
+      Ops.push_back(Arg);
+      continue;
+    }
+
+    auto *VTy = cast<ScalableVectorType>(Arg->getType());
+    unsigned MinElts = VTy->getMinNumElements();
+    bool IsPred = VTy->getElementType()->isIntegerTy(1);
+    unsigned N = (MinElts * VTy->getScalarSizeInBits()) / (IsPred ? 16 : 128);
+
+    if (N == 1) {
+      Ops.push_back(Arg);
+      continue;
+    }
+
+    for (unsigned I = 0; I < N; ++I) {
+      Value *Idx = ConstantInt::get(CGM.Int64Ty, (I * MinElts) / N);
+      auto *NewVTy =
+          ScalableVectorType::get(VTy->getElementType(), MinElts / N);
+      Ops.push_back(Builder.CreateExtractVector(NewVTy, Arg, Idx));
     }
   }
+}
+
+Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
+                                                  const CallExpr *E) {
+  llvm::Type *Ty = ConvertType(E->getType());
+  if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
+      BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64) {
+    Value *Val = EmitScalarExpr(E->getArg(0));
+    return EmitSVEReinterpret(Val, Ty);
+  }
 
   auto *Builtin = findARMVectorIntrinsicInMap(AArch64SVEIntrinsicMap, BuiltinID,
                                               AArch64SVEIntrinsicsProvenSorted);
+
+  llvm::SmallVector<Value *, 4> Ops;
   SVETypeFlags TypeFlags(Builtin->TypeModifier);
+  GetAArch64SVEProcessedOperands(BuiltinID, E, Ops, TypeFlags);
+
   if (TypeFlags.isLoad())
     return EmitSVEMaskedLoad(E, Ty, Ops, Builtin->LLVMIntrinsic,
                              TypeFlags.isZExtReturn());
@@ -9672,14 +9694,14 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
     return EmitSVEPrefetchLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
   else if (TypeFlags.isGatherPrefetch())
     return EmitSVEGatherPrefetch(TypeFlags, Ops, Builtin->LLVMIntrinsic);
-  else if (TypeFlags.isStructLoad())
-    return EmitSVEStructLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
-  else if (TypeFlags.isStructStore())
-    return EmitSVEStructStore(TypeFlags, Ops, Builtin->LLVMIntrinsic);
+  else if (TypeFlags.isStructLoad())
+    return EmitSVEStructLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
+  else if (TypeFlags.isStructStore())
+    return EmitSVEStructStore(TypeFlags, Ops, Builtin->LLVMIntrinsic);
   else if (TypeFlags.isTupleSet() || TypeFlags.isTupleGet())
-    return EmitSVETupleSetOrGet(TypeFlags, Ty, Ops);
+    return EmitSVETupleSetOrGet(TypeFlags, Ty, Ops);
   else if (TypeFlags.isTupleCreate())
-    return EmitSVETupleCreate(TypeFlags, Ty, Ops);
+    return EmitSVETupleCreate(TypeFlags, Ty, Ops);
   else if (TypeFlags.isUndef())
     return UndefValue::get(Ty);
   else if (Builtin->LLVMIntrinsic != 0) {
@@ -9898,13 +9920,8 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
   case SVE::BI__builtin_sve_svtbl2_f64: {
     SVETypeFlags TF(Builtin->TypeModifier);
     auto VTy = cast<llvm::ScalableVectorType>(getSVEType(TF));
-    Value *V0 = Builder.CreateExtractVector(VTy, Ops[0],
-                                            ConstantInt::get(CGM.Int64Ty, 0));
-    unsigned MinElts = VTy->getMinNumElements();
-    Value *V1 = Builder.CreateExtractVector(
-        VTy, Ops[0], ConstantInt::get(CGM.Int64Ty, MinElts));
     Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_tbl2, VTy);
-    return Builder.CreateCall(F, {V0, V1, Ops[1]});
+    return Builder.CreateCall(F, Ops);
   }
 
   case SVE::BI__builtin_sve_svset_neonq_s8:
@@ -9962,35 +9979,13 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
 
 Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
                                                   const CallExpr *E) {
-  // Find out if any arguments are required to be integer constant expressions.
-  unsigned ICEArguments = 0;
-  ASTContext::GetBuiltinTypeError Error;
-  getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
-  assert(Error == ASTContext::GE_None && "Should not codegen an error");
-
-  llvm::Type *Ty = ConvertType(E->getType());
-  llvm::SmallVector<Value *, 4> Ops;
-  for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
-    if ((ICEArguments & (1 << i)) == 0)
-      Ops.push_back(EmitScalarExpr(E->getArg(i)));
-    else {
-      // If this is required to be a constant, constant fold it so that we know
-      // that the generated intrinsic gets a ConstantInt.
-      std::optional<llvm::APSInt> Result =
-          E->getArg(i)->getIntegerConstantExpr(getContext());
-      assert(Result && "Expected argument to be a constant");
-
-      // Immediates for SVE llvm intrinsics are always 32bit. We can safely
-      // truncate because the immediate has been range checked and no valid
-      // immediate requires more than a handful of bits.
-      *Result = Result->extOrTrunc(32);
-      Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), *Result));
-    }
-  }
-
   auto *Builtin = findARMVectorIntrinsicInMap(AArch64SMEIntrinsicMap, BuiltinID,
                                               AArch64SMEIntrinsicsProvenSorted);
+
+  llvm::SmallVector<Value *, 4> Ops;
   SVETypeFlags TypeFlags(Builtin->TypeModifier);
+  GetAArch64SVEProcessedOperands(BuiltinID, E, Ops, TypeFlags);
+
   if (TypeFlags.isLoad() || TypeFlags.isStore())
     return EmitSMELd1St1(TypeFlags, Ops, Builtin->LLVMIntrinsic);
   else if (TypeFlags.isReadZA() || TypeFlags.isWriteZA())
@@ -9999,23 +9994,28 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
            BuiltinID == SME::BI__builtin_sme_svzero_za)
     return EmitSMEZero(TypeFlags, Ops, Builtin->LLVMIntrinsic);
   else if (BuiltinID == SME::BI__builtin_sme_svldr_vnum_za ||
-           BuiltinID == SME::BI__builtin_sme_svstr_vnum_za)
+           BuiltinID == SME::BI__builtin_sme_svstr_vnum_za ||
+           BuiltinID == SME::BI__builtin_sme_svldr_za ||
+           BuiltinID == SME::BI__builtin_sme_svstr_za)
     return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic);
-  else if (Builtin->LLVMIntrinsic != 0) {
-    // Predicates must match the main datatype.
-    for (unsigned i = 0, e = Ops.size(); i != e; ++i)
-      if (auto PredTy = dyn_cast<llvm::ScalableVectorType>(Ops[i]->getType()))
-        if (PredTy->getElementType()->isIntegerTy(1))
-          Ops[i] = EmitSVEPredicateCast(Ops[i], getSVEType(TypeFlags));
 
-    Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic,
-                                   getSVEOverloadTypes(TypeFlags, Ty, Ops));
-    Value *Call = Builder.CreateCall(F, Ops);
-    return Call;
-  }
+  // Should not happen!
+  if (Builtin->LLVMIntrinsic == 0)
+    return nullptr;
 
-  /// Should not happen
-  return nullptr;
+  // Predicates must match the main datatype.
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    if (auto PredTy = dyn_cast<llvm::ScalableVectorType>(Ops[i]->getType()))
+      if (PredTy->getElementType()->isIntegerTy(1))
+        Ops[i] = EmitSVEPredicateCast(Ops[i], getSVEType(TypeFlags));
+
+  Function *F =
+      TypeFlags.isOverloadNone()
+          ? CGM.getIntrinsic(Builtin->LLVMIntrinsic)
+          : CGM.getIntrinsic(Builtin->LLVMIntrinsic, {getSVEType(TypeFlags)});
+  Value *Call = Builder.CreateCall(F, Ops);
+
+  return Call;
 }
 
 Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
@@ -10062,6 +10062,24 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID));
   }
 
+  if (BuiltinID == clang::AArch64::BI__builtin_arm_get_sme_state) {
+    // Create call to __arm_sme_state and store the results to the two pointers.
+    CallInst *CI = EmitRuntimeCall(CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(StructType::get(CGM.Int64Ty, CGM.Int64Ty), {},
+                                false),
+        "__arm_sme_state"));
+    auto Attrs = AttributeList().addFnAttribute(getLLVMContext(),
+                                                "aarch64_pstate_sm_compatible");
+    CI->setAttributes(Attrs);
+    CI->setCallingConv(
+        llvm::CallingConv::
+            AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2);
+    Builder.CreateStore(Builder.CreateExtractValue(CI, 0),
+                        EmitPointerWithAlignment(E->getArg(0)));
+    return Builder.CreateStore(Builder.CreateExtractValue(CI, 1),
+                               EmitPointerWithAlignment(E->getArg(1)));
+  }
+
   if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit) {
     assert((getContext().getTypeSize(E->getType()) == 32) &&
            "rbit of unusual size!");
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 6b8af9bf18c1fffb02365f527c57f7f7bba50b16..bb36dcb42aa7be8a8e9dd9d0232fb0500cba7972 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1762,6 +1762,22 @@ static void AddAttributesFromFunctionProtoType(ASTContext &Ctx,
   if (!isUnresolvedExceptionSpec(FPT->getExceptionSpecType()) &&
       FPT->isNothrow())
     FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
+
+  unsigned SMEBits = FPT->getAArch64SMEAttributes();
+  if (SMEBits & FunctionType::SME_PStateSMEnabledMask)
+    FuncAttrs.addAttribute("aarch64_pstate_sm_enabled");
+  if (SMEBits & FunctionType::SME_PStateSMCompatibleMask)
+    FuncAttrs.addAttribute("aarch64_pstate_sm_compatible");
+
+  // ZA
+  if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Preserves)
+    FuncAttrs.addAttribute("aarch64_preserves_za");
+  if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_In)
+    FuncAttrs.addAttribute("aarch64_in_za");
+  if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_Out)
+    FuncAttrs.addAttribute("aarch64_out_za");
+  if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_InOut)
+    FuncAttrs.addAttribute("aarch64_inout_za");
 }
 
 static void AddAttributesFromAssumes(llvm::AttrBuilder &FuncAttrs,
@@ -2402,6 +2418,9 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
                                llvm::toStringRef(CodeGenOpts.UniformWGSize));
       }
     }
+
+    if (TargetDecl->hasAttr<ArmLocallyStreamingAttr>())
+      FuncAttrs.addAttribute("aarch64_pstate_sm_body");
   }
 
   // Attach "no-builtins" attributes to:
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 143e0707b9429ef46ea4b58a8e0aa7f216094e74..0553b4651fe9186b5a6e68ceaac517deff9f91e9 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4272,7 +4272,6 @@ public:
   llvm::Value *EmitSVEMaskedStore(const CallExpr *,
                                   SmallVectorImpl<llvm::Value *> &Ops,
                                   unsigned BuiltinID);
-  llvm::Value *EmitTileslice(llvm::Value *Offset, llvm::Value *Base);
   llvm::Value *EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags,
                                    SmallVectorImpl<llvm::Value *> &Ops,
                                    unsigned BuiltinID);
@@ -4287,18 +4286,23 @@ public:
                                 unsigned IntID);
   llvm::Value *EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
 
-  llvm::Value *EmitSMELd1St1(SVETypeFlags TypeFlags,
+  llvm::Value *EmitSMELd1St1(const SVETypeFlags &TypeFlags,
                              llvm::SmallVectorImpl<llvm::Value *> &Ops,
                              unsigned IntID);
-  llvm::Value *EmitSMEReadWrite(SVETypeFlags TypeFlags,
+  llvm::Value *EmitSMEReadWrite(const SVETypeFlags &TypeFlags,
                                 llvm::SmallVectorImpl<llvm::Value *> &Ops,
                                 unsigned IntID);
-  llvm::Value *EmitSMEZero(SVETypeFlags TypeFlags,
+  llvm::Value *EmitSMEZero(const SVETypeFlags &TypeFlags,
                            llvm::SmallVectorImpl<llvm::Value *> &Ops,
                            unsigned IntID);
-  llvm::Value *EmitSMELdrStr(SVETypeFlags TypeFlags,
+  llvm::Value *EmitSMELdrStr(const SVETypeFlags &TypeFlags,
                              llvm::SmallVectorImpl<llvm::Value *> &Ops,
                              unsigned IntID);
+
+  void GetAArch64SVEProcessedOperands(unsigned BuiltinID, const CallExpr *E,
+                                      SmallVectorImpl<llvm::Value *> &Ops,
+                                      SVETypeFlags TypeFlags);
+
   llvm::Value *EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
 
   llvm::Value *EmitAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index f09d1129b128a35b75eb2479de23bee88a953b3c..e353a649f9511d789c135d8586e21d9bd05b81d2 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2288,6 +2288,16 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D,
     return;
   }
 
+  // Handle SME attributes that apply to function definitions,
+  // rather than to function prototypes.
+  if (D->hasAttr<ArmLocallyStreamingAttr>())
+    B.addAttribute("aarch64_pstate_sm_body");
+
+  if (auto *Attr = D->getAttr<ArmNewAttr>()) {
+    if (Attr->isNewZA())
+      B.addAttribute("aarch64_new_za");
+  }
+
   // Track whether we need to add the optnone LLVM attribute,
   // starting with the default for this optimization level.
   bool ShouldAddOptNone =
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 561110ff8c0d8d500bc63a3e46e0defbe4a5261e..cca1aea5dc6706d2e60317e5acaca143a25a86e9 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -8,6 +8,7 @@
 
 #include "ABIInfoImpl.h"
 #include "TargetInfo.h"
+#include "clang/Basic/DiagnosticFrontend.h"
 
 using namespace clang;
 using namespace clang::CodeGen;
@@ -151,6 +152,11 @@ public:
     }
     return TargetCodeGenInfo::isScalarizableAsmOperand(CGF, Ty);
   }
+
+  void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,
+                            const FunctionDecl *Caller,
+                            const FunctionDecl *Callee,
+                            const CallArgList &Args) const override;
 };
 
 class WindowsAArch64TargetCodeGenInfo : public AArch64TargetCodeGenInfo {
@@ -811,6 +817,43 @@ Address AArch64ABIInfo::EmitMSVAArg(CodeGenFunction &CGF, Address VAListAddr,
                           /*allowHigherAlign*/ false);
 }
 
+static bool isStreaming(const FunctionDecl *F) {
+  if (F->hasAttr<ArmLocallyStreamingAttr>())
+    return true;
+  if (const auto *T = F->getType()->getAs<FunctionProtoType>())
+    return T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask;
+  return false;
+}
+
+static bool isStreamingCompatible(const FunctionDecl *F) {
+  if (const auto *T = F->getType()->getAs<FunctionProtoType>())
+    return T->getAArch64SMEAttributes() &
+           FunctionType::SME_PStateSMCompatibleMask;
+  return false;
+}
+
+void AArch64TargetCodeGenInfo::checkFunctionCallABI(
+    CodeGenModule &CGM, SourceLocation CallLoc, const FunctionDecl *Caller,
+    const FunctionDecl *Callee, const CallArgList &Args) const {
+  if (!Caller || !Callee || !Callee->hasAttr<AlwaysInlineAttr>())
+    return;
+
+  bool CallerIsStreaming = isStreaming(Caller);
+  bool CalleeIsStreaming = isStreaming(Callee);
+  bool CallerIsStreamingCompatible = isStreamingCompatible(Caller);
+  bool CalleeIsStreamingCompatible = isStreamingCompatible(Callee);
+
+  if (!CalleeIsStreamingCompatible &&
+      (CallerIsStreaming != CalleeIsStreaming || CallerIsStreamingCompatible))
+    CGM.getDiags().Report(CallLoc,
+                          diag::err_function_always_inline_attribute_mismatch)
+        << Caller->getDeclName() << Callee->getDeclName() << "streaming";
+  if (auto *NewAttr = Callee->getAttr<ArmNewAttr>())
+    if (NewAttr->isNewZA())
+      CGM.getDiags().Report(CallLoc, diag::err_function_always_inline_new_za)
+          << Callee->getDeclName();
+}
+
 std::unique_ptr<TargetCodeGenInfo>
 CodeGen::createAArch64TargetCodeGenInfo(CodeGenModule &CGM,
                                         AArch64ABIKind Kind) {
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 356009ae9157c9231af761b8a78e9b85628d06fb..0477cad066b66c3c316539ca8874332e00279851 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -362,8 +362,8 @@ if(ARM IN_LIST LLVM_TARGETS_TO_BUILD OR AArch64 IN_LIST LLVM_TARGETS_TO_BUILD)
   clang_generate_header(-gen-arm-fp16 arm_fp16.td arm_fp16.h)
   # Generate arm_sve.h
   clang_generate_header(-gen-arm-sve-header arm_sve.td arm_sve.h)
-  # Generate arm_sme_draft_spec_subject_to_change.h
-  clang_generate_header(-gen-arm-sme-header arm_sme.td arm_sme_draft_spec_subject_to_change.h)
+  # Generate arm_sme.h
+  clang_generate_header(-gen-arm-sme-header arm_sme.td arm_sme.h)
   # Generate arm_bf16.h
   clang_generate_header(-gen-arm-bf16 arm_bf16.td arm_bf16.h)
   # Generate arm_mve.h
@@ -384,7 +384,7 @@ if(ARM IN_LIST LLVM_TARGETS_TO_BUILD OR AArch64 IN_LIST LLVM_TARGETS_TO_BUILD)
   list(APPEND aarch64_only_generated_files
     "${CMAKE_CURRENT_BINARY_DIR}/arm_sve.h"
-    "${CMAKE_CURRENT_BINARY_DIR}/arm_sme_draft_spec_subject_to_change.h"
+    "${CMAKE_CURRENT_BINARY_DIR}/arm_sme.h"
     "${CMAKE_CURRENT_BINARY_DIR}/arm_bf16.h"
   )
 endif()
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index cf1e3a94de7fdddb4002157bf112daf068e98462..31a71be8a0f5971fb85f2ae11561686f7a834711 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -288,6 +288,16 @@ static bool attributeHasIdentifierArg(const IdentifierInfo &II) {
 #undef CLANG_ATTR_IDENTIFIER_ARG_LIST
 }
 
+/// Determine whether the given attribute has string literal arguments.
+static ParsedAttributeArgumentsProperties
+attributeStringLiteralListArg(const IdentifierInfo &II) {
+#define CLANG_ATTR_STRING_LITERAL_ARG_LIST
+  return llvm::StringSwitch<uint32_t>(normalizeAttrName(II.getName()))
+#include "clang/Parse/AttrParserStringSwitches.inc"
+      .Default(0);
+#undef CLANG_ATTR_STRING_LITERAL_ARG_LIST
+}
+
 /// Determine whether the given attribute has a variadic identifier argument.
 static bool attributeHasVariadicIdentifierArg(const IdentifierInfo &II) {
 #define CLANG_ATTR_VARIADIC_IDENTIFIER_ARG_LIST
@@ -371,6 +381,81 @@ void Parser::ParseAttributeWithTypeArg(IdentifierInfo &AttrName,
                ScopeName, ScopeLoc, nullptr, 0, Form);
 }
 
+ExprResult
+Parser::ParseUnevaluatedStringInAttribute(const IdentifierInfo &AttrName) {
+  if (Tok.is(tok::l_paren)) {
+    BalancedDelimiterTracker Paren(*this, tok::l_paren);
+    Paren.consumeOpen();
+    ExprResult Res = ParseUnevaluatedStringInAttribute(AttrName);
+    Paren.consumeClose();
+    return Res;
+  }
+  if (!isTokenStringLiteral()) {
+    Diag(Tok.getLocation(), diag::err_expected_string_literal)
+        << /*in attribute...*/ 4 << AttrName.getName();
+    return ExprError();
+  }
+  return ParseUnevaluatedStringLiteralExpression();
+}
+
+bool Parser::ParseAttributeArgumentList(
+    const IdentifierInfo &AttrName, SmallVectorImpl<Expr *> &Exprs,
+    ParsedAttributeArgumentsProperties ArgsProperties) {
+  bool SawError = false;
+  unsigned Arg = 0;
+  while (true) {
+    ExprResult Expr;
+    if (ArgsProperties.isStringLiteralArg(Arg)) {
+      Expr = ParseUnevaluatedStringInAttribute(AttrName);
+    } else if (getLangOpts().CPlusPlus11 && Tok.is(tok::l_brace)) {
+      Diag(Tok, diag::warn_cxx98_compat_generalized_initializer_lists);
+      Expr = ParseBraceInitializer();
+    } else {
+      Expr = ParseAssignmentExpression();
+    }
+    Expr = Actions.CorrectDelayedTyposInExpr(Expr);
+
+    if (Tok.is(tok::ellipsis))
+      Expr = Actions.ActOnPackExpansion(Expr.get(), ConsumeToken());
+    else if (Tok.is(tok::code_completion)) {
+      // There's nothing to suggest in here as we parsed a full expression.
+      // Instead fail and propagate the error since caller might have something
+      // to suggest, e.g. signature help in function call. Note that this is
+      // performed before pushing the \p Expr, so that signature help can report
+      // current argument correctly.
+      SawError = true;
+      cutOffParsing();
+      break;
+    }
+
+    if (Expr.isInvalid()) {
+      SawError = true;
+      break;
+    }
+
+    Exprs.push_back(Expr.get());
+
+    if (Tok.isNot(tok::comma))
+      break;
+    // Move to the next argument, remember where the comma was.
+    Token Comma = Tok;
+    ConsumeToken();
+    checkPotentialAngleBracketDelimiter(Comma);
+    Arg++;
+  }
+
+  if (SawError) {
+    // Ensure typos get diagnosed when errors were encountered while parsing the
+    // expression list.
+    for (auto &E : Exprs) {
+      ExprResult Expr = Actions.CorrectDelayedTyposInExpr(E);
+      if (Expr.isUsable())
+        E = Expr.get();
+    }
+  }
+  return SawError;
+}
+
 unsigned Parser::ParseAttributeArgsCommon(
     IdentifierInfo *AttrName, SourceLocation AttrNameLoc,
     ParsedAttributes &Attrs, SourceLocation *EndLoc, IdentifierInfo *ScopeName,
@@ -463,9 +548,9 @@ unsigned Parser::ParseAttributeArgsCommon(
             : Sema::ExpressionEvaluationContext::ConstantEvaluated);
 
     ExprVector ParsedExprs;
-    if (ParseExpressionList(ParsedExprs, llvm::function_ref<void()>(),
-                            /*FailImmediatelyOnInvalidExpr=*/true,
-                            /*EarlyTypoCorrection=*/true)) {
+    ParsedAttributeArgumentsProperties ArgProperties =
+        attributeStringLiteralListArg(*AttrName);
+    if (ParseAttributeArgumentList(*AttrName, ParsedExprs, ArgProperties)) {
       SkipUntil(tok::r_paren, StopAtSemi);
       return 0;
     }
@@ -6701,7 +6786,13 @@ void Parser::ParseDirectDeclarator(Declarator &D) {
     } else if (Tok.isRegularKeywordAttribute()) {
       // For consistency with attribute parsing.
       Diag(Tok, diag::err_keyword_not_allowed) << Tok.getIdentifierInfo();
+      bool TakesArgs = doesKeywordAttributeTakeArgs(Tok.getKind());
       ConsumeToken();
+      if (TakesArgs) {
+        BalancedDelimiterTracker T(*this, tok::l_paren);
+        if (!T.consumeOpen())
+          T.skipToEnd();
+      }
     } else if (Tok.is(tok::kw_requires) && D.hasGroupingParens()) {
       // This declarator is declaring a function, but the requires clause is
      // in the wrong place:
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index d9ff6c42c502cee75c958c1fa68627242d870f65..d5c09e943f6a3d51c7e0f49d84884898ed346ede 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -1887,7 +1887,13 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
       if (!SkipUntil(tok::r_paren, StopAtSemi))
         break;
     } else if (Tok.isRegularKeywordAttribute()) {
+      bool TakesArgs = doesKeywordAttributeTakeArgs(Tok.getKind());
       ConsumeToken();
+      if (TakesArgs) {
+        BalancedDelimiterTracker T(*this, tok::l_paren);
+        if (!T.consumeOpen())
+          T.skipToEnd();
+      }
     } else {
       break;
     }
@@ -4510,8 +4516,18 @@ void Parser::ParseCXX11AttributeSpecifierInternal(ParsedAttributes &Attrs,
   if (Tok.isRegularKeywordAttribute()) {
     SourceLocation Loc = Tok.getLocation();
     IdentifierInfo *AttrName = Tok.getIdentifierInfo();
-    Attrs.addNew(AttrName, Loc, nullptr, Loc, nullptr, 0, Tok.getKind());
+    ParsedAttr::Form Form = ParsedAttr::Form(Tok.getKind());
+    bool TakesArgs = doesKeywordAttributeTakeArgs(Tok.getKind());
     ConsumeToken();
+    if (TakesArgs) {
+      if (!Tok.is(tok::l_paren))
+        Diag(Tok.getLocation(), diag::err_expected_lparen_after) << AttrName;
+      else
+        ParseAttributeArgsCommon(AttrName, Loc, Attrs, EndLoc,
+                                 /*ScopeName*/ nullptr,
+                                 /*ScopeLoc*/ Loc, Form);
+    } else
+      Attrs.addNew(AttrName, Loc, nullptr, Loc, nullptr, 0, Form);
     return;
   }
 
@@ -4677,11 +4693,13 @@ SourceLocation Parser::SkipCXX11Attributes() {
     T.consumeOpen();
     T.skipToEnd();
     EndLoc = T.getCloseLocation();
-  } else if (Tok.isRegularKeywordAttribute()) {
+  } else if (Tok.isRegularKeywordAttribute() &&
+             !doesKeywordAttributeTakeArgs(Tok.getKind())) {
     EndLoc = Tok.getLocation();
     ConsumeToken();
   } else {
-    assert(Tok.is(tok::kw_alignas) && "not an attribute specifier");
+    assert((Tok.is(tok::kw_alignas) || Tok.isRegularKeywordAttribute()) &&
+           "not an attribute specifier");
     ConsumeToken();
     BalancedDelimiterTracker T(*this, tok::l_paren);
     if (!T.consumeOpen())
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index 75d04824d8b99eef0d280f746ac5b5b0bdb8c267..3438ab3ed9aa8d9b937194a2af7f05dc897aa96e 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -3504,7 +3504,7 @@ bool Parser::ParseExpressionList(SmallVectorImpl<Expr *> &Exprs,
       Expr = Actions.ActOnPackExpansion(Expr.get(), ConsumeToken());
     else if (Tok.is(tok::code_completion)) {
       // There's nothing to suggest in here as we parsed a full expression.
-      // Instead fail and propogate the error since caller might have something
+      // Instead fail and propagate the error since caller might have something
       // the suggest, e.g. signature help in function call. Note that this is
       // performed before pushing the \p Expr, so that signature help can report
       // current argument correctly.
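Before the ParseTentative.cpp hunk, a quick illustration of what the new string-literal argument path means for users. These examples are mine, not from the patch; they assume the P2361 rules the code above implements, under which attribute string arguments are parsed as unevaluated string literals:

    [[deprecated("use new_api() instead")]] void old_api(); // OK, as before

    [[deprecated(L"wide")]] void f();    // now rejected: encoding prefixes are
                                         // not allowed in unevaluated strings
    [[nodiscard("\x68\x69")]] int g();   // now rejected: numeric escape
                                         // sequences are not allowed either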
diff --git a/clang/lib/Parse/ParseTentative.cpp b/clang/lib/Parse/ParseTentative.cpp
index 66433705250010cd8f08d0a2ab13505b6805052f..449be3425411d788820e040cd3f717d76d13736c 100644
--- a/clang/lib/Parse/ParseTentative.cpp
+++ b/clang/lib/Parse/ParseTentative.cpp
@@ -883,7 +883,8 @@ bool Parser::TrySkipAttributes() {
       // Note that explicitly checking for `[[` and `]]` allows to fail as
       // expected in the case of the Objective-C message send syntax.
       ConsumeBracket();
-    } else if (Tok.isRegularKeywordAttribute()) {
+    } else if (Tok.isRegularKeywordAttribute() &&
+               !doesKeywordAttributeTakeArgs(Tok.getKind())) {
       ConsumeToken();
     } else {
       ConsumeToken();
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 46ae6fba8344bb7519a82c20542e9c44139ab078..03030a02ee792d3c118d16e1d5300b1bc6666763 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -2057,8 +2057,8 @@ void Sema::checkTypeSupport(QualType Ty, SourceLocation Loc, ValueDecl *D) {
     if (Ty->isSVESizelessBuiltinType() && FD && FD->hasBody()) {
       llvm::StringMap<bool> CallerFeatureMap;
       Context.getFunctionFeatureMap(CallerFeatureMap, FD);
-      if (!Builtin::evaluateRequiredTargetFeatures(
-          "sve", CallerFeatureMap))
+      if (!Builtin::evaluateRequiredTargetFeatures("sve", CallerFeatureMap) &&
+          !Builtin::evaluateRequiredTargetFeatures("sme", CallerFeatureMap))
         Diag(D->getLocation(), diag::err_sve_vector_in_non_sve_target) << Ty;
     }
   };
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 5ee20554c4cf32fb529250ab22ee89d0da2dc179..96356f1bdca26ae9deffbfd4b61052fc99494e30 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2893,28 +2893,26 @@ static QualType getNeonEltType(NeonTypeFlags Flags, ASTContext &Context,
   llvm_unreachable("Invalid NeonTypeFlag!");
 }
 
-bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
-  // Range check SVE intrinsics that take immediate values.
-  SmallVector<std::tuple<int, int, int>, 3> ImmChecks;
+enum ArmStreamingType { ArmNonStreaming, ArmStreaming, ArmStreamingCompatible };
 
-  switch (BuiltinID) {
-  default:
-    return false;
-#define GET_SVE_IMMEDIATE_CHECK
-#include "clang/Basic/arm_sve_sema_rangechecks.inc"
-#undef GET_SVE_IMMEDIATE_CHECK
-#define GET_SME_IMMEDIATE_CHECK
-#include "clang/Basic/arm_sme_sema_rangechecks.inc"
-#undef GET_SME_IMMEDIATE_CHECK
-  }
+enum ArmSMEState : unsigned {
+  ArmNoState = 0,
+
+  ArmInZA = 0b01,
+  ArmOutZA = 0b10,
+  ArmInOutZA = 0b11,
+  ArmZAMask = 0b11,
+};
 
+bool Sema::ParseSVEImmChecks(
+    CallExpr *TheCall, SmallVector<std::tuple<int, int, int>, 3> &ImmChecks) {
   // Perform all the immediate checks for this builtin call.
   bool HasError = false;
   for (auto &I : ImmChecks) {
     int ArgNum, CheckTy, ElementSizeInBits;
     std::tie(ArgNum, CheckTy, ElementSizeInBits) = I;
 
-    typedef bool(*OptionSetCheckFnTy)(int64_t Value);
+    typedef bool (*OptionSetCheckFnTy)(int64_t Value);
 
     // Function that checks whether the operand (ArgNum) is an immediate
     // that is one of the predefined values.
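As context for the range-check refactor just above: ParseSVEImmChecks keeps the per-argument immediate validation in one place so that CheckSVEBuiltinFunctionCall and the new CheckSMEBuiltinFunctionCall (next hunk) can share it. A hedged example of the kind of constraint it enforces, using the SVE2 svcdot intrinsic, whose rotation immediate must be 0, 90, 180 or 270 (assumes a target built with +sve2):

    #include <arm_sve.h>

    svint32_t ok(svint32_t acc, svint8_t a, svint8_t b) {
      return svcdot_s32(acc, a, b, 90);  // valid rotation immediate
    }

    svint32_t bad(svint32_t acc, svint8_t a, svint8_t b) {
      return svcdot_s32(acc, a, b, 45);  // rejected by the immediate check
    }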
@@ -3029,8 +3027,135 @@ bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { return HasError; } +static ArmStreamingType getArmStreamingFnType(const FunctionDecl *FD) { + if (FD->hasAttr()) + return ArmStreaming; + if (const auto *T = FD->getType()->getAs()) { + if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask) + return ArmStreaming; + if (T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMCompatibleMask) + return ArmStreamingCompatible; + } + return ArmNonStreaming; +} + +static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall, + const FunctionDecl *FD, + ArmStreamingType BuiltinType) { + ArmStreamingType FnType = getArmStreamingFnType(FD); + if (FnType == ArmStreaming && BuiltinType == ArmNonStreaming) { + S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin) + << TheCall->getSourceRange() << "streaming"; + } + + if (FnType == ArmStreamingCompatible && + BuiltinType != ArmStreamingCompatible) { + S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin) + << TheCall->getSourceRange() << "streaming compatible"; + return; + } + + if (FnType == ArmNonStreaming && BuiltinType == ArmStreaming) { + S.Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_sm_incompat_builtin) + << TheCall->getSourceRange() << "non-streaming"; + } +} + +static bool hasArmZAState(const FunctionDecl *FD) { + const auto *T = FD->getType()->getAs(); + return (T && FunctionType::getArmZAState(T->getAArch64SMEAttributes()) != + FunctionType::ARM_None) || + (FD->hasAttr() && FD->getAttr()->isNewZA()); +} + +static ArmSMEState getSMEState(unsigned BuiltinID) { + switch (BuiltinID) { + default: + return ArmNoState; +#define GET_SME_BUILTIN_GET_STATE +#include "clang/Basic/arm_sme_builtins_za_state.inc" +#undef GET_SME_BUILTIN_GET_STATE + } +} + +bool Sema::CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { + if (const FunctionDecl *FD = getCurFunctionDecl()) { + std::optional BuiltinType; + + switch (BuiltinID) { +#define GET_SME_STREAMING_ATTRS +#include "clang/Basic/arm_sme_streaming_attrs.inc" +#undef GET_SME_STREAMING_ATTRS + } + + if (BuiltinType) + checkArmStreamingBuiltin(*this, TheCall, FD, *BuiltinType); + + if ((getSMEState(BuiltinID) & ArmZAMask) && !hasArmZAState(FD)) + Diag(TheCall->getBeginLoc(), + diag::warn_attribute_arm_za_builtin_no_za_state) + << TheCall->getSourceRange(); + } + + // Range check SME intrinsics that take immediate values. + SmallVector, 3> ImmChecks; + + switch (BuiltinID) { + default: + return false; +#define GET_SME_IMMEDIATE_CHECK +#include "clang/Basic/arm_sme_sema_rangechecks.inc" +#undef GET_SME_IMMEDIATE_CHECK + } + + return ParseSVEImmChecks(TheCall, ImmChecks); +} + +bool Sema::CheckSVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { + if (const FunctionDecl *FD = getCurFunctionDecl()) { + std::optional BuiltinType; + + switch (BuiltinID) { +#define GET_SVE_STREAMING_ATTRS +#include "clang/Basic/arm_sve_streaming_attrs.inc" +#undef GET_SVE_STREAMING_ATTRS + } + if (BuiltinType) + checkArmStreamingBuiltin(*this, TheCall, FD, *BuiltinType); + } + // Range check SVE intrinsics that take immediate values. 
+  SmallVector<std::tuple<int, int, int>, 3> ImmChecks;
+
+  switch (BuiltinID) {
+  default:
+    return false;
+#define GET_SVE_IMMEDIATE_CHECK
+#include "clang/Basic/arm_sve_sema_rangechecks.inc"
+#undef GET_SVE_IMMEDIATE_CHECK
+  }
+
+  return ParseSVEImmChecks(TheCall, ImmChecks);
+}
+
 bool Sema::CheckNeonBuiltinFunctionCall(const TargetInfo &TI,
                                         unsigned BuiltinID, CallExpr *TheCall) {
+  if (const FunctionDecl *FD = getCurFunctionDecl()) {
+
+    switch (BuiltinID) {
+    default:
+      break;
+#define GET_NEON_BUILTINS
+#define TARGET_BUILTIN(id, ...) case NEON::BI##id:
+#define BUILTIN(id, ...) case NEON::BI##id:
+#include "clang/Basic/arm_neon.inc"
+      checkArmStreamingBuiltin(*this, TheCall, FD, ArmNonStreaming);
+      break;
+#undef TARGET_BUILTIN
+#undef BUILTIN
+#undef GET_NEON_BUILTINS
+    }
+  }
+
   llvm::APSInt Result;
   uint64_t mask = 0;
   unsigned TV = 0;
@@ -3393,6 +3518,9 @@ bool Sema::CheckAArch64BuiltinFunctionCall(const TargetInfo &TI,
     if (CheckSVEBuiltinFunctionCall(BuiltinID, TheCall))
       return true;
 
+    if (CheckSMEBuiltinFunctionCall(BuiltinID, TheCall))
+      return true;
+
   // For intrinsics which take an immediate value as part of the instruction,
   // range check them here.
   unsigned i = 0, l = 0, u = 0;
@@ -7053,8 +7181,8 @@ void Sema::CheckArgAlignment(SourceLocation Loc, NamedDecl *FDecl,
 }
 
 /// Handles the checks for format strings, non-POD arguments to vararg
-/// functions, NULL arguments passed to non-NULL parameters, and diagnose_if
-/// attributes.
+/// functions, NULL arguments passed to non-NULL parameters, diagnose_if
+/// attributes and AArch64 SME attributes.
 void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
                      const Expr *ThisArg, ArrayRef<const Expr *> Args,
                      bool IsMemberFunction, SourceLocation Loc,
@@ -7135,6 +7263,41 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
             ArgTy, ParamTy);
       }
     }
+
+  // If the callee has an AArch64 SME attribute to indicate that it is an
+  // __arm_streaming function, then the caller requires SME to be available.
+  FunctionProtoType::ExtProtoInfo ExtInfo = Proto->getExtProtoInfo();
+  if (ExtInfo.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask) {
+    if (auto *CallerFD = dyn_cast<FunctionDecl>(CurContext)) {
+      llvm::StringMap<bool> CallerFeatureMap;
+      Context.getFunctionFeatureMap(CallerFeatureMap, CallerFD);
+      if (!CallerFeatureMap.contains("sme"))
+        Diag(Loc, diag::err_sme_call_in_non_sme_target);
+    } else if (!Context.getTargetInfo().hasFeature("sme")) {
+      Diag(Loc, diag::err_sme_call_in_non_sme_target);
+    }
+  }
+
+  // If the callee uses AArch64 SME ZA state but the caller doesn't define
+  // any, then this is an error.
+  FunctionType::ArmStateValue ArmZAState =
+      FunctionType::getArmZAState(ExtInfo.AArch64SMEAttributes);
+  if (ArmZAState != FunctionType::ARM_None) {
+    bool CallerHasZAState = false;
+    if (const auto *CallerFD = dyn_cast<FunctionDecl>(CurContext)) {
+      auto *Attr = CallerFD->getAttr<ArmNewAttr>();
+      if (Attr && Attr->isNewZA())
+        CallerHasZAState = true;
+      else if (const auto *FPT =
+                   CallerFD->getType()->getAs<FunctionProtoType>())
+        CallerHasZAState = FunctionType::getArmZAState(
+                               FPT->getExtProtoInfo().AArch64SMEAttributes) !=
+                           FunctionType::ARM_None;
+    }
+
+    if (!CallerHasZAState)
+      Diag(Loc, diag::err_sme_za_call_no_za_state);
+  }
 }
 
 if (FDecl && FDecl->hasAttr()) {
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 5481bbd22c66b687bfdbcf009f4151eba9232b60..f7503e1e8e02a482030274b58bf5706afc9c122a 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -3758,6 +3758,15 @@ bool Sema::MergeFunctionDecl(FunctionDecl *New, NamedDecl *&OldD, Scope *S,
     }
   }
 
+  // It is not permitted to redeclare an SME function with different SME
+  // attributes.
+  if (IsInvalidSMECallConversion(Old->getType(), New->getType())) {
+    Diag(New->getLocation(), diag::err_sme_attr_mismatch)
+        << New->getType() << Old->getType();
+    Diag(OldLocation, diag::note_previous_declaration);
+    return true;
+  }
+
   // If a function is first declared with a calling convention, but is later
   // declared or defined without one, all following decls assume the calling
   // convention of the first.
@@ -12081,6 +12090,35 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD,
     if (!Redeclaration && LangOpts.CUDA)
       checkCUDATargetOverload(NewFD, Previous);
   }
+
+  // Check if the function definition uses any AArch64 SME features without
+  // having the '+sme' feature enabled.
+  if (DeclIsDefn) {
+    const auto *Attr = NewFD->getAttr<ArmNewAttr>();
+    bool UsesSM = NewFD->hasAttr<ArmLocallyStreamingAttr>();
+    bool UsesZA = Attr && Attr->isNewZA();
+    if (const auto *FPT = NewFD->getType()->getAs<FunctionProtoType>()) {
+      FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo();
+      UsesSM |=
+          EPI.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask;
+      UsesZA |= FunctionType::getArmZAState(EPI.AArch64SMEAttributes) !=
+                FunctionType::ARM_None;
+    }
+
+    if (UsesSM || UsesZA) {
+      llvm::StringMap<bool> FeatureMap;
+      Context.getFunctionFeatureMap(FeatureMap, NewFD);
+      if (!FeatureMap.contains("sme")) {
+        if (UsesSM)
+          Diag(NewFD->getLocation(),
+               diag::err_sme_definition_using_sm_in_non_sme_target);
+        else
+          Diag(NewFD->getLocation(),
+               diag::err_sme_definition_using_za_in_non_sme_target);
+      }
+    }
+  }
+
   return Redeclaration;
 }
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index ed69e802c95dd51671c74fe417e5c8eb58431a3e..87c89299ace64dc85e44190c183895ef89453bd3 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -349,7 +349,7 @@ bool Sema::checkStringLiteralArgumentAttr(const AttributeCommonInfo &CI,
   if (ArgLocation)
     *ArgLocation = E->getBeginLoc();
 
-  if (!Literal || !Literal->isOrdinary()) {
+  if (!Literal || (!Literal->isUnevaluated() && !Literal->isOrdinary())) {
     Diag(E->getBeginLoc(), diag::err_attribute_argument_type)
         << CI << AANT_ArgumentString;
     return false;
@@ -381,6 +381,16 @@ bool Sema::checkStringLiteralArgumentAttr(const ParsedAttr &AL, unsigned ArgNum,
   // Now check for an actual string literal.
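The CheckFunctionDeclaration hunk above fires only for definitions (DeclIsDefn): declaring a streaming or ZA-using function is always permitted, but defining one requires the 'sme' target feature. A minimal sketch of the intent, not part of the patch (exact diagnostic wording may differ):

    // Compiled WITHOUT -target-feature +sme:
    void decl_only(void) __arm_streaming;  // OK: declaration emits no code
    void def_sm(void) __arm_streaming {}   // error: definition requires 'sme'
    __arm_new("za") void def_za(void) {}   // error: definition requires 'sme'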
   Expr *ArgExpr = AL.getArgAsExpr(ArgNum);
+  const auto *Literal = dyn_cast<StringLiteral>(ArgExpr->IgnoreParenCasts());
+  if (ArgLocation)
+    *ArgLocation = ArgExpr->getBeginLoc();
+
+  if (!Literal || (!Literal->isUnevaluated() && !Literal->isOrdinary())) {
+    Diag(ArgExpr->getBeginLoc(), diag::err_attribute_argument_type)
+        << AL << AANT_ArgumentString;
+    return false;
+  }
+  Str = Literal->getString();
   return checkStringLiteralArgumentAttr(AL, ArgExpr, Str, ArgLocation);
 }
@@ -5349,9 +5359,6 @@ bool Sema::CheckCallingConvAttr(const ParsedAttr &Attrs, CallingConv &CC,
   case ParsedAttr::AT_AArch64SVEPcs:
     CC = CC_AArch64SVEPCS;
     break;
-  case ParsedAttr::AT_ArmStreaming:
-    CC = CC_C; // FIXME: placeholder until real SME support is added.
-    break;
   case ParsedAttr::AT_AMDGPUKernelCall:
     CC = CC_AMDGPUKernelCall;
     break;
@@ -8707,6 +8714,76 @@ static bool MustDelayAttributeArguments(const ParsedAttr &AL) {
   return false;
 }
 
+static bool checkArmNewAttrMutualExclusion(
+    Sema &S, const ParsedAttr &AL, const FunctionProtoType *FPT,
+    FunctionType::ArmStateValue CurrentState, StringRef StateName) {
+  auto CheckForIncompatibleAttr =
+      [&](FunctionType::ArmStateValue IncompatibleState,
+          StringRef IncompatibleStateName) {
+        if (CurrentState == IncompatibleState) {
+          S.Diag(AL.getLoc(), diag::err_attributes_are_not_compatible)
+              << (std::string("'__arm_new(\"") + StateName.str() + "\")'")
+              << (std::string("'") + IncompatibleStateName.str() + "(\"" +
+                  StateName.str() + "\")'")
+              << true;
+          AL.setInvalid();
+        }
+      };
+
+  CheckForIncompatibleAttr(FunctionType::ARM_In, "__arm_in");
+  CheckForIncompatibleAttr(FunctionType::ARM_Out, "__arm_out");
+  CheckForIncompatibleAttr(FunctionType::ARM_InOut, "__arm_inout");
+  CheckForIncompatibleAttr(FunctionType::ARM_Preserves, "__arm_preserves");
+  return AL.isInvalid();
+}
+
+static void handleArmNewAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+  if (!AL.getNumArgs()) {
+    S.Diag(AL.getLoc(), diag::err_missing_arm_state) << AL;
+    AL.setInvalid();
+    return;
+  }
+
+  std::vector<StringRef> NewState;
+  if (const auto *ExistingAttr = D->getAttr<ArmNewAttr>()) {
+    for (StringRef S : ExistingAttr->newArgs())
+      NewState.push_back(S);
+  }
+
+  bool HasZA = false;
+  for (unsigned I = 0, E = AL.getNumArgs(); I != E; ++I) {
+    StringRef StateName;
+    SourceLocation LiteralLoc;
+    if (!S.checkStringLiteralArgumentAttr(AL, I, StateName, &LiteralLoc))
+      return;
+
+    if (StateName == "za")
+      HasZA = true;
+    else {
+      S.Diag(LiteralLoc, diag::err_unknown_arm_state) << StateName;
+      AL.setInvalid();
+      return;
+    }
+
+    if (std::find(NewState.begin(), NewState.end(), StateName) ==
+        NewState.end()) { // Avoid adding duplicates.
+      NewState.push_back(StateName);
+    }
+  }
+
+  if (auto *FPT = dyn_cast<FunctionProtoType>(D->getFunctionType())) {
+    FunctionType::ArmStateValue ZAState =
+        FunctionType::getArmZAState(FPT->getAArch64SMEAttributes());
+    if (HasZA && ZAState != FunctionType::ARM_None &&
+        checkArmNewAttrMutualExclusion(S, AL, FPT, ZAState, "za"))
+      return;
+  }
+
+  D->dropAttr<ArmNewAttr>();
+  D->addAttr(::new (S.Context)
+                 ArmNewAttr(S.Context, AL, NewState.data(), NewState.size()));
+}
+
 /// ProcessDeclAttribute - Apply the specific attribute to the specified decl if
 /// the attribute applies to decls.  If the attribute is a type attribute, just
 /// silently ignore it if a GNU attribute.
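handleArmNewAttr above accepts only the state string "za", de-duplicates repeated states, and defers to checkArmNewAttrMutualExclusion to reject combinations with shared-ZA type attributes. Illustrative only, not part of the patch:

    __arm_new("za") void f(void);                   // OK: fresh ZA state
    __arm_new("za") void g(void) __arm_inout("za"); // error: '__arm_new("za")'
                                                    // vs '__arm_inout("za")'
    __arm_new("tile") void h(void);                 // error: unknown state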
@@ -9462,6 +9539,14 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
     handleArmBuiltinAliasAttr(S, D, AL);
     break;
 
+  case ParsedAttr::AT_ArmLocallyStreaming:
+    handleSimpleAttribute<ArmLocallyStreamingAttr>(S, D, AL);
+    break;
+
+  case ParsedAttr::AT_ArmNew:
+    handleArmNewAttr(S, D, AL);
+    break;
+
   case ParsedAttr::AT_AcquireHandle:
     handleAcquireHandleAttr(S, D, AL);
     break;
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index b62f3c475c450c996d977f396f5640f3098f156d..a978ee0da6a0d1e2b3267944390f0b474b62e298 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -17985,6 +17985,14 @@ bool Sema::CheckOverridingFunctionAttributes(const CXXMethodDecl *New,
     }
   }
 
+  // SME attributes must match when overriding a function declaration.
+  if (IsInvalidSMECallConversion(Old->getType(), New->getType())) {
+    Diag(New->getLocation(), diag::err_conflicting_overriding_attributes)
+        << New << New->getType() << Old->getType();
+    Diag(Old->getLocation(), diag::note_overridden_virtual_function);
+    return true;
+  }
+
   // Virtual overrides must have the same code_seg.
   const auto *OldCSA = Old->getAttr<CodeSegAttr>();
   const auto *NewCSA = New->getAttr<CodeSegAttr>();
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 3a5e302cc03a319116a623e3b2d88148655cfa79..8804076f4d11c4d22c58478662bd9aaa6d6f1528 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -9679,6 +9679,21 @@ ExprResult Sema::ActOnConditionalOp(SourceLocation QuestionLoc,
                                         ColonLoc, result, VK, OK);
 }
 
+// Check that the SME attributes for PSTATE.ZA and PSTATE.SM are compatible.
+bool Sema::IsInvalidSMECallConversion(QualType FromType, QualType ToType) {
+  unsigned FromAttributes = 0, ToAttributes = 0;
+  if (const auto *FromFn =
+          dyn_cast<FunctionProtoType>(Context.getCanonicalType(FromType)))
+    FromAttributes =
+        FromFn->getAArch64SMEAttributes() & FunctionType::SME_AttributeMask;
+  if (const auto *ToFn =
+          dyn_cast<FunctionProtoType>(Context.getCanonicalType(ToType)))
+    ToAttributes =
+        ToFn->getAArch64SMEAttributes() & FunctionType::SME_AttributeMask;
+
+  return FromAttributes != ToAttributes;
+}
+
 // Check if we have a conversion between incompatible cmse function pointer
 // types, that is, a conversion between a function pointer with the
 // cmse_nonsecure_call attribute and one without.
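Because IsInvalidSMECallConversion compares the masked AArch64SMEAttributes of the two canonical function types, any mismatch in either direction invalidates the conversion; checkPointerTypesForAssignment (next hunk) maps this to IncompatibleFunctionPointer. A sketch, not part of the patch:

    void callee(void) __arm_streaming;
    void (*ok)(void) __arm_streaming = callee; // OK: attributes match
    void (*bad)(void) = callee;                // incompatible function
                                               // pointer types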
@@ -9845,6 +9860,8 @@ checkPointerTypesForAssignment(Sema &S, QualType LHSType, QualType RHSType,
       return Sema::IncompatibleFunctionPointer;
     if (IsInvalidCmseNSCallConversion(S, ltrans, rtrans))
       return Sema::IncompatibleFunctionPointer;
+    if (S.IsInvalidSMECallConversion(rtrans, ltrans))
+      return Sema::IncompatibleFunctionPointer;
     return ConvTy;
   }
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 0aa691d24171f37fe0ddaa1e16921fc3e1ac8d56..61e2ada139c72e908f4814ca9ce519a4bbcab336 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -128,7 +128,6 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr,
   case ParsedAttr::AT_VectorCall:                                            \
   case ParsedAttr::AT_AArch64VectorPcs:                                      \
   case ParsedAttr::AT_AArch64SVEPcs:                                         \
-  case ParsedAttr::AT_ArmStreaming:                                          \
   case ParsedAttr::AT_AMDGPUKernelCall:                                      \
   case ParsedAttr::AT_MSABI:                                                 \
   case ParsedAttr::AT_SysVABI:                                               \
@@ -143,6 +142,12 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr,
   case ParsedAttr::AT_NoReturn:                                              \
   case ParsedAttr::AT_Regparm:                                               \
   case ParsedAttr::AT_CmseNSCall:                                            \
+  case ParsedAttr::AT_ArmStreaming:                                          \
+  case ParsedAttr::AT_ArmStreamingCompatible:                                \
+  case ParsedAttr::AT_ArmPreserves:                                          \
+  case ParsedAttr::AT_ArmIn:                                                 \
+  case ParsedAttr::AT_ArmOut:                                                \
+  case ParsedAttr::AT_ArmInOut:                                              \
   case ParsedAttr::AT_AnyX86NoCallerSavedRegisters:                          \
   case ParsedAttr::AT_AnyX86NoCfCheck:                                       \
     CALLING_CONV_ATTRS_CASELIST
@@ -7772,6 +7777,69 @@ static Attr *getCCTypeAttr(ASTContext &Ctx, ParsedAttr &Attr) {
   llvm_unreachable("unexpected attribute kind!");
 }
 
+static bool checkMutualExclusion(TypeProcessingState &state,
+                                 const FunctionProtoType::ExtProtoInfo &EPI,
+                                 ParsedAttr &Attr,
+                                 AttributeCommonInfo::Kind OtherKind) {
+  auto OtherAttr = std::find_if(
+      state.getCurrentAttributes().begin(), state.getCurrentAttributes().end(),
+      [OtherKind](const ParsedAttr &A) { return A.getKind() == OtherKind; });
+  if (OtherAttr == state.getCurrentAttributes().end() || OtherAttr->isInvalid())
+    return false;
+
+  Sema &S = state.getSema();
+  S.Diag(Attr.getLoc(), diag::err_attributes_are_not_compatible)
+      << *OtherAttr << Attr
+      << (OtherAttr->isRegularKeywordAttribute() ||
+          Attr.isRegularKeywordAttribute());
+  S.Diag(OtherAttr->getLoc(), diag::note_conflicting_attribute);
+  Attr.setInvalid();
+  return true;
+}
+
+static bool handleArmStateAttribute(Sema &S,
+                                    FunctionProtoType::ExtProtoInfo &EPI,
+                                    ParsedAttr &Attr,
+                                    FunctionType::ArmStateValue State) {
+  if (!Attr.getNumArgs()) {
+    S.Diag(Attr.getLoc(), diag::err_missing_arm_state) << Attr;
+    Attr.setInvalid();
+    return true;
+  }
+
+  for (unsigned I = 0; I < Attr.getNumArgs(); ++I) {
+    StringRef StateName;
+    SourceLocation LiteralLoc;
+    if (!S.checkStringLiteralArgumentAttr(Attr, I, StateName, &LiteralLoc))
+      return true;
+
+    unsigned Shift;
+    FunctionType::ArmStateValue ExistingState;
+    if (StateName == "za") {
+      Shift = FunctionType::SME_ZAShift;
+      ExistingState = FunctionType::getArmZAState(EPI.AArch64SMEAttributes);
+    } else {
+      S.Diag(LiteralLoc, diag::err_unknown_arm_state) << StateName;
+      Attr.setInvalid();
+      return true;
+    }
+
+    // __arm_in(S), __arm_out(S), __arm_inout(S) and __arm_preserves(S)
+    // are all mutually exclusive for the same S, so check if there are
+    // conflicting attributes.
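The comment above states the rule that the code which follows enforces: for a given state string, only one of __arm_in/__arm_out/__arm_inout/__arm_preserves may apply. Illustrative only, not part of the patch:

    void f(void) __arm_in("za") __arm_out("za"); // error: conflicting
                                                 // attributes for state 'za'
    void g(void) __arm_in("za") __arm_in("za");  // accepted: same state,
                                                 // same attribute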
+    if (ExistingState != FunctionType::ARM_None && ExistingState != State) {
+      S.Diag(LiteralLoc, diag::err_conflicting_attributes_arm_state)
+          << StateName;
+      Attr.setInvalid();
+      return true;
+    }
+
+    EPI.setArmSMEAttribute(
+        (FunctionType::AArch64SMETypeAttributes)((State << Shift)));
+  }
+  return false;
+}
+
 /// Process an individual function attribute.  Returns true to
 /// indicate that the attribute was handled, false if it wasn't.
 static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr,
@@ -7901,6 +7969,72 @@ static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr,
     return true;
   }
 
+  if (attr.getKind() == ParsedAttr::AT_ArmStreaming ||
+      attr.getKind() == ParsedAttr::AT_ArmStreamingCompatible ||
+      attr.getKind() == ParsedAttr::AT_ArmPreserves ||
+      attr.getKind() == ParsedAttr::AT_ArmIn ||
+      attr.getKind() == ParsedAttr::AT_ArmOut ||
+      attr.getKind() == ParsedAttr::AT_ArmInOut) {
+    if (S.CheckAttrTarget(attr))
+      return true;
+
+    if (attr.getKind() == ParsedAttr::AT_ArmStreaming ||
+        attr.getKind() == ParsedAttr::AT_ArmStreamingCompatible)
+      if (S.CheckAttrNoArgs(attr))
+        return true;
+
+    if (!unwrapped.isFunctionType())
+      return false;
+
+    const auto *FnTy = unwrapped.get()->getAs<FunctionProtoType>();
+    if (!FnTy) {
+      // SME ACLE attributes are not supported on K&R-style unprototyped C
+      // functions.
+      S.Diag(attr.getLoc(), diag::warn_attribute_wrong_decl_type)
+          << attr << attr.isRegularKeywordAttribute()
+          << ExpectedFunctionWithProtoType;
+      attr.setInvalid();
+      return false;
+    }
+
+    FunctionProtoType::ExtProtoInfo EPI = FnTy->getExtProtoInfo();
+    switch (attr.getKind()) {
+    case ParsedAttr::AT_ArmStreaming:
+      if (checkMutualExclusion(state, EPI, attr,
+                               ParsedAttr::AT_ArmStreamingCompatible))
+        return true;
+      EPI.setArmSMEAttribute(FunctionType::SME_PStateSMEnabledMask);
+      break;
+    case ParsedAttr::AT_ArmStreamingCompatible:
+      if (checkMutualExclusion(state, EPI, attr, ParsedAttr::AT_ArmStreaming))
+        return true;
+      EPI.setArmSMEAttribute(FunctionType::SME_PStateSMCompatibleMask);
+      break;
+    case ParsedAttr::AT_ArmPreserves:
+      if (handleArmStateAttribute(S, EPI, attr, FunctionType::ARM_Preserves))
+        return true;
+      break;
+    case ParsedAttr::AT_ArmIn:
+      if (handleArmStateAttribute(S, EPI, attr, FunctionType::ARM_In))
+        return true;
+      break;
+    case ParsedAttr::AT_ArmOut:
+      if (handleArmStateAttribute(S, EPI, attr, FunctionType::ARM_Out))
+        return true;
+      break;
+    case ParsedAttr::AT_ArmInOut:
+      if (handleArmStateAttribute(S, EPI, attr, FunctionType::ARM_InOut))
+        return true;
+      break;
+    default:
+      llvm_unreachable("Unsupported attribute");
+    }
+
+    QualType newtype = S.Context.getFunctionType(FnTy->getReturnType(),
+                                                 FnTy->getParamTypes(), EPI);
+    type = unwrapped.wrap(S, newtype->getAs<FunctionType>());
+    return true;
+  }
+
   if (attr.getKind() == ParsedAttr::AT_NoThrow) {
     // Delay if this is not a function type.
if (!unwrapped.isFunctionType()) diff --git a/clang/test/AST/ast-dump-sme-attributes.cpp b/clang/test/AST/ast-dump-sme-attributes.cpp new file mode 100644 index 0000000000000000000000000000000000000000..133648d90a15764a2c85511e3757014fb94eee92 --- /dev/null +++ b/clang/test/AST/ast-dump-sme-attributes.cpp @@ -0,0 +1,66 @@ +// Test without serialization: +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -std=c++2a -ast-dump -ast-dump-filter Foo %s | FileCheck -strict-whitespace %s + +// Test with serialization: +// RUN: %clang_cc1 -std=c++20 -triple aarch64 -target-feature +sme -emit-pch -o %t %s +// RUN: %clang_cc1 -x c++ -std=c++20 -triple aarch64 -target-feature +sme -include-pch %t -ast-dump-all -ast-dump-filter Foo /dev/null \ +// RUN: | sed -e "s/ //" -e "s/ imported//" \ +// RUN: | FileCheck --strict-whitespace %s + +struct Foo { +// CHECK: |-CXXRecordDecl {{.*}} implicit struct Foo +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_streaming 'void () __arm_streaming' +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_streaming_compatible 'void () __arm_streaming_compatible' +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_locally_streaming 'void ()' +// CHECK-NEXT: | `-ArmLocallyStreamingAttr +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_shared_za 'void () __arm_inout("za")' +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_new_za 'void ()' +// CHECK-NEXT: | `-ArmNewAttr {{.*}} za +// CHECK-NEXT: |-CXXMethodDecl {{.*}} f_preserves_za 'void () __arm_preserves("za")' + void f_streaming() __arm_streaming; + void f_streaming_compatible() __arm_streaming_compatible; + __arm_locally_streaming void f_locally_streaming(); + void f_shared_za() __arm_inout("za"); + __arm_new("za") void f_new_za(); + void f_preserves_za() __arm_preserves("za"); + + +// CHECK: |-CXXMethodDecl {{.*}} test_lambda 'int (int)' implicit-inline +// CHECK: `-CompoundStmt +// CHECK-NEXT: |-DeclStmt +// CHECK-NEXT: | `-VarDecl +// CHECK-NEXT: | `-LambdaExpr +// CHECK-NEXT: | |-CXXRecordDecl +// CHECK: | | |-CXXMethodDecl {{.*}} used constexpr operator() 'int (int) __arm_streaming const' inline +// CHECK: | | |-CXXConversionDecl {{.*}} implicit constexpr operator int (*)(int) __arm_streaming 'int (*() const noexcept)(int) __arm_streaming' inline +// CHECK: | | |-CXXMethodDecl {{.*}} implicit __invoke 'int (int) __arm_streaming' static inline +// CHECK: `-ReturnStmt +// CHECK: `-CXXOperatorCallExpr +// CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'int (*)(int) __arm_streaming const' +// CHECK-NEXT: | `-DeclRefExpr {{.*}} 'int (int) __arm_streaming const' lvalue CXXMethod {{.*}} 'operator()' 'int (int) __arm_streaming const' + int test_lambda(int x) { + auto F = [](int x) __arm_streaming { return x; }; + return F(x); + } + +// CHECK: |-TypedefDecl {{.*}} referenced s_ptrty 'void (*)(int, int) __arm_streaming' +// CHECK-NEXT: | `-PointerType {{.*}} 'void (*)(int, int) __arm_streaming' +// CHECK-NEXT: | `-ParenType {{.*}} 'void (int, int) __arm_streaming' sugar +// CHECK-NEXT: | `-FunctionProtoType {{.*}} 'void (int, int) __arm_streaming' cdecl + typedef void (*s_ptrty) (int, int) __arm_streaming; + +// CHECK: `-CXXMethodDecl {{.*}} test_streaming_ptrty 'void (s_ptrty, int, int)' implicit-inline +// CHECK-NEXT: |-ParmVarDecl {{.*}} used f 's_ptrty':'void (*)(int, int) __arm_streaming' +// CHECK-NEXT: |-ParmVarDecl {{.*}} used x 'int' +// CHECK-NEXT: |-ParmVarDecl {{.*}} used y 'int' +// CHECK: `-CompoundStmt +// CHECK-NEXT: `-ReturnStmt +// CHECK-NEXT: `-CallExpr +// CHECK-NEXT: |-ImplicitCastExpr {{.*}} 's_ptrty':'void (*)(int, int) __arm_streaming' +// CHECK-NEXT: 
| `-DeclRefExpr {{.*}} 's_ptrty':'void (*)(int, int) __arm_streaming' lvalue ParmVar {{.*}} 'f' 's_ptrty':'void (*)(int, int) __arm_streaming' +// CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: | `-DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} 'x' 'int' +// CHECK-NEXT: `-ImplicitCastExpr {{.*}} 'int' +// CHECK-NEXT: `-DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} 'y' 'int' + void test_streaming_ptrty(s_ptrty f, int x, int y) { return f(x, y); }; +}; diff --git a/clang/test/CodeGen/aarch64-inline-asm.c b/clang/test/CodeGen/aarch64-inline-asm.c index 439fb9e33f9ae15f51da654ec1f462d99065742c..8ddee560b11da4f1aec7c5026fe3a0c0a876b512 100644 --- a/clang/test/CodeGen/aarch64-inline-asm.c +++ b/clang/test/CodeGen/aarch64-inline-asm.c @@ -80,3 +80,26 @@ void test_tied_earlyclobber(void) { asm("" : "+&r"(a)); // CHECK: call i32 asm "", "=&{x1},0"(i32 %0) } + +void test_reduced_gpr_constraints(int var32, long var64) { + asm("add w0, w0, %0" : : "Uci"(var32) : "w0"); +// CHECK: [[ARG1:%.+]] = load i32, ptr +// CHECK: call void asm sideeffect "add w0, w0, $0", "@3Uci,~{w0}"(i32 [[ARG1]]) + asm("add x0, x0, %0" : : "Uci"(var64) : "x0"); +// CHECK: [[ARG1:%.+]] = load i64, ptr +// CHECK: call void asm sideeffect "add x0, x0, $0", "@3Uci,~{x0}"(i64 [[ARG1]]) + asm("add w0, w0, %0" : : "Ucj"(var32) : "w0"); +// CHECK: [[ARG2:%.+]] = load i32, ptr +// CHECK: call void asm sideeffect "add w0, w0, $0", "@3Ucj,~{w0}"(i32 [[ARG2]]) + asm("add x0, x0, %0" : : "Ucj"(var64) : "x0"); +// CHECK: [[ARG2:%.+]] = load i64, ptr +// CHECK: call void asm sideeffect "add x0, x0, $0", "@3Ucj,~{x0}"(i64 [[ARG2]]) +} + +void test_sme_constraints(){ + asm("movt zt0[3, mul vl], z0" : : : "za"); +// CHECK: call void asm sideeffect "movt zt0[3, mul vl], z0", "~{za}"() + + asm("movt zt0[3, mul vl], z0" : : : "zt0"); +// CHECK: call void asm sideeffect "movt zt0[3, mul vl], z0", "~{zt0}"() +} \ No newline at end of file diff --git a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c new file mode 100644 index 0000000000000000000000000000000000000000..7eb74f28a1c85a73d9e003d7fedcdcb9e2cb920f --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c @@ -0,0 +1,47 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_NONE %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_COMPATIBLE %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_STREAMING %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_LOCALLY %s + +#define __ai __attribute__((always_inline)) +__ai void inlined_fn(void) {} +__ai void inlined_fn_streaming_compatible(void) __arm_streaming_compatible {} +__ai void inlined_fn_streaming(void) __arm_streaming {} +__ai __arm_locally_streaming void inlined_fn_local(void) {} + +#ifdef TEST_NONE +void caller(void) { + inlined_fn(); + inlined_fn_streaming_compatible(); + inlined_fn_streaming(); // expected-error {{always_inline function 'inlined_fn_streaming' and its caller 'caller' have mismatching streaming attributes}} + inlined_fn_local(); // expected-error {{always_inline function 'inlined_fn_local' and its caller 'caller' have mismatching streaming attributes}} +} +#endif + +#ifdef TEST_COMPATIBLE +void caller_compatible(void) __arm_streaming_compatible { + inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_compatible' have 
mismatching streaming attributes}} + inlined_fn_streaming_compatible(); + inlined_fn_streaming(); // expected-error {{always_inline function 'inlined_fn_streaming' and its caller 'caller_compatible' have mismatching streaming attributes}} + inlined_fn_local(); // expected-error {{always_inline function 'inlined_fn_local' and its caller 'caller_compatible' have mismatching streaming attributes}} +} +#endif + +#ifdef TEST_STREAMING +void caller_streaming(void) __arm_streaming { + inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_streaming' have mismatching streaming attributes}} + inlined_fn_streaming_compatible(); + inlined_fn_streaming(); + inlined_fn_local(); +} +#endif + +#ifdef TEST_LOCALLY +__arm_locally_streaming +void caller_local(void) { + inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_local' have mismatching streaming attributes}} + inlined_fn_streaming_compatible(); + inlined_fn_streaming(); + inlined_fn_local(); +} +#endif diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fdd2de11365dd4a1f5ba31694cb3369ea702df7d --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp @@ -0,0 +1,303 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \ +// RUN: -S -disable-O0-optnone -Werror -emit-llvm -o - %s \ +// RUN: | opt -S -passes=mem2reg \ +// RUN: | opt -S -passes=inline \ +// RUN: | FileCheck %s + +extern "C" { + +extern int normal_callee(); + +// == FUNCTION DECLARATIONS == + +int streaming_decl(void) __arm_streaming; +int streaming_compatible_decl(void) __arm_streaming_compatible; +int shared_za_decl(void) __arm_inout("za"); +int preserves_za_decl(void) __arm_preserves("za"); +int private_za_decl(void); + +// == FUNCTION DEFINITIONS == + +// CHECK-LABEL: @streaming_caller() +// CHECK-SAME: #[[SM_ENABLED:[0-9]+]] +// CHECK: call i32 @normal_callee() +// + int streaming_caller() __arm_streaming { + return normal_callee(); +} + +// CHECK: declare i32 @normal_callee() #[[NORMAL_DECL:[0-9]+]] + + +// CHECK-LABEL: @streaming_callee() +// CHECK-SAME: #[[SM_ENABLED]] +// CHECK: call i32 @streaming_decl() #[[SM_ENABLED_CALL:[0-9]+]] +// + int streaming_callee() __arm_streaming { + return streaming_decl(); +} + +// CHECK: declare i32 @streaming_decl() #[[SM_ENABLED_DECL:[0-9]+]] + +// CHECK-LABEL: @streaming_compatible_caller() +// CHECK-SAME: #[[SM_COMPATIBLE:[0-9]+]] +// CHECK: call i32 @normal_callee() +// + int streaming_compatible_caller() __arm_streaming_compatible { + return normal_callee(); +} + +// CHECK-LABEL: @streaming_compatible_callee() +// CHECK-SAME: #[[SM_COMPATIBLE]] +// CHECK: call i32 @streaming_compatible_decl() #[[SM_COMPATIBLE_CALL:[0-9]+]] +// + int streaming_compatible_callee() __arm_streaming_compatible { + return streaming_compatible_decl(); +} + +// CHECK: declare i32 @streaming_compatible_decl() #[[SM_COMPATIBLE_DECL:[0-9]+]] + +// CHECK-LABEL: @locally_streaming_caller() +// CHECK-SAME: #[[SM_BODY:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +__arm_locally_streaming int locally_streaming_caller() { + return normal_callee(); +} + +// CHECK-LABEL: @locally_streaming_callee() +// CHECK-SAME: #[[SM_BODY]] +// CHECK: call i32 @locally_streaming_caller() #[[SM_BODY_CALL:[0-9]+]] +// +__arm_locally_streaming int locally_streaming_callee() { + return 
locally_streaming_caller(); +} + + +// CHECK-LABEL: @shared_za_caller() +// CHECK-SAME: #[[ZA_SHARED:[0-9]+]] +// CHECK: call i32 @normal_callee() +// + int shared_za_caller() __arm_inout("za") { + return normal_callee(); +} + +// CHECK-LABEL: @shared_za_callee() +// CHECK-SAME: #[[ZA_SHARED]] +// CHECK: call i32 @shared_za_decl() #[[ZA_SHARED_CALL:[0-9]+]] +// + int shared_za_callee() __arm_inout("za") { + return shared_za_decl(); +} + +// CHECK: declare i32 @shared_za_decl() #[[ZA_SHARED_DECL:[0-9]+]] + + +// CHECK-LABEL: @preserves_za_caller() +// CHECK-SAME: #[[ZA_PRESERVED:[0-9]+]] +// CHECK: call i32 @normal_callee() +// + int preserves_za_caller() __arm_preserves("za") { + return normal_callee(); +} + +// CHECK-LABEL: @preserves_za_callee() +// CHECK-SAME: #[[ZA_PRESERVED]] +// CHECK: call i32 @preserves_za_decl() #[[ZA_PRESERVED_CALL:[0-9]+]] +// + int preserves_za_callee() __arm_preserves("za") { + return preserves_za_decl(); +} + +// CHECK: declare i32 @preserves_za_decl() #[[ZA_PRESERVED_DECL:[0-9]+]] + + +// CHECK-LABEL: @new_za_caller() +// CHECK-SAME: #[[ZA_NEW:[0-9]+]] +// CHECK: call i32 @normal_callee() +// +__arm_new("za") int new_za_caller() { + return normal_callee(); +} + +// CHECK-LABEL: @new_za_callee() +// CHECK-SAME: #[[ZA_NEW]] +// CHECK: call i32 @private_za_decl() +// +__arm_new("za") int new_za_callee() { + return private_za_decl(); +} + +// CHECK: declare i32 @private_za_decl() + + +// Ensure that the attributes are correctly propagated to function types +// and also to callsites. +typedef void (*s_ptrty) (int, int) __arm_streaming; +typedef void (*sc_ptrty) (int, int) __arm_streaming_compatible; +typedef void (*sz_ptrty) (int, int) __arm_inout("za"); +typedef void (*pz_ptrty) (int, int) __arm_preserves("za"); + +// CHECK-LABEL: @test_streaming_ptrty( +// CHECK-SAME: #[[NORMAL_DEF:[0-9]+]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[SM_ENABLED_CALL]] +// +void test_streaming_ptrty(s_ptrty f, int x, int y) { return f(x, y); } +// CHECK-LABEL: @test_streaming_compatible_ptrty( +// CHECK-SAME: #[[NORMAL_DEF]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[SM_COMPATIBLE_CALL]] +// +void test_streaming_compatible_ptrty(sc_ptrty f, int x, int y) { return f(x, y); } +// CHECK-LABEL: @test_shared_za( +// CHECK-SAME: #[[ZA_SHARED]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ZA_SHARED_CALL]] +// +void test_shared_za(sz_ptrty f, int x, int y) __arm_inout("za") { return f(x, y); } +// CHECK-LABEL: @test_preserved_za( +// CHECK-SAME: #[[ZA_SHARED]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ZA_PRESERVED_CALL]] +// +void test_preserved_za(pz_ptrty f, int x, int y) __arm_inout("za") { return f(x, y); } + +// CHECK-LABEL: @test_indirect_streaming_ptrty( +// CHECK-SAME: #[[NORMAL_DEF:[0-9]+]] +// CHECK: call void [[F:%.*]](i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[SM_ENABLED_CALL]] +// +typedef s_ptrty **indirect_s_ptrty; +void test_indirect_streaming_ptrty(indirect_s_ptrty fptr, int x, int y) { return (**fptr)(x, y); } +} // extern "C" + +// +// Test that having the attribute in different places (on declaration and on type) +// both results in the attribute being applied to the type. 
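As the test comment says, the SME attributes are part of the canonical FunctionProtoType, so they travel with the type (for example through decltype) and cannot be silently dropped on redeclaration. A condensed sketch of what the CHECK lines below verify, not part of the patch:

    void f(void) __arm_streaming;
    decltype(f) g;  // g is also 'void () __arm_streaming'
    // Redeclaring f without __arm_streaming would instead be rejected by the
    // new err_sme_attr_mismatch check in MergeFunctionDecl.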
+// + +// CHECK-LABEL: @_Z24test_same_type_streamingv( +// CHECK: call void @_Z10streaming1v() #[[SM_ENABLED_CALL]] +// CHECK: call void @_Z10streaming2v() #[[SM_ENABLED_CALL]] +// CHECK: call void @_Z20same_type_streaming1v() #[[SM_ENABLED_CALL]] +// CHECK: call void @_Z20same_type_streaming2v() #[[SM_ENABLED_CALL]] +// CHECK: ret void +// CHECK: } +// CHECK: declare void @_Z10streaming1v() #[[SM_ENABLED_DECL]] +// CHECK: declare void @_Z10streaming2v() #[[SM_ENABLED_DECL]] +// CHECK: declare void @_Z20same_type_streaming1v() #[[SM_ENABLED_DECL]] +// CHECK: declare void @_Z20same_type_streaming2v() #[[SM_ENABLED_DECL]] +void streaming1(void) __arm_streaming; +void streaming2() __arm_streaming; +decltype(streaming1) same_type_streaming1; +decltype(streaming2) same_type_streaming2; +void test_same_type_streaming() { + streaming1(); + streaming2(); + same_type_streaming1(); + same_type_streaming2(); +} + +// +// Test overloading; the attribute is not required for overloaded types and +// does not apply if not specified. +// + +// CHECK-LABEL: @_Z12overloadedfni( +// CHECK-SAME: #[[SM_ENABLED]] +int overloadedfn(int x) __arm_streaming { return x; } +// CHECK-LABEL: @_Z12overloadedfnf( +// CHECK-SAME: #[[NORMAL_DEF]] +// +float overloadedfn(float x) { return x; } +// CHECK-LABEL: @_Z13test_overloadi( +// CHECK-SAME: #[[NORMAL_DEF]] +// +int test_overload(int x) { return overloadedfn(x); } +// CHECK-LABEL: @_Z13test_overloadf( +// CHECK-SAME: #[[NORMAL_DEF]] +// +float test_overload(float x) { return overloadedfn(x); } + +// CHECK-LABEL: @_Z11test_lambdai( +// CHECK-SAME: #[[NORMAL_DEF]] +// CHECK: call noundef i32 @"_ZZ11test_lambdaiENK3$_0clEi"({{.*}}) #[[SM_ENABLED_CALL]] +// +// CHECK: @"_ZZ11test_lambdaiENK3$_0clEi"( +// CHECK-SAME: #[[SM_ENABLED]] +int test_lambda(int x) { + auto F = [](int x) __arm_streaming { return x; }; + return F(x); +} + +// CHECK-LABEL: @_Z27test_template_instantiationv( +// CHECK-SAME: #[[NORMAL_DEF]] +// CHECK: call noundef i32 @_Z15template_functyIiET_S0_(i32 noundef 12) #[[SM_ENABLED_CALL]] +// +// CHECK: @_Z15template_functyIiET_S0_( +// CHECK-SAME: #[[SM_ENABLED]] +template +Ty template_functy(Ty x) __arm_streaming { return x; } +int test_template_instantiation() { return template_functy(12); } + +// +// Test that arm_locally_streaming is inherited by future redeclarations, +// even when they don't specify the attribute. +// + +// CHECK: define {{.*}} @_Z25locally_streaming_inheritv( +// CHECK-SAME: #[[SM_BODY]] +__arm_locally_streaming void locally_streaming_inherit(); +void locally_streaming_inherit() { + streaming_decl(); +} + +// Test that the attributes are propagated properly to calls +// when using a variadic template as indirection. +__attribute__((always_inline)) +int call() { return 0; } + +template +__attribute__((always_inline)) +int call(T f, Other... 
other) __arm_inout("za") { + return f() + call(other...); +} + +// CHECK: {{.*}} @_Z22test_variadic_templatev( +// CHECK: call {{.*}} i32 @normal_callee() #[[NOUNWIND_CALL:[0-9]+]] +// CHECK-NEXT: call {{.*}} i32 @streaming_decl() #[[NOUNWIND_SM_ENABLED_CALL:[0-9]+]] +// CHECK-NEXT: call {{.*}} i32 @streaming_compatible_decl() #[[NOUNWIND_SM_COMPATIBLE_CALL:[0-9]+]] +// CHECK-NEXT: call {{.*}} i32 @shared_za_decl() #[[NOUNWIND_ZA_SHARED_CALL:[0-9]+]] +// CHECK-NEXT: call {{.*}} i32 @preserves_za_decl() #[[NOUNWIND_ZA_PRESERVED_CALL:[0-9]+]] +// CHECK-NEXT: add nsw +// CHECK-NEXT: add nsw +// CHECK-NEXT: add nsw +// CHECK-NEXT: add nsw +// CHECK-NEXT: ret +int test_variadic_template() __arm_inout("za") { + return call(normal_callee, + streaming_decl, + streaming_compatible_decl, + shared_za_decl, + preserves_za_decl); +} + +// CHECK: attributes #[[SM_ENABLED]] = { mustprogress noinline nounwind "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[NORMAL_DECL]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_ENABLED_DECL]] = { "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_COMPATIBLE]] = { mustprogress noinline nounwind "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_COMPATIBLE_DECL]] = { "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind "aarch64_new_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[NORMAL_DEF]] = { mustprogress noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } +// CHECK: attributes #[[SM_ENABLED_CALL]] = { "aarch64_pstate_sm_enabled" } +// CHECK: attributes #[[SM_COMPATIBLE_CALL]] = { "aarch64_pstate_sm_compatible" } +// CHECK: attributes #[[SM_BODY_CALL]] = { "aarch64_pstate_sm_body" } +// CHECK: attributes #[[ZA_SHARED_CALL]] = { "aarch64_inout_za" } +// CHECK: attributes #[[ZA_PRESERVED_CALL]] = { "aarch64_preserves_za" } +// CHECK: attributes #[[NOUNWIND_CALL]] = { nounwind } +// CHECK: attributes #[[NOUNWIND_SM_ENABLED_CALL]] = { nounwind "aarch64_pstate_sm_enabled" } +// CHECK: attributes #[[NOUNWIND_SM_COMPATIBLE_CALL]] = { nounwind 
"aarch64_pstate_sm_compatible" } +// CHECK: attributes #[[NOUNWIND_ZA_SHARED_CALL]] = { nounwind "aarch64_inout_za" } +// CHECK: attributes #[[NOUNWIND_ZA_PRESERVED_CALL]] = { nounwind "aarch64_preserves_za" } + diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c index b0855553df79f6881acddebb88a9c65bb03a7943..ce951f4423b606166a802f3ba03bc5138a2025d0 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c @@ -1,11 +1,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 @@ -21,7 +21,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) { +void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddha_za32, _u32, _m)(0, pn, pm, zn); } @@ -33,7 +33,7 @@ void test_svaddha_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) { +void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddha_za32, _u32, _m)(3, pn, pm, zn); } @@ -45,7 +45,7 @@ void test_svaddha_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) { +void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, 
svint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddha_za32, _s32, _m)(0, pn, pm, zn); } @@ -57,7 +57,7 @@ void test_svaddha_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) { +void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddha_za32, _s32, _m)(3, pn, pm, zn); } @@ -69,7 +69,7 @@ void test_svaddha_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) { +void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddva_za32, _u32, _m)(0, pn, pm, zn); } @@ -81,7 +81,7 @@ void test_svaddva_za32_u32(svbool_t pn, svbool_t pm, svuint32_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) { +void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddva_za32, _u32, _m)(3, pn, pm, zn); } @@ -93,7 +93,7 @@ void test_svaddva_za32_u32_1(svbool_t pn, svbool_t pm, svuint32_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) { +void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddva_za32, _s32, _m)(0, pn, pm, zn); } @@ -105,6 +105,6 @@ void test_svaddva_za32_s32(svbool_t pn, svbool_t pm, svint32_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv4i32(i32 3, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) { +void test_svaddva_za32_s32_1(svbool_t pn, svbool_t pm, svint32_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddva_za32, _s32, _m)(3, pn, pm, zn); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c index 2f0f97e742e3e1707e9c20b5da832e67e3bc7e1c..d15a068fa6403beb71499bcc45372798e1720e2d 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c @@ -1,11 +1,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s 
-check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 @@ -21,7 +21,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) { +void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddha_za64, _u64, _m)(0, pn, pm, zn); } @@ -33,7 +33,7 @@ void test_svaddha_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) { +void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddha_za64, _u64, _m)(7, pn, pm, zn); } @@ -45,7 +45,7 @@ void test_svaddha_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) { +void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddha_za64, _s64, _m)(0, pn, pm, zn); } @@ -57,7 +57,7 @@ void test_svaddha_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.addha.nxv2i64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) { +void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddha_za64, _s64, _m)(7, pn, pm, zn); } @@ -69,7 +69,7 @@ void test_svaddha_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) { +void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddva_za64, _u64, _m)(0, pn, pm, zn); } @@ -81,7 +81,7 @@ void test_svaddva_za64_u64(svbool_t pn, svbool_t pm, svuint64_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) { +void 
test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddva_za64, _u64, _m)(7, pn, pm, zn); } @@ -93,7 +93,7 @@ void test_svaddva_za64_u64_1(svbool_t pn, svbool_t pm, svuint64_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) { +void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddva_za64, _s64, _m)(0, pn, pm, zn); } @@ -105,6 +105,6 @@ void test_svaddva_za64_s64(svbool_t pn, svbool_t pm, svint64_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.addva.nxv2i64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svaddva_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) { +void test_svaddva_za64_s64_1(svbool_t pn, svbool_t pm, svint64_t zn) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svaddva_za64, _s64, _m)(7, pn, pm, zn); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c index b3b2499a383033244a4a73a5bcaf372977773a31..c80c8bd37e959a3619f6731da7ab7ade3b735861 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c @@ -1,9 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s -#include +#include // CHECK-C-LABEL: @test_svcntsb( // CHECK-CXX-LABEL: @_Z12test_svcntsbv( diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c index c309bde627f7df2629cd41403e6a6ffb440a9e8a..6fbbadc7fc8c6813a24b350a8e23e7cd7e79e999 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c @@ -1,15 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s 
-check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s -#include - -#ifdef DISABLE_SME_ATTRIBUTES -#define ARM_STREAMING_ATTR -#else -#define ARM_STREAMING_ATTR __attribute__((arm_streaming)) -#endif +#include // CHECK-C-LABEL: @test_svld1_hor_za8( // CHECK-CXX-LABEL: @_Z18test_svld1_hor_za8ju10__SVBool_tPKv( @@ -19,9 +13,9 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_hor_za8(0, slice_base, 0, pg, ptr); - svld1_hor_za8(0, slice_base, 15, pg, ptr); +void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { + svld1_hor_za8(0, slice_base, pg, ptr); + svld1_hor_za8(0, slice_base + 15, pg, ptr); } // CHECK-C-LABEL: @test_svld1_hor_za16( @@ -33,9 +27,9 @@ ARM_STREAMING_ATTR void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, con // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_hor_za16(0, slice_base, 0, pg, ptr); - svld1_hor_za16(1, slice_base, 7, pg, ptr); +void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { + svld1_hor_za16(0, slice_base, pg, ptr); + svld1_hor_za16(1, slice_base + 7, pg, ptr); } // CHECK-C-LABEL: @test_svld1_hor_za32( @@ -47,9 +41,9 @@ ARM_STREAMING_ATTR void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_hor_za32(0, slice_base, 0, pg, ptr); - svld1_hor_za32(3, slice_base, 3, pg, ptr); +void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { + svld1_hor_za32(0, slice_base, pg, ptr); + svld1_hor_za32(3, slice_base + 3, pg, ptr); } // CHECK-C-LABEL: @test_svld1_hor_za64( @@ -61,9 +55,9 @@ ARM_STREAMING_ATTR void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_hor_za64(0, slice_base, 0, pg, ptr); - svld1_hor_za64(7, slice_base, 1, pg, ptr); +void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { + svld1_hor_za64(0, slice_base, pg, ptr); + svld1_hor_za64(7, slice_base + 1, pg, ptr); } // CHECK-C-LABEL: @test_svld1_hor_za128( @@ -74,9 +68,9 @@ ARM_STREAMING_ATTR void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_hor_za128(0, slice_base, 0, pg, ptr); - 
svld1_hor_za128(15, slice_base, 0, pg, ptr); +void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { + svld1_hor_za128(0, slice_base, pg, ptr); + svld1_hor_za128(15, slice_base, pg, ptr); } // CHECK-C-LABEL: @test_svld1_ver_za8( @@ -87,9 +81,9 @@ ARM_STREAMING_ATTR void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, c // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_ver_za8(0, slice_base, 0, pg, ptr); - svld1_ver_za8(0, slice_base, 15, pg, ptr); +void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { + svld1_ver_za8(0, slice_base, pg, ptr); + svld1_ver_za8(0, slice_base + 15, pg, ptr); } // CHECK-C-LABEL: @test_svld1_ver_za16( @@ -101,9 +95,9 @@ ARM_STREAMING_ATTR void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, con // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_ver_za16(0, slice_base, 0, pg, ptr); - svld1_ver_za16(1, slice_base, 7, pg, ptr); +void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { + svld1_ver_za16(0, slice_base, pg, ptr); + svld1_ver_za16(1, slice_base + 7, pg, ptr); } // CHECK-C-LABEL: @test_svld1_ver_za32( @@ -115,9 +109,9 @@ ARM_STREAMING_ATTR void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_ver_za32(0, slice_base, 0, pg, ptr); - svld1_ver_za32(3, slice_base, 3, pg, ptr); +void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { + svld1_ver_za32(0, slice_base, pg, ptr); + svld1_ver_za32(3, slice_base + 3, pg, ptr); } // CHECK-C-LABEL: @test_svld1_ver_za64( @@ -129,9 +123,9 @@ ARM_STREAMING_ATTR void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_ver_za64(0, slice_base, 0, pg, ptr); - svld1_ver_za64(7, slice_base, 1, pg, ptr); +void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { + svld1_ver_za64(0, slice_base, pg, ptr); + svld1_ver_za64(7, slice_base + 1, pg, ptr); } // CHECK-C-LABEL: @test_svld1_ver_za128( @@ -142,7 +136,7 @@ ARM_STREAMING_ATTR void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, co // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) { - svld1_ver_za128(0, slice_base, 0, pg, ptr); - svld1_ver_za128(15, slice_base, 0, pg, ptr); +void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) __arm_streaming __arm_out("za") { + svld1_ver_za128(0, slice_base, pg, 
ptr); + svld1_ver_za128(15, slice_base, pg, ptr); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c index 6c80ef55f81895d0fe57c10a9eedbd6f878a8c40..37e0a5853e7746c6fa6f336b61ce09d5d29ef197 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c @@ -1,15 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s -#include - -#ifdef DISABLE_SME_ATTRIBUTES -#define ARM_STREAMING_ATTR -#else -#define ARM_STREAMING_ATTR __attribute__((arm_streaming)) -#endif +#include // CHECK-C-LABEL: @test_svld1_hor_vnum_za8( // CHECK-CXX-LABEL: @_Z23test_svld1_hor_vnum_za8ju10__SVBool_tPKvl( @@ -22,9 +16,9 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_hor_vnum_za8(0, slice_base, 0, pg, ptr, vnum); - svld1_hor_vnum_za8(0, slice_base, 15, pg, ptr, vnum); +void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { + svld1_hor_vnum_za8(0, slice_base, pg, ptr, vnum); + svld1_hor_vnum_za8(0, slice_base + 15, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_hor_vnum_za16( @@ -39,9 +33,9 @@ ARM_STREAMING_ATTR void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_hor_vnum_za16(0, slice_base, 0, pg, ptr, vnum); - svld1_hor_vnum_za16(1, slice_base, 7, pg, ptr, vnum); +void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { + svld1_hor_vnum_za16(0, slice_base, pg, ptr, vnum); + svld1_hor_vnum_za16(1, slice_base + 7, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_hor_vnum_za32( @@ -56,9 +50,9 @@ ARM_STREAMING_ATTR void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_hor_vnum_za32(0, slice_base, 0, pg, ptr, 
vnum); - svld1_hor_vnum_za32(3, slice_base, 3, pg, ptr, vnum); +void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { + svld1_hor_vnum_za32(0, slice_base, pg, ptr, vnum); + svld1_hor_vnum_za32(3, slice_base + 3, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_hor_vnum_za64( @@ -73,9 +67,9 @@ ARM_STREAMING_ATTR void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_hor_vnum_za64(0, slice_base, 0, pg, ptr, vnum); - svld1_hor_vnum_za64(7, slice_base, 1, pg, ptr, vnum); +void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { + svld1_hor_vnum_za64(0, slice_base, pg, ptr, vnum); + svld1_hor_vnum_za64(7, slice_base + 1, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_hor_vnum_za128( @@ -89,9 +83,9 @@ ARM_STREAMING_ATTR void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_hor_vnum_za128(0, slice_base, 0, pg, ptr, vnum); - svld1_hor_vnum_za128(15, slice_base, 0, pg, ptr, vnum); +void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { + svld1_hor_vnum_za128(0, slice_base, pg, ptr, vnum); + svld1_hor_vnum_za128(15, slice_base, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_ver_hor_za8( @@ -105,9 +99,9 @@ ARM_STREAMING_ATTR void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_ver_vnum_za8(0, slice_base, 0, pg, ptr, vnum); - svld1_ver_vnum_za8(0, slice_base, 15, pg, ptr, vnum); +void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { + svld1_ver_vnum_za8(0, slice_base, pg, ptr, vnum); + svld1_ver_vnum_za8(0, slice_base + 15, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_ver_vnum_za16( @@ -122,9 +116,9 @@ ARM_STREAMING_ATTR void test_svld1_ver_hor_za8(uint32_t slice_base, svbool_t pg, // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_ver_vnum_za16(0, slice_base, 0, pg, ptr, vnum); - svld1_ver_vnum_za16(1, slice_base, 7, pg, ptr, vnum); +void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { + svld1_ver_vnum_za16(0, slice_base, pg, ptr, vnum); + svld1_ver_vnum_za16(1, slice_base + 7, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_ver_vnum_za32( @@ -139,9 +133,9 @@ ARM_STREAMING_ATTR void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert( [[TMP0]], 
[[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_ver_vnum_za32(0, slice_base, 0, pg, ptr, vnum); - svld1_ver_vnum_za32(3, slice_base, 3, pg, ptr, vnum); +void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { + svld1_ver_vnum_za32(0, slice_base, pg, ptr, vnum); + svld1_ver_vnum_za32(3, slice_base + 3, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_ver_vnum_za64( @@ -156,9 +150,9 @@ ARM_STREAMING_ATTR void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_ver_vnum_za64(0, slice_base, 0, pg, ptr, vnum); - svld1_ver_vnum_za64(7, slice_base, 1, pg, ptr, vnum); +void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { + svld1_ver_vnum_za64(0, slice_base, pg, ptr, vnum); + svld1_ver_vnum_za64(7, slice_base + 1, pg, ptr, vnum); } // CHECK-C-LABEL: @test_svld1_ver_vnum_za128( @@ -172,7 +166,7 @@ ARM_STREAMING_ATTR void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t p // CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) { - svld1_ver_vnum_za128(0, slice_base, 0, pg, ptr, vnum); - svld1_ver_vnum_za128(15, slice_base, 0, pg, ptr, vnum); +void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) __arm_streaming __arm_out("za") { + svld1_ver_vnum_za128(0, slice_base, pg, ptr, vnum); + svld1_ver_vnum_za128(15, slice_base, pg, ptr, vnum); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c index 7efa8b1556857cfdf0606c61a94bee39a46dec73..bf7cffc1b0834d600530bc131d2a43670d98d913 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c @@ -1,30 +1,57 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s -#include +#include // CHECK-C-LABEL: @test_svldr_vnum_za( // CHECK-CXX-LABEL: @_Z18test_svldr_vnum_zajPKv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
 // CHECK-NEXT: ret void
 //
-void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) {
-  svldr_vnum_za(slice_base, 0, ptr);
+void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) __arm_out("za") {
+  svldr_vnum_za(slice_base, ptr, 0);
 }
 
 // CHECK-C-LABEL: @test_svldr_vnum_za_1(
 // CHECK-CXX-LABEL: @_Z20test_svldr_vnum_za_1jPKv(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15
-// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]]
-// CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 15
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15)
 // CHECK-NEXT: ret void
 //
-void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) {
-  svldr_vnum_za(slice_base, 15, ptr);
+void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) __arm_out("za") {
+  svldr_vnum_za(slice_base, ptr, 15);
+}
+
+// CHECK-C-LABEL: @test_svldr_za(
+// CHECK-CXX-LABEL: @_Z13test_svldr_zajPKv(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
+// CHECK-NEXT: ret void
+//
+void test_svldr_za(uint32_t slice_base, const void *ptr) __arm_out("za") {
+  svldr_za(slice_base, ptr);
+}
+
+// CHECK-C-LABEL: @test_svldr_vnum_za_var(
+// CHECK-CXX-LABEL: @_Z22test_svldr_vnum_za_varjPKvl(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VNUM:%.*]] to i32
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0]])
+// CHECK-NEXT: ret void
+//
+void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) __arm_out("za") {
+  svldr_vnum_za(slice_base, ptr, vnum);
+}
+
+// CHECK-C-LABEL: @test_svldr_vnum_za_2(
+// CHECK-CXX-LABEL: @_Z20test_svldr_vnum_za_2jPKv(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16)
+// CHECK-NEXT: ret void
+//
+void test_svldr_vnum_za_2(uint32_t slice_base, const void *ptr) __arm_out("za") {
+  svldr_vnum_za(slice_base, ptr, 16);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
index b52aee12f9c7037f55b3af3d7bc70a3815e65935..bceb2d6f83df9d97678b1cf729142361469d4235 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
@@ -1,11 +1,11 @@
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu
-target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 @@ -19,7 +19,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) { +void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmopa_za32, _s8, _m)(0, pn, pm, zn, zm); } @@ -29,7 +29,7 @@ void test_svmopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) { +void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmopa_za32, _u8, _m)(0, pn, pm, zn, zm); } @@ -41,7 +41,7 @@ void test_svmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.wide.nxv8bf16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) { +void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmopa_za32, _bf16, _m)(0, pn, pm, zn, zm); } @@ -53,7 +53,7 @@ void test_svmopa_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.wide.nxv8f16(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) { +void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmopa_za32, _f16, _m)(1, pn, pm, zn, zm); } @@ -65,7 +65,7 @@ void test_svmopa_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv4f32(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) { +void 
test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmopa_za32, _f32, _m)(1, pn, pm, zn, zm); } @@ -75,7 +75,7 @@ void test_svmopa_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.sumopa.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) { +void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svsumopa_za32, _s8, _m)(0, pn, pm, zn, zm); } @@ -85,6 +85,6 @@ void test_svsumopa_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) // CHECK-NEXT: tail call void @llvm.aarch64.sme.usmopa.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svusmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) { +void test_svusmopa_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svusmopa_za32, _u8, _m)(0, pn, pm, zn, zm); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c index 6925a450ba386fbca53ed182db94bf699e335d09..7cdeb8731d0be6ffbf3297b2718dc60ff42da5c5 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c @@ -1,11 +1,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 
-target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 @@ -18,11 +18,11 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smopa.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) { - SME_ACLE_FUNC(svmopa_za64, _s16, _m)(1, pn, pm, zn, zm); +void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmopa_za64, _s16, _m)(7, pn, pm, zn, zm); } // CHECK-C-LABEL: @test_svmopa_za64_u16( @@ -33,7 +33,7 @@ void test_svmopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) // CHECK-NEXT: tail call void @llvm.aarch64.sme.umopa.wide.nxv8i16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) { +void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmopa_za64, _u16, _m)(0, pn, pm, zn, zm); } @@ -42,11 +42,11 @@ void test_svmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mopa.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) { - SME_ACLE_FUNC(svmopa_za64, _f64, _m)(1, pn, pm, zn, zm); +void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmopa_za64, _f64, _m)(7, pn, pm, zn, zm); } // CHECK-C-LABEL: @test_svsumopa_za64_s16( @@ -57,7 +57,7 @@ void test_svmopa_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.sumopa.wide.nxv8i16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) { +void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svsumopa_za64, _s16, _m)(0, pn, pm, zn, zm); } @@ -66,9 +66,9 @@ void test_svsumopa_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t z // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) -// CHECK-NEXT: tail 
call void @llvm.aarch64.sme.usmopa.wide.nxv8i16(i32 2, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmopa.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svusmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) { - SME_ACLE_FUNC(svusmopa_za64, _u16, _m)(2, pn, pm, zn, zm); +void test_svusmopa_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svusmopa_za64, _u16, _m)(7, pn, pm, zn, zm); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c index 923b6b96b4b4e0184ada7bb790971b6ad8fc6de7..51fffd42b9442765f6136cc50487e3a87d2c8f23 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c @@ -1,11 +1,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 @@ -19,7 +19,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) { +void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmops_za32, _s8, _m)(0, pn, pm, zn, zm); } @@ -29,7 +29,7 @@ void test_svmops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svint8_t zm) { // CHECK-NEXT: tail call void 
@llvm.aarch64.sme.umops.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) { +void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmops_za32, _u8, _m)(0, pn, pm, zn, zm); } @@ -41,7 +41,7 @@ void test_svmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svuint8_t zm) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.wide.nxv8bf16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) { +void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmops_za32, _bf16, _m)(0, pn, pm, zn, zm); } @@ -53,7 +53,7 @@ void test_svmops_za32_bf16(svbool_t pn, svbool_t pm, svbfloat16_t zn, svbfloat16 // CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.wide.nxv8f16(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) { +void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmops_za32, _f16, _m)(1, pn, pm, zn, zm); } @@ -65,7 +65,7 @@ void test_svmops_za32_f16(svbool_t pn, svbool_t pm, svfloat16_t zn, svfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv4f32(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) { +void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmops_za32, _f32, _m)(1, pn, pm, zn, zm); } @@ -75,7 +75,7 @@ void test_svmops_za32_f32(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.sumops.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) { +void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svsumops_za32, _s8, _m)(0, pn, pm, zn, zm); } @@ -85,6 +85,6 @@ void test_svsumops_za32_s8(svbool_t pn, svbool_t pm, svint8_t zn, svuint8_t zm) // CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv16i8(i32 0, [[PN:%.*]], [[PM:%.*]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) { +void test_svusmops_za32_u8(svbool_t pn, svbool_t pm, svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svusmops_za32, _u8, _m)(0, pn, pm, zn, zm); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c index 447bca055ad42201a24d6de7572b920bf8923efb..24141cd1f20bc98287febdd80524d0d547318f94 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c @@ -1,11 +1,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s 
-check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +bf16 -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 @@ -18,11 +18,11 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smops.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) { - SME_ACLE_FUNC(svmops_za64, _s16, _m)(1, pn, pm, zn, zm); +void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmops_za64, _s16, _m)(7, pn, pm, zn, zm); } // CHECK-C-LABEL: @test_svmops_za64_u16( @@ -33,7 +33,7 @@ void test_svmops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svint16_t zm) // CHECK-NEXT: tail call void @llvm.aarch64.sme.umops.wide.nxv8i16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) { +void test_svmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svmops_za64, _u16, _m)(0, pn, pm, zn, zm); } @@ -42,11 +42,11 @@ void test_svmops_za64_u16(svbool_t pn, svbool_t pm, 
svuint16_t zn, svuint16_t zm // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PN:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PM:%.*]]) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv2f64(i32 1, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mops.nxv2f64(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) { - SME_ACLE_FUNC(svmops_za64, _f64, _m)(1, pn, pm, zn, zm); +void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmops_za64, _f64, _m)(7, pn, pm, zn, zm); } // CHECK-C-LABEL: @test_svsumops_za64_s16( @@ -57,7 +57,7 @@ void test_svmops_za64_f64(svbool_t pn, svbool_t pm, svfloat64_t zn, svfloat64_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.sumops.wide.nxv8i16(i32 0, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) { +void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { SME_ACLE_FUNC(svsumops_za64, _s16, _m)(0, pn, pm, zn, zm); } @@ -66,9 +66,9 @@ void test_svsumops_za64_s16(svbool_t pn, svbool_t pm, svint16_t zn, svuint16_t z // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PN:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PM:%.*]]) -// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i32 2, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmops.wide.nxv8i16(i32 7, [[TMP0]], [[TMP1]], [[ZN:%.*]], [[ZM:%.*]]) // CHECK-NEXT: ret void // -void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) { - SME_ACLE_FUNC(svusmops_za64, _u16, _m)(2, pn, pm, zn, zm); +void test_svusmops_za64_u16(svbool_t pn, svbool_t pm, svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svusmops_za64, _u16, _m)(7, pn, pm, zn, zm); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c index 19e2b42e13f2d64c3322d36cd10ecf28bbedd0f5..2974511e4ebbb12eb3cc93344a97f4b3fcb41f67 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c @@ -1,11 +1,11 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve 
-S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 @@ -19,8 +19,8 @@ // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice_base, 0); +svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za8_s8_1( @@ -30,8 +30,9 @@ svint8_t test_svread_hor_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) { // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice_base, 15); +svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 15; + return SME_ACLE_FUNC(svread_hor_za8, _s8, _m)(zd, pg, 0, slice); } // CHECK-C-LABEL: @test_svread_hor_za16_s16( @@ -41,20 +42,21 @@ svint8_t test_svread_hor_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 0, slice_base, 0); +svint16_t test_svread_hor_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za16_s16_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za16_s16_1u11__SVInt16_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 1, slice_base, 7); +svint16_t test_svread_hor_za16_s16_1(svint16_t 
zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 7; + return SME_ACLE_FUNC(svread_hor_za16, _s16, _m)(zd, pg, 1, slice); } // CHECK-C-LABEL: @test_svread_hor_za32_s32( @@ -64,20 +66,21 @@ svint16_t test_svread_hor_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 0, slice_base, 0); +svint32_t test_svread_hor_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za32_s32_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za32_s32_1u11__SVInt32_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 3, slice_base, 3); +svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 3; + return SME_ACLE_FUNC(svread_hor_za32, _s32, _m)(zd, pg, 3, slice); } // CHECK-C-LABEL: @test_svread_hor_za64_s64( @@ -87,20 +90,21 @@ svint32_t test_svread_hor_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 0, slice_base, 0); +svint64_t test_svread_hor_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za64_s64_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za64_s64_1u11__SVInt64_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 7, slice_base, 1); +svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 1; + return SME_ACLE_FUNC(svread_hor_za64, _s64, _m)(zd, pg, 7, slice); } // CHECK-C-LABEL: @test_svread_hor_za8_u8( @@ -109,8 +113,8 @@ svint64_t test_svread_hor_za64_s64_1(svint64_t zd, svbool_t pg, 
uint32_t slice_b // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice_base, 0); +svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za8_u8_1( @@ -120,8 +124,9 @@ svuint8_t test_svread_hor_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice_base, 15); +svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 15; + return SME_ACLE_FUNC(svread_hor_za8, _u8, _m)(zd, pg, 0, slice); } // CHECK-C-LABEL: @test_svread_hor_za16_u16( @@ -131,20 +136,21 @@ svuint8_t test_svread_hor_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 0, slice_base, 0); +svuint16_t test_svread_hor_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za16_u16_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za16_u16_1u12__SVUint16_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 1, slice_base, 7); +svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 7; + return SME_ACLE_FUNC(svread_hor_za16, _u16, _m)(zd, pg, 1, slice); } // CHECK-C-LABEL: @test_svread_hor_za32_u32( @@ -154,20 +160,21 @@ svuint16_t test_svread_hor_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 0, slice_base, 0); +svuint32_t test_svread_hor_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: 
@test_svread_hor_za32_u32_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za32_u32_1u12__SVUint32_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 3, slice_base, 3); +svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 3; + return SME_ACLE_FUNC(svread_hor_za32, _u32, _m)(zd, pg, 3, slice); } // CHECK-C-LABEL: @test_svread_hor_za64_u64( @@ -177,20 +184,21 @@ svuint32_t test_svread_hor_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 0, slice_base, 0); +svuint64_t test_svread_hor_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za64_u64_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za64_u64_1u12__SVUint64_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 7, slice_base, 1); +svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 1; + return SME_ACLE_FUNC(svread_hor_za64, _u64, _m)(zd, pg, 7, slice); } // CHECK-C-LABEL: @test_svread_hor_za16_f16( @@ -200,20 +208,21 @@ svuint64_t test_svread_hor_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 0, slice_base, 0); +svfloat16_t test_svread_hor_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za16_f16_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za16_f16_1u13__SVFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 
[[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 1, slice_base, 7); +svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 7; + return SME_ACLE_FUNC(svread_hor_za16, _f16, _m)(zd, pg, 1, slice); } // CHECK-C-LABEL: @test_svread_hor_za16_bf16( @@ -223,20 +232,21 @@ svfloat16_t test_svread_hor_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 0, slice_base, 0); +svbfloat16_t test_svread_hor_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za16_bf16_1( // CHECK-CXX-LABEL: @_Z27test_svread_hor_za16_bf16_1u14__SVBFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 1, slice_base, 7); +svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 7; + return SME_ACLE_FUNC(svread_hor_za16, _bf16, _m)(zd, pg, 1, slice); } // CHECK-C-LABEL: @test_svread_hor_za32_f32( @@ -246,20 +256,21 @@ svbfloat16_t test_svread_hor_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 0, slice_base, 0); +svfloat32_t test_svread_hor_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za32_f32_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za32_f32_1u13__SVFloat32_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 
[[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 3, slice_base, 3); +svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 3; + return SME_ACLE_FUNC(svread_hor_za32, _f32, _m)(zd, pg, 3, slice); } // CHECK-C-LABEL: @test_svread_hor_za64_f64( @@ -269,20 +280,21 @@ svfloat32_t test_svread_hor_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 0, slice_base, 0); +svfloat64_t test_svread_hor_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za64_f64_1( // CHECK-CXX-LABEL: @_Z26test_svread_hor_za64_f64_1u13__SVFloat64_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 7, slice_base, 1); +svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 1; + return SME_ACLE_FUNC(svread_hor_za64, _f64, _m)(zd, pg, 7, slice); } // CHECK-C-LABEL: @test_svread_hor_za128_s8( @@ -291,8 +303,8 @@ svfloat64_t test_svread_hor_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 0, slice_base, 0); +svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_s8_1( @@ -301,8 +313,8 @@ svint8_t test_svread_hor_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 15, slice_base, 0); +svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _s8, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_s16( @@ -312,8 +324,8 @@ svint8_t test_svread_hor_za128_s8_1(svint8_t zd, svbool_t pg, 
uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 0, slice_base, 0); +svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_s16_1( @@ -323,8 +335,8 @@ svint16_t test_svread_hor_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 15, slice_base, 0); +svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _s16, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_s32( @@ -334,8 +346,8 @@ svint16_t test_svread_hor_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 0, slice_base, 0); +svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_s32_1( @@ -345,8 +357,8 @@ svint32_t test_svread_hor_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 15, slice_base, 0); +svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _s32, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_s64( @@ -356,8 +368,8 @@ svint32_t test_svread_hor_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 0, slice_base, 0); +svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_s64_1( @@ -367,8 +379,8 @@ svint64_t test_svread_hor_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // 
-svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 15, slice_base, 0); +svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _s64, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_u8( @@ -377,8 +389,8 @@ svint64_t test_svread_hor_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 0, slice_base, 0); +svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_u8_1( @@ -387,8 +399,8 @@ svuint8_t test_svread_hor_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 15, slice_base, 0); +svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _u8, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_u16( @@ -398,8 +410,8 @@ svuint8_t test_svread_hor_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 0, slice_base, 0); +svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_u16_1( @@ -409,8 +421,8 @@ svuint16_t test_svread_hor_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 15, slice_base, 0); +svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _u16, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_u32( @@ -420,8 +432,8 @@ svuint16_t test_svread_hor_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 0, slice_base, 0); +svuint32_t 
test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_u32_1( @@ -431,8 +443,8 @@ svuint32_t test_svread_hor_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 15, slice_base, 0); +svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _u32, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_u64( @@ -442,8 +454,8 @@ svuint32_t test_svread_hor_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 0, slice_base, 0); +svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_u64_1( @@ -453,8 +465,8 @@ svuint64_t test_svread_hor_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 15, slice_base, 0); +svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _u64, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_f16( @@ -464,8 +476,8 @@ svuint64_t test_svread_hor_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 0, slice_base, 0); +svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_f16_1( @@ -475,8 +487,8 @@ svfloat16_t test_svread_hor_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _f16, _m)(zd, pg, 15, slice_base, 0); +svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, 
_f16, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_bf16( @@ -486,8 +498,8 @@ svfloat16_t test_svread_hor_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 0, slice_base, 0); +svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_bf16_1( @@ -497,8 +509,8 @@ svbfloat16_t test_svread_hor_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t s // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 15, slice_base, 0); +svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _bf16, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_f32( @@ -508,8 +520,8 @@ svbfloat16_t test_svread_hor_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 0, slice_base, 0); +svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_f32_1( @@ -519,8 +531,8 @@ svfloat32_t test_svread_hor_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 15, slice_base, 0); +svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _f32, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_f64( @@ -530,8 +542,8 @@ svfloat32_t test_svread_hor_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 0, slice_base, 0); +svfloat64_t test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_hor_za128_f64_1( @@ -541,8 +553,8 @@ svfloat64_t 
test_svread_hor_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.horiz.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 15, slice_base, 0); +svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_hor_za128, _f64, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za8_s8( @@ -551,8 +563,8 @@ svfloat64_t test_svread_hor_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice_base, 0); +svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za8_s8_1( @@ -562,8 +574,9 @@ svint8_t test_svread_ver_za8_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice_base, 15); +svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 15; + return SME_ACLE_FUNC(svread_ver_za8, _s8, _m)(zd, pg, 0, slice); } // CHECK-C-LABEL: @test_svread_ver_za16_s16( @@ -573,20 +586,21 @@ svint8_t test_svread_ver_za8_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 0, slice_base, 0); +svint16_t test_svread_ver_za16_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za16_s16_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za16_s16_1u11__SVInt16_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 1, slice_base, 7); +svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 7; + return SME_ACLE_FUNC(svread_ver_za16, _s16, _m)(zd, pg, 1, 
slice); } // CHECK-C-LABEL: @test_svread_ver_za32_s32( @@ -596,20 +610,21 @@ svint16_t test_svread_ver_za16_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 0, slice_base, 0); +svint32_t test_svread_ver_za32_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za32_s32_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za32_s32_1u11__SVInt32_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 3, slice_base, 3); +svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 3; + return SME_ACLE_FUNC(svread_ver_za32, _s32, _m)(zd, pg, 3, slice); } // CHECK-C-LABEL: @test_svread_ver_za64_s64( @@ -619,20 +634,21 @@ svint32_t test_svread_ver_za32_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 0, slice_base, 0); +svint64_t test_svread_ver_za64_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za64_s64_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za64_s64_1u11__SVInt64_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 7, slice_base, 1); +svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 1; + return SME_ACLE_FUNC(svread_ver_za64, _s64, _m)(zd, pg, 7, slice); } // CHECK-C-LABEL: @test_svread_ver_za8_u8( @@ -641,8 +657,8 @@ svint64_t test_svread_ver_za64_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret 
[[TMP0]] // -svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice_base, 0); +svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za8_u8_1( @@ -652,8 +668,9 @@ svuint8_t test_svread_ver_za8_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice_base, 15); +svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 15; + return SME_ACLE_FUNC(svread_ver_za8, _u8, _m)(zd, pg, 0, slice); } // CHECK-C-LABEL: @test_svread_ver_za16_u16( @@ -663,20 +680,21 @@ svuint8_t test_svread_ver_za8_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 0, slice_base, 0); +svuint16_t test_svread_ver_za16_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za16_u16_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za16_u16_1u12__SVUint16_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 1, slice_base, 7); +svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 7; + return SME_ACLE_FUNC(svread_ver_za16, _u16, _m)(zd, pg, 1, slice); } // CHECK-C-LABEL: @test_svread_ver_za32_u32( @@ -686,20 +704,21 @@ svuint16_t test_svread_ver_za16_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 0, slice_base, 0); +svuint32_t test_svread_ver_za32_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za32_u32_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za32_u32_1u12__SVUint32_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 3, slice_base, 3); +svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 3; + return SME_ACLE_FUNC(svread_ver_za32, _u32, _m)(zd, pg, 3, slice); } // CHECK-C-LABEL: @test_svread_ver_za64_u64( @@ -709,20 +728,21 @@ svuint32_t test_svread_ver_za32_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 0, slice_base, 0); +svuint64_t test_svread_ver_za64_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za64_u64_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za64_u64_1u12__SVUint64_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 7, slice_base, 1); +svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 1; + return SME_ACLE_FUNC(svread_ver_za64, _u64, _m)(zd, pg, 7, slice); } // CHECK-C-LABEL: @test_svread_ver_za16_f16( @@ -732,20 +752,21 @@ svuint64_t test_svread_ver_za64_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 0, slice_base, 0); +svfloat16_t test_svread_ver_za16_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za16_f16_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za16_f16_1u13__SVFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.aarch64.sme.read.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 1, slice_base, 7); +svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 7; + return SME_ACLE_FUNC(svread_ver_za16, _f16, _m)(zd, pg, 1, slice); } // CHECK-C-LABEL: @test_svread_ver_za16_bf16( @@ -755,20 +776,21 @@ svfloat16_t test_svread_ver_za16_f16_1(svfloat16_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 0, slice_base, 0); +svbfloat16_t test_svread_ver_za16_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za16_bf16_1( // CHECK-CXX-LABEL: @_Z27test_svread_ver_za16_bf16_1u14__SVBFloat16_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 7 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 1, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 1, slice_base, 7); +svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 7; + return SME_ACLE_FUNC(svread_ver_za16, _bf16, _m)(zd, pg, 1, slice); } // CHECK-C-LABEL: @test_svread_ver_za32_f32( @@ -778,20 +800,21 @@ svbfloat16_t test_svread_ver_za16_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 0, slice_base, 0); +svfloat32_t test_svread_ver_za32_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za32_f32_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za32_f32_1u13__SVFloat32_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 3, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za32, 
_f32, _m)(zd, pg, 3, slice_base, 3); +svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 3; + return SME_ACLE_FUNC(svread_ver_za32, _f32, _m)(zd, pg, 3, slice); } // CHECK-C-LABEL: @test_svread_ver_za64_f64( @@ -801,20 +824,21 @@ svfloat32_t test_svread_ver_za32_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 0, slice_base, 0); +svfloat64_t test_svread_ver_za64_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za64_f64_1( // CHECK-CXX-LABEL: @_Z26test_svread_ver_za64_f64_1u13__SVFloat64_tu10__SVBool_tj( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.read.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 7, i32 [[TILESLICE]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 7, slice_base, 1); +svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + uint32_t slice = slice_base + 1; + return SME_ACLE_FUNC(svread_ver_za64, _f64, _m)(zd, pg, 7, slice); } // CHECK-C-LABEL: @test_svread_ver_za128_s8( @@ -823,8 +847,8 @@ svfloat64_t test_svread_ver_za64_f64_1(svfloat64_t zd, svbool_t pg, uint32_t sli // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 0, slice_base, 0); +svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_s8_1( @@ -833,8 +857,8 @@ svint8_t test_svread_ver_za128_s8(svint8_t zd, svbool_t pg, uint32_t slice_base) // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 15, slice_base, 0); +svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _s8, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_s16( @@ -844,8 +868,8 @@ svint8_t test_svread_ver_za128_s8_1(svint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // 
-svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 0, slice_base, 0); +svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_s16_1( @@ -855,8 +879,8 @@ svint16_t test_svread_ver_za128_s16(svint16_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 15, slice_base, 0); +svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _s16, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_s32( @@ -866,8 +890,8 @@ svint16_t test_svread_ver_za128_s16_1(svint16_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 0, slice_base, 0); +svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_s32_1( @@ -877,8 +901,8 @@ svint32_t test_svread_ver_za128_s32(svint32_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 15, slice_base, 0); +svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _s32, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_s64( @@ -888,8 +912,8 @@ svint32_t test_svread_ver_za128_s32_1(svint32_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 0, slice_base, 0); +svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_s64_1( @@ -899,8 +923,8 @@ svint64_t test_svread_ver_za128_s64(svint64_t zd, svbool_t pg, uint32_t slice_ba // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 15, slice_base, 0); +svint64_t 
test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _s64, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_u8( @@ -909,8 +933,8 @@ svint64_t test_svread_ver_za128_s64_1(svint64_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 0, slice_base, 0); +svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_u8_1( @@ -919,8 +943,8 @@ svuint8_t test_svread_ver_za128_u8(svuint8_t zd, svbool_t pg, uint32_t slice_bas // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv16i8( [[ZD:%.*]], [[PG:%.*]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 15, slice_base, 0); +svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _u8, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_u16( @@ -930,8 +954,8 @@ svuint8_t test_svread_ver_za128_u8_1(svuint8_t zd, svbool_t pg, uint32_t slice_b // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 0, slice_base, 0); +svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_u16_1( @@ -941,8 +965,8 @@ svuint16_t test_svread_ver_za128_u16(svuint16_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8i16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 15, slice_base, 0); +svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _u16, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_u32( @@ -952,8 +976,8 @@ svuint16_t test_svread_ver_za128_u16_1(svuint16_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 0, slice_base, 0); +svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 0, slice_base); 
} // CHECK-C-LABEL: @test_svread_ver_za128_u32_1( @@ -963,8 +987,8 @@ svuint32_t test_svread_ver_za128_u32(svuint32_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4i32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 15, slice_base, 0); +svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _u32, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_u64( @@ -974,8 +998,8 @@ svuint32_t test_svread_ver_za128_u32_1(svuint32_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 0, slice_base, 0); +svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_u64_1( @@ -985,8 +1009,8 @@ svuint64_t test_svread_ver_za128_u64(svuint64_t zd, svbool_t pg, uint32_t slice_ // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2i64( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 15, slice_base, 0); +svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _u64, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_f16( @@ -996,8 +1020,8 @@ svuint64_t test_svread_ver_za128_u64_1(svuint64_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 0, slice_base, 0); +svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_f16_1( @@ -1007,8 +1031,8 @@ svfloat16_t test_svread_ver_za128_f16(svfloat16_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8f16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 15, slice_base, 0); +svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _f16, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_bf16( @@ -1018,8 +1042,8 @@ svfloat16_t test_svread_ver_za128_f16_1(svfloat16_t zd, svbool_t pg, uint32_t 
sl // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 0, slice_base, 0); +svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_bf16_1( @@ -1029,8 +1053,8 @@ svbfloat16_t test_svread_ver_za128_bf16(svbfloat16_t zd, svbool_t pg, uint32_t s // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv8bf16( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 15, slice_base, 0); +svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _bf16, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_f32( @@ -1040,8 +1064,8 @@ svbfloat16_t test_svread_ver_za128_bf16_1(svbfloat16_t zd, svbool_t pg, uint32_t // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 0, slice_base, 0); +svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_f32_1( @@ -1051,8 +1075,8 @@ svfloat32_t test_svread_ver_za128_f32(svfloat32_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv4f32( [[ZD:%.*]], [[TMP0]], i32 15, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 15, slice_base, 0); +svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _f32, _m)(zd, pg, 15, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_f64( @@ -1062,8 +1086,8 @@ svfloat32_t test_svread_ver_za128_f32_1(svfloat32_t zd, svbool_t pg, uint32_t sl // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 0, i32 [[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 0, slice_base, 0); +svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 0, slice_base); } // CHECK-C-LABEL: @test_svread_ver_za128_f64_1( @@ -1073,6 +1097,6 @@ svfloat64_t test_svread_ver_za128_f64(svfloat64_t zd, svbool_t pg, uint32_t slic // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sme.readq.vert.nxv2f64( [[ZD:%.*]], [[TMP0]], i32 15, i32 
[[SLICE_BASE:%.*]]) // CHECK-NEXT: ret [[TMP1]] // -svfloat64_t test_svread_ver_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) { - return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 15, slice_base, 0); +svfloat64_t test_svread_ver_za128_f64_1(svfloat64_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming __arm_in("za") { + return SME_ACLE_FUNC(svread_ver_za128, _f64, _m)(zd, pg, 15, slice_base); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c index 067745f7d4a0592cad054bfaf2be086fbdd09d26..6b0dd67a16fb6f0128f31a893195a994ab196a3c 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c @@ -1,15 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s -#include - -#ifdef DISABLE_SME_ATTRIBUTES -#define ARM_STREAMING_ATTR -#else -#define ARM_STREAMING_ATTR __attribute__((arm_streaming)) -#endif +#include // CHECK-C-LABEL: @test_svst1_hor_za8( // CHECK-CXX-LABEL: @_Z18test_svst1_hor_za8ju10__SVBool_tPv( @@ -19,9 +13,9 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_hor_za8(0, slice_base, 0, pg, ptr); - svst1_hor_za8(0, slice_base, 15, pg, ptr); +void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") { + svst1_hor_za8(0, slice_base, pg, ptr); + svst1_hor_za8(0, slice_base + 15, pg, ptr); } // CHECK-C-LABEL: @test_svst1_hor_za16( @@ -33,9 +27,9 @@ ARM_STREAMING_ATTR void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, voi // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_hor_za16(0, slice_base, 0, pg, ptr); - svst1_hor_za16(1, slice_base, 7, pg, ptr); +void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") { + svst1_hor_za16(0, slice_base, pg, ptr); + svst1_hor_za16(1, slice_base + 7, pg, ptr); } // CHECK-C-LABEL: @test_svst1_hor_za32( @@ -47,9 +41,9 @@ ARM_STREAMING_ATTR void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, vo // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void 
test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_hor_za32(0, slice_base, 0, pg, ptr); - svst1_hor_za32(3, slice_base, 3, pg, ptr); +void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") { + svst1_hor_za32(0, slice_base, pg, ptr); + svst1_hor_za32(3, slice_base + 3, pg, ptr); } // CHECK-C-LABEL: @test_svst1_hor_za64( @@ -61,9 +55,9 @@ ARM_STREAMING_ATTR void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, vo // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_hor_za64(0, slice_base, 0, pg, ptr); - svst1_hor_za64(7, slice_base, 1, pg, ptr); +void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") { + svst1_hor_za64(0, slice_base, pg, ptr); + svst1_hor_za64(7, slice_base + 1, pg, ptr); } // CHECK-C-LABEL: @test_svst1_hor_za128( @@ -74,9 +68,9 @@ ARM_STREAMING_ATTR void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, vo // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_hor_za128(0, slice_base, 0, pg, ptr); - svst1_hor_za128(15, slice_base, 0, pg, ptr); +void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") { + svst1_hor_za128(0, slice_base, pg, ptr); + svst1_hor_za128(15, slice_base, pg, ptr); } // CHECK-C-LABEL: @test_svst1_ver_za8( @@ -87,9 +81,9 @@ ARM_STREAMING_ATTR void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, v // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert( [[PG]], [[PTRTY]] [[PTR]], i32 0, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_ver_za8(0, slice_base, 0, pg, ptr); - svst1_ver_za8(0, slice_base, 15, pg, ptr); +void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") { + svst1_ver_za8(0, slice_base, pg, ptr); + svst1_ver_za8(0, slice_base + 15, pg, ptr); } // CHECK-C-LABEL: @test_svst1_ver_za16( @@ -101,9 +95,9 @@ ARM_STREAMING_ATTR void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, voi // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 1, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_ver_za16(0, slice_base, 0, pg, ptr); - svst1_ver_za16(1, slice_base, 7, pg, ptr); +void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") { + svst1_ver_za16(0, slice_base, pg, ptr); + svst1_ver_za16(1, slice_base + 7, pg, ptr); } // CHECK-C-LABEL: @test_svst1_ver_za32( @@ -115,9 +109,9 @@ ARM_STREAMING_ATTR void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, vo // CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 3, i32 [[TILESLICE1]]) // CHECK-NEXT: ret void // -ARM_STREAMING_ATTR void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) { - svst1_ver_za32(0, slice_base, 0, pg, ptr); - svst1_ver_za32(3, slice_base, 3, pg, ptr); +void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") { + 
+  svst1_ver_za32(0, slice_base, pg, ptr);
+  svst1_ver_za32(3, slice_base + 3, pg, ptr);
 }
 
 // CHECK-C-LABEL: @test_svst1_ver_za64(
@@ -129,9 +123,9 @@ ARM_STREAMING_ATTR void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, vo
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.st1d.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 7, i32 [[TILESLICE1]])
 // CHECK-NEXT:    ret void
 //
-ARM_STREAMING_ATTR void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) {
-  svst1_ver_za64(0, slice_base, 0, pg, ptr);
-  svst1_ver_za64(7, slice_base, 1, pg, ptr);
+void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
+  svst1_ver_za64(0, slice_base, pg, ptr);
+  svst1_ver_za64(7, slice_base + 1, pg, ptr);
 }
 
 // CHECK-C-LABEL: @test_svst1_ver_za128(
@@ -142,7 +136,7 @@ ARM_STREAMING_ATTR void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, vo
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], [[PTRTY]] [[PTR]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-NEXT:    ret void
 //
-ARM_STREAMING_ATTR void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) {
-  svst1_ver_za128(0, slice_base, 0, pg, ptr);
-  svst1_ver_za128(15, slice_base, 0, pg, ptr);
+void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) __arm_streaming __arm_in("za") {
+  svst1_ver_za128(0, slice_base, pg, ptr);
+  svst1_ver_za128(15, slice_base, pg, ptr);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
index 4af93ac38dcca7530352e9b59efc16436fc0830b..be79674ea6840c31a456c5ca7e46342f959ee585 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
@@ -1,15 +1,9 @@
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -DDISABLE_SME_ATTRIBUTES -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s
 
-#include
-
-#ifdef DISABLE_SME_ATTRIBUTES
-#define ARM_STREAMING_ATTR
-#else
-#define ARM_STREAMING_ATTR __attribute__((arm_streaming))
-#endif
+#include
 
 // CHECK-C-LABEL: @test_svst1_hor_vnum_za8(
 // CHECK-CXX-LABEL: @_Z23test_svst1_hor_vnum_za8ju10__SVBool_tPvl(
@@ -22,9 +16,9 @@
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.st1b.horiz( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]])
 // CHECK-NEXT:    ret void
 //
-ARM_STREAMING_ATTR void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
-  svst1_hor_vnum_za8(0, slice_base, 0, pg, ptr, vnum);
-  svst1_hor_vnum_za8(0, slice_base, 15, pg, ptr, vnum);
+void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
+  svst1_hor_vnum_za8(0, slice_base, pg, ptr, vnum);
+  svst1_hor_vnum_za8(0, slice_base + 15, pg, ptr, vnum);
 }
 
 // CHECK-C-LABEL: @test_svst1_hor_vnum_za16(
@@ -39,9 +33,9 @@ ARM_STREAMING_ATTR void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.st1h.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]])
 // CHECK-NEXT:    ret void
 //
-ARM_STREAMING_ATTR void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
-  svst1_hor_vnum_za16(0, slice_base, 0, pg, ptr, vnum);
-  svst1_hor_vnum_za16(1, slice_base, 7, pg, ptr, vnum);
+void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
+  svst1_hor_vnum_za16(0, slice_base, pg, ptr, vnum);
+  svst1_hor_vnum_za16(1, slice_base + 7, pg, ptr, vnum);
 }
 
 // CHECK-C-LABEL: @test_svst1_hor_vnum_za32(
@@ -56,9 +50,9 @@ ARM_STREAMING_ATTR void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t p
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.st1w.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]])
 // CHECK-NEXT:    ret void
 //
-ARM_STREAMING_ATTR void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
-  svst1_hor_vnum_za32(0, slice_base, 0, pg, ptr, vnum);
-  svst1_hor_vnum_za32(3, slice_base, 3, pg, ptr, vnum);
+void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
+  svst1_hor_vnum_za32(0, slice_base, pg, ptr, vnum);
+  svst1_hor_vnum_za32(3, slice_base + 3, pg, ptr, vnum);
 }
 
 // CHECK-C-LABEL: @test_svst1_hor_vnum_za64(
@@ -73,9 +67,9 @@ ARM_STREAMING_ATTR void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t p
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.st1d.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]])
 // CHECK-NEXT:    ret void
 //
-ARM_STREAMING_ATTR void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
-  svst1_hor_vnum_za64(0, slice_base, 0, pg, ptr, vnum);
-  svst1_hor_vnum_za64(7, slice_base, 1, pg, ptr, vnum);
+void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
+  svst1_hor_vnum_za64(0, slice_base, pg, ptr, vnum);
+  svst1_hor_vnum_za64(7, slice_base + 1, pg, ptr, vnum);
 }
 
 // CHECK-C-LABEL: @test_svst1_hor_vnum_za128(
@@ -89,9 +83,9 @@ ARM_STREAMING_ATTR void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t p
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.st1q.horiz( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-NEXT:    ret void
 //
-ARM_STREAMING_ATTR void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
-  svst1_hor_vnum_za128(0, slice_base, 0, pg, ptr, vnum);
-  svst1_hor_vnum_za128(15, slice_base, 0, pg, ptr, vnum);
+void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
+  svst1_hor_vnum_za128(0, slice_base, pg, ptr, vnum);
+  svst1_hor_vnum_za128(15, slice_base, pg, ptr, vnum);
 }
 
 // CHECK-C-LABEL: @test_svst1_ver_vnum_za8(
@@ -105,9 +99,9 @@ ARM_STREAMING_ATTR void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.st1b.vert( [[PG]], [[PTRTY]] [[TMP1]], i32 0, i32 [[TILESLICE2]])
 // CHECK-NEXT:    ret void
 //
-ARM_STREAMING_ATTR void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
-  svst1_ver_vnum_za8(0, slice_base, 0, pg, ptr, vnum);
-  svst1_ver_vnum_za8(0, slice_base, 15, pg, ptr, vnum);
+void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
+  svst1_ver_vnum_za8(0, slice_base, pg, ptr, vnum);
+  svst1_ver_vnum_za8(0, slice_base + 15, pg, ptr, vnum);
 }
 
 // CHECK-C-LABEL: @test_svst1_ver_vnum_za16(
@@ -122,9 +116,9 @@ ARM_STREAMING_ATTR void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.st1h.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 1, i32 [[TILESLICE2]])
 // CHECK-NEXT:    ret void
 //
-ARM_STREAMING_ATTR void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
-  svst1_ver_vnum_za16(0, slice_base, 0, pg, ptr, vnum);
-  svst1_ver_vnum_za16(1, slice_base, 7, pg, ptr, vnum);
+void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
+  svst1_ver_vnum_za16(0, slice_base, pg, ptr, vnum);
+  svst1_ver_vnum_za16(1, slice_base + 7, pg, ptr, vnum);
 }
 
 // CHECK-C-LABEL: @test_svst1_ver_vnum_za32(
@@ -139,9 +133,9 @@ ARM_STREAMING_ATTR void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t p
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.st1w.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 3, i32 [[TILESLICE2]])
 // CHECK-NEXT:    ret void
 //
-ARM_STREAMING_ATTR void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
-  svst1_ver_vnum_za32(0, slice_base, 0, pg, ptr, vnum);
-  svst1_ver_vnum_za32(3, slice_base, 3, pg, ptr, vnum);
+void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
+  svst1_ver_vnum_za32(0, slice_base, pg, ptr, vnum);
+  svst1_ver_vnum_za32(3, slice_base + 3, pg, ptr, vnum);
 }
 
 // CHECK-C-LABEL: @test_svst1_ver_vnum_za64(
@@ -156,9 +150,9 @@ ARM_STREAMING_ATTR void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t p
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.st1d.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 7, i32 [[TILESLICE2]])
 // CHECK-NEXT:    ret void
 //
-ARM_STREAMING_ATTR void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
-  svst1_ver_vnum_za64(0, slice_base, 0, pg, ptr, vnum);
-  svst1_ver_vnum_za64(7, slice_base, 1, pg, ptr, vnum);
+void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
+  svst1_ver_vnum_za64(0, slice_base, pg, ptr, vnum);
+  svst1_ver_vnum_za64(7, slice_base + 1, pg, ptr, vnum);
 }
 
 // CHECK-C-LABEL: @test_svst1_ver_vnum_za128(
@@ -172,7 +166,7 @@ ARM_STREAMING_ATTR void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t p
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.st1q.vert( [[TMP0]], [[PTRTY]] [[TMP2]], i32 15, i32 [[SLICE_BASE]])
 // CHECK-NEXT:    ret void
 //
-ARM_STREAMING_ATTR void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
-  svst1_ver_vnum_za128(0, slice_base, 0, pg, ptr, vnum);
-  svst1_ver_vnum_za128(15, slice_base, 0, pg, ptr, vnum);
+void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) __arm_streaming __arm_in("za") {
+  svst1_ver_vnum_za128(0, slice_base, pg, ptr, vnum);
+  svst1_ver_vnum_za128(15, slice_base, pg, ptr, vnum);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
new file mode 100644
index 0000000000000000000000000000000000000000..dc07efbb8160386fbc3a3dab98e7cca232614943
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
@@ -0,0 +1,72 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include
+
+// CHECK-LABEL: @test_in_streaming_mode(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3:[0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
+// CHECK-NEXT:    [[AND_I:%.*]] = and i64 [[TMP1]], 1
+// CHECK-NEXT:    [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0
+// CHECK-NEXT:    ret i1 [[TOBOOL_I]]
+//
+// CPP-CHECK-LABEL: @_Z22test_in_streaming_modev(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3:[0-9]+]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[AND_I:%.*]] = and i64 [[TMP1]], 1
+// CPP-CHECK-NEXT:    [[TOBOOL_I:%.*]] = icmp ne i64 [[AND_I]], 0
+// CPP-CHECK-NEXT:    ret i1 [[TOBOOL_I]]
+//
+bool test_in_streaming_mode(void) __arm_streaming_compatible {
+  return __arm_in_streaming_mode();
+}
+
+// CHECK-LABEL: @test_za_disable(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @__arm_za_disable() #[[ATTR4:[0-9]+]]
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_za_disablev(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    tail call void @__arm_za_disable() #[[ATTR4:[0-9]+]]
+// CPP-CHECK-NEXT:    ret void
+//
+void test_za_disable(void) __arm_streaming_compatible {
+  __arm_za_disable();
+}
+
+// CHECK-LABEL: @test_has_sme(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3]]
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
+// CHECK-NEXT:    [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0
+// CHECK-NEXT:    ret i1 [[TOBOOL_I]]
+//
+// CPP-CHECK-LABEL: @_Z12test_has_smev(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call aarch64_sme_preservemost_from_x2 { i64, i64 } @__arm_sme_state() #[[ATTR3]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i64, i64 } [[TMP0]], 0
+// CPP-CHECK-NEXT:    [[TOBOOL_I:%.*]] = icmp slt i64 [[TMP1]], 0
+// CPP-CHECK-NEXT:    ret i1 [[TOBOOL_I]]
+//
+bool test_has_sme(void) __arm_streaming_compatible {
+  return __arm_has_sme();
+}
+
+// CHECK-LABEL: @test_svundef_za(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z15test_svundef_zav(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svundef_za(void) __arm_streaming_compatible __arm_out("za") {
+  svundef_za();
+}
+
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
index 12aa298858a18eb27838c98c2ade1a3d1cc36322..caac3ea6596f3e4f13ba95f3d794e91a6606eb3a 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
@@ -1,30 +1,57 @@
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s
 
-#include
+#include
 
 // CHECK-C-LABEL: @test_svstr_vnum_za(
 // CHECK-CXX-LABEL: @_Z18test_svstr_vnum_zajPv(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
 // CHECK-NEXT:    ret void
 //
-void test_svstr_vnum_za(uint32_t slice_base, void *ptr) {
-  svstr_vnum_za(slice_base, 0, ptr);
+void test_svstr_vnum_za(uint32_t slice_base, void *ptr) __arm_in("za") {
+  svstr_vnum_za(slice_base, ptr, 0);
 }
 
 // CHECK-C-LABEL: @test_svstr_vnum_za_1(
 // CHECK-CXX-LABEL: @_Z20test_svstr_vnum_za_1jPv(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
-// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[SVLB]], 15
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]]
-// CHECK-NEXT:    [[TILESLICE:%.*]] = add i32 [[SLICE_BASE:%.*]], 15
-// CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15)
 // CHECK-NEXT:    ret void
 //
-void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) {
-  svstr_vnum_za(slice_base, 15, ptr);
+void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) __arm_in("za") {
+  svstr_vnum_za(slice_base, ptr, 15);
+}
+
+// CHECK-C-LABEL: @test_svstr_za(
+// CHECK-CXX-LABEL: @_Z13test_svstr_zajPv(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0)
+// CHECK-NEXT:    ret void
+//
+void test_svstr_za(uint32_t slice_base, void *ptr) __arm_in("za") {
+  svstr_za(slice_base, ptr);
+}
+
+// CHECK-C-LABEL: @test_svstr_vnum_za_var(
+// CHECK-CXX-LABEL: @_Z22test_svstr_vnum_za_varjPvl(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[VNUM:%.*]] to i32
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]])
+// CHECK-NEXT:    ret void
+//
+void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) __arm_in("za") {
+  svstr_vnum_za(slice_base, ptr, vnum);
+}
+
+// CHECK-C-LABEL: @test_svstr_vnum_za_2(
+// CHECK-CXX-LABEL: @_Z20test_svstr_vnum_za_2jPv(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16)
+// CHECK-NEXT:    ret void
+//
+void test_svstr_vnum_za_2(uint32_t slice_base, void *ptr) __arm_in("za") {
+  svstr_vnum_za(slice_base, ptr, 16);
 }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
index 66e4550294ac7ae09881d88ce2c2a547cf279e59..c76be10518c0dd9ea7254fce55688e7352e15e9b 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
@@ -1,11 +1,11 @@
 // REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
-// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s
 
-#include
+#include
 
 #ifdef SME_OVERLOADED_FORMS
 #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
@@ -19,8 +19,8 @@
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za8_s8_1(
@@ -30,8 +30,9 @@ void test_svwrite_hor_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice_base, 15, pg, zn);
+void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 15;
+  SME_ACLE_FUNC(svwrite_hor_za8, _s8, _m)(0, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za16_s16(
@@ -41,8 +42,8 @@ void test_svwrite_hor_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za16_s16_1(
@@ -53,8 +54,9 @@ void test_svwrite_hor_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(1, slice_base, 7, pg, zn);
+void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 7;
+  SME_ACLE_FUNC(svwrite_hor_za16, _s16, _m)(1, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za32_s32(
@@ -64,8 +66,8 @@ void test_svwrite_hor_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za32_s32_1(
@@ -76,8 +78,9 @@ void test_svwrite_hor_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(3, slice_base, 3, pg, zn);
+void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 3;
+  SME_ACLE_FUNC(svwrite_hor_za32, _s32, _m)(3, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za64_s64(
@@ -87,8 +90,8 @@ void test_svwrite_hor_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za64_s64_1(
@@ -99,8 +102,9 @@ void test_svwrite_hor_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(7, slice_base, 1, pg, zn);
+void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 1;
+  SME_ACLE_FUNC(svwrite_hor_za64, _s64, _m)(7, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za8_u8(
@@ -109,8 +113,8 @@ void test_svwrite_hor_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za8_u8_1(
@@ -120,8 +124,9 @@ void test_svwrite_hor_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice_base, 15, pg, zn);
+void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 15;
+  SME_ACLE_FUNC(svwrite_hor_za8, _u8, _m)(0, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za16_u16(
@@ -131,8 +136,8 @@ void test_svwrite_hor_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za16_u16_1(
@@ -143,8 +148,9 @@ void test_svwrite_hor_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(1, slice_base, 7, pg, zn);
+void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 7;
+  SME_ACLE_FUNC(svwrite_hor_za16, _u16, _m)(1, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za32_u32(
@@ -154,8 +160,8 @@ void test_svwrite_hor_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za32_u32_1(
@@ -166,8 +172,9 @@ void test_svwrite_hor_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(3, slice_base, 3, pg, zn);
+void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 3;
+  SME_ACLE_FUNC(svwrite_hor_za32, _u32, _m)(3, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za64_u64(
@@ -177,8 +184,8 @@ void test_svwrite_hor_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za64_u64_1(
@@ -189,8 +196,9 @@ void test_svwrite_hor_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(7, slice_base, 1, pg, zn);
+void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 1;
+  SME_ACLE_FUNC(svwrite_hor_za64, _u64, _m)(7, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za16_f16(
@@ -200,8 +208,8 @@ void test_svwrite_hor_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za16_f16_1(
@@ -212,8 +220,9 @@ void test_svwrite_hor_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8f16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(1, slice_base, 7, pg, zn);
+void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 7;
+  SME_ACLE_FUNC(svwrite_hor_za16, _f16, _m)(1, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za16_bf16(
@@ -223,8 +232,8 @@ void test_svwrite_hor_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t z
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za16_bf16_1(
@@ -235,8 +244,9 @@ void test_svwrite_hor_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t z
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(1, slice_base, 7, pg, zn);
+void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 7;
+  SME_ACLE_FUNC(svwrite_hor_za16, _bf16, _m)(1, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za32_f32(
@@ -246,8 +256,8 @@ void test_svwrite_hor_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za32_f32_1(
@@ -258,8 +268,9 @@ void test_svwrite_hor_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(3, slice_base, 3, pg, zn);
+void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 3;
+  SME_ACLE_FUNC(svwrite_hor_za32, _f32, _m)(3, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za64_f64(
@@ -269,8 +280,8 @@ void test_svwrite_hor_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t z
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za64_f64_1(
@@ -281,8 +292,9 @@ void test_svwrite_hor_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.horiz.nxv2f64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(7, slice_base, 1, pg, zn);
+void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 1;
+  SME_ACLE_FUNC(svwrite_hor_za64, _f64, _m)(7, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_s8(
@@ -291,8 +303,8 @@ void test_svwrite_hor_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t z
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_s8_1(
@@ -301,8 +313,8 @@ void test_svwrite_hor_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(15, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _s8, _m)(15, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_s16(
@@ -312,8 +324,8 @@ void test_svwrite_hor_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_s16_1(
@@ -323,8 +335,8 @@ void test_svwrite_hor_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(15, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _s16, _m)(15, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_s32(
@@ -334,8 +346,8 @@ void test_svwrite_hor_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_s32_1(
@@ -345,8 +357,8 @@ void test_svwrite_hor_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(15, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _s32, _m)(15, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_s64(
@@ -356,8 +368,8 @@ void test_svwrite_hor_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_s64_1(
@@ -367,8 +379,8 @@ void test_svwrite_hor_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(15, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _s64, _m)(15, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_u8(
@@ -377,8 +389,8 @@ void test_svwrite_hor_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_u8_1(
@@ -387,8 +399,8 @@ void test_svwrite_hor_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(15, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _u8, _m)(15, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_u16(
@@ -398,8 +410,8 @@ void test_svwrite_hor_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_u16_1(
@@ -409,8 +421,8 @@ void test_svwrite_hor_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(15, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _u16, _m)(15, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_u32(
@@ -420,8 +432,8 @@ void test_svwrite_hor_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t z
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_u32_1(
@@ -431,8 +443,8 @@ void test_svwrite_hor_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(15, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _u32, _m)(15, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_u64(
@@ -442,8 +454,8 @@ void test_svwrite_hor_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t z
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_u64_1(
@@ -453,8 +465,8 @@ void test_svwrite_hor_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(15, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _u64, _m)(15, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_f16(
@@ -464,8 +476,8 @@ void test_svwrite_hor_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t z
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_f16_1(
@@ -475,8 +487,8 @@ void test_svwrite_hor_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(15, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _f16, _m)(15, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_bf16(
@@ -486,8 +498,8 @@ void test_svwrite_hor_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_bf16_1(
@@ -497,8 +509,8 @@ void test_svwrite_hor_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(15, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _bf16, _m)(15, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_f32(
@@ -508,8 +520,8 @@ void test_svwrite_hor_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_f32_1(
@@ -519,8 +531,8 @@ void test_svwrite_hor_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(15, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _f32, _m)(15, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_f64(
@@ -530,8 +542,8 @@ void test_svwrite_hor_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_hor_za128_f64_1(
@@ -541,8 +553,8 @@ void test_svwrite_hor_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) {
-  SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(15, slice_base, 0, pg, zn);
+void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_hor_za128, _f64, _m)(15, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za8_s8(
@@ -551,8 +563,8 @@ void test_svwrite_hor_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za8_s8_1(
@@ -562,8 +574,9 @@ void test_svwrite_ver_za8_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice_base, 15, pg, zn);
+void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 15;
+  SME_ACLE_FUNC(svwrite_ver_za8, _s8, _m)(0, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za16_s16(
@@ -573,8 +586,8 @@ void test_svwrite_ver_za8_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za16_s16_1(
@@ -585,8 +598,9 @@ void test_svwrite_ver_za16_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(1, slice_base, 7, pg, zn);
+void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 7;
+  SME_ACLE_FUNC(svwrite_ver_za16, _s16, _m)(1, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za32_s32(
@@ -596,8 +610,8 @@ void test_svwrite_ver_za16_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za32_s32_1(
@@ -608,8 +622,9 @@ void test_svwrite_ver_za32_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(3, slice_base, 3, pg, zn);
+void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 3;
+  SME_ACLE_FUNC(svwrite_ver_za32, _s32, _m)(3, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za64_s64(
@@ -619,8 +634,8 @@ void test_svwrite_ver_za32_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za64_s64_1(
@@ -631,8 +646,9 @@ void test_svwrite_ver_za64_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(7, slice_base, 1, pg, zn);
+void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 1;
+  SME_ACLE_FUNC(svwrite_ver_za64, _s64, _m)(7, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za8_u8(
@@ -641,8 +657,8 @@ void test_svwrite_ver_za64_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za8_u8_1(
@@ -652,8 +668,9 @@ void test_svwrite_ver_za8_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv16i8(i32 0, i32 [[TILESLICE]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice_base, 15, pg, zn);
+void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 15;
+  SME_ACLE_FUNC(svwrite_ver_za8, _u8, _m)(0, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za16_u16(
@@ -663,8 +680,8 @@ void test_svwrite_ver_za8_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(0, slice_base, pg, zn);
}
 
 // CHECK-C-LABEL: @test_svwrite_ver_za16_u16_1(
@@ -675,8 +692,9 @@ void test_svwrite_ver_za16_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8i16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(1, slice_base, 7, pg, zn);
+void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 7;
+  SME_ACLE_FUNC(svwrite_ver_za16, _u16, _m)(1, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za32_u32(
@@ -686,8 +704,8 @@ void test_svwrite_ver_za16_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za32_u32_1(
@@ -698,8 +716,9 @@ void test_svwrite_ver_za32_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4i32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(3, slice_base, 3, pg, zn);
+void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 3;
+  SME_ACLE_FUNC(svwrite_ver_za32, _u32, _m)(3, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za64_u64(
@@ -709,8 +728,8 @@ void test_svwrite_ver_za32_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za64_u64_1(
@@ -721,8 +740,9 @@ void test_svwrite_ver_za64_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2i64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(7, slice_base, 1, pg, zn);
+void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 1;
+  SME_ACLE_FUNC(svwrite_ver_za64, _u64, _m)(7, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za16_f16(
@@ -732,8 +752,8 @@ void test_svwrite_ver_za64_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za16_f16_1(
@@ -744,8 +764,9 @@ void test_svwrite_ver_za16_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8f16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(1, slice_base, 7, pg, zn);
+void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 7;
+  SME_ACLE_FUNC(svwrite_ver_za16, _f16, _m)(1, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za16_bf16(
@@ -755,8 +776,8 @@ void test_svwrite_ver_za16_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t z
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za16_bf16_1(
@@ -767,8 +788,9 @@ void test_svwrite_ver_za16_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t z
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv8bf16(i32 1, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(1, slice_base, 7, pg, zn);
+void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 7;
+  SME_ACLE_FUNC(svwrite_ver_za16, _bf16, _m)(1, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za32_f32(
@@ -778,8 +800,8 @@ void test_svwrite_ver_za16_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za32_f32_1(
@@ -790,8 +812,9 @@ void test_svwrite_ver_za32_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv4f32(i32 3, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(3, slice_base, 3, pg, zn);
+void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 3;
+  SME_ACLE_FUNC(svwrite_ver_za32, _f32, _m)(3, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za64_f64(
@@ -801,8 +824,8 @@ void test_svwrite_ver_za32_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t z
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za64_f64_1(
@@ -813,8 +836,9 @@ void test_svwrite_ver_za64_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.write.vert.nxv2f64(i32 7, i32 [[TILESLICE]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(7, slice_base, 1, pg, zn);
+void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") {
+  uint32_t slice = slice_base + 1;
+  SME_ACLE_FUNC(svwrite_ver_za64, _f64, _m)(7, slice, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za128_s8(
@@ -823,8 +847,8 @@ void test_svwrite_ver_za64_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t z
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za128_s8_1(
@@ -833,8 +857,8 @@ void test_svwrite_ver_za128_s8(uint32_t slice_base, svbool_t pg, svint8_t zn) {
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(15, slice_base, 0, pg, zn);
+void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za128, _s8, _m)(15, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za128_s16(
@@ -844,8 +868,8 @@ void test_svwrite_ver_za128_s8_1(uint32_t slice_base, svbool_t pg, svint8_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za128_s16_1(
@@ -855,8 +879,8 @@ void test_svwrite_ver_za128_s16(uint32_t slice_base, svbool_t pg, svint16_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(15, slice_base, 0, pg, zn);
+void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za128, _s16, _m)(15, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za128_s32(
@@ -866,8 +890,8 @@ void test_svwrite_ver_za128_s16_1(uint32_t slice_base, svbool_t pg, svint16_t zn
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(0, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za128_s32_1(
@@ -877,8 +901,8 @@ void test_svwrite_ver_za128_s32(uint32_t slice_base, svbool_t pg, svint32_t zn)
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(15, slice_base, 0, pg, zn);
+void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za128, _s32, _m)(15, slice_base, pg, zn);
 }
 
 // CHECK-C-LABEL: @test_svwrite_ver_za128_s64(
@@ -888,8 +912,8 @@ void test_svwrite_ver_za128_s32_1(uint32_t slice_base, svbool_t pg, svint32_t zn
 // CHECK-NEXT:    tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]])
 // CHECK-NEXT:    ret void
 //
-void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) {
-  SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(0, slice_base, 0, pg, zn);
+void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(0, slice_base, pg, zn);
 }
 
 //
CHECK-C-LABEL: @test_svwrite_ver_za128_s64_1( @@ -899,8 +923,8 @@ void test_svwrite_ver_za128_s64(uint32_t slice_base, svbool_t pg, svint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(15, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _s64, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_u8( @@ -909,8 +933,8 @@ void test_svwrite_ver_za128_s64_1(uint32_t slice_base, svbool_t pg, svint64_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 0, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(0, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_u8_1( @@ -919,8 +943,8 @@ void test_svwrite_ver_za128_u8(uint32_t slice_base, svbool_t pg, svuint8_t zn) { // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i32 15, i32 [[SLICE_BASE:%.*]], [[PG:%.*]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(15, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _u8, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_u16( @@ -930,8 +954,8 @@ void test_svwrite_ver_za128_u8_1(uint32_t slice_base, svbool_t pg, svuint8_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(0, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_u16_1( @@ -941,8 +965,8 @@ void test_svwrite_ver_za128_u16(uint32_t slice_base, svbool_t pg, svuint16_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(15, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _u16, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_u32( @@ -952,8 +976,8 @@ void test_svwrite_ver_za128_u16_1(uint32_t slice_base, svbool_t pg, svuint16_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u32(uint32_t 
slice_base, svbool_t pg, svuint32_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(0, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_u32_1( @@ -963,8 +987,8 @@ void test_svwrite_ver_za128_u32(uint32_t slice_base, svbool_t pg, svuint32_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(15, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _u32, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_u64( @@ -974,8 +998,8 @@ void test_svwrite_ver_za128_u32_1(uint32_t slice_base, svbool_t pg, svuint32_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(0, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_u64_1( @@ -985,8 +1009,8 @@ void test_svwrite_ver_za128_u64(uint32_t slice_base, svbool_t pg, svuint64_t zn) // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(15, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _u64, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_f16( @@ -996,8 +1020,8 @@ void test_svwrite_ver_za128_u64_1(uint32_t slice_base, svbool_t pg, svuint64_t z // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(0, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_f16_1( @@ -1007,8 +1031,8 @@ void test_svwrite_ver_za128_f16(uint32_t slice_base, svbool_t pg, svfloat16_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(15, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _f16, _m)(15, slice_base, pg, zn); } // 
CHECK-C-LABEL: @test_svwrite_ver_za128_bf16( @@ -1018,8 +1042,8 @@ void test_svwrite_ver_za128_f16_1(uint32_t slice_base, svbool_t pg, svfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(0, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_bf16_1( @@ -1029,8 +1053,8 @@ void test_svwrite_ver_za128_bf16(uint32_t slice_base, svbool_t pg, svbfloat16_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(15, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _bf16, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_f32( @@ -1040,8 +1064,8 @@ void test_svwrite_ver_za128_bf16_1(uint32_t slice_base, svbool_t pg, svbfloat16_ // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(0, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_f32_1( @@ -1051,8 +1075,8 @@ void test_svwrite_ver_za128_f32(uint32_t slice_base, svbool_t pg, svfloat32_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(15, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _f32, _m)(15, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_f64( @@ -1062,8 +1086,8 @@ void test_svwrite_ver_za128_f32_1(uint32_t slice_base, svbool_t pg, svfloat32_t // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 0, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // -void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(0, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(0, slice_base, pg, zn); } // CHECK-C-LABEL: @test_svwrite_ver_za128_f64_1( @@ -1073,6 +1097,6 @@ void test_svwrite_ver_za128_f64(uint32_t slice_base, svbool_t pg, svfloat64_t zn // CHECK-NEXT: tail call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i32 15, i32 [[SLICE_BASE:%.*]], [[TMP0]], [[ZN:%.*]]) // CHECK-NEXT: ret void // 
-void test_svwrite_ver_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) { - SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(15, slice_base, 0, pg, zn); +void test_svwrite_ver_za128_f64_1(uint32_t slice_base, svbool_t pg, svfloat64_t zn) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svwrite_ver_za128, _f64, _m)(15, slice_base, pg, zn); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c index 3ff9f6346c49212009bd89eabd5aa7f1309b8506..72db124537bda63bcec36b235fd7ae01ea158a99 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c @@ -1,9 +1,9 @@ // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s -check-prefixes=CHECK,CHECK-C +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -o /dev/null %s -#include +#include // CHECK-C-LABEL: @test_svzero_mask_za( // CHECK-CXX-LABEL: @_Z19test_svzero_mask_zav( @@ -11,7 +11,7 @@ // CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 0) // CHECK-NEXT: ret void // -void test_svzero_mask_za() { +void test_svzero_mask_za(void) __arm_inout("za") { svzero_mask_za(0); } @@ -21,7 +21,7 @@ void test_svzero_mask_za() { // CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 176) // CHECK-NEXT: ret void // -void test_svzero_mask_za_1() { +void test_svzero_mask_za_1(void) __arm_inout("za") { svzero_mask_za(176); } @@ -31,16 +31,22 @@ void test_svzero_mask_za_1() { // CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) // CHECK-NEXT: ret void // -void test_svzero_mask_za_2() { +void test_svzero_mask_za_2(void) __arm_inout("za") { svzero_mask_za(255); } -// CHECK-C-LABEL: @test_svzero_za( -// CHECK-CXX-LABEL: @_Z14test_svzero_zav( -// CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) -// CHECK-NEXT: ret void +// CHECK-C-LABEL: define dso_local void @test_svzero_za( +// CHECK-C-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-C-NEXT: entry: +// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) +// CHECK-C-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z14test_svzero_zav( +// CHECK-CXX-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.zero(i32 255) +// CHECK-CXX-NEXT: ret void // -void test_svzero_za() { +void test_svzero_za(void) __arm_out("za") { svzero_za(); } diff --git a/clang/test/CodeGen/aarch64-sve-inline-asm-datatypes.c b/clang/test/CodeGen/aarch64-sve-inline-asm-datatypes.c index 5c1e931a727124fd03e438526221bb728d30c3f3..14a29dfac2c7bdcae683af11a9fd0c98baf0b8e2 100644 --- a/clang/test/CodeGen/aarch64-sve-inline-asm-datatypes.c +++ 
b/clang/test/CodeGen/aarch64-sve-inline-asm-datatypes.c @@ -168,6 +168,30 @@ SVBOOL_TEST_UPL(__SVInt32_t, s) ; SVBOOL_TEST_UPL(__SVInt64_t, d) ; // CHECK: call asm sideeffect "fadd $0.d, $1.d, $2.d, $3.d\0A", "=w,@3Upl,w,w"( %in1, %in2, %in3) +#define SVBOOL_TEST_UPH(DT, KIND)\ +__SVBool_t func_bool_uph_##KIND(__SVBool_t in1, DT in2, DT in3)\ +{\ + __SVBool_t out;\ + asm volatile (\ + "fadd %[out]." #KIND ", %[in1]." #KIND ", %[in2]." #KIND ", %[in3]." #KIND "\n"\ + : [out] "=w" (out)\ + : [in1] "Uph" (in1),\ + [in2] "w" (in2),\ + [in3] "w" (in3)\ + :);\ + return out;\ +} + +SVBOOL_TEST_UPH(__SVInt8_t, b) ; +// CHECK: call asm sideeffect "fadd $0.b, $1.b, $2.b, $3.b\0A", "=w,@3Uph,w,w"( %in1, %in2, %in3) +SVBOOL_TEST_UPH(__SVInt16_t, h) ; +// CHECK: call asm sideeffect "fadd $0.h, $1.h, $2.h, $3.h\0A", "=w,@3Uph,w,w"( %in1, %in2, %in3) +SVBOOL_TEST_UPH(__SVInt32_t, s) ; +// CHECK: call asm sideeffect "fadd $0.s, $1.s, $2.s, $3.s\0A", "=w,@3Uph,w,w"( %in1, %in2, %in3) +SVBOOL_TEST_UPH(__SVInt64_t, d) ; +// CHECK: call asm sideeffect "fadd $0.d, $1.d, $2.d, $3.d\0A", "=w,@3Uph,w,w"( %in1, %in2, %in3) + + #define SVFLOAT_TEST(DT,KIND)\ DT func_float_##DT##KIND(DT inout1, DT in2)\ {\ diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2-bfloat.c index 2605d574b830440a267a0f347bd09550a998785d..dec7e1297ebf1dfe4f13eff12c638fafda95d91c 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svcreate2_bf16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[X0:%.*]], i64 0) @@ -27,7 +34,7 @@ // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP0]], [[X1:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP1]] // -svbfloat16x2_t test_svcreate2_bf16(svbfloat16_t x0, svbfloat16_t x1) +svbfloat16x2_t test_svcreate2_bf16(svbfloat16_t x0, svbfloat16_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_bf16,,)(x0, x1); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2.c index 08e4280064a6b4047c3fa443a92b6adaa633a51d..65d4e61184858c1680ef8919c37f7473104da7a9 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2.c +++ 
b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create2.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svcreate2_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[X0:%.*]], i64 0) @@ -27,7 +34,7 @@ // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP0]], [[X1:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP1]] // -svint8x2_t test_svcreate2_s8(svint8_t x0, svint8_t x1) +svint8x2_t test_svcreate2_s8(svint8_t x0, svint8_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_s8,,)(x0, x1); } @@ -44,7 +51,7 @@ svint8x2_t test_svcreate2_s8(svint8_t x0, svint8_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP0]], [[X1:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP1]] // -svint16x2_t test_svcreate2_s16(svint16_t x0, svint16_t x1) +svint16x2_t test_svcreate2_s16(svint16_t x0, svint16_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_s16,,)(x0, x1); } @@ -61,7 +68,7 @@ svint16x2_t test_svcreate2_s16(svint16_t x0, svint16_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP0]], [[X1:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP1]] // -svint32x2_t test_svcreate2_s32(svint32_t x0, svint32_t x1) +svint32x2_t test_svcreate2_s32(svint32_t x0, svint32_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_s32,,)(x0, x1); } @@ -78,7 +85,7 @@ svint32x2_t test_svcreate2_s32(svint32_t x0, svint32_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP0]], [[X1:%.*]], i64 2) // CPP-CHECK-NEXT: ret [[TMP1]] // -svint64x2_t test_svcreate2_s64(svint64_t x0, svint64_t x1) +svint64x2_t test_svcreate2_s64(svint64_t x0, svint64_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_s64,,)(x0, x1); } @@ -95,7 +102,7 @@ svint64x2_t test_svcreate2_s64(svint64_t x0, svint64_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP0]], [[X1:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP1]] // -svuint8x2_t test_svcreate2_u8(svuint8_t x0, svuint8_t x1) +svuint8x2_t test_svcreate2_u8(svuint8_t x0, svuint8_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_u8,,)(x0, x1); } @@ -112,7 +119,7 @@ svuint8x2_t test_svcreate2_u8(svuint8_t x0, svuint8_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP0]], [[X1:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP1]] // -svuint16x2_t test_svcreate2_u16(svuint16_t x0, svuint16_t x1) +svuint16x2_t test_svcreate2_u16(svuint16_t x0, svuint16_t x1) ATTR { return 
SVE_ACLE_FUNC(svcreate2,_u16,,)(x0, x1); } @@ -129,7 +136,7 @@ svuint16x2_t test_svcreate2_u16(svuint16_t x0, svuint16_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP0]], [[X1:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP1]] // -svuint32x2_t test_svcreate2_u32(svuint32_t x0, svuint32_t x1) +svuint32x2_t test_svcreate2_u32(svuint32_t x0, svuint32_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_u32,,)(x0, x1); } @@ -146,7 +153,7 @@ svuint32x2_t test_svcreate2_u32(svuint32_t x0, svuint32_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP0]], [[X1:%.*]], i64 2) // CPP-CHECK-NEXT: ret [[TMP1]] // -svuint64x2_t test_svcreate2_u64(svuint64_t x0, svuint64_t x1) +svuint64x2_t test_svcreate2_u64(svuint64_t x0, svuint64_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_u64,,)(x0, x1); } @@ -163,7 +170,7 @@ svuint64x2_t test_svcreate2_u64(svuint64_t x0, svuint64_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP0]], [[X1:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP1]] // -svfloat16x2_t test_svcreate2_f16(svfloat16_t x0, svfloat16_t x1) +svfloat16x2_t test_svcreate2_f16(svfloat16_t x0, svfloat16_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_f16,,)(x0, x1); } @@ -180,7 +187,7 @@ svfloat16x2_t test_svcreate2_f16(svfloat16_t x0, svfloat16_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP0]], [[X1:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP1]] // -svfloat32x2_t test_svcreate2_f32(svfloat32_t x0, svfloat32_t x1) +svfloat32x2_t test_svcreate2_f32(svfloat32_t x0, svfloat32_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_f32,,)(x0, x1); } @@ -197,7 +204,7 @@ svfloat32x2_t test_svcreate2_f32(svfloat32_t x0, svfloat32_t x1) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP0]], [[X1:%.*]], i64 2) // CPP-CHECK-NEXT: ret [[TMP1]] // -svfloat64x2_t test_svcreate2_f64(svfloat64_t x0, svfloat64_t x1) +svfloat64x2_t test_svcreate2_f64(svfloat64_t x0, svfloat64_t x1) ATTR { return SVE_ACLE_FUNC(svcreate2,_f64,,)(x0, x1); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3-bfloat.c index 760708cfb7a052069e9601a7a78dd1ec82711254..d8c22cfb88a003e406747f607168fe84a2c7e5dd 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else 
+#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svcreate3_bf16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( poison, [[X0:%.*]], i64 0) @@ -29,7 +36,7 @@ // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( [[TMP1]], [[X2:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP2]] // -svbfloat16x3_t test_svcreate3_bf16(svbfloat16_t x0, svbfloat16_t x1, svbfloat16_t x2) +svbfloat16x3_t test_svcreate3_bf16(svbfloat16_t x0, svbfloat16_t x1, svbfloat16_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_bf16,,)(x0, x1, x2); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3.c index 325997ab6f91b7c39c12e1cd099cb1649d567b6b..9f485d6d3a6cfca2b46a5647506cae63d723adbc 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create3.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svcreate3_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( poison, [[X0:%.*]], i64 0) @@ -29,7 +36,7 @@ // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TMP1]], [[X2:%.*]], i64 32) // CPP-CHECK-NEXT: ret [[TMP2]] // -svint8x3_t test_svcreate3_s8(svint8_t x0, svint8_t x1, svint8_t x2) +svint8x3_t test_svcreate3_s8(svint8_t x0, svint8_t x1, svint8_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_s8,,)(x0, x1, x2); } @@ -48,7 +55,7 @@ svint8x3_t test_svcreate3_s8(svint8_t x0, svint8_t x1, svint8_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP1]], [[X2:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP2]] // -svint16x3_t test_svcreate3_s16(svint16_t x0, svint16_t x1, svint16_t x2) +svint16x3_t test_svcreate3_s16(svint16_t x0, svint16_t x1, svint16_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_s16,,)(x0, x1, x2); } @@ -67,7 +74,7 @@ svint16x3_t test_svcreate3_s16(svint16_t x0, svint16_t x1, svint16_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP1]], [[X2:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP2]] // -svint32x3_t test_svcreate3_s32(svint32_t x0, svint32_t x1, svint32_t x2) +svint32x3_t test_svcreate3_s32(svint32_t x0, svint32_t x1, svint32_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_s32,,)(x0, x1, x2); } @@ -86,7 +93,7 @@ svint32x3_t test_svcreate3_s32(svint32_t x0, svint32_t x1, svint32_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.insert.nxv6i64.nxv2i64( [[TMP1]], [[X2:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP2]] // -svint64x3_t test_svcreate3_s64(svint64_t x0, svint64_t x1, svint64_t x2) +svint64x3_t test_svcreate3_s64(svint64_t x0, svint64_t x1, svint64_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_s64,,)(x0, x1, x2); } @@ -105,7 +112,7 @@ svint64x3_t test_svcreate3_s64(svint64_t x0, svint64_t x1, svint64_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TMP1]], [[X2:%.*]], i64 32) // CPP-CHECK-NEXT: ret [[TMP2]] // -svuint8x3_t test_svcreate3_u8(svuint8_t x0, svuint8_t x1, svuint8_t x2) +svuint8x3_t test_svcreate3_u8(svuint8_t x0, svuint8_t x1, svuint8_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_u8,,)(x0, x1, x2); } @@ -124,7 +131,7 @@ svuint8x3_t test_svcreate3_u8(svuint8_t x0, svuint8_t x1, svuint8_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TMP1]], [[X2:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP2]] // -svuint16x3_t test_svcreate3_u16(svuint16_t x0, svuint16_t x1, svuint16_t x2) +svuint16x3_t test_svcreate3_u16(svuint16_t x0, svuint16_t x1, svuint16_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_u16,,)(x0, x1, x2); } @@ -143,7 +150,7 @@ svuint16x3_t test_svcreate3_u16(svuint16_t x0, svuint16_t x1, svuint16_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TMP1]], [[X2:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP2]] // -svuint32x3_t test_svcreate3_u32(svuint32_t x0, svuint32_t x1, svuint32_t x2) +svuint32x3_t test_svcreate3_u32(svuint32_t x0, svuint32_t x1, svuint32_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_u32,,)(x0, x1, x2); } @@ -162,7 +169,7 @@ svuint32x3_t test_svcreate3_u32(svuint32_t x0, svuint32_t x1, svuint32_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TMP1]], [[X2:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP2]] // -svuint64x3_t test_svcreate3_u64(svuint64_t x0, svuint64_t x1, svuint64_t x2) +svuint64x3_t test_svcreate3_u64(svuint64_t x0, svuint64_t x1, svuint64_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_u64,,)(x0, x1, x2); } @@ -181,7 +188,7 @@ svuint64x3_t test_svcreate3_u64(svuint64_t x0, svuint64_t x1, svuint64_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv24f16.nxv8f16( [[TMP1]], [[X2:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP2]] // -svfloat16x3_t test_svcreate3_f16(svfloat16_t x0, svfloat16_t x1, svfloat16_t x2) +svfloat16x3_t test_svcreate3_f16(svfloat16_t x0, svfloat16_t x1, svfloat16_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_f16,,)(x0, x1, x2); } @@ -200,7 +207,7 @@ svfloat16x3_t test_svcreate3_f16(svfloat16_t x0, svfloat16_t x1, svfloat16_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv12f32.nxv4f32( [[TMP1]], [[X2:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP2]] // -svfloat32x3_t test_svcreate3_f32(svfloat32_t x0, svfloat32_t x1, svfloat32_t x2) +svfloat32x3_t test_svcreate3_f32(svfloat32_t x0, svfloat32_t x1, svfloat32_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_f32,,)(x0, x1, x2); } @@ -219,7 +226,7 @@ svfloat32x3_t test_svcreate3_f32(svfloat32_t x0, svfloat32_t x1, svfloat32_t x2) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv6f64.nxv2f64( [[TMP1]], [[X2:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP2]] // -svfloat64x3_t test_svcreate3_f64(svfloat64_t x0, svfloat64_t x1, svfloat64_t x2) +svfloat64x3_t test_svcreate3_f64(svfloat64_t x0, svfloat64_t x1, svfloat64_t x2) ATTR { return SVE_ACLE_FUNC(svcreate3,_f64,,)(x0, x1, x2); } diff --git 
a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4-bfloat.c index e3af56c5d1339fe209707e73b8c376f999d71e8e..ca90a435af8c3a1f03c525536e7c028bf59ca92e 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svcreate4_bf16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[X0:%.*]], i64 0) @@ -31,7 +38,7 @@ // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[X4:%.*]], i64 24) // CPP-CHECK-NEXT: ret [[TMP3]] // -svbfloat16x4_t test_svcreate4_bf16(svbfloat16_t x0, svbfloat16_t x1, svbfloat16_t x2, svbfloat16_t x4) +svbfloat16x4_t test_svcreate4_bf16(svbfloat16_t x0, svbfloat16_t x1, svbfloat16_t x2, svbfloat16_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_bf16,,)(x0, x1, x2, x4); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4.c index 7e0c2a7f00d1d003dbd0bf62333729a906059c14..de8a5b06165735cf9cd64210a854c95002b626f3 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_create4.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svcreate4_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.insert.nxv64i8.nxv16i8( poison, [[X0:%.*]], i64 0) @@ -31,7 +38,7 @@ // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[X4:%.*]], i64 48) // CPP-CHECK-NEXT: ret [[TMP3]] // -svint8x4_t test_svcreate4_s8(svint8_t x0, svint8_t x1, svint8_t x2, svint8_t x4) +svint8x4_t test_svcreate4_s8(svint8_t x0, svint8_t x1, svint8_t x2, svint8_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_s8,,)(x0, x1, x2, x4); } @@ -52,7 +59,7 @@ svint8x4_t test_svcreate4_s8(svint8_t x0, svint8_t x1, svint8_t x2, svint8_t x4) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[X4:%.*]], i64 24) // CPP-CHECK-NEXT: ret [[TMP3]] // -svint16x4_t test_svcreate4_s16(svint16_t x0, svint16_t x1, svint16_t x2, svint16_t x4) +svint16x4_t test_svcreate4_s16(svint16_t x0, svint16_t x1, svint16_t x2, svint16_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_s16,,)(x0, x1, x2, x4); } @@ -73,7 +80,7 @@ svint16x4_t test_svcreate4_s16(svint16_t x0, svint16_t x1, svint16_t x2, svint16 // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[X4:%.*]], i64 12) // CPP-CHECK-NEXT: ret [[TMP3]] // -svint32x4_t test_svcreate4_s32(svint32_t x0, svint32_t x1, svint32_t x2, svint32_t x4) +svint32x4_t test_svcreate4_s32(svint32_t x0, svint32_t x1, svint32_t x2, svint32_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_s32,,)(x0, x1, x2, x4); } @@ -94,7 +101,7 @@ svint32x4_t test_svcreate4_s32(svint32_t x0, svint32_t x1, svint32_t x2, svint32 // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[X4:%.*]], i64 6) // CPP-CHECK-NEXT: ret [[TMP3]] // -svint64x4_t test_svcreate4_s64(svint64_t x0, svint64_t x1, svint64_t x2, svint64_t x4) +svint64x4_t test_svcreate4_s64(svint64_t x0, svint64_t x1, svint64_t x2, svint64_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_s64,,)(x0, x1, x2, x4); } @@ -115,7 +122,7 @@ svint64x4_t test_svcreate4_s64(svint64_t x0, svint64_t x1, svint64_t x2, svint64 // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[X4:%.*]], i64 48) // CPP-CHECK-NEXT: ret [[TMP3]] // -svuint8x4_t test_svcreate4_u8(svuint8_t x0, svuint8_t x1, svuint8_t x2, svuint8_t x4) +svuint8x4_t test_svcreate4_u8(svuint8_t x0, svuint8_t x1, svuint8_t x2, svuint8_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_u8,,)(x0, x1, x2, x4); } @@ -136,7 +143,7 @@ svuint8x4_t test_svcreate4_u8(svuint8_t x0, svuint8_t x1, svuint8_t x2, svuint8_ // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[X4:%.*]], i64 24) // CPP-CHECK-NEXT: ret [[TMP3]] // -svuint16x4_t test_svcreate4_u16(svuint16_t x0, svuint16_t x1, svuint16_t x2, svuint16_t x4) +svuint16x4_t test_svcreate4_u16(svuint16_t x0, svuint16_t x1, svuint16_t x2, svuint16_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_u16,,)(x0, x1, x2, x4); } @@ -157,7 +164,7 @@ svuint16x4_t test_svcreate4_u16(svuint16_t x0, svuint16_t x1, svuint16_t x2, svu // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[X4:%.*]], i64 12) // CPP-CHECK-NEXT: ret [[TMP3]] // -svuint32x4_t test_svcreate4_u32(svuint32_t x0, svuint32_t x1, svuint32_t x2, svuint32_t x4) +svuint32x4_t test_svcreate4_u32(svuint32_t x0, svuint32_t x1, svuint32_t x2, svuint32_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_u32,,)(x0, x1, x2, x4); } @@ -178,7 +185,7 @@ svuint32x4_t test_svcreate4_u32(svuint32_t x0, svuint32_t x1, svuint32_t x2, svu // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[X4:%.*]], i64 6) // CPP-CHECK-NEXT: ret [[TMP3]] // -svuint64x4_t test_svcreate4_u64(svuint64_t x0, svuint64_t x1, svuint64_t x2, svuint64_t x4) +svuint64x4_t test_svcreate4_u64(svuint64_t x0, svuint64_t x1, svuint64_t x2, svuint64_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_u64,,)(x0, x1, x2, x4); } @@ -199,7 +206,7 @@ svuint64x4_t test_svcreate4_u64(svuint64_t x0, svuint64_t x1, svuint64_t x2, svu // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[X4:%.*]], i64 24) // CPP-CHECK-NEXT: ret [[TMP3]] // -svfloat16x4_t test_svcreate4_f16(svfloat16_t x0, svfloat16_t x1, svfloat16_t x2, svfloat16_t x4) +svfloat16x4_t test_svcreate4_f16(svfloat16_t x0, svfloat16_t x1, svfloat16_t x2, svfloat16_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_f16,,)(x0, x1, x2, x4); } @@ -220,7 +227,7 @@ svfloat16x4_t test_svcreate4_f16(svfloat16_t x0, svfloat16_t x1, svfloat16_t x2, // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[X4:%.*]], i64 12) // CPP-CHECK-NEXT: ret [[TMP3]] // -svfloat32x4_t test_svcreate4_f32(svfloat32_t x0, svfloat32_t x1, svfloat32_t x2, svfloat32_t x4) +svfloat32x4_t test_svcreate4_f32(svfloat32_t x0, svfloat32_t x1, svfloat32_t x2, svfloat32_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_f32,,)(x0, x1, x2, x4); } @@ -241,7 +248,7 @@ svfloat32x4_t test_svcreate4_f32(svfloat32_t x0, svfloat32_t x1, svfloat32_t x2, // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[X4:%.*]], i64 6) // CPP-CHECK-NEXT: ret [[TMP3]] // -svfloat64x4_t test_svcreate4_f64(svfloat64_t x0, svfloat64_t x1, svfloat64_t x2, svfloat64_t x4) +svfloat64x4_t test_svcreate4_f64(svfloat64_t x0, svfloat64_t x1, svfloat64_t x2, svfloat64_t x4) ATTR { return SVE_ACLE_FUNC(svcreate4,_f64,,)(x0, x1, x2, x4); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2-bfloat.c index 25dc49a4c2bd3b6487a955988172fdd178724b6f..b9c46b2261f5d13132428daeab2f92cdf1e944db 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svget2_bf16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TUPLE:%.*]], i64 0) @@ -25,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = 
tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget2_bf16_0(svbfloat16x2_t tuple) +svbfloat16_t test_svget2_bf16_0(svbfloat16x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_bf16,,)(tuple, 0); } @@ -40,7 +47,7 @@ svbfloat16_t test_svget2_bf16_0(svbfloat16x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[TUPLE:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget2_bf16_1(svbfloat16x2_t tuple) +svbfloat16_t test_svget2_bf16_1(svbfloat16x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_bf16,,)(tuple, 1); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c index 32a84c91b74d4c6b3b64775d43b95f9e0847c47d..8cd887aaff407d58e4fb957d143597a6d61934ef 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get2.c @@ -5,6 +5,7 @@ // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS @@ -14,6 +15,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svget2_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[TUPLE:%.*]], i64 0) @@ -24,7 +31,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svget2_s8(svint8x2_t tuple) +svint8_t test_svget2_s8(svint8x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_s8,,)(tuple, 0); } @@ -39,7 +46,7 @@ svint8_t test_svget2_s8(svint8x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TUPLE:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint16_t test_svget2_s16(svint16x2_t tuple) +svint16_t test_svget2_s16(svint16x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_s16,,)(tuple, 1); } @@ -54,7 +61,7 @@ svint16_t test_svget2_s16(svint16x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint32_t test_svget2_s32(svint32x2_t tuple) +svint32_t test_svget2_s32(svint32x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_s32,,)(tuple, 0); } @@ -69,7 +76,7 @@ svint32_t test_svget2_s32(svint32x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[TUPLE:%.*]], i64 2) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint64_t test_svget2_s64(svint64x2_t tuple) +svint64_t test_svget2_s64(svint64x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_s64,,)(tuple, 1); } @@ -84,7 +91,7 @@ svint64_t test_svget2_s64(svint64x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv32i8( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svget2_u8(svuint8x2_t tuple) +svuint8_t test_svget2_u8(svuint8x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_u8,,)(tuple, 0); } @@ -99,7 +106,7 @@ svuint8_t test_svget2_u8(svuint8x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[TUPLE:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint16_t test_svget2_u16(svuint16x2_t tuple) +svuint16_t test_svget2_u16(svuint16x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_u16,,)(tuple, 1); } @@ -114,7 +121,7 @@ svuint16_t test_svget2_u16(svuint16x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint32_t test_svget2_u32(svuint32x2_t tuple) +svuint32_t test_svget2_u32(svuint32x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_u32,,)(tuple, 0); } @@ -129,7 +136,7 @@ svuint32_t test_svget2_u32(svuint32x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[TUPLE:%.*]], i64 2) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint64_t test_svget2_u64(svuint64x2_t tuple) +svuint64_t test_svget2_u64(svuint64x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_u64,,)(tuple, 1); } @@ -144,7 +151,7 @@ svuint64_t test_svget2_u64(svuint64x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat16_t test_svget2_f16(svfloat16x2_t tuple) +svfloat16_t test_svget2_f16(svfloat16x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_f16,,)(tuple, 0); } @@ -159,7 +166,7 @@ svfloat16_t test_svget2_f16(svfloat16x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[TUPLE:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat32_t test_svget2_f32(svfloat32x2_t tuple) +svfloat32_t test_svget2_f32(svfloat32x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_f32,,)(tuple, 1); } @@ -174,7 +181,7 @@ svfloat32_t test_svget2_f32(svfloat32x2_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat64_t test_svget2_f64(svfloat64x2_t tuple) +svfloat64_t test_svget2_f64(svfloat64x2_t tuple) ATTR { return SVE_ACLE_FUNC(svget2,_f64,,)(tuple, 0); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3-bfloat.c index 47ce6bd19244e677c0f6b3d47ba29252c7f15257..7a991bc7431d036705cfd0d5298aaac4a7a99cb2 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple 
aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svget3_bf16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[TUPLE:%.*]], i64 0) @@ -25,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget3_bf16_0(svbfloat16x3_t tuple) +svbfloat16_t test_svget3_bf16_0(svbfloat16x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_bf16,,)(tuple, 0); } @@ -40,7 +47,7 @@ svbfloat16_t test_svget3_bf16_0(svbfloat16x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[TUPLE:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget3_bf16_1(svbfloat16x3_t tuple) +svbfloat16_t test_svget3_bf16_1(svbfloat16x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_bf16,,)(tuple, 1); } @@ -55,7 +62,7 @@ svbfloat16_t test_svget3_bf16_1(svbfloat16x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[TUPLE:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget3_bf16_2(svbfloat16x3_t tuple) +svbfloat16_t test_svget3_bf16_2(svbfloat16x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_bf16,,)(tuple, 2); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c index 54847152dee7c5161a01c44622e3c572128221dd..de7c3c303ffcb83b8439964a42b2aad85ae8df23 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get3.c @@ -5,6 +5,8 @@ // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + #include #ifdef SVE_OVERLOADED_FORMS @@ -14,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svget3_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[TUPLE:%.*]], i64 0) @@ -24,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svget3_s8(svint8x3_t tuple) +svint8_t test_svget3_s8(svint8x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_s8,,)(tuple, 0); } @@ -39,7 +47,7 @@ svint8_t test_svget3_s8(svint8x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[TUPLE:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint16_t test_svget3_s16(svint16x3_t tuple) 
+svint16_t test_svget3_s16(svint16x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_s16,,)(tuple, 2); } @@ -54,7 +62,7 @@ svint16_t test_svget3_s16(svint16x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[TUPLE:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint32_t test_svget3_s32(svint32x3_t tuple) +svint32_t test_svget3_s32(svint32x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_s32,,)(tuple, 1); } @@ -69,7 +77,7 @@ svint32_t test_svget3_s32(svint32x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint64_t test_svget3_s64(svint64x3_t tuple) +svint64_t test_svget3_s64(svint64x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_s64,,)(tuple, 0); } @@ -84,7 +92,7 @@ svint64_t test_svget3_s64(svint64x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[TUPLE:%.*]], i64 32) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svget3_u8(svuint8x3_t tuple) +svuint8_t test_svget3_u8(svuint8x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_u8,,)(tuple, 2); } @@ -99,7 +107,7 @@ svuint8_t test_svget3_u8(svuint8x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[TUPLE:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint16_t test_svget3_u16(svuint16x3_t tuple) +svuint16_t test_svget3_u16(svuint16x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_u16,,)(tuple, 1); } @@ -114,7 +122,7 @@ svuint16_t test_svget3_u16(svuint16x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint32_t test_svget3_u32(svuint32x3_t tuple) +svuint32_t test_svget3_u32(svuint32x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_u32,,)(tuple, 0); } @@ -129,7 +137,7 @@ svuint32_t test_svget3_u32(svuint32x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[TUPLE:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint64_t test_svget3_u64(svuint64x3_t tuple) +svuint64_t test_svget3_u64(svuint64x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_u64,,)(tuple, 2); } @@ -144,7 +152,7 @@ svuint64_t test_svget3_u64(svuint64x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[TUPLE:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat16_t test_svget3_f16(svfloat16x3_t tuple) +svfloat16_t test_svget3_f16(svfloat16x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_f16,,)(tuple, 1); } @@ -159,7 +167,7 @@ svfloat16_t test_svget3_f16(svfloat16x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat32_t test_svget3_f32(svfloat32x3_t tuple) +svfloat32_t test_svget3_f32(svfloat32x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_f32,,)(tuple, 0); } @@ -174,7 +182,7 @@ svfloat32_t test_svget3_f32(svfloat32x3_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[TUPLE:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat64_t test_svget3_f64(svfloat64x3_t tuple) +svfloat64_t test_svget3_f64(svfloat64x3_t tuple) ATTR { return SVE_ACLE_FUNC(svget3,_f64,,)(tuple, 2); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4-bfloat.c index 454b3bf38bd318ccbe85087c9c16f7803a8478fd..3a5e282bfdfa36c4b6a89fe65097ffcc9175daef 
100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svget4_bf16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[TUPLE:%.*]], i64 0) @@ -25,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget4_bf16_0(svbfloat16x4_t tuple) +svbfloat16_t test_svget4_bf16_0(svbfloat16x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_bf16,,)(tuple, 0); } @@ -40,7 +47,7 @@ svbfloat16_t test_svget4_bf16_0(svbfloat16x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[TUPLE:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget4_bf16_1(svbfloat16x4_t tuple) +svbfloat16_t test_svget4_bf16_1(svbfloat16x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_bf16,,)(tuple, 1); } @@ -55,7 +62,7 @@ svbfloat16_t test_svget4_bf16_1(svbfloat16x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[TUPLE:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget4_bf16_2(svbfloat16x4_t tuple) +svbfloat16_t test_svget4_bf16_2(svbfloat16x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_bf16,,)(tuple, 2); } @@ -70,7 +77,7 @@ svbfloat16_t test_svget4_bf16_2(svbfloat16x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[TUPLE:%.*]], i64 24) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svget4_bf16_3(svbfloat16x4_t tuple) +svbfloat16_t test_svget4_bf16_3(svbfloat16x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_bf16,,)(tuple, 3); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c index 13f8c2a2906ef786585076163727b7b730bc01c9..9b4f9e5332a57228359b44009b9a16189ba9523b 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_get4.c @@ -5,6 +5,7 @@ // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone 
-Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS @@ -14,6 +15,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // NOTE: For these tests clang converts the struct parameter into // several parameters, one for each member of the original struct. // CHECK-LABEL: @test_svget4_s8( @@ -26,7 +33,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svget4_s8(svint8x4_t tuple) +svint8_t test_svget4_s8(svint8x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_s8,,)(tuple, 0); } @@ -41,7 +48,7 @@ svint8_t test_svget4_s8(svint8x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[TUPLE:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint16_t test_svget4_s16(svint16x4_t tuple) +svint16_t test_svget4_s16(svint16x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_s16,,)(tuple, 2); } @@ -56,7 +63,7 @@ svint16_t test_svget4_s16(svint16x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[TUPLE:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint32_t test_svget4_s32(svint32x4_t tuple) +svint32_t test_svget4_s32(svint32x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_s32,,)(tuple, 2); } @@ -71,7 +78,7 @@ svint32_t test_svget4_s32(svint32x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[TUPLE:%.*]], i64 6) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint64_t test_svget4_s64(svint64x4_t tuple) +svint64_t test_svget4_s64(svint64x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_s64,,)(tuple, 3); } @@ -86,7 +93,7 @@ svint64_t test_svget4_s64(svint64x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[TUPLE:%.*]], i64 32) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svget4_u8(svuint8x4_t tuple) +svuint8_t test_svget4_u8(svuint8x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_u8,,)(tuple, 2); } @@ -101,7 +108,7 @@ svuint8_t test_svget4_u8(svuint8x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[TUPLE:%.*]], i64 24) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint16_t test_svget4_u16(svuint16x4_t tuple) +svuint16_t test_svget4_u16(svuint16x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_u16,,)(tuple, 3); } @@ -116,7 +123,7 @@ svuint16_t test_svget4_u16(svuint16x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[TUPLE:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint32_t test_svget4_u32(svuint32x4_t tuple) +svuint32_t test_svget4_u32(svuint32x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_u32,,)(tuple, 0); } @@ -131,7 +138,7 @@ svuint32_t test_svget4_u32(svuint32x4_t tuple) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[TUPLE:%.*]], i64 6) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint64_t test_svget4_u64(svuint64x4_t tuple) +svuint64_t test_svget4_u64(svuint64x4_t tuple) ATTR { return SVE_ACLE_FUNC(svget4,_u64,,)(tuple, 3); } @@ -146,7 +153,7 @@ 
svuint64_t test_svget4_u64(svuint64x4_t tuple)
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[TUPLE:%.*]], i64 16)
 // CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
 //
-svfloat16_t test_svget4_f16(svfloat16x4_t tuple)
+svfloat16_t test_svget4_f16(svfloat16x4_t tuple) ATTR
 {
   return SVE_ACLE_FUNC(svget4,_f16,,)(tuple, 2);
 }
@@ -161,7 +168,7 @@ svfloat16_t test_svget4_f16(svfloat16x4_t tuple)
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[TUPLE:%.*]], i64 0)
 // CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
 //
-svfloat32_t test_svget4_f32(svfloat32x4_t tuple)
+svfloat32_t test_svget4_f32(svfloat32x4_t tuple) ATTR
 {
   return SVE_ACLE_FUNC(svget4,_f32,,)(tuple, 0);
 }
@@ -176,7 +183,7 @@ svfloat32_t test_svget4_f32(svfloat32x4_t tuple)
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[TUPLE:%.*]], i64 4)
 // CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
 //
-svfloat64_t test_svget4_f64(svfloat64x4_t tuple)
+svfloat64_t test_svget4_f64(svfloat64x4_t tuple) ATTR
 {
   return SVE_ACLE_FUNC(svget4,_f64,,)(tuple, 2);
 }
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c
new file mode 100644
index 0000000000000000000000000000000000000000..b430a0ba3cc75a3f3dc8392c730df33b97fdda88
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c
@@ -0,0 +1,35 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -O1 -Werror -Wall -o /dev/null %s
+
+// Note: We need to run this test with '-O1' because oddly enough the svreinterpret is always inlined at -O0.
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+// Test that svreinterpret is inlined (because it should be streaming-compatible)
+__attribute__((target("sme")))
+// CHECK-LABEL: @test_svreinterpret_s16_s8_from_streaming_mode(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x i16>
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z45test_svreinterpret_s16_s8_from_streaming_modeu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x i16>
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svint16_t test_svreinterpret_s16_s8_from_streaming_mode(svint8_t op) __arm_streaming {
+  return SVE_ACLE_FUNC(svreinterpret_s16,_s8,,)(op);
+}
+
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2-bfloat.c
index f6217977d26ab916490e32553dd3c9059987b319..a92e5aedb59fd1fc904257f36e8e8478ee9f60ff 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2-bfloat.c
@@ -3,6 +3,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // REQUIRES: aarch64-registered-target
@@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
 #endif
+#ifndef TEST_SME
+#define ATTR
+#else
+#define ATTR __arm_streaming
+#endif
+
 // CHECK-LABEL: @test_svset2_bf16_0(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TUPLE:%.*]], <vscale x 8 x bfloat> [[X:%.*]], i64 0)
@@ -25,7 +32,7 @@
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TUPLE:%.*]], <vscale x 8 x bfloat> [[X:%.*]], i64 0)
 // CPP-CHECK-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
 //
-svbfloat16x2_t test_svset2_bf16_0(svbfloat16x2_t tuple, svbfloat16_t x)
+svbfloat16x2_t test_svset2_bf16_0(svbfloat16x2_t tuple, svbfloat16_t x) ATTR
 {
   return SVE_ACLE_FUNC(svset2,_bf16,,)(tuple, 0, x);
 }
@@ -40,7 +47,7 @@ svbfloat16x2_t test_svset2_bf16_0(svbfloat16x2_t tuple, svbfloat16_t x)
 // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TUPLE:%.*]], <vscale x 8 x bfloat> [[X:%.*]], i64 8)
 // CPP-CHECK-NEXT: ret <vscale x 16 x bfloat> [[TMP0]]
 //
-svbfloat16x2_t test_svset2_bf16_1(svbfloat16x2_t tuple, svbfloat16_t x)
+svbfloat16x2_t test_svset2_bf16_1(svbfloat16x2_t tuple, svbfloat16_t x) ATTR
 {
   return SVE_ACLE_FUNC(svset2,_bf16,,)(tuple, 1, x);
 }
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2.c
index 9ae3011549207cedb993108aaf5f624007eae085..b2bf4ad08aa9ebeba44233b588c566ffa20168fa 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2.c
+++ 
b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set2.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svset2_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TUPLE:%.*]], [[X:%.*]], i64 16) @@ -25,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TUPLE:%.*]], [[X:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint8x2_t test_svset2_s8(svint8x2_t tuple, svint8_t x) +svint8x2_t test_svset2_s8(svint8x2_t tuple, svint8_t x) ATTR { return SVE_ACLE_FUNC(svset2,_s8,,)(tuple, 1, x); } @@ -40,7 +47,7 @@ svint8x2_t test_svset2_s8(svint8x2_t tuple, svint8_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint16x2_t test_svset2_s16(svint16x2_t tuple, svint16_t x) +svint16x2_t test_svset2_s16(svint16x2_t tuple, svint16_t x) ATTR { return SVE_ACLE_FUNC(svset2,_s16,,)(tuple, 0, x); } @@ -55,7 +62,7 @@ svint16x2_t test_svset2_s16(svint16x2_t tuple, svint16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint32x2_t test_svset2_s32(svint32x2_t tuple, svint32_t x) +svint32x2_t test_svset2_s32(svint32x2_t tuple, svint32_t x) ATTR { return SVE_ACLE_FUNC(svset2,_s32,,)(tuple, 1, x); } @@ -70,7 +77,7 @@ svint32x2_t test_svset2_s32(svint32x2_t tuple, svint32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint64x2_t test_svset2_s64(svint64x2_t tuple, svint64_t x) +svint64x2_t test_svset2_s64(svint64x2_t tuple, svint64_t x) ATTR { return SVE_ACLE_FUNC(svset2,_s64,,)(tuple, 0, x); } @@ -85,7 +92,7 @@ svint64x2_t test_svset2_s64(svint64x2_t tuple, svint64_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TUPLE:%.*]], [[X:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint8x2_t test_svset2_u8(svuint8x2_t tuple, svuint8_t x) +svuint8x2_t test_svset2_u8(svuint8x2_t tuple, svuint8_t x) ATTR { return SVE_ACLE_FUNC(svset2,_u8,,)(tuple, 1, x); } @@ -100,7 +107,7 @@ svuint8x2_t test_svset2_u8(svuint8x2_t tuple, svuint8_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint16x2_t test_svset2_u16(svuint16x2_t tuple, svuint16_t x) +svuint16x2_t test_svset2_u16(svuint16x2_t tuple, 
svuint16_t x) ATTR { return SVE_ACLE_FUNC(svset2,_u16,,)(tuple, 0, x); } @@ -115,7 +122,7 @@ svuint16x2_t test_svset2_u16(svuint16x2_t tuple, svuint16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint32x2_t test_svset2_u32(svuint32x2_t tuple, svuint32_t x) +svuint32x2_t test_svset2_u32(svuint32x2_t tuple, svuint32_t x) ATTR { return SVE_ACLE_FUNC(svset2,_u32,,)(tuple, 1, x); } @@ -130,7 +137,7 @@ svuint32x2_t test_svset2_u32(svuint32x2_t tuple, svuint32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint64x2_t test_svset2_u64(svuint64x2_t tuple, svuint64_t x) +svuint64x2_t test_svset2_u64(svuint64x2_t tuple, svuint64_t x) ATTR { return SVE_ACLE_FUNC(svset2,_u64,,)(tuple, 0, x); } @@ -145,7 +152,7 @@ svuint64x2_t test_svset2_u64(svuint64x2_t tuple, svuint64_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TUPLE:%.*]], [[X:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat16x2_t test_svset2_f16(svfloat16x2_t tuple, svfloat16_t x) +svfloat16x2_t test_svset2_f16(svfloat16x2_t tuple, svfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset2,_f16,,)(tuple, 1, x); } @@ -160,7 +167,7 @@ svfloat16x2_t test_svset2_f16(svfloat16x2_t tuple, svfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat32x2_t test_svset2_f32(svfloat32x2_t tuple, svfloat32_t x) +svfloat32x2_t test_svset2_f32(svfloat32x2_t tuple, svfloat32_t x) ATTR { return SVE_ACLE_FUNC(svset2,_f32,,)(tuple, 0, x); } @@ -175,7 +182,7 @@ svfloat32x2_t test_svset2_f32(svfloat32x2_t tuple, svfloat32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TUPLE:%.*]], [[X:%.*]], i64 2) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat64x2_t test_svset2_f64(svfloat64x2_t tuple, svfloat64_t x) +svfloat64x2_t test_svset2_f64(svfloat64x2_t tuple, svfloat64_t x) ATTR { return SVE_ACLE_FUNC(svset2,_f64,,)(tuple, 1, x); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3-bfloat.c index aeef3ab59ee0dd4b96a861094394d8957e17dfc4..3bc8698ef89b7dce2b7394835428d440adccf17a 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,11 @@ #define 
SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif // CHECK-LABEL: @test_svset3_bf16_0( // CHECK-NEXT: entry: @@ -26,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16x3_t test_svset3_bf16_0(svbfloat16x3_t tuple, svbfloat16_t x) +svbfloat16x3_t test_svset3_bf16_0(svbfloat16x3_t tuple, svbfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset3,_bf16,,)(tuple, 0, x); } @@ -41,7 +47,7 @@ svbfloat16x3_t test_svset3_bf16_0(svbfloat16x3_t tuple, svbfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16x3_t test_svset3_bf16_1(svbfloat16x3_t tuple, svbfloat16_t x) +svbfloat16x3_t test_svset3_bf16_1(svbfloat16x3_t tuple, svbfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset3,_bf16,,)(tuple, 1, x); } @@ -56,7 +62,7 @@ svbfloat16x3_t test_svset3_bf16_1(svbfloat16x3_t tuple, svbfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv24bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16x3_t test_svset3_bf16_2(svbfloat16x3_t tuple, svbfloat16_t x) +svbfloat16x3_t test_svset3_bf16_2(svbfloat16x3_t tuple, svbfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset3,_bf16,,)(tuple, 2, x); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3.c index 1b9191cc8a330880d2b18f1df9e39059ceef432f..9d10e6afca93532172914b0fb8c647e4b3dd9713 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set3.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,11 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif // NOTE: For these tests clang converts the struct parameter into // several parameters, one for each member of the original struct. 
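The TEST_SME/ATTR scaffolding that each of these test files gains reduces to the sketch below. This is an illustration only: get_first is a hypothetical reduced test, while the macros, the extra -DTEST_SME RUN line, and the __arm_streaming attribute are taken from the hunks themselves. Building the same source once without and once with -DTEST_SME exercises every intrinsic in both a normal and a streaming function; the streaming configuration only needs to compile cleanly (-o /dev/null), so no new FileCheck expectations are required.

// Minimal sketch of the TEST_SME/ATTR pattern (assumes an SME-capable
// toolchain; get_first is a made-up reduced test, not part of this patch).
#include <arm_sve.h>

#ifndef TEST_SME
#define ATTR                    // plain SVE build: no extra attribute
#else
#define ATTR __arm_streaming    // SME build: test functions become streaming
#endif

// With -DTEST_SME this declares a streaming function, so the compiler must
// accept the svget3 builtin under '-target-feature +sme' for the RUN line
// to succeed.
svint8_t get_first(svint8x3_t tuple) ATTR {
  return svget3_s8(tuple, 0); // extract vector 0 from the x3 tuple
}
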
@@ -28,7 +34,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TUPLE:%.*]], [[X:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint8x3_t test_svset3_s8(svint8x3_t tuple, svint8_t x) +svint8x3_t test_svset3_s8(svint8x3_t tuple, svint8_t x) ATTR { return SVE_ACLE_FUNC(svset3,_s8,,)(tuple, 1, x); } @@ -43,7 +49,7 @@ svint8x3_t test_svset3_s8(svint8x3_t tuple, svint8_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TUPLE:%.*]], [[X:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint16x3_t test_svset3_s16(svint16x3_t tuple, svint16_t x) +svint16x3_t test_svset3_s16(svint16x3_t tuple, svint16_t x) ATTR { return SVE_ACLE_FUNC(svset3,_s16,,)(tuple, 2, x); } @@ -58,7 +64,7 @@ svint16x3_t test_svset3_s16(svint16x3_t tuple, svint16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint32x3_t test_svset3_s32(svint32x3_t tuple, svint32_t x) +svint32x3_t test_svset3_s32(svint32x3_t tuple, svint32_t x) ATTR { return SVE_ACLE_FUNC(svset3,_s32,,)(tuple, 0, x); } @@ -73,7 +79,7 @@ svint32x3_t test_svset3_s32(svint32x3_t tuple, svint32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TUPLE:%.*]], [[X:%.*]], i64 2) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint64x3_t test_svset3_s64(svint64x3_t tuple, svint64_t x) +svint64x3_t test_svset3_s64(svint64x3_t tuple, svint64_t x) ATTR { return SVE_ACLE_FUNC(svset3,_s64,,)(tuple, 1, x); } @@ -88,7 +94,7 @@ svint64x3_t test_svset3_s64(svint64x3_t tuple, svint64_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv48i8.nxv16i8( [[TUPLE:%.*]], [[X:%.*]], i64 32) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint8x3_t test_svset3_u8(svuint8x3_t tuple, svuint8_t x) +svuint8x3_t test_svset3_u8(svuint8x3_t tuple, svuint8_t x) ATTR { return SVE_ACLE_FUNC(svset3,_u8,,)(tuple, 2, x); } @@ -103,7 +109,7 @@ svuint8x3_t test_svset3_u8(svuint8x3_t tuple, svuint8_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv24i16.nxv8i16( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint16x3_t test_svset3_u16(svuint16x3_t tuple, svuint16_t x) +svuint16x3_t test_svset3_u16(svuint16x3_t tuple, svuint16_t x) ATTR { return SVE_ACLE_FUNC(svset3,_u16,,)(tuple, 0, x); } @@ -118,7 +124,7 @@ svuint16x3_t test_svset3_u16(svuint16x3_t tuple, svuint16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv12i32.nxv4i32( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint32x3_t test_svset3_u32(svuint32x3_t tuple, svuint32_t x) +svuint32x3_t test_svset3_u32(svuint32x3_t tuple, svuint32_t x) ATTR { return SVE_ACLE_FUNC(svset3,_u32,,)(tuple, 1, x); } @@ -133,7 +139,7 @@ svuint32x3_t test_svset3_u32(svuint32x3_t tuple, svuint32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv6i64.nxv2i64( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint64x3_t test_svset3_u64(svuint64x3_t tuple, svuint64_t x) +svuint64x3_t test_svset3_u64(svuint64x3_t tuple, svuint64_t x) ATTR { return SVE_ACLE_FUNC(svset3,_u64,,)(tuple, 2, x); } @@ -148,7 +154,7 @@ svuint64x3_t test_svset3_u64(svuint64x3_t tuple, svuint64_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv24f16.nxv8f16( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat16x3_t test_svset3_f16(svfloat16x3_t tuple, svfloat16_t x) +svfloat16x3_t 
test_svset3_f16(svfloat16x3_t tuple, svfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset3,_f16,,)(tuple, 0, x); } @@ -163,7 +169,7 @@ svfloat16x3_t test_svset3_f16(svfloat16x3_t tuple, svfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv12f32.nxv4f32( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat32x3_t test_svset3_f32(svfloat32x3_t tuple, svfloat32_t x) +svfloat32x3_t test_svset3_f32(svfloat32x3_t tuple, svfloat32_t x) ATTR { return SVE_ACLE_FUNC(svset3,_f32,,)(tuple, 1, x); } @@ -178,7 +184,7 @@ svfloat32x3_t test_svset3_f32(svfloat32x3_t tuple, svfloat32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv6f64.nxv2f64( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat64x3_t test_svset3_f64(svfloat64x3_t tuple, svfloat64_t x) +svfloat64x3_t test_svset3_f64(svfloat64x3_t tuple, svfloat64_t x) ATTR { return SVE_ACLE_FUNC(svset3,_f64,,)(tuple, 2, x); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4-bfloat.c index 013f591e8d72fc30e7c8a4d0091dbc24729639e9..31ca805dafc741bc1f2e0b584907603310d783c1 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4-bfloat.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,11 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif // CHECK-LABEL: @test_svset4_bf16_0( // CHECK-NEXT: entry: @@ -26,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 0) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16x4_t test_svset4_bf16_0(svbfloat16x4_t tuple, svbfloat16_t x) +svbfloat16x4_t test_svset4_bf16_0(svbfloat16x4_t tuple, svbfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset4,_bf16,,)(tuple, 0, x); } @@ -41,7 +47,7 @@ svbfloat16x4_t test_svset4_bf16_0(svbfloat16x4_t tuple, svbfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16x4_t test_svset4_bf16_1(svbfloat16x4_t tuple, svbfloat16_t x) +svbfloat16x4_t test_svset4_bf16_1(svbfloat16x4_t tuple, svbfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset4,_bf16,,)(tuple, 1, x); } @@ -56,7 +62,7 @@ svbfloat16x4_t test_svset4_bf16_1(svbfloat16x4_t tuple, svbfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] 
// -svbfloat16x4_t test_svset4_bf16_2(svbfloat16x4_t tuple, svbfloat16_t x) +svbfloat16x4_t test_svset4_bf16_2(svbfloat16x4_t tuple, svbfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset4,_bf16,,)(tuple, 2, x); } @@ -71,7 +77,7 @@ svbfloat16x4_t test_svset4_bf16_2(svbfloat16x4_t tuple, svbfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TUPLE:%.*]], [[X:%.*]], i64 24) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16x4_t test_svset4_bf16_3(svbfloat16x4_t tuple, svbfloat16_t x) +svbfloat16x4_t test_svset4_bf16_3(svbfloat16x4_t tuple, svbfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset4,_bf16,,)(tuple, 3, x); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4.c index e4ece8c2a65ff7f5b5ab67e3fb10709eb0a8fa15..ce35bfb83c88d941b45f263f80482886c20e19a9 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_set4.c @@ -3,6 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DTEST_SME -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s // REQUIRES: aarch64-registered-target @@ -15,6 +16,11 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif // CHECK-LABEL: @test_svset4_s8( // CHECK-NEXT: entry: @@ -26,7 +32,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TUPLE:%.*]], [[X:%.*]], i64 16) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint8x4_t test_svset4_s8(svint8x4_t tuple, svint8_t x) +svint8x4_t test_svset4_s8(svint8x4_t tuple, svint8_t x) ATTR { return SVE_ACLE_FUNC(svset4,_s8,,)(tuple, 1, x); } @@ -41,7 +47,7 @@ svint8x4_t test_svset4_s8(svint8x4_t tuple, svint8_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TUPLE:%.*]], [[X:%.*]], i64 24) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint16x4_t test_svset4_s16(svint16x4_t tuple, svint16_t x) +svint16x4_t test_svset4_s16(svint16x4_t tuple, svint16_t x) ATTR { return SVE_ACLE_FUNC(svset4,_s16,,)(tuple, 3, x); } @@ -56,7 +62,7 @@ svint16x4_t test_svset4_s16(svint16x4_t tuple, svint16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint32x4_t test_svset4_s32(svint32x4_t tuple, svint32_t x) +svint32x4_t test_svset4_s32(svint32x4_t tuple, svint32_t x) ATTR { return SVE_ACLE_FUNC(svset4,_s32,,)(tuple, 1, x); } @@ -71,7 +77,7 @@ svint32x4_t test_svset4_s32(svint32x4_t tuple, svint32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TUPLE:%.*]], [[X:%.*]], i64 2) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint64x4_t test_svset4_s64(svint64x4_t tuple, svint64_t x) +svint64x4_t 
test_svset4_s64(svint64x4_t tuple, svint64_t x) ATTR { return SVE_ACLE_FUNC(svset4,_s64,,)(tuple, 1, x); } @@ -86,7 +92,7 @@ svint64x4_t test_svset4_s64(svint64x4_t tuple, svint64_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TUPLE:%.*]], [[X:%.*]], i64 48) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint8x4_t test_svset4_u8(svuint8x4_t tuple, svuint8_t x) +svuint8x4_t test_svset4_u8(svuint8x4_t tuple, svuint8_t x) ATTR { return SVE_ACLE_FUNC(svset4,_u8,,)(tuple, 3, x); } @@ -101,7 +107,7 @@ svuint8x4_t test_svset4_u8(svuint8x4_t tuple, svuint8_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TUPLE:%.*]], [[X:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint16x4_t test_svset4_u16(svuint16x4_t tuple, svuint16_t x) +svuint16x4_t test_svset4_u16(svuint16x4_t tuple, svuint16_t x) ATTR { return SVE_ACLE_FUNC(svset4,_u16,,)(tuple, 1, x); } @@ -116,7 +122,7 @@ svuint16x4_t test_svset4_u16(svuint16x4_t tuple, svuint16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint32x4_t test_svset4_u32(svuint32x4_t tuple, svuint32_t x) +svuint32x4_t test_svset4_u32(svuint32x4_t tuple, svuint32_t x) ATTR { return SVE_ACLE_FUNC(svset4,_u32,,)(tuple, 1, x); } @@ -131,7 +137,7 @@ svuint32x4_t test_svset4_u32(svuint32x4_t tuple, svuint32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TUPLE:%.*]], [[X:%.*]], i64 6) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint64x4_t test_svset4_u64(svuint64x4_t tuple, svuint64_t x) +svuint64x4_t test_svset4_u64(svuint64x4_t tuple, svuint64_t x) ATTR { return SVE_ACLE_FUNC(svset4,_u64,,)(tuple, 3, x); } @@ -146,7 +152,7 @@ svuint64x4_t test_svset4_u64(svuint64x4_t tuple, svuint64_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TUPLE:%.*]], [[X:%.*]], i64 8) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat16x4_t test_svset4_f16(svfloat16x4_t tuple, svfloat16_t x) +svfloat16x4_t test_svset4_f16(svfloat16x4_t tuple, svfloat16_t x) ATTR { return SVE_ACLE_FUNC(svset4,_f16,,)(tuple, 1, x); } @@ -161,7 +167,7 @@ svfloat16x4_t test_svset4_f16(svfloat16x4_t tuple, svfloat16_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TUPLE:%.*]], [[X:%.*]], i64 4) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat32x4_t test_svset4_f32(svfloat32x4_t tuple, svfloat32_t x) +svfloat32x4_t test_svset4_f32(svfloat32x4_t tuple, svfloat32_t x) ATTR { return SVE_ACLE_FUNC(svset4,_f32,,)(tuple, 1, x); } @@ -176,7 +182,7 @@ svfloat32x4_t test_svset4_f32(svfloat32x4_t tuple, svfloat32_t x) // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TUPLE:%.*]], [[X:%.*]], i64 6) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat64x4_t test_svset4_f64(svfloat64x4_t tuple, svfloat64_t x) +svfloat64x4_t test_svset4_f64(svfloat64x4_t tuple, svfloat64_t x) ATTR { return SVE_ACLE_FUNC(svset4,_f64,,)(tuple, 3, x); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c index fcfa7eb5768363c7667b740c699001f412644623..a9a285bdbfb8d6349cf00207355f6d9b2f581cff 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c @@ -16,18 +16,18 @@ #endif // CHECK-LABEL: define {{[^@]+}}@test_svst2_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z15test_svst2_bf16u10__SVBool_tPu6__bf1614svbfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x2_t data) @@ -37,20 +37,20 @@ void test_svst2_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z20test_svst2_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = 
tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x2_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c index 3235a8b9e11155bf248240bb782fe3611049e1b3..dece2ca82c53955377509c9262e406761ade2558 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2.c @@ -35,18 +35,18 @@ void test_svst2_s8(svbool_t pg, int8_t *base, svint8x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_s16u10__SVBool_tPs11svint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_s16(svbool_t pg, int16_t *base, svint16x2_t data) @@ -56,18 +56,18 @@ void test_svst2_s16(svbool_t pg, int16_t *base, svint16x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( 
[[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_s32u10__SVBool_tPi11svint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_s32(svbool_t pg, int32_t *base, svint32x2_t data) @@ -77,18 +77,18 @@ void test_svst2_s32(svbool_t pg, int32_t *base, svint32x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_s64u10__SVBool_tPl11svint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// 
CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_s64(svbool_t pg, int64_t *base, svint64x2_t data) @@ -117,18 +117,18 @@ void test_svst2_u8(svbool_t pg, uint8_t *base, svuint8x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_u16u10__SVBool_tPt12svuint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_u16(svbool_t pg, uint16_t *base, svuint16x2_t data) @@ -138,18 +138,18 @@ void test_svst2_u16(svbool_t pg, uint16_t *base, svuint16x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_u32u10__SVBool_tPj12svuint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_u32(svbool_t pg, uint32_t *base, svuint32x2_t data) @@ -159,18 +159,18 @@ void test_svst2_u32(svbool_t pg, uint32_t *base, svuint32x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_u64u10__SVBool_tPm12svuint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_u64(svbool_t pg, uint64_t *base, svuint64x2_t data) @@ -180,18 +180,18 @@ void test_svst2_u64(svbool_t pg, uint64_t *base, svuint64x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_f16u10__SVBool_tPDh13svfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_f16(svbool_t pg, float16_t *base, svfloat16x2_t data) @@ -201,18 +201,18 @@ void test_svst2_f16(svbool_t pg, float16_t *base, svfloat16x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_f32u10__SVBool_tPf13svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_f32(svbool_t pg, float32_t *base, svfloat32x2_t data) @@ -222,18 +222,18 @@ void test_svst2_f32(svbool_t pg, float32_t *base, svfloat32x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_f64( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst2_f64u10__SVBool_tPd13svfloat64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP1]], [[TMP2]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst2_f64(svbool_t pg, float64_t *base, svfloat64x2_t data) @@ -243,18 +243,18 @@ void test_svst2_f64(svbool_t pg, float64_t *base, svfloat64x2_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst2_vnum_s8u10__SVBool_tPal10svint8x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x2_t data) @@ -264,20 +264,20 @@ void test_svst2_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x2_t data // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_s16u10__SVBool_tPsl11svint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x2_t data) @@ -287,20 +287,20 @@ void test_svst2_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x2_t d // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// 
CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_s32u10__SVBool_tPil11svint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x2_t data) @@ -310,20 +310,20 @@ void test_svst2_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x2_t d // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_s64u10__SVBool_tPll11svint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x2_t data) @@ -333,18 +333,18 @@ void test_svst2_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x2_t d // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst2_vnum_u8u10__SVBool_tPhl11svuint8x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv32i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv16i8( [[TMP0]], [[TMP1]], [[PG:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x2_t data) @@ -354,20 +354,20 @@ void test_svst2_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x2_t da // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], 
i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_u16u10__SVBool_tPtl12svuint16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv16i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x2_t data) @@ -377,20 +377,20 @@ void test_svst2_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x2_t // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_u32u10__SVBool_tPjl12svuint32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr 
[[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x2_t data) @@ -400,20 +400,20 @@ void test_svst2_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x2_t // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_u64u10__SVBool_tPml12svuint64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv4i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x2_t data) @@ -423,20 +423,20 @@ void test_svst2_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x2_t // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) +// CHECK-NEXT: 
[[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_f16u10__SVBool_tPDhl13svfloat16x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x2_t data) @@ -446,20 +446,20 @@ void test_svst2_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x2 // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_f32u10__SVBool_tPfl13svfloat32x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[DATA]], i64 4) +// 
CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x2_t data) @@ -469,20 +469,20 @@ void test_svst2_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x2 // CHECK-LABEL: define {{[^@]+}}@test_svst2_vnum_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst2_vnum_f64u10__SVBool_tPdl13svfloat64x2_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP2]], [[TMP3]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv4f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst2_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64x2_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c index 096699191e88492f5f9fac6a67f23f7bcd9fea41..ffd24a45b94e5dc928b648d1dc5b819b0a93af9e 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c @@ -17,20 +17,20 @@ // CHECK-LABEL: define {{[^@]+}}@test_svst3_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call 
@llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z15test_svst3_bf16u10__SVBool_tPu6__bf1614svbfloat16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x3_t data) @@ -40,22 +40,22 @@ void test_svst3_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: 
define {{[^@]+}}@_Z20test_svst3_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv24bf16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x3_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c index a962f0734dc9a2625ef050259dc25b980df91874..d52ff0b6db8858a5d74ed84cf49b9cacb5a58684 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3.c @@ -37,20 +37,20 @@ void test_svst3_s8(svbool_t pg, int8_t *base, svint8x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_s16u10__SVBool_tPs11svint16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], 
i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_s16(svbool_t pg, int16_t *base, svint16x3_t data) @@ -60,20 +60,20 @@ void test_svst3_s16(svbool_t pg, int16_t *base, svint16x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_s32u10__SVBool_tPi11svint32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_s32(svbool_t pg, int32_t *base, svint32x3_t data) @@ -83,20 +83,20 @@ void test_svst3_s32(svbool_t pg, int32_t *base, svint32x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call 
@llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_s64u10__SVBool_tPl11svint64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_s64(svbool_t pg, int64_t *base, svint64x3_t data) @@ -127,20 +127,20 @@ void test_svst3_u8(svbool_t pg, uint8_t *base, svuint8x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_u16u10__SVBool_tPt12svuint16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_u16(svbool_t pg, uint16_t *base, svuint16x3_t data) @@ -150,20 +150,20 @@ void test_svst3_u16(svbool_t pg, uint16_t *base, svuint16x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_u32u10__SVBool_tPj12svuint32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // 
void test_svst3_u32(svbool_t pg, uint32_t *base, svuint32x3_t data) @@ -173,20 +173,20 @@ void test_svst3_u32(svbool_t pg, uint32_t *base, svuint32x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_u64u10__SVBool_tPm12svuint64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_u64(svbool_t pg, uint64_t *base, svuint64x3_t data) @@ -196,20 +196,20 @@ void test_svst3_u64(svbool_t pg, uint64_t *base, svuint64x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = 
tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_f16u10__SVBool_tPDh13svfloat16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_f16(svbool_t pg, float16_t *base, svfloat16x3_t data) @@ -219,20 +219,20 @@ void test_svst3_f16(svbool_t pg, float16_t *base, svfloat16x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_f32u10__SVBool_tPf13svfloat32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( 
[[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_f32(svbool_t pg, float32_t *base, svfloat32x3_t data) @@ -242,20 +242,20 @@ void test_svst3_f32(svbool_t pg, float32_t *base, svfloat32x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst3_f64u10__SVBool_tPd13svfloat64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst3_f64(svbool_t pg, float64_t *base, svfloat64x3_t data) @@ -265,20 +265,20 @@ void test_svst3_f64(svbool_t pg, float64_t *base, svfloat64x3_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP1]], 
[[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst3_vnum_s8u10__SVBool_tPal10svint8x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x3_t data) @@ -288,22 +288,22 @@ void test_svst3_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x3_t data // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_s16u10__SVBool_tPsl11svint16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] 
= tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x3_t data) @@ -313,22 +313,22 @@ void test_svst3_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x3_t d // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_s32u10__SVBool_tPil11svint32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 
8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x3_t data) @@ -338,22 +338,22 @@ void test_svst3_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x3_t d // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_s64u10__SVBool_tPll11svint64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x3_t data) @@ -363,20 +363,20 @@ void test_svst3_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x3_t d // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: 
[[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst3_vnum_u8u10__SVBool_tPhl11svuint8x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv48i8( [[DATA]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[PG:%.*]], ptr [[TMP3]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x3_t data) @@ -386,22 +386,22 @@ void test_svst3_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x3_t da // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // 
CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_u16u10__SVBool_tPtl12svuint16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv24i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x3_t data) @@ -411,22 +411,22 @@ void test_svst3_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x3_t // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_u32u10__SVBool_tPjl12svuint32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st3.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv12i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x3_t data) @@ -436,22 +436,22 @@ void test_svst3_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x3_t // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_u64u10__SVBool_tPml12svuint64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv6i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // 
CPP-CHECK-NEXT: ret void // void test_svst3_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x3_t data) @@ -461,22 +461,22 @@ void test_svst3_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x3_t // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_f16u10__SVBool_tPDhl13svfloat16x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv24f16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x3_t data) @@ -486,22 +486,22 @@ void test_svst3_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x3 // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) -// CHECK-NEXT: 
[[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_f32u10__SVBool_tPfl13svfloat32x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv12f32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x3_t data) @@ -511,22 +511,22 @@ void test_svst3_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x3 // CHECK-LABEL: define {{[^@]+}}@test_svst3_vnum_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call 
void @llvm.aarch64.sve.st3.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst3_vnum_f64u10__SVBool_tPdl13svfloat64x3_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv6f64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st3.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst3_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64x3_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c index 484fc6333b577d98b0e403746181cc2fcbbfd9b5..80efdf9df65c92edc1439ee9be62782b5d56dd96 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c @@ -17,22 +17,22 @@ // CHECK-LABEL: define {{[^@]+}}@test_svst4_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z15test_svst4_bf16u10__SVBool_tPu6__bf1614svbfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x4_t data) @@ -42,24 +42,24 @@ void test_svst4_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z20test_svst4_vnum_bf16u10__SVBool_tPu6__bf16l14svbfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail 
call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8bf16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x4_t data) diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c index 774460c9151926a77161c87599aa1afe6f41e976..c61e78a941b228d304bf97fe1886fa797f6c9552 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4.c @@ -39,22 +39,22 @@ void test_svst4_s8(svbool_t pg, int8_t *base, svint8x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_s16u10__SVBool_tPs11svint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], 
i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_s16(svbool_t pg, int16_t *base, svint16x4_t data) @@ -64,22 +64,22 @@ void test_svst4_s16(svbool_t pg, int16_t *base, svint16x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_s32u10__SVBool_tPi11svint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst4_s32(svbool_t pg, int32_t *base, svint32x4_t data)
@@ -89,22 +89,22 @@ void test_svst4_s32(svbool_t pg, int32_t *base, svint32x4_t data)
 // CHECK-LABEL: define {{[^@]+}}@test_svst4_s64(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0)
-// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2)
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4)
-// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6)
-// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_s64u10__SVBool_tPl11svint64x4_t(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0)
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4)
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst4_s64(svbool_t pg, int64_t *base, svint64x4_t data)
@@ -137,22 +137,22 @@ void test_svst4_u8(svbool_t pg, uint8_t *base, svuint8x4_t data)
 // CHECK-LABEL: define {{[^@]+}}@test_svst4_u16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0)
-// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8)
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16)
-// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24)
-// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16)
+// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24)
+// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]])
 // CHECK-NEXT: ret void
 //
 // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_u16u10__SVBool_tPt12svuint16x4_t(
 // CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]])
-// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0)
-// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8)
-// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16)
-// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24)
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24)
+// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]])
 // CPP-CHECK-NEXT: ret void
 //
 void test_svst4_u16(svbool_t pg, uint16_t *base, svuint16x4_t data)
@@ -162,22 +162,22 @@ void test_svst4_u16(svbool_t pg, uint16_t *base, svuint16x4_t data)
 // CHECK-LABEL: define {{[^@]+}}@test_svst4_u32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]])
-// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0)
-// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4)
-// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8)
-// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12)
-// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]],
i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_u32u10__SVBool_tPj12svuint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_u32(svbool_t pg, uint32_t *base, svuint32x4_t data) @@ -187,22 +187,22 @@ void test_svst4_u32(svbool_t pg, uint32_t *base, svuint32x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_u64u10__SVBool_tPm12svuint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
@llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_u64(svbool_t pg, uint64_t *base, svuint64x4_t data) @@ -212,22 +212,22 @@ void test_svst4_u64(svbool_t pg, uint64_t *base, svuint64x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_f16u10__SVBool_tPDh13svfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: 
[[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_f16(svbool_t pg, float16_t *base, svfloat16x4_t data) @@ -237,22 +237,22 @@ void test_svst4_f16(svbool_t pg, float16_t *base, svfloat16x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_f32u10__SVBool_tPf13svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr 
[[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_f32(svbool_t pg, float32_t *base, svfloat32x4_t data) @@ -262,22 +262,22 @@ void test_svst4_f32(svbool_t pg, float32_t *base, svfloat32x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z14test_svst4_f64u10__SVBool_tPd13svfloat64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[TMP0]], ptr [[BASE:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // void test_svst4_f64(svbool_t pg, float64_t *base, svfloat64x4_t data) @@ -287,22 +287,22 @@ void test_svst4_f64(svbool_t pg, float64_t *base, svfloat64x4_t data) // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_s8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst4_vnum_s8u10__SVBool_tPal10svint8x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x4_t data) @@ -312,24 +312,24 @@ void test_svst4_vnum_s8(svbool_t pg, int8_t *base, int64_t vnum, svint8x4_t data // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], 
i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_s16u10__SVBool_tPsl11svint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x4_t data) @@ -339,24 +339,24 @@ void test_svst4_vnum_s16(svbool_t pg, int16_t *base, int64_t vnum, svint16x4_t d // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr 
[[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_s32u10__SVBool_tPil11svint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x4_t data) @@ -366,24 +366,24 @@ void test_svst4_vnum_s32(svbool_t pg, int32_t *base, int64_t vnum, svint32x4_t d // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define 
{{[^@]+}}@_Z19test_svst4_vnum_s64u10__SVBool_tPll11svint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x4_t data) @@ -393,22 +393,22 @@ void test_svst4_vnum_s64(svbool_t pg, int64_t *base, int64_t vnum, svint64x4_t d // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_u8( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[PG:%.*]], ptr [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z18test_svst4_vnum_u8u10__SVBool_tPhl11svuint8x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( 
[[DATA]], i64 32) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], [[PG:%.*]], ptr [[TMP0]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv16i8.nxv64i8( [[DATA]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv16i8( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[PG:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x4_t data) @@ -418,24 +418,24 @@ void test_svst4_vnum_u8(svbool_t pg, uint8_t *base, int64_t vnum, svuint8x4_t da // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_u16u10__SVBool_tPtl12svuint16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA:%.*]], i64 0) +// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8i16.nxv32i16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8i16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x4_t data) @@ -445,24 +445,24 @@ void test_svst4_vnum_u16(svbool_t pg, uint16_t *base, int64_t vnum, svuint16x4_t // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_u32u10__SVBool_tPjl12svuint32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call 
@llvm.vector.extract.nxv4i32.nxv16i32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x4_t data) @@ -472,24 +472,24 @@ void test_svst4_vnum_u32(svbool_t pg, uint32_t *base, int64_t vnum, svuint32x4_t // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_u64u10__SVBool_tPml12svuint64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2i64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2i64.nxv8i64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call 
void @llvm.aarch64.sve.st4.nxv2i64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x4_t data) @@ -499,24 +499,24 @@ void test_svst4_vnum_u64(svbool_t pg, uint64_t *base, int64_t vnum, svuint64x4_t // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_f16u10__SVBool_tPDhl13svfloat16x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[DATA]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv8f16( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x4_t data) @@ -526,24 +526,24 @@ void 
test_svst4_vnum_f16(svbool_t pg, float16_t *base, int64_t vnum, svfloat16x4 // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_f32u10__SVBool_tPfl13svfloat32x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[DATA]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x4_t data) @@ -553,24 +553,24 @@ void test_svst4_vnum_f32(svbool_t pg, float32_t *base, int64_t vnum, svfloat32x4 // CHECK-LABEL: define {{[^@]+}}@test_svst4_vnum_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) 
-// CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) -// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) -// CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) -// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: define {{[^@]+}}@_Z19test_svst4_vnum_f64u10__SVBool_tPdl13svfloat64x4_t( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) -// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) -// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP2]], [[TMP3]], [[TMP4]], [[TMP5]], [[TMP0]], ptr [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv2f64.nxv8f64( [[DATA]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.aarch64.sve.convert.from.svbool.nxv2i1( [[PG:%.*]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = getelementptr , ptr [[BASE:%.*]], i64 [[VNUM:%.*]] +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st4.nxv2f64( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]], [[TMP4]], ptr [[TMP5]]) // CPP-CHECK-NEXT: ret void // void test_svst4_vnum_f64(svbool_t pg, float64_t *base, int64_t vnum, svfloat64x4_t data) diff --git a/clang/test/Modules/aarch64-sme-keywords.cppm b/clang/test/Modules/aarch64-sme-keywords.cppm new file mode 100644 index 0000000000000000000000000000000000000000..759701a633cebed2b242ffe4a44f60a3c50af1da --- /dev/null +++ b/clang/test/Modules/aarch64-sme-keywords.cppm @@ -0,0 +1,65 @@ +// REQUIRES: aarch64-registered-target +// +// RUN: rm -rf %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 -triple aarch64 -target-feature +sme %t/A.cppm -emit-module-interface -o %t/A.pcm +// RUN: %clang_cc1 -std=c++20 -triple aarch64 
-target-feature +sme -fprebuilt-module-path=%t -I%t %t/Use.cpp -emit-llvm +// RUN: cat %t/Use.ll | FileCheck %s + +//--- A.cppm +module; +export module A; + +export void f_streaming(void) __arm_streaming { } +export void f_streaming_compatible(void) __arm_streaming_compatible { } +export void f_shared_za(void) __arm_inout("za") { } +export void f_preserves_za(void) __arm_preserves("za") { } + +//--- Use.cpp +// expected-no-diagnostics +import A; + +// CHECK: define dso_local void @_Z18f_shared_za_callerv() #[[SHARED_ZA_DEF:[0-9]+]] { +// CHECK: entry: +// CHECK: call void @_ZW1A11f_shared_zav() #[[SHARED_ZA_USE:[0-9]+]] +// CHECK: call void @_ZW1A14f_preserves_zav() #[[PRESERVES_ZA_USE:[0-9]+]] +// CHECK: ret void +// CHECK: } +// +// CHECK:declare void @_ZW1A11f_shared_zav() #[[SHARED_ZA_DECL:[0-9]+]] +// +// CHECK:declare void @_ZW1A14f_preserves_zav() #[[PRESERVES_ZA_DECL:[0-9]+]] +// +// CHECK:; Function Attrs: mustprogress noinline nounwind optnone +// CHECK:define dso_local void @_Z21f_nonstreaming_callerv() #[[NORMAL_DEF:[0-9]+]] { +// CHECK:entry: +// CHECK: call void @_ZW1A11f_streamingv() #[[STREAMING_USE:[0-9]+]] +// CHECK: call void @_ZW1A22f_streaming_compatiblev() #[[STREAMING_COMPATIBLE_USE:[0-9]+]] +// CHECK: ret void +// CHECK:} +// +// CHECK:declare void @_ZW1A11f_streamingv() #[[STREAMING_DECL:[0-9]+]] +// +// CHECK:declare void @_ZW1A22f_streaming_compatiblev() #[[STREAMING_COMPATIBLE_DECL:[0-9]+]] +// +// CHECK-DAG: attributes #[[SHARED_ZA_DEF]] = {{{.*}} "aarch64_inout_za" {{.*}}} +// CHECK-DAG: attributes #[[SHARED_ZA_DECL]] = {{{.*}} "aarch64_inout_za" {{.*}}} +// CHECK-DAG: attributes #[[PRESERVES_ZA_DECL]] = {{{.*}} "aarch64_preserves_za" {{.*}}} +// CHECK-DAG: attributes #[[NORMAL_DEF]] = {{{.*}}} +// CHECK-DAG: attributes #[[STREAMING_DECL]] = {{{.*}} "aarch64_pstate_sm_enabled" {{.*}}} +// CHECK-DAG: attributes #[[STREAMING_COMPATIBLE_DECL]] = {{{.*}} "aarch64_pstate_sm_compatible" {{.*}}} +// CHECK-DAG: attributes #[[SHARED_ZA_USE]] = { "aarch64_inout_za" } +// CHECK-DAG: attributes #[[PRESERVES_ZA_USE]] = { "aarch64_preserves_za" } +// CHECK-DAG: attributes #[[STREAMING_USE]] = { "aarch64_pstate_sm_enabled" } +// CHECK-DAG: attributes #[[STREAMING_COMPATIBLE_USE]] = { "aarch64_pstate_sm_compatible" } + +void f_shared_za_caller(void) __arm_inout("za") { + f_shared_za(); + f_preserves_za(); +} + +void f_nonstreaming_caller(void) { + f_streaming(); + f_streaming_compatible(); +} diff --git a/clang/test/Parser/MicrosoftExtensions.cpp b/clang/test/Parser/MicrosoftExtensions.cpp index 01e59aa1945a863cdb14f26a1ab6e8fd8b8c87bc..6bf802a29ace398398e4a4de1a96e0c9aa386931 100644 --- a/clang/test/Parser/MicrosoftExtensions.cpp +++ b/clang/test/Parser/MicrosoftExtensions.cpp @@ -44,8 +44,8 @@ typedef struct _GUID unsigned char Data4[8]; } GUID; -struct __declspec(uuid(L"00000000-0000-0000-1234-000000000047")) uuid_attr_bad1 { };// expected-error {{'uuid' attribute requires a string}} -struct __declspec(uuid(3)) uuid_attr_bad2 { };// expected-error {{'uuid' attribute requires a string}} +struct __declspec(uuid(L"00000000-0000-0000-1234-000000000047")) uuid_attr_bad1 { };// expected-warning {{encoding prefix 'L' on an unevaluated string literal has no effect and is incompatible with c++2c}} +struct __declspec(uuid(3)) uuid_attr_bad2 { };// expected-error {{expected string literal as argument of 'uuid' attribute}} struct __declspec(uuid("0000000-0000-0000-1234-0000500000047")) uuid_attr_bad3 { };// expected-error {{uuid attribute contains a malformed GUID}} struct 
__declspec(uuid("0000000-0000-0000-Z234-000000000047")) uuid_attr_bad4 { };// expected-error {{uuid attribute contains a malformed GUID}} struct __declspec(uuid("000000000000-0000-1234-000000000047")) uuid_attr_bad5 { };// expected-error {{uuid attribute contains a malformed GUID}} diff --git a/clang/test/Parser/c2x-attribute-keywords.c b/clang/test/Parser/c2x-attribute-keywords.c index 757dc82860110a5f0f1e4e14e56d2b90ca33ac8a..b88d2b9c23e697ad449d887d6b2f430152e8c5ba 100644 --- a/clang/test/Parser/c2x-attribute-keywords.c +++ b/clang/test/Parser/c2x-attribute-keywords.c @@ -1,60 +1,64 @@ -// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,notc2x -Wno-strict-prototypes %s -// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,c2x %s - -enum __arm_streaming E { // expected-error {{'__arm_streaming' cannot be applied to a declaration}} - One __arm_streaming, // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +// RUN: sed -e "s@ATTR_USE@__arm_streaming@g" -e "s@ATTR_NAME@__arm_streaming@g" %s > %t +// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,notc2x -Wno-strict-prototypes %t +// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,c2x %t +// RUN: sed -e "s@ATTR_USE@__arm_inout\(\"za\"\)@g" -e "s@ATTR_NAME@__arm_inout@g" %s > %t +// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,notc2x -Wno-strict-prototypes %t +// RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify=expected,c2x %t + +enum ATTR_USE E { // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + One ATTR_USE, // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} Two, - Three __arm_streaming // expected-error {{'__arm_streaming' cannot be applied to a declaration}} + Three ATTR_USE // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} }; -enum __arm_streaming { Four }; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -__arm_streaming enum E2 { Five }; // expected-error {{misplaced '__arm_streaming'}} +enum ATTR_USE { Four }; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +ATTR_USE enum E2 { Five }; // expected-error {{misplaced 'ATTR_NAME'}} // FIXME: this diagnostic can be improved. -enum { __arm_streaming Six }; // expected-error {{expected identifier}} +enum { ATTR_USE Six }; // expected-error {{expected identifier}} // FIXME: this diagnostic can be improved. 
-enum E3 __arm_streaming { Seven }; // expected-error {{expected identifier or '('}} - -struct __arm_streaming S1 { // expected-error {{'__arm_streaming' cannot be applied to a declaration}} - int i __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} - int __arm_streaming j; // expected-error {{'__arm_streaming' only applies to function types}} - int k[10] __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} - int l __arm_streaming[10]; // expected-error {{'__arm_streaming' only applies to function types}} - __arm_streaming int m, n; // expected-error {{'__arm_streaming' only applies to function types}} - int o __arm_streaming : 12; // expected-error {{'__arm_streaming' only applies to function types}} - int __arm_streaming : 0; // expected-error {{'__arm_streaming' only applies to function types}} - int p, __arm_streaming : 0; // expected-error {{'__arm_streaming' cannot appear here}} - int q, __arm_streaming r; // expected-error {{'__arm_streaming' cannot appear here}} - __arm_streaming int; // expected-error {{'__arm_streaming' cannot appear here}} \ +enum E3 ATTR_USE { Seven }; // expected-error {{expected identifier or '('}} + +struct ATTR_USE S1 { // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + int i ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}} + int ATTR_USE j; // expected-error {{'ATTR_NAME' only applies to function types}} + int k[10] ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}} + int l ATTR_USE[10]; // expected-error {{'ATTR_NAME' only applies to function types}} + ATTR_USE int m, n; // expected-error {{'ATTR_NAME' only applies to function types}} + int o ATTR_USE : 12; // expected-error {{'ATTR_NAME' only applies to function types}} + int ATTR_USE : 0; // expected-error {{'ATTR_NAME' only applies to function types}} + int p, ATTR_USE : 0; // expected-error {{'ATTR_NAME' cannot appear here}} + int q, ATTR_USE r; // expected-error {{'ATTR_NAME' cannot appear here}} + ATTR_USE int; // expected-error {{'ATTR_NAME' cannot appear here}} \ // expected-warning {{declaration does not declare anything}} }; -__arm_streaming struct S2 { int a; }; // expected-error {{misplaced '__arm_streaming'}} -struct S3 __arm_streaming { int a; }; // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} +ATTR_USE struct S2 { int a; }; // expected-error {{misplaced 'ATTR_NAME'}} +struct S3 ATTR_USE { int a; }; // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} -union __arm_streaming U { // expected-error {{'__arm_streaming' cannot be applied to a declaration}} - double d __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types; type here is 'double'}} - __arm_streaming int i; // expected-error {{'__arm_streaming' only applies to function types; type here is 'int'}} +union ATTR_USE U { // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + double d ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types; type here is 'double'}} + ATTR_USE int i; // expected-error {{'ATTR_NAME' only applies to function types; type here is 'int'}} }; -__arm_streaming union U2 { double d; }; // expected-error {{misplaced '__arm_streaming'}} -union U3 __arm_streaming { double d; }; // expected-error {{'__arm_streaming' cannot appear here}} \ - 
expected-error {{'__arm_streaming' cannot be applied to a declaration}} +ATTR_USE union U2 { double d; }; // expected-error {{misplaced 'ATTR_NAME'}} +union U3 ATTR_USE { double d; }; // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} -struct __arm_streaming IncompleteStruct; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -union __arm_streaming IncompleteUnion; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -enum __arm_streaming IncompleteEnum; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +struct ATTR_USE IncompleteStruct; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +union ATTR_USE IncompleteUnion; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +enum ATTR_USE IncompleteEnum; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} -__arm_streaming void f1(void); // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -void __arm_streaming f2(void); // expected-error {{'__arm_streaming' only applies to function types}} -void f3 __arm_streaming (void); // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -void f4(void) __arm_streaming; +ATTR_USE void f1(void); // expected-error {{'ATTR_NAME' cannot be applied to a declaration}} +void ATTR_USE f2(void); // expected-error {{'ATTR_NAME' only applies to function types}} +void f3 ATTR_USE (void); // expected-error {{'ATTR_NAME' cannot be applied to a declaration}} +void f4(void) ATTR_USE; -void f5(int i __arm_streaming, __arm_streaming int j, int __arm_streaming k); // expected-error 3 {{'__arm_streaming' only applies to function types}} +void f5(int i ATTR_USE, ATTR_USE int j, int ATTR_USE k); // expected-error 3 {{'ATTR_NAME' only applies to function types}} -void f6(a, b) __arm_streaming int a; int b; { // expected-error {{'__arm_streaming' cannot appear here}} \ +void f6(a, b) ATTR_USE int a; int b; { // expected-error {{'ATTR_NAME' cannot appear here}} \ c2x-warning {{deprecated}} } @@ -63,57 +67,74 @@ void f6(a, b) __arm_streaming int a; int b; { // expected-error {{'__arm_streami // behavior given that we *don't* want to parse it as part of the K&R parameter // declarations. It is disallowed to avoid a parsing ambiguity we already // handle well. 
-int (*f7(a, b))(int, int) __arm_streaming int a; int b; { // c2x-warning {{deprecated}} +int (*f7(a, b))(int, int) ATTR_USE int a; int b; { // c2x-warning {{deprecated}} return 0; } -__arm_streaming int a, b; // expected-error {{'__arm_streaming' only applies to function types}} -int c __arm_streaming, d __arm_streaming; // expected-error 2 {{'__arm_streaming' only applies to function types}} +ATTR_USE int a, b; // expected-error {{'ATTR_NAME' only applies to function types}} +int c ATTR_USE, d ATTR_USE; // expected-error 2 {{'ATTR_NAME' only applies to function types}} -void f8(void) __arm_streaming { - __arm_streaming int i, j; // expected-error {{'__arm_streaming' only applies to function types}} - int k, l __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} +void f8(void) ATTR_USE { + ATTR_USE int i, j; // expected-error {{'ATTR_NAME' only applies to function types}} + int k, l ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}} } -__arm_streaming void f9(void) { // expected-error {{'__arm_streaming' cannot be applied to a declaration}} - int i[10] __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} - int (*fp1)(void)__arm_streaming; - int (*fp2 __arm_streaming)(void); // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +ATTR_USE void f9(void) { // expected-error {{'ATTR_NAME' cannot be applied to a declaration}} + int i[10] ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}} + int (*fp1)(void)ATTR_USE; + int (*fp2 ATTR_USE)(void); // expected-error {{'ATTR_NAME' cannot be applied to a declaration}} - int * __arm_streaming *ipp; // expected-error {{'__arm_streaming' only applies to function types}} + int * ATTR_USE *ipp; // expected-error {{'ATTR_NAME' only applies to function types}} } -void f10(int j[static 10] __arm_streaming, int k[*] __arm_streaming); // expected-error 2 {{'__arm_streaming' only applies to function types}} +void f10(int j[static 10] ATTR_USE, int k[*] ATTR_USE); // expected-error 2 {{'ATTR_NAME' only applies to function types}} void f11(void) { - __arm_streaming {} // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming if (1) {} // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE {} // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE if (1) {} // expected-error {{'ATTR_NAME' cannot be applied to a statement}} - __arm_streaming switch (1) { // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming case 1: __arm_streaming break; // expected-error 2 {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming default: break; // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE switch (1) { // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE case 1: ATTR_USE break; // expected-error 2 {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE default: break; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} } goto foo; - __arm_streaming foo: (void)1; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} + ATTR_USE foo: (void)1; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} - __arm_streaming for (;;); // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming while (1); // expected-error {{'__arm_streaming' cannot be applied to a 
statement}} - __arm_streaming do __arm_streaming { } while(1); // expected-error 2 {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE for (;;); // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE while (1); // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE do ATTR_USE { } while(1); // expected-error 2 {{'ATTR_NAME' cannot be applied to a statement}} - __arm_streaming (void)1; // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE (void)1; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} - __arm_streaming; // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} - (void)sizeof(int [4]__arm_streaming); // expected-error {{'__arm_streaming' only applies to function types}} - (void)sizeof(struct __arm_streaming S3 { int a __arm_streaming; }); // expected-error {{'__arm_streaming' cannot be applied to a declaration}} \ - // expected-error {{'__arm_streaming' only applies to function types; type here is 'int'}} + (void)sizeof(int [4]ATTR_USE); // expected-error {{'ATTR_NAME' only applies to function types}} + (void)sizeof(struct ATTR_USE S3 { int a ATTR_USE; }); // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} \ + // expected-error {{'ATTR_NAME' only applies to function types; type here is 'int'}} - __arm_streaming return; // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE return; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} - __arm_streaming asm (""); // expected-error {{'__arm_streaming' cannot appear here}} + ATTR_USE asm (""); // expected-error {{'ATTR_NAME' cannot appear here}} } -struct __arm_streaming S4 *s; // expected-error {{'__arm_streaming' cannot appear here}} +struct ATTR_USE S4 *s; // expected-error {{'ATTR_NAME' cannot appear here}} struct S5 {}; -int c = sizeof(struct __arm_streaming S5); // expected-error {{'__arm_streaming' cannot appear here}} +int c = sizeof(struct ATTR_USE S5); // expected-error {{'ATTR_NAME' cannot appear here}} + +void invalid_parentheses1() __arm_inout; // expected-error {{expected '(' after ''__arm_inout''}} +void invalid_parentheses2() __arm_inout(; // expected-error {{expected string literal as argument of '__arm_inout' attribute}} +void invalid_parentheses3() __arm_inout((); // expected-error {{expected string literal as argument of '__arm_inout' attribute}} +void invalid_parentheses4() __arm_inout); // expected-error {{expected '(' after ''__arm_inout''}} \ + // expected-error {{expected function body after function declarator}} +void invalid_parentheses5() __arm_inout(()); // expected-error {{expected string literal as argument of '__arm_inout' attribute}} +void invalid_parentheses6() __arm_inout("za"; // expected-error {{expected ')'}} +void invalid_parentheses7() __arm_streaming(; // expected-error {{expected parameter declarator}} \ + // expected-error {{expected ')'}} \ + // expected-note {{to match this '('}} \ + // expected-error {{function cannot return function type 'void ()'}} \ + // expected-error {{'__arm_streaming' only applies to function types; type here is 'int ()'}} \ + // expected-warning {{'__arm_streaming' only applies to non-K&R-style functions}} +void invalid_parentheses8() __arm_streaming(); // expected-error {{function cannot return function type 'void ()'}} \ + // expected-error {{'__arm_streaming' only applies to function types; type here 
is 'int ()'}} \ + // expected-warning {{'__arm_streaming' only applies to non-K&R-style functions}} diff --git a/clang/test/Parser/c2x-attribute-keywords.m b/clang/test/Parser/c2x-attribute-keywords.m index d1c45da34fbc60df330034fc738fbbd20de09002..575c88ffffc3d6699bd11d9ff36286a21fd5c843 100644 --- a/clang/test/Parser/c2x-attribute-keywords.m +++ b/clang/test/Parser/c2x-attribute-keywords.m @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -fsyntax-only -triple aarch64-none-linux-gnu -target-feature +sme -verify %s -enum __arm_streaming E1 : int; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +enum __arm_inout("za") E1 : int; // expected-error {{'__arm_inout' only applies to non-K&R-style functions}} @interface Base @end @@ -15,5 +15,5 @@ enum __arm_streaming E1 : int; // expected-error {{'__arm_streaming' cannot be a void f(T *t) { - __arm_streaming[[t foo] bar]; // expected-error {{'__arm_streaming' cannot be applied to a statement}} + __arm_inout("za")[[t foo] bar]; // expected-error {{'__arm_inout' cannot be applied to a statement}} } diff --git a/clang/test/Parser/c2x-attributes.c b/clang/test/Parser/c2x-attributes.c index 3e9469f8fa8b5fa1013ce9a54a16683aa159e982..be039e40f98ef19ac98326e3c5352f2c4fb85fcd 100644 --- a/clang/test/Parser/c2x-attributes.c +++ b/clang/test/Parser/c2x-attributes.c @@ -16,7 +16,7 @@ enum { [[]] Six }; // expected-error {{expected identifier}} // FIXME: this diagnostic can be improved. enum E3 [[]] { Seven }; // expected-error {{expected identifier or '('}} -[[deprecated([""])]] int WrongArgs; // expected-error {{expected expression}} +[[deprecated([""])]] int WrongArgs; // expected-error {{expected string literal as argument of 'deprecated' attribute}} [[,,,,,]] int Commas1; // ok [[,, maybe_unused]] int Commas2; // ok [[maybe_unused,,,]] int Commas3; // ok @@ -24,6 +24,13 @@ enum E3 [[]] { Seven }; // expected-error {{expected identifier or '('}} [[foo bar]] int NoComma; // expected-error {{expected ','}} \ // expected-warning {{unknown attribute 'foo' ignored}} + +[[deprecated(L"abc")]] void unevaluated_string(void); +// expected-warning@-1 {{encoding prefix 'L' on an unevaluated string literal has no effect}} + +[[nodiscard("\123")]] int unevaluated_string2(void); +// expected-error@-1 {{invalid escape sequence '\123' in an unevaluated string literal}} + struct [[]] S1 { int i [[]]; int [[]] j; diff --git a/clang/test/Parser/cxx-attributes.cpp b/clang/test/Parser/cxx-attributes.cpp index b46326c4d474f0d74cac87a5b98efd05f4d3272a..51d1f9c8228121886e2e2db65f1755689484174c 100644 --- a/clang/test/Parser/cxx-attributes.cpp +++ b/clang/test/Parser/cxx-attributes.cpp @@ -41,7 +41,7 @@ void fn() { pi = &i[0]; } -[[deprecated([""])]] int WrongArgs; // expected-error {{expected variable name or 'this' in lambda capture list}} +[[deprecated([""])]] int WrongArgs; // expected-error {{expected string literal as argument of 'deprecated' attribute}} [[,,,,,]] int Commas1; // ok [[,, maybe_unused]] int Commas2; // ok [[maybe_unused,,,]] int Commas3; // ok diff --git a/clang/test/Parser/cxx0x-attributes.cpp b/clang/test/Parser/cxx0x-attributes.cpp index 3cb4e180e5f5f1f834ba2fe52338ec0c9c7bc285..10c5bbcac10227be31de09ecdb1fc94344c940b6 100644 --- a/clang/test/Parser/cxx0x-attributes.cpp +++ b/clang/test/Parser/cxx0x-attributes.cpp @@ -94,7 +94,7 @@ class [[]] [[]] final_class_another [[]] [[]] alignas(16) [[]]{}; // expected-error {{an attribute list cannot appear here}} // The diagnostics here don't matter much, this just shouldn't crash: -class C final 
[[deprecated(l]] {}); // expected-error {{use of undeclared identifier}} expected-error {{expected ']'}} expected-error {{an attribute list cannot appear here}} expected-error {{expected unqualified-id}} +class C final [[deprecated(l]] {}); //expected-error {{expected string literal as argument of 'deprecated' attribute}} expected-error {{an attribute list cannot appear here}} expected-error {{expected unqualified-id}} class D final alignas ([l) {}]{}); // expected-error {{expected ',' or ']' in lambda capture list}} expected-error {{an attribute list cannot appear here}} [[]] struct with_init_declarators {} init_declarator; @@ -266,7 +266,7 @@ template void variadic() { template void variadic_nttp() { void bar [[noreturn...]] (); // expected-error {{attribute 'noreturn' cannot be used as an attribute pack}} - void baz [[clang::no_sanitize(Is...)]] (); // expected-error {{attribute 'no_sanitize' does not support argument pack expansion}} + void baz [[clang::no_sanitize(Is...)]] (); // expected-error {{expected string literal as argument of 'no_sanitize' attribute}} void bor [[clang::annotate("A", "V" ...)]] (); // expected-error {{pack expansion does not contain any unexpanded parameter packs}} void bir [[clang::annotate("B", {1, 2, 3, 4})]] (); // expected-error {{'annotate' attribute requires parameter 1 to be a constant expression}} expected-note {{subexpression not valid in a constant expression}} void boo [[unknown::foo(Is...)]] (); // expected-warning {{unknown attribute 'foo' ignored}} @@ -445,3 +445,9 @@ class Ordering { ) { } }; + +namespace P2361 { +[[deprecated(L"abc")]] void a(); // expected-warning{{encoding prefix 'L' on an unevaluated string literal has no effect and is incompatible with c++2c}} \ + // expected-warning {{use of the 'deprecated' attribute is a C++14 extension}} +[[nodiscard("\123")]] int b(); // expected-error{{invalid escape sequence '\123' in an unevaluated string literal}} +} diff --git a/clang/test/Parser/cxx0x-keyword-attributes.cpp b/clang/test/Parser/cxx0x-keyword-attributes.cpp index 256a834e9e5460602814a4d01b009371e17c47b6..be7423cc7ecee367fdafc8051e8ba79ebb8e54bc 100644 --- a/clang/test/Parser/cxx0x-keyword-attributes.cpp +++ b/clang/test/Parser/cxx0x-keyword-attributes.cpp @@ -1,4 +1,7 @@ -// RUN: %clang_cc1 -fcxx-exceptions -fdeclspec -fexceptions -fsyntax-only -verify -std=c++11 -Wc++14-compat -Wc++14-extensions -Wc++17-extensions -triple aarch64-none-linux-gnu %s +// RUN: sed -e "s@ATTR_USE@__arm_streaming@g" -e "s@ATTR_NAME@__arm_streaming@g" %s > %t +// RUN: %clang_cc1 -fcxx-exceptions -fdeclspec -fexceptions -fsyntax-only -verify -std=c++11 -Wc++14-compat -Wc++14-extensions -Wc++17-extensions -triple aarch64-none-linux-gnu -target-feature +sme -x c++ %t +// RUN: sed -e "s@ATTR_USE@__arm_inout\(\"za\"\)@g" -e "s@ATTR_NAME@__arm_inout@g" %s > %t +// RUN: %clang_cc1 -fcxx-exceptions -fdeclspec -fexceptions -fsyntax-only -verify -std=c++11 -Wc++14-compat -Wc++14-extensions -Wc++17-extensions -triple aarch64-none-linux-gnu -target-feature +sme -x c++ %t // Need std::initializer_list namespace std { @@ -35,136 +38,136 @@ namespace std { // Declaration syntax checks -__arm_streaming int before_attr; // expected-error {{'__arm_streaming' only applies to function types}} -int __arm_streaming between_attr; // expected-error {{'__arm_streaming' only applies to function types}} -const __arm_streaming int between_attr_2 = 0; // expected-error {{'__arm_streaming' cannot appear here}} -int after_attr __arm_streaming; // expected-error {{'__arm_streaming' 
only applies to function types}} -int * __arm_streaming ptr_attr; // expected-error {{'__arm_streaming' only applies to function types}} -int & __arm_streaming ref_attr = after_attr; // expected-error {{'__arm_streaming' only applies to function types}} -int && __arm_streaming rref_attr = 0; // expected-error {{'__arm_streaming' only applies to function types}} -int array_attr [1] __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} -void fn_attr () __arm_streaming; -void noexcept_fn_attr () noexcept __arm_streaming; +ATTR_USE int before_attr; // expected-error {{'ATTR_NAME' only applies to function types}} +int ATTR_USE between_attr; // expected-error {{'ATTR_NAME' only applies to function types}} +const ATTR_USE int between_attr_2 = 0; // expected-error {{'ATTR_NAME' cannot appear here}} +int after_attr ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}} +int * ATTR_USE ptr_attr; // expected-error {{'ATTR_NAME' only applies to function types}} +int & ATTR_USE ref_attr = after_attr; // expected-error {{'ATTR_NAME' only applies to function types}} +int && ATTR_USE rref_attr = 0; // expected-error {{'ATTR_NAME' only applies to function types}} +int array_attr [1] ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}} +void fn_attr () ATTR_USE; +void noexcept_fn_attr () noexcept ATTR_USE; struct MemberFnOrder { - virtual void f() const volatile && noexcept __arm_streaming final = 0; + virtual void f() const volatile && noexcept ATTR_USE final = 0; }; -struct __arm_streaming struct_attr; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -class __arm_streaming class_attr {}; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -union __arm_streaming union_attr; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -enum __arm_streaming E { }; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +struct ATTR_USE struct_attr; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +class ATTR_USE class_attr {}; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +union ATTR_USE union_attr; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +enum ATTR_USE E { }; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} namespace test_misplacement { -__arm_streaming struct struct_attr2; // expected-error {{misplaced '__arm_streaming'}} -__arm_streaming class class_attr2; // expected-error {{misplaced '__arm_streaming'}} -__arm_streaming union union_attr2; // expected-error {{misplaced '__arm_streaming'}} -__arm_streaming enum E2 { }; // expected-error {{misplaced '__arm_streaming'}} +ATTR_USE struct struct_attr2; // expected-error {{misplaced 'ATTR_NAME'}} +ATTR_USE class class_attr2; // expected-error {{misplaced 'ATTR_NAME'}} +ATTR_USE union union_attr2; // expected-error {{misplaced 'ATTR_NAME'}} +ATTR_USE enum E2 { }; // expected-error {{misplaced 'ATTR_NAME'}} } // Checks attributes placed at wrong syntactic locations of class specifiers. 
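A note on why every class-specifier placement below is rejected (an illustrative sketch, not part of the patch; the names ok, ok_ptr and S are invented for the example): the SME keyword attributes appertain to the function type, so the only well-formed positions are trailing a function declarator or a function pointer's parameter list, assuming the file is compiled with -target-feature +sme as in the RUN lines above:

    void ok(void) __arm_streaming;               // OK: the keyword trails the function declarator
    void (*ok_ptr)(void) __arm_streaming = &ok;  // OK: the keyword is part of the pointee function type
    // struct __arm_streaming S {};              // error: a class is not a function type

With that rule in mind, the class-specifier diagnostics that follow are the same appertainment failure reported at different syntactic positions.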
-class __arm_streaming __arm_streaming // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} - attr_after_class_name_decl __arm_streaming __arm_streaming; // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} +class ATTR_USE ATTR_USE // expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}} + attr_after_class_name_decl ATTR_USE ATTR_USE; // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}} -class __arm_streaming __arm_streaming // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} - attr_after_class_name_definition __arm_streaming __arm_streaming __arm_streaming{}; // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error 3 {{'__arm_streaming' cannot be applied to a declaration}} +class ATTR_USE ATTR_USE // expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}} + attr_after_class_name_definition ATTR_USE ATTR_USE ATTR_USE{}; // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error 3 {{'ATTR_NAME' only applies to non-K&R-style functions}} -class __arm_streaming c {}; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -class c __arm_streaming __arm_streaming x; // expected-error 2 {{'__arm_streaming' only applies to function types}} -class c __arm_streaming __arm_streaming y __arm_streaming __arm_streaming; // expected-error 4 {{'__arm_streaming' only applies to function types}} +class ATTR_USE c {}; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +class c ATTR_USE ATTR_USE x; // expected-error 2 {{'ATTR_NAME' only applies to function types}} +class c ATTR_USE ATTR_USE y ATTR_USE ATTR_USE; // expected-error 4 {{'ATTR_NAME' only applies to function types}} class c final [(int){0}]; class base {}; -class __arm_streaming __arm_streaming final_class // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} - __arm_streaming alignas(float) final // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} - __arm_streaming alignas(float) __arm_streaming alignas(float): base{}; // expected-error {{'__arm_streaming' cannot appear here}} +class ATTR_USE ATTR_USE final_class // expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}} + ATTR_USE alignas(float) final // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + ATTR_USE alignas(float) ATTR_USE alignas(float): base{}; // expected-error {{'ATTR_NAME' cannot appear here}} -class __arm_streaming __arm_streaming final_class_another // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} - __arm_streaming __arm_streaming alignas(16) final // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} - __arm_streaming __arm_streaming alignas(16) __arm_streaming{}; // expected-error {{'__arm_streaming' cannot appear here}} +class ATTR_USE ATTR_USE final_class_another // expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}} + ATTR_USE ATTR_USE alignas(16) final // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}} + ATTR_USE ATTR_USE alignas(16) ATTR_USE{}; // 
expected-error {{'ATTR_NAME' cannot appear here}} -class after_class_close {} __arm_streaming; // expected-error {{'__arm_streaming' cannot appear here, place it after "class" to apply it to the type declaration}} +class after_class_close {} ATTR_USE; // expected-error {{'ATTR_NAME' cannot appear here, place it after "class" to apply it to the type declaration}} class C {}; -__arm_streaming struct with_init_declarators {} init_declarator; // expected-error {{'__arm_streaming' only applies to function types}} -__arm_streaming struct no_init_declarators; // expected-error {{misplaced '__arm_streaming'}} -template __arm_streaming struct no_init_declarators_template; // expected-error {{'__arm_streaming' cannot appear here}} +ATTR_USE struct with_init_declarators {} init_declarator; // expected-error {{'ATTR_NAME' only applies to function types}} +ATTR_USE struct no_init_declarators; // expected-error {{misplaced 'ATTR_NAME'}} +template ATTR_USE struct no_init_declarators_template; // expected-error {{'ATTR_NAME' cannot appear here}} void fn_with_structs() { - __arm_streaming struct with_init_declarators {} init_declarator; // expected-error {{'__arm_streaming' only applies to function types}} - __arm_streaming struct no_init_declarators; // expected-error {{'__arm_streaming' cannot appear here}} + ATTR_USE struct with_init_declarators {} init_declarator; // expected-error {{'ATTR_NAME' only applies to function types}} + ATTR_USE struct no_init_declarators; // expected-error {{'ATTR_NAME' cannot appear here}} } -__arm_streaming; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +ATTR_USE; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} struct ctordtor { - __arm_streaming ctordtor __arm_streaming () __arm_streaming; // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} - ctordtor (C) __arm_streaming; - __arm_streaming ~ctordtor __arm_streaming () __arm_streaming; // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} + ATTR_USE ctordtor ATTR_USE () ATTR_USE; // expected-error 2 {{'ATTR_NAME' cannot be applied to a declaration}} + ctordtor (C) ATTR_USE; + ATTR_USE ~ctordtor ATTR_USE () ATTR_USE; // expected-error 2 {{'ATTR_NAME' cannot be applied to a declaration}} }; -__arm_streaming ctordtor::ctordtor __arm_streaming () __arm_streaming {} // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} -__arm_streaming ctordtor::ctordtor (C) __arm_streaming try {} catch (...) 
{} // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -__arm_streaming ctordtor::~ctordtor __arm_streaming () __arm_streaming {} // expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} -extern "C++" __arm_streaming int extern_attr; // expected-error {{'__arm_streaming' only applies to function types}} -template __arm_streaming void template_attr (); // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -__arm_streaming __arm_streaming int __arm_streaming __arm_streaming multi_attr __arm_streaming __arm_streaming; // expected-error 6 {{'__arm_streaming' only applies to function types}} - -int (paren_attr) __arm_streaming; // expected-error {{'__arm_streaming' cannot appear here}} -unsigned __arm_streaming int attr_in_decl_spec; // expected-error {{'__arm_streaming' cannot appear here}} -unsigned __arm_streaming int __arm_streaming const double_decl_spec = 0; // expected-error 2 {{'__arm_streaming' cannot appear here}} +ATTR_USE ctordtor::ctordtor ATTR_USE () ATTR_USE {} // expected-error 2 {{'ATTR_NAME' cannot be applied to a declaration}} +ATTR_USE ctordtor::ctordtor (C) ATTR_USE try {} catch (...) {} // expected-error {{'ATTR_NAME' cannot be applied to a declaration}} +ATTR_USE ctordtor::~ctordtor ATTR_USE () ATTR_USE {} // expected-error 2 {{'ATTR_NAME' cannot be applied to a declaration}} +extern "C++" ATTR_USE int extern_attr; // expected-error {{'ATTR_NAME' only applies to function types}} +template ATTR_USE void template_attr (); // expected-error {{'ATTR_NAME' cannot be applied to a declaration}} +ATTR_USE ATTR_USE int ATTR_USE ATTR_USE multi_attr ATTR_USE ATTR_USE; // expected-error 6 {{'ATTR_NAME' only applies to function types}} + +int (paren_attr) ATTR_USE; // expected-error {{'ATTR_NAME' cannot appear here}} +unsigned ATTR_USE int attr_in_decl_spec; // expected-error {{'ATTR_NAME' cannot appear here}} +unsigned ATTR_USE int ATTR_USE const double_decl_spec = 0; // expected-error 2 {{'ATTR_NAME' cannot appear here}} class foo { - void const_after_attr () __arm_streaming const; // expected-error {{expected ';'}} + void const_after_attr () ATTR_USE const; // expected-error {{expected ';'}} }; -extern "C++" __arm_streaming { } // expected-error {{'__arm_streaming' cannot appear here}} -__arm_streaming extern "C++" { } // expected-error {{'__arm_streaming' cannot appear here}} -__arm_streaming template void before_template_attr (); // expected-error {{'__arm_streaming' cannot appear here}} -__arm_streaming namespace ns { int i; } // expected-error {{'__arm_streaming' cannot appear here}} -__arm_streaming static_assert(true, ""); //expected-error {{'__arm_streaming' cannot appear here}} -__arm_streaming asm(""); // expected-error {{'__arm_streaming' cannot appear here}} - -__arm_streaming using ns::i; // expected-warning {{ISO C++}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} -__arm_streaming using namespace ns; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -namespace __arm_streaming ns2 {} // expected-warning {{attributes on a namespace declaration are a C++17 extension}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} - -using __arm_streaming alignas(4)__arm_streaming ns::i; // expected-warning 2 {{ISO C++}} \ - expected-error {{'__arm_streaming' cannot appear here}} \ +extern "C++" ATTR_USE { } // expected-error {{'ATTR_NAME' cannot appear here}} +ATTR_USE extern "C++" { } // expected-error {{'ATTR_NAME' cannot appear here}} 
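Two distinct diagnostics recur throughout this hunk, and it helps to keep them apart (sketch only, not part of the patch; the declarations are invented for the example): 'cannot appear here' is a parse-position error, meaning no attribute of any kind may be written there, while the 'only applies to ...' forms are semantic errors for a syntactically legal position attached to the wrong entity:

    __arm_streaming namespace ns {}  // parse error: the keyword cannot appear here at all
    int i __arm_streaming;           // parses, but is rejected: 'i' is not a function type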
+ATTR_USE template void before_template_attr (); // expected-error {{'ATTR_NAME' cannot appear here}} +ATTR_USE namespace ns { int i; } // expected-error {{'ATTR_NAME' cannot appear here}} +ATTR_USE static_assert(true, ""); //expected-error {{'ATTR_NAME' cannot appear here}} +ATTR_USE asm(""); // expected-error {{'ATTR_NAME' cannot appear here}} + +ATTR_USE using ns::i; // expected-warning {{ISO C++}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +ATTR_USE using namespace ns; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +namespace ATTR_USE ns2 {} // expected-warning {{attributes on a namespace declaration are a C++17 extension}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + +using ATTR_USE alignas(4)ATTR_USE ns::i; // expected-warning 2 {{ISO C++}} \ + expected-error {{'ATTR_NAME' cannot appear here}} \ expected-error {{'alignas' attribute only applies to variables, data members and tag types}} \ expected-warning {{ISO C++}} \ - expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} -using __arm_streaming alignas(4) __arm_streaming foobar = int; // expected-error {{'__arm_streaming' cannot appear here}} \ + expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}} +using ATTR_USE alignas(4) ATTR_USE foobar = int; // expected-error {{'ATTR_NAME' cannot appear here}} \ expected-error {{'alignas' attribute only applies to}} \ - expected-error 2 {{'__arm_streaming' only applies to function types}} - -__arm_streaming using T = int; // expected-error {{'__arm_streaming' cannot appear here}} -using T __arm_streaming = int; // expected-error {{'__arm_streaming' only applies to function types}} -template using U __arm_streaming = T; // expected-error {{'__arm_streaming' only applies to function types}} -using ns::i __arm_streaming; // expected-warning {{ISO C++}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} -using ns::i __arm_streaming, ns::i __arm_streaming; // expected-warning 2 {{ISO C++}} \ + expected-error 2 {{'ATTR_NAME' only applies to function types}} + +ATTR_USE using T = int; // expected-error {{'ATTR_NAME' cannot appear here}} +using T ATTR_USE = int; // expected-error {{'ATTR_NAME' only applies to function types}} +template using U ATTR_USE = T; // expected-error {{'ATTR_NAME' only applies to function types}} +using ns::i ATTR_USE; // expected-warning {{ISO C++}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +using ns::i ATTR_USE, ns::i ATTR_USE; // expected-warning 2 {{ISO C++}} \ expected-warning {{use of multiple declarators in a single using declaration is a C++17 extension}} \ - expected-error 2 {{'__arm_streaming' cannot be applied to a declaration}} + expected-error 2 {{'ATTR_NAME' only applies to non-K&R-style functions}} struct using_in_struct_base { typedef int i, j, k, l; }; struct using_in_struct : using_in_struct_base { - __arm_streaming using using_in_struct_base::i; // expected-warning {{ISO C++}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} - using using_in_struct_base::j __arm_streaming; // expected-warning {{ISO C++}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} - __arm_streaming using using_in_struct_base::k __arm_streaming, using_in_struct_base::l __arm_streaming; // expected-warning 3 {{ISO C++}} \ + ATTR_USE using using_in_struct_base::i; // expected-warning {{ISO C++}} \ + expected-error {{'ATTR_NAME' only applies to 
non-K&R-style functions}} + using using_in_struct_base::j ATTR_USE; // expected-warning {{ISO C++}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + ATTR_USE using using_in_struct_base::k ATTR_USE, using_in_struct_base::l ATTR_USE; // expected-warning 3 {{ISO C++}} \ expected-warning {{use of multiple declarators in a single using declaration is a C++17 extension}} \ - expected-error 4 {{'__arm_streaming' cannot be applied to a declaration}} + expected-error 4 {{'ATTR_NAME' only applies to non-K&R-style functions}} }; -using __arm_streaming ns::i; // expected-warning {{ISO C++}} \ - expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} -using T __arm_streaming = int; // expected-error {{'__arm_streaming' only applies to function types}} +using ATTR_USE ns::i; // expected-warning {{ISO C++}} \ + expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +using T ATTR_USE = int; // expected-error {{'ATTR_NAME' only applies to function types}} -auto trailing() -> __arm_streaming const int; // expected-error {{'__arm_streaming' cannot appear here}} -auto trailing() -> const __arm_streaming int; // expected-error {{'__arm_streaming' cannot appear here}} -auto trailing() -> const int __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} -auto trailing_2() -> struct struct_attr __arm_streaming; // expected-error {{'__arm_streaming' only applies to function types}} +auto trailing() -> ATTR_USE const int; // expected-error {{'ATTR_NAME' cannot appear here}} +auto trailing() -> const ATTR_USE int; // expected-error {{'ATTR_NAME' cannot appear here}} +auto trailing() -> const int ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}} +auto trailing_2() -> struct struct_attr ATTR_USE; // expected-error {{'ATTR_NAME' only applies to function types}} namespace N { struct S {}; @@ -172,88 +175,88 @@ namespace N { template struct Template {}; // FIXME: Improve this diagnostic -struct __arm_streaming N::S s; // expected-error {{'__arm_streaming' cannot appear here}} -struct __arm_streaming Template t; // expected-error {{'__arm_streaming' cannot appear here}} -struct __arm_streaming ::template Template u; // expected-error {{'__arm_streaming' cannot appear here}} -template struct __arm_streaming Template; // expected-error {{'__arm_streaming' cannot appear here}} +struct ATTR_USE N::S s; // expected-error {{'ATTR_NAME' cannot appear here}} +struct ATTR_USE Template t; // expected-error {{'ATTR_NAME' cannot appear here}} +struct ATTR_USE ::template Template u; // expected-error {{'ATTR_NAME' cannot appear here}} +template struct ATTR_USE Template; // expected-error {{'ATTR_NAME' cannot appear here}} template struct __attribute__((pure)) Template; // We still allow GNU-style attributes here -template <> struct __arm_streaming Template; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} - -enum __arm_streaming E1 {}; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -enum __arm_streaming E2; // expected-error {{forbids forward references}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} -enum __arm_streaming E1; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -enum __arm_streaming E3 : int; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -enum __arm_streaming 
{ // expected-error {{'__arm_streaming' cannot be applied to a declaration}} - k_123 __arm_streaming = 123 // expected-warning {{attributes on an enumerator declaration are a C++17 extension}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} +template <> struct ATTR_USE Template; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + +enum ATTR_USE E1 {}; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +enum ATTR_USE E2; // expected-error {{forbids forward references}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +enum ATTR_USE E1; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +enum ATTR_USE E3 : int; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +enum ATTR_USE { // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} + k_123 ATTR_USE = 123 // expected-warning {{attributes on an enumerator declaration are a C++17 extension}} \ + expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} }; -enum __arm_streaming E1 e; // expected-error {{'__arm_streaming' cannot appear here}} -enum __arm_streaming class E4 { }; // expected-error {{'__arm_streaming' cannot appear here}} -enum struct __arm_streaming E5; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} -enum E6 {} __arm_streaming; // expected-error {{'__arm_streaming' cannot appear here, place it after "enum" to apply it to the type declaration}} +enum ATTR_USE E1 e; // expected-error {{'ATTR_NAME' cannot appear here}} +enum ATTR_USE class E4 { }; // expected-error {{'ATTR_NAME' cannot appear here}} +enum struct ATTR_USE E5; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} +enum E6 {} ATTR_USE; // expected-error {{'ATTR_NAME' cannot appear here, place it after "enum" to apply it to the type declaration}} struct S { - friend int f __arm_streaming (); // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} - friend int f2 __arm_streaming () {} // expected-error {{'__arm_streaming' cannot be applied to a declaration}} - __arm_streaming friend int g(); // expected-error {{'__arm_streaming' cannot appear here}} - __arm_streaming friend int h() { // expected-error {{'__arm_streaming' cannot be applied to a declaration}} + friend int f ATTR_USE (); // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error {{'ATTR_NAME' cannot be applied to a declaration}} + friend int f2 ATTR_USE () {} // expected-error {{'ATTR_NAME' cannot be applied to a declaration}} + ATTR_USE friend int g(); // expected-error {{'ATTR_NAME' cannot appear here}} + ATTR_USE friend int h() { // expected-error {{'ATTR_NAME' cannot be applied to a declaration}} } - __arm_streaming friend int f3(), f4(), f5(); // expected-error {{'__arm_streaming' cannot appear here}} - friend int f6 __arm_streaming (), f7 __arm_streaming (), f8 __arm_streaming (); // expected-error3 {{'__arm_streaming' cannot appear here}} \ - expected-error 3 {{'__arm_streaming' cannot be applied to a declaration}} - friend class __arm_streaming C; // expected-error {{'__arm_streaming' cannot appear here}} - __arm_streaming friend class D; // expected-error {{'__arm_streaming' cannot appear here}} - __arm_streaming friend int; // expected-error {{'__arm_streaming' cannot appear here}} + ATTR_USE friend int f3(), f4(), f5(); // expected-error {{'ATTR_NAME' cannot appear here}} + friend int 
f6 ATTR_USE (), f7 ATTR_USE (), f8 ATTR_USE (); // expected-error3 {{'ATTR_NAME' cannot appear here}} \ + expected-error 3 {{'ATTR_NAME' cannot be applied to a declaration}} + friend class ATTR_USE C; // expected-error {{'ATTR_NAME' cannot appear here}} + ATTR_USE friend class D; // expected-error {{'ATTR_NAME' cannot appear here}} + ATTR_USE friend int; // expected-error {{'ATTR_NAME' cannot appear here}} }; template void tmpl (T) {} -template __arm_streaming void tmpl(char); // expected-error {{'__arm_streaming' cannot appear here}} -template void __arm_streaming tmpl(short); // expected-error {{'__arm_streaming' only applies to function types}} +template ATTR_USE void tmpl(char); // expected-error {{'ATTR_NAME' cannot appear here}} +template void ATTR_USE tmpl(short); // expected-error {{'ATTR_NAME' only applies to function types}} // Statement tests void foo () { - __arm_streaming ; // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming { } // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming if (0) { } // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming for (;;); // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming do { // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming continue; // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE ; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE { } // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE if (0) { } // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE for (;;); // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE do { // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE continue; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} } while (0); - __arm_streaming while (0); // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE while (0); // expected-error {{'ATTR_NAME' cannot be applied to a statement}} - __arm_streaming switch (i) { // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming case 0: // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming default: // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming break; // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE switch (i) { // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE case 0: // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE default: // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE break; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} } - __arm_streaming goto there; // expected-error {{'__arm_streaming' cannot be applied to a statement}} - __arm_streaming there: // expected-error {{'__arm_streaming' cannot be applied to a declaration}} + ATTR_USE goto there; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + ATTR_USE there: // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} - __arm_streaming try { // expected-error {{'__arm_streaming' cannot be applied to a statement}} - } __arm_streaming catch (...) 
{ // expected-error {{'__arm_streaming' cannot appear here}} + ATTR_USE try { // expected-error {{'ATTR_NAME' cannot be applied to a statement}} + } ATTR_USE catch (...) { // expected-error {{'ATTR_NAME' cannot appear here}} } - void bar __arm_streaming (__arm_streaming int i, __arm_streaming int j); // expected-error 2 {{'__arm_streaming' only applies to function types}} \ - expected-error {{'__arm_streaming' cannot be applied to a declaration}} - using FuncType = void (__arm_streaming int); // expected-error {{'__arm_streaming' only applies to function types}} - void baz(__arm_streaming...); // expected-error {{expected parameter declarator}} + void bar ATTR_USE (ATTR_USE int i, ATTR_USE int j); // expected-error 2 {{'ATTR_NAME' only applies to function types}} \ + expected-error {{'ATTR_NAME' cannot be applied to a declaration}} + using FuncType = void (ATTR_USE int); // expected-error {{'ATTR_NAME' only applies to function types}} + void baz(ATTR_USE...); // expected-error {{expected parameter declarator}} - __arm_streaming return; // expected-error {{'__arm_streaming' cannot be applied to a statement}} + ATTR_USE return; // expected-error {{'ATTR_NAME' cannot be applied to a statement}} } // Expression tests void bar () { - new int[42]__arm_streaming[5]__arm_streaming{}; // expected-error {{'__arm_streaming' only applies to function types}} + new int[42]ATTR_USE[5]ATTR_USE{}; // expected-error {{'ATTR_NAME' only applies to function types}} } // Condition tests void baz () { - if (__arm_streaming bool b = true) { // expected-error {{'__arm_streaming' only applies to function types}} - switch (__arm_streaming int n { 42 }) { // expected-error {{'__arm_streaming' only applies to function types}} + if (ATTR_USE bool b = true) { // expected-error {{'ATTR_NAME' only applies to function types}} + switch (ATTR_USE int n { 42 }) { // expected-error {{'ATTR_NAME' only applies to function types}} default: - for (__arm_streaming int n = 0; __arm_streaming char b = n < 5; ++b) { // expected-error 2 {{'__arm_streaming' only applies to function types}} + for (ATTR_USE int n = 0; ATTR_USE char b = n < 5; ++b) { // expected-error 2 {{'ATTR_NAME' only applies to function types}} } } } @@ -261,37 +264,37 @@ void baz () { // An attribute can be applied to an expression-statement, such as the first // statement in a for. But it can't be applied to a condition which is an // expression. 
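As a minimal illustration of the comment above, using the standard [[ ]] syntax from which this keyword test was derived (not part of the patch; attr_positions and x are invented names): an attribute-specifier may precede the init-statement of a for, because that position is a statement, but not the condition, which is an expression. The keyword attributes in the lines that follow are additionally rejected even in the init-statement position, since they are not statement attributes at all:

    void attr_positions(int x) {
      for ([[]] x = 0; ; ) { break; }  // OK: the init-statement is a statement
      // for (; [[]] x < 5; ) {}       // error: an attribute list cannot appear here
    }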
- for (__arm_streaming x = 0; ; ) {} // expected-error {{'__arm_streaming' cannot appear here}} - for (; __arm_streaming x < 5; ) {} // expected-error {{'__arm_streaming' cannot appear here}} - while (__arm_streaming bool k { false }) { // expected-error {{'__arm_streaming' only applies to function types}} + for (ATTR_USE x = 0; ; ) {} // expected-error {{'ATTR_NAME' cannot appear here}} + for (; ATTR_USE x < 5; ) {} // expected-error {{'ATTR_NAME' cannot appear here}} + while (ATTR_USE bool k { false }) { // expected-error {{'ATTR_NAME' only applies to function types}} } - while (__arm_streaming true) { // expected-error {{'__arm_streaming' cannot appear here}} + while (ATTR_USE true) { // expected-error {{'ATTR_NAME' cannot appear here}} } do { - } while (__arm_streaming false); // expected-error {{'__arm_streaming' cannot appear here}} + } while (ATTR_USE false); // expected-error {{'ATTR_NAME' cannot appear here}} - for (__arm_streaming int n : { 1, 2, 3 }) { // expected-error {{'__arm_streaming' only applies to function types}} + for (ATTR_USE int n : { 1, 2, 3 }) { // expected-error {{'ATTR_NAME' only applies to function types}} } } enum class __attribute__((visibility("hidden"))) SecretKeepers { one, /* rest are deprecated */ two, three }; -enum class __arm_streaming EvenMoreSecrets {}; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +enum class ATTR_USE EvenMoreSecrets {}; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} // Forbid attributes on decl specifiers. -unsigned __arm_streaming static int __arm_streaming v1; // expected-error {{'__arm_streaming' only applies to function types}} \ - expected-error {{'__arm_streaming' cannot appear here}} -typedef __arm_streaming unsigned long __arm_streaming v2; // expected-error {{'__arm_streaming' only applies to function types}} \ - expected-error {{'__arm_streaming' cannot appear here}} -int __arm_streaming foo(int __arm_streaming x); // expected-error 2 {{'__arm_streaming' only applies to function types}} +unsigned ATTR_USE static int ATTR_USE v1; // expected-error {{'ATTR_NAME' only applies to function types}} \ + expected-error {{'ATTR_NAME' cannot appear here}} +typedef ATTR_USE unsigned long ATTR_USE v2; // expected-error {{'ATTR_NAME' only applies to function types}} \ + expected-error {{'ATTR_NAME' cannot appear here}} +int ATTR_USE foo(int ATTR_USE x); // expected-error 2 {{'ATTR_NAME' only applies to function types}} -__arm_streaming; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} +ATTR_USE; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} class A { - A(__arm_streaming int a); // expected-error {{'__arm_streaming' only applies to function types}} + A(ATTR_USE int a); // expected-error {{'ATTR_NAME' only applies to function types}} }; -A::A(__arm_streaming int a) {} // expected-error {{'__arm_streaming' only applies to function types}} +A::A(ATTR_USE int a) {} // expected-error {{'ATTR_NAME' only applies to function types}} template struct TemplateStruct {}; class FriendClassesWithAttributes { @@ -299,47 +302,47 @@ class FriendClassesWithAttributes { template friend class __attribute__((__type_visibility__("default"))) vector; template friend class __declspec(code_seg("foo,whatever")) vector2; // But not C++11 ones - template friend class __arm_streaming vector3; // expected-error {{'__arm_streaming' cannot appear here}} + template friend class ATTR_USE vector3; // expected-error {{'ATTR_NAME' cannot appear here}} // Also 
allowed friend struct __attribute__((__type_visibility__("default"))) TemplateStruct; friend struct __declspec(code_seg("foo,whatever")) TemplateStruct; - friend struct __arm_streaming TemplateStruct; // expected-error {{'__arm_streaming' cannot appear here}} + friend struct ATTR_USE TemplateStruct; // expected-error {{'ATTR_NAME' cannot appear here}} }; // Check ordering: C++11 attributes must appear before GNU attributes. class Ordering { void f1( - int (__arm_streaming __attribute__(()) int n) // expected-error {{'__arm_streaming' only applies to function types}} + int (ATTR_USE __attribute__(()) int n) // expected-error {{'ATTR_NAME' only applies to function types}} ) { } void f2( - int (*)(__arm_streaming __attribute__(()) int n) // expected-error {{'__arm_streaming' only applies to function types}} + int (*)(ATTR_USE __attribute__(()) int n) // expected-error {{'ATTR_NAME' only applies to function types}} ) { } void f3( - int (__attribute__(()) __arm_streaming int n) // expected-error {{'__arm_streaming' cannot appear here}} + int (__attribute__(()) ATTR_USE int n) // expected-error {{'ATTR_NAME' cannot appear here}} ) { } void f4( - int (*)(__attribute__(()) __arm_streaming int n) // expected-error {{'__arm_streaming' cannot appear here}} + int (*)(__attribute__(()) ATTR_USE int n) // expected-error {{'ATTR_NAME' cannot appear here}} ) { } }; namespace base_specs { struct A {}; -struct B : __arm_streaming A {}; // expected-error {{'__arm_streaming' cannot be applied to a base specifier}} -struct C : __arm_streaming virtual A {}; // expected-error {{'__arm_streaming' cannot be applied to a base specifier}} -struct D : __arm_streaming public virtual A {}; // expected-error {{'__arm_streaming' cannot be applied to a base specifier}} -struct E : public __arm_streaming virtual A {}; // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' cannot be applied to a base specifier}} -struct F : virtual __arm_streaming public A {}; // expected-error {{'__arm_streaming' cannot appear here}} \ - expected-error {{'__arm_streaming' cannot be applied to a base specifier}} +struct B : ATTR_USE A {}; // expected-error {{'ATTR_NAME' cannot be applied to a base specifier}} +struct C : ATTR_USE virtual A {}; // expected-error {{'ATTR_NAME' cannot be applied to a base specifier}} +struct D : ATTR_USE public virtual A {}; // expected-error {{'ATTR_NAME' cannot be applied to a base specifier}} +struct E : public ATTR_USE virtual A {}; // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error {{'ATTR_NAME' cannot be applied to a base specifier}} +struct F : virtual ATTR_USE public A {}; // expected-error {{'ATTR_NAME' cannot appear here}} \ + expected-error {{'ATTR_NAME' cannot be applied to a base specifier}} } -namespace __arm_streaming ns_attr {}; // expected-error {{'__arm_streaming' cannot be applied to a declaration}} \ +namespace ATTR_USE ns_attr {}; // expected-error {{'ATTR_NAME' only applies to non-K&R-style functions}} \ expected-warning {{attributes on a namespace declaration are a C++17 extension}} diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index 7f2b353ab18c0d009fdfb2041fffc69563ad5baa..703294ef13bb08b74603a29d34aa1443977ea75d 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -58,6 +58,10 @@ // CHECK-NOT: __ARM_FEATURE_SVE_BITS 512 // CHECK-NOT: __ARM_FEATURE_SVE_BITS 1024 // CHECK-NOT: 
__ARM_FEATURE_SVE_BITS 2048 +// CHECK: __ARM_STATE_ZA 1 +// CHECK: __ARM_STATE_ZT0 1 +// CHECK-NOT: __ARM_FEATURE_SME +// CHECK-NOT: __ARM_FEATURE_SME2 // RUN: %clang -target aarch64-none-elf -march=armv8-r -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-R-PROFILE // RUN: %clang -target arm64-none-linux-gnu -march=armv8-r -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-R-PROFILE @@ -616,3 +620,12 @@ // RUN: %clang --target=aarch64 -march=armv8.2-a+rcpc3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-RCPC3 %s // CHECK-RCPC3: __ARM_FEATURE_RCPC 3 + +// RUN: %clang --target=aarch64 -march=armv9-a+sme -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME %s +// CHECK-SME: __ARM_FEATURE_LOCALLY_STREAMING 1 +// CHECK-SME: __ARM_FEATURE_SME 1 +// +// RUN: %clang --target=aarch64 -march=armv9-a+sme2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SME2 %s +// CHECK-SME2: __ARM_FEATURE_LOCALLY_STREAMING 1 +// CHECK-SME2: __ARM_FEATURE_SME 1 +// CHECK-SME2: __ARM_FEATURE_SME2 1 diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c index 2b7cc57f230333386feb3231a0eb4718aacf94f5..6cc9cb22c7af2cc64f015bb2c365586d257197a0 100644 --- a/clang/test/Preprocessor/init-aarch64.c +++ b/clang/test/Preprocessor/init-aarch64.c @@ -32,6 +32,8 @@ // AARCH64-NEXT: #define __ARM_PCS_AAPCS64 1 // AARCH64-NEXT: #define __ARM_SIZEOF_MINIMAL_ENUM 4 // AARCH64-NEXT: #define __ARM_SIZEOF_WCHAR_T 4 +// AARCH64-NEXT: #define __ARM_STATE_ZA 1 +// AARCH64-NEXT: #define __ARM_STATE_ZT0 1 // AARCH64-NEXT: #define __ATOMIC_ACQUIRE 2 // AARCH64-NEXT: #define __ATOMIC_ACQ_REL 4 // AARCH64-NEXT: #define __ATOMIC_CONSUME 1 diff --git a/clang/test/Sema/MicrosoftExtensions.c b/clang/test/Sema/MicrosoftExtensions.c index 50077d90314886568f80a7c984a27db5d8cb47eb..cf7463d2e76ebc2d43ff2d7b874832a2b416cb90 100644 --- a/clang/test/Sema/MicrosoftExtensions.c +++ b/clang/test/Sema/MicrosoftExtensions.c @@ -123,7 +123,7 @@ struct __declspec(deprecated) DS1 { int i; float f; }; // expected-note {{'DS1' #define MY_TEXT "This is also deprecated" __declspec(deprecated(MY_TEXT)) void Dfunc1( void ) {} // expected-note {{'Dfunc1' has been explicitly marked deprecated here}} -struct __declspec(deprecated(123)) DS2 {}; // expected-error {{'deprecated' attribute requires a string}} +struct __declspec(deprecated(123)) DS2 {}; // expected-error {{expected string literal as argument of 'deprecated' attribute}} void test( void ) { e1 = one; // expected-warning {{'e1' is deprecated: This is deprecated}} diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c new file mode 100644 index 0000000000000000000000000000000000000000..079cff5a5bbae95d495c311dfaf59f95c59b3012 --- /dev/null +++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c @@ -0,0 +1,110 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \ +// RUN: -target-feature +sme -target-feature +sve2 -target-feature +neon -fsyntax-only -verify %s + +// REQUIRES: aarch64-registered-target + +#include "arm_neon.h" +#include "arm_sme.h" +#include "arm_sve.h" + +int16x8_t incompat_neon_sm(int16x8_t splat) __arm_streaming { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} + return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33); +} + +__arm_locally_streaming int16x8_t incompat_neon_ls(int16x8_t splat) 
{ + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} + return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33); +} + +int16x8_t incompat_neon_smc(int16x8_t splat) __arm_streaming_compatible { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}} + return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)splat, (int8x16_t)splat, 33); +} + +void incompat_sme_smc(svbool_t pg, void const *ptr) __arm_streaming_compatible __arm_inout("za") { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}} + return __builtin_sme_svld1_hor_za128(0, 0, pg, ptr); +} + +svuint32_t incompat_sve_sm(svbool_t pg, svuint32_t a, int16_t b) __arm_streaming { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} + return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); +} + +__arm_locally_streaming svuint32_t incompat_sve_ls(svbool_t pg, svuint32_t a, int64_t b) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} + return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); +} + +svuint32_t incompat_sve_smc(svbool_t pg, svuint32_t a, int64_t b) __arm_streaming_compatible { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}} + return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); +} + +svuint32_t incompat_sve2_sm(svbool_t pg, svuint32_t a, int64_t b) __arm_streaming { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} + return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); +} + +__arm_locally_streaming svuint32_t incompat_sve2_ls(svbool_t pg, svuint32_t a, int64_t b) { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} + return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); +} + +svuint32_t incompat_sve2_smc(svbool_t pg, svuint32_t a, int64_t b) __arm_streaming_compatible { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming compatible function}} + return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); +} + +void incompat_sme_sm(svbool_t pn, svbool_t pm, svfloat32_t zn, svfloat32_t zm) __arm_inout("za") { + // expected-warning@+1 {{builtin call has undefined behaviour when called from a non-streaming function}} + svmops_za32_f32_m(0, pn, pm, zn, zm); +} + +svfloat64_t streaming_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) __arm_streaming { + // expected-no-warning + return svadd_n_f64_m(pg, a, b); +} + +__arm_locally_streaming svfloat64_t locally_streaming_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) { + // expected-no-warning + return svadd_n_f64_m(pg, a, b); +} + +svfloat64_t streaming_compatible_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) __arm_streaming_compatible { + // expected-no-warning + return svadd_n_f64_m(pg, a, b); +} + +svint16_t streaming_caller_sve2(svint16_t op1, svint16_t op2) __arm_streaming { + // expected-no-warning + return svmul_lane_s16(op1, op2, 0); +} + +__arm_locally_streaming svint16_t locally_streaming_caller_sve2(svint16_t op1, svint16_t op2) { + // expected-no-warning + return svmul_lane_s16(op1, op2, 0); +} + +svint16_t streaming_compatible_caller_sve2(svint16_t op1, svint16_t op2) __arm_streaming_compatible { + // 
expected-no-warning + return svmul_lane_s16(op1, op2, 0); } + +svbool_t streaming_caller_ptrue(void) __arm_streaming { + // expected-no-warning + return svand_z(svptrue_b16(), svptrue_pat_b16(SV_ALL), svptrue_pat_b16(SV_VL4)); } + +svint8_t missing_za(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { + // expected-warning@+1 {{builtin call is not valid when calling from a function without active ZA state}} + return svread_hor_za8_s8_m(zd, pg, 0, slice_base); } + +__arm_new("za") +svint8_t new_za(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { + // expected-no-warning + return svread_hor_za8_s8_m(zd, pg, 0, slice_base); } diff --git a/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0a54a94f408b79521cdef2ad9a9146b068a2f6de --- /dev/null +++ b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp @@ -0,0 +1,48 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -fsyntax-only -verify %s + +// This test checks the diagnostics that Clang emits when compiling without '+sme'. + +void streaming_compatible_def() __arm_streaming_compatible {} // OK +void streaming_def() __arm_streaming { } // expected-error {{function executed in streaming-SVE mode requires 'sme'}} +void shared_za_def() __arm_inout("za") { } // expected-error {{function using ZA state requires 'sme'}} +__arm_new("za") void new_za_def() { } // expected-error {{function using ZA state requires 'sme'}} +__arm_locally_streaming void locally_streaming_def() { } // expected-error {{function executed in streaming-SVE mode requires 'sme'}} +void streaming_shared_za_def() __arm_streaming __arm_inout("za") { } // expected-error {{function executed in streaming-SVE mode requires 'sme'}} + +// It should work fine when we explicitly add the target("sme") attribute. +__attribute__((target("sme"))) void streaming_compatible_def_sme_attr() __arm_streaming_compatible {} // OK +__attribute__((target("sme"))) void streaming_def_sme_attr() __arm_streaming { } // OK +__attribute__((target("sme"))) void shared_za_def_sme_attr() __arm_inout("za") { } // OK +__arm_new("za") __attribute__((target("sme"))) void new_za_def_sme_attr() {} // OK +__arm_locally_streaming __attribute__((target("sme"))) void locally_streaming_def_sme_attr() {} // OK + +// Test that it also works with the target("sme2") attribute. +__attribute__((target("sme2"))) void streaming_def_sme2_attr() __arm_streaming { } // OK + +// No code is generated for declarations, so it is fine to declare functions with these attributes.
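Put concretely (a minimal sketch of the rule above, assuming a compile without '+sme'; decl_only, def and caller are invented names): the attribute keywords are accepted on a bare declaration, and the diagnostics fire only where code would have to be generated, that is, at a definition or at a call site:

    void decl_only(void) __arm_streaming;  // OK: no code is generated for a declaration
    // void def(void) __arm_streaming {}   // error: function executed in streaming-SVE mode requires 'sme'
    void caller(void) {
      // decl_only();                      // error: call to a streaming function requires 'sme'
    }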
+void streaming_compatible_decl() __arm_streaming_compatible; // OK +void streaming_decl() __arm_streaming; // OK +void shared_za_decl() __arm_inout("za"); // OK + +void non_streaming_decl(); +void non_streaming_def(void (*streaming_fn_ptr)(void) __arm_streaming, + void (*streaming_compatible_fn_ptr)(void) __arm_streaming_compatible) { + streaming_compatible_decl(); // OK + streaming_compatible_fn_ptr(); // OK + streaming_decl(); // expected-error {{call to a streaming function requires 'sme'}} + streaming_fn_ptr(); // expected-error {{call to a streaming function requires 'sme'}} +} + +void streaming_compatible_def2(void (*streaming_fn_ptr)(void) __arm_streaming, + void (*streaming_compatible_fn_ptr)(void) __arm_streaming_compatible) + __arm_streaming_compatible { + non_streaming_decl(); // OK + streaming_compatible_decl(); // OK + streaming_compatible_fn_ptr(); // OK + streaming_decl(); // expected-error {{call to a streaming function requires 'sme'}} + streaming_fn_ptr(); // expected-error {{call to a streaming function requires 'sme'}} +} + +// Also test when call-site is not a function. +int streaming_decl_ret_int() __arm_streaming; +int x = streaming_decl_ret_int(); // expected-error {{call to a streaming function requires 'sme'}} diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c b/clang/test/Sema/aarch64-sme-func-attrs.c new file mode 100644 index 0000000000000000000000000000000000000000..b986b0b3de2e14d265ffe9883368a57e27056344 --- /dev/null +++ b/clang/test/Sema/aarch64-sme-func-attrs.c @@ -0,0 +1,402 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify=expected-cpp -x c++ %s + +// Valid attributes + +void sme_arm_streaming(void) __arm_streaming; +void sme_arm_streaming_compatible(void) __arm_streaming_compatible; + +__arm_new("za") void sme_arm_new_za(void) {} +void sme_arm_shared_za(void) __arm_inout("za"); +void sme_arm_preserves_za(void) __arm_preserves("za"); + +__arm_new("za") void sme_arm_streaming_new_za(void) __arm_streaming {} +void sme_arm_streaming_shared_za(void) __arm_streaming __arm_inout("za"); +void sme_arm_streaming_preserves_za(void) __arm_streaming __arm_preserves("za"); + +__arm_new("za") void sme_arm_sc_new_za(void) __arm_streaming_compatible {} +void sme_arm_sc_shared_za(void) __arm_streaming_compatible __arm_inout("za"); +void sme_arm_sc_preserves_za(void) __arm_streaming_compatible __arm_preserves("za"); + +__arm_locally_streaming void sme_arm_locally_streaming(void) { } +__arm_locally_streaming void sme_arm_streaming_and_locally_streaming(void) __arm_streaming { } +__arm_locally_streaming void sme_arm_streaming_and_streaming_compatible(void) __arm_streaming_compatible { } + +__arm_locally_streaming __arm_new("za") void sme_arm_ls_new_za(void) { } +__arm_locally_streaming void sme_arm_ls_shared_za(void) __arm_inout("za") { } +__arm_locally_streaming void sme_arm_ls_preserves_za(void) __arm_preserves("za") { } + +// Valid attributes on function pointers + +void streaming_ptr(void) __arm_streaming; +typedef void (*fptrty1) (void) __arm_streaming; +fptrty1 call_streaming_func() { return streaming_ptr; } + +void streaming_compatible_ptr(void) __arm_streaming_compatible; +typedef void (*fptrty2) (void) __arm_streaming_compatible; +fptrty2 call_sc_func() { return streaming_compatible_ptr; } + +void shared_za_ptr(void) __arm_inout("za"); +typedef void (*fptrty3) (void) __arm_inout("za"); +fptrty3 call_shared_za_func() { 
return shared_za_ptr; } + +void preserves_za_ptr(void) __arm_preserves("za"); +typedef void (*fptrty4) (void) __arm_preserves("za"); +fptrty4 call_preserve_za_func() { return preserves_za_ptr; } + +typedef void (*fptrty6) (void); +fptrty6 cast_nza_func_to_normal() { return sme_arm_new_za; } +fptrty6 cast_ls_func_to_normal() { return sme_arm_locally_streaming; } + +// Invalid attributes + +// expected-cpp-error@+4 {{'__arm_streaming_compatible' and '__arm_streaming' are not compatible}} +// expected-cpp-note@+3 {{conflicting attribute is here}} +// expected-error@+2 {{'__arm_streaming_compatible' and '__arm_streaming' are not compatible}} +// expected-note@+1 {{conflicting attribute is here}} +void streaming_mode(void) __arm_streaming __arm_streaming_compatible; + +// expected-cpp-error@+4 {{'__arm_streaming' and '__arm_streaming_compatible' are not compatible}} +// expected-cpp-note@+3 {{conflicting attribute is here}} +// expected-error@+2 {{'__arm_streaming' and '__arm_streaming_compatible' are not compatible}} +// expected-note@+1 {{conflicting attribute is here}} +void streaming_compatible(void) __arm_streaming_compatible __arm_streaming; + +// expected-cpp-error@+2 {{'__arm_new("za")' and '__arm_inout("za")' are not compatible}} +// expected-error@+1 {{'__arm_new("za")' and '__arm_inout("za")' are not compatible}} +__arm_new("za") void new_shared_za(void) __arm_inout("za") {} + +// expected-cpp-error@+2 {{'__arm_new("za")' and '__arm_preserves("za")' are not compatible}} +// expected-error@+1 {{'__arm_new("za")' and '__arm_preserves("za")' are not compatible}} +__arm_new("za") void new_preserves_za(void) __arm_preserves("za") {} + +// Invalid attributes on function pointers + +// expected-cpp-error@+4 {{'__arm_streaming_compatible' and '__arm_streaming' are not compatible}} +// expected-cpp-note@+3 {{conflicting attribute is here}} +// expected-error@+2 {{'__arm_streaming_compatible' and '__arm_streaming' are not compatible}} +// expected-note@+1 {{conflicting attribute is here}} +void streaming_ptr_invalid(void) __arm_streaming __arm_streaming_compatible; +// expected-cpp-error@+4 {{'__arm_streaming_compatible' and '__arm_streaming' are not compatible}} +// expected-cpp-note@+3 {{conflicting attribute is here}} +// expected-error@+2 {{'__arm_streaming_compatible' and '__arm_streaming' are not compatible}} +// expected-note@+1 {{conflicting attribute is here}} +typedef void (*fptrty7) (void) __arm_streaming __arm_streaming_compatible; +fptrty7 invalid_streaming_func() { return streaming_ptr_invalid; } + +// expected-warning@+2 {{'__arm_streaming' only applies to non-K&R-style functions}} +// expected-error@+1 {{'__arm_streaming' only applies to function types; type here is 'void ()'}} +void function_no_prototype() __arm_streaming; + +// +// Check for incorrect conversions of function pointers with the attributes +// + +typedef void (*n_ptrty) (void); +typedef void (*s_ptrty) (void) __arm_streaming; +s_ptrty return_valid_streaming_fptr(s_ptrty f) { return f; } + +// expected-cpp-error@+2 {{cannot initialize return object of type 's_ptrty' (aka 'void (*)() __arm_streaming') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}} +// expected-error@+1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 's_ptrty' (aka 'void (*)(void) __arm_streaming')}} +s_ptrty return_invalid_fptr_streaming_normal(n_ptrty f) { return f; } +// expected-cpp-error@+2 {{cannot initialize return object of type 'n_ptrty' (aka 'void (*)()') with an 
lvalue of type 's_ptrty' (aka 'void (*)() __arm_streaming')}} +// expected-error@+1 {{incompatible function pointer types returning 's_ptrty' (aka 'void (*)(void) __arm_streaming') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}} +n_ptrty return_invalid_fptr_normal_streaming(s_ptrty f) { return f; } + +// Test an instance where the result type is not a prototyped function, such that we still get a diagnostic. +typedef void (*nonproto_n_ptrty) (); +// expected-cpp-error@+2 {{cannot initialize return object of type 'nonproto_n_ptrty' (aka 'void (*)()') with an lvalue of type 's_ptrty' (aka 'void (*)() __arm_streaming')}} +// expected-error@+1 {{incompatible function pointer types returning 's_ptrty' (aka 'void (*)(void) __arm_streaming') from a function with result type 'nonproto_n_ptrty' (aka 'void (*)()')}} +nonproto_n_ptrty return_invalid_fptr_streaming_nonprotonormal(s_ptrty f) { return f; } + +typedef void (*sc_ptrty) (void) __arm_streaming_compatible; +sc_ptrty return_valid_streaming_compatible_fptr(sc_ptrty f) { return f; } + +// expected-cpp-error@+2 {{cannot initialize return object of type 'sc_ptrty' (aka 'void (*)() __arm_streaming_compatible') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}} +// expected-error@+1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'sc_ptrty' (aka 'void (*)(void) __arm_streaming_compatible')}} +sc_ptrty return_invalid_fptr_streaming_compatible_normal(n_ptrty f) { return f; } +// expected-cpp-error@+2 {{cannot initialize return object of type 'n_ptrty' (aka 'void (*)()') with an lvalue of type 'sc_ptrty' (aka 'void (*)() __arm_streaming_compatible')}} +// expected-error@+1 {{incompatible function pointer types returning 'sc_ptrty' (aka 'void (*)(void) __arm_streaming_compatible') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}} +n_ptrty return_invalid_fptr_normal_streaming_compatible(sc_ptrty f) { return f; } + +typedef void (*sz_ptrty) (void) __arm_inout("za"); +sz_ptrty return_valid_shared_za_fptr(sz_ptrty f) { return f; } + + +// expected-cpp-error@+2 {{cannot initialize return object of type 'sz_ptrty' (aka 'void (*)() __arm_inout("za")') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}} +// expected-error@+1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'sz_ptrty' (aka 'void (*)(void) __arm_inout("za")')}} +sz_ptrty return_invalid_fptr_shared_za_normal(n_ptrty f) { return f; } +// expected-cpp-error@+2 {{cannot initialize return object of type 'n_ptrty' (aka 'void (*)()') with an lvalue of type 'sz_ptrty' (aka 'void (*)() __arm_inout("za")')}} +// expected-error@+1 {{incompatible function pointer types returning 'sz_ptrty' (aka 'void (*)(void) __arm_inout("za")') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}} +n_ptrty return_invalid_fptr_normal_shared_za(sz_ptrty f) { return f; } + +typedef void (*pz_ptrty) (void) __arm_preserves("za"); +pz_ptrty return_valid_preserves_za_fptr(pz_ptrty f) { return f; } + +// expected-cpp-error@+2 {{cannot initialize return object of type 'pz_ptrty' (aka 'void (*)() __arm_preserves("za")') with an lvalue of type 'n_ptrty' (aka 'void (*)()')}} +// expected-error@+1 {{incompatible function pointer types returning 'n_ptrty' (aka 'void (*)(void)') from a function with result type 'pz_ptrty' (aka 'void (*)(void) __arm_preserves("za")')}} +pz_ptrty return_invalid_fptr_preserves_za_normal(n_ptrty f) { return f; } +// 
expected-cpp-error@+2 {{cannot initialize return object of type 'n_ptrty' (aka 'void (*)()') with an lvalue of type 'pz_ptrty' (aka 'void (*)() __arm_preserves("za")')}}
+// expected-error@+1 {{incompatible function pointer types returning 'pz_ptrty' (aka 'void (*)(void) __arm_preserves("za")') from a function with result type 'n_ptrty' (aka 'void (*)(void)')}}
+n_ptrty return_invalid_fptr_normal_preserves_za(pz_ptrty f) { return f; }
+
+// Test template instantiations
+#ifdef __cplusplus
+template <typename T> T templated(T x) __arm_streaming { return x; }
+template <> int templated(int x) __arm_streaming { return x + 1; }
+template <> float templated(float x) __arm_streaming { return x + 2; }
+// expected-cpp-error@+2 {{explicit instantiation of 'templated' does not refer to a function template, variable template, member function, member class, or static data member}}
+// expected-cpp-note@-4 {{candidate template ignored: could not match 'short (short) __arm_streaming' against 'short (short)'}}
+template short templated(short);
+#endif
+
+// Conflicting attributes on redeclarations
+
+// expected-error@+5 {{function declared 'void (void) __arm_streaming_compatible' was previously declared 'void (void) __arm_streaming', which has different SME function attributes}}
+// expected-note@+3 {{previous declaration is here}}
+// expected-cpp-error@+3 {{function declared 'void () __arm_streaming_compatible' was previously declared 'void () __arm_streaming', which has different SME function attributes}}
+// expected-cpp-note@+1 {{previous declaration is here}}
+void redecl(void) __arm_streaming;
+void redecl(void) __arm_streaming_compatible { }
+
+// expected-error@+5 {{function declared 'void (void)' was previously declared 'void (void) __arm_preserves("za")', which has different SME function attributes}}
+// expected-note@+3 {{previous declaration is here}}
+// expected-cpp-error@+3 {{function declared 'void ()' was previously declared 'void () __arm_preserves("za")', which has different SME function attributes}}
+// expected-cpp-note@+1 {{previous declaration is here}}
+void redecl_preserve_za(void) __arm_preserves("za");
+void redecl_preserve_za(void) {}
+
+// expected-error@+5 {{function declared 'void (void) __arm_preserves("za")' was previously declared 'void (void)', which has different SME function attributes}}
+// expected-note@+3 {{previous declaration is here}}
+// expected-cpp-error@+3 {{function declared 'void () __arm_preserves("za")' was previously declared 'void ()', which has different SME function attributes}}
+// expected-cpp-note@+1 {{previous declaration is here}}
+void redecl_nopreserve_za(void);
+void redecl_nopreserve_za(void) __arm_preserves("za") {}
+
+void non_za_definition(void (*shared_za_fn_ptr)(void) __arm_inout("za"), void (*preserves_za_fn_ptr)(void) __arm_preserves("za")) {
+  sme_arm_new_za(); // OK
+  // expected-error@+2 {{call to a shared ZA function requires the caller to have ZA state}}
+  // expected-cpp-error@+1 {{call to a shared ZA function requires the caller to have ZA state}}
+  sme_arm_shared_za();
+  // expected-error@+2 {{call to a shared ZA function requires the caller to have ZA state}}
+  // expected-cpp-error@+1 {{call to a shared ZA function requires the caller to have ZA state}}
+  shared_za_fn_ptr();
+  // expected-error@+2 {{call to a shared ZA function requires the caller to have ZA state}}
+  // expected-cpp-error@+1 {{call to a shared ZA function requires the caller to have ZA state}}
+  preserves_za_fn_ptr();
+}
+
+void shared_za_definition(void (*shared_za_fn_ptr)(void) __arm_inout("za")) __arm_inout("za") {
+  sme_arm_shared_za(); // OK
+  shared_za_fn_ptr(); // OK
+}
+
+__arm_new("za") void new_za_definition(void (*shared_za_fn_ptr)(void) __arm_inout("za")) {
+  sme_arm_shared_za(); // OK
+  shared_za_fn_ptr(); // OK
+}
+
+#ifdef __cplusplus
+int shared_za_initializer(void) __arm_inout("za");
+// expected-cpp-error@+1 {{call to a shared ZA function requires the caller to have ZA state}}
+int global = shared_za_initializer();
+
+struct S {
+  virtual void shared_za_memberfn(void) __arm_inout("za");
+};
+
+struct S2 : public S {
+// expected-cpp-error@+2 {{virtual function 'shared_za_memberfn' has different attributes ('void ()') than the function it overrides (which has 'void () __arm_inout("za")')}}
+// expected-cpp-note@-5 {{overridden virtual function is here}}
+  __arm_new("za") void shared_za_memberfn(void) override {}
+};
+
+// The '__arm_preserves("za")' property cannot be dropped when overriding a virtual
+// function. It is however fine for the overriding function to be '__arm_preserves("za")'
+// even though the function that it overrides is not.
+
+struct S_PreservesZA {
+  virtual void memberfn(void) __arm_preserves("za");
+};
+
+struct S_Drop_PreservesZA : S_PreservesZA {
+// expected-cpp-error@+2 {{virtual function 'memberfn' has different attributes ('void ()') than the function it overrides (which has 'void () __arm_preserves("za")')}}
+// expected-cpp-note@-5 {{overridden virtual function is here}}
+  void memberfn(void) override {}
+};
+
+struct S_NoPreservesZA {
+  virtual void memberfn(void);
+};
+
+struct S_AddPreservesZA : S_NoPreservesZA {
+// expected-cpp-error@+2 {{virtual function 'memberfn' has different attributes ('void () __arm_preserves("za")') than the function it overrides (which has 'void ()')}}
+// expected-cpp-note@-5 {{overridden virtual function is here}}
+  void memberfn(void) __arm_preserves("za") override {}
+};
+
+
+// Check that the attribute propagates through template instantiations.
+template <typename T>
+struct S3 {
+  static constexpr int value = 0;
+};
+
+template <>
+struct S3<void (*)(void)> {
+  static constexpr int value = 1;
+};
+
+template <>
+struct S3<void (*)(void) __arm_streaming> {
+  static constexpr int value = 2;
+};
+
+template <>
+struct S3<void (*)(void) __arm_streaming_compatible> {
+  static constexpr int value = 4;
+};
+
+template <>
+struct S3<void (*)(void) __arm_inout("za")> {
+  static constexpr int value = 8;
+};
+
+template <>
+struct S3<void (*)(void) __arm_preserves("za")> {
+  static constexpr int value = 16;
+};
+
+void normal_func(void) {}
+void streaming_func(void) __arm_streaming {}
+void streaming_compatible_func(void) __arm_streaming_compatible {}
+void shared_za_func(void) __arm_inout("za") {}
+void preserves_za_func(void) __arm_preserves("za") {}
+
+static_assert(S3<decltype(&normal_func)>::value == 1, "why are we picking the wrong specialization?");
+static_assert(S3<decltype(&streaming_func)>::value == 2, "why are we picking the wrong specialization?");
+static_assert(S3<decltype(&streaming_compatible_func)>::value == 4, "why are we picking the wrong specialization?");
+static_assert(S3<decltype(&shared_za_func)>::value == 8, "why are we picking the wrong specialization?");
+static_assert(S3<decltype(&preserves_za_func)>::value == 16, "why are we picking the wrong specialization?");
+
+// Also test the attribute is propagated with variadic templates
+constexpr int eval_variadic_template() { return 0; }
+template <typename T, typename... Other>
+constexpr int eval_variadic_template(T f, Other... other) {
+  return S3<T>::value + eval_variadic_template(other...);
+}
+static_assert(eval_variadic_template(normal_func, streaming_func,
+                                     streaming_compatible_func,
+                                     shared_za_func, preserves_za_func) == 31,
+              "attributes not propagated properly in variadic template");
+
+// Test that the attribute is propagated with template specialization.
+template <typename T> int test_templated_f(T);
+template<> constexpr int test_templated_f(void(*)(void)) { return 1; }
+template<> constexpr int test_templated_f(void(*)(void)__arm_streaming) { return 2; }
+template<> constexpr int test_templated_f(void(*)(void)__arm_streaming_compatible) { return 4; }
+template<> constexpr int test_templated_f(void(*)(void)__arm_inout("za")) { return 8; }
+template<> constexpr int test_templated_f(void(*)(void)__arm_preserves("za")) { return 16; }
+
+static_assert(test_templated_f(&normal_func) == 1, "Instantiated to wrong function");
+static_assert(test_templated_f(&streaming_func) == 2, "Instantiated to wrong function");
+static_assert(test_templated_f(&streaming_compatible_func) == 4, "Instantiated to wrong function");
+static_assert(test_templated_f(&shared_za_func) == 8, "Instantiated to wrong function");
+static_assert(test_templated_f(&preserves_za_func) == 16, "Instantiated to wrong function");
+
+// expected-cpp-error@+2 {{'__arm_streaming' only applies to function types; type here is 'int'}}
+// expected-error@+1 {{'__arm_streaming' only applies to function types; type here is 'int'}}
+int invalid_type_for_attribute __arm_streaming;
+
+// Test overloads
+constexpr int overload(void f(void)) { return 1; }
+constexpr int overload(void f(void) __arm_streaming) { return 2; }
+constexpr int overload(void f(void) __arm_streaming_compatible) { return 4; }
+constexpr int overload(void f(void) __arm_inout("za")) { return 8; }
+constexpr int overload(void f(void) __arm_preserves("za")) { return 16; }
+static_assert(overload(&normal_func) == 1, "Overloaded to wrong function");
+static_assert(overload(&streaming_func) == 2, "Overloaded to wrong function");
+static_assert(overload(&streaming_compatible_func) == 4, "Overloaded to wrong function");
+static_assert(overload(&shared_za_func) == 8, "Overloaded to wrong function");
+static_assert(overload(&preserves_za_func) == 16, "Overloaded to wrong function");
+
+// Test implicit instantiation
+template <typename T> struct X {
+  static void foo(T) __arm_streaming { }
+};
+constexpr int overload_int(void f(int)) { return 1; }
+constexpr int overload_int(void f(int) __arm_streaming) { return 2; }
+constexpr X<int> *ptr = 0;
+static_assert(overload_int(ptr->foo) == 2, "Overloaded to the wrong function after implicit instantiation");
+
+#endif // ifdef __cplusplus
+
+// expected-cpp-error@+2 {{unknown state ''}}
+// expected-error@+1 {{unknown state ''}}
+__arm_new("") void invalid_arm_new_empty_string(void);
+// expected-cpp-error@+2 {{expected string literal as argument of '__arm_new' attribute}}
+// expected-error@+1 {{expected string literal as argument of '__arm_new' attribute}}
+__arm_new(0) void invalid_arm_new_non_literal_string(void);
+// expected-cpp-error@+2 {{unknown state 'unknownstate'}}
+// expected-error@+1 {{unknown state 'unknownstate'}}
+__arm_new("unknownstate") void invalid_arm_new_unknown_state(void);
+
+// expected-cpp-error@+2 {{unknown state ''}}
+// expected-error@+1 {{unknown state ''}}
+void invalid_arm_in_empty_string(void) __arm_in("");
+// expected-cpp-error@+2 {{expected string literal as argument of '__arm_in' attribute}}
+// expected-error@+1 {{expected string literal as
argument of '__arm_in' attribute}} +void invalid_arm_in_non_literal_string(void) __arm_in(0); +// expected-cpp-error@+2 {{unknown state 'unknownstate'}} +// expected-error@+1 {{unknown state 'unknownstate'}} +void invalid_arm_in_unknown_state(void) __arm_in("unknownstate"); + +void valid_state_attrs_in_in1(void) __arm_in("za"); +void valid_state_attrs_in_in2(void) __arm_in("za", "za"); + +// expected-cpp-error@+2 {{missing state for '__arm_in'}} +// expected-error@+1 {{missing state for '__arm_in'}} +void invalid_state_attrs_no_arg1(void) __arm_in(); +// expected-cpp-error@+2 {{missing state for '__arm_new'}} +// expected-error@+1 {{missing state for '__arm_new'}} +__arm_new() void invalid_state_attrs_no_arg2(void); + +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_in_out(void) __arm_in("za") __arm_out("za"); +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_in_inout(void) __arm_in("za") __arm_inout("za"); +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_in_preserves(void) __arm_in("za") __arm_preserves("za"); + +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_out_in(void) __arm_out("za") __arm_in("za"); +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_out_inout(void) __arm_out("za") __arm_inout("za"); +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_out_preserves(void) __arm_out("za") __arm_preserves("za"); + +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_inout_in(void) __arm_inout("za") __arm_in("za"); +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_inout_out(void) __arm_inout("za") __arm_out("za"); +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_inout_preserves(void) __arm_inout("za") __arm_preserves("za"); + +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_preserves_in(void) __arm_preserves("za") __arm_in("za"); +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_preserves_out(void) __arm_preserves("za") __arm_out("za"); +// expected-cpp-error@+2 {{conflicting attributes for state 'za'}} +// expected-error@+1 {{conflicting attributes for state 'za'}} +void conflicting_state_attrs_preserves_inout(void) __arm_preserves("za") __arm_inout("za"); diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp index c08f9f25eb473befb4f7e0b07b8b187013d5bfb6..8f3f98394492ddb30e3dcad1c81fe1fa43053ce2 100644 --- 
a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp @@ -10,114 +10,66 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> -void test_range_0_0(svbool_t pg, void *ptr) { +void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svld1_hor_za8,,,)(-1, -1, 0, pg, ptr); + SVE_ACLE_FUNC(svld1_hor_za8,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svst1_ver_za8,,,)(1, -1, 15, pg, ptr); + SVE_ACLE_FUNC(svst1_ver_za8,,,)(1, slice, pg, ptr); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svld1_hor_za128,,,)(0, -1, -1, pg, ptr); + SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(-1, slice, pg, ptr, 1); // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svst1_ver_za128,,,)(15, -1, 1, pg, ptr); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(-1, -1, 0, pg, ptr, 1); - // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za8,,,)(1, -1, 15, pg, ptr, 1); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za128,,,)(0, -1, -1, pg, ptr, 1); - // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za128,,,)(15, -1, 1, pg, ptr, 1); + SVE_ACLE_FUNC(svst1_ver_vnum_za8,,,)(1, slice, pg, ptr, 1); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svread_hor_za8, _s8, _m,)(svundef_s8(), pg, -1, -1, 0); - // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svread_ver_za8, _s8, _m,)(svundef_s8(), pg, 1, -1, 15); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svread_hor_za128, _s8, _m,)(svundef_s8(), pg, 0, -1, -1); + SVE_ACLE_FUNC(svread_hor_za8, _s8, _m,)(svundef_s8(), pg, -1, slice); // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svread_ver_za128, _s8, _m,)(svundef_s8(), pg, 15, -1, 1); + SVE_ACLE_FUNC(svread_ver_za8, _s8, _m,)(svundef_s8(), pg, 1, slice); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svwrite_hor_za8, _s8, _m,)(-1, -1, 0, pg, svundef_s8()); + SVE_ACLE_FUNC(svwrite_hor_za8, _s8, _m,)(-1, slice, pg, svundef_s8()); // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svwrite_ver_za8, _s8, _m,)(1, -1, 15, pg, svundef_s8()); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svwrite_hor_za128, _s8, _m,)(0, -1, -1, pg, svundef_s8()); - // expected-error@+1 {{argument value 1 is outside the valid range [0, 0]}} - SVE_ACLE_FUNC(svwrite_ver_za128, _s8, _m,)(15, -1, 1, pg, svundef_s8()); + SVE_ACLE_FUNC(svwrite_ver_za8, _s8, _m,)(1, slice, pg, svundef_s8()); } -void test_range_0_1(svbool_t pg, void *ptr) { - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svld1_hor_za16,,,)(-1, -1, 0, pg, ptr); - // 
expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svst1_ver_za16,,,)(2, -1, 7, pg, ptr); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svld1_hor_za64,,,)(0, -1, -1, pg, ptr); - // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svst1_ver_za64,,,)(7, -1, 2, pg, ptr); +void test_range_0_1(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za16,,,)(-1, -1, 0, pg, ptr, 1); + SVE_ACLE_FUNC(svld1_hor_za16,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za16,,,)(2, -1, 7, pg, ptr, 1); + SVE_ACLE_FUNC(svst1_ver_za16,,,)(2, slice, pg, ptr); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za64,,,)(0, -1, -1, pg, ptr, 1); + SVE_ACLE_FUNC(svld1_hor_vnum_za16,,,)(-1, slice, pg, ptr, 1); // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za64,,,)(7, -1, 2, pg, ptr, 1); + SVE_ACLE_FUNC(svst1_ver_vnum_za16,,,)(2, slice, pg, ptr, 1); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svread_hor_za16, _s16, _m,)(svundef_s16(), pg, -1, -1, 0); + SVE_ACLE_FUNC(svread_hor_za16, _s16, _m,)(svundef_s16(), pg, -1, slice); // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svread_ver_za16, _s16, _m,)(svundef_s16(), pg, 2, -1, 7); + SVE_ACLE_FUNC(svread_ver_za16, _s16, _m,)(svundef_s16(), pg, 2, slice); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svread_hor_za64, _s64, _m,)(svundef_s64(), pg, 0, -1, -1); + SVE_ACLE_FUNC(svwrite_hor_za16, _s16, _m,)(-1, slice, pg, svundef_s16()); // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svread_ver_za64, _s64, _m,)(svundef_s64(), pg, 7, -1, 2); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svwrite_hor_za16, _s16, _m,)(-1, -1, 0, pg, svundef_s16()); - // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svwrite_ver_za16, _s16, _m,)(2, -1, 7, pg, svundef_s16()); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svwrite_hor_za64, _s64, _m,)(0, -1, -1, pg, svundef_s64()); - // expected-error@+1 {{argument value 2 is outside the valid range [0, 1]}} - SVE_ACLE_FUNC(svwrite_ver_za64, _s64, _m,)(7, -1, 2, pg, svundef_s64()); + SVE_ACLE_FUNC(svwrite_ver_za16, _s16, _m,)(2, slice, pg, svundef_s16()); } -void test_range_0_3(svbool_t pg, void *ptr) { - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svld1_hor_za32,,,)(-1, -1, 0, pg, ptr); - // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svst1_ver_za32,,,)(4, -1, 3, pg, ptr); +void test_range_0_3(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svld1_hor_za32,,,)(0, -1, -1, pg, ptr); + 
SVE_ACLE_FUNC(svld1_hor_za32,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svst1_ver_za32,,,)(3, -1, 4, pg, ptr); + SVE_ACLE_FUNC(svst1_ver_za32,,,)(4, slice, pg, ptr); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za32,,,)(-1, -1, 0, pg, ptr, 1); + SVE_ACLE_FUNC(svld1_hor_vnum_za32,,,)(-1, slice, pg, ptr, 1); // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za32,,,)(4, -1, 3, pg, ptr, 1); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za32,,,)(0, -1, -1, pg, ptr, 1); - // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za32,,,)(3, -1, 4, pg, ptr, 1); + SVE_ACLE_FUNC(svst1_ver_vnum_za32,,,)(4, slice, pg, ptr, 1); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svread_hor_za32, _s32, _m,)(svundef_s32(), pg, -1, -1, 0); - // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svread_ver_za32, _s32, _m,)(svundef_s32(), pg, 4, -1, 3); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svread_hor_za32, _s32, _m,)(svundef_s32(), pg, 0, -1, -1); - // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svread_ver_za32, _s32, _m,)(svundef_s32(), pg, 3, -1, 4); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svwrite_hor_za32, _s32, _m,)(-1, -1, 0, pg, svundef_s32()); + SVE_ACLE_FUNC(svread_hor_za32, _s32, _m,)(svundef_s32(), pg, -1, slice); // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svwrite_ver_za32, _s32, _m,)(4, -1, 3, pg, svundef_s32()); + SVE_ACLE_FUNC(svread_ver_za32, _s32, _m,)(svundef_s32(), pg, 4, slice); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svwrite_hor_za32, _s32, _m,)(0, -1, -1, pg, svundef_s32()); + SVE_ACLE_FUNC(svwrite_hor_za32, _s32, _m,)(-1, slice, pg, svundef_s32()); // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} - SVE_ACLE_FUNC(svwrite_ver_za32, _s32, _m,)(3, -1, 4, pg, svundef_s32()); + SVE_ACLE_FUNC(svwrite_ver_za32, _s32, _m,)(4, slice, pg, svundef_s32()); // expected-error@+1 {{argument value 4 is outside the valid range [0, 3]}} SVE_ACLE_FUNC(svaddha_za32, _s32, _m,)(4, pg, pg, svundef_s32()); @@ -138,40 +90,24 @@ void test_range_0_3(svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svusmops_za32, _u8, _m,)(-1, pg, pg, svundef_u8(), svundef_s8()); } -void test_range_0_7(svbool_t pg, void *ptr) { +void test_range_0_7(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svld1_hor_za64,,,)(-1, -1, 0, pg, ptr); + SVE_ACLE_FUNC(svld1_hor_za64,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svst1_ver_za64,,,)(8, -1, 1, pg, ptr); + SVE_ACLE_FUNC(svst1_ver_za64,,,)(8, slice, pg, ptr); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svld1_hor_za16,,,)(0, -1, -1, pg, ptr); + 
SVE_ACLE_FUNC(svld1_hor_vnum_za64,,,)(-1, slice, pg, ptr, 1); // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svst1_ver_za16,,,)(1, -1, 8, pg, ptr); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za64,,,)(-1, -1, 0, pg, ptr, 1); - // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za64,,,)(8, -1, 1, pg, ptr, 1); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za16,,,)(0, -1, -1, pg, ptr, 1); - // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za16,,,)(1, -1, 8, pg, ptr, 1); + SVE_ACLE_FUNC(svst1_ver_vnum_za64,,,)(8, slice, pg, ptr, 1); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svread_hor_za64, _s64, _m,)(svundef_s64(), pg, -1, -1, 0); - // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svread_ver_za64, _s64, _m,)(svundef_s64(), pg, 8, -1, 1); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svread_hor_za16, _s16, _m,)(svundef_s16(), pg, 0, -1, -1); + SVE_ACLE_FUNC(svread_hor_za64, _s64, _m,)(svundef_s64(), pg, -1, slice); // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svread_ver_za16, _s16, _m,)(svundef_s16(), pg, 1, -1, 8); + SVE_ACLE_FUNC(svread_ver_za64, _s64, _m,)(svundef_s64(), pg, 8, slice); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svwrite_hor_za64, _s64, _m,)(-1, -1, 0, pg, svundef_s64()); + SVE_ACLE_FUNC(svwrite_hor_za64, _s64, _m,)(-1, slice, pg, svundef_s64()); // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svwrite_ver_za64, _s64, _m,)(8, -1, 1, pg, svundef_s64()); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svwrite_hor_za16, _s16, _m,)(0, -1, -1, pg, svundef_s16()); - // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} - SVE_ACLE_FUNC(svwrite_ver_za16, _s16, _m,)(1, -1, 8, pg, svundef_s16()); + SVE_ACLE_FUNC(svwrite_ver_za64, _s64, _m,)(8, slice, pg, svundef_s64()); // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} SVE_ACLE_FUNC(svaddha_za64, _s64, _m,)(8, pg, pg, svundef_s64()); @@ -190,71 +126,46 @@ void test_range_0_7(svbool_t pg, void *ptr) { SVE_ACLE_FUNC(svusmopa_za64, _u16, _m,)(8, pg, pg, svundef_u16(), svundef_s16()); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} SVE_ACLE_FUNC(svusmops_za64, _u16, _m,)(-1, pg, pg, svundef_u16(), svundef_s16()); + + // expected-error@+1 {{argument value 8 is outside the valid range [0, 7]}} + SVE_ACLE_FUNC(svmopa_za64, _f64, _m,)(8, pg, pg, svundef_f64(), svundef_f64()); + // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + SVE_ACLE_FUNC(svmops_za64, _f64, _m,)(-1, pg, pg, svundef_f64(), svundef_f64()); } -void test_range_0_15(svbool_t pg, void *ptr) { +void test_range_0_15(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svld1_hor_za128,,,)(-1, -1, 0, 
pg, ptr); + SVE_ACLE_FUNC(svld1_hor_za128,,,)(-1, slice, pg, ptr); // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svst1_ver_za128,,,)(16, -1, 0, pg, ptr); + SVE_ACLE_FUNC(svst1_ver_za128,,,)(16, slice, pg, ptr); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svld1_hor_za8,,,)(0, -1, -1, pg, ptr); + SVE_ACLE_FUNC(svld1_hor_vnum_za128,,,)(-1, slice, pg, ptr, 1); // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svst1_ver_za8,,,)(0, -1, 16, pg, ptr); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za128,,,)(-1, -1, 0, pg, ptr, 1); - // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za128,,,)(16, -1, 0, pg, ptr, 1); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(0, -1, -1, pg, ptr, 1); - // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svst1_ver_vnum_za8,,,)(0, -1, 16, pg, ptr, 1); - - // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svldr_vnum_za,,,)(-1, 16, ptr); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svstr_vnum_za,,,)(-1, -1, ptr); + SVE_ACLE_FUNC(svst1_ver_vnum_za128,,,)(16, slice, pg, ptr, 1); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svread_hor_za128, _s8, _m,)(svundef_s8(), pg, -1, -1, 0); + SVE_ACLE_FUNC(svread_hor_za128, _s8, _m,)(svundef_s8(), pg, -1, slice); // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svread_ver_za128, _s8, _m,)(svundef_s8(), pg, 16, -1, 0); + SVE_ACLE_FUNC(svread_ver_za128, _s8, _m,)(svundef_s8(), pg, 16, slice); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svread_hor_za8, _s8, _m,)(svundef_s8(), pg, 0, -1, -1); + SVE_ACLE_FUNC(svwrite_hor_za128, _s8, _m,)(-1, slice, pg, svundef_s8()); // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svread_ver_za8, _s8, _m,)(svundef_s8(), pg, 0, -1, 16); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svwrite_hor_za128, _s8, _m,)(-1, -1, 0, pg, svundef_s8()); - // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svwrite_ver_za128, _s8, _m,)(16, -1, 0, pg, svundef_s8()); - // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svwrite_hor_za8, _s8, _m,)(0, -1, -1, pg, svundef_s8()); - // expected-error@+1 {{argument value 16 is outside the valid range [0, 15]}} - SVE_ACLE_FUNC(svwrite_ver_za8, _s8, _m,)(0, -1, 16, pg, svundef_s8()); + SVE_ACLE_FUNC(svwrite_ver_za128, _s8, _m,)(16, slice, pg, svundef_s8()); } -void test_range_0_255(svbool_t pg, void *ptr) { +void test_range_0_255(svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { // expected-error@+1 {{argument value 256 is outside the valid range [0, 255]}} SVE_ACLE_FUNC(svzero_mask_za,,,)(256); // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 255]}} SVE_ACLE_FUNC(svzero_mask_za,,,)(-1); } -void 
test_constant(uint64_t u64, svbool_t pg, void *ptr) { - SVE_ACLE_FUNC(svld1_hor_za8,,,)(u64, u64, 0, pg, ptr); // expected-error {{argument to 'svld1_hor_za8' must be a constant integer}} - SVE_ACLE_FUNC(svld1_ver_za16,,,)(0, u64, u64, pg, ptr); // expected-error {{argument to 'svld1_ver_za16' must be a constant integer}} - SVE_ACLE_FUNC(svst1_hor_za32,,,)(u64, u64, 0, pg, ptr); // expected-error {{argument to 'svst1_hor_za32' must be a constant integer}} - SVE_ACLE_FUNC(svst1_ver_za64,,,)(0, u64, u64, pg, ptr); // expected-error {{argument to 'svst1_ver_za64' must be a constant integer}} - SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(u64, u64, 0, pg, ptr, u64); // expected-error {{argument to 'svld1_hor_vnum_za8' must be a constant integer}} - SVE_ACLE_FUNC(svld1_ver_vnum_za16,,,)(0, u64, u64, pg, ptr, u64); // expected-error {{argument to 'svld1_ver_vnum_za16' must be a constant integer}} - SVE_ACLE_FUNC(svst1_hor_vnum_za32,,,)(u64, u64, 0, pg, ptr, u64); // expected-error {{argument to 'svst1_hor_vnum_za32' must be a constant integer}} - SVE_ACLE_FUNC(svst1_ver_vnum_za64,,,)(0, u64, u64, pg, ptr, u64); // expected-error {{argument to 'svst1_ver_vnum_za64' must be a constant integer}} - - SVE_ACLE_FUNC(svldr_vnum_za,,,)(u64, u64, ptr); // expected-error {{argument to 'svldr_vnum_za' must be a constant integer}} - SVE_ACLE_FUNC(svstr_vnum_za,,,)(u64, u64, ptr); // expected-error {{argument to 'svstr_vnum_za' must be a constant integer}} +void test_constant(uint64_t u64, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svld1_hor_za8,,,)(u64, u64, pg, ptr); // expected-error {{argument to 'svld1_hor_za8' must be a constant integer}} + SVE_ACLE_FUNC(svst1_hor_za32,,,)(u64, 0, pg, ptr); // expected-error {{argument to 'svst1_hor_za32' must be a constant integer}} + SVE_ACLE_FUNC(svld1_hor_vnum_za8,,,)(u64, 0, pg, ptr, u64); // expected-error {{argument to 'svld1_hor_vnum_za8' must be a constant integer}} + SVE_ACLE_FUNC(svst1_hor_vnum_za32,,,)(u64, 0, pg, ptr, u64); // expected-error {{argument to 'svst1_hor_vnum_za32' must be a constant integer}} - SVE_ACLE_FUNC(svread_hor_za8, _s8, _m,)(svundef_s8(), pg, 0, u64, u64); // expected-error-re {{argument to 'svread_hor_za8{{.*}}_m' must be a constant integer}} - SVE_ACLE_FUNC(svread_ver_za16, _s16, _m,)(svundef_s16(), pg, u64, u64, 0); // expected-error-re {{argument to 'svread_ver_za16{{.*}}_m' must be a constant integer}} - SVE_ACLE_FUNC(svwrite_hor_za32, _s32, _m,)(0, u64, u64, pg, svundef_s32()); // expected-error-re {{argument to 'svwrite_hor_za32{{.*}}_m' must be a constant integer}} - SVE_ACLE_FUNC(svwrite_ver_za64, _s64, _m,)(u64, u64, 0, pg, svundef_s64()); // expected-error-re {{argument to 'svwrite_ver_za64{{.*}}_m' must be a constant integer}} + SVE_ACLE_FUNC(svread_ver_za16, _s16, _m,)(svundef_s16(), pg, u64, 0); // expected-error-re {{argument to 'svread_ver_za16{{.*}}_m' must be a constant integer}} + SVE_ACLE_FUNC(svwrite_ver_za64, _s64, _m,)(u64, 0, pg, svundef_s64()); // expected-error-re {{argument to 'svwrite_ver_za64{{.*}}_m' must be a constant integer}} } diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c index b384244ac6c6a9206ecca0874cb74dc22f4210fb..6af115beba8e49a6921d56db43be17d5c4e94682 100644 --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c @@ -3,23 +3,24 @@ // Test that functions with the correct target attributes can use the correct SME 
intrinsics. -#include <arm_sme_draft_spec_subject_to_change.h> +#include <arm_sme.h> __attribute__((target("sme"))) -void test_sme(svbool_t pg, void *ptr) { - svld1_hor_za8(0, 0, 0, pg, ptr); +void test_sme(svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { + svld1_hor_za8(0, 0, pg, ptr); } __attribute__((target("arch=armv8-a+sme"))) -void test_arch_sme(svbool_t pg, void *ptr) { - svld1_hor_vnum_za32(0, 0, 0, pg, ptr, 0); +void test_arch_sme(svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { + svld1_hor_vnum_za32(0, 0, pg, ptr, 0); } __attribute__((target("+sme"))) -void test_plus_sme(svbool_t pg, void *ptr) { - svst1_ver_za16(0, 0, 0, pg, ptr); +void test_plus_sme(svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") { + svst1_ver_za16(0, 0, pg, ptr); } -void undefined(svbool_t pg, void *ptr) { - svst1_ver_vnum_za64(0, 0, 0, pg, ptr, 0); // expected-error {{'svst1_ver_vnum_za64' needs target feature sme}} +__attribute__((target("+sme"))) +void undefined(svbool_t pg, void *ptr) __arm_inout("za") { + svst1_ver_vnum_za64(0, 0, pg, ptr, 0); // expected-warning {{builtin call has undefined behaviour when called from a non-streaming function}} } diff --git a/clang/test/Sema/annotate-type.c b/clang/test/Sema/annotate-type.c index 9fd95f953c5d48a9ff82d527031f29b132eb11eb..901cef7ffa8b3ada3b57558e96cc9ae08329a245 100644 --- a/clang/test/Sema/annotate-type.c +++ b/clang/test/Sema/annotate-type.c @@ -20,10 +20,10 @@ void foo(float *[[clang::annotate_type("foo")]] a) { [[clang::annotate_type("bar")]] int *z1; // expected-error {{'annotate_type' attribute cannot be applied to a declaration}} int *z2 [[clang::annotate_type("bar")]]; // expected-error {{'annotate_type' attribute cannot be applied to a declaration}} [[clang::annotate_type("bar")]]; // expected-error {{'annotate_type' attribute cannot be applied to a statement}} - int *[[clang::annotate_type(1)]] z3; // expected-error {{'annotate_type' attribute requires a string}} + int *[[clang::annotate_type(1)]] z3; // expected-error {{expected string literal as argument of 'annotate_type' attribute}} int *[[clang::annotate_type()]] z4; // expected-error {{'annotate_type' attribute takes at least 1 argument}} int *[[clang::annotate_type]] z5; // expected-error {{'annotate_type' attribute takes at least 1 argument}} - int *[[clang::annotate_type(some_function())]] z6; // expected-error {{'annotate_type' attribute requires a string}} + int *[[clang::annotate_type(some_function())]] z6; // expected-error {{expected string literal as argument of 'annotate_type' attribute}} int *[[clang::annotate_type("bar", some_function())]] z7; // expected-error {{'annotate_type' attribute requires parameter 1 to be a constant expression}} expected-note{{subexpression not valid in a constant expression}} int *[[clang::annotate_type("bar", z7)]] z8; // expected-error {{'annotate_type' attribute requires parameter 1 to be a constant expression}} expected-note{{subexpression not valid in a constant expression}} int *[[clang::annotate_type("bar", int)]] z9; // expected-error {{expected expression}} diff --git a/clang/test/Sema/annotate.c b/clang/test/Sema/annotate.c index 2e7a37936fedec7356e13ecb6eeab972ccfac226..b4551a102e6174f7b769a68e8813462eb84e7de5 100644 --- a/clang/test/Sema/annotate.c +++ b/clang/test/Sema/annotate.c @@ -3,8 +3,8 @@ void __attribute__((annotate("foo"))) foo(float *a) { __attribute__((annotate("bar"))) int x; [[clang::annotate("bar")]] int x2; - __attribute__((annotate(1))) int y; // expected-error {{'annotate' attribute requires a string}} - [[clang::annotate(1)]] int y2; // 
expected-error {{'annotate' attribute requires a string}} + __attribute__((annotate(1))) int y; // expected-error {{expected string literal as argument of 'annotate' attribute}} + [[clang::annotate(1)]] int y2; // expected-error {{expected string literal as argument of 'annotate' attribute}} __attribute__((annotate("bar", 1))) int z; [[clang::annotate("bar", 1)]] int z2; diff --git a/clang/test/Sema/attr-assume.c b/clang/test/Sema/attr-assume.c index cb07940d02b5b57166c7c0a467ad601463c1e3fb..98deffa3a746094cd0f3eaf0cf9d5b0c89d82051 100644 --- a/clang/test/Sema/attr-assume.c +++ b/clang/test/Sema/attr-assume.c @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -triple i386-apple-darwin9 -fsyntax-only -verify %s -void f1(void) __attribute__((assume(3))); // expected-error {{'assume' attribute requires a string}} -void f2(void) __attribute__((assume(int))); // expected-error {{expected expression}} -void f3(void) __attribute__((assume(for))); // expected-error {{expected expression}} +void f1(void) __attribute__((assume(3))); // expected-error {{expected string literal as argument of 'assume' attribute}} +void f2(void) __attribute__((assume(int))); // expected-error {{expected string literal as argument of 'assume' attribute}} +void f3(void) __attribute__((assume(for))); // expected-error {{expected string literal as argument of 'assume' attribute}} void f4(void) __attribute__((assume("QQQQ"))); // expected-warning {{unknown assumption string 'QQQQ'; attribute is potentially ignored}} void f5(void) __attribute__((assume("omp_no_openmp"))); void f6(void) __attribute__((assume("omp_noopenmp"))); // expected-warning {{unknown assumption string 'omp_noopenmp' may be misspelled; attribute is potentially ignored, did you mean 'omp_no_openmp'?}} @@ -10,5 +10,5 @@ void f7(void) __attribute__((assume("omp_no_openmp_routine"))); // expected-warn void f8(void) __attribute__((assume("omp_no_openmp1"))); // expected-warning {{unknown assumption string 'omp_no_openmp1' may be misspelled; attribute is potentially ignored, did you mean 'omp_no_openmp'?}} void f9(void) __attribute__((assume("omp_no_openmp", "omp_no_openmp"))); // expected-error {{'assume' attribute takes one argument}} -int g1 __attribute__((assume(0))); // expected-warning {{'assume' attribute only applies to functions and Objective-C methods}} +int g1 __attribute__((assume(0))); // expected-error {{expected string literal as argument of 'assume' attribute}} int g2 __attribute__((assume("omp_no_openmp"))); // expected-warning {{'assume' attribute only applies to functions and Objective-C methods}} diff --git a/clang/test/Sema/attr-btf_tag.c b/clang/test/Sema/attr-btf_tag.c index d33ccdf962825f9920a6b0db02074370f91df826..cbb21a5a88bb68b90db61803503dfb317838d417 100644 --- a/clang/test/Sema/attr-btf_tag.c +++ b/clang/test/Sema/attr-btf_tag.c @@ -21,7 +21,7 @@ struct __tag2 __tag3 t2 { int g1 __tag1; int g2 __tag_no_arg; // expected-error {{'btf_decl_tag' attribute takes one argument}} int g3 __tag_2_arg; // expected-error {{'btf_decl_tag' attribute takes one argument}} -int i1 __invalid; // expected-error {{'btf_decl_tag' attribute requires a string}} +int i1 __invalid; // expected-error {{expected string literal as argument of 'btf_decl_tag' attribute}} enum e1 { E1 diff --git a/clang/test/Sema/attr-btf_type_tag.c b/clang/test/Sema/attr-btf_type_tag.c index a3ef404f38238b35ceff407edaaed5822b8b4f81..aa3230a3e9b7f314db94855be9f2737763be2726 100644 --- a/clang/test/Sema/attr-btf_type_tag.c +++ b/clang/test/Sema/attr-btf_type_tag.c @@ -8,7 +8,7 @@ #define __tag6 
__attribute__((btf_type_tag("tag6"))) int __attribute__((btf_type_tag("tag1", "tag2"))) *invalid1; // expected-error {{'btf_type_tag' attribute takes one argument}} -int __attribute__((btf_type_tag(2))) *invalid2; // expected-error {{'btf_type_tag' attribute requires a string}} +int __attribute__((btf_type_tag(2))) *invalid2; // expected-error {{expected string literal as argument of 'btf_type_tag' attribute}} int * __tag1 __tag2 * __tag3 __tag4 * __tag5 __tag6 *g; diff --git a/clang/test/Sema/attr-capabilities.c b/clang/test/Sema/attr-capabilities.c index b28c437935a130e0a0ec011f5f0d3e4aa4aec707..5138803bd5eb7ad74c85b7ff0ca1ee2505114bad 100644 --- a/clang/test/Sema/attr-capabilities.c +++ b/clang/test/Sema/attr-capabilities.c @@ -17,8 +17,8 @@ int Test3 __attribute__((acquire_capability("test3"))); // expected-warning {{' int Test4 __attribute__((try_acquire_capability("test4"))); // expected-error {{'try_acquire_capability' attribute only applies to functions}} int Test5 __attribute__((release_capability("test5"))); // expected-warning {{'release_capability' attribute only applies to functions}} -struct __attribute__((capability(12))) Test3 {}; // expected-error {{'capability' attribute requires a string}} -struct __attribute__((shared_capability(Test2))) Test4 {}; // expected-error {{'shared_capability' attribute requires a string}} +struct __attribute__((capability(12))) Test3 {}; // expected-error {{expected string literal as argument of 'capability' attribute}} +struct __attribute__((shared_capability(Test2))) Test4 {}; // expected-error {{expected string literal as argument of 'shared_capability' attribute}} struct __attribute__((capability)) Test5 {}; // expected-error {{'capability' attribute takes one argument}} struct __attribute__((shared_capability("test1", 12))) Test6 {}; // expected-error {{'shared_capability' attribute takes one argument}} diff --git a/clang/test/Sema/attr-enforce-tcb-errors.cpp b/clang/test/Sema/attr-enforce-tcb-errors.cpp index 1ce147ab32df9287ba8b965c7b1cdf1908b4a4a2..b5effe2fe51809867045cfe138bee9c57f513a83 100644 --- a/clang/test/Sema/attr-enforce-tcb-errors.cpp +++ b/clang/test/Sema/attr-enforce-tcb-errors.cpp @@ -6,14 +6,14 @@ void no_arguments() __attribute__((enforce_tcb)); // expected-error{{'enforce_tc void too_many_arguments() __attribute__((enforce_tcb("test", 12))); // expected-error{{'enforce_tcb' attribute takes one argument}} -void wrong_argument_type() __attribute__((enforce_tcb(12))); // expected-error{{'enforce_tcb' attribute requires a string}} +void wrong_argument_type() __attribute__((enforce_tcb(12))); // expected-error{{expected string literal as argument of 'enforce_tcb' attribute}} [[clang::enforce_tcb_leaf("oops")]] int wrong_subject_type_leaf; // expected-warning{{'enforce_tcb_leaf' attribute only applies to functions}} void no_arguments_leaf() __attribute__((enforce_tcb_leaf)); // expected-error{{'enforce_tcb_leaf' attribute takes one argument}} void too_many_arguments_leaf() __attribute__((enforce_tcb_leaf("test", 12))); // expected-error{{'enforce_tcb_leaf' attribute takes one argument}} -void wrong_argument_type_leaf() __attribute__((enforce_tcb_leaf(12))); // expected-error{{'enforce_tcb_leaf' attribute requires a string}} +void wrong_argument_type_leaf() __attribute__((enforce_tcb_leaf(12))); // expected-error{{expected string literal as argument of 'enforce_tcb_leaf' attribute}} void foo(); diff --git a/clang/test/Sema/attr-enforce-tcb-errors.m b/clang/test/Sema/attr-enforce-tcb-errors.m index 
f82d38c919d1d4122cb5956de82fd0cd13857413..c8d0716553cc6aad9fe3cdf8bca4c4e39c093dc6 100644 --- a/clang/test/Sema/attr-enforce-tcb-errors.m +++ b/clang/test/Sema/attr-enforce-tcb-errors.m @@ -20,13 +20,13 @@ __attribute__((objc_root_class)) - (void)tooManyArguments __attribute__((enforce_tcb("test", 12))); // expected-error{{'enforce_tcb' attribute takes one argument}} -- (void)wrongArgumentType __attribute__((enforce_tcb(12))); // expected-error{{'enforce_tcb' attribute requires a string}} +- (void)wrongArgumentType __attribute__((enforce_tcb(12))); // expected-error{{expected string literal as argument of 'enforce_tcb' attribute}} - (void)noArgumentsLeaf __attribute__((enforce_tcb_leaf)); // expected-error{{'enforce_tcb_leaf' attribute takes one argument}} - (void)tooManyArgumentsLeaf __attribute__((enforce_tcb_leaf("test", 12))); // expected-error{{'enforce_tcb_leaf' attribute takes one argument}} -- (void)wrongArgumentTypeLeaf __attribute__((enforce_tcb_leaf(12))); // expected-error{{'enforce_tcb_leaf' attribute requires a string}} +- (void)wrongArgumentTypeLeaf __attribute__((enforce_tcb_leaf(12))); // expected-error{{expected string literal as argument of 'enforce_tcb_leaf' attribute}} @end @implementation AClass diff --git a/clang/test/Sema/attr-error.c b/clang/test/Sema/attr-error.c index 581bfc43cbc06b1c6ff8292cea7f185551a8b911..9f500a87be14c6e1fec2e8c4368d93f2bdbc3eb1 100644 --- a/clang/test/Sema/attr-error.c +++ b/clang/test/Sema/attr-error.c @@ -15,7 +15,7 @@ int bad2(void) { __attribute__((error("bad2"))); // expected-error {{'error' attribute cannot be applied to a statement}} } -__attribute__((error(3))) // expected-error {{'error' attribute requires a string}} +__attribute__((error(3))) // expected-error {{expected string literal as argument of 'error' attribute}} int bad3(void); diff --git a/clang/test/Sema/attr-handles.cpp b/clang/test/Sema/attr-handles.cpp index 135467b6c0a8c2fc169914f1a582b81802066b96..ff1c1f68dfec8ef93763b312f0763238d47bb2d4 100644 --- a/clang/test/Sema/attr-handles.cpp +++ b/clang/test/Sema/attr-handles.cpp @@ -6,7 +6,7 @@ void (*fp)(int handle [[clang::use_handle("Fuchsia")]]); auto lambda = [](int handle [[clang::use_handle("Fuchsia")]]){}; void g(int a __attribute__((acquire_handle("Fuchsia")))); // expected-error {{attribute only applies to output parameters}} void h(int *a __attribute__((acquire_handle))); // expected-error {{'acquire_handle' attribute takes one argument}} -void h(int *a __attribute__((acquire_handle(1)))); // expected-error {{attribute requires a string}} +void h(int *a __attribute__((acquire_handle(1)))); // expected-error {{expected string literal as argument of 'acquire_handle' attribute}} void h(int *a __attribute__((acquire_handle("RandomString", "AndAnother")))); // expected-error {{'acquire_handle' attribute takes one argument}} __attribute__((release_handle("Fuchsia"))) int i(); // expected-warning {{'release_handle' attribute only applies to parameters}} __attribute__((use_handle("Fuchsia"))) int j(); // expected-warning {{'use_handle' attribute only applies to parameters}} diff --git a/clang/test/Sema/attr-section.c b/clang/test/Sema/attr-section.c index 3ea922c91947e38520ed0d03f7bb769e5cf9595a..1f058c24f980fdbdbb6237519395dba803bc25fe 100644 --- a/clang/test/Sema/attr-section.c +++ b/clang/test/Sema/attr-section.c @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -verify -fsyntax-only -triple x86_64-apple-darwin9 %s int x __attribute__((section( - 42))); // expected-error {{'section' attribute requires a string}} + 42))); // 
expected-error {{expected string literal as argument of 'section' attribute}} // rdar://4341926 diff --git a/clang/test/Sema/attr-tls_model.c b/clang/test/Sema/attr-tls_model.c index 9106d39b55a3af11d162202e530ce0001b5a0092..ec99c7aacaa3a881948a2114583d81cf73a6ab73 100644 --- a/clang/test/Sema/attr-tls_model.c +++ b/clang/test/Sema/attr-tls_model.c @@ -10,5 +10,5 @@ int x __attribute((tls_model("global-dynamic"))); // expected-error {{'tls_model static __thread int y __attribute((tls_model("global-dynamic"))); // no-warning static __thread int y __attribute((tls_model("local", "dynamic"))); // expected-error {{'tls_model' attribute takes one argument}} -static __thread int y __attribute((tls_model(123))); // expected-error {{'tls_model' attribute requires a string}} +static __thread int y __attribute((tls_model(123))); // expected-error {{expected string literal as argument of 'tls_model' attribute}} static __thread int y __attribute((tls_model("foobar"))); // expected-error {{tls_model must be "global-dynamic", "local-dynamic", "initial-exec" or "local-exec"}} diff --git a/clang/test/Sema/attr-unavailable-message.c b/clang/test/Sema/attr-unavailable-message.c index b4ae70e0a39cba340d3c0f6a620c461bc8ad9589..0caa943ad8a42f8e439e53e141fd711549ffd466 100644 --- a/clang/test/Sema/attr-unavailable-message.c +++ b/clang/test/Sema/attr-unavailable-message.c @@ -8,7 +8,7 @@ double dfoo(double) __attribute__((__unavailable__("NO LONGER"))); // expected- void bar(void) __attribute__((__unavailable__)); // expected-note {{explicitly marked unavailable}} -int quux(void) __attribute__((__unavailable__(12))); // expected-error {{'__unavailable__' attribute requires a string}} +int quux(void) __attribute__((__unavailable__(12))); // expected-error {{expected string literal as argument of '__unavailable__' attribute}} #define ACCEPTABLE "Use something else" int quux2(void) __attribute__((__unavailable__(ACCEPTABLE))); diff --git a/clang/test/Sema/attr-warning.c b/clang/test/Sema/attr-warning.c index 0973f3c8eb4bd76b8bb5ec3a59b233e2e2436d6a..7510e88d291b1abf04ffd996b19738beff6ba43d 100644 --- a/clang/test/Sema/attr-warning.c +++ b/clang/test/Sema/attr-warning.c @@ -15,7 +15,7 @@ int bad2(void) { __attribute__((warning("bad2"))); // expected-error {{'warning' attribute cannot be applied to a statement}} } -__attribute__((warning(3))) // expected-error {{'warning' attribute requires a string}} +__attribute__((warning(3))) // expected-error {{expected string literal as argument of 'warning' attribute}} int bad3(void); diff --git a/clang/test/Sema/diagnose_if.c b/clang/test/Sema/diagnose_if.c index 6297545781f90d31cb50b3665eb720fc2a552918..4df39916c031e30bb13ea419596f53937eb0873f 100644 --- a/clang/test/Sema/diagnose_if.c +++ b/clang/test/Sema/diagnose_if.c @@ -6,7 +6,7 @@ void failure1(void) _diagnose_if(); // expected-error{{exactly 3 arguments}} void failure2(void) _diagnose_if(0); // expected-error{{exactly 3 arguments}} void failure3(void) _diagnose_if(0, ""); // expected-error{{exactly 3 arguments}} void failure4(void) _diagnose_if(0, "", "error", 1); // expected-error{{exactly 3 arguments}} -void failure5(void) _diagnose_if(0, 0, "error"); // expected-error{{requires a string}} +void failure5(void) _diagnose_if(0, 0, "error"); // expected-error{{expected string literal as argument of 'diagnose_if' attribute}} void failure6(void) _diagnose_if(0, "", "invalid"); // expected-error{{invalid diagnostic type for 'diagnose_if'; use "error" or "warning" instead}} void failure7(void) _diagnose_if(0, "", 
"ERROR"); // expected-error{{invalid diagnostic type}} void failure8(int a) _diagnose_if(a, "", ""); // expected-error{{invalid diagnostic type}} diff --git a/clang/test/Sema/enable_if.c b/clang/test/Sema/enable_if.c index 22eb84fa7275c0572abe9ecf51f3e7cd5631d53d..9d46c71274d69223d872b36467125bf49931f59a 100644 --- a/clang/test/Sema/enable_if.c +++ b/clang/test/Sema/enable_if.c @@ -105,7 +105,7 @@ __attribute__((enable_if(n == 0, "chosen when 'n' is zero"))) void f1(int n); // int n __attribute__((enable_if(1, "always chosen"))); // expected-warning{{'enable_if' attribute only applies to functions}} -void f(int n) __attribute__((enable_if("chosen when 'n' is zero", n == 0))); // expected-error{{'enable_if' attribute requires a string}} +void f(int n) __attribute__((enable_if("chosen when 'n' is zero", n == 0))); // expected-error{{expected string literal as argument of 'enable_if' attribute}} void f(int n) __attribute__((enable_if())); // expected-error{{'enable_if' attribute requires exactly 2 arguments}} diff --git a/clang/test/SemaCXX/attr-deprecated-replacement-error.cpp b/clang/test/SemaCXX/attr-deprecated-replacement-error.cpp index 54d0f9e74f3403ada41f8e64c1d4e9c97775cd29..8c9d9b29c5535275f0bc8397fe18cbcf568b08c8 100644 --- a/clang/test/SemaCXX/attr-deprecated-replacement-error.cpp +++ b/clang/test/SemaCXX/attr-deprecated-replacement-error.cpp @@ -5,13 +5,13 @@ #endif int a1 [[deprecated("warning", "fixit")]]; // expected-error{{'deprecated' attribute takes no more than 1 argument}} -int a2 [[deprecated("warning", 1)]]; // expected-error{{'deprecated' attribute takes no more than 1 argument}} +int a2 [[deprecated("warning", 1)]]; // expected-error{{expected string literal as argument of 'deprecated' attribute}} int b1 [[gnu::deprecated("warning", "fixit")]]; // expected-error{{'deprecated' attribute takes no more than 1 argument}} -int b2 [[gnu::deprecated("warning", 1)]]; // expected-error{{'deprecated' attribute takes no more than 1 argument}} +int b2 [[gnu::deprecated("warning", 1)]]; // expected-error{{expected string literal as argument of 'deprecated' attribute}} __declspec(deprecated("warning", "fixit")) int c1; // expected-error{{'deprecated' attribute takes no more than 1 argument}} -__declspec(deprecated("warning", 1)) int c2; // expected-error{{'deprecated' attribute takes no more than 1 argument}} +__declspec(deprecated("warning", 1)) int c2; // expected-error{{expected string literal as argument of 'deprecated' attribute}} int d1 __attribute__((deprecated("warning", "fixit"))); -int d2 __attribute__((deprecated("warning", 1))); // expected-error{{'deprecated' attribute requires a string}} +int d2 __attribute__((deprecated("warning", 1))); // expected-error{{expected string literal as argument of 'deprecated' attribute}} diff --git a/clang/test/SemaCXX/attr-no-sanitize.cpp b/clang/test/SemaCXX/attr-no-sanitize.cpp index 9e13fd3c02702e6bd2a3e21c16568c10b965f3ae..a464947fe5a3469964c9eb69b82a9a4e4b27a21f 100644 --- a/clang/test/SemaCXX/attr-no-sanitize.cpp +++ b/clang/test/SemaCXX/attr-no-sanitize.cpp @@ -4,7 +4,7 @@ int f1() __attribute__((no_sanitize)); // expected-error{{'no_sanitize' attribute takes at least 1 argument}} -int f2() __attribute__((no_sanitize(1))); // expected-error{{'no_sanitize' attribute requires a string}} +int f2() __attribute__((no_sanitize(1))); // expected-error{{expected string literal as argument of 'no_sanitize' attribute}} __attribute__((no_sanitize("all"))) int global; // expected-warning{{'no_sanitize' attribute argument 'all' not supported 
on a global variable}} __attribute__((no_sanitize("unknown"))) int global2; // expected-warning{{unknown sanitizer 'unknown' ignored}} diff --git a/clang/test/SemaCXX/attr-section.cpp b/clang/test/SemaCXX/attr-section.cpp index 12c0da283997df783cf33daa42eadab337ce32f9..62b597b4ff36f675c763a9af020b5b658aecfd79 100644 --- a/clang/test/SemaCXX/attr-section.cpp +++ b/clang/test/SemaCXX/attr-section.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -verify -fsyntax-only -triple x86_64-linux-gnu %s int x __attribute__((section( - 42))); // expected-error {{'section' attribute requires a string}} + 42))); // expected-error {{expected string literal as argument of 'section' attribute}} // PR6007 diff --git a/clang/test/SemaCXX/attr-weakref.cpp b/clang/test/SemaCXX/attr-weakref.cpp index 46ca5ab20682037e1952f39665a314a4420a1c84..4e1834096ebedefb6e18dc2225c63a26e164f4e8 100644 --- a/clang/test/SemaCXX/attr-weakref.cpp +++ b/clang/test/SemaCXX/attr-weakref.cpp @@ -33,6 +33,6 @@ int a9 __attribute__((weakref)); // expected-error {{weakref declaration of 'a9 static int a10(); int a10() __attribute__((weakref ("foo"))); -static int v __attribute__((weakref(a1), alias("foo"))); // expected-error {{'weakref' attribute requires a string}} +static int v __attribute__((weakref(a1), alias("foo"))); // expected-error {{expected string literal as argument of 'weakref' attribute}} __attribute__((weakref ("foo"))) auto a11 = 1; // expected-error {{weakref declaration must have internal linkage}} diff --git a/clang/test/SemaCXX/suppress.cpp b/clang/test/SemaCXX/suppress.cpp index d88ae0bbca00f9526b7c6abe1209c06521f4c4ed..29544b3c573ccc8695589605675ad1f54d33a75b 100644 --- a/clang/test/SemaCXX/suppress.cpp +++ b/clang/test/SemaCXX/suppress.cpp @@ -16,7 +16,7 @@ void f_() { [[gsl::suppress]] int x; // expected-error {{'suppress' attribute takes at least 1 argument}} [[gsl::suppress()]] int y; // expected-error {{'suppress' attribute takes at least 1 argument}} int [[gsl::suppress("r")]] z; // expected-error {{'suppress' attribute cannot be applied to types}} - [[gsl::suppress(f_)]] float f; // expected-error {{'suppress' attribute requires a string}} + [[gsl::suppress(f_)]] float f; // expected-error {{expected string literal as argument of 'suppress' attribute}} } union [[gsl::suppress("type.1")]] U { diff --git a/clang/test/SemaObjC/attr-swift_bridge.m b/clang/test/SemaObjC/attr-swift_bridge.m index 8f53ed8154588ca720f4714f471a0008742e42d9..0464e3f326e0abec946310a3f3d6e28cb7e61f41 100644 --- a/clang/test/SemaObjC/attr-swift_bridge.m +++ b/clang/test/SemaObjC/attr-swift_bridge.m @@ -5,7 +5,7 @@ __attribute__((__swift_bridge__)) @interface I @end -// expected-error@+1 {{'__swift_bridge__' attribute requires a string}} +// expected-error@+1 {{expected string literal as argument of '__swift_bridge__' attribute}} __attribute__((__swift_bridge__(1))) @interface J @end diff --git a/clang/test/SemaObjC/objc-asm-attribute-neg-test.m b/clang/test/SemaObjC/objc-asm-attribute-neg-test.m index 9941189357ba1efa37983962339c72bcfaa5ba1d..ac3871969fe21a3c5213694e776ae7fa86c87e86 100644 --- a/clang/test/SemaObjC/objc-asm-attribute-neg-test.m +++ b/clang/test/SemaObjC/objc-asm-attribute-neg-test.m @@ -5,7 +5,7 @@ __attribute__((objc_runtime_name)) // expected-error {{'objc_runtime_name' attri @interface BInterface @end -__attribute__((objc_runtime_name(123))) // expected-error {{'objc_runtime_name' attribute requires a string}} +__attribute__((objc_runtime_name(123))) // expected-error {{expected string literal as argument of
'objc_runtime_name' attribute}} @protocol BProtocol1 @end @@ -14,7 +14,7 @@ __attribute__((objc_runtime_name("MySecretNamespace.Protocol"))) @end __attribute__((objc_runtime_name("MySecretNamespace.Message"))) -@interface Message { +@interface Message { __attribute__((objc_runtime_name("MySecretNamespace.Message"))) // expected-error {{'objc_runtime_name' attribute only applies to Objective-C interfaces and Objective-C protocols}} id MyIVAR; } diff --git a/clang/test/SemaObjC/validate-attr-swift_attr.m b/clang/test/SemaObjC/validate-attr-swift_attr.m index 4ff434d179725d1bd21f67fda1013d4249caf495..2c73b0a892722c16e34da578bdfaafdeb06faeda 100644 --- a/clang/test/SemaObjC/validate-attr-swift_attr.m +++ b/clang/test/SemaObjC/validate-attr-swift_attr.m @@ -5,7 +5,7 @@ __attribute__((swift_attr)) @interface I @end -// expected-error@+1 {{'swift_attr' attribute requires a string}} +// expected-error@+1 {{expected string literal as argument of 'swift_attr' attribute}} __attribute__((swift_attr(1))) @interface J @end diff --git a/clang/test/SemaTemplate/attributes.cpp b/clang/test/SemaTemplate/attributes.cpp index a7081e83470aa13bd448fd637bfb98f3809d0a85..9fd448a5e9353e19335200f70ccb1446cf0fc4cd 100644 --- a/clang/test/SemaTemplate/attributes.cpp +++ b/clang/test/SemaTemplate/attributes.cpp @@ -25,7 +25,7 @@ namespace attribute_aligned { { __attribute__((aligned(Align))) char storage[Size]; }; - + template class C { public: @@ -95,11 +95,11 @@ void UseAnnotations() { HasAnnotations(); } template [[clang::annotate("ANNOTATE_BAZ", Is...)]] void HasPackAnnotations(); void UsePackAnnotations() { HasPackAnnotations<1, 2, 3>(); } -template [[clang::annotate(Is...)]] void HasOnlyPackAnnotation() {} // expected-error {{'annotate' attribute takes at least 1 argument}} expected-error {{'annotate' attribute requires a string}} +template [[clang::annotate(Is...)]] void HasOnlyPackAnnotation() {} // expected-error {{expected string literal as argument of 'annotate' attribute}} void UseOnlyPackAnnotations() { - HasOnlyPackAnnotation<>(); // expected-note {{in instantiation of function template specialization 'attribute_annotate::HasOnlyPackAnnotation<>' requested here}} - HasOnlyPackAnnotation<1>(); // expected-note {{in instantiation of function template specialization 'attribute_annotate::HasOnlyPackAnnotation<1>' requested here}} + HasOnlyPackAnnotation<>(); + HasOnlyPackAnnotation<1>(); } // CHECK: ClassTemplateDecl {{.*}} AnnotatedPackTemplateStruct @@ -276,40 +276,21 @@ void UseOnlyPackAnnotations() { // CHECK-NEXT: value: Int 6 // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6 // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct AnnotatedPackTemplateStruct -// CHECK-NEXT: ClassTemplatePartialSpecializationDecl {{.*}} struct AnnotatedPackTemplateStruct definition -// CHECK-NEXT: DefinitionData -// CHECK-NEXT: DefaultConstructor -// CHECK-NEXT: CopyConstructor -// CHECK-NEXT: MoveConstructor -// CHECK-NEXT: CopyAssignment -// CHECK-NEXT: MoveAssignment -// CHECK-NEXT: Destructor -// CHECK-NEXT: TemplateArgument{{.*}} type 'char' -// CHECK-NEXT: BuiltinType {{.*}} 'char' -// CHECK-NEXT: TemplateArgument{{.*}} pack -// CHECK-NEXT: TemplateArgument{{.*}} expr -// CHECK-NEXT: PackExpansionExpr {{.*}} 'int' -// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' -// CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'int' depth 0 index 0 ... 
Is -// CHECK-NEXT: AnnotateAttr {{.*}} "" -// CHECK-NEXT: PackExpansionExpr {{.*}} '' -// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' -// CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct AnnotatedPackTemplateStruct template struct [[clang::annotate("ANNOTATE_FOZ", Is...)]] AnnotatedPackTemplateStruct{}; template struct [[clang::annotate("ANNOTATE_BOO", Is...)]] AnnotatedPackTemplateStruct{}; template struct [[clang::annotate("ANNOTATE_FOZ", 4, 5, 6)]] AnnotatedPackTemplateStruct{}; -template struct [[clang::annotate(Is...)]] AnnotatedPackTemplateStruct{}; // expected-error {{'annotate' attribute requires a string}} expected-error {{'annotate' attribute takes at least 1 argument}} +template struct [[clang::annotate(Is...)]] AnnotatedPackTemplateStruct{}; // expected-error {{expected string literal as argument of 'annotate' attribute}} void UseAnnotatedPackTemplateStructSpecializations() { AnnotatedPackTemplateStruct Instance1{}; AnnotatedPackTemplateStruct Instance2{}; AnnotatedPackTemplateStruct Instance3{}; - AnnotatedPackTemplateStruct Instance4{}; // expected-note {{in instantiation of template class 'attribute_annotate::AnnotatedPackTemplateStruct' requested here}} - AnnotatedPackTemplateStruct Instance5{}; // expected-note {{in instantiation of template class 'attribute_annotate::AnnotatedPackTemplateStruct' requested here}} + AnnotatedPackTemplateStruct Instance4{}; + AnnotatedPackTemplateStruct Instance5{}; } // CHECK: ClassTemplateDecl {{.*}} InvalidAnnotatedPackTemplateStruct // CHECK-NEXT: TemplateTypeParmDecl {{.*}} typename depth 0 index 0 T -// CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'int' depth 0 index 1 ... Is +// CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} 'int' depth 0 index 1 ... Is // CHECK-NEXT: CXXRecordDecl {{.*}} struct InvalidAnnotatedPackTemplateStruct definition // CHECK-NEXT: DefinitionData // CHECK-NEXT: DefaultConstructor @@ -318,9 +299,6 @@ void UseAnnotatedPackTemplateStructSpecializations() { // CHECK-NEXT: CopyAssignment // CHECK-NEXT: MoveAssignment // CHECK-NEXT: Destructor -// CHECK-NEXT: AnnotateAttr {{.*}} "" -// CHECK-NEXT: PackExpansionExpr {{.*}} '' -// CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct InvalidAnnotatedPackTemplateStruct // CHECK-NEXT: ClassTemplateSpecialization {{.*}} 'InvalidAnnotatedPackTemplateStruct' // CHECK-NEXT: ClassTemplateSpecializationDecl {{.*}} struct InvalidAnnotatedPackTemplateStruct definition @@ -446,7 +424,7 @@ void UseAnnotatedPackTemplateStructSpecializations() { // CHECK-NEXT: TemplateArgument{{.*}} integral 6 // CHECK-NEXT: TemplateArgument{{.*}} integral 7 // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct InvalidAnnotatedPackTemplateStruct -template struct [[clang::annotate(Is...)]] InvalidAnnotatedPackTemplateStruct{}; // expected-error {{'annotate' attribute requires a string}} expected-error {{'annotate' attribute takes at least 1 argument}} +template struct InvalidAnnotatedPackTemplateStruct{}; template struct [[clang::annotate("ANNOTATE_BIR", Is...)]] InvalidAnnotatedPackTemplateStruct{}; template struct InvalidAnnotatedPackTemplateStruct {}; template <> struct InvalidAnnotatedPackTemplateStruct {}; @@ -454,8 +432,8 @@ void UseInvalidAnnotatedPackTemplateStruct() { InvalidAnnotatedPackTemplateStruct Instance1{}; InvalidAnnotatedPackTemplateStruct Instance2{}; InvalidAnnotatedPackTemplateStruct Instance3{}; - InvalidAnnotatedPackTemplateStruct Instance4{}; // expected-note {{in 
instantiation of template class 'attribute_annotate::InvalidAnnotatedPackTemplateStruct' requested here}} - InvalidAnnotatedPackTemplateStruct Instance5{}; // expected-note {{in instantiation of template class 'attribute_annotate::InvalidAnnotatedPackTemplateStruct' requested here}} + InvalidAnnotatedPackTemplateStruct Instance4{}; + InvalidAnnotatedPackTemplateStruct Instance5{}; } // CHECK: FunctionTemplateDecl {{.*}} RedeclaredAnnotatedFunc diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index 79db17501b64f06442e13872a0478a548db6892c..30790bc45ce35b1346c8d1153340b96bc2b54b8e 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -2297,6 +2297,22 @@ static bool isVariadicExprArgument(const Record *Arg) { .Default(false); } +static bool isStringLiteralArgument(const Record *Arg) { + return !Arg->getSuperClasses().empty() && + llvm::StringSwitch<bool>( + Arg->getSuperClasses().back().first->getName()) + .Case("StringArgument", true) + .Default(false); +} + +static bool isVariadicStringLiteralArgument(const Record *Arg) { + return !Arg->getSuperClasses().empty() && + llvm::StringSwitch<bool>( + Arg->getSuperClasses().back().first->getName()) + .Case("VariadicStringArgument", true) + .Default(false); +} + static void emitClangAttrVariadicIdentifierArgList(RecordKeeper &Records, raw_ostream &OS) { OS << "#if defined(CLANG_ATTR_VARIADIC_IDENTIFIER_ARG_LIST)\n"; @@ -2317,6 +2333,34 @@ static void emitClangAttrVariadicIdentifierArgList(RecordKeeper &Records, OS << "#endif // CLANG_ATTR_VARIADIC_IDENTIFIER_ARG_LIST\n\n"; } +// Emits the list of arguments that should be parsed as unevaluated string +// literals for each attribute. +static void emitClangAttrUnevaluatedStringLiteralList(RecordKeeper &Records, + raw_ostream &OS) { + OS << "#if defined(CLANG_ATTR_STRING_LITERAL_ARG_LIST)\n"; + std::vector<Record *> Attrs = Records.getAllDerivedDefinitions("Attr"); + for (const auto *Attr : Attrs) { + std::vector<Record *> Args = Attr->getValueAsListOfDefs("Args"); + uint32_t Bits = 0; + assert(Args.size() <= 32 && "unsupported number of arguments in attribute"); + for (uint32_t N = 0; N < Args.size(); ++N) { + Bits |= (isStringLiteralArgument(Args[N]) << N); + // If we have a variadic string argument, set all the remaining bits to 1 + if (isVariadicStringLiteralArgument(Args[N])) { + Bits |= maskTrailingZeros<uint32_t>(N); + break; + } + } + if (!Bits) + continue; + // All these spellings have at least one string literal as argument. + forEachUniqueSpelling(*Attr, [&](const FlattenedSpelling &S) { + OS << ".Case(\"" << S.name() << "\", " << Bits << ")\n"; + }); + } + OS << "#endif // CLANG_ATTR_STRING_LITERAL_ARG_LIST\n\n"; +} + // Emits the first-argument-is-identifier property for attributes. static void emitClangAttrIdentifierArgList(RecordKeeper &Records, raw_ostream &OS) { OS << "#if defined(CLANG_ATTR_IDENTIFIER_ARG_LIST)\n"; @@ -3415,23 +3459,27 @@ static void GenerateHasAttrSpellingStringSwitch( OS << " .Default(0);\n"; } -// Emits the list of tokens for regular keyword attributes. -void EmitClangAttrTokenKinds(RecordKeeper &Records, raw_ostream &OS) { - emitSourceFileHeader("A list of tokens generated from the attribute" - " definitions", - OS); +// Emits the list of regular keyword attributes with info about their arguments.
+void EmitClangRegularKeywordAttributeInfo(RecordKeeper &Records, + raw_ostream &OS) { + emitSourceFileHeader( + "A list of regular keyword attributes generated from the attribute" + " definitions", + OS); // Assume for now that the same token is not used in multiple regular // keyword attributes. for (auto *R : Records.getAllDerivedDefinitions("Attr")) - for (const auto &S : GetFlattenedSpellings(*R)) - if (isRegularKeywordAttribute(S)) { - if (!R->getValueAsListOfDefs("Args").empty()) - PrintError(R->getLoc(), - "RegularKeyword attributes with arguments are not " - "yet supported"); - OS << "KEYWORD_ATTRIBUTE(" - << S.getSpellingRecord().getValueAsString("Name") << ", )\n"; - } + for (const auto &S : GetFlattenedSpellings(*R)) { + if (!isRegularKeywordAttribute(S)) + continue; + std::vector Args = R->getValueAsListOfDefs("Args"); + bool HasArgs = llvm::any_of( + Args, [](const Record *Arg) { return !Arg->getValueAsBit("Fake"); }); + + OS << "KEYWORD_ATTRIBUTE(" + << S.getSpellingRecord().getValueAsString("Name") << ", " + << (HasArgs ? "true" : "false") << ",)\n"; + } OS << "#undef KEYWORD_ATTRIBUTE\n"; } @@ -4616,6 +4664,7 @@ void EmitClangAttrParserStringSwitches(RecordKeeper &Records, emitSourceFileHeader("Parser-related llvm::StringSwitch cases", OS); emitClangAttrArgContextList(Records, OS); emitClangAttrIdentifierArgList(Records, OS); + emitClangAttrUnevaluatedStringLiteralList(Records, OS); emitClangAttrVariadicIdentifierArgList(Records, OS); emitClangAttrThisIsaIdentifierArgList(Records, OS); emitClangAttrAcceptsExprPack(Records, OS); diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index 936724b9ce38f4e3053adb2fde74fbdf1a210b76..b19048eff5db206cc78882d24219eab200e1233e 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -550,6 +550,8 @@ class NeonEmitter { void createIntrinsic(Record *R, SmallVectorImpl &Out); void genBuiltinsDef(raw_ostream &OS, SmallVectorImpl &Defs); + void genStreamingSVECompatibleList(raw_ostream &OS, + SmallVectorImpl &Defs); void genOverloadTypeCheckCode(raw_ostream &OS, SmallVectorImpl &Defs); void genIntrinsicRangeCheckCode(raw_ostream &OS, @@ -2039,6 +2041,30 @@ void NeonEmitter::genBuiltinsDef(raw_ostream &OS, OS << "#endif\n\n"; } +void NeonEmitter::genStreamingSVECompatibleList( + raw_ostream &OS, SmallVectorImpl &Defs) { + OS << "#ifdef GET_NEON_STREAMING_COMPAT_FLAG\n"; + + std::set Emitted; + for (auto *Def : Defs) { + // If the def has a body (that is, it has Operation DAGs), it won't call + // __builtin_neon_* so we don't need to generate a definition for it. + if (Def->hasBody()) + continue; + + std::string Name = Def->getMangledName(); + if (Emitted.find(Name) != Emitted.end()) + continue; + + // FIXME: We should make exceptions here for some NEON builtins that are + // permitted in streaming mode. + OS << "case NEON::BI__builtin_neon_" << Name + << ": BuiltinType = ArmNonStreaming; break;\n"; + Emitted.insert(Name); + } + OS << "#endif\n\n"; +} + /// Generate the ARM and AArch64 overloaded type checking code for /// SemaChecking.cpp, checking for unique builtin declarations. void NeonEmitter::genOverloadTypeCheckCode(raw_ostream &OS, @@ -2222,6 +2248,8 @@ void NeonEmitter::runHeader(raw_ostream &OS) { // Generate ARM overloaded type checking code for SemaChecking.cpp genOverloadTypeCheckCode(OS, Defs); + genStreamingSVECompatibleList(OS, Defs); + // Generate ARM range checking code for shift/lane immediates. 
genIntrinsicRangeCheckCode(OS, Defs); } diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index dbf5122fdf22317cbc5258f41a44ad46c02ce4c1..2b61f8b81587df4a4c398186ca7bb343cd2a1512 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -43,6 +43,8 @@ enum ClassKind { ClassG, // Overloaded name without type suffix }; +enum class ACLEKind { SVE, SME }; + using TypeSpec = std::string; namespace { @@ -236,7 +238,7 @@ public: } /// Emits the intrinsic declaration to the ostream. - void emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter) const; + void emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter, ACLEKind Kind) const; private: std::string getMergeSuffix() const { return MergeSuffix; } @@ -344,6 +346,10 @@ public: /// Emit arm_sve.h. void createHeader(raw_ostream &o); + // Emits core intrinsics in both arm_sme.h and arm_sve.h + void createCoreHeaderIntrinsics(raw_ostream &o, SVEEmitter &Emitter, + ACLEKind Kind); + /// Emit all the __builtin prototypes and code needed by Sema. void createBuiltins(raw_ostream &o); @@ -365,9 +371,15 @@ public: /// Emit all the information needed to map builtin -> LLVM IR intrinsic. void createSMECodeGenMap(raw_ostream &o); + /// Create a table for a builtin's requirement for PSTATE.SM. + void createStreamingAttrs(raw_ostream &o, ACLEKind Kind); + /// Emit all the range checks for the immediates. void createSMERangeChecks(raw_ostream &o); + /// Create a table for a builtin's requirement for PSTATE.ZA. + void createBuiltinZAState(raw_ostream &OS); + /// Create intrinsic and add it to \p Out void createIntrinsic(Record *R, SmallVectorImpl> &Out); @@ -987,28 +999,24 @@ std::string Intrinsic::mangleName(ClassKind LocalCK) const { getMergeSuffix(); } -void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter) const { +void Intrinsic::emitIntrinsic(raw_ostream &OS, SVEEmitter &Emitter, + ACLEKind Kind) const { bool IsOverloaded = getClassKind() == ClassG && getProto().size() > 1; std::string FullName = mangleName(ClassS); std::string ProtoName = mangleName(getClassKind()); - std::string SMEAttrs = ""; + OS << (IsOverloaded ? "__aio " : "__ai ") + << "__attribute__((__clang_arm_builtin_alias("; - if (Flags & Emitter.getEnumValueForFlag("IsStreaming")) - SMEAttrs += ", arm_streaming"; - if (Flags & Emitter.getEnumValueForFlag("IsStreamingCompatible")) - SMEAttrs += ", arm_streaming_compatible"; - if (Flags & Emitter.getEnumValueForFlag("IsSharedZA")) - SMEAttrs += ", arm_shared_za"; - if (Flags & Emitter.getEnumValueForFlag("IsPreservesZA")) - SMEAttrs += ", arm_preserves_za"; + switch (Kind) { + case ACLEKind::SME: + OS << "__builtin_sme_" << FullName << ")"; + break; + case ACLEKind::SVE: + OS << "__builtin_sve_" << FullName << ")"; + break; + } - OS << (IsOverloaded ? "__aio " : "__ai ") - << "__attribute__((__clang_arm_builtin_alias(" - << (SMEAttrs.empty() ? "__builtin_sve_" : "__builtin_sme_") - << FullName << ")"; - if (!SMEAttrs.empty()) - OS << SMEAttrs; OS << "))\n"; OS << getTypes()[0].str() << " " << ProtoName << "("; @@ -1143,6 +1151,34 @@ void SVEEmitter::createIntrinsic( } } +void SVEEmitter::createCoreHeaderIntrinsics(raw_ostream &OS, + SVEEmitter &Emitter, + ACLEKind Kind) { + SmallVector, 128> Defs; + std::vector RV = Records.getAllDerivedDefinitions("Inst"); + for (auto *R : RV) + createIntrinsic(R, Defs); + + // Sort intrinsics in header file by following order/priority: + // - Architectural guard (i.e. 
does it require SVE2 or SVE2_AES) + // - Class (is intrinsic overloaded or not) + // - Intrinsic name + std::stable_sort(Defs.begin(), Defs.end(), + [](const std::unique_ptr &A, + const std::unique_ptr &B) { + auto ToTuple = [](const std::unique_ptr &I) { + return std::make_tuple(I->getGuard(), + (unsigned)I->getClassKind(), + I->getName()); + }; + return ToTuple(A) < ToTuple(B); + }); + + // Actually emit the intrinsic declarations. + for (auto &I : Defs) + I->emitIntrinsic(OS, Emitter, Kind); +} + void SVEEmitter::createHeader(raw_ostream &OS) { OS << "/*===---- arm_sve.h - ARM SVE intrinsics " "-----------------------------------===\n" @@ -1284,7 +1320,7 @@ void SVEEmitter::createHeader(raw_ostream &OS) { if (ShortForm) { OS << "__aio __attribute__((target(\"sve\"))) " << From.Type << " svreinterpret_" << From.Suffix; - OS << "(" << To.Type << " op) {\n"; + OS << "(" << To.Type << " op) __arm_streaming_compatible {\n"; OS << " return __builtin_sve_reinterpret_" << From.Suffix << "_" << To.Suffix << "(op);\n"; OS << "}\n\n"; @@ -1294,27 +1330,7 @@ void SVEEmitter::createHeader(raw_ostream &OS) { << To.Suffix << "(__VA_ARGS__)\n"; } - SmallVector, 128> Defs; - std::vector RV = Records.getAllDerivedDefinitions("Inst"); - for (auto *R : RV) - createIntrinsic(R, Defs); - - // Sort intrinsics in header file by following order/priority: - // - Architectural guard (i.e. does it require SVE2 or SVE2_AES) - // - Class (is intrinsic overloaded or not) - // - Intrinsic name - std::stable_sort( - Defs.begin(), Defs.end(), [](const std::unique_ptr &A, - const std::unique_ptr &B) { - auto ToTuple = [](const std::unique_ptr &I) { - return std::make_tuple(I->getGuard(), (unsigned)I->getClassKind(), I->getName()); - }; - return ToTuple(A) < ToTuple(B); - }); - - // Actually emit the intrinsic declarations. 
- for (auto &I : Defs) - I->emitIntrinsic(OS, *this); + createCoreHeaderIntrinsics(OS, *this, ACLEKind::SVE); OS << "#define svcvtnt_bf16_x svcvtnt_bf16_m\n"; OS << "#define svcvtnt_bf16_f32_x svcvtnt_bf16_f32_m\n"; @@ -1464,7 +1480,7 @@ void SVEEmitter::createTypeFlags(raw_ostream &OS) { } void SVEEmitter::createSMEHeader(raw_ostream &OS) { - OS << "/*===---- arm_sme_draft_spec_subject_to_change.h - ARM SME intrinsics " + OS << "/*===---- arm_sme.h - ARM SME intrinsics " "------===\n" " *\n" " *\n" @@ -1481,7 +1497,7 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) { OS << "#define __ARM_SME_H\n\n"; OS << "#if !defined(__LITTLE_ENDIAN__)\n"; - OS << "#error \"Big endian is currently not supported for arm_sme_draft_spec_subject_to_change.h\"\n"; + OS << "#error \"Big endian is currently not supported for arm_sme.h\"\n"; OS << "#endif\n"; OS << "#include \n\n"; @@ -1496,30 +1512,26 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) { OS << "extern \"C\" {\n"; OS << "#endif\n\n"; - SmallVector, 128> Defs; - std::vector RV = Records.getAllDerivedDefinitions("Inst"); - for (auto *R : RV) - createIntrinsic(R, Defs); + OS << "void __arm_za_disable(void) __arm_streaming_compatible;\n\n"; - // Sort intrinsics in header file by following order/priority similar to SVE: - // - Architectural guard - // - Class (is intrinsic overloaded or not) - // - Intrinsic name - std::stable_sort(Defs.begin(), Defs.end(), - [](const std::unique_ptr &A, - const std::unique_ptr &B) { - auto ToTuple = [](const std::unique_ptr &I) { - return std::make_tuple(I->getGuard(), - (unsigned)I->getClassKind(), - I->getName()); - }; - return ToTuple(A) < ToTuple(B); - }); + OS << "__ai bool __arm_has_sme(void) __arm_streaming_compatible {\n"; + OS << " uint64_t x0, x1;\n"; + OS << " __builtin_arm_get_sme_state(&x0, &x1);\n"; + OS << " return x0 & (1ULL << 63);\n"; + OS << "}\n\n"; - // Actually emit the intrinsic declaration. 
- for (auto &I : Defs) { - I->emitIntrinsic(OS, *this); - } + OS << "__ai bool __arm_in_streaming_mode(void) __arm_streaming_compatible " + "{\n"; + OS << " uint64_t x0, x1;\n"; + OS << " __builtin_arm_get_sme_state(&x0, &x1);\n"; + OS << " return x0 & 1;\n"; + OS << "}\n\n"; + + OS << "__ai __attribute__((target(\"sme\"))) void svundef_za(void) " + "__arm_streaming_compatible __arm_out(\"za\") " + "{ }\n\n"; + + createCoreHeaderIntrinsics(OS, *this, ACLEKind::SME); OS << "#ifdef __cplusplus\n"; OS << "} // extern \"C\"\n"; @@ -1624,6 +1636,76 @@ void SVEEmitter::createSMERangeChecks(raw_ostream &OS) { OS << "#endif\n\n"; } +void SVEEmitter::createBuiltinZAState(raw_ostream &OS) { + std::vector RV = Records.getAllDerivedDefinitions("Inst"); + SmallVector, 128> Defs; + for (auto *R : RV) + createIntrinsic(R, Defs); + + std::map> IntrinsicsPerState; + for (auto &Def : Defs) { + if (Def->isFlagSet(getEnumValueForFlag("IsInZA"))) + IntrinsicsPerState["ArmInZA"].insert(Def->getMangledName()); + else if (Def->isFlagSet(getEnumValueForFlag("IsOutZA"))) + IntrinsicsPerState["ArmOutZA"].insert(Def->getMangledName()); + else if (Def->isFlagSet(getEnumValueForFlag("IsInOutZA"))) + IntrinsicsPerState["ArmInOutZA"].insert(Def->getMangledName()); + } + + OS << "#ifdef GET_SME_BUILTIN_GET_STATE\n"; + for (auto &KV : IntrinsicsPerState) { + for (StringRef Name : KV.second) + OS << "case SME::BI__builtin_sme_" << Name << ":\n"; + OS << " return " << KV.first << ";\n"; + } + OS << "#endif\n\n"; +} + +void SVEEmitter::createStreamingAttrs(raw_ostream &OS, ACLEKind Kind) { + std::vector RV = Records.getAllDerivedDefinitions("Inst"); + SmallVector, 128> Defs; + for (auto *R : RV) + createIntrinsic(R, Defs); + + StringRef ExtensionKind; + switch (Kind) { + case ACLEKind::SME: + ExtensionKind = "SME"; + break; + case ACLEKind::SVE: + ExtensionKind = "SVE"; + break; + } + + OS << "#ifdef GET_" << ExtensionKind << "_STREAMING_ATTRS\n"; + + llvm::StringMap> StreamingMap; + + uint64_t IsStreamingFlag = getEnumValueForFlag("IsStreaming"); + uint64_t IsStreamingCompatibleFlag = + getEnumValueForFlag("IsStreamingCompatible"); + for (auto &Def : Defs) { + if (Def->isFlagSet(IsStreamingFlag)) + StreamingMap["ArmStreaming"].insert(Def->getMangledName()); + else if (Def->isFlagSet(IsStreamingCompatibleFlag)) + StreamingMap["ArmStreamingCompatible"].insert(Def->getMangledName()); + else + StreamingMap["ArmNonStreaming"].insert(Def->getMangledName()); + } + + for (auto BuiltinType : StreamingMap.keys()) { + for (auto Name : StreamingMap[BuiltinType]) { + OS << "case " << ExtensionKind << "::BI__builtin_" + << ExtensionKind.lower() << "_"; + OS << Name << ":\n"; + } + OS << " BuiltinType = " << BuiltinType << ";\n"; + OS << " break;\n"; + } + + OS << "#endif\n\n"; +} + namespace clang { void EmitSveHeader(RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createHeader(OS); @@ -1645,6 +1727,10 @@ void EmitSveTypeFlags(RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createTypeFlags(OS); } +void EmitSveStreamingAttrs(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createStreamingAttrs(OS, ACLEKind::SVE); +} + void EmitSmeHeader(RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createSMEHeader(OS); } @@ -1660,4 +1746,12 @@ void EmitSmeBuiltinCG(RecordKeeper &Records, raw_ostream &OS) { void EmitSmeRangeChecks(RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createSMERangeChecks(OS); } + +void EmitSmeStreamingAttrs(RecordKeeper &Records, raw_ostream &OS) { + 
SVEEmitter(Records).createStreamingAttrs(OS, ACLEKind::SME); +} + +void EmitSmeBuiltinZAState(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createBuiltinZAState(OS); +} } // End namespace clang diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp index 38215abd9d9b7b68ba88fc941c7b164c70c33a5a..558c901be53b59ddad453519b507deafec7e321c 100644 --- a/clang/utils/TableGen/TableGen.cpp +++ b/clang/utils/TableGen/TableGen.cpp @@ -35,7 +35,7 @@ enum ActionType { GenClangAttrSubjectMatchRuleList, GenClangAttrPCHRead, GenClangAttrPCHWrite, - GenClangAttrTokenKinds, + GenClangRegularKeywordAttributeInfo, GenClangAttrHasAttributeImpl, GenClangAttrSpellingListIndex, GenClangAttrASTVisitor, @@ -83,10 +83,13 @@ enum ActionType { GenArmSveBuiltinCG, GenArmSveTypeFlags, GenArmSveRangeChecks, + GenArmSveStreamingAttrs, GenArmSmeHeader, GenArmSmeBuiltins, GenArmSmeBuiltinCG, GenArmSmeRangeChecks, + GenArmSmeStreamingAttrs, + GenArmSmeBuiltinZAState, GenArmCdeHeader, GenArmCdeBuiltinDef, GenArmCdeBuiltinSema, @@ -136,8 +139,10 @@ cl::opt Action( "Generate clang PCH attribute reader"), clEnumValN(GenClangAttrPCHWrite, "gen-clang-attr-pch-write", "Generate clang PCH attribute writer"), - clEnumValN(GenClangAttrTokenKinds, "gen-clang-attr-token-kinds", - "Generate a list of attribute-related clang tokens"), + clEnumValN(GenClangRegularKeywordAttributeInfo, + "gen-clang-regular-keyword-attr-info", + "Generate a list of regular keyword attributes with info " + "about their arguments"), clEnumValN(GenClangAttrHasAttributeImpl, "gen-clang-attr-has-attribute-impl", "Generate a clang attribute spelling list"), @@ -233,6 +238,8 @@ cl::opt Action( "Generate arm_sve_typeflags.inc for clang"), clEnumValN(GenArmSveRangeChecks, "gen-arm-sve-sema-rangechecks", "Generate arm_sve_sema_rangechecks.inc for clang"), + clEnumValN(GenArmSveStreamingAttrs, "gen-arm-sve-streaming-attrs", + "Generate arm_sve_streaming_attrs.inc for clang"), clEnumValN(GenArmSmeHeader, "gen-arm-sme-header", "Generate arm_sme.h for clang"), clEnumValN(GenArmSmeBuiltins, "gen-arm-sme-builtins", @@ -241,6 +248,10 @@ cl::opt Action( "Generate arm_sme_builtin_cg_map.inc for clang"), clEnumValN(GenArmSmeRangeChecks, "gen-arm-sme-sema-rangechecks", "Generate arm_sme_sema_rangechecks.inc for clang"), + clEnumValN(GenArmSmeStreamingAttrs, "gen-arm-sme-streaming-attrs", + "Generate arm_sme_streaming_attrs.inc for clang"), + clEnumValN(GenArmSmeBuiltinZAState, "gen-arm-sme-builtin-za-state", + "Generate arm_sme_builtins_za_state.inc for clang"), clEnumValN(GenArmMveHeader, "gen-arm-mve-header", "Generate arm_mve.h for clang"), clEnumValN(GenArmMveBuiltinDef, "gen-arm-mve-builtin-def", @@ -269,11 +280,14 @@ cl::opt Action( "Generate riscv_vector_builtin_cg.inc for clang"), clEnumValN(GenRISCVVectorBuiltinSema, "gen-riscv-vector-builtin-sema", "Generate riscv_vector_builtin_sema.inc for clang"), - clEnumValN(GenRISCVSiFiveVectorBuiltins, "gen-riscv-sifive-vector-builtins", + clEnumValN(GenRISCVSiFiveVectorBuiltins, + "gen-riscv-sifive-vector-builtins", "Generate riscv_sifive_vector_builtins.inc for clang"), - clEnumValN(GenRISCVSiFiveVectorBuiltinCG, "gen-riscv-sifive-vector-builtin-codegen", + clEnumValN(GenRISCVSiFiveVectorBuiltinCG, + "gen-riscv-sifive-vector-builtin-codegen", "Generate riscv_sifive_vector_builtin_cg.inc for clang"), - clEnumValN(GenRISCVSiFiveVectorBuiltinSema, "gen-riscv-sifive-vector-builtin-sema", + clEnumValN(GenRISCVSiFiveVectorBuiltinSema, + "gen-riscv-sifive-vector-builtin-sema", "Generate 
riscv_sifive_vector_builtin_sema.inc for clang"), clEnumValN(GenAttrDocs, "gen-attr-docs", "Generate attribute documentation"), @@ -327,8 +341,8 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenClangAttrPCHWrite: EmitClangAttrPCHWrite(Records, OS); break; - case GenClangAttrTokenKinds: - EmitClangAttrTokenKinds(Records, OS); + case GenClangRegularKeywordAttributeInfo: + EmitClangRegularKeywordAttributeInfo(Records, OS); break; case GenClangAttrHasAttributeImpl: EmitClangAttrHasAttrImpl(Records, OS); @@ -472,6 +486,9 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenArmSveRangeChecks: EmitSveRangeChecks(Records, OS); break; + case GenArmSveStreamingAttrs: + EmitSveStreamingAttrs(Records, OS); + break; case GenArmSmeHeader: EmitSmeHeader(Records, OS); break; @@ -484,6 +501,12 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenArmSmeRangeChecks: EmitSmeRangeChecks(Records, OS); break; + case GenArmSmeStreamingAttrs: + EmitSmeStreamingAttrs(Records, OS); + break; + case GenArmSmeBuiltinZAState: + EmitSmeBuiltinZAState(Records, OS); + break; case GenArmCdeHeader: EmitCdeHeader(Records, OS); break; diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index 8265a531a98fb2a1e0957634ce01c8a826b05ad3..8b8e2668efa3757b26e9ed9f7fd88782df6378fe 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -43,8 +43,8 @@ void EmitClangAttrSubjectMatchRuleList(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangAttrPCHRead(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangAttrPCHWrite(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangAttrTokenKinds(llvm::RecordKeeper &Records, - llvm::raw_ostream &OS); +void EmitClangRegularKeywordAttributeInfo(llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); void EmitClangAttrHasAttrImpl(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangAttrSpellingListIndex(llvm::RecordKeeper &Records, @@ -102,11 +102,14 @@ void EmitSveBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSveBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSveTypeFlags(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSveRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSveStreamingAttrs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSmeHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSmeBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSmeBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSmeRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSmeStreamingAttrs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSmeBuiltinZAState(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitMveHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitMveBuiltinDef(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html index 23dc53b5a3cbc3e1520b1dfac9a09c3c003600a5..08ac5b9a5db2ea99feef3e76a64a07dabd5203e6 100755 --- a/clang/www/cxx_status.html +++ b/clang/www/cxx_status.html @@ -115,12 +115,7 @@ C++23, informally referred to as C++26.

Unevaluated strings P2361R6 - -
- Clang 17 (Partial) - Attributes arguments don't yet parse as unevaluated string literals. -
- + Clang 18 Add @, $, and ` to the basic character set diff --git a/compiler-rt/cmake/Modules/AddCompilerRT.cmake b/compiler-rt/cmake/Modules/AddCompilerRT.cmake index a72e279dd75e8fe76e65c9031d2425388d3b3273..5ed49f0f558814437ce95712047b64ea0e6980da 100644 --- a/compiler-rt/cmake/Modules/AddCompilerRT.cmake +++ b/compiler-rt/cmake/Modules/AddCompilerRT.cmake @@ -312,6 +312,10 @@ function(add_compiler_rt_runtime name type) set(COMPONENT_OPTION COMPONENT ${libname}) endif() + if(type STREQUAL "SHARED") + list(APPEND LIB_DEFS COMPILER_RT_SHARED_LIB) + endif() + if(type STREQUAL "OBJECT") if(CMAKE_C_COMPILER_ID MATCHES Clang AND CMAKE_C_COMPILER_TARGET) list(APPEND extra_cflags_${libname} "--target=${CMAKE_C_COMPILER_TARGET}") diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake index 9cf4877baf48953aba0dca47dbb3150a5ed02b8b..e91e3923a756c5352996f7055cceeb6cd2cacbcd 100644 --- a/compiler-rt/cmake/builtin-config-ix.cmake +++ b/compiler-rt/cmake/builtin-config-ix.cmake @@ -33,6 +33,12 @@ asm(\".arch armv8-a+lse\"); asm(\"cas w0, w1, [x2]\"); ") +builtin_check_c_compiler_source(COMPILER_RT_HAS_ASM_SME +" +asm(\".arch armv9-a+sme\"); +asm(\"smstart\"); +") + if(ANDROID) set(OS_NAME "Android") else() diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index d62fa0432e2a5a52774a043e0926c86dfa0850b2..cf376b4a021c319fe46db2c2b252b739cd32c99d 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -555,6 +555,13 @@ set(aarch64_SOURCES aarch64/fp_mode.c ) +if(COMPILER_RT_HAS_ASM_SME) + list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c) + message(STATUS "AArch64 SME ABI routines enabled") +else() + message(STATUS "AArch64 SME ABI routines disabled") +endif() + # Generate outline atomics helpers from lse.S base set(OA_HELPERS_DIR "${CMAKE_CURRENT_BINARY_DIR}/outline_atomic_helpers.dir") file(MAKE_DIRECTORY "${OA_HELPERS_DIR}") diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi-init.c b/compiler-rt/lib/builtins/aarch64/sme-abi-init.c new file mode 100644 index 0000000000000000000000000000000000000000..b6ee12170d56dbd4890445ef74d6e76249b66bdb --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/sme-abi-init.c @@ -0,0 +1,52 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +__attribute__((visibility("hidden"), nocommon)) +_Bool __aarch64_has_sme_and_tpidr2_el0; + +// We have multiple ways to check that the function has SME, depending on our +// target. +// * For Linux we can use __getauxval(). +// * For newlib we can use __aarch64_sme_accessible(). 
+ +#if defined(__linux__) + +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif + +#ifndef HWCAP2_SME +#define HWCAP2_SME (1 << 23) +#endif + +extern unsigned long int __getauxval (unsigned long int); + +static _Bool has_sme(void) { + return __getauxval(AT_HWCAP2) & HWCAP2_SME; +} + +#else // defined(__linux__) + +#if defined(COMPILER_RT_SHARED_LIB) +__attribute__((weak)) +#endif +extern _Bool __aarch64_sme_accessible(void); + +static _Bool has_sme(void) { +#if defined(COMPILER_RT_SHARED_LIB) + if (!__aarch64_sme_accessible) + return 0; +#endif + return __aarch64_sme_accessible(); +} + +#endif // defined(__linux__) + +#if __GNUC__ >= 9 +#pragma GCC diagnostic ignored "-Wprio-ctor-dtor" +#endif +__attribute__((constructor(90))) +static void init_aarch64_has_sme(void) { + __aarch64_has_sme_and_tpidr2_el0 = has_sme(); +} diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S new file mode 100644 index 0000000000000000000000000000000000000000..4c0ff66931db7b2ccd6fcb5c2131ff8250b3e85e --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S @@ -0,0 +1,186 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// This patch implements the support routines for the SME ABI, +// described here: +// https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#sme-support-routines + +#include "../assembly.h" + + +#if !defined(__APPLE__) +#define TPIDR2_SYMBOL SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0) +#define TPIDR2_SYMBOL_OFFSET :lo12:SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0) +#else +// MachO requires @page/@pageoff directives because the global is defined +// in a different file. Otherwise this file may fail to build. +#define TPIDR2_SYMBOL SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)@page +#define TPIDR2_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_has_sme_and_tpidr2_el0)@pageoff +#endif + +.arch armv9-a+sme + +// Utility function which calls a system's abort() routine. Because the function +// is streaming-compatible it should disable streaming-SVE mode before calling +// abort(). Note that there is no need to preserve any state before the call, +// because the function does not return. +DEFINE_COMPILERRT_PRIVATE_FUNCTION(do_abort) + .cfi_startproc + .variant_pcs SYMBOL_NAME(do_abort) + BTI_C + stp x29, x30, [sp, #-32]! + cntd x0 + // Store VG to a stack location that we describe with .cfi_offset + str x0, [sp, #16] + .cfi_def_cfa_offset 32 + .cfi_offset w30, -24 + .cfi_offset w29, -32 + .cfi_offset 46, -16 + bl __arm_sme_state + tbz x0, #0, 2f +1: + smstop sm +2: + // We can't make this into a tail-call because the unwinder would + // need to restore the value of VG. + bl SYMBOL_NAME(abort) + .cfi_endproc +END_COMPILERRT_FUNCTION(do_abort) + +// __arm_sme_state fills the result registers based on a local +// that is set as part of the compiler-rt startup code. 
+// __aarch64_has_sme_and_tpidr2_el0 +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state) + .variant_pcs __arm_sme_state + BTI_C + mov x0, xzr + mov x1, xzr + + adrp x16, TPIDR2_SYMBOL + ldrb w16, [x16, TPIDR2_SYMBOL_OFFSET] + cbz w16, 1f +0: + orr x0, x0, #0xC000000000000000 + mrs x16, SVCR + bfxil x0, x16, #0, #2 + mrs x1, TPIDR2_EL0 +1: + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_sme_state) + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore) + .variant_pcs __arm_tpidr2_restore + BTI_C + // If TPIDR2_EL0 is nonnull, the subroutine aborts in some platform-specific + // manner. + mrs x14, TPIDR2_EL0 + cbnz x14, 2f + + // If any of the reserved bytes in the first 16 bytes of BLK are nonzero, + // the subroutine [..] aborts in some platform-defined manner. + ldrh w14, [x0, #10] + cbnz w14, 2f + ldr w14, [x0, #12] + cbnz w14, 2f + + // If BLK.za_save_buffer is NULL, the subroutine does nothing. + ldr x16, [x0] + cbz x16, 1f + + // If BLK.num_za_save_slices is zero, the subroutine does nothing. + ldrh w14, [x0, #8] + cbz x14, 1f + + mov x15, xzr +0: + ldr za[w15,0], [x16] + addsvl x16, x16, #1 + add x15, x15, #1 + cmp x14, x15 + b.ne 0b +1: + ret +2: + b SYMBOL_NAME(do_abort) +END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_restore) + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save) + .variant_pcs __arm_tpidr2_save + BTI_C + // If the current thread does not have access to TPIDR2_EL0, the subroutine + // does nothing. + adrp x14, TPIDR2_SYMBOL + ldrb w14, [x14, TPIDR2_SYMBOL_OFFSET] + cbz w14, 1f + + // If TPIDR2_EL0 is null, the subroutine does nothing. + mrs x16, TPIDR2_EL0 + cbz x16, 1f + + // If any of the reserved bytes in the first 16 bytes of the TPIDR2 block are + // nonzero, the subroutine [..] aborts in some platform-defined manner. + ldrh w14, [x16, #10] + cbnz w14, 2f + ldr w14, [x16, #12] + cbnz w14, 2f + + // If num_za_save_slices is zero, the subroutine does nothing. + ldrh w14, [x16, #8] + cbz x14, 1f + + // If za_save_buffer is NULL, the subroutine does nothing. + ldr x16, [x16] + cbz x16, 1f + + mov x15, xzr +0: + str za[w15,0], [x16] + addsvl x16, x16, #1 + add x15, x15, #1 + cmp x14, x15 + b.ne 0b +1: + ret +2: + b SYMBOL_NAME(do_abort) +END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_save) + +DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable) + .variant_pcs __arm_za_disable + BTI_C + // If the current thread does not have access to SME, the subroutine does + // nothing. + adrp x14, TPIDR2_SYMBOL + ldrb w14, [x14, TPIDR2_SYMBOL_OFFSET] + cbz w14, 0f + + // Otherwise, the subroutine behaves as if it did the following: + // * Call __arm_tpidr2_save. + stp x29, x30, [sp, #-16]! + .cfi_def_cfa_offset 16 + mov x29, sp + .cfi_def_cfa w29, 16 + .cfi_offset w30, -8 + .cfi_offset w29, -16 + bl __arm_tpidr2_save + + // * Set TPIDR2_EL0 to null. + msr TPIDR2_EL0, xzr + + // * Set PSTATE.ZA to 0. 
+ smstop za + + .cfi_def_cfa wsp, 16 + ldp x29, x30, [sp], #16 + .cfi_def_cfa_offset 0 + .cfi_restore w30 + .cfi_restore w29 +0: + ret +END_COMPILERRT_OUTLINE_FUNCTION(__arm_za_disable) + +NO_EXEC_STACK_DIRECTIVE + +// GNU property note for BTI and PAC +GNU_PROPERTY_BTI_PAC diff --git a/llvm/docs/AArch64SME.rst b/llvm/docs/AArch64SME.rst index 63573bf91eacb56686a63c363fa127c508120ce5..b5a01cb204b812bfa8aaa813f540910a8580a9db 100644 --- a/llvm/docs/AArch64SME.rst +++ b/llvm/docs/AArch64SME.rst @@ -22,26 +22,32 @@ Below we describe the LLVM IR attributes and their relation to the C/C++ level ACLE attributes: ``aarch64_pstate_sm_enabled`` - is used for functions with ``__attribute__((arm_streaming))`` + is used for functions with ``__arm_streaming`` ``aarch64_pstate_sm_compatible`` - is used for functions with ``__attribute__((arm_streaming_compatible))`` + is used for functions with ``__arm_streaming_compatible`` ``aarch64_pstate_sm_body`` - is used for functions with ``__attribute__((arm_locally_streaming))`` and is + is used for functions with ``__arm_locally_streaming`` and is only valid on function definitions (not declarations) -``aarch64_pstate_za_new`` - is used for functions with ``__attribute__((arm_new_za))`` +``aarch64_new_za`` + is used for functions with ``__arm_new("za")`` -``aarch64_pstate_za_shared`` - is used for functions with ``__attribute__((arm_shared_za))`` +``aarch64_in_za`` + is used for functions with ``__arm_in("za")`` -``aarch64_pstate_za_preserved`` - is used for functions with ``__attribute__((arm_preserves_za))`` +``aarch64_out_za`` + is used for functions with ``__arm_out("za")`` + +``aarch64_inout_za`` + is used for functions with ``__arm_inout("za")`` + +``aarch64_preserves_za`` + is used for functions with ``__arm_preserves("za")`` ``aarch64_expanded_pstate_za`` - is used for functions with ``__attribute__((arm_new_za))`` + is used for functions with ``__arm_new_za`` Clang must ensure that the above attributes are added both to the function's declaration/definition as well as to their call-sites. This is @@ -89,11 +95,10 @@ Restrictions on attributes * It is not allowed for a function to be decorated with both ``aarch64_pstate_sm_compatible`` and ``aarch64_pstate_sm_enabled``. -* It is not allowed for a function to be decorated with both - ``aarch64_pstate_za_new`` and ``aarch64_pstate_za_preserved``. - -* It is not allowed for a function to be decorated with both - ``aarch64_pstate_za_new`` and ``aarch64_pstate_za_shared``. +* It is not allowed for a function to be decorated with more than one of the + following attributes: + ``aarch64_new_za``, ``aarch64_in_za``, ``aarch64_out_za``, ``aarch64_inout_za``, + ``aarch64_preserves_za``. These restrictions also apply in the higher level SME ACLE, which means we can emit diagnostics in Clang to signal users about incorrect behaviour. @@ -426,7 +431,7 @@ to toggle PSTATE.ZA using intrinsics. This also makes it simpler to setup a lazy-save mechanism for calls to private-ZA functions (i.e. functions that may either directly or indirectly clobber ZA state). -For the purpose of handling functions marked with ``aarch64_pstate_za_new``, +For the purpose of handling functions marked with ``aarch64_new_za``, we have introduced a new LLVM IR pass (SMEABIPass) that is run just before SelectionDAG. Any such functions dealt with by this pass are marked with ``aarch64_expanded_pstate_za``. 
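To make the attribute mapping above concrete, here is a minimal C sketch (illustrative only, not part of the patch; it assumes a Clang build with SME ACLE support, and the function names are hypothetical). The comment on each declaration names the LLVM IR attribute the table above maps it to:

/* Hedged sketch of the ACLE keywords and the IR attributes they lower to. */
void f_streaming(void) __arm_streaming;             /* "aarch64_pstate_sm_enabled" */
void f_compat(void) __arm_streaming_compatible;     /* "aarch64_pstate_sm_compatible" */
__arm_locally_streaming void f_local(void) {}       /* "aarch64_pstate_sm_body" (definitions only) */
__arm_new("za") void f_new_za(void) {}              /* "aarch64_new_za" */
void f_in_za(void) __arm_in("za");                  /* "aarch64_in_za" */
void f_out_za(void) __arm_out("za");                /* "aarch64_out_za" */
void f_inout_za(void) __arm_inout("za");            /* "aarch64_inout_za" */
void f_preserves_za(void) __arm_preserves("za");    /* "aarch64_preserves_za" */

Per the restrictions listed above, at most one of the "za" state keywords may appear on a given function, which is exactly what the Verifier change below enforces at the IR level.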
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 701cd6e5dbb6f7ba26fd643b3c702a57ca0fd92c..2f4d01c4cddd67bfe1e8d80387eca5ccd3d53473 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -4991,10 +4991,13 @@ AArch64: offsets). (However, LLVM currently does this for the ``m`` constraint as well.) - ``r``: A 32 or 64-bit integer register (W* or X*). +- ``Uci``: Like r, but restricted to registers 8 to 11 inclusive. +- ``Ucj``: Like r, but restricted to registers 12 to 15 inclusive. - ``w``: A 32, 64, or 128-bit floating-point, SIMD or SVE vector register. - ``x``: Like w, but restricted to registers 0 to 15 inclusive. - ``y``: Like w, but restricted to SVE vector registers Z0 to Z7 inclusive. -- ``Upl``: One of the low eight SVE predicate registers (P0 to P7) +- ``Uph``: One of the upper eight SVE predicate registers (P8 to P15) +- ``Upl``: One of the lower eight SVE predicate registers (P0 to P7) - ``Upa``: Any of the SVE predicate registers (P0 to P15) AMDGPU: diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 557063c8813268e4fba13df2e30b10634e206afa..21d4b9062206acf108e3bd995310b2c345936e94 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -2679,10 +2679,10 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sme_st1q_vert : SME_Load_Store_Intrinsic; // Spill + fill - def int_aarch64_sme_ldr : DefaultAttrsIntrinsic< - [], [llvm_i32_ty, llvm_ptr_ty]>; - def int_aarch64_sme_str : DefaultAttrsIntrinsic< - [], [llvm_i32_ty, llvm_ptr_ty]>; + class SME_LDR_STR_ZA_Intrinsic + : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty]>; + def int_aarch64_sme_ldr : SME_LDR_STR_ZA_Intrinsic; + def int_aarch64_sme_str : SME_LDR_STR_ZA_Intrinsic; class SME_TileToVector_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], @@ -3438,4 +3438,9 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sve_sel_x2 : SVE2_VG2_Sel_Intrinsic; def int_aarch64_sve_sel_x4 : SVE2_VG4_Sel_Intrinsic; + class SME_LDR_STR_ZT_Intrinsic + : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty]>; + def int_aarch64_sme_ldr_zt : SME_LDR_STR_ZT_Intrinsic; + def int_aarch64_sme_str_zt : SME_LDR_STR_ZT_Intrinsic; + } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 1408ce293ca6544716a5e3cb4aaa03df2226d555..438b4f49c75a226fae4f6f83839a5856779d9ce1 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2116,17 +2116,13 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, V); } - if (Attrs.hasFnAttr("aarch64_pstate_za_new")) { - Check(!Attrs.hasFnAttr("aarch64_pstate_za_preserved"), - "Attributes 'aarch64_pstate_za_new and aarch64_pstate_za_preserved' " - "are incompatible!", - V); - - Check(!Attrs.hasFnAttr("aarch64_pstate_za_shared"), - "Attributes 'aarch64_pstate_za_new and aarch64_pstate_za_shared' " - "are incompatible!", - V); - } + Check((Attrs.hasFnAttr("aarch64_new_za") + Attrs.hasFnAttr("aarch64_in_za") + + Attrs.hasFnAttr("aarch64_inout_za") + + Attrs.hasFnAttr("aarch64_out_za") + + Attrs.hasFnAttr("aarch64_preserves_za")) <= 1, + "Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', " + "'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive", + V); if (Attrs.hasFnAttr(Attribute::JumpTable)) { const GlobalValue *GV = cast(V); diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 
76f1cc782b248888d598a5ba3efc3a255f505745..b090efe3bb3411aa63cccc9e4f5ae00dea6e5896 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -970,6 +970,8 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, RegClass = &AArch64::ZPRRegClass; } else if (AArch64::PPRRegClass.contains(Reg)) { RegClass = &AArch64::PPRRegClass; + } else if (AArch64::PNRRegClass.contains(Reg)) { + RegClass = &AArch64::PNRRegClass; } else { RegClass = &AArch64::FPR128RegClass; AltName = AArch64::vreg; diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index dcb73ae2dce2b59a1010e8bdabb123b3e6851e52..fe392ddcf057da85d86376bd9a90902b123334ff 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1000,10 +1000,12 @@ AArch64ExpandPseudo::expandCondSMToggle(MachineBasicBlock &MBB, // expected value for the callee (0 for a normal callee and 1 for a streaming // callee). auto PStateSM = MI.getOperand(2).getReg(); + auto TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); + unsigned SMReg32 = TRI->getSubReg(PStateSM, AArch64::sub_32); bool IsStreamingCallee = MI.getOperand(3).getImm(); - unsigned Opc = IsStreamingCallee ? AArch64::TBZX : AArch64::TBNZX; + unsigned Opc = IsStreamingCallee ? AArch64::TBZW : AArch64::TBNZW; MachineInstrBuilder Tbx = - BuildMI(MBB, MBBI, DL, TII->get(Opc)).addReg(PStateSM).addImm(0); + BuildMI(MBB, MBBI, DL, TII->get(Opc)).addReg(SMReg32).addImm(0); // Split MBB and create two new blocks: // - MBB now contains all instructions before MSRcond_pstatesvcrImm1. @@ -1481,17 +1483,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated. 
return true; } - case AArch64::OBSCURE_COPY: { - if (MI.getOperand(0).getReg() != MI.getOperand(1).getReg()) { - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs)) - .add(MI.getOperand(0)) - .addReg(AArch64::XZR) - .add(MI.getOperand(1)) - .addImm(0); - } + case AArch64::COALESCER_BARRIER_FPR16: + case AArch64::COALESCER_BARRIER_FPR32: + case AArch64::COALESCER_BARRIER_FPR64: + case AArch64::COALESCER_BARRIER_FPR128: MI.eraseFromParent(); return true; - } } return false; } diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 1ae3709e9588abbf3c4541b777902a596bdbd4b3..b2c46939e58469ac1508c1fabf92e072fe8bf3b5 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -5187,8 +5187,8 @@ FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) { SMEAttrs CallerAttrs(*FuncInfo.Fn); - if (CallerAttrs.hasZAState() || - (!CallerAttrs.hasStreamingInterface() && CallerAttrs.hasStreamingBody())) + if (CallerAttrs.hasZAState() || CallerAttrs.hasStreamingInterfaceOrBody() || + CallerAttrs.hasStreamingCompatibleInterface()) return nullptr; return new AArch64FastISel(FuncInfo, LibInfo); } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 4d5676f341017270c556407795a2b5b209b0692e..e8071bd7171e0bcea937df69c2317e498d14a8f5 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -427,6 +427,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + // Win64 EH requires a frame pointer if funclets are present, as the locals // are accessed off the frame pointer in both the parent function and the // funclets. @@ -2977,11 +2978,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( return MIB->getIterator(); }; - // SVE objects are always restored in reverse order. 
- for (const RegPairInfo &RPI : reverse(RegPairs)) - if (RPI.isScalable()) - EmitMI(RPI); - if (homogeneousPrologEpilog(MF, &MBB)) { auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog)) .setMIFlag(MachineInstr::FrameDestroy); @@ -2992,11 +2988,19 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( return true; } + // For performance reasons, restore SVE registers in increasing order + auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; }; + auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR); + auto PPREnd = std::find_if_not(PPRBegin, RegPairs.end(), IsPPR); + std::reverse(PPRBegin, PPREnd); + auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; }; + auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR); + auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR); + std::reverse(ZPRBegin, ZPREnd); + if (ReverseCSRRestoreSeq) { MachineBasicBlock::iterator First = MBB.end(); for (const RegPairInfo &RPI : reverse(RegPairs)) { MachineBasicBlock::iterator It = EmitMI(RPI); if (First == MBB.end()) First = It; @@ -3005,8 +3009,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( MBB.splice(MBBI, &MBB, First); } else { for (const RegPairInfo &RPI : RegPairs) { (void)EmitMI(RPI); } } @@ -3252,6 +3254,12 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots( bool AArch64FrameLowering::enableStackSlotScavenging( const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + // If the function has streaming-mode changes, don't scavenge a + // spillslot in the callee-save area, as that might require an + // 'addvl' in the streaming-mode-changing call-sequence when the + // function doesn't use a FP. 
+ if (AFI->hasStreamingModeChanges() && !hasFP(MF)) + return false; return AFI->hasCalleeSaveStackFreeSpace(); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0cc5e7fc5cc3398e1de637aa298e699adbf1d790..b7c857b72a54552c9394c9194490743c29dcc0a0 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -31,6 +31,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ObjCARCUtil.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" @@ -2276,7 +2277,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((AArch64ISD::NodeType)Opcode) { case AArch64ISD::FIRST_NUMBER: break; - MAKE_CASE(AArch64ISD::OBSCURE_COPY) + MAKE_CASE(AArch64ISD::COALESCER_BARRIER) MAKE_CASE(AArch64ISD::SMSTART) MAKE_CASE(AArch64ISD::SMSTOP) MAKE_CASE(AArch64ISD::RESTORE_ZA) @@ -2347,6 +2348,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::FCMP) MAKE_CASE(AArch64ISD::STRICT_FCMP) MAKE_CASE(AArch64ISD::STRICT_FCMPE) + MAKE_CASE(AArch64ISD::SME_ZA_LDR) + MAKE_CASE(AArch64ISD::SME_ZA_STR) MAKE_CASE(AArch64ISD::DUP) MAKE_CASE(AArch64ISD::DUPLANE8) MAKE_CASE(AArch64ISD::DUPLANE16) @@ -4759,17 +4762,9 @@ static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask); } -SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain, - SMEAttrs Attrs, SDLoc DL, - EVT VT) const { - if (Attrs.hasStreamingInterfaceOrBody()) - return DAG.getConstant(1, DL, VT); - - if (Attrs.hasNonStreamingInterfaceAndBody()) - return DAG.getConstant(0, DL, VT); - - assert(Attrs.hasStreamingCompatibleInterface() && "Unexpected interface"); - +SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG, + SDValue Chain, SDLoc DL, + EVT VT) const { SDValue Callee = DAG.getExternalSymbol("__arm_sme_state", getPointerTy(DAG.getDataLayout())); Type *Int64Ty = Type::getInt64Ty(*DAG.getContext()); @@ -4785,15 +4780,88 @@ SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain, Mask); } -static std::optional getCalleeAttrsFromExternalFunction(SDValue V) { - if (auto *ES = dyn_cast(V)) { - StringRef S(ES->getSymbol()); - if (S == "__arm_sme_state" || S == "__arm_tpidr2_save") - return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Preserved); - if (S == "__arm_tpidr2_restore") - return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Shared); +// Lower an SME LDR/STR ZA intrinsic +// Case 1: If the vector number (vecnum) is an immediate in range, it gets +// folded into the instruction +// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11] +// Case 2: If the vecnum is not an immediate, then it is used to modify the base +// and tile slice registers +// ldr(%tileslice, %ptr, %vecnum) +// -> +// %svl = rdsvl +// %ptr2 = %ptr + %svl * %vecnum +// %tileslice2 = %tileslice + %vecnum +// ldr [%tileslice2, 0], [%ptr2, 0] +// Case 3: If the vecnum is an immediate out of range, then the same is done as +// case 2, but the base and slice registers are modified by the greatest +// multiple of 15 lower than the vecnum and the remainder is folded into the +// instruction. 
This means that successive loads and stores that are offset from +// each other can share the same base and tile slice register updates. +// ldr(%tileslice, %ptr, 22) +// ldr(%tileslice, %ptr, 23) +// -> +// %svl = rdsvl +// %ptr2 = %ptr + %svl * 16 +// %tileslice2 = %tileslice + 16 +// ldr [%tileslice2, 6], [%ptr2, 6] +// ldr [%tileslice2, 7], [%ptr2, 7] +// Case 4: If the vecnum is an add of an immediate, then the non-immediate +// operand and the immediate can be folded into the instruction, like case 2. +// ldr(%tileslice, %ptr, %vecnum + 7) +// ldr(%tileslice, %ptr, %vecnum + 8) +// -> +// %svl = rdsvl +// %ptr2 = %ptr + %svl * %vecnum +// %tileslice2 = %tileslice + %vecnum +// ldr [%tileslice2, 7], [%ptr2, 7] +// ldr [%tileslice2, 8], [%ptr2, 8] +// Case 5: The vecnum being an add of an immediate out of range is also handled, +// in which case the same remainder logic as case 3 is used. +SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) { + SDLoc DL(N); + + SDValue TileSlice = N->getOperand(2); + SDValue Base = N->getOperand(3); + SDValue VecNum = N->getOperand(4); + int32_t ConstAddend = 0; + SDValue VarAddend = VecNum; + + // If the vnum is an add of an immediate, we can fold it into the instruction + if (VecNum.getOpcode() == ISD::ADD && + isa<ConstantSDNode>(VecNum.getOperand(1))) { + ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue(); + VarAddend = VecNum.getOperand(0); + } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) { + ConstAddend = ImmNode->getSExtValue(); + VarAddend = SDValue(); + } + + int32_t ImmAddend = ConstAddend % 16; + if (int32_t C = (ConstAddend - ImmAddend)) { + SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32); + VarAddend = VarAddend + ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal}) + : CVal; + } + + if (VarAddend) { + // Get the vector length that will be multiplied by vnum + auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); + + // Multiply SVL and vnum then add it to the base + SDValue Mul = DAG.getNode( + ISD::MUL, DL, MVT::i64, + {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)}); + Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul}); + // Just add vnum to the tileslice + TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend}); } - return std::nullopt; + + return DAG.getNode(IsLoad ?
AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR, + DL, MVT::Other, + {/*Chain=*/N.getOperand(0), TileSlice, Base, + DAG.getTargetConstant(ImmAddend, DL, MVT::i32)}); } SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op, @@ -4819,6 +4887,10 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain, DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr); } + case Intrinsic::aarch64_sme_str: + case Intrinsic::aarch64_sme_ldr: { + return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr); + } case Intrinsic::aarch64_sme_za_enable: return DAG.getNode( AArch64ISD::SMSTART, DL, MVT::Other, @@ -6283,9 +6355,25 @@ AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI); + // Set the reserved bytes (10-15) to zero + EVT PtrTy = Ptr.getValueType(); + SDValue ReservedPtr = + DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy)); + Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr, + MPI); + ReservedPtr = + DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy)); + Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr, + MPI); + return TPIDR2Obj; } +static bool isPassedInFPR(EVT VT) { + return VT.isFixedLengthVector() || + (VT.isFloatingPoint() && !VT.isScalableVector()); +} + SDValue AArch64TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, @@ -6420,6 +6508,13 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // This will be the new Chain/Root node. ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue); Glue = ArgValue.getValue(2); + if (isPassedInFPR(ArgValue.getValueType())) { + ArgValue = + DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, + DAG.getVTList(ArgValue.getValueType(), MVT::Glue), + {ArgValue, Glue}); + Glue = ArgValue.getValue(1); + } } else ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); @@ -6579,12 +6674,19 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // make sure it is Glued to the last CopyFromReg value. if (IsLocallyStreaming) { const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); - Chain = DAG.getNode( - AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), - {DAG.getRoot(), - DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64), - DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()), Glue}); + SDValue PStateSM; + if (Attrs.hasStreamingCompatibleInterface()) { + PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64); + Register Reg = MF.getRegInfo().createVirtualRegister( + getRegClassFor(PStateSM.getValueType().getSimpleVT())); + FuncInfo->setPStateSMReg(Reg); + Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM); + } else { + PStateSM = DAG.getConstant(0, DL, MVT::i64); + } + Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue, PStateSM, + /*Entry*/ true); + // Ensure that the SMSTART happens after the CopyWithChain such that its // chain result is used. for (unsigned I=0; I &RVLocs, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, - SDValue ThisVal) const { + SDValue ThisVal, bool RequiresSMChange) const { DenseMap CopiedRegs; // Copy all of the result registers out of their specified physreg. 
for (unsigned i = 0; i != RVLocs.size(); ++i) { @@ -6830,6 +6932,10 @@ SDValue AArch64TargetLowering::LowerCallResult( break; } + if (RequiresSMChange && isPassedInFPR(VA.getValVT())) + Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, Val.getValueType(), + Val); + InVals.push_back(Val); } @@ -6926,7 +7032,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( SMEAttrs CallerAttrs(MF.getFunction()); auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal); if (CallerAttrs.requiresSMChange(CalleeAttrs) || - CallerAttrs.requiresLazySave(CalleeAttrs)) + CallerAttrs.requiresLazySave(CalleeAttrs) || + CallerAttrs.hasStreamingBody()) return false; // Functions using the C or Fast calling convention that have an SVE signature @@ -7109,9 +7216,55 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) { return ZExtBool; } +void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, + SDNode *Node) const { + // Live-in physreg copies that are glued to SMSTART are applied as + // implicit-def's in the InstrEmitter. Here we remove them so that the + // register allocator can pass call arguments in callee-saved registers + // without inserting extra copies to work around these fake clobbers of + // GPRs that are in fact preserved. + if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 || + MI.getOpcode() == AArch64::MSRpstatePseudo) { + for (unsigned I = MI.getNumOperands() - 1; I > 0; --I) + if (MachineOperand &MO = MI.getOperand(I); + MO.isReg() && MO.isImplicit() && MO.isDef() && + (AArch64::GPR32RegClass.contains(MO.getReg()) || + AArch64::GPR64RegClass.contains(MO.getReg()))) + MI.removeOperand(I); + + // The SVE vector length can change when entering/leaving streaming mode. + if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM || + MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) { + MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false, + /*IsImplicit=*/true)); + MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true, + /*IsImplicit=*/true)); + } + } + + // Add an implicit use of 'VG' for ADDXri/SUBXri. These instructions have + // nothing to do with VG as such, but they may be used to materialise a + // frame address; if that address involves a frame index to a scalable + // vector, materialising it will likely require an ADDVL instruction, which + // reads VG.
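+  // For example (illustrative, assuming a scalable stack object), the frame
+  // address may be materialised as:
+  //   add   x0, sp, #16   // ADDXri: fixed part of the offset
+  //   addvl x0, x0, #1    // scalable part of the offset; reads VG
+  // The implicit VG use added below models that dependency, so the
+  // materialisation cannot drift across an smstart/smstop, where VG changes.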
+ const MachineFunction &MF = *MI.getMF(); + if (MF.getInfo()->hasStreamingModeChanges() && + (MI.getOpcode() == AArch64::ADDXri || + MI.getOpcode() == AArch64::SUBXri)) { + const MachineOperand &MO = MI.getOperand(1); + if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) == + TargetStackID::ScalableVector) + MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false, + /*IsImplicit=*/true)); + } +} + SDValue AArch64TargetLowering::changeStreamingMode( SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, SDValue PStateSM, bool Entry) const { + MachineFunction &MF = DAG.getMachineFunction(); + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + FuncInfo->setHasStreamingModeChanges(true); + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()); SDValue MSROp = @@ -7258,39 +7411,68 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction()); if (CLI.CB) CalleeAttrs = SMEAttrs(*CLI.CB); - else if (std::optional Attrs = - getCalleeAttrsFromExternalFunction(CLI.Callee)) - CalleeAttrs = *Attrs; + else if (auto *ES = dyn_cast(CLI.Callee)) + CalleeAttrs = SMEAttrs(ES->getSymbol()); + + auto DescribeCallsite = + [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & { + R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '"; + if (auto *ES = dyn_cast(CLI.Callee)) + R << ore::NV("Callee", ES->getSymbol()); + else if (CLI.CB && CLI.CB->getCalledFunction()) + R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName()); + else + R << "unknown callee"; + R << "'"; + return R; + }; bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs); - - MachineFrameInfo &MFI = MF.getFrameInfo(); if (RequiresLazySave) { - // Set up a lazy save mechanism by storing the runtime live slices - // (worst-case N*N) to the TPIDR2 stack object. - SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, - DAG.getConstant(1, DL, MVT::i32)); - SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N); unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj(); - MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - SDValue BufferPtrAddr = + SDValue NumZaSaveSlicesAddr = DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType())); - Chain = DAG.getTruncStore(Chain, DL, NN, BufferPtrAddr, MPI, MVT::i16); + SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); + Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr, + MPI, MVT::i16); Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), TPIDR2ObjAddr); + OptimizationRemarkEmitter ORE(&MF.getFunction()); + ORE.emit([&]() { + auto R = CLI.CB ? 
OptimizationRemarkAnalysis("sme", "SMELazySaveZA", + CLI.CB) + : OptimizationRemarkAnalysis("sme", "SMELazySaveZA", + &MF.getFunction()); + return DescribeCallsite(R) << " sets up a lazy save for ZA"; + }); } SDValue PStateSM; - std::optional RequiresSMChange = - CallerAttrs.requiresSMChange(CalleeAttrs); - if (RequiresSMChange) - PStateSM = getPStateSM(DAG, Chain, CallerAttrs, DL, MVT::i64); + bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs); + if (RequiresSMChange) { + if (CallerAttrs.hasStreamingInterfaceOrBody()) + PStateSM = DAG.getConstant(1, DL, MVT::i64); + else if (CallerAttrs.hasNonStreamingInterface()) + PStateSM = DAG.getConstant(0, DL, MVT::i64); + else + PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64); + OptimizationRemarkEmitter ORE(&MF.getFunction()); + ORE.emit([&]() { + auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition", + CLI.CB) + : OptimizationRemarkAnalysis("sme", "SMETransition", + &MF.getFunction()); + DescribeCallsite(R) << " requires a streaming mode transition"; + return R; + }); + } // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass @@ -7386,6 +7568,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext()); Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); + MachineFrameInfo &MFI = MF.getFrameInfo(); int FI = MFI.CreateStackObject(StoreSize, Alignment, false); if (isScalable) MFI.setStackID(FI, TargetStackID::ScalableVector); @@ -7459,8 +7642,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Add an extra level of indirection for streaming mode changes by // using a pseudo copy node that cannot be rematerialised between a // smstart/smstop and the call by the simple register coalescer. - if (RequiresSMChange && isa(Arg)) - Arg = DAG.getNode(AArch64ISD::OBSCURE_COPY, DL, MVT::i64, Arg); + if (RequiresSMChange && isPassedInFPR(Arg.getValueType())) + Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, + Arg.getValueType(), Arg); RegsToPass.emplace_back(VA.getLocReg(), Arg); RegsUsed.insert(VA.getLocReg()); const TargetOptions &Options = DAG.getTarget().Options; @@ -7551,8 +7735,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue InGlue; if (RequiresSMChange) { - SDValue NewChain = changeStreamingMode(DAG, DL, *RequiresSMChange, Chain, - InGlue, PStateSM, true); + SDValue NewChain = + changeStreamingMode(DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, + InGlue, PStateSM, true); Chain = NewChain.getValue(0); InGlue = NewChain.getValue(1); } @@ -7693,17 +7878,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Handle result values, copying them out of physregs into vregs that we // return. - SDValue Result = LowerCallResult(Chain, InGlue, CallConv, IsVarArg, RVLocs, - DL, DAG, InVals, IsThisReturn, - IsThisReturn ? OutVals[0] : SDValue()); + SDValue Result = LowerCallResult( + Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn, + IsThisReturn ? 
OutVals[0] : SDValue(), RequiresSMChange); if (!Ins.empty()) InGlue = Result.getValue(Result->getNumValues() - 1); if (RequiresSMChange) { assert(PStateSM && "Expected a PStateSM to be set"); - Result = changeStreamingMode(DAG, DL, !*RequiresSMChange, Result, InGlue, - PStateSM, false); + Result = changeStreamingMode(DAG, DL, !CalleeAttrs.hasStreamingInterface(), + Result, InGlue, PStateSM, false); } if (RequiresLazySave) { @@ -7729,12 +7914,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue TPIDR2Block = DAG.getFrameIndex( FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); - Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, - {Result, TPIDR2_EL0, - DAG.getRegister(AArch64::X0, MVT::i64), - RestoreRoutine, - RegMask, - Result.getValue(1)}); + Result = + DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, + {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64), + RestoreRoutine, RegMask, Result.getValue(1)}); // Finally reset the TPIDR2_EL0 register to 0. Result = DAG.getNode( @@ -7838,16 +8021,26 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Emit SMSTOP before returning from a locally streaming function SMEAttrs FuncAttrs(MF.getFunction()); if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) { - Chain = DAG.getNode( - AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, - DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32), - DAG.getConstant(1, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64), - DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask())); + if (FuncAttrs.hasStreamingCompatibleInterface()) { + Register Reg = FuncInfo->getPStateSMReg(); + assert(Reg.isValid() && "PStateSM Register is invalid"); + SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64); + Chain = + changeStreamingMode(DAG, DL, /*Enable*/ false, Chain, + /*Glue*/ SDValue(), PStateSM, /*Entry*/ false); + } else + Chain = changeStreamingMode( + DAG, DL, /*Enable*/ false, Chain, + /*Glue*/ SDValue(), DAG.getConstant(1, DL, MVT::i64), /*Entry*/ true); Glue = Chain.getValue(1); } SmallVector RetOps(1, Chain); for (auto &RetVal : RetVals) { + if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() && + isPassedInFPR(RetVal.second.getValueType())) + RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, + RetVal.second.getValueType(), RetVal.second); Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue); Glue = Chain.getValue(1); RetOps.push_back( @@ -9950,19 +10143,60 @@ const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { return "r"; } -enum PredicateConstraint { - Upl, - Upa, - Invalid -}; +enum class PredicateConstraint { Uph, Upl, Upa }; + +static std::optional +parsePredicateConstraint(StringRef Constraint) { + return StringSwitch>(Constraint) + .Case("Uph", PredicateConstraint::Uph) + .Case("Upl", PredicateConstraint::Upl) + .Case("Upa", PredicateConstraint::Upa) + .Default(std::nullopt); +} + +static const TargetRegisterClass * +getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) { + if (VT != MVT::aarch64svcount && + (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)) + return nullptr; -static PredicateConstraint parsePredicateConstraint(StringRef Constraint) { - PredicateConstraint P = PredicateConstraint::Invalid; - if (Constraint == "Upa") - P = PredicateConstraint::Upa; - if (Constraint == "Upl") 
- P = PredicateConstraint::Upl; - return P; + switch (Constraint) { + case PredicateConstraint::Uph: + return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass + : &AArch64::PPR_p8to15RegClass; + case PredicateConstraint::Upl: + return VT == MVT::aarch64svcount ? nullptr : &AArch64::PPR_3bRegClass; + case PredicateConstraint::Upa: + return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass + : &AArch64::PPRRegClass; + } + + llvm_unreachable("Missing PredicateConstraint!"); +} + +enum class ReducedGprConstraint { Uci, Ucj }; + +static std::optional +parseReducedGprConstraint(StringRef Constraint) { + return StringSwitch>(Constraint) + .Case("Uci", ReducedGprConstraint::Uci) + .Case("Ucj", ReducedGprConstraint::Ucj) + .Default(std::nullopt); +} + +static const TargetRegisterClass * +getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) { + if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64) + return nullptr; + + switch (Constraint) { + case ReducedGprConstraint::Uci: + return &AArch64::MatrixIndexGPR32_8_11RegClass; + case ReducedGprConstraint::Ucj: + return &AArch64::MatrixIndexGPR32_12_15RegClass; + } + + llvm_unreachable("Missing ReducedGprConstraint!"); } // The set of cc code supported is from @@ -10060,9 +10294,10 @@ AArch64TargetLowering::getConstraintType(StringRef Constraint) const { case 'S': // A symbolic address return C_Other; } - } else if (parsePredicateConstraint(Constraint) != - PredicateConstraint::Invalid) - return C_RegisterClass; + } else if (parsePredicateConstraint(Constraint)) + return C_RegisterClass; + else if (parseReducedGprConstraint(Constraint)) + return C_RegisterClass; else if (parseConstraintCode(Constraint) != AArch64CC::Invalid) return C_Other; return TargetLowering::getConstraintType(Constraint); @@ -10096,7 +10331,8 @@ AArch64TargetLowering::getSingleConstraintMatchWeight( weight = CW_Constant; break; case 'U': - if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid) + if (parsePredicateConstraint(constraint) || + parseReducedGprConstraint(constraint)) weight = CW_Register; break; } @@ -10153,19 +10389,26 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( break; } } else { - PredicateConstraint PC = parsePredicateConstraint(Constraint); - if (PC != PredicateConstraint::Invalid) { - if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1) - return std::make_pair(0U, nullptr); - bool restricted = (PC == PredicateConstraint::Upl); - return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass) - : std::make_pair(0U, &AArch64::PPRRegClass); - } + if (const auto PC = parsePredicateConstraint(Constraint)) + if (const auto *RegClass = getPredicateRegisterClass(*PC, VT)) + return std::make_pair(0U, RegClass); + + if (const auto RGC = parseReducedGprConstraint(Constraint)) + if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT)) + return std::make_pair(0U, RegClass); } if (StringRef("{cc}").equals_insensitive(Constraint) || parseConstraintCode(Constraint) != AArch64CC::Invalid) return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); + if (Constraint == "{za}") { + return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass); + } + + if (Constraint == "{zt0}") { + return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass); + } + // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. 
std::pair<unsigned, const TargetRegisterClass *> Res; @@ -16555,7 +16798,8 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. // This eliminates an "integer-to-vector-move" UOP and improves throughput. SDValue N0 = N->getOperand(0); - if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && + if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) && + N0.hasOneUse() && // Do not change the width of a volatile load. !cast<LoadSDNode>(N0)->isVolatile()) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); @@ -24749,8 +24993,7 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { if (auto *Base = dyn_cast<CallBase>(&Inst)) { auto CallerAttrs = SMEAttrs(*Inst.getFunction()); auto CalleeAttrs = SMEAttrs(*Base); - if (CallerAttrs.requiresSMChange(CalleeAttrs, - /*BodyOverridesInterface=*/false) || + if (CallerAttrs.requiresSMChange(CalleeAttrs) || CallerAttrs.requiresLazySave(CalleeAttrs)) return true; } @@ -24931,7 +25174,15 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE( EVT VT = Op.getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); - SDValue Mask = convertFixedMaskToScalableVector(Load->getMask(), DAG); + SDValue Mask = Load->getMask(); + // If this is an extending load and the mask type is not the same as the + // load's type, then we have to extend the mask type. + if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) { + assert(Load->getExtensionType() != ISD::NON_EXTLOAD && + "Incorrect mask type"); + Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask); + } + Mask = convertFixedMaskToScalableVector(Mask, DAG); SDValue PassThru; bool IsPassThruZeroOrUndef = false; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index aca45f113e736679e22961efba1668df7e17ef19..2f6dc303934d1c387d1ced799a9348580d5eef4e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -58,13 +58,8 @@ enum NodeType : unsigned { CALL_BTI, // Function call followed by a BTI instruction. - // Essentially like a normal COPY that works on GPRs, but cannot be - // rematerialised by passes like the simple register coalescer. It's - // required for SME when lowering calls because we cannot allow frame - // index calculations using addvl to slip in between the smstart/smstop - // and the bl instruction. The scalable vector length may change across - // the smstart/smstop boundary.
- OBSCURE_COPY, + COALESCER_BARRIER, + SMSTART, SMSTOP, RESTORE_ZA, @@ -444,6 +439,10 @@ enum NodeType : unsigned { STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, STRICT_FCMPE, + // SME ZA loads and stores + SME_ZA_LDR, + SME_ZA_STR, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, @@ -963,6 +962,9 @@ private: const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const override; + void AdjustInstrPostInstrSelection(MachineInstr &MI, + SDNode *Node) const override; + SDValue LowerCall(CallLoweringInfo & /*CLI*/, SmallVectorImpl &InVals) const override; @@ -971,7 +973,7 @@ private: const SmallVectorImpl &RVLocs, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, - SDValue ThisVal) const; + SDValue ThisVal, bool RequiresSMChange) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; @@ -1238,10 +1240,10 @@ private: // This function does not handle predicate bitcasts. SDValue getSVESafeBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) const; - // Returns the runtime value for PSTATE.SM. When the function is streaming- - // compatible, this generates a call to __arm_sme_state. - SDValue getPStateSM(SelectionDAG &DAG, SDValue Chain, SMEAttrs Attrs, - SDLoc DL, EVT VT) const; + // Returns the runtime value for PSTATE.SM by generating a call to + // __arm_sme_state. + SDValue getRuntimePStateSM(SelectionDAG &DAG, SDValue Chain, SDLoc DL, + EVT VT) const; bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1, LLT Ty2) const override; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 39135df285c2382bc8c67dfc5736527e7febe2e0..b645f16d062a506f29238c6133f9b17e65226d44 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -1485,7 +1485,7 @@ def UImm3s8Operand : UImmScaledMemoryIndexed<3, 8>; def uimm3s8 : Operand, ImmLeaf= 0 && Imm <= 56 && ((Imm % 8) == 0); }], UImmS8XForm> { - let PrintMethod = "printVectorIndex<8>"; + let PrintMethod = "printMatrixIndex<8>"; let ParserMatchClass = UImm3s8Operand; } @@ -2670,6 +2670,7 @@ class AddSubImmShift lsl #0, '01' => lsl #12 let Inst{21-10} = imm{11-0}; let DecoderMethod = "DecodeAddSubImmShift"; + let hasPostISelHook = 1; } class BaseAddSubRegPseudo size, // ARMv8.2-A Dot Product Instructions (Vector): These instructions extract // bytes from S-sized elements. 
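// For example, for the vector form
//   sdot v0.4s, v1.16b, v2.16b
// each 32-bit lane of v0 accumulates the dot product of the corresponding
// four byte lanes of v1 and v2 (an illustrative instance, not specific to
// this change).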
-class BaseSIMDThreeSameVectorDot sz, bits<4> opc, string asm, + string kind1, string kind2, RegisterOperand RegType, ValueType AccumType, ValueType InputType, SDPatternOperator OpNode> : - BaseSIMDThreeSameVectorTied { - def v8i8 : BaseSIMDThreeSameVectorDot<0, U, Mixed, asm, ".2s", ".8b", V64, + def v8i8 : BaseSIMDThreeSameVectorDot<0, U, 0b10, {0b001, Mixed}, asm, ".2s", ".8b", V64, v2i32, v8i8, OpNode>; - def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128, + def v16i8 : BaseSIMDThreeSameVectorDot<1, U, 0b10, {0b001, Mixed}, asm, ".4s", ".16b", V128, v4i32, v16i8, OpNode>; } @@ -8412,12 +8413,12 @@ class SIMDThreeSameVectorMatMul size, string asm, +class BaseSIMDThreeSameVectorIndexS size, bits<4> opc, string asm, string dst_kind, string lhs_kind, string rhs_kind, RegisterOperand RegType, ValueType AccumType, ValueType InputType, SDPatternOperator OpNode> : - BaseSIMDIndexedTied size, str multiclass SIMDThreeSameVectorDotIndex size, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, Mixed, size, asm, ".2s", ".8b", ".4b", + def v8i8 : BaseSIMDThreeSameVectorIndexS<0, U, size, {0b111, Mixed}, asm, ".2s", ".8b", ".4b", V64, v2i32, v8i8, OpNode>; - def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, Mixed, size, asm, ".4s", ".16b", ".4b", + def v16i8 : BaseSIMDThreeSameVectorIndexS<1, U, size, {0b111, Mixed}, asm, ".4s", ".16b", ".4b", V128, v4i32, v16i8, OpNode>; } // ARMv8.2-A Fused Multiply Add-Long Instructions (Indexed) let mayRaiseFPException = 1, Uses = [FPCR] in -class BaseSIMDThreeSameVectorFMLIndex opc, string asm, +class BaseSIMDThreeSameVectorIndexH sz, bits<4> opc, string asm, string dst_kind, string lhs_kind, string rhs_kind, RegisterOperand RegType, - ValueType AccumType, ValueType InputType, - SDPatternOperator OpNode> : - BaseSIMDIndexedTied : + BaseSIMDIndexedTied opc, string asm, multiclass SIMDThreeSameVectorFMLIndex opc, string asm, SDPatternOperator OpNode> { - def v4f16 : BaseSIMDThreeSameVectorFMLIndex<0, U, opc, asm, ".2s", ".2h", ".h", - V64, v2f32, v4f16, OpNode>; - def v8f16 : BaseSIMDThreeSameVectorFMLIndex<1, U, opc, asm, ".4s", ".4h", ".h", - V128, v4f32, v8f16, OpNode>; + def v4f16 : BaseSIMDThreeSameVectorIndexH<0, U, 0b10, opc, asm, ".2s", ".2h", ".h", + V64, V128_lo, v2f32, v4f16, OpNode>; + def v8f16 : BaseSIMDThreeSameVectorIndexH<1, U, 0b10, opc, asm, ".4s", ".4h", ".h", + V128, V128_lo, v4f32, v8f16, OpNode>; } multiclass SIMDFPIndexed opc, string asm, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 0691e07a639beee77f8096e2cdb1f70e259a0e03..e12dfeca8e24286b2b657761024eb053afe9c3d5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3580,6 +3580,39 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + if (AArch64::PNRRegClass.contains(DestReg) && + AArch64::PPRRegClass.contains(SrcReg)) { + assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && + "Unexpected predicate-as-counter register."); + // Copy from pX to pnX is a no-op + if ((DestReg.id() - AArch64::PN0) == (SrcReg.id() - AArch64::P0)) + return; + MCRegister PPRDestReg = (DestReg - AArch64::PN0) + AArch64::P0; + BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg) + .addReg(SrcReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addDef(DestReg, RegState::Implicit); + return; + } + + if (AArch64::PPRRegClass.contains(DestReg) && + AArch64::PNRRegClass.contains(SrcReg)) 
{ + assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && + "Unexpected predicate-as-counter register."); + // Copy from pnX to pX is a no-op + if ((DestReg.id() - AArch64::P0) == (SrcReg.id() - AArch64::PN0)) + return; + MCRegister PNRDestReg = (DestReg - AArch64::P0) + AArch64::PN0; + MCRegister PPRSrcReg = (SrcReg - AArch64::PN0) + AArch64::P0; + BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) + .addReg(PPRSrcReg) + .addReg(PPRSrcReg) + .addReg(PPRSrcReg, getKillRegState(KillSrc)) + .addDef(PNRDestReg, RegState::Implicit); + return; + } + // Copy a Z register by ORRing with itself. if (AArch64::ZPRRegClass.contains(DestReg) && AArch64::ZPRRegClass.contains(SrcReg)) { @@ -3869,6 +3902,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); unsigned Opc = 0; bool Offset = true; + MCRegister PNRReg = MCRegister::NoRegister; unsigned StackID = TargetStackID::Default; switch (TRI->getSpillSize(*RC)) { case 1: @@ -3882,6 +3916,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); Opc = AArch64::STR_PXI; StackID = TargetStackID::ScalableVector; + } else if (AArch64::PNRRegClass.hasSubClassEq(RC)) { + assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && + "Unexpected register store without SVE2p1 or SME2"); + SrcReg = (SrcReg - AArch64::PN0) + AArch64::P0; + Opc = AArch64::STR_PXI; + StackID = TargetStackID::ScalableVector; } break; case 4: @@ -3982,6 +4022,8 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, if (Offset) MI.addImm(0); + if (PNRReg.isValid()) + MI.addDef(PNRReg, RegState::Implicit); MI.addMemOperand(MMO); } @@ -4026,6 +4068,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, unsigned Opc = 0; bool Offset = true; unsigned StackID = TargetStackID::Default; + MCRegister PNRReg = MCRegister::NoRegister; switch (TRI->getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) @@ -4038,6 +4081,13 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); Opc = AArch64::LDR_PXI; StackID = TargetStackID::ScalableVector; + } else if (AArch64::PNRRegClass.hasSubClassEq(RC)) { + assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && + "Unexpected register load without SVE2p1 or SME2"); + PNRReg = DestReg; + DestReg = (DestReg - AArch64::PN0) + AArch64::P0; + Opc = AArch64::LDR_PXI; + StackID = TargetStackID::ScalableVector; } break; case 4: @@ -4138,6 +4188,8 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI); if (Offset) MI.addImm(0); + if (PNRReg.isValid()) + MI.addDef(PNRReg, RegState::Implicit); MI.addMemOperand(MMO); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 27a7e26c5e188a1d0894e006b38f8dc281fb4b76..8adceb9ffd1536e38d2a7ae55ecd70dff0ba3640 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1198,7 +1198,7 @@ defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_ne class BaseSIMDSUDOTIndex - : BaseSIMDThreeSameVectorDotIndex { let Pattern = [(set (AccumType RegType:$dst), diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index d82fb436925ec6fa2bd1b7f61bb73b9caba5dc29..7d841cdc22dfe38ff7ddc770a0932f37b99f4e9c 100644 --- 
a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -185,6 +185,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// The frame-index for the TPIDR2 object used for lazy saves. Register LazySaveTPIDR2Obj = 0; + /// Whether this function changes streaming mode anywhere in its body. + bool HasStreamingModeChanges = false; /// True if the function need unwind information. mutable std::optional NeedsDwarfUnwindInfo; @@ -192,6 +194,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// True if the function need asynchronous unwind information. mutable std::optional NeedsAsyncDwarfUnwindInfo; + // Holds a register containing pstate.sm. This is set + // on function entry to record the initial pstate of the function. + Register PStateSMReg = MCRegister::NoRegister; + public: AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI); @@ -200,6 +206,9 @@ public: const DenseMap &Src2DstMBB) const override; + Register getPStateSMReg() const { return PStateSMReg; }; + void setPStateSMReg(Register Reg) { PStateSMReg = Reg; }; + bool isSVECC() const { return IsSVECC; }; void setIsSVECC(bool s) { IsSVECC = s; }; @@ -447,6 +456,11 @@ public: bool needsDwarfUnwindInfo(const MachineFunction &MF) const; bool needsAsyncDwarfUnwindInfo(const MachineFunction &MF) const; + bool hasStreamingModeChanges() const { return HasStreamingModeChanges; } + void setHasStreamingModeChanges(bool HasChanges) { + HasStreamingModeChanges = HasChanges; + } + private: // Hold the lists of LOHs. MILOHContainer LOHContainerSet; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index d1ddf6d76975476e9ba53bea95ab73e4682591c2..465e4c5b8df58a38ec80a8c532788b92d57bc95c 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -440,6 +440,9 @@ AArch64RegisterInfo::getStrictlyReservedRegs(const MachineFunction &MF) const { Reserved.set(SubReg); } + // VG cannot be allocated + Reserved.set(AArch64::VG); + markSuperRegs(Reserved, AArch64::FPCR); assert(checkAllSuperRegsMarked(Reserved)); @@ -491,6 +494,10 @@ bool AArch64RegisterInfo::isAsmClobberable(const MachineFunction &MF, MCRegisterInfo::regsOverlap(PhysReg, AArch64::X16)) return true; + // ZA/ZT0 registers are reserved but may be permitted in the clobber list. + if (PhysReg == AArch64::ZA || PhysReg == AArch64::ZT0) + return true; + return !isReservedReg(MF, PhysReg); } @@ -987,6 +994,8 @@ bool AArch64RegisterInfo::shouldCoalesce( MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg, const TargetRegisterClass *DstRC, unsigned DstSubReg, const TargetRegisterClass *NewRC, LiveIntervals &LIS) const { + MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); + if (MI->isCopy() && ((DstRC->getID() == AArch64::GPR64RegClassID) || (DstRC->getID() == AArch64::GPR64commonRegClassID)) && @@ -995,5 +1004,38 @@ bool AArch64RegisterInfo::shouldCoalesce( // which implements a 32 to 64 bit zero extension // which relies on the upper 32 bits being zeroed.
return false; + + auto IsCoalescerBarrier = [](const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AArch64::COALESCER_BARRIER_FPR16: + case AArch64::COALESCER_BARRIER_FPR32: + case AArch64::COALESCER_BARRIER_FPR64: + case AArch64::COALESCER_BARRIER_FPR128: + return true; + default: + return false; + } + }; + + // For calls that temporarily have to toggle streaming mode as part of the + // call sequence, we need to be more careful when coalescing copy + // instructions, so that we don't coalesce the NEON/FP result or argument + // register with a whole Z-register; after such coalescing the register + // allocator would try to spill/reload the entire Z register. + // + // We do this by checking if the node has any defs/uses that are + // COALESCER_BARRIER pseudos. These are 'nops' in practice, but they exist to + // instruct the coalescer to avoid coalescing the copy. + if (MI->isCopy() && SubReg != DstSubReg && + (AArch64::ZPRRegClass.hasSubClassEq(DstRC) || + AArch64::ZPRRegClass.hasSubClassEq(SrcRC))) { + unsigned SrcReg = MI->getOperand(1).getReg(); + if (any_of(MRI.def_instructions(SrcReg), IsCoalescerBarrier)) + return false; + unsigned DstReg = MI->getOperand(0).getReg(); + if (any_of(MRI.use_nodbg_instructions(DstReg), IsCoalescerBarrier)) + return false; + } + return true; } diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 4bb1f9413f2ba8b8af6b634f24cefc17abad658c..eeb09389a1b8cbe0d7da030d5ab5df5e709e4bb1 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -55,6 +55,8 @@ let Namespace = "AArch64" in { def zasubd1 : SubRegIndex<256>; // (16 x 16)/8 bytes = 256 bits def zasubq0 : SubRegIndex<128>; // (16 x 16)/16 bytes = 128 bits def zasubq1 : SubRegIndex<128>; // (16 x 16)/16 bytes = 128 bits + + def psub : SubRegIndex<16>; } let Namespace = "AArch64" in { @@ -767,23 +769,43 @@ def GPR64x8 : RegisterOperand { //===----- END: v8.7a accelerator extension register operands -------------===// +// SVE predicate-as-counter registers + def PN0 : AArch64Reg<0, "pn0">, DwarfRegNum<[48]>; + def PN1 : AArch64Reg<1, "pn1">, DwarfRegNum<[49]>; + def PN2 : AArch64Reg<2, "pn2">, DwarfRegNum<[50]>; + def PN3 : AArch64Reg<3, "pn3">, DwarfRegNum<[51]>; + def PN4 : AArch64Reg<4, "pn4">, DwarfRegNum<[52]>; + def PN5 : AArch64Reg<5, "pn5">, DwarfRegNum<[53]>; + def PN6 : AArch64Reg<6, "pn6">, DwarfRegNum<[54]>; + def PN7 : AArch64Reg<7, "pn7">, DwarfRegNum<[55]>; + def PN8 : AArch64Reg<8, "pn8">, DwarfRegNum<[56]>; + def PN9 : AArch64Reg<9, "pn9">, DwarfRegNum<[57]>; + def PN10 : AArch64Reg<10, "pn10">, DwarfRegNum<[58]>; + def PN11 : AArch64Reg<11, "pn11">, DwarfRegNum<[59]>; + def PN12 : AArch64Reg<12, "pn12">, DwarfRegNum<[60]>; + def PN13 : AArch64Reg<13, "pn13">, DwarfRegNum<[61]>; + def PN14 : AArch64Reg<14, "pn14">, DwarfRegNum<[62]>; + def PN15 : AArch64Reg<15, "pn15">, DwarfRegNum<[63]>; + // SVE predicate registers -def P0 : AArch64Reg<0, "p0">, DwarfRegNum<[48]>; -def P1 : AArch64Reg<1, "p1">, DwarfRegNum<[49]>; -def P2 : AArch64Reg<2, "p2">, DwarfRegNum<[50]>; -def P3 : AArch64Reg<3, "p3">, DwarfRegNum<[51]>; -def P4 : AArch64Reg<4, "p4">, DwarfRegNum<[52]>; -def P5 : AArch64Reg<5, "p5">, DwarfRegNum<[53]>; -def P6 : AArch64Reg<6, "p6">, DwarfRegNum<[54]>; -def P7 : AArch64Reg<7, "p7">, DwarfRegNum<[55]>; -def P8 : AArch64Reg<8, "p8">, DwarfRegNum<[56]>; -def P9 : AArch64Reg<9, "p9">, DwarfRegNum<[57]>; -def P10 : AArch64Reg<10, "p10">,
DwarfRegNum<[58]>; -def P11 : AArch64Reg<11, "p11">, DwarfRegNum<[59]>; -def P12 : AArch64Reg<12, "p12">, DwarfRegNum<[60]>; -def P13 : AArch64Reg<13, "p13">, DwarfRegNum<[61]>; -def P14 : AArch64Reg<14, "p14">, DwarfRegNum<[62]>; -def P15 : AArch64Reg<15, "p15">, DwarfRegNum<[63]>; +let SubRegIndices = [psub] in { + def P0 : AArch64Reg<0, "p0", [PN0]>, DwarfRegAlias; + def P1 : AArch64Reg<1, "p1", [PN1]>, DwarfRegAlias; + def P2 : AArch64Reg<2, "p2", [PN2]>, DwarfRegAlias; + def P3 : AArch64Reg<3, "p3", [PN3]>, DwarfRegAlias; + def P4 : AArch64Reg<4, "p4", [PN4]>, DwarfRegAlias; + def P5 : AArch64Reg<5, "p5", [PN5]>, DwarfRegAlias; + def P6 : AArch64Reg<6, "p6", [PN6]>, DwarfRegAlias; + def P7 : AArch64Reg<7, "p7", [PN7]>, DwarfRegAlias; + def P8 : AArch64Reg<8, "p8", [PN8]>, DwarfRegAlias; + def P9 : AArch64Reg<9, "p9", [PN9]>, DwarfRegAlias; + def P10 : AArch64Reg<10, "p10", [PN10]>, DwarfRegAlias; + def P11 : AArch64Reg<11, "p11", [PN11]>, DwarfRegAlias; + def P12 : AArch64Reg<12, "p12", [PN12]>, DwarfRegAlias; + def P13 : AArch64Reg<13, "p13", [PN13]>, DwarfRegAlias; + def P14 : AArch64Reg<14, "p14", [PN14]>, DwarfRegAlias; + def P15 : AArch64Reg<15, "p15", [PN15]>, DwarfRegAlias; +} // The part of SVE registers that don't overlap Neon registers. // These are only used as part of clobber lists. @@ -881,8 +903,6 @@ class SVERegOp : SVERegOp {} class ZPRRegOp : SVERegOp {} @@ -891,7 +911,7 @@ class ZPRRegOp : RegisterClass< "AArch64", - [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1, aarch64svcount ], 16, + [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1 ], 16, (sequence "P%u", firstreg, lastreg)> { let Size = 16; } @@ -909,69 +929,89 @@ class PPRAsmOperand : AsmOperandClass { let ParserMethod = "tryParseSVEPredicateVector"; } -def PPRAsmOpAny : PPRAsmOperand<"PredicateAny", "PPR", 0>; -def PPRAsmOp8 : PPRAsmOperand<"PredicateB", "PPR", 8>; -def PPRAsmOp16 : PPRAsmOperand<"PredicateH", "PPR", 16>; -def PPRAsmOp32 : PPRAsmOperand<"PredicateS", "PPR", 32>; -def PPRAsmOp64 : PPRAsmOperand<"PredicateD", "PPR", 64>; - -def PPRAny : PPRRegOp<"", PPRAsmOpAny, ElementSizeNone, PPR>; -def PPR8 : PPRRegOp<"b", PPRAsmOp8, ElementSizeB, PPR>; -def PPR16 : PPRRegOp<"h", PPRAsmOp16, ElementSizeH, PPR>; -def PPR32 : PPRRegOp<"s", PPRAsmOp32, ElementSizeS, PPR>; -def PPR64 : PPRRegOp<"d", PPRAsmOp64, ElementSizeD, PPR>; - +def PPRAsmOpAny : PPRAsmOperand<"PredicateAny", "PPR", 0>; +def PPRAsmOp8 : PPRAsmOperand<"PredicateB", "PPR", 8>; +def PPRAsmOp16 : PPRAsmOperand<"PredicateH", "PPR", 16>; +def PPRAsmOp32 : PPRAsmOperand<"PredicateS", "PPR", 32>; +def PPRAsmOp64 : PPRAsmOperand<"PredicateD", "PPR", 64>; def PPRAsmOp3bAny : PPRAsmOperand<"Predicate3bAny", "PPR_3b", 0>; +class PPRRegOp : SVERegOp {} + +def PPRAny : PPRRegOp<"", PPRAsmOpAny, ElementSizeNone, PPR>; +def PPR8 : PPRRegOp<"b", PPRAsmOp8, ElementSizeB, PPR>; +def PPR16 : PPRRegOp<"h", PPRAsmOp16, ElementSizeH, PPR>; +def PPR32 : PPRRegOp<"s", PPRAsmOp32, ElementSizeS, PPR>; +def PPR64 : PPRRegOp<"d", PPRAsmOp64, ElementSizeD, PPR>; def PPR3bAny : PPRRegOp<"", PPRAsmOp3bAny, ElementSizeNone, PPR_3b>; +class PNRClass : RegisterClass< + "AArch64", + [ aarch64svcount ], 16, + (sequence "PN%u", firstreg, lastreg)> { + let Size = 16; +} + +def PNR : PNRClass<0, 15>; +def PNR_p8to15 : PNRClass<8, 15>; // SVE predicate-as-counter operand -class PNRAsmOperand - : PPRAsmOperand { +class PNRAsmOperand: AsmOperandClass { + let Name = "SVE" # name # "Reg"; let PredicateMethod = "isSVEPredicateAsCounterRegOfWidth<" # Width # ", " # "AArch64::" # RegClass # 
"RegClassID>"; let DiagnosticType = "InvalidSVE" # name # "Reg"; + let RenderMethod = "addRegOperands"; let ParserMethod = "tryParseSVEPredicateVector"; } -class PNRRegOp - : PPRRegOp { - let PrintMethod = "printPredicateAsCounter<" # EltSize # ">"; +let RenderMethod = "addPNRasPPRRegOperands" in { + def PNRasPPROpAny : PNRAsmOperand<"PNRasPPRPredicateAny", "PNR", 0>; + def PNRasPPROp8 : PNRAsmOperand<"PNRasPPRPredicateB", "PNR", 8>; } -def PNRAsmOpAny: PNRAsmOperand<"PNPredicateAny", "PPR", 0>; -def PNRAsmOp8 : PNRAsmOperand<"PNPredicateB", "PPR", 8>; -def PNRAsmOp16 : PNRAsmOperand<"PNPredicateH", "PPR", 16>; -def PNRAsmOp32 : PNRAsmOperand<"PNPredicateS", "PPR", 32>; -def PNRAsmOp64 : PNRAsmOperand<"PNPredicateD", "PPR", 64>; - -def PNRAny : PNRRegOp<"", PNRAsmOpAny, 0, PPR>; -def PNR8 : PNRRegOp<"b", PNRAsmOp8, 8, PPR>; -def PNR16 : PNRRegOp<"h", PNRAsmOp16, 16, PPR>; -def PNR32 : PNRRegOp<"s", PNRAsmOp32, 32, PPR>; -def PNR64 : PNRRegOp<"d", PNRAsmOp64, 64, PPR>; - -class PNRP8to15RegOp - : PPRRegOp { - let PrintMethod = "printPredicateAsCounter<" # EltSize # ">"; - let EncoderMethod = "EncodePPR_p8to15"; - let DecoderMethod = "DecodePPR_p8to15RegisterClass"; -} - -def PNRAsmAny_p8to15 : PNRAsmOperand<"PNPredicateAny_p8to15", "PPR_p8to15", 0>; -def PNRAsmOp8_p8to15 : PNRAsmOperand<"PNPredicateB_p8to15", "PPR_p8to15", 8>; -def PNRAsmOp16_p8to15 : PNRAsmOperand<"PNPredicateH_p8to15", "PPR_p8to15", 16>; -def PNRAsmOp32_p8to15 : PNRAsmOperand<"PNPredicateS_p8to15", "PPR_p8to15", 32>; -def PNRAsmOp64_p8to15 : PNRAsmOperand<"PNPredicateD_p8to15", "PPR_p8to15", 64>; - -def PNRAny_p8to15 : PNRP8to15RegOp<"", PNRAsmAny_p8to15, 0, PPR_p8to15>; -def PNR8_p8to15 : PNRP8to15RegOp<"b", PNRAsmOp8_p8to15, 8, PPR_p8to15>; -def PNR16_p8to15 : PNRP8to15RegOp<"h", PNRAsmOp16_p8to15, 16, PPR_p8to15>; -def PNR32_p8to15 : PNRP8to15RegOp<"s", PNRAsmOp32_p8to15, 32, PPR_p8to15>; -def PNR64_p8to15 : PNRP8to15RegOp<"d", PNRAsmOp64_p8to15, 64, PPR_p8to15>; +class PNRasPPRRegOp : SVERegOp {} +def PNRasPPRAny : PNRasPPRRegOp<"", PNRasPPROpAny, ElementSizeNone, PPR>; +def PNRasPPR8 : PNRasPPRRegOp<"b", PNRasPPROp8, ElementSizeB, PPR>; + +def PNRAsmOpAny: PNRAsmOperand<"PNPredicateAny", "PNR", 0>; +def PNRAsmOp8 : PNRAsmOperand<"PNPredicateB", "PNR", 8>; +def PNRAsmOp16 : PNRAsmOperand<"PNPredicateH", "PNR", 16>; +def PNRAsmOp32 : PNRAsmOperand<"PNPredicateS", "PNR", 32>; +def PNRAsmOp64 : PNRAsmOperand<"PNPredicateD", "PNR", 64>; + +class PNRRegOp + : SVERegOp { + let PrintMethod = "printPredicateAsCounter<" # Size # ">"; +} +def PNRAny : PNRRegOp<"", PNRAsmOpAny, 0, PNR>; +def PNR8 : PNRRegOp<"b", PNRAsmOp8, 8, PNR>; +def PNR16 : PNRRegOp<"h", PNRAsmOp16, 16, PNR>; +def PNR32 : PNRRegOp<"s", PNRAsmOp32, 32, PNR>; +def PNR64 : PNRRegOp<"d", PNRAsmOp64, 64, PNR>; + +def PNRAsmAny_p8to15 : PNRAsmOperand<"PNPredicateAny_p8to15", "PNR_p8to15", 0>; +def PNRAsmOp8_p8to15 : PNRAsmOperand<"PNPredicateB_p8to15", "PNR_p8to15", 8>; +def PNRAsmOp16_p8to15 : PNRAsmOperand<"PNPredicateH_p8to15", "PNR_p8to15", 16>; +def PNRAsmOp32_p8to15 : PNRAsmOperand<"PNPredicateS_p8to15", "PNR_p8to15", 32>; +def PNRAsmOp64_p8to15 : PNRAsmOperand<"PNPredicateD_p8to15", "PNR_p8to15", 64>; + +class PNRP8to15RegOp + : SVERegOp { + let PrintMethod = "printPredicateAsCounter<" # Width # ">"; + let EncoderMethod = "EncodePNR_p8to15"; + let DecoderMethod = "DecodePNR_p8to15RegisterClass"; +} + +def PNRAny_p8to15 : PNRP8to15RegOp<"", PNRAsmAny_p8to15, 0, PNR_p8to15>; +def PNR8_p8to15 : PNRP8to15RegOp<"b", PNRAsmOp8_p8to15, 8, PNR_p8to15>; +def 
PNR16_p8to15 : PNRP8to15RegOp<"h", PNRAsmOp16_p8to15, 16, PNR_p8to15>; +def PNR32_p8to15 : PNRP8to15RegOp<"s", PNRAsmOp32_p8to15, 32, PNR_p8to15>; +def PNR64_p8to15 : PNRP8to15RegOp<"d", PNRAsmOp64_p8to15, 64, PNR_p8to15>; let Namespace = "AArch64" in { def psub0 : SubRegIndex<16, -1>; diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index cabfe9def7c293e9a6254611f0a1902f4663cf8c..5b4070aaeea98a2c776e5aa46c6a9b68e6e45b32 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -22,8 +22,8 @@ def AArch64_restore_za : SDNode<"AArch64ISD::RESTORE_ZA", SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>]>, [SDNPHasChain, SDNPSideEffect, SDNPVariadic, SDNPOptInGlue]>; - -def AArch64ObscureCopy : SDNode<"AArch64ISD::OBSCURE_COPY", SDTypeProfile<1, 1, []>, []>; +def AArch64CoalescerBarrier + : SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>; //===----------------------------------------------------------------------===// // Instruction naming conventions. @@ -68,8 +68,8 @@ let Predicates = [HasSME] in { defm BFMOPA_MPPZZ : sme_bf16_outer_product<0b000, "bfmopa", int_aarch64_sme_mopa_wide>; defm BFMOPS_MPPZZ : sme_bf16_outer_product<0b001, "bfmops", int_aarch64_sme_mops_wide>; -defm FMOPA_MPPZZ_S : sme_outer_product_fp32<0b0, "fmopa", int_aarch64_sme_mopa>; -defm FMOPS_MPPZZ_S : sme_outer_product_fp32<0b1, "fmops", int_aarch64_sme_mops>; +defm FMOPA_MPPZZ_S : sme_outer_product_fp32<0b0, 0b00, ZPR32, "fmopa", int_aarch64_sme_mopa>; +defm FMOPS_MPPZZ_S : sme_outer_product_fp32<0b1, 0b00, ZPR32, "fmops", int_aarch64_sme_mops>; } let Predicates = [HasSMEF64F64] in { @@ -134,52 +134,6 @@ defm ZERO_M : sme_zero<"zero">; // Mode selection and state access instructions //===----------------------------------------------------------------------===// -// SME defines three pstate fields to set or clear PSTATE.SM, PSTATE.ZA, or -// both fields: -// -// MSR SVCRSM, # -// MSR SVCRZA, # -// MSR SVCRSMZA, # -// -// It's tricky to using the existing pstate operand defined in -// AArch64SystemOperands.td since it only encodes 5 bits including op1;op2, -// when these fields are also encoded in CRm[3:1]. -def MSRpstatesvcrImm1 - : PstateWriteSimple<(ins svcr_op:$pstatefield, timm0_1:$imm), "msr", - "\t$pstatefield, $imm">, - Sched<[WriteSys]> { - bits<3> pstatefield; - bit imm; - let Inst{18-16} = 0b011; // op1 - let Inst{11-9} = pstatefield; - let Inst{8} = imm; - let Inst{7-5} = 0b011; // op2 -} - -def : InstAlias<"smstart", (MSRpstatesvcrImm1 0b011, 0b1)>; -def : InstAlias<"smstart sm", (MSRpstatesvcrImm1 0b001, 0b1)>; -def : InstAlias<"smstart za", (MSRpstatesvcrImm1 0b010, 0b1)>; - -def : InstAlias<"smstop", (MSRpstatesvcrImm1 0b011, 0b0)>; -def : InstAlias<"smstop sm", (MSRpstatesvcrImm1 0b001, 0b0)>; -def : InstAlias<"smstop za", (MSRpstatesvcrImm1 0b010, 0b0)>; - - -// Pseudo to match to smstart/smstop. This expands: -// -// pseudonode (pstate_za|pstate_sm), before_call, expected_value -// -// Into: -// -// if (before_call != expected_value) -// node (pstate_za|pstate_sm) -// -// where node can be either 'smstart' or 'smstop'. -def MSRpstatePseudo : - Pseudo<(outs), - (ins svcr_op:$pstatefield, timm0_1:$imm, GPR64:$rtpstate, timm0_1:$expected_pstate, variable_ops), []>, - Sched<[WriteSys]>; - // Pseudo to conditionally restore ZA state. 
This expands: // // pseudonode tpidr2_el0, tpidr2obj, restore_routine @@ -226,52 +180,91 @@ def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 0), (i64 1)), // before def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 0), (i64 1)), // after call (MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>; -// The generic case which gets expanded to a pseudo node. -def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)), - (MSRpstatePseudo svcr_op:$pstate, 0b1, GPR64:$rtpstate, timm0_1:$expected_pstate)>; -def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)), - (MSRpstatePseudo svcr_op:$pstate, 0b0, GPR64:$rtpstate, timm0_1:$expected_pstate)>; - // Read and write TPIDR2_EL0 def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val), (MSR 0xde85, GPR64:$val)>; def : Pat<(i64 (int_aarch64_sme_get_tpidr2)), (MRS 0xde85)>; -def OBSCURE_COPY : Pseudo<(outs GPR64:$dst), (ins GPR64:$idx), []>, Sched<[]> { } -def : Pat<(i64 (AArch64ObscureCopy (i64 GPR64:$idx))), - (OBSCURE_COPY GPR64:$idx)>; } // End let Predicates = [HasSME] +multiclass CoalescerBarrierPseudo vts> { + def NAME : Pseudo<(outs rc:$dst), (ins rc:$src), []>, Sched<[]> { + let Constraints = "$dst = $src"; + } + foreach vt = vts in { + def : Pat<(vt (AArch64CoalescerBarrier (vt rc:$src))), + (!cast(NAME) rc:$src)>; + } +} + +multiclass CoalescerBarriers { + defm _FPR16 : CoalescerBarrierPseudo; + defm _FPR32 : CoalescerBarrierPseudo; + defm _FPR64 : CoalescerBarrierPseudo; + defm _FPR128 : CoalescerBarrierPseudo; +} + +defm COALESCER_BARRIER : CoalescerBarriers; + +// Pseudo to match to smstart/smstop. This expands: +// +// pseudonode (pstate_za|pstate_sm), before_call, expected_value +// +// Into: +// +// if (before_call != expected_value) +// node (pstate_za|pstate_sm) +// +// where node can be either 'smstart' or 'smstop'. +// +// This pseudo and corresponding patterns don't need to be predicated by SME, +// because when they're emitted for streaming-compatible functions and run +// in a non-SME context the generated code-paths will never execute any +// SME instructions. 
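+// For example (an illustrative expansion; the exact emitted code is
+// target-dependent), a streaming-compatible caller enabling SM around a call
+// to a streaming callee:
+//
+//   bl   __arm_sme_state   // current PSTATE.SM in bit 0 of X0
+//   tbnz x0, #0, 1f        // already in streaming mode, skip the smstart
+//   smstart sm
+// 1:
+//   bl   callee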
+def MSRpstatePseudo : + Pseudo<(outs), + (ins svcr_op:$pstatefield, timm0_1:$imm, GPR64:$rtpstate, timm0_1:$expected_pstate, variable_ops), []>, + Sched<[WriteSys]> { + let hasPostISelHook = 1; + let Uses = [VG]; + let Defs = [VG]; +} + +def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)), + (MSRpstatePseudo svcr_op:$pstate, 0b1, GPR64:$rtpstate, timm0_1:$expected_pstate)>; +def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)), + (MSRpstatePseudo svcr_op:$pstate, 0b0, GPR64:$rtpstate, timm0_1:$expected_pstate)>; + //===----------------------------------------------------------------------===// // SME2 Instructions //===----------------------------------------------------------------------===// let Predicates = [HasSME2] in { defm ADD_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg2_single<"add", 0b0011010, MatrixOp32, ZZ_s, ZPR4b32, nxv4i32, int_aarch64_sme_add_write_single_za_vg1x2>; defm ADD_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg4_single<"add", 0b0111010, MatrixOp32, ZZZZ_s, ZPR4b32, nxv4i32, int_aarch64_sme_add_write_single_za_vg1x4>; -defm ADD_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"add", 0b011010, MatrixOp32, ZZ_s_mul_r, nxv4i32, int_aarch64_sme_add_write_za_vg1x2>; -defm ADD_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"add", 0b011010, MatrixOp32, ZZZZ_s_mul_r, nxv4i32, int_aarch64_sme_add_write_za_vg1x4>; +defm ADD_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"add", 0b0110010, MatrixOp32, ZZ_s_mul_r, nxv4i32, int_aarch64_sme_add_write_za_vg1x2>; +defm ADD_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"add", 0b0110010, MatrixOp32, ZZZZ_s_mul_r, nxv4i32, int_aarch64_sme_add_write_za_vg1x4>; defm ADD_VG2_2ZZ : sme2_int_sve_destructive_vector_vg2_single<"add", 0b0110000>; defm ADD_VG4_4ZZ : sme2_int_sve_destructive_vector_vg4_single<"add", 0b0110000>; defm SUB_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg2_single<"sub", 0b0011011, MatrixOp32, ZZ_s, ZPR4b32, nxv4i32, int_aarch64_sme_sub_write_single_za_vg1x2>; defm SUB_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg4_single<"sub", 0b0111011, MatrixOp32, ZZZZ_s, ZPR4b32, nxv4i32, int_aarch64_sme_sub_write_single_za_vg1x4>; -defm SUB_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b011011, MatrixOp32, ZZ_s_mul_r, nxv4i32, int_aarch64_sme_sub_write_za_vg1x2>; -defm SUB_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b011011, MatrixOp32, ZZZZ_s_mul_r, nxv4i32, int_aarch64_sme_sub_write_za_vg1x4>; +defm SUB_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b0110011, MatrixOp32, ZZ_s_mul_r, nxv4i32, int_aarch64_sme_sub_write_za_vg1x2>; +defm SUB_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b0110011, MatrixOp32, ZZZZ_s_mul_r, nxv4i32, int_aarch64_sme_sub_write_za_vg1x4>; defm FMLA_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg2_single<"fmla", 0b0011000, MatrixOp32, ZZ_s, ZPR4b32, nxv4f32, int_aarch64_sme_fmla_single_vg1x2>; defm FMLA_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg4_single<"fmla", 0b0111000, MatrixOp32, ZZZZ_s, ZPR4b32, nxv4f32, int_aarch64_sme_fmla_single_vg1x4>; -defm FMLA_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b011000, MatrixOp32, ZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmla_vg1x2>; -defm FMLA_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b011000, MatrixOp32, ZZZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmla_vg1x4>; -defm FMLA_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmla", 0b0000, ZZ_s_mul_r, ZPR4b32, nxv4f32, 
int_aarch64_sme_fmla_lane_vg1x2>; +defm FMLA_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b0110000, MatrixOp32, ZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmla_vg1x2>; +defm FMLA_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b0110000, MatrixOp32, ZZZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmla_vg1x4>; +defm FMLA_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmla", 0b01, 0b0000, ZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmla_lane_vg1x2>; defm FMLA_VG4_M4ZZI_S : sme2_multi_vec_array_vg4_index_32b<"fmla", 0b0000, ZZZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmla_lane_vg1x4>; defm FMLS_VG2_M2ZZ_S : sme2_dot_mla_add_sub_array_vg2_single<"fmls", 0b0011001, MatrixOp32, ZZ_s, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_single_vg1x2>; defm FMLS_VG4_M4ZZ_S : sme2_dot_mla_add_sub_array_vg4_single<"fmls", 0b0111001, MatrixOp32, ZZZZ_s, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_single_vg1x4>; -defm FMLS_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b011001, MatrixOp32, ZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmls_vg1x2>; -defm FMLS_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b011001, MatrixOp32, ZZZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmls_vg1x4>; -defm FMLS_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmls", 0b0010, ZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_lane_vg1x2>; +defm FMLS_VG2_M2Z2Z_S : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b0110001, MatrixOp32, ZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmls_vg1x2>; +defm FMLS_VG4_M4Z4Z_S : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b0110001, MatrixOp32, ZZZZ_s_mul_r, nxv4f32, int_aarch64_sme_fmls_vg1x4>; +defm FMLS_VG2_M2ZZI_S : sme2_multi_vec_array_vg2_index_32b<"fmls", 0b01, 0b0010, ZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_lane_vg1x2>; defm FMLS_VG4_M4ZZI_S : sme2_multi_vec_array_vg4_index_32b<"fmls", 0b0010, ZZZZ_s_mul_r, ZPR4b32, nxv4f32, int_aarch64_sme_fmls_lane_vg1x4>; defm ADD_VG2_M2Z_S : sme2_multivec_accum_add_sub_vg2<"add", 0b0010, MatrixOp32, ZZ_s_mul_r, nxv4i32, int_aarch64_sme_add_za32_vg1x2>; @@ -295,37 +288,37 @@ defm FMLAL_MZZI : sme2_mla_long_array_index<"fmlal", 0b10, 0b00, nxv8f16 defm FMLAL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_lane_vg2x2>; defm FMLAL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_lane_vg2x4>; defm FMLAL_MZZ : sme2_mla_long_array_single<"fmlal", 0b00, 0b00, nxv8f16, int_aarch64_sme_fmlal_single_vg2x1>; -defm FMLAL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_single_vg2x2>; -defm FMLAL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_single_vg2x4>; -defm FMLAL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_vg2x2>; -defm FMLAL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"fmlal", 0b00, nxv8f16, int_aarch64_sme_fmlal_vg2x4>; +defm FMLAL_VG2_M2ZZ_HtoS : sme2_fp_mla_long_array_vg2_single<"fmlal", 0b000, MatrixOp32, ZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmlal_single_vg2x2>; +defm FMLAL_VG4_M4ZZ_HtoS : sme2_fp_mla_long_array_vg4_single<"fmlal", 0b000, MatrixOp32, ZZZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmlal_single_vg2x4>; +defm FMLAL_VG2_M2Z2Z_HtoS : sme2_fp_mla_long_array_vg2_multi<"fmlal", 0b000, MatrixOp32, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmlal_vg2x2>; +defm FMLAL_VG4_M4Z4Z_HtoS : sme2_fp_mla_long_array_vg4_multi<"fmlal", 0b000, MatrixOp32, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmlal_vg2x4>; defm FMLSL_MZZI : 
sme2_mla_long_array_index<"fmlsl", 0b10, 0b01, nxv8f16, int_aarch64_sme_fmlsl_lane_vg2x1>; defm FMLSL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_lane_vg2x2>; defm FMLSL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_lane_vg2x4>; defm FMLSL_MZZ : sme2_mla_long_array_single<"fmlsl", 0b00, 0b01, nxv8f16, int_aarch64_sme_fmlsl_single_vg2x1>; -defm FMLSL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_single_vg2x2>; -defm FMLSL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_single_vg2x4>; -defm FMLSL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_vg2x2>; -defm FMLSL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"fmlsl", 0b01, nxv8f16, int_aarch64_sme_fmlsl_vg2x4>; +defm FMLSL_VG2_M2ZZ_HtoS : sme2_fp_mla_long_array_vg2_single<"fmlsl", 0b010, MatrixOp32, ZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmlsl_single_vg2x2>; +defm FMLSL_VG4_M4ZZ_HtoS : sme2_fp_mla_long_array_vg4_single<"fmlsl", 0b010, MatrixOp32, ZZZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fmlsl_single_vg2x4>; +defm FMLSL_VG2_M2Z2Z_HtoS : sme2_fp_mla_long_array_vg2_multi<"fmlsl", 0b001, MatrixOp32, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmlsl_vg2x2>; +defm FMLSL_VG4_M4Z4Z_HtoS : sme2_fp_mla_long_array_vg4_multi<"fmlsl", 0b001, MatrixOp32, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fmlsl_vg2x4>; defm BFMLAL_MZZI : sme2_mla_long_array_index<"bfmlal", 0b10, 0b10, nxv8bf16, int_aarch64_sme_fmlal_lane_vg2x1>; defm BFMLAL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_lane_vg2x2>; defm BFMLAL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_lane_vg2x4>; defm BFMLAL_MZZ : sme2_mla_long_array_single<"bfmlal", 0b00, 0b10, nxv8bf16, int_aarch64_sme_fmlal_single_vg2x1>; -defm BFMLAL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_single_vg2x2>; -defm BFMLAL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_single_vg2x4>; -defm BFMLAL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_vg2x2>; -defm BFMLAL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"bfmlal", 0b10, nxv8bf16, int_aarch64_sme_fmlal_vg2x4>; +defm BFMLAL_VG2_M2ZZ_HtoS : sme2_fp_mla_long_array_vg2_single<"bfmlal", 0b100, MatrixOp32, ZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmlal_single_vg2x2>; +defm BFMLAL_VG4_M4ZZ_HtoS : sme2_fp_mla_long_array_vg4_single<"bfmlal", 0b100, MatrixOp32, ZZZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmlal_single_vg2x4>; +defm BFMLAL_VG2_M2Z2Z_HtoS : sme2_fp_mla_long_array_vg2_multi<"bfmlal", 0b010, MatrixOp32, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmlal_vg2x2>; +defm BFMLAL_VG4_M4Z4Z_HtoS : sme2_fp_mla_long_array_vg4_multi<"bfmlal", 0b010, MatrixOp32, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmlal_vg2x4>; defm BFMLSL_MZZI : sme2_mla_long_array_index<"bfmlsl", 0b10, 0b11, nxv8bf16, int_aarch64_sme_fmlsl_lane_vg2x1>; defm BFMLSL_VG2_M2ZZI : sme2_fp_mla_long_array_vg2_index<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_lane_vg2x2>; defm BFMLSL_VG4_M4ZZI : sme2_fp_mla_long_array_vg4_index<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_lane_vg2x4>; defm BFMLSL_MZZ : sme2_mla_long_array_single<"bfmlsl", 0b00, 0b11, nxv8bf16, int_aarch64_sme_fmlsl_single_vg2x1>; -defm BFMLSL_VG2_M2ZZ : sme2_fp_mla_long_array_vg2_single<"bfmlsl", 0b11, 
nxv8bf16, int_aarch64_sme_fmlsl_single_vg2x2>; -defm BFMLSL_VG4_M4ZZ : sme2_fp_mla_long_array_vg4_single<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_single_vg2x4>; -defm BFMLSL_VG2_M2Z2Z : sme2_fp_mla_long_array_vg2_multi<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_vg2x2>; -defm BFMLSL_VG4_M4Z4Z : sme2_fp_mla_long_array_vg4_multi<"bfmlsl", 0b11, nxv8bf16, int_aarch64_sme_fmlsl_vg2x4>; +defm BFMLSL_VG2_M2ZZ_HtoS : sme2_fp_mla_long_array_vg2_single<"bfmlsl", 0b110, MatrixOp32, ZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmlsl_single_vg2x2>; +defm BFMLSL_VG4_M4ZZ_HtoS : sme2_fp_mla_long_array_vg4_single<"bfmlsl", 0b110, MatrixOp32, ZZZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fmlsl_single_vg2x4>; +defm BFMLSL_VG2_M2Z2Z_HtoS : sme2_fp_mla_long_array_vg2_multi<"bfmlsl", 0b011, MatrixOp32, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmlsl_vg2x2>; +defm BFMLSL_VG4_M4Z4Z_HtoS : sme2_fp_mla_long_array_vg4_multi<"bfmlsl", 0b011, MatrixOp32, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fmlsl_vg2x4>; defm SMLAL_MZZI : sme2_mla_long_array_index<"smlal", 0b11, 0b00, nxv8i16, int_aarch64_sme_smlal_lane_vg2x1>; defm SMLAL_VG2_M2ZZI : sme2_int_mla_long_array_vg2_index<"smlal", 0b00, int_aarch64_sme_smlal_lane_vg2x2>; @@ -446,122 +439,122 @@ defm SCLAMP_VG4_4Z4Z : sme2_int_clamp_vector_vg4_multi<"sclamp", 0b0>; defm UCLAMP_VG2_2Z2Z : sme2_int_clamp_vector_vg2_multi<"uclamp", 0b1>; defm UCLAMP_VG4_4Z4Z : sme2_int_clamp_vector_vg4_multi<"uclamp", 0b1>; -defm FDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"fdot", 0b1001, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fdot_lane_za32_vg1x2>; +defm FDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"fdot", 0b01, 0b1001, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fdot_lane_za32_vg1x2>; defm FDOT_VG4_M4ZZI_HtoS : sme2_multi_vec_array_vg4_index_32b<"fdot", 0b1001, ZZZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fdot_lane_za32_vg1x4>; defm FDOT_VG2_M2ZZ_HtoS : sme2_dot_mla_add_sub_array_vg2_single<"fdot", 0b0010000, MatrixOp32, ZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fdot_single_za32_vg1x2>; defm FDOT_VG4_M4ZZ_HtoS : sme2_dot_mla_add_sub_array_vg4_single<"fdot", 0b0110000, MatrixOp32, ZZZZ_h, ZPR4b16, nxv8f16, int_aarch64_sme_fdot_single_za32_vg1x4>; -defm FDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"fdot", 0b010000, MatrixOp32, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fdot_za32_vg1x2>; -defm FDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"fdot", 0b010000, MatrixOp32, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fdot_za32_vg1x4>; +defm FDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"fdot", 0b0100000, MatrixOp32, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_fdot_za32_vg1x2>; +defm FDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"fdot", 0b0100000, MatrixOp32, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_fdot_za32_vg1x4>; -defm BFDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"bfdot", 0b1011, ZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fdot_lane_za32_vg1x2>; +defm BFDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"bfdot", 0b01, 0b1011, ZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fdot_lane_za32_vg1x2>; defm BFDOT_VG4_M4ZZI_HtoS : sme2_multi_vec_array_vg4_index_32b<"bfdot", 0b1011, ZZZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fdot_lane_za32_vg1x4>; defm BFDOT_VG2_M2ZZ_HtoS : sme2_dot_mla_add_sub_array_vg2_single<"bfdot", 0b0010010, MatrixOp32, ZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fdot_single_za32_vg1x2>; defm BFDOT_VG4_M4ZZ_HtoS : sme2_dot_mla_add_sub_array_vg4_single<"bfdot", 0b0110010, MatrixOp32, 
ZZZZ_h, ZPR4b16, nxv8bf16, int_aarch64_sme_fdot_single_za32_vg1x4>; -defm BFDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"bfdot", 0b010010, MatrixOp32, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fdot_za32_vg1x2>; -defm BFDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"bfdot", 0b010010, MatrixOp32, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fdot_za32_vg1x4>; +defm BFDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"bfdot", 0b0100010, MatrixOp32, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fdot_za32_vg1x2>; +defm BFDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"bfdot", 0b0100010, MatrixOp32, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_fdot_za32_vg1x4>; -defm BFVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"bfvdot", 0b0011, ZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fvdot_lane_za32_vg1x2>; +defm BFVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"bfvdot", 0b01, 0b0011, ZZ_h_mul_r, ZPR4b16, nxv8bf16, int_aarch64_sme_fvdot_lane_za32_vg1x2>; -defm FVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"fvdot", 0b0001, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fvdot_lane_za32_vg1x2>; +defm FVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"fvdot", 0b01, 0b0001, ZZ_h_mul_r, ZPR4b16, nxv8f16, int_aarch64_sme_fvdot_lane_za32_vg1x2>; -defm SDOT_VG2_M2ZZI_HToS : sme2_multi_vec_array_vg2_index_32b<"sdot", 0b1000, ZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_sdot_lane_za32_vg1x2>; -defm SDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"sdot", 0b1100, ZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_sdot_lane_za32_vg1x2>; +defm SDOT_VG2_M2ZZI_HToS : sme2_multi_vec_array_vg2_index_32b<"sdot", 0b01, 0b1000, ZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_sdot_lane_za32_vg1x2>; +defm SDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"sdot", 0b01, 0b1100, ZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_sdot_lane_za32_vg1x2>; defm SDOT_VG4_M4ZZI_HToS : sme2_multi_vec_array_vg4_index_32b<"sdot", 0b1000, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_sdot_lane_za32_vg1x4>; defm SDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"sdot", 0b1100, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_sdot_lane_za32_vg1x4>; defm SDOT_VG2_M2ZZ_HtoS : sme2_dot_mla_add_sub_array_vg2_single<"sdot", 0b1010101, MatrixOp32, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_sdot_single_za32_vg1x2>; defm SDOT_VG4_M4ZZ_HtoS : sme2_dot_mla_add_sub_array_vg4_single<"sdot", 0b1110101, MatrixOp32, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_sdot_single_za32_vg1x4>; -defm SDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b110101, MatrixOp32, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_sdot_za32_vg1x2>; -defm SDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b110101, MatrixOp32, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_sdot_za32_vg1x4>; +defm SDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b1101001, MatrixOp32, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_sdot_za32_vg1x2>; +defm SDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b1101001, MatrixOp32, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_sdot_za32_vg1x4>; defm SDOT_VG2_M2ZZ_BtoS : sme2_dot_mla_add_sub_array_vg2_single<"sdot", 0b0010100, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_sdot_single_za32_vg1x2>; defm SDOT_VG4_M4ZZ_BtoS : sme2_dot_mla_add_sub_array_vg4_single<"sdot", 0b0110100, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_sdot_single_za32_vg1x4>; -defm SDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b010100, MatrixOp32, 
ZZ_b_mul_r, nxv16i8, int_aarch64_sme_sdot_za32_vg1x2>; -defm SDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b010100, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_sdot_za32_vg1x4>; +defm SDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b0101000, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_sdot_za32_vg1x2>; +defm SDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b0101000, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_sdot_za32_vg1x4>; -defm SUDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"sudot", 0b1111, ZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_sudot_lane_za32_vg1x2>; +defm SUDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"sudot", 0b01, 0b1111, ZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_sudot_lane_za32_vg1x2>; defm SUDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"sudot", 0b1111, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_sudot_lane_za32_vg1x4>; defm SUDOT_VG2_M2ZZ_BToS : sme2_dot_mla_add_sub_array_vg2_single<"sudot", 0b0010111, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_sudot_single_za32_vg1x2>; defm SUDOT_VG4_M4ZZ_BToS : sme2_dot_mla_add_sub_array_vg4_single<"sudot", 0b0110111, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_sudot_single_za32_vg1x4>; -defm SVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"svdot", 0b0100, ZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_svdot_lane_za32_vg1x2>; +defm SVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"svdot", 0b01, 0b0100, ZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_svdot_lane_za32_vg1x2>; defm SVDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"svdot", 0b0100, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_svdot_lane_za32_vg1x4>; defm SUVDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"suvdot", 0b0111, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_suvdot_lane_za32_vg1x4>; -defm UDOT_VG2_M2ZZI_HToS : sme2_multi_vec_array_vg2_index_32b<"udot", 0b1010, ZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_udot_lane_za32_vg1x2>; -defm UDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"udot", 0b1110, ZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_udot_lane_za32_vg1x2>; +defm UDOT_VG2_M2ZZI_HToS : sme2_multi_vec_array_vg2_index_32b<"udot", 0b01, 0b1010, ZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_udot_lane_za32_vg1x2>; +defm UDOT_VG2_M2ZZI_BToS : sme2_multi_vec_array_vg2_index_32b<"udot", 0b01, 0b1110, ZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_udot_lane_za32_vg1x2>; defm UDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"udot", 0b1110, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_udot_lane_za32_vg1x4>; defm UDOT_VG4_M4ZZI_HToS : sme2_multi_vec_array_vg4_index_32b<"udot", 0b1010, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_udot_lane_za32_vg1x4>; defm UDOT_VG2_M2ZZ_HtoS : sme2_dot_mla_add_sub_array_vg2_single<"udot", 0b1010111, MatrixOp32, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_udot_single_za32_vg1x2>; defm UDOT_VG4_M4ZZ_HtoS : sme2_dot_mla_add_sub_array_vg4_single<"udot", 0b1110111, MatrixOp32, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_udot_single_za32_vg1x4>; -defm UDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b110111, MatrixOp32, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_udot_za32_vg1x2>; -defm UDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b110111, MatrixOp32, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_udot_za32_vg1x4>; +defm UDOT_VG2_M2Z2Z_HtoS : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b1101011, MatrixOp32, ZZ_h_mul_r, nxv8i16, 
int_aarch64_sme_udot_za32_vg1x2>; +defm UDOT_VG4_M4Z4Z_HtoS : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b1101011, MatrixOp32, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_udot_za32_vg1x4>; defm UDOT_VG2_M2ZZ_BtoS : sme2_dot_mla_add_sub_array_vg2_single<"udot", 0b0010110, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_udot_single_za32_vg1x2>; defm UDOT_VG4_M4ZZ_BtoS : sme2_dot_mla_add_sub_array_vg4_single<"udot", 0b0110110, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_udot_single_za32_vg1x4>; -defm UDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b010110, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_udot_za32_vg1x2>; -defm UDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b010110, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_udot_za32_vg1x4>; +defm UDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b0101010, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_udot_za32_vg1x2>; +defm UDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b0101010, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_udot_za32_vg1x4>; -defm USDOT_VG2_M2ZZI_BToS: sme2_multi_vec_array_vg2_index_32b<"usdot", 0b1101, ZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_usdot_lane_za32_vg1x2>; +defm USDOT_VG2_M2ZZI_BToS: sme2_multi_vec_array_vg2_index_32b<"usdot", 0b01, 0b1101, ZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_usdot_lane_za32_vg1x2>; defm USDOT_VG4_M4ZZI_BToS: sme2_multi_vec_array_vg4_index_32b<"usdot", 0b1101, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_usdot_lane_za32_vg1x4>; defm USDOT_VG2_M2ZZ_BToS : sme2_dot_mla_add_sub_array_vg2_single<"usdot", 0b0010101, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_usdot_single_za32_vg1x2>; defm USDOT_VG4_M4ZZ_BToS : sme2_dot_mla_add_sub_array_vg4_single<"usdot", 0b0110101, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_usdot_single_za32_vg1x4>; -defm USDOT_VG2_M2Z2Z_BToS : sme2_dot_mla_add_sub_array_vg2_multi<"usdot", 0b010101, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_usdot_za32_vg1x2>; -defm USDOT_VG4_M4Z4Z_BToS : sme2_dot_mla_add_sub_array_vg4_multi<"usdot", 0b010101, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_usdot_za32_vg1x4>; +defm USDOT_VG2_M2Z2Z_BToS : sme2_dot_mla_add_sub_array_vg2_multi<"usdot", 0b0101001, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_usdot_za32_vg1x2>; +defm USDOT_VG4_M4Z4Z_BToS : sme2_dot_mla_add_sub_array_vg4_multi<"usdot", 0b0101001, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_usdot_za32_vg1x4>; defm USVDOT_VG4_M4ZZI_BToS : sme2_multi_vec_array_vg4_index_32b<"usvdot", 0b0101, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_usvdot_lane_za32_vg1x4>; -defm UVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"uvdot", 0b0110, ZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_uvdot_lane_za32_vg1x2>; +defm UVDOT_VG2_M2ZZI_HtoS : sme2_multi_vec_array_vg2_index_32b<"uvdot", 0b01, 0b0110, ZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_uvdot_lane_za32_vg1x2>; defm UVDOT_VG4_M4ZZI_BtoS : sme2_multi_vec_array_vg4_index_32b<"uvdot", 0b0110, ZZZZ_b_mul_r, ZPR4b8, nxv16i8, int_aarch64_sme_uvdot_lane_za32_vg1x4>; -defm SMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"smlall", 0b000, int_aarch64_sme_smla_za32_lane_vg4x1>; -defm SMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"smlall", 0b000, int_aarch64_sme_smla_za32_lane_vg4x2>; -defm SMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"smlall", 0b000, int_aarch64_sme_smla_za32_lane_vg4x4>; -defm SMLALL_MZZ_BtoS : sme2_mla_ll_array_single<"smlall", 0b0000, MatrixOp32, ZPR8, 
ZPR4b8, nxv16i8, int_aarch64_sme_smla_za32_single_vg4x1>; +defm SMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"smlall", 0b00, 0b000, int_aarch64_sme_smla_za32_lane_vg4x1>; +defm SMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"smlall", 0b00, 0b000, int_aarch64_sme_smla_za32_lane_vg4x2>; +defm SMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"smlall", 0b00, 0b0000, int_aarch64_sme_smla_za32_lane_vg4x4>; +defm SMLALL_MZZ_BtoS : sme2_mla_ll_array_single<"smlall", 0b00000, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_smla_za32_single_vg4x1>; defm SMLALL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"smlall", 0b00000, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_smla_za32_single_vg4x2>; defm SMLALL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"smlall", 0b01000, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_smla_za32_single_vg4x4>; -defm SMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"smlall", 0b0000, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_smla_za32_vg4x2>; -defm SMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"smlall", 0b0000, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_smla_za32_vg4x4>; +defm SMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"smlall", 0b00000, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_smla_za32_vg4x2>; +defm SMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"smlall", 0b00000, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_smla_za32_vg4x4>; -defm USMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"usmlall", 0b001, int_aarch64_sme_usmla_za32_lane_vg4x1>; -defm USMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"usmlall", 0b100, int_aarch64_sme_usmla_za32_lane_vg4x2>; -defm USMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"usmlall", 0b100, int_aarch64_sme_usmla_za32_lane_vg4x4>; -defm USMLALL_MZZ_BtoS : sme2_mla_ll_array_single<"usmlall", 0b0001, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_usmla_za32_single_vg4x1>; +defm USMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"usmlall", 0b00, 0b001, int_aarch64_sme_usmla_za32_lane_vg4x1>; +defm USMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"usmlall", 0b00, 0b100, int_aarch64_sme_usmla_za32_lane_vg4x2>; +defm USMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"usmlall", 0b00, 0b0100, int_aarch64_sme_usmla_za32_lane_vg4x4>; +defm USMLALL_MZZ_BtoS : sme2_mla_ll_array_single<"usmlall", 0b00001, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_usmla_za32_single_vg4x1>; defm USMLALL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"usmlall", 0b00001, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_usmla_za32_single_vg4x2>; defm USMLALL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"usmlall", 0b01001, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_usmla_za32_single_vg4x4>; -defm USMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"usmlall", 0b0001, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_usmla_za32_vg4x2>; -defm USMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"usmlall", 0b0001, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_usmla_za32_vg4x4>; +defm USMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"usmlall", 0b00001, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_usmla_za32_vg4x2>; +defm USMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"usmlall", 0b00001, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_usmla_za32_vg4x4>; -defm SMLSLL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"smlsll", 0b010, int_aarch64_sme_smls_za32_lane_vg4x1>; -defm SMLSLL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"smlsll", 
0b001, int_aarch64_sme_smls_za32_lane_vg4x2>; -defm SMLSLL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"smlsll", 0b001, int_aarch64_sme_smls_za32_lane_vg4x4>; -defm SMLSLL_MZZ_BtoS : sme2_mla_ll_array_single<"smlsll", 0b0010, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_smls_za32_single_vg4x1>; +defm SMLSLL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"smlsll", 0b00, 0b010, int_aarch64_sme_smls_za32_lane_vg4x1>; +defm SMLSLL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"smlsll", 0b00, 0b001, int_aarch64_sme_smls_za32_lane_vg4x2>; +defm SMLSLL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"smlsll", 0b00, 0b0001, int_aarch64_sme_smls_za32_lane_vg4x4>; +defm SMLSLL_MZZ_BtoS : sme2_mla_ll_array_single<"smlsll", 0b00010, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_smls_za32_single_vg4x1>; defm SMLSLL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"smlsll", 0b00010, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_smls_za32_single_vg4x2>; defm SMLSLL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"smlsll", 0b01010, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_smls_za32_single_vg4x4>; -defm SMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"smlsll", 0b0010, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_smls_za32_vg4x2>; -defm SMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"smlsll", 0b0010, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_smls_za32_vg4x4>; +defm SMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"smlsll", 0b00010, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_smls_za32_vg4x2>; +defm SMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"smlsll", 0b00010, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_smls_za32_vg4x4>; -defm UMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"umlall", 0b100, int_aarch64_sme_umla_za32_lane_vg4x1>; -defm UMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"umlall", 0b010, int_aarch64_sme_umla_za32_lane_vg4x2>; -defm UMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"umlall", 0b010, int_aarch64_sme_umla_za32_lane_vg4x4>; -defm UMLALL_MZZ_BtoS : sme2_mla_ll_array_single<"umlall", 0b0100, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_umla_za32_single_vg4x1>; +defm UMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"umlall", 0b00, 0b100, int_aarch64_sme_umla_za32_lane_vg4x1>; +defm UMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"umlall", 0b00, 0b010, int_aarch64_sme_umla_za32_lane_vg4x2>; +defm UMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"umlall", 0b00, 0b0010, int_aarch64_sme_umla_za32_lane_vg4x4>; +defm UMLALL_MZZ_BtoS : sme2_mla_ll_array_single<"umlall", 0b00100, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_umla_za32_single_vg4x1>; defm UMLALL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"umlall", 0b00100, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_umla_za32_single_vg4x2>; defm UMLALL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"umlall", 0b01100, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_umla_za32_single_vg4x4>; -defm UMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlall", 0b0100, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_umla_za32_vg4x2>; -defm UMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlall", 0b0100, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_umla_za32_vg4x4>; +defm UMLALL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlall", 0b00100, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_umla_za32_vg4x2>; +defm UMLALL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlall", 0b00100, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, 
int_aarch64_sme_umla_za32_vg4x4>; -defm SUMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"sumlall", 0b101, int_aarch64_sme_sumla_za32_lane_vg4x1>; -defm SUMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"sumlall", 0b110, int_aarch64_sme_sumla_za32_lane_vg4x2>; -defm SUMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"sumlall", 0b110, int_aarch64_sme_sumla_za32_lane_vg4x4>; +defm SUMLALL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"sumlall", 0b00, 0b101, int_aarch64_sme_sumla_za32_lane_vg4x1>; +defm SUMLALL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"sumlall", 0b00, 0b110, int_aarch64_sme_sumla_za32_lane_vg4x2>; +defm SUMLALL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"sumlall", 0b00, 0b0110, int_aarch64_sme_sumla_za32_lane_vg4x4>; defm SUMLALL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"sumlall", 0b00101, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_sumla_za32_single_vg4x2>; defm SUMLALL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"sumlall", 0b01101, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_sumla_za32_single_vg4x4>; -defm UMLSLL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"umlsll", 0b110, int_aarch64_sme_umls_za32_lane_vg4x1>; -defm UMLSLL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"umlsll", 0b011, int_aarch64_sme_umls_za32_lane_vg4x2>; -defm UMLSLL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"umlsll", 0b011, int_aarch64_sme_umls_za32_lane_vg4x4>; -defm UMLSLL_MZZ_BtoS : sme2_mla_ll_array_single<"umlsll", 0b0110, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_umls_za32_single_vg4x1>; +defm UMLSLL_MZZI_BtoS : sme2_mla_ll_array_index_32b<"umlsll", 0b00, 0b110, int_aarch64_sme_umls_za32_lane_vg4x1>; +defm UMLSLL_VG2_M2ZZI_BtoS : sme2_mla_ll_array_vg2_index_32b<"umlsll", 0b00, 0b011, int_aarch64_sme_umls_za32_lane_vg4x2>; +defm UMLSLL_VG4_M4ZZI_BtoS : sme2_mla_ll_array_vg4_index_32b<"umlsll", 0b00, 0b0011, int_aarch64_sme_umls_za32_lane_vg4x4>; +defm UMLSLL_MZZ_BtoS : sme2_mla_ll_array_single<"umlsll", 0b00110, MatrixOp32, ZPR8, ZPR4b8, nxv16i8, int_aarch64_sme_umls_za32_single_vg4x1>; defm UMLSLL_VG2_M2ZZ_BtoS : sme2_mla_ll_array_vg2_single<"umlsll", 0b00110, MatrixOp32, ZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_umls_za32_single_vg4x2>; defm UMLSLL_VG4_M4ZZ_BtoS : sme2_mla_ll_array_vg4_single<"umlsll", 0b01110, MatrixOp32, ZZZZ_b, ZPR4b8, nxv16i8, int_aarch64_sme_umls_za32_single_vg4x4>; -defm UMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlsll", 0b0110, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_umls_za32_vg4x2>; -defm UMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlsll", 0b0110, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_umls_za32_vg4x4>; +defm UMLSLL_VG2_M2Z2Z_BtoS : sme2_mla_ll_array_vg2_multi<"umlsll", 0b00110, MatrixOp32, ZZ_b_mul_r, nxv16i8, int_aarch64_sme_umls_za32_vg4x2>; +defm UMLSLL_VG4_M4Z4Z_BtoS : sme2_mla_ll_array_vg4_multi<"umlsll", 0b00110, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, int_aarch64_sme_umls_za32_vg4x4>; defm BMOPA_MPPZZ_S : sme2_int_bmopx_tile<"bmopa", 0b100, int_aarch64_sme_bmopa_za32>; defm BMOPS_MPPZZ_S : sme2_int_bmopx_tile<"bmops", 0b101, int_aarch64_sme_bmops_za32>; @@ -707,13 +700,13 @@ defm STNT1D_4Z_STRIDED_IMM : sme2_st_vector_vg4_multi_scalar_immediate<0b11, 0b1 let Predicates = [HasSME2, HasSMEI16I64] in { defm ADD_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg2_single<"add", 0b1011010, MatrixOp64, ZZ_d, ZPR4b64, nxv2i64, int_aarch64_sme_add_write_single_za_vg1x2>; defm ADD_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg4_single<"add", 0b1111010, MatrixOp64, ZZZZ_d, ZPR4b64, nxv2i64, 
int_aarch64_sme_add_write_single_za_vg1x4>; -defm ADD_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"add", 0b111010, MatrixOp64, ZZ_d_mul_r, nxv2i64, int_aarch64_sme_add_write_za_vg1x2>; -defm ADD_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"add", 0b111010, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, int_aarch64_sme_add_write_za_vg1x4>; +defm ADD_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"add", 0b1110010, MatrixOp64, ZZ_d_mul_r, nxv2i64, int_aarch64_sme_add_write_za_vg1x2>; +defm ADD_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"add", 0b1110010, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, int_aarch64_sme_add_write_za_vg1x4>; defm SUB_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg2_single<"sub", 0b1011011, MatrixOp64, ZZ_d, ZPR4b64, nxv2i64, int_aarch64_sme_sub_write_single_za_vg1x2>; defm SUB_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg4_single<"sub", 0b1111011, MatrixOp64, ZZZZ_d, ZPR4b64, nxv2i64, int_aarch64_sme_sub_write_single_za_vg1x4>; -defm SUB_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b111011, MatrixOp64, ZZ_d_mul_r, nxv2i64, int_aarch64_sme_sub_write_za_vg1x2>; -defm SUB_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b111011, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, int_aarch64_sme_sub_write_za_vg1x4>; +defm SUB_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"sub", 0b1110011, MatrixOp64, ZZ_d_mul_r, nxv2i64, int_aarch64_sme_sub_write_za_vg1x2>; +defm SUB_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"sub", 0b1110011, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, int_aarch64_sme_sub_write_za_vg1x4>; defm ADD_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"add", 0b1010, MatrixOp64, ZZ_d_mul_r, nxv2i64, int_aarch64_sme_add_za64_vg1x2>; defm ADD_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"add", 0b1010, MatrixOp64, ZZZZ_d_mul_r, nxv2i64, int_aarch64_sme_add_za64_vg1x4>; @@ -725,8 +718,8 @@ defm SDOT_VG2_M2ZZI_HtoD : sme2_multi_vec_array_vg2_index_64b<"sdot", 0b01, ZZ_h defm SDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"sdot", 0b001, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_sdot_lane_za64_vg1x4>; defm SDOT_VG2_M2ZZ_HtoD : sme2_dot_mla_add_sub_array_vg2_single<"sdot", 0b1010100, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_sdot_single_za64_vg1x2>; defm SDOT_VG4_M4ZZ_HtoD : sme2_dot_mla_add_sub_array_vg4_single<"sdot", 0b1110100, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_sdot_single_za64_vg1x4>; -defm SDOT_VG2_M2Z2Z_HtoD : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b110100, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_sdot_za64_vg1x2>; -defm SDOT_VG4_M4Z4Z_HtoD : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b110100, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_sdot_za64_vg1x4>; +defm SDOT_VG2_M2Z2Z_HtoD : sme2_dot_mla_add_sub_array_vg2_multi<"sdot", 0b1101000, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_sdot_za64_vg1x2>; +defm SDOT_VG4_M4Z4Z_HtoD : sme2_dot_mla_add_sub_array_vg4_multi<"sdot", 0b1101000, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_sdot_za64_vg1x4>; defm SVDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"svdot", 0b101, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_svdot_lane_za64_vg1x4>; @@ -734,46 +727,46 @@ defm UDOT_VG2_M2ZZI_HtoD : sme2_multi_vec_array_vg2_index_64b<"udot", 0b11, ZZ_h defm UDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"udot", 0b011, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_udot_lane_za64_vg1x4>; defm UDOT_VG2_M2ZZ_HtoD : sme2_dot_mla_add_sub_array_vg2_single<"udot", 0b1010110, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, 
int_aarch64_sme_udot_single_za64_vg1x2>; defm UDOT_VG4_M4ZZ_HtoD : sme2_dot_mla_add_sub_array_vg4_single<"udot", 0b1110110, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_udot_single_za64_vg1x4>; -defm UDOT_VG2_M2Z2Z_HtoD : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b110110, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_udot_za64_vg1x2>; -defm UDOT_VG4_M4Z4Z_HtoD : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b110110, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_udot_za64_vg1x4>; +defm UDOT_VG2_M2Z2Z_HtoD : sme2_dot_mla_add_sub_array_vg2_multi<"udot", 0b1101010, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_udot_za64_vg1x2>; +defm UDOT_VG4_M4Z4Z_HtoD : sme2_dot_mla_add_sub_array_vg4_multi<"udot", 0b1101010, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_udot_za64_vg1x4>; defm UVDOT_VG4_M4ZZI_HtoD : sme2_multi_vec_array_vg4_index_64b<"uvdot", 0b111, ZZZZ_h_mul_r, ZPR4b16, nxv8i16, int_aarch64_sme_uvdot_lane_za64_vg1x4>; defm SMLALL_MZZI_HtoD : sme2_mla_ll_array_index_64b<"smlall", 0b00, int_aarch64_sme_smla_za64_lane_vg4x1>; defm SMLALL_VG2_M2ZZI_HtoD : sme2_mla_ll_array_vg2_index_64b<"smlall", 0b00, int_aarch64_sme_smla_za64_lane_vg4x2>; defm SMLALL_VG4_M4ZZI_HtoD : sme2_mla_ll_array_vg4_index_64b<"smlall", 0b00, int_aarch64_sme_smla_za64_lane_vg4x4>; -defm SMLALL_MZZ_HtoD : sme2_mla_ll_array_single<"smlall", 0b1000, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_smla_za64_single_vg4x1>; +defm SMLALL_MZZ_HtoD : sme2_mla_ll_array_single<"smlall", 0b10000, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_smla_za64_single_vg4x1>; defm SMLALL_VG2_M2ZZ_HtoD : sme2_mla_ll_array_vg2_single<"smlall", 0b10000, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_smla_za64_single_vg4x2>; defm SMLALL_VG4_M4ZZ_HtoD : sme2_mla_ll_array_vg4_single<"smlall", 0b11000, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_smla_za64_single_vg4x4>; -defm SMLALL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"smlall", 0b1000, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_smla_za64_vg4x2>; -defm SMLALL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"smlall", 0b1000, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_smla_za64_vg4x4>; +defm SMLALL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"smlall", 0b10000, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_smla_za64_vg4x2>; +defm SMLALL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"smlall", 0b10000, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_smla_za64_vg4x4>; defm SMLSLL_MZZI_HtoD : sme2_mla_ll_array_index_64b<"smlsll", 0b01, int_aarch64_sme_smls_za64_lane_vg4x1>; defm SMLSLL_VG2_M2ZZI_HtoD : sme2_mla_ll_array_vg2_index_64b<"smlsll", 0b01, int_aarch64_sme_smls_za64_lane_vg4x2>; defm SMLSLL_VG4_M4ZZI_HtoD : sme2_mla_ll_array_vg4_index_64b<"smlsll", 0b01, int_aarch64_sme_smls_za64_lane_vg4x4>; -defm SMLSLL_MZZ_HtoD : sme2_mla_ll_array_single<"smlsll", 0b1010, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_smls_za64_single_vg4x1>; +defm SMLSLL_MZZ_HtoD : sme2_mla_ll_array_single<"smlsll", 0b10010, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_smls_za64_single_vg4x1>; defm SMLSLL_VG2_M2ZZ_HtoD : sme2_mla_ll_array_vg2_single<"smlsll", 0b10010, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_smls_za64_single_vg4x2>; defm SMLSLL_VG4_M4ZZ_HtoD : sme2_mla_ll_array_vg4_single<"smlsll", 0b11010, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_smls_za64_single_vg4x4>; -defm SMLSLL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"smlsll", 0b1010, MatrixOp64, ZZ_h_mul_r, nxv8i16, 
int_aarch64_sme_smls_za64_vg4x2>; -defm SMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"smlsll", 0b1010, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_smls_za64_vg4x4>; +defm SMLSLL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"smlsll", 0b10010, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_smls_za64_vg4x2>; +defm SMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"smlsll", 0b10010, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_smls_za64_vg4x4>; defm UMLALL_MZZI_HtoD : sme2_mla_ll_array_index_64b<"umlall", 0b10, int_aarch64_sme_umla_za64_lane_vg4x1>; defm UMLALL_VG2_M2ZZI_HtoD : sme2_mla_ll_array_vg2_index_64b<"umlall", 0b10, int_aarch64_sme_umla_za64_lane_vg4x2>; defm UMLALL_VG4_M4ZZI_HtoD : sme2_mla_ll_array_vg4_index_64b<"umlall", 0b10, int_aarch64_sme_umla_za64_lane_vg4x4>; -defm UMLALL_MZZ_HtoD : sme2_mla_ll_array_single<"umlall", 0b1100, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_umla_za64_single_vg4x1>; +defm UMLALL_MZZ_HtoD : sme2_mla_ll_array_single<"umlall", 0b10100, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_umla_za64_single_vg4x1>; defm UMLALL_VG2_M2ZZ_HtoD : sme2_mla_ll_array_vg2_single<"umlall", 0b10100, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umla_za64_single_vg4x2>; defm UMLALL_VG4_M4ZZ_HtoD : sme2_mla_ll_array_vg4_single<"umlall", 0b11100, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umla_za64_single_vg4x4>; -defm UMLALL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"umlall", 0b1100, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_umla_za64_vg4x2>; -defm UMLALL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlall", 0b1100, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_umla_za64_vg4x4>; +defm UMLALL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"umlall", 0b10100, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_umla_za64_vg4x2>; +defm UMLALL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlall", 0b10100, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_umla_za64_vg4x4>; defm UMLSLL_MZZI_HtoD : sme2_mla_ll_array_index_64b<"umlsll", 0b11, int_aarch64_sme_umls_za64_lane_vg4x1>; defm UMLSLL_VG2_M2ZZI_HtoD : sme2_mla_ll_array_vg2_index_64b<"umlsll", 0b11, int_aarch64_sme_umls_za64_lane_vg4x2>; defm UMLSLL_VG4_M4ZZI_HtoD : sme2_mla_ll_array_vg4_index_64b<"umlsll", 0b11, int_aarch64_sme_umls_za64_lane_vg4x4>; -defm UMLSLL_MZZ_HtoD : sme2_mla_ll_array_single<"umlsll", 0b1110, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_umls_za64_single_vg4x1>; +defm UMLSLL_MZZ_HtoD : sme2_mla_ll_array_single<"umlsll", 0b10110, MatrixOp64, ZPR16, ZPR4b16, nxv8i16, int_aarch64_sme_umls_za64_single_vg4x1>; defm UMLSLL_VG2_M2ZZ_HtoD : sme2_mla_ll_array_vg2_single<"umlsll", 0b10110, MatrixOp64, ZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umls_za64_single_vg4x2>; defm UMLSLL_VG4_M4ZZ_HtoD : sme2_mla_ll_array_vg4_single<"umlsll", 0b11110, MatrixOp64, ZZZZ_h, ZPR4b16, nxv8i16, int_aarch64_sme_umls_za64_single_vg4x4>; -defm UMLSLL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"umlsll", 0b1110, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_umls_za64_vg4x2>; -defm UMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlsll", 0b1110, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_umls_za64_vg4x4>; +defm UMLSLL_VG2_M2Z2Z_HtoD : sme2_mla_ll_array_vg2_multi<"umlsll", 0b10110, MatrixOp64, ZZ_h_mul_r, nxv8i16, int_aarch64_sme_umls_za64_vg4x2>; +defm UMLSLL_VG4_M4Z4Z_HtoD : sme2_mla_ll_array_vg4_multi<"umlsll", 0b10110, MatrixOp64, ZZZZ_h_mul_r, nxv8i16, int_aarch64_sme_umls_za64_vg4x4>; } let Predicates = [HasSME2, HasSMEF64F64] in { @@ 
-781,15 +774,15 @@ defm FMLA_VG2_M2ZZI_D : sme2_multi_vec_array_vg2_index_64b<"fmla", 0b00, ZZ_d_mu defm FMLA_VG4_M4ZZI_D : sme2_multi_vec_array_vg4_index_64b<"fmla", 0b000, ZZZZ_d_mul_r, ZPR4b64, nxv2f64, int_aarch64_sme_fmla_lane_vg1x4>; defm FMLA_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg2_single<"fmla", 0b1011000, MatrixOp64, ZZ_d, ZPR4b64, nxv2f64, int_aarch64_sme_fmla_single_vg1x2>; defm FMLA_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg4_single<"fmla", 0b1111000, MatrixOp64, ZZZZ_d, ZPR4b64, nxv2f64, int_aarch64_sme_fmla_single_vg1x4>; -defm FMLA_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b111000, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmla_vg1x2>; -defm FMLA_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b111000, MatrixOp64, ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmla_vg1x4>; +defm FMLA_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b1110000, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmla_vg1x2>; +defm FMLA_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b1110000, MatrixOp64, ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmla_vg1x4>; defm FMLS_VG2_M2ZZI_D : sme2_multi_vec_array_vg2_index_64b<"fmls", 0b10, ZZ_d_mul_r, ZPR4b64, nxv2f64, int_aarch64_sme_fmls_lane_vg1x2>; defm FMLS_VG4_M4ZZI_D : sme2_multi_vec_array_vg4_index_64b<"fmls", 0b010, ZZZZ_d_mul_r, ZPR4b64, nxv2f64, int_aarch64_sme_fmls_lane_vg1x4>; defm FMLS_VG2_M2ZZ_D : sme2_dot_mla_add_sub_array_vg2_single<"fmls", 0b1011001, MatrixOp64, ZZ_d, ZPR4b64, nxv2f64, int_aarch64_sme_fmls_single_vg1x2>; defm FMLS_VG4_M4ZZ_D : sme2_dot_mla_add_sub_array_vg4_single<"fmls", 0b1111001, MatrixOp64, ZZZZ_d, ZPR4b64, nxv2f64, int_aarch64_sme_fmls_single_vg1x4>; -defm FMLS_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b111001, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmls_vg1x2>; -defm FMLS_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b111001, MatrixOp64, ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmls_vg1x4>; +defm FMLS_VG2_M2Z2Z_D : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b1110001, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmls_vg1x2>; +defm FMLS_VG4_M4Z4Z_D : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b1110001, MatrixOp64, ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_fmls_vg1x4>; defm FADD_VG2_M2Z_D : sme2_multivec_accum_add_sub_vg2<"fadd", 0b1000, MatrixOp64, ZZ_d_mul_r, nxv2f64, int_aarch64_sme_add_za64_vg1x2>; defm FADD_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"fadd", 0b1000, MatrixOp64, ZZZZ_d_mul_r, nxv2f64, int_aarch64_sme_add_za64_vg1x4>; @@ -820,25 +813,25 @@ defm FADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0100, MatrixOp16 defm FSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0101, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; defm FSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fsub", 0b0101, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; -defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00>; -defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b00>; +defm FMLA_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmla", 0b00, 0b100, ZZ_h_mul_r, ZPR4b16>; +defm FMLA_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmla", 0b000, ZZZZ_h_mul_r, ZPR4b16>; defm FMLA_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0011100, MatrixOp16, ZZ_h, ZPR4b16>; defm FMLA_VG4_M4ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmla", 0b0111100, MatrixOp16, ZZZZ_h, ZPR4b16>; -defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b010001, 
MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; -defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b010001, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; +defm FMLA_VG2_M2Z4Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmla", 0b0100001, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; +defm FMLA_VG4_M4Z4Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmla", 0b0100001, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; -defm FMLS_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmls", 0b01>; -defm FMLS_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmls", 0b01>; +defm FMLS_VG2_M2ZZI_H : sme2p1_multi_vec_array_vg2_index_16b<"fmls", 0b00, 0b101, ZZ_h_mul_r, ZPR4b16>; +defm FMLS_VG4_M4ZZI_H : sme2p1_multi_vec_array_vg4_index_16b<"fmls", 0b001, ZZZZ_h_mul_r, ZPR4b16>; defm FMLS_VG2_M2ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0011101, MatrixOp16, ZZ_h, ZPR4b16>; defm FMLS_VG4_M4ZZ_H : sme2_dot_mla_add_sub_array_vg24_single<"fmls", 0b0111101, MatrixOp16, ZZZZ_h, ZPR4b16>; -defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b010011, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; -defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b010011, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; +defm FMLS_VG2_M2Z2Z_H : sme2_dot_mla_add_sub_array_vg2_multi<"fmls", 0b0100011, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; +defm FMLS_VG4_M4Z2Z_H : sme2_dot_mla_add_sub_array_vg4_multi<"fmls", 0b0100011, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; defm FCVT_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvt", 0b0>; defm FCVTL_2ZZ_H : sme2p1_fp_cvt_vector_vg2_single<"fcvtl", 0b1>; -defm FMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmopa", 0b0, 0b0>; -defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1>; +defm FMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmopa", 0b0, 0b0, 0b11, ZPR16>; +defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1, 0b11, ZPR16>; } let Predicates = [HasSME2p1, HasB16B16] in { @@ -847,19 +840,19 @@ defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp defm BFSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfsub", 0b1101, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; defm BFSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfsub", 0b1101, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFMLA_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmla", 0b10>; -defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b10>; +defm BFMLA_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmla", 0b00, 0b110, ZZ_h_mul_r, ZPR4b16>; +defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b010, ZZZZ_h_mul_r, ZPR4b16>; defm BFMLA_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmla", 0b1011100, MatrixOp16, ZZ_h, ZPR4b16>; defm BFMLA_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmla", 0b1111100, MatrixOp16, ZZZZ_h, ZPR4b16>; -defm BFMLA_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmla", 0b110001, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFMLA_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmla", 0b110001, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; +defm BFMLA_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmla", 0b1100001, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; +defm BFMLA_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmla", 0b1100001, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFMLS_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmls", 0b11>; -defm BFMLS_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmls", 
0b11>; +defm BFMLS_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmls", 0b00, 0b111, ZZ_h_mul_r, ZPR4b16>; +defm BFMLS_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmls", 0b011, ZZZZ_h_mul_r, ZPR4b16>; defm BFMLS_VG2_M2ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmls", 0b1011101, MatrixOp16, ZZ_h, ZPR4b16>; defm BFMLS_VG4_M4ZZ : sme2_dot_mla_add_sub_array_vg24_single<"bfmls", 0b1111101, MatrixOp16, ZZZZ_h, ZPR4b16>; -defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b110011, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b110011, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; +defm BFMLS_VG2_M2Z2Z : sme2_dot_mla_add_sub_array_vg2_multi<"bfmls", 0b1100011, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; +defm BFMLS_VG4_M4Z4Z : sme2_dot_mla_add_sub_array_vg4_multi<"bfmls", 0b1100011, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; defm BFMAX_VG2_2ZZ : sme2p1_bf_max_min_vector_vg2_single<"bfmax", 0b0010000>; @@ -885,6 +878,6 @@ defm BFMINNM_VG4_4Z2Z : sme2p1_bf_max_min_vector_vg4_multi<"bfminnm", 0b0010011 defm BFCLAMP_VG2_2ZZZ: sme2p1_bfclamp_vector_vg2_multi<"bfclamp">; defm BFCLAMP_VG4_4ZZZ: sme2p1_bfclamp_vector_vg4_multi<"bfclamp">; -defm BFMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmopa", 0b1, 0b0>; -defm BFMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmops", 0b1, 0b1>; +defm BFMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmopa", 0b1, 0b0, 0b11, ZPR16>; +defm BFMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmops", 0b1, 0b1, 0b11, ZPR16>; } diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index b4f02e0dd20374eb58fc3dc34a9dd51226623a0e..1d00f152940a98d5e6b16f72aa9ac061cfdf78fc 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2204,8 +2204,8 @@ let Predicates = [HasSVEorSME] in { } // End HasSVEorSME let Predicates = [HasBF16, HasSVEorSME] in { - defm BFDOT_ZZZ : sve_float_dot<0b1, "bfdot", nxv8bf16, int_aarch64_sve_bfdot>; - defm BFDOT_ZZI : sve_float_dot_indexed<0b1, "bfdot", nxv8bf16, int_aarch64_sve_bfdot_lane_v2>; + defm BFDOT_ZZZ : sve_float_dot<0b1, 0b0, ZPR32, ZPR16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot>; + defm BFDOT_ZZI : sve_float_dot_indexed<0b1, 0b00, ZPR16, ZPR3b16, "bfdot", nxv8bf16, int_aarch64_sve_bfdot_lane_v2>; } // End HasBF16, HasSVEorSME let Predicates = [HasBF16, HasSVE] in { @@ -2544,8 +2544,8 @@ let Predicates = [HasSVEorSME] in { def : Pat<(nxv4f32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4f32 ZPR:$src)>; def : Pat<(nxv2f64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2f64 ZPR:$src)>; - def : Pat<(nxv16i1 (bitconvert (aarch64svcount PPR:$src))), (nxv16i1 PPR:$src)>; - def : Pat<(aarch64svcount (bitconvert (nxv16i1 PPR:$src))), (aarch64svcount PPR:$src)>; + def : Pat<(nxv16i1 (bitconvert (aarch64svcount PNR:$src))), (nxv16i1 PPR:$src)>; + def : Pat<(aarch64svcount (bitconvert (nxv16i1 PPR:$src))), (aarch64svcount PNR:$src)>; } // These allow casting from/to unpacked predicate types. 
@@ -3753,8 +3753,8 @@ defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>; let Predicates = [HasSVE2p1_or_HasSME2] in { defm FCLAMP_ZZZ : sve2p1_fclamp<"fclamp", int_aarch64_sve_fclamp>; -defm FDOT_ZZZ_S : sve_float_dot<0b0, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; -defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; +defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; +defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; def BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb">; def BFMLSLT_ZZZ_S : sve2_fp_mla_long<0b111, "bfmlslt">; def BFMLSLB_ZZZI_S : sve2_fp_mla_long_by_indexed_elem<0b110, "bfmlslb">; @@ -3852,9 +3852,9 @@ defm STNT1D_4Z_IMM : sve2p1_mem_cst_si_4z<"stnt1d", 0b11, 0b1, ZZZZ_d_mul_r>; multiclass store_pn_x2<ValueType Ty, SDPatternOperator Store, Instruction RegImmInst> { def : Pat<(Store (Ty ZPR:$vec0), (Ty ZPR:$vec1), - (aarch64svcount PPR:$PNg), GPR64:$base), + (aarch64svcount PNR:$PNg), GPR64:$base), (RegImmInst (REG_SEQUENCE ZPR2Mul2, Ty:$vec0, zsub0, Ty:$vec1, zsub1), - PPR:$PNg, GPR64:$base, (i64 0))>; + PNR:$PNg, GPR64:$base, (i64 0))>; } // Stores of 2 consecutive vectors @@ -3878,10 +3878,10 @@ defm : store_pn_x2; multiclass store_pn_x4<ValueType Ty, SDPatternOperator Store, Instruction RegImmInst> { def : Pat<(Store (Ty ZPR:$vec0), (Ty ZPR:$vec1), (Ty ZPR:$vec2), (Ty ZPR:$vec3), - (aarch64svcount PPR:$PNg), GPR64:$base), + (aarch64svcount PNR:$PNg), GPR64:$base), (RegImmInst (REG_SEQUENCE ZPR4Mul4, Ty:$vec0, zsub0, Ty:$vec1, zsub1, Ty:$vec2, zsub2, Ty:$vec3, zsub3), - PPR:$PNg, GPR64:$base, (i64 0))>; + PNR:$PNg, GPR64:$base, (i64 0))>; } // Stores of 4 consecutive vectors @@ -3918,26 +3918,28 @@ defm WHILEHS_CXX : sve2p1_int_while_rr_pn<"whilehs", 0b100>; defm WHILEHI_CXX : sve2p1_int_while_rr_pn<"whilehi", 0b101>; defm WHILELO_CXX : sve2p1_int_while_rr_pn<"whilelo", 0b110>; defm WHILELS_CXX : sve2p1_int_while_rr_pn<"whilels", 0b111>; +} // End HasSVE2p1_or_HasSME2 +let Predicates = [HasSVEorSME] in { // Aliases for existing SVE instructions for which predicate-as-counter are // accepted as an operand to the instruction def : InstAlias<"ldr $Pt, [$Rn, $imm9, mul vl]", - (LDR_PXI PNRAny:$Pt, GPR64sp:$Rn, simm9:$imm9), 0>; + (LDR_PXI PNRasPPRAny:$Pt, GPR64sp:$Rn, simm9:$imm9), 0>; def : InstAlias<"ldr $Pt, [$Rn]", - (LDR_PXI PNRAny:$Pt, GPR64sp:$Rn, 0), 0>; + (LDR_PXI PNRasPPRAny:$Pt, GPR64sp:$Rn, 0), 0>; def : InstAlias<"str $Pt, [$Rn, $imm9, mul vl]", - (STR_PXI PNRAny:$Pt, GPR64sp:$Rn, simm9:$imm9), 0>; + (STR_PXI PNRasPPRAny:$Pt, GPR64sp:$Rn, simm9:$imm9), 0>; def : InstAlias<"str $Pt, [$Rn]", - (STR_PXI PNRAny:$Pt, GPR64sp:$Rn, 0), 0>; + (STR_PXI PNRasPPRAny:$Pt, GPR64sp:$Rn, 0), 0>; def : InstAlias<"mov $Pd, $Pn", - (ORR_PPzPP PNR8:$Pd, PNR8:$Pn, PNR8:$Pn, PNR8:$Pn), 0>; + (ORR_PPzPP PNRasPPR8:$Pd, PNRasPPR8:$Pn, PNRasPPR8:$Pn, PNRasPPR8:$Pn), 0>; -def : InstAlias<"pfalse\t$Pd", (PFALSE PNR8:$Pd), 0>; +def : InstAlias<"pfalse\t$Pd", (PFALSE PNRasPPR8:$Pd), 0>; -} // End HasSVE2p1_or_HasSME2 +} //===----------------------------------------------------------------------===// // SVE2.1 non-widening BFloat16 to BFloat16 instructions diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 559879139758ba6e7fc6cfcf0b5e6498c55e40a2..d2c12e6dfdc7307e9c58ddb18e24115a0fcf87c3 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -446,8 +446,6 @@ AArch64TargetMachine::getSubtargetImpl(const Function
&F) const { assert((!StreamingSVEMode || I->hasSME()) && "Expected SME to be available"); - assert((!StreamingCompatibleSVEMode || I->hasSVEorSME()) && - "Expected SVE or SME to be available"); return I.get(); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index c60234fd85b51cb3e6d855693b7d6ea63fdf20c9..029b0931eb95039f03f16a89b01290a621e60ab4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -190,16 +190,55 @@ static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode( static cl::opt<bool> EnableScalableAutovecInStreamingMode( "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden); +static bool isSMEABIRoutineCall(const CallInst &CI) { + const auto *F = CI.getCalledFunction(); + return F && StringSwitch<bool>(F->getName()) + .Case("__arm_sme_state", true) + .Case("__arm_tpidr2_save", true) + .Case("__arm_tpidr2_restore", true) + .Case("__arm_za_disable", true) + .Default(false); +} + +/// Returns true if the function has explicit operations that can only be +/// lowered using incompatible instructions for the selected mode. This also +/// returns true if the function F may use or modify ZA state. +static bool hasPossibleIncompatibleOps(const Function *F) { + for (const BasicBlock &BB : *F) { + for (const Instruction &I : BB) { + // Be conservative for now and assume that any call to inline asm or to + // intrinsics could result in non-streaming ops (e.g. calls to + // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that + // all native LLVM instructions can be lowered to compatible instructions. + if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() && + (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) || + isSMEABIRoutineCall(cast<CallInst>(I)))) + return true; + } + } + return false; +} + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { - SMEAttrs CallerAttrs(*Caller); - SMEAttrs CalleeAttrs(*Callee); - if (CallerAttrs.requiresSMChange(CalleeAttrs, - /*BodyOverridesInterface=*/true) || - CallerAttrs.requiresLazySave(CalleeAttrs) || - CalleeAttrs.hasNewZAInterface()) + SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); + + // When inlining, we should consider the body of the function, not the + // interface.
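+ // Inlining replaces the call with the callee's body, so a callee that is streaming only in its body must be modelled below as streaming-enabled rather than streaming-compatible before the lazy-save and mode-change checks run.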
+ if (CalleeAttrs.hasStreamingBody()) { + CalleeAttrs.set(SMEAttrs::SM_Compatible, false); + CalleeAttrs.set(SMEAttrs::SM_Enabled, true); + } + + if (CalleeAttrs.isNewZA()) return false; + if (CallerAttrs.requiresLazySave(CalleeAttrs) || + CallerAttrs.requiresSMChange(CalleeAttrs)) { + if (hasPossibleIncompatibleOps(Callee)) + return false; + } + const TargetMachine &TM = getTLI()->getTargetMachine(); const FeatureBitset &CallerBits = diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index f4b731db05b6e20b2213f5e8311177aa729cb991..139ad21f6ad3f826526a3d02993c14556e63011f 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -1229,6 +1229,8 @@ public: case AArch64::PPRRegClassID: case AArch64::PPR_3bRegClassID: case AArch64::PPR_p8to15RegClassID: + case AArch64::PNRRegClassID: + case AArch64::PNR_p8to15RegClassID: RK = RegKind::SVEPredicateAsCounter; break; default: @@ -1249,6 +1251,9 @@ public: break; case AArch64::PPRRegClassID: case AArch64::PPR_3bRegClassID: + case AArch64::PPR_p8to15RegClassID: + case AArch64::PNRRegClassID: + case AArch64::PNR_p8to15RegClassID: RK = RegKind::SVEPredicateVector; break; default: @@ -1733,6 +1738,12 @@ public: Inst.addOperand(MCOperand::createReg(AArch64::Z0 + getReg() - Base)); } + void addPNRasPPRRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand( + MCOperand::createReg((getReg() - AArch64::PN0) + AArch64::P0)); + } + void addVectorReg64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); assert( @@ -2697,22 +2708,22 @@ static unsigned matchSVEPredicateVectorRegName(StringRef Name) { static unsigned matchSVEPredicateAsCounterRegName(StringRef Name) { return StringSwitch<unsigned>(Name.lower()) - .Case("pn0", AArch64::P0) - .Case("pn1", AArch64::P1) - .Case("pn2", AArch64::P2) - .Case("pn3", AArch64::P3) - .Case("pn4", AArch64::P4) - .Case("pn5", AArch64::P5) - .Case("pn6", AArch64::P6) - .Case("pn7", AArch64::P7) - .Case("pn8", AArch64::P8) - .Case("pn9", AArch64::P9) - .Case("pn10", AArch64::P10) - .Case("pn11", AArch64::P11) - .Case("pn12", AArch64::P12) - .Case("pn13", AArch64::P13) - .Case("pn14", AArch64::P14) - .Case("pn15", AArch64::P15) + .Case("pn0", AArch64::PN0) + .Case("pn1", AArch64::PN1) + .Case("pn2", AArch64::PN2) + .Case("pn3", AArch64::PN3) + .Case("pn4", AArch64::PN4) + .Case("pn5", AArch64::PN5) + .Case("pn6", AArch64::PN6) + .Case("pn7", AArch64::PN7) + .Case("pn8", AArch64::PN8) + .Case("pn9", AArch64::PN9) + .Case("pn10", AArch64::PN10) + .Case("pn11", AArch64::PN11) + .Case("pn12", AArch64::PN12) + .Case("pn13", AArch64::PN13) + .Case("pn14", AArch64::PN14) + .Case("pn15", AArch64::PN15) .Default(0); } @@ -4525,6 +4536,8 @@ ParseStatus AArch64AsmParser::tryParseZTOperand(OperandVector &Operands) { // Check if register is followed by an index if (parseOptionalToken(AsmToken::LBrac)) { + Operands.push_back( + AArch64Operand::CreateToken("[", getLoc(), getContext())); const MCExpr *ImmVal; if (getParser().parseExpression(ImmVal)) return ParseStatus::NoMatch; @@ -4537,6 +4550,8 @@ ParseStatus AArch64AsmParser::tryParseZTOperand(OperandVector &Operands) { Operands.push_back(AArch64Operand::CreateImm( MCConstantExpr::create(MCE->getValue(), getContext()), StartLoc, getLoc(), getContext())); + Operands.push_back( + AArch64Operand::CreateToken("]", getLoc(), getContext())); } return
ParseStatus::Success; diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index e50ac5c92d50cfa259bd0a2da13b899fc4a381ee..df817f62f99fbf7be5dc834656032523218adcd8 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -140,11 +140,14 @@ DecodeMatrixTileListRegisterClass(MCInst &Inst, unsigned RegMask, static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder); +static DecodeStatus DecodePNRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder); static DecodeStatus -DecodePPR_p8to15RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, +DecodePNR_p8to15RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder); static DecodeStatus DecodePPR2RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -736,6 +739,18 @@ static DecodeStatus DecodePPRRegisterClass(MCInst &Inst, unsigned RegNo, return Success; } +static DecodeStatus DecodePNRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const MCDisassembler *Decoder) { + if (RegNo > 15) + return Fail; + + unsigned Register = + AArch64MCRegisterClasses[AArch64::PNRRegClassID].getRegister(RegNo); + Inst.addOperand(MCOperand::createReg(Register)); + return Success; +} + static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, const MCDisassembler *Decoder) { @@ -747,13 +762,13 @@ static DecodeStatus DecodePPR_3bRegisterClass(MCInst &Inst, unsigned RegNo, } static DecodeStatus -DecodePPR_p8to15RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, +DecodePNR_p8to15RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, const MCDisassembler *Decoder) { if (RegNo > 7) return Fail; // Just reuse the PPR decode table - return DecodePPRRegisterClass(Inst, RegNo + 8, Addr, Decoder); + return DecodePNRRegisterClass(Inst, RegNo + 8, Addr, Decoder); } static DecodeStatus DecodePPR2RegisterClass(MCInst &Inst, unsigned RegNo, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index c56e3373d3a7fc7fe799a53e4716ef8becb1a884..4cf7e0bd7be4650bca4e224f56ac985adb474b98 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -532,8 +532,8 @@ bool AArch64CallLowering::fallBackToDAGISel(const MachineFunction &MF) const { } SMEAttrs Attrs(F); - if (Attrs.hasNewZAInterface() || - (!Attrs.hasStreamingInterface() && Attrs.hasStreamingBody())) + if (Attrs.hasZAState() || Attrs.hasStreamingInterfaceOrBody() || + Attrs.hasStreamingCompatibleInterface()) return true; return false; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index 2983e9a9be92841e503b89cbed5806f14ded8a7c..016e898bebf932497fbef46cd0e9aeb136625a7a 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -1335,9 +1335,10 @@ void AArch64InstPrinter::printPredicateAsCounter(const MCInst *MI, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Reg = MI->getOperand(OpNum).getReg(); + if (Reg < 
AArch64::PN0 || Reg > AArch64::PN15) + llvm_unreachable("Unsupported predicate-as-counter register"); + O << "pn" << Reg - AArch64::PN0; - assert(Reg <= AArch64::P15 && "Unsupported predicate register"); - O << "pn" << (Reg - AArch64::P0); switch (EltSize) { case 0: break; @@ -1743,10 +1744,11 @@ void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum, O << "[" << Scale * MI->getOperand(OpNum).getImm() << "]"; } +template void AArch64InstPrinter::printMatrixIndex(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { - O << MI->getOperand(OpNum).getImm(); + O << Scale * MI->getOperand(OpNum).getImm(); } void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, uint64_t Address, diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h index fcaa57402bc2df7760d9d394b5c49e4e66a4ced9..218ddfd918ec7cef8eac27a52173e9f51d809f58 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h @@ -172,6 +172,7 @@ protected: template void printVectorIndex(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); + template void printMatrixIndex(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); void printAdrAdrpLabel(const MCInst *MI, uint64_t Address, unsigned OpNum, @@ -191,7 +192,6 @@ protected: template void printPredicateAsCounter(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); - template void printComplexRotationOp(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 2dbbab13e8f3b9a0b537412a178bcb778854e3a0..d8070e21908a3075d791bddd3b3ef35c245c0640 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -192,6 +192,9 @@ public: uint32_t EncodePPR_p8to15(const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + uint32_t EncodePNR_p8to15(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; uint32_t EncodeZPR2StridedRegisterClass(const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, @@ -551,6 +554,14 @@ AArch64MCCodeEmitter::EncodePPR_p8to15(const MCInst &MI, unsigned OpIdx, return RegOpnd - AArch64::P8; } +uint32_t +AArch64MCCodeEmitter::EncodePNR_p8to15(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + auto RegOpnd = MI.getOperand(OpIdx).getReg(); + return RegOpnd - AArch64::PN8; +} + uint32_t AArch64MCCodeEmitter::EncodeZPR2StridedRegisterClass( const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp index 83010017c761fdfda385b7a5dbaf8ce4d8439003..8ed33602a96f7b58ee479e32f7b983054dfd9370 100644 --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -41,7 +41,8 @@ struct SMEABI : public FunctionPass { bool runOnFunction(Function &F) override; private: - bool updateNewZAFunctions(Module *M, Function *F, IRBuilder<> &Builder); + bool updateNewStateFunctions(Module *M, Function *F, IRBuilder<> &Builder, + SMEAttrs FnAttrs); }; } // end anonymous namespace @@ -60,10 +61,8 @@ 
FunctionPass *llvm::createSMEABIPass() { return new SMEABI(); } void emitTPIDR2Save(Module *M, IRBuilder<> &Builder) { auto *TPIDR2SaveTy = FunctionType::get(Builder.getVoidTy(), {}, /*IsVarArgs=*/false); - auto Attrs = - AttributeList() - .addFnAttribute(M->getContext(), "aarch64_pstate_sm_compatible") - .addFnAttribute(M->getContext(), "aarch64_pstate_za_preserved"); + auto Attrs = AttributeList().addFnAttribute(M->getContext(), + "aarch64_pstate_sm_compatible"); FunctionCallee Callee = M->getOrInsertFunction("__arm_tpidr2_save", TPIDR2SaveTy, Attrs); CallInst *Call = Builder.CreateCall(Callee); @@ -77,50 +76,80 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder) { Builder.getInt64(0)); } -/// This function generates code to commit a lazy save at the beginning of a -/// function marked with `aarch64_pstate_za_new`. If the value read from -/// TPIDR2_EL0 is not null on entry to the function then the lazy-saving scheme -/// is active and we should call __arm_tpidr2_save to commit the lazy save. -/// Additionally, PSTATE.ZA should be enabled at the beginning of the function -/// and disabled before returning. -bool SMEABI::updateNewZAFunctions(Module *M, Function *F, - IRBuilder<> &Builder) { +/// This function generates code at the beginning and end of a function marked +/// with either `aarch64_new_za` or `aarch64_new_zt0`. +/// At the beginning of the function, the following code is generated: +/// - Commit lazy-save if active [Private-ZA Interface*] +/// - Enable PSTATE.ZA [Private-ZA Interface] +/// - Zero ZA [Has New ZA State] +/// - Zero ZT0 [Has New ZT0 State] +/// +/// * A function with new ZT0 state will not change ZA, so committing the +/// lazy-save is not strictly necessary. However, the lazy-save mechanism +/// may be active on entry to the function, with PSTATE.ZA set to 1. If +/// the new ZT0 function calls a function that does not share ZT0, we will +/// need to conditionally SMSTOP ZA before the call, setting PSTATE.ZA to 0. +/// For this reason, it's easier to always commit the lazy-save at the +/// beginning of the function regardless of whether it has ZA state. +/// +/// At the end of the function, PSTATE.ZA is disabled if the function has a +/// Private-ZA Interface. A function is considered to have a Private-ZA +/// interface if it does not share ZA or ZT0. +/// +bool SMEABI::updateNewStateFunctions(Module *M, Function *F, + IRBuilder<> &Builder, SMEAttrs FnAttrs) { LLVMContext &Context = F->getContext(); BasicBlock *OrigBB = &F->getEntryBlock(); - - // Create the new blocks for reading TPIDR2_EL0 & enabling ZA state. - auto *SaveBB = OrigBB->splitBasicBlock(OrigBB->begin(), "save.za", true); - auto *PreludeBB = BasicBlock::Create(Context, "prelude", F, SaveBB); - - // Read TPIDR2_EL0 in PreludeBB & branch to SaveBB if not 0. - Builder.SetInsertPoint(PreludeBB); - Function *TPIDR2Intr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_get_tpidr2); - auto *TPIDR2 = Builder.CreateCall(TPIDR2Intr->getFunctionType(), TPIDR2Intr, - {}, "tpidr2"); - auto *Cmp = - Builder.CreateCmp(ICmpInst::ICMP_NE, TPIDR2, Builder.getInt64(0), "cmp"); - Builder.CreateCondBr(Cmp, SaveBB, OrigBB); - - // Create a call __arm_tpidr2_save, which commits the lazy save. - Builder.SetInsertPoint(&SaveBB->back()); - emitTPIDR2Save(M, Builder); - - // Enable pstate.za at the start of the function. 
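[Editor's note: illustrative sketch, not part of the patch. The prelude described in the comment block above is hard to visualize from diff fragments; the following self-contained C++ mirrors what the pass emits to commit an active lazy save, assuming the same intrinsics and runtime routine named in the patch.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Rough equivalent of the entry-block rewrite performed by the pass:
//   prelude:  %tpidr2 = call i64 @llvm.aarch64.sme.get.tpidr2()
//             %cmp    = icmp ne i64 %tpidr2, 0
//             br i1 %cmp, label %save.za, label %entry
//   save.za:  call void @__arm_tpidr2_save()   ; commit the lazy save
//             br label %entry
static void emitLazySavePrelude(Module *M, Function *F, IRBuilder<> &Builder) {
  BasicBlock *OrigBB = &F->getEntryBlock();
  // Split before the first instruction; SaveBB ends up holding only an
  // unconditional branch to the original entry code.
  BasicBlock *SaveBB =
      OrigBB->splitBasicBlock(OrigBB->begin(), "save.za", /*Before=*/true);
  BasicBlock *PreludeBB =
      BasicBlock::Create(F->getContext(), "prelude", F, SaveBB);
  Builder.SetInsertPoint(PreludeBB);
  Value *TPIDR2 = Builder.CreateCall(
      Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_get_tpidr2), {},
      "tpidr2");
  Value *Cmp = Builder.CreateICmpNE(TPIDR2, Builder.getInt64(0), "cmp");
  Builder.CreateCondBr(Cmp, SaveBB, OrigBB);
  // Commit the lazy save ahead of SaveBB's fallthrough branch.
  Builder.SetInsertPoint(&SaveBB->back());
  FunctionCallee Save = M->getOrInsertFunction(
      "__arm_tpidr2_save", FunctionType::get(Builder.getVoidTy(), false));
  Builder.CreateCall(Save);
}

The removed code below shows the old unconditional variant of the same scheme; the patch simply gates it on hasPrivateZAInterface() and adds the ZA/ZT0 zeroing steps.]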
Builder.SetInsertPoint(&OrigBB->front()); - Function *EnableZAIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_enable); - Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr); - - // Before returning, disable pstate.za - for (BasicBlock &BB : *F) { - Instruction *T = BB.getTerminator(); - if (!T || !isa(T)) - continue; - Builder.SetInsertPoint(T); - Function *DisableZAIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_disable); - Builder.CreateCall(DisableZAIntr->getFunctionType(), DisableZAIntr); + + // Commit any active lazy-saves if this is a Private-ZA function. If the + // value read from TPIDR2_EL0 is not null on entry to the function then + // the lazy-saving scheme is active and we should call __arm_tpidr2_save + // to commit the lazy save. + if (FnAttrs.hasPrivateZAInterface()) { + // Create the new blocks for reading TPIDR2_EL0 & enabling ZA state. + auto *SaveBB = OrigBB->splitBasicBlock(OrigBB->begin(), "save.za", true); + auto *PreludeBB = BasicBlock::Create(Context, "prelude", F, SaveBB); + + // Read TPIDR2_EL0 in PreludeBB & branch to SaveBB if not 0. + Builder.SetInsertPoint(PreludeBB); + Function *TPIDR2Intr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_get_tpidr2); + auto *TPIDR2 = Builder.CreateCall(TPIDR2Intr->getFunctionType(), TPIDR2Intr, + {}, "tpidr2"); + auto *Cmp = Builder.CreateCmp(ICmpInst::ICMP_NE, TPIDR2, + Builder.getInt64(0), "cmp"); + Builder.CreateCondBr(Cmp, SaveBB, OrigBB); + + // Create a call __arm_tpidr2_save, which commits the lazy save. + Builder.SetInsertPoint(&SaveBB->back()); + emitTPIDR2Save(M, Builder); + + // Enable pstate.za at the start of the function. + Builder.SetInsertPoint(&OrigBB->front()); + Function *EnableZAIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_enable); + Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr); + } + + if (FnAttrs.isNewZA()) { + Function *ZeroIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero); + Builder.CreateCall(ZeroIntr->getFunctionType(), ZeroIntr, + Builder.getInt32(0xff)); + } + + if (FnAttrs.hasPrivateZAInterface()) { + // Before returning, disable pstate.za + for (BasicBlock &BB : *F) { + Instruction *T = BB.getTerminator(); + if (!T || !isa(T)) + continue; + Builder.SetInsertPoint(T); + Function *DisableZAIntr = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_disable); + Builder.CreateCall(DisableZAIntr->getFunctionType(), DisableZAIntr); + } } F->addFnAttr("aarch64_expanded_pstate_za"); @@ -137,8 +166,8 @@ bool SMEABI::runOnFunction(Function &F) { bool Changed = false; SMEAttrs FnAttrs(F); - if (FnAttrs.hasNewZAInterface()) - Changed |= updateNewZAFunctions(M, &F, Builder); + if (FnAttrs.isNewZA()) + Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs); return Changed; } diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 6e3aadd5dd8cc7be1aebee00bd45b0e33da4ebd1..835e74fdbc6414ad8425dc296bb16c7eb6fb7d5d 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -33,6 +33,12 @@ def tileslicerange0s4 : ComplexPattern", []>; def am_sme_indexed_b4 :ComplexPattern", [], [SDNPWantRoot]>; +def SDTZALoadStore : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; +def AArch64SMELdr : SDNode<"AArch64ISD::SME_ZA_LDR", SDTZALoadStore, + [SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>; +def AArch64SMEStr : SDNode<"AArch64ISD::SME_ZA_STR", SDTZALoadStore, + [SDNPHasChain, SDNPSideEffect, 
SDNPMayStore]>; + //===----------------------------------------------------------------------===// // SME Pseudo Classes //===----------------------------------------------------------------------===// @@ -190,11 +196,48 @@ class SME_ZA_Tile_TwoPred_TwoVec_Pat(name # _PSEUDO) $tile, $Pn, $Pm, $Zn, $Zm)>; + +//===----------------------------------------------------------------------===// +// SME smstart/smstop +//===----------------------------------------------------------------------===// + +// SME defines three pstate fields to set or clear PSTATE.SM, PSTATE.ZA, or +// both fields: +// +// MSR SVCRSM, # +// MSR SVCRZA, # +// MSR SVCRSMZA, # +// +// It's tricky to using the existing pstate operand defined in +// AArch64SystemOperands.td since it only encodes 5 bits including op1;op2, +// when these fields are also encoded in CRm[3:1]. +def MSRpstatesvcrImm1 + : PstateWriteSimple<(ins svcr_op:$pstatefield, timm0_1:$imm), "msr", + "\t$pstatefield, $imm">, + Sched<[WriteSys]> { + bits<3> pstatefield; + bit imm; + let Inst{18-16} = 0b011; // op1 + let Inst{11-9} = pstatefield; + let Inst{8} = imm; + let Inst{7-5} = 0b011; // op2 + let hasPostISelHook = 1; +} + +def : InstAlias<"smstart", (MSRpstatesvcrImm1 0b011, 0b1)>; +def : InstAlias<"smstart sm", (MSRpstatesvcrImm1 0b001, 0b1)>; +def : InstAlias<"smstart za", (MSRpstatesvcrImm1 0b010, 0b1)>; + +def : InstAlias<"smstop", (MSRpstatesvcrImm1 0b011, 0b0)>; +def : InstAlias<"smstop sm", (MSRpstatesvcrImm1 0b001, 0b0)>; +def : InstAlias<"smstop za", (MSRpstatesvcrImm1 0b010, 0b0)>; + + //===----------------------------------------------------------------------===// // SME Outer Products //===----------------------------------------------------------------------===// -class sme_fp_outer_product_inst sz, bit op, MatrixTileOperand za_ty, +class sme_fp_outer_product_inst sz, bits<2> op, MatrixTileOperand za_ty, ZPRRegOp zpr_ty, string mnemonic> : I<(outs za_ty:$ZAda), (ins za_ty:$_ZAda, PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm), @@ -206,7 +249,7 @@ class sme_fp_outer_product_inst sz, bit op, MatrixTileOperand za_ bits<3> Pn; bits<5> Zn; let Inst{31-25} = 0b1000000; - let Inst{24} = op; + let Inst{24} = op{1}; let Inst{23} = 0b1; let Inst{22-21} = sz; let Inst{20-16} = Zm; @@ -214,25 +257,25 @@ class sme_fp_outer_product_inst sz, bit op, MatrixTileOperand za_ let Inst{12-10} = Pn; let Inst{9-5} = Zn; let Inst{4} = S; - let Inst{3} = op; + let Inst{3} = op{0}; let Constraints = "$ZAda = $_ZAda"; } -multiclass sme_outer_product_fp32 { - def NAME : sme_fp_outer_product_inst, SMEPseudo2Instr { +multiclass sme_outer_product_fp32 sz, ZPRRegOp zpr_ty, string mnemonic, SDPatternOperator op> { + def NAME : sme_fp_outer_product_inst, SMEPseudo2Instr { bits<2> ZAda; let Inst{1-0} = ZAda; let Inst{2} = 0b0; } - def NAME # _PSEUDO : sme_outer_product_pseudo, SMEPseudo2Instr; + def NAME # _PSEUDO : sme_outer_product_pseudo, SMEPseudo2Instr; def : SME_ZA_Tile_TwoPred_TwoVec_Pat; } multiclass sme_outer_product_fp64 { - def NAME : sme_fp_outer_product_inst, SMEPseudo2Instr { + def NAME : sme_fp_outer_product_inst, SMEPseudo2Instr { bits<3> ZAda; let Inst{2-0} = ZAda; } @@ -242,8 +285,8 @@ multiclass sme_outer_product_fp64 def : SME_ZA_Tile_TwoPred_TwoVec_Pat; } -multiclass sme2p1_fmop_tile_fp16{ - def NAME : sme_fp_outer_product_inst { +multiclass sme2p1_fmop_tile_fp16 op, ZPRRegOp zpr_ty>{ + def NAME : sme_fp_outer_product_inst { bits<1> ZAda; let Inst{2-1} = 0b00; let Inst{0} = ZAda; @@ -743,23 +786,23 @@ class sme_spill_inst : sme_spill_fill_base<0b1, 
(outs), (ins MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, - imm0_15:$offset), + imm32_0_15:$offset), opcodestr>; let mayLoad = 1 in class sme_fill_inst : sme_spill_fill_base<0b0, (outs MatrixOp:$ZAt), (ins MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, - imm0_15:$offset), + imm32_0_15:$offset), opcodestr>; multiclass sme_spill { def NAME : sme_spill_inst; def : InstAlias(NAME) MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; - // base - def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base), - (!cast(NAME) ZA, $idx, 0, $base, 0)>; + + def : Pat<(AArch64SMEStr (i32 MatrixIndexGPR32Op12_15:$slice), (i64 GPR64sp:$base), (i32 sme_elm_idx0_15:$imm)), + (!cast(NAME) ZA, MatrixIndexGPR32Op12_15:$slice, sme_elm_idx0_15:$imm, GPR64sp:$base, imm32_0_15:$imm)>; } multiclass sme_fill { @@ -769,16 +812,15 @@ multiclass sme_fill { MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; def NAME # _PSEUDO : Pseudo<(outs), - (ins MatrixIndexGPR32Op12_15:$idx, imm0_15:$imm4, + (ins MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm4, GPR64sp:$base), []>, Sched<[]> { // Translated to actual instruction in AArch64ISelLowering.cpp let usesCustomInserter = 1; let mayLoad = 1; } - // base - def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base), - (!cast(NAME # _PSEUDO) $idx, 0, $base)>; + def : Pat<(AArch64SMELdr MatrixIndexGPR32Op12_15:$slice, GPR64sp:$base, sme_elm_idx0_15:$imm), + (!cast(NAME # _PSEUDO) MatrixIndexGPR32Op12_15:$slice, sme_elm_idx0_15:$imm, GPR64sp:$base)>; } //===----------------------------------------------------------------------===// @@ -1297,17 +1339,17 @@ multiclass sve2_int_perm_sel_p { } def : InstAlias(NAME # _B) PNRAny:$Pd, - PNRAny:$Pn, PPR8:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm), 0>; + (!cast(NAME # _B) PNRasPPRAny:$Pd, + PNRasPPRAny:$Pn, PPR8:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm), 0>; def : InstAlias(NAME # _H) PNRAny:$Pd, - PNRAny:$Pn, PPR16:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_7:$imm), 0>; + (!cast(NAME # _H) PNRasPPRAny:$Pd, + PNRasPPRAny:$Pn, PPR16:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_7:$imm), 0>; def : InstAlias(NAME # _S) PNRAny:$Pd, - PNRAny:$Pn, PPR32:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_3:$imm), 0>; + (!cast(NAME # _S) PNRasPPRAny:$Pd, + PNRasPPRAny:$Pn, PPR32:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_3:$imm), 0>; def : InstAlias(NAME # _D) PNRAny:$Pd, - PNRAny:$Pn, PPR64:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_1:$imm), 0>; + (!cast(NAME # _D) PNRasPPRAny:$Pd, + PNRasPPRAny:$Pn, PPR64:$Pm, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_1:$imm), 0>; def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPR8:$Pm), MatrixIndexGPR32Op12_15:$idx)), @@ -1413,7 +1455,7 @@ multiclass sme2_dot_mla_add_sub_array_vg4_single op, //===----------------------------------------------------------------------===// // SME2 multiple vectors ternary INT/FP two and four registers -class sme2_dot_mla_add_sub_array_vg2_multi op, +class sme2_dot_mla_add_sub_array_vg2_multi op, MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, string mnemonic> @@ -1427,20 +1469,19 @@ class sme2_dot_mla_add_sub_array_vg2_multi op, bits<2> Rv; bits<3> imm3; let Inst{31-23} = 0b110000011; - let Inst{22} = op{5}; //sz + let Inst{22} = op{6}; //sz let Inst{21} = 0b1; let Inst{20-17} = Zm; let Inst{16-15} = 0b00; let Inst{14-13} = Rv; - let Inst{12-10} = op{4-2}; + let Inst{12-10} = 
op{5-3}; let Inst{9-6} = Zn; - let Inst{5} = 0b0; - let Inst{4-3} = op{1-0}; + let Inst{5-3} = op{2-0}; let Inst{2-0} = imm3; let Constraints = "$ZAd = $_ZAd"; } -multiclass sme2_dot_mla_add_sub_array_vg2_multi op, +multiclass sme2_dot_mla_add_sub_array_vg2_multi op, MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, ValueType zpr_ty, SDPatternOperator intrinsic> { @@ -1454,7 +1495,7 @@ multiclass sme2_dot_mla_add_sub_array_vg2_multi op, (!cast(NAME) matrix_ty:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, multi_vector_ty:$Zm), 0>; } -class sme2_dot_mla_add_sub_array_vg4_multi op, +class sme2_dot_mla_add_sub_array_vg4_multi op, MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, string mnemonic> @@ -1468,20 +1509,20 @@ class sme2_dot_mla_add_sub_array_vg4_multi op, bits<2> Rv; bits<3> imm3; let Inst{31-23} = 0b110000011; - let Inst{22} = op{5}; //sz + let Inst{22} = op{6}; //sz let Inst{21} = 0b1; let Inst{20-18} = Zm; let Inst{17-15} = 0b010; let Inst{14-13} = Rv; - let Inst{12-10} = op{4-2}; + let Inst{12-10} = op{5-3}; let Inst{9-7} = Zn; - let Inst{6-5} = 0b00; - let Inst{4-3} = op{1-0}; + let Inst{6} = 0b0; + let Inst{5-3} = op{2-0}; let Inst{2-0} = imm3; let Constraints = "$ZAd = $_ZAd"; } -multiclass sme2_dot_mla_add_sub_array_vg4_multi op, +multiclass sme2_dot_mla_add_sub_array_vg4_multi op, MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, ValueType zpr_ty, SDPatternOperator intrinsic>{ @@ -1758,8 +1799,8 @@ class sme2_mla_long_array_index_base op0, bits<2> op, Operand index_ty, } multiclass sme2_mla_long_array_index op0, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_index_base, SMEPseudo2Instr { + def _HtoS : sme2_mla_long_array_index_base, SMEPseudo2Instr { bits<3> i3; bits<5> Zn; bits<3> imm; @@ -1769,9 +1810,9 @@ multiclass sme2_mla_long_array_index op0, bits<2> op, V let Inst{2-0} = imm; } - def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_index_pseudo; - def : SME2_ZA_TwoOp_Multi_Index_Pat; + def : SME2_ZA_TwoOp_Multi_Index_Pat; } class sme2_mla_long_array_vg2_index op0, bits<2> op> @@ -1789,14 +1830,14 @@ class sme2_mla_long_array_vg2_index op0, bits<2> op> } multiclass sme2_fp_mla_long_array_vg2_index op, ValueType zpr_ty, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg2_index, SMEPseudo2Instr; + def _HtoS : sme2_mla_long_array_vg2_index, SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_index_pseudo; - def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat; + def : SME2_ZA_TwoOp_VG2_Multi_Index_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>; + (!cast(NAME #_HtoS) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>; } multiclass sme2_int_mla_long_array_vg2_index op, SDPatternOperator intrinsic> { @@ -1825,33 +1866,35 @@ class sme2_mla_long_array_vg4_index op0, bits<2> op> } multiclass sme2_fp_mla_long_array_vg4_index op, ValueType zpr_ty, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg4_index, SMEPseudo2Instr; + def _HtoS : sme2_mla_long_array_vg4_index, SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_index_pseudo; - def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat; + def : 
SME2_ZA_TwoOp_VG4_Multi_Index_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>; + (!cast(NAME #_HtoS) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>; } multiclass sme2_int_mla_long_array_vg4_index op, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg4_index, SMEPseudo2Instr; + def _HtoS : sme2_mla_long_array_vg4_index, SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_index_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_index_pseudo; - def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat; + def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>; + (!cast(NAME #_HtoS) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH32b_timm:$i3), 0>; } -class sme2_mla_long_arrayop0, bits<2> op, Operand index_ty, +class sme2_mla_long_arrayop0, bits<2> op, + MatrixOperand matrix_ty, + Operand index_ty, RegisterOperand first_vector_ty, RegisterOperand second_vector_ty, string mnemonic, string vg_acronym=""> - : I<(outs MatrixOp32:$ZAda), - (ins MatrixOp32:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, + : I<(outs matrix_ty:$ZAda), + (ins matrix_ty:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm, first_vector_ty:$Zn, second_vector_ty:$Zm), mnemonic,"\t$ZAda[$Rv, $imm" # !if(!eq(vg_acronym, ""), "", ", " # vg_acronym) # "], $Zn, $Zm", "", []> , Sched<[]> { @@ -1869,8 +1912,8 @@ class sme2_mla_long_arrayop0, bits<2> op, Operand index_ty, } multiclass sme2_mla_long_array_single op0, bits<2> op, ValueType zpr_ty, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array , SMEPseudo2Instr{ + def _HtoS : sme2_mla_long_array , SMEPseudo2Instr{ bits<4> Zm; bits<5> Zn; bits<3> imm; @@ -1880,15 +1923,15 @@ multiclass sme2_mla_long_array_single op0, bits<2> op, let Inst{2-0} = imm; } - def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_single_pseudo; - def : SME2_ZA_TwoOp_Multi_Single_Pat; + def : SME2_ZA_TwoOp_Multi_Single_Pat; } -class sme2_mla_long_array_vg24_single op0, bit vg4, bits<2> op, - RegisterOperand first_vector_ty, - string mnemonic, string vg_acronym> - : sme2_mla_long_array op0, bit vg4, bits<2> op, bit o2, + MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, + ZPRRegOp zpr_ty, string mnemonic, string vg_acronym> + : sme2_mla_long_array { bits<4> Zm; bits<5> Zn; @@ -1896,96 +1939,117 @@ class sme2_mla_long_array_vg24_single op0, bit vg4, bits<2> op, let Inst{20} = vg4; let Inst{19-16} = Zm; let Inst{9-5} = Zn; - let Inst{2} = 0b0; + let Inst{2} = o2; let Inst{1-0} = imm; } -multiclass sme2_fp_mla_long_array_vg2_single op, ValueType zpr_ty, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg24_single<0b00, 0b0, op, ZZ_h, mnemonic, - "vgx2">, SMEPseudo2Instr; + +multiclass sme2_fp_mla_long_array_vg2_single op, MatrixOperand matrix_ty, + RegisterOperand multi_vector_ty, ZPRRegOp vector_ty, + ValueType zpr_ty, SDPatternOperator intrinsic> { + def NAME : sme2_mla_long_array_vg24_single<0b00, 0b0, op{2-1}, op{0}, matrix_ty, multi_vector_ty, + vector_ty, mnemonic, "vgx2">, SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo; + def _PSEUDO : sme2_za_array_2op_multi_single_pseudo; - def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat; + def : 
SME2_ZA_TwoOp_VG2_Multi_Single_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h:$Zn, ZPR4b16:$Zm), 0>; + (!cast(NAME) matrix_ty:$ZAda, MatrixIndexGPR32Op8_11:$Rv, + uimm2s2range:$imm, multi_vector_ty:$Zn, vector_ty:$Zm), 0>; } multiclass sme2_int_mla_long_array_vg2_single op, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg24_single<0b01, 0b0, op, ZZ_h, mnemonic, - "vgx2">, SMEPseudo2Instr; + def _HtoS : sme2_mla_long_array_vg24_single<0b01, 0b0, op, 0b0, MatrixOp32, ZZ_h, ZPR4b16, mnemonic, + "vgx2">, SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_single_pseudo; - def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat; + def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h:$Zn, ZPR4b16:$Zm), 0>; + (!cast(NAME #_HtoS) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h:$Zn, ZPR4b16:$Zm), 0>; } -multiclass sme2_fp_mla_long_array_vg4_single op, ValueType zpr_ty, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg24_single<0b00, 0b1, op, ZZZZ_h, mnemonic, - "vgx4">, SMEPseudo2Instr; +multiclass sme2_fp_mla_long_array_vg4_single op, MatrixOperand matrix_ty, + RegisterOperand multi_vector_ty, ZPRRegOp vector_ty, + ValueType zpr_ty, SDPatternOperator intrinsic> { + def NAME : sme2_mla_long_array_vg24_single<0b00, 0b1, op{2-1}, op{0}, matrix_ty, multi_vector_ty, + vector_ty, mnemonic, "vgx4">, SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo; + def _PSEUDO : sme2_za_array_2op_multi_single_pseudo; - def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat; + def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h:$Zn, ZPR4b16:$Zm), 0>; + (!cast(NAME) matrix_ty:$ZAda, MatrixIndexGPR32Op8_11:$Rv, + uimm2s2range:$imm, multi_vector_ty:$Zn, vector_ty:$Zm), 0>; } multiclass sme2_int_mla_long_array_vg4_single op, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg24_single<0b01, 0b1, op, ZZZZ_h, mnemonic, - "vgx4">, SMEPseudo2Instr; + def _HtoS : sme2_mla_long_array_vg24_single<0b01, 0b1, op, 0b0, MatrixOp32, ZZZZ_h, ZPR4b16, mnemonic, + "vgx4">, SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_single_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_single_pseudo; - def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat; + def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h:$Zn, ZPR4b16:$Zm), 0>; + (!cast(NAME #_HtoS) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h:$Zn, ZPR4b16:$Zm), 0>; } -class sme2_mla_long_array_vg2_multi op0, bits<2> op> - : sme2_mla_long_array { + +class sme2_mla_long_array_vg2_multi op0, bits<3> op, + MatrixOperand matrix_ty, RegisterOperand multi_vector_ty> + : sme2_mla_long_array { bits<4> Zm; bits<4> Zn; bits<2> imm; let Inst{20-17} = Zm; let Inst{16} = 0b0; let Inst{9-6} = Zn; - let Inst{5} = 0b0; + let Inst{5} = op{2}; // fp8 let Inst{2} = 0b0; let Inst{1-0} = imm; } -multiclass sme2_fp_mla_long_array_vg2_multi op, ValueType zpr_ty, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg2_multi, SMEPseudo2Instr; +multiclass sme2_fp_mla_long_array_vg2_multi op, MatrixOperand matrix_ty, + RegisterOperand multi_vector_ty, + ValueType zpr_ty, SDPatternOperator intrinsic> { + + def NAME : sme2_mla_long_array_vg2_multi, + 
SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_multi_pseudo; + def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo; - def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat; + def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZ_h_mul_r:$Zn, ZZ_h_mul_r:$Zm), 0>; + (!cast(NAME) matrix_ty:$ZAda, MatrixIndexGPR32Op8_11:$Rv, + uimm2s2range:$imm, multi_vector_ty:$Zn, multi_vector_ty:$Zm), 0>; } multiclass sme2_int_mla_long_array_vg2_multi op, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg2_multi, SMEPseudo2Instr; + def _HtoS : sme2_mla_long_array_vg2_multi, + SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_multi_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_multi_pseudo; - def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat; + def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm2, ZZ_h_mul_r:$Zn, ZZ_h_mul_r:$Zm), 0>; + (!cast(NAME #_HtoS) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm2, ZZ_h_mul_r:$Zn, ZZ_h_mul_r:$Zm), 0>; } -class sme2_mla_long_array_vg4_multi op0, bits<2> op> - : sme2_mla_long_array { +class sme2_mla_long_array_vg4_multi op0, bits<3> op, + MatrixOperand matrix_ty, + RegisterOperand multi_vector_ty> + : sme2_mla_long_array { bits<3> Zm; bits<3> Zn; bits<2> imm; @@ -1993,31 +2057,37 @@ class sme2_mla_long_array_vg4_multi op0, bits<2> op> let Inst{17} = 0b0; let Inst{16} = 0b1; let Inst{9-7} = Zn; - let Inst{6-5} = 0b00; + let Inst{6} = 0b0; + let Inst{5} = op{2}; //fp8 let Inst{2} = 0b0; let Inst{1-0} = imm; } -multiclass sme2_fp_mla_long_array_vg4_multi op, ValueType zpr_ty, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg4_multi, SMEPseudo2Instr; +multiclass sme2_fp_mla_long_array_vg4_multi op, MatrixOperand matrix_ty, + RegisterOperand multi_vector_ty, ValueType zpr_ty, + SDPatternOperator intrinsic> { + def NAME : sme2_mla_long_array_vg4_multi, + SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_multi_pseudo; + def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo; - def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat; + def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm, ZZZZ_h_mul_r:$Zn, ZZZZ_h_mul_r:$Zm), 0>; + (!cast(NAME) matrix_ty:$ZAda, MatrixIndexGPR32Op8_11:$Rv, + uimm2s2range:$imm, multi_vector_ty:$Zn, multi_vector_ty:$Zm), 0>; } multiclass sme2_int_mla_long_array_vg4_multi op, SDPatternOperator intrinsic> { - def _S : sme2_mla_long_array_vg4_multi, SMEPseudo2Instr; + def _HtoS : sme2_mla_long_array_vg4_multi, + SMEPseudo2Instr; - def _S_PSEUDO : sme2_za_array_2op_multi_multi_pseudo; + def _HtoS_PSEUDO : sme2_za_array_2op_multi_multi_pseudo; - def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat; + def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat; def : InstAlias(NAME #_S) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm2, ZZZZ_h_mul_r:$Zn, ZZZZ_h_mul_r:$Zm), 0>; + (!cast(NAME #_HtoS) MatrixOp32:$ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s2range:$imm2, ZZZZ_h_mul_r:$Zn, ZZZZ_h_mul_r:$Zm), 0>; } //===----------------------------------------------------------------------===// @@ -2308,7 +2378,7 @@ multiclass sme2_zip_vector_vg2 { //===----------------------------------------------------------------------===// // SME2 Dot Products and MLA -class sme2_multi_vec_array_vg2_index op, MatrixOperand matrix_ty, +class sme2_multi_vec_array_vg2_index sz, bits<6> op, MatrixOperand matrix_ty, 
RegisterOperand multi_vector_ty, ZPRRegOp vector_ty, Operand index_ty, string mnemonic> @@ -2321,8 +2391,8 @@ class sme2_multi_vec_array_vg2_index op, MatrixOperand matrix_ty bits<2> Rv; bits<4> Zn; bits<3> imm3; - let Inst{31-23} = 0b110000010; - let Inst{22} = sz; + let Inst{31-24} = 0b11000001; + let Inst{23-22} = sz; let Inst{21-20} = 0b01; let Inst{19-16} = Zm; let Inst{15} = 0b0; @@ -2336,11 +2406,11 @@ class sme2_multi_vec_array_vg2_index op, MatrixOperand matrix_ty } // SME2 multi-vec ternary indexed two registers 32-bit -multiclass sme2_multi_vec_array_vg2_index_32b op, +multiclass sme2_multi_vec_array_vg2_index_32b sz, bits<4> op, RegisterOperand multi_vector_ty, ZPRRegOp vector_ty, ValueType vt, SDPatternOperator intrinsic> { - def NAME : sme2_multi_vec_array_vg2_index<0b1, {op{3},?,?,op{2-0}}, MatrixOp32, multi_vector_ty, vector_ty, + def NAME : sme2_multi_vec_array_vg2_index, SMEPseudo2Instr { bits<2> i; let Inst{11-10} = i; @@ -2356,9 +2426,10 @@ multiclass sme2_multi_vec_array_vg2_index_32b op, } // SME2.1 multi-vec ternary indexed two registers 16-bit -multiclass sme2p1_multi_vec_array_vg2_index_16b op> { - def NAME : sme2_multi_vec_array_vg2_index<0b0, {0b1,?,?,op,?}, MatrixOp16, - ZZ_h_mul_r, ZPR4b16, +multiclass sme2p1_multi_vec_array_vg2_index_16b sz, bits<3> op, + RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty> { + def NAME : sme2_multi_vec_array_vg2_index { bits<3> i; let Inst{11-10} = i{2-1}; @@ -2366,7 +2437,7 @@ multiclass sme2p1_multi_vec_array_vg2_index_16b op> { } def : InstAlias(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, - ZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH:$i), 0>; + multi_vector_ty:$Zn, zpr_ty:$Zm, VectorIndexH:$i), 0>; } // SME2 multi-vec ternary indexed two registers 64-bit @@ -2415,7 +2486,7 @@ multiclass sme2_multi_vec_array_vg2_index_64b op, multi_vector_ty:$Zn, vector_ty:$Zm, VectorIndexD32b_timm:$i1), 0>; } -class sme2_multi_vec_array_vg4_index op, MatrixOperand matrix_ty, +class sme2_multi_vec_array_vg4_index op, MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, ZPRRegOp vector_ty, Operand index_ty, string mnemonic> @@ -2434,10 +2505,9 @@ class sme2_multi_vec_array_vg4_index op, MatrixOperand matrix_ty let Inst{19-16} = Zm; let Inst{15} = 0b1; let Inst{14-13} = Rv; - let Inst{12-10} = op{5-3}; + let Inst{12-10} = op{6-4}; let Inst{9-7} = Zn; - let Inst{6} = 0b0; - let Inst{5-3} = op{2-0}; + let Inst{6-3} = op{3-0}; let Inst{2-0} = imm3; let Constraints = "$ZAda = $_ZAda"; @@ -2448,7 +2518,7 @@ multiclass sme2_multi_vec_array_vg4_index_32b op, RegisterOperand multi_vector_ty, ZPRRegOp vector_ty, ValueType vt, SDPatternOperator intrinsic> { - def NAME : sme2_multi_vec_array_vg4_index<0b1, {op{3},?,?,op{2-0}}, MatrixOp32, multi_vector_ty, + def NAME : sme2_multi_vec_array_vg4_index<0b1, {op{3},?,?,0b0, op{2-0}}, MatrixOp32, multi_vector_ty, vector_ty, VectorIndexS32b_timm, mnemonic>, SMEPseudo2Instr { bits<2> i; let Inst{11-10} = i; @@ -2464,9 +2534,11 @@ multiclass sme2_multi_vec_array_vg4_index_32b op, } // SME2.1 multi-vec ternary indexed four registers 16-bit -multiclass sme2p1_multi_vec_array_vg4_index_16b op> { +multiclass sme2p1_multi_vec_array_vg4_index_16b op, + RegisterOperand multi_vector_ty, + ZPRRegOp zpr_ty> { def NAME : sme2_multi_vec_array_vg4_index<0b0,{0b1,?,?,op,?}, MatrixOp16, - ZZZZ_h_mul_r, ZPR4b16, + multi_vector_ty, zpr_ty, VectorIndexH, mnemonic>{ bits<3> i; let Inst{11-10} = i{2-1}; @@ -2475,7 +2547,7 @@ multiclass sme2p1_multi_vec_array_vg4_index_16b op> { def : InstAlias(NAME) 
MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv, - sme_elm_idx0_7:$imm3, ZZZZ_h_mul_r:$Zn, ZPR4b16:$Zm, VectorIndexH:$i), 0>; + sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm, VectorIndexH:$i), 0>; } // SME2 multi-vec ternary indexed four registers 64-bit @@ -2525,7 +2597,7 @@ multiclass sme2_multi_vec_array_vg4_index_64b op, } //===----------------------------------------------------------------------===// // SME2 multi-vec indexed long long MLA one source 32-bit -class sme2_mla_ll_array_index_32b op> +class sme2_mla_ll_array_index_32b sz, bits<3> op> : I<(outs MatrixOp32:$ZAda), (ins MatrixOp32:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm2s4range:$imm2, ZPR8:$Zn, ZPR4b8:$Zm, VectorIndexB32b_timm:$i), mnemonic, "\t$ZAda[$Rv, $imm2], $Zn, $Zm$i", @@ -2535,7 +2607,9 @@ class sme2_mla_ll_array_index_32b op> bits<4> i; bits<5> Zn; bits<2> imm2; - let Inst{31-20} = 0b110000010000; + let Inst{31-24} = 0b11000001; + let Inst{23-22} = sz; + let Inst{21-20} = 0b00; let Inst{19-16} = Zm; let Inst{15} = i{3}; let Inst{14-13} = Rv; @@ -2547,8 +2621,8 @@ class sme2_mla_ll_array_index_32b op> let Constraints = "$ZAda = $_ZAda"; } -multiclass sme2_mla_ll_array_index_32b op, SDPatternOperator intrinsic> { - def NAME : sme2_mla_ll_array_index_32b, SMEPseudo2Instr; +multiclass sme2_mla_ll_array_index_32b sz, bits<3> op, SDPatternOperator intrinsic> { + def NAME : sme2_mla_ll_array_index_32b, SMEPseudo2Instr; def _PSEUDO : sme2_za_array_2op_multi_index_pseudo; @@ -2589,7 +2663,7 @@ multiclass sme2_mla_ll_array_index_64b op, SDPatternOpe def : SME2_ZA_TwoOp_Multi_Index_Pat; } -class sme2_mla_ll_array_vg24_index_32b op, +class sme2_mla_ll_array_vg24_index_32b sz, bit vg4, bits<3> op, RegisterOperand vector_ty, string mnemonic> : I<(outs MatrixOp32:$ZAda), @@ -2601,7 +2675,9 @@ class sme2_mla_ll_array_vg24_index_32b op, bits<2> Rv; bits<4> i; bit imm; - let Inst{31-20} = 0b110000010001; + let Inst{31-24} = 0b11000001; + let Inst{23-22} = sz; + let Inst{21-20} = 0b01; let Inst{19-16} = Zm; let Inst{15} = vg4; let Inst{14-13} = Rv; @@ -2616,8 +2692,8 @@ class sme2_mla_ll_array_vg24_index_32b op, //SME2 multi-vec indexed long long MLA two sources 32-bit -multiclass sme2_mla_ll_array_vg2_index_32b op, SDPatternOperator intrinsic> { - def NAME: sme2_mla_ll_array_vg24_index_32b<0b0, op, ZZ_b_mul_r, mnemonic>, SMEPseudo2Instr { +multiclass sme2_mla_ll_array_vg2_index_32b sz, bits<3> op, SDPatternOperator intrinsic> { + def NAME: sme2_mla_ll_array_vg24_index_32b, SMEPseudo2Instr { bits<4> Zn; let Inst{9-6} = Zn; } @@ -2632,11 +2708,11 @@ multiclass sme2_mla_ll_array_vg2_index_32b op, SDPatter // SME2 multi-vec indexed long long MLA four sources 32-bit -multiclass sme2_mla_ll_array_vg4_index_32b op, SDPatternOperator intrinsic> { - def NAME: sme2_mla_ll_array_vg24_index_32b<0b1, op, ZZZZ_b_mul_r, mnemonic>, SMEPseudo2Instr { +multiclass sme2_mla_ll_array_vg4_index_32b sz, bits<4> op, SDPatternOperator intrinsic> { + def NAME: sme2_mla_ll_array_vg24_index_32b, SMEPseudo2Instr { bits<3> Zn; let Inst{9-7} = Zn; - let Inst{6} = 0b0; + let Inst{6} = op{3}; } def _PSEUDO : sme2_za_array_2op_multi_index_pseudo; @@ -2708,7 +2784,7 @@ multiclass sme2_mla_ll_array_vg4_index_64b op, SDPatter //SME2 multiple and single vector long long FMA one source -class sme2_mla_ll_array_single op, +class sme2_mla_ll_array_single op, MatrixOperand matrix_ty, ZPRRegOp vector_ty, ZPRRegOp zpr_ty> : I<(outs matrix_ty:$ZAda), @@ -2721,8 +2797,9 @@ class sme2_mla_ll_array_single op, bits<5> Zn; bits<2> imm; let Inst{31-23} = 0b110000010; - let 
Inst{22} = op{3}; //sz - let Inst{21-20} = 0b10; + let Inst{22} = op{4}; //sz + let Inst{21} = 0b1; + let Inst{20} = op{3}; //fp8 let Inst{19-16} = Zm; let Inst{15} = 0b0; let Inst{14-13} = Rv; @@ -2734,7 +2811,7 @@ class sme2_mla_ll_array_single op, let Constraints = "$ZAda = $_ZAda"; } -multiclass sme2_mla_ll_array_single op, +multiclass sme2_mla_ll_array_single op, MatrixOperand matrix_ty, ZPRRegOp vector_ty, ZPRRegOp zpr_ty, ValueType vt, SDPatternOperator intrinsic> { def NAME : sme2_mla_ll_array_single, SMEPseudo2Instr; @@ -2744,29 +2821,28 @@ multiclass sme2_mla_ll_array_single op, def : SME2_ZA_TwoOp_Multi_Single_Pat; } -class sme2_mla_ll_array_vg24_single op, MatrixOperand matrix_ty, +class sme2_mla_ll_array_vg24_single op, MatrixOperand matrix_ty, RegisterOperand vector_ty, ZPRRegOp zpr_ty, string mnemonic> : I<(outs matrix_ty:$ZAda), (ins matrix_ty:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm1s4range:$imm, vector_ty:$Zn, zpr_ty:$Zm), - mnemonic, "\t$ZAda[$Rv, $imm, " # !if(op{3}, "vgx4", "vgx2") # "], $Zn, $Zm", + mnemonic, "\t$ZAda[$Rv, $imm, " # !if(op{4}, "vgx4", "vgx2") # "], $Zn, $Zm", "", []>, Sched<[]> { bits<4> Zm; bits<2> Rv; bits<5> Zn; bit imm; let Inst{31-23} = 0b110000010; - let Inst{22} = op{4}; //sz + let Inst{22} = op{5}; //sz let Inst{21} = 0b1; - let Inst{20} = op{3}; //vg4 + let Inst{20} = op{4}; //vg4 let Inst{19-16} = Zm; let Inst{15} = 0b0; let Inst{14-13} = Rv; let Inst{12-10} = 0b000; let Inst{9-5} = Zn; - let Inst{4-2} = op{2-0}; - let Inst{1} = 0b0; + let Inst{4-1} = op{3-0}; let Inst{0} = imm; let Constraints = "$ZAda = $_ZAda"; @@ -2774,7 +2850,7 @@ class sme2_mla_ll_array_vg24_single op, MatrixOperand matrix_ty, //SME2 single-multi long long MLA two and four sources -multiclass sme2_mla_ll_array_vg24_single op, +multiclass sme2_mla_ll_array_vg24_single op, MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty> { @@ -2792,7 +2868,7 @@ multiclass sme2_mla_ll_array_vg2_single op, RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty, ValueType vt, SDPatternOperator intrinsic> { - defm NAME: sme2_mla_ll_array_vg24_single; + defm NAME: sme2_mla_ll_array_vg24_single; def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat; } @@ -2801,14 +2877,14 @@ multiclass sme2_mla_ll_array_vg4_single op, MatrixOperand matrix_ty, RegisterOperand multi_vector_ty, ZPRRegOp zpr_ty, ValueType vt, SDPatternOperator intrinsic> { - defm NAME: sme2_mla_ll_array_vg24_single; + defm NAME: sme2_mla_ll_array_vg24_single; def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat; } // SME2 multiple vectors long long MLA two sources -class sme2_mla_ll_array_vg2_multi op, MatrixOperand matrix_ty, +class sme2_mla_ll_array_vg2_multi op, MatrixOperand matrix_ty, RegisterOperand vector_ty,string mnemonic> : I<(outs matrix_ty:$ZAda), (ins matrix_ty:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, uimm1s4range:$imm, @@ -2820,22 +2896,21 @@ class sme2_mla_ll_array_vg2_multi op, MatrixOperand matrix_ty, bits<4> Zn; bit imm; let Inst{31-23} = 0b110000011; - let Inst{22} = op{3}; // sz + let Inst{22} = op{4}; // sz let Inst{21} = 0b1; let Inst{20-17} = Zm; let Inst{16-15} = 0b00; let Inst{14-13} = Rv; let Inst{12-10} = 0b000; let Inst{9-6} = Zn; - let Inst{5} = 0b0; - let Inst{4-2} = op{2-0}; + let Inst{5-2} = op{3-0}; let Inst{1} = 0b0; let Inst{0} = imm; let Constraints = "$ZAda = $_ZAda"; } -multiclass sme2_mla_ll_array_vg2_multi op, +multiclass sme2_mla_ll_array_vg2_multi op, MatrixOperand matrix_ty, RegisterOperand vector_ty, ValueType vt, SDPatternOperator intrinsic> { @@ -2851,7 +2926,7 @@ multiclass 
sme2_mla_ll_array_vg2_multi op, // SME2 multiple vectors long long MLA four sources -class sme2_mla_ll_array_vg4_multi op,MatrixOperand matrix_ty, +class sme2_mla_ll_array_vg4_multi op,MatrixOperand matrix_ty, RegisterOperand vector_ty, string mnemonic> : I<(outs matrix_ty:$ZAda), @@ -2864,22 +2939,22 @@ class sme2_mla_ll_array_vg4_multi op,MatrixOperand matrix_ty, bits<3> Zn; bit imm; let Inst{31-23} = 0b110000011; - let Inst{22} = op{3}; // sz + let Inst{22} = op{4}; // sz let Inst{21} = 0b1; let Inst{20-18} = Zm; let Inst{17-15} = 0b010; let Inst{14-13} = Rv; let Inst{12-10} = 0b000; let Inst{9-7} = Zn; - let Inst{6-5} = 0b00; - let Inst{4-2} = op{2-0}; + let Inst{6} = 0b0; + let Inst{5-2} = op{3-0}; let Inst{1} = 0b0; let Inst{0} = imm; let Constraints = "$ZAda = $_ZAda"; } -multiclass sme2_mla_ll_array_vg4_multi op, +multiclass sme2_mla_ll_array_vg4_multi op, MatrixOperand matrix_ty, RegisterOperand vector_ty, ValueType vt, SDPatternOperator intrinsic> { @@ -2949,7 +3024,7 @@ class sme2_spill_fill_vector opc> // SME2 move to/from lookup table class sme2_movt_zt_to_scalar opc> : I<(outs GPR64:$Rt), (ins ZTR:$ZTt, uimm3s8:$imm3), - mnemonic, "\t$Rt, $ZTt$imm3", + mnemonic, "\t$Rt, $ZTt[$imm3]", "", []>, Sched<[]> { bits<3> imm3; bits<5> Rt; @@ -2961,7 +3036,7 @@ class sme2_movt_zt_to_scalar opc> class sme2_movt_scalar_to_zt opc> : I<(outs ZTR:$ZTt), (ins uimm3s8:$imm3, GPR64:$Rt), - mnemonic, "\t$ZTt$imm3, $Rt", + mnemonic, "\t$ZTt[$imm3], $Rt", "", []>, Sched<[]> { bits<3> imm3; bits<5> Rt; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 4902ec3639ec54494b32e1ad08d3f322f3cf3adc..05115aa252a15a02db2d4bf433256868dc6a448a 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -365,6 +365,7 @@ class sve_int_ptrue sz8_64, bits<3> opc, string asm, PPRRegOp pprty, let ElementSize = pprty.ElementSize; let hasSideEffects = 0; let isReMaterializable = 1; + let Uses = [VG]; } multiclass sve_int_ptrue opc, string asm, SDPatternOperator op> { @@ -755,6 +756,7 @@ class sve_int_pfalse opc, string asm> let hasSideEffects = 0; let isReMaterializable = 1; + let Uses = [VG]; } multiclass sve_int_pfalse opc, string asm> { @@ -1090,6 +1092,7 @@ class sve_int_count opc, string asm> let hasSideEffects = 0; let isReMaterializable = 1; + let Uses = [VG]; } multiclass sve_int_count opc, string asm, SDPatternOperator op> { @@ -1982,6 +1985,7 @@ class sve_int_dup_mask_imm let DecoderMethod = "DecodeSVELogicalImmInstruction"; let hasSideEffects = 0; let isReMaterializable = 1; + let Uses = [VG]; } multiclass sve_int_dup_mask_imm { @@ -2819,6 +2823,7 @@ class sve_int_arith_vl let Inst{4-0} = Rd; let hasSideEffects = 0; + let Uses = [VG]; } class sve_int_read_vl_a opc2, string asm, bit streaming_sve = 0b0> @@ -2839,6 +2844,7 @@ class sve_int_read_vl_a opc2, string asm, bit streaming_sve = 0b let hasSideEffects = 0; let isReMaterializable = 1; + let Uses = [VG]; } //===----------------------------------------------------------------------===// @@ -4656,6 +4662,7 @@ class sve_int_dup_imm sz8_64, string asm, let hasSideEffects = 0; let isReMaterializable = 1; + let Uses = [VG]; } multiclass sve_int_dup_imm { @@ -4698,6 +4705,7 @@ class sve_int_dup_fpimm sz8_64, Operand fpimmtype, let hasSideEffects = 0; let isReMaterializable = 1; + let Uses = [VG]; } multiclass sve_int_dup_fpimm { @@ -5614,6 +5622,7 @@ class sve_int_index_ii sz8_64, string asm, ZPRRegOp zprty, let hasSideEffects = 0; let isReMaterializable = 
1; + let Uses = [VG]; } multiclass sve_int_index_ii { @@ -8721,8 +8730,8 @@ multiclass sve2_crypto_unary_op { // SVE BFloat16 Group //===----------------------------------------------------------------------===// -class sve_float_dot -: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm), +class sve_float_dot +: I<(outs dst_ty:$Zda), (ins dst_ty:$_Zda, src_ty:$Zn, src_ty:$Zm), asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { bits<5> Zda; bits<5> Zn; @@ -8731,7 +8740,8 @@ class sve_float_dot let Inst{22} = bf; let Inst{21} = 0b1; let Inst{20-16} = Zm; - let Inst{15-10} = 0b100000; + let Inst{15-11} = 0b10000; + let Inst{10} = o2; let Inst{9-5} = Zn; let Inst{4-0} = Zda; @@ -8741,24 +8751,24 @@ class sve_float_dot let mayRaiseFPException = 1; } -multiclass sve_float_dot { - def NAME : sve_float_dot; +multiclass sve_float_dot { + def NAME : sve_float_dot; def : SVE_3_Op_Pat(NAME)>; } -class sve_float_dot_indexed -: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS32b:$iop), +class sve_float_dot_indexed +: I<(outs dst_ty:$Zda), (ins dst_ty:$_Zda, src1_ty:$Zn, src2_ty:$Zm, iop_ty:$iop), asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> { bits<5> Zda; bits<5> Zn; bits<3> Zm; - bits<2> iop; let Inst{31-23} = 0b011001000; let Inst{22} = bf; let Inst{21} = 0b1; - let Inst{20-19} = iop; let Inst{18-16} = Zm; - let Inst{15-10} = 0b010000; + let Inst{15-12} = 0b0100; let Inst{9-5} = Zn; let Inst{4-0} = Zda; @@ -8768,8 +8778,14 @@ class sve_float_dot_indexed let mayRaiseFPException = 1; } -multiclass sve_float_dot_indexed { - def NAME : sve_float_dot_indexed; +multiclass sve_float_dot_indexed opc, ZPRRegOp src1_ty, + ZPRRegOp src2_ty, string asm, ValueType InVT, + SDPatternOperator op> { + def NAME : sve_float_dot_indexed { + bits<2> iop; + let Inst{20-19} = iop; + let Inst{11-10} = opc; + } def : SVE_4_Op_Imm_Pat(NAME)>; } @@ -9242,6 +9258,7 @@ class sve2p1_ptrue_pn sz, PNRP8to15RegOp pnrty, SDPatte let Inst{2-0} = PNd; let hasSideEffects = 0; + let Uses = [VG]; } diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index 0edb7cb986405bb274fd13f0f91984aec39b9e48..cd72944bcdb2a39471955692d74d74c3bea56638 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -20,16 +20,31 @@ void SMEAttrs::set(unsigned M, bool Enable) { assert(!(hasStreamingInterface() && hasStreamingCompatibleInterface()) && "SM_Enabled and SM_Compatible are mutually exclusive"); - assert(!(hasNewZAInterface() && hasSharedZAInterface()) && - "ZA_New and ZA_Shared are mutually exclusive"); - assert(!(hasNewZAInterface() && preservesZA()) && - "ZA_New and ZA_Preserved are mutually exclusive"); + + // ZA Attrs + assert(!(isNewZA() && (Bitmask & SME_ABI_Routine)) && + "ZA_New and SME_ABI_Routine are mutually exclusive"); + + assert( + (!sharesZA() || + (isNewZA() ^ isInZA() ^ isInOutZA() ^ isOutZA() ^ isPreservesZA())) && + "Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', " + "'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive"); } SMEAttrs::SMEAttrs(const CallBase &CB) { *this = SMEAttrs(CB.getAttributes()); - if (auto *F = CB.getCalledFunction()) - set(SMEAttrs(*F).Bitmask); + if (auto *F = CB.getCalledFunction()) { + set(SMEAttrs(*F).Bitmask | SMEAttrs(F->getName()).Bitmask); + } +} + +SMEAttrs::SMEAttrs(StringRef FuncName) : Bitmask(0) { + if (FuncName == "__arm_tpidr2_save" || FuncName == "__arm_sme_state") + Bitmask |= 
(SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine); + if (FuncName == "__arm_tpidr2_restore") + Bitmask |= SMEAttrs::SM_Compatible | encodeZAState(StateValue::In) | + SMEAttrs::SME_ABI_Routine; } SMEAttrs::SMEAttrs(const AttributeList &Attrs) { @@ -40,35 +55,29 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) { Bitmask |= SM_Compatible; if (Attrs.hasFnAttr("aarch64_pstate_sm_body")) Bitmask |= SM_Body; - if (Attrs.hasFnAttr("aarch64_pstate_za_shared")) - Bitmask |= ZA_Shared; - if (Attrs.hasFnAttr("aarch64_pstate_za_new")) - Bitmask |= ZA_New; - if (Attrs.hasFnAttr("aarch64_pstate_za_preserved")) - Bitmask |= ZA_Preserved; + if (Attrs.hasFnAttr("aarch64_in_za")) + Bitmask |= encodeZAState(StateValue::In); + if (Attrs.hasFnAttr("aarch64_out_za")) + Bitmask |= encodeZAState(StateValue::Out); + if (Attrs.hasFnAttr("aarch64_inout_za")) + Bitmask |= encodeZAState(StateValue::InOut); + if (Attrs.hasFnAttr("aarch64_preserves_za")) + Bitmask |= encodeZAState(StateValue::Preserved); + if (Attrs.hasFnAttr("aarch64_new_za")) + Bitmask |= encodeZAState(StateValue::New); } -std::optional -SMEAttrs::requiresSMChange(const SMEAttrs &Callee, - bool BodyOverridesInterface) const { - // If the transition is not through a call (e.g. when considering inlining) - // and Callee has a streaming body, then we can ignore the interface of - // Callee. - if (BodyOverridesInterface && Callee.hasStreamingBody()) { - return hasStreamingInterfaceOrBody() ? std::nullopt - : std::optional(true); - } - +bool SMEAttrs::requiresSMChange(const SMEAttrs &Callee) const { if (Callee.hasStreamingCompatibleInterface()) - return std::nullopt; + return false; // Both non-streaming if (hasNonStreamingInterfaceAndBody() && Callee.hasNonStreamingInterface()) - return std::nullopt; + return false; // Both streaming if (hasStreamingInterfaceOrBody() && Callee.hasStreamingInterface()) - return std::nullopt; + return false; - return Callee.hasStreamingInterface(); + return true; } diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index 1146fd4e3fa8e12c28ee456e2270fc6611d3b026..b57e2176b020739765dd79a553db8ceee0e689bd 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -26,22 +26,31 @@ class SMEAttrs { unsigned Bitmask; public: + enum class StateValue { + None = 0, + In = 1, //aarch64_in_zt0 + Out = 2, //aarch64_out_zt0 + InOut = 3, //aarch64_inout_zt0 + Preserved = 4, //aarch64_preserves_zt0 + New = 5 //aarch64_new_zt0 + }; + // Enum with bitmasks for each individual SME feature. 
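[Editor's note: standalone sketch, not part of the patch. The ZA attributes collapse from three independent flag bits into a single 3-bit StateValue field packed at ZA_Shift. The following compiles on its own and shows the encode/decode round-trip implied by the accessors above, with constants copied from the patch; the Mask enum that carries these constants continues directly below.

#include <cassert>

// StateValue and the shift/mask constants mirror AArch64SMEAttributes.h.
enum class StateValue { None = 0, In, Out, InOut, Preserved, New };
constexpr unsigned ZA_Shift = 4;
constexpr unsigned ZA_Mask = 0b111 << ZA_Shift;

constexpr unsigned encodeZAState(StateValue S) {
  return static_cast<unsigned>(S) << ZA_Shift;
}
constexpr StateValue decodeZAState(unsigned Bitmask) {
  return static_cast<StateValue>((Bitmask & ZA_Mask) >> ZA_Shift);
}

int main() {
  // aarch64_inout_za packs 3 (InOut) into bits [6:4]; decoding recovers it.
  unsigned Bitmask = encodeZAState(StateValue::InOut);
  assert(decodeZAState(Bitmask) == StateValue::InOut);
  // sharesZA() in the patch is then just: decoded state is one of
  // In/Out/InOut/Preserved; isNewZA() is: decoded state == New.
  return 0;
}

]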
enum Mask { Normal = 0, - SM_Enabled = 1 << 0, // aarch64_pstate_sm_enabled - SM_Compatible = 1 << 1, // aarch64_pstate_sm_compatible - SM_Body = 1 << 2, // aarch64_pstate_sm_body - ZA_Shared = 1 << 3, // aarch64_pstate_sm_shared - ZA_New = 1 << 4, // aarch64_pstate_sm_new - ZA_Preserved = 1 << 5, // aarch64_pstate_sm_preserved - All = ZA_Preserved - 1 + SM_Enabled = 1 << 0, // aarch64_pstate_sm_enabled + SM_Compatible = 1 << 1, // aarch64_pstate_sm_compatible + SM_Body = 1 << 2, // aarch64_pstate_sm_body + SME_ABI_Routine = 1 << 3, // Used for SME ABI routines to avoid lazy saves + ZA_Shift = 4, + ZA_Mask = 0b111 << ZA_Shift, }; SMEAttrs(unsigned Mask = Normal) : Bitmask(0) { set(Mask); } SMEAttrs(const Function &F) : SMEAttrs(F.getAttributes()) {} SMEAttrs(const CallBase &CB); SMEAttrs(const AttributeList &L); + SMEAttrs(StringRef FuncName); void set(unsigned M, bool Enable = true); @@ -63,26 +72,34 @@ public: /// \return true if a call from Caller -> Callee requires a change in /// streaming mode. - /// If \p BodyOverridesInterface is true and Callee has a streaming body, - /// then requiresSMChange considers a call to Callee as having a Streaming - /// interface. This can be useful when considering e.g. inlining, where we - /// explicitly want the body to overrule the interface (because after inlining - /// the interface is no longer relevant). - std::optional - requiresSMChange(const SMEAttrs &Callee, - bool BodyOverridesInterface = false) const; + bool requiresSMChange(const SMEAttrs &Callee) const; - // Interfaces to query PSTATE.ZA - bool hasNewZAInterface() const { return Bitmask & ZA_New; } - bool hasSharedZAInterface() const { return Bitmask & ZA_Shared; } - bool hasPrivateZAInterface() const { return !hasSharedZAInterface(); } - bool preservesZA() const { return Bitmask & ZA_Preserved; } - bool hasZAState() const { - return hasNewZAInterface() || hasSharedZAInterface(); + // Interfaces to query ZA + static StateValue decodeZAState(unsigned Bitmask) { + return static_cast((Bitmask & ZA_Mask) >> ZA_Shift); + } + static unsigned encodeZAState(StateValue S) { + return static_cast(S) << ZA_Shift; } + + bool isNewZA() const { return decodeZAState(Bitmask) == StateValue::New; } + bool isInZA() const { return decodeZAState(Bitmask) == StateValue::In; } + bool isOutZA() const { return decodeZAState(Bitmask) == StateValue::Out; } + bool isInOutZA() const { return decodeZAState(Bitmask) == StateValue::InOut; } + bool isPreservesZA() const { + return decodeZAState(Bitmask) == StateValue::Preserved; + } + bool sharesZA() const { + StateValue State = decodeZAState(Bitmask); + return State == StateValue::In || State == StateValue::Out || + State == StateValue::InOut || State == StateValue::Preserved; + } + bool hasSharedZAInterface() const { return sharesZA(); } + bool hasPrivateZAInterface() const { return !hasSharedZAInterface(); } + bool hasZAState() const { return isNewZA() || sharesZA(); } bool requiresLazySave(const SMEAttrs &Callee) const { return hasZAState() && Callee.hasPrivateZAInterface() && - !Callee.preservesZA(); + !(Callee.Bitmask & SME_ABI_Routine); } }; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll index 8cdb0696f7acc5149deb3f638f3bbc071a93e01d..d531c29da7551a96d4f8ceba3c0009fa4fb25848 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-inline-asm.ll @@ -26,7 +26,7 @@ define void 
@asm_simple_register_clobber() { define i64 @asm_register_early_clobber() { ; CHECK-LABEL: name: asm_register_early_clobber ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"mov $0, 7; mov $1, 7", 1 /* sideeffect attdialect */, 2555915 /* regdef-ec:GPR64common */, def early-clobber %0, 2555915 /* regdef-ec:GPR64common */, def early-clobber %1, !0 + ; CHECK-NEXT: INLINEASM &"mov $0, 7; mov $1, 7", 1 /* sideeffect attdialect */, 2686987 /* regdef-ec:GPR64common */, def early-clobber %0, 2686987 /* regdef-ec:GPR64common */, def early-clobber %1, !0 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY %0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %1 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]] @@ -54,7 +54,7 @@ entry: define i32 @test_single_register_output() nounwind ssp { ; CHECK-LABEL: name: test_single_register_output ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %0 + ; CHECK-NEXT: INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %0 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %0 ; CHECK-NEXT: $w0 = COPY [[COPY]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 @@ -66,7 +66,7 @@ entry: define i64 @test_single_register_output_s64() nounwind ssp { ; CHECK-LABEL: name: test_single_register_output_s64 ; CHECK: bb.1.entry: - ; CHECK-NEXT: INLINEASM &"mov $0, 7", 0 /* attdialect */, 2555914 /* regdef:GPR64common */, def %0 + ; CHECK-NEXT: INLINEASM &"mov $0, 7", 0 /* attdialect */, 2686986 /* regdef:GPR64common */, def %0 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY %0 ; CHECK-NEXT: $x0 = COPY [[COPY]](s64) ; CHECK-NEXT: RET_ReallyLR implicit $x0 @@ -79,7 +79,7 @@ entry: define float @test_multiple_register_outputs_same() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_same ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %0, 1507338 /* regdef:GPR32common */, def %1 + ; CHECK-NEXT: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %0, 1638410 /* regdef:GPR32common */, def %1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]] @@ -96,7 +96,7 @@ define float @test_multiple_register_outputs_same() #0 { define double @test_multiple_register_outputs_mixed() #0 { ; CHECK-LABEL: name: test_multiple_register_outputs_mixed ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %0, 2359306 /* regdef:FPR64 */, def %1 + ; CHECK-NEXT: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %0, 2490378 /* regdef:FPR64 */, def %1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY %0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY %1 ; CHECK-NEXT: $d0 = COPY [[COPY1]](s64) @@ -120,13 +120,12 @@ entry: } define zeroext i8 @test_register_output_trunc(ptr %src) nounwind { - ; ; CHECK-LABEL: name: test_register_output_trunc ; CHECK: bb.1.entry: ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK-NEXT: INLINEASM &"mov ${0:w}, 32", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %1 + ; CHECK-NEXT: INLINEASM &"mov ${0:w}, 32", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 ; CHECK-NEXT: 
[[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s8) @@ -156,7 +155,7 @@ define void @test_input_register_imm() { ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 42 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY [[C]](s64) - ; CHECK-NEXT: INLINEASM &"mov x0, $0", 1 /* sideeffect attdialect */, 2555913 /* reguse:GPR64common */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"mov x0, $0", 1 /* sideeffect attdialect */, 2686985 /* reguse:GPR64common */, [[COPY]] ; CHECK-NEXT: RET_ReallyLR call void asm sideeffect "mov x0, $0", "r"(i64 42) ret void @@ -191,7 +190,7 @@ define zeroext i8 @test_input_register(ptr %src) nounwind { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY [[COPY]](p0) - ; CHECK-NEXT: INLINEASM &"ldtrb ${0:w}, [$1]", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %1, 2555913 /* reguse:GPR64common */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"ldtrb ${0:w}, [$1]", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %1, 2686985 /* reguse:GPR64common */, [[COPY1]] ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY %1 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s8) @@ -208,7 +207,7 @@ define i32 @test_memory_constraint(ptr %a) nounwind { ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK-NEXT: INLINEASM &"ldr $0, $1", 8 /* mayload attdialect */, 1507338 /* regdef:GPR32common */, def %1, 262158 /* mem:m */, [[COPY]](p0) + ; CHECK-NEXT: INLINEASM &"ldr $0, $1", 8 /* mayload attdialect */, 1638410 /* regdef:GPR32common */, def %1, 262158 /* mem:m */, [[COPY]](p0) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %1 ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 @@ -222,7 +221,7 @@ define i16 @test_anyext_input() { ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s16) ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32common = COPY [[ANYEXT]](s32) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1507338 /* regdef:GPR32common */, def %0, 1507337 /* reguse:GPR32common */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1638410 /* regdef:GPR32common */, def %0, 1638409 /* reguse:GPR32common */, [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %0 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16) @@ -238,7 +237,7 @@ define i16 @test_anyext_input_with_matching_constraint() { ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s16) ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32common = COPY [[ANYEXT]](s32) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1507338 /* regdef:GPR32common */, def %0, 2147483657 /* reguse tiedto:$0 */, [[COPY]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 1638410 /* regdef:GPR32common */, def %0, 2147483657 /* reguse tiedto:$0 */, [[COPY]](tied-def 3) ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY %0 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-unwind-inline-asm.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-unwind-inline-asm.ll index 
bb7c9e274814ef8acba580c029c91176670fe4f5..c224b0a259fcd18858036a773ef7f3fd0dc3edfd 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-unwind-inline-asm.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-unwind-inline-asm.ll @@ -71,7 +71,7 @@ define void @test2() #0 personality ptr @__gcc_personality_v0 { ; CHECK-NEXT: G_INVOKE_REGION_START ; CHECK-NEXT: EH_LABEL ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY [[DEF]](p0) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2555913 /* reguse:GPR64common */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2686985 /* reguse:GPR64common */, [[COPY]] ; CHECK-NEXT: EH_LABEL ; CHECK-NEXT: G_BR %bb.2 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir index 28b56945c7775a980d4c26bea36d415574004257..f75731e351c7826e9a34286dce0dbafc74e998b3 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-inlineasm.mir @@ -57,7 +57,7 @@ tracksRegLiveness: true body: | bb.1: ; CHECK-LABEL: name: inlineasm_virt_reg_output - ; CHECK: INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 1310730 /* regdef:FPR32 */, def %0 + ; CHECK: INLINEASM &"mov ${0:w}, 7", 0 /* attdialect */, 1310730 /* regdef:PPR2_with_psub_in_PNR_p8to15_and_PPR2_with_psub1_in_PPR_3b */, def %0 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY %0 ; CHECK-NEXT: $w0 = COPY [[COPY]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 @@ -75,7 +75,7 @@ tracksRegLiveness: true body: | bb.1: ; CHECK-LABEL: name: inlineasm_virt_mixed_types - ; CHECK: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 1310730 /* regdef:FPR32 */, def %0, 2162698 /* regdef:WSeqPairsClass_with_sube32_in_MatrixIndexGPR32_12_15 */, def %1 + ; CHECK: INLINEASM &"mov $0, #0; mov $1, #0", 0 /* attdialect */, 1310730 /* regdef:PPR2_with_psub_in_PNR_p8to15_and_PPR2_with_psub1_in_PPR_3b */, def %0, 2162698 /* regdef:WSeqPairsClass_with_subo32_in_GPR32common */, def %1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY %0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr(s64) = COPY %1 ; CHECK-NEXT: $d0 = COPY [[COPY1]](s64) diff --git a/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll b/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll new file mode 100644 index 0000000000000000000000000000000000000000..98e7ad740681e68c11bb1ce1f946c715e9149278 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sme2 -stop-after=finalize-isel | FileCheck %s + +define dso_local void @UphPNR(target("aarch64.svcount") %predcnt) { +entry: +; CHECK: %0:ppr = COPY $p0 +; CHECK: STR_PXI %0, %stack.0.predcnt.addr, 0 :: (store unknown-size into %ir.predcnt.addr, align 2) +; CHECK: %1:pnr_p8to15 = COPY %0 +; CHECK: INLINEASM &"ld1w {z0.s,z1.s,z2.s,z3.s}, $0/z, [x10]", 1 /* sideeffect attdialect */, 393225 /* reguse:PNR_p8to15 */, %1 +; CHECK: RET_ReallyLR + %predcnt.addr = alloca target("aarch64.svcount"), align 2 + store target("aarch64.svcount") %predcnt, ptr %predcnt.addr, align 2 + %0 = load target("aarch64.svcount"), ptr %predcnt.addr, align 2 + call void asm sideeffect "ld1w {z0.s,z1.s,z2.s,z3.s}, $0/z, [x10]", "@3Uph"(target("aarch64.svcount") %0) + ret void +} + +define dso_local void @UpaPNR(target("aarch64.svcount") %predcnt) { +entry: +; CHECK: %0:ppr = COPY $p0 +; CHECK: STR_PXI %0, %stack.0.predcnt.addr, 0 :: (store unknown-size into %ir.predcnt.addr, align 2) +; CHECK: 
%1:pnr = COPY %0
+; CHECK: INLINEASM &"ld1w {z0.s,z1.s,z2.s,z3.s}, $0/z, [x10]", 1 /* sideeffect attdialect */, 262153 /* reguse:PNR */, %1
+; CHECK: RET_ReallyLR
+  %predcnt.addr = alloca target("aarch64.svcount"), align 2
+  store target("aarch64.svcount") %predcnt, ptr %predcnt.addr, align 2
+  %0 = load target("aarch64.svcount"), ptr %predcnt.addr, align 2
+  call void asm sideeffect "ld1w {z0.s,z1.s,z2.s,z3.s}, $0/z, [x10]", "@3Upa"(target("aarch64.svcount") %0)
+  ret void
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
index ad1093028c1a67181d167cc2769b376479a2c5da..4ca2fb8815797f02275713f742cfe168ccc2b792 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve -stop-after=finalize-isel | FileCheck %s --check-prefix=CHECK
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
@@ -68,3 +69,16 @@ define <vscale x 4 x i32> @test_incp(<vscale x 16 x i1> %Pg,
   %1 = tail call <vscale x 4 x i32> asm "incp $0.s, $1", "=w,@3Upa,0"(<vscale x 16 x i1> %Pg, <vscale x 4 x i32> %Zn)
   ret <vscale x 4 x i32> %1
 }
+
+; Function Attrs: nounwind readnone
+; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
+; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
+; CHECK: [[ARG3:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[ARG4:%[0-9]+]]:ppr_p8to15 = COPY [[ARG3]]
+; CHECK: INLINEASM {{.*}} [[ARG4]]
+define <vscale x 8 x half> @test_svfadd_f16_Uph_constraint(<vscale x 8 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm) {
+  %1 = tail call <vscale x 8 x half> asm "fadd $0.h, $1/m, $2.h, $3.h", "=w,@3Uph,w,w"(<vscale x 8 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm)
+  ret <vscale x 8 x half> %1
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-za-clobber.ll b/llvm/test/CodeGen/AArch64/aarch64-za-clobber.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a8cba7dc9a91e9aed4284bf556babc2d309525f8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-za-clobber.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu -stop-after=aarch64-isel < %s -o - | FileCheck %s
+
+define void @alpha(<vscale x 4 x i32> %x) local_unnamed_addr {
+entry:
+; CHECK: INLINEASM &"movt zt0[3, mul vl], z0", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $za
+  tail call void asm sideeffect "movt zt0[3, mul vl], z0", "~{za}"()
+  ret void
+}
+
+define void @beta(<vscale x 4 x i32> %x) local_unnamed_addr {
+entry:
+; CHECK: INLINEASM &"movt zt0[3, mul vl], z0", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $zt0
+  tail call void asm sideeffect "movt zt0[3, mul vl], z0", "~{zt0}"()
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/callbr-asm-outputs-indirect-isel.ll b/llvm/test/CodeGen/AArch64/callbr-asm-outputs-indirect-isel.ll
index cf22216daf516a45a1a66bcee11f10bca4f666a5..ed02fdfc996a4335be07f3ffdef8428a5b0bd912 100644
--- a/llvm/test/CodeGen/AArch64/callbr-asm-outputs-indirect-isel.ll
+++ b/llvm/test/CodeGen/AArch64/callbr-asm-outputs-indirect-isel.ll
@@ -18,7 +18,7 @@ define i32 @test0() {
   ; CHECK: bb.0.entry:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000), %bb.1(0x00000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   INLINEASM_BR &"# $0", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %5, 13 /* imm */, %bb.1
+  ; CHECK-NEXT:   INLINEASM_BR &"# $0", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %5, 13 /* imm */, %bb.1
   ; CHECK-NEXT:
[[COPY:%[0-9]+]]:gpr32all = COPY %5 ; CHECK-NEXT: B %bb.2 ; CHECK-NEXT: {{ $}} @@ -31,7 +31,7 @@ define i32 @test0() { ; CHECK-NEXT: bb.2.direct: ; CHECK-NEXT: successors: %bb.4(0x80000000), %bb.3(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"# $0", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %7, 13 /* imm */, %bb.3 + ; CHECK-NEXT: INLINEASM_BR &"# $0", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %7, 13 /* imm */, %bb.3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY %7 ; CHECK-NEXT: B %bb.4 ; CHECK-NEXT: {{ $}} @@ -107,7 +107,7 @@ define i32 @dont_split1() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.1(0x80000000), %bb.2(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %1, 13 /* imm */, %bb.2 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %1, 13 /* imm */, %bb.2 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32all = COPY %1 ; CHECK-NEXT: B %bb.1 ; CHECK-NEXT: {{ $}} @@ -168,7 +168,7 @@ define i32 @dont_split3() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.1(0x80000000), %bb.2(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %0, 13 /* imm */, %bb.2 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %0, 13 /* imm */, %bb.2 ; CHECK-NEXT: B %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.x: @@ -195,7 +195,7 @@ define i32 @split_me0() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.2(0x80000000), %bb.1(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32all = COPY %3 ; CHECK-NEXT: B %bb.2 ; CHECK-NEXT: {{ $}} @@ -245,7 +245,7 @@ define i32 @split_me1(i1 %z) { ; CHECK-NEXT: bb.1.w: ; CHECK-NEXT: successors: %bb.3(0x80000000), %bb.2(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %5, 13 /* imm */, %bb.2, 13 /* imm */, %bb.2 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %5, 13 /* imm */, %bb.2, 13 /* imm */, %bb.2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32all = COPY %5 ; CHECK-NEXT: B %bb.3 ; CHECK-NEXT: {{ $}} @@ -298,7 +298,7 @@ define i32 @split_me2(i1 %z) { ; CHECK-NEXT: bb.1.w: ; CHECK-NEXT: successors: %bb.3(0x80000000), %bb.2(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %6, 13 /* imm */, %bb.2, 13 /* imm */, %bb.2 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %6, 13 /* imm */, %bb.2, 13 /* imm */, %bb.2 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY %6 ; CHECK-NEXT: B %bb.3 ; CHECK-NEXT: {{ $}} @@ -341,7 +341,7 @@ define i32 @dont_split4() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.1(0x80000000), %bb.2(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.2 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.2 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32all = COPY %3 ; CHECK-NEXT: B %bb.1 ; CHECK-NEXT: {{ $}} @@ -380,7 +380,7 @@ define i32 @dont_split5() { ; CHECK: 
bb.0.entry: ; CHECK-NEXT: successors: %bb.2(0x80000000), %bb.1(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32all = COPY %3 ; CHECK-NEXT: B %bb.2 ; CHECK-NEXT: {{ $}} @@ -411,7 +411,7 @@ define i32 @split_me3() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.2(0x80000000), %bb.1(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32all = COPY %3 ; CHECK-NEXT: B %bb.2 ; CHECK-NEXT: {{ $}} @@ -458,7 +458,7 @@ define i32 @dont_split6(i32 %0) { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32all = PHI [[COPY]], %bb.0, %2, %bb.2 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32common = COPY [[PHI]] - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %4, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3), 13 /* imm */, %bb.2 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %4, 2147483657 /* reguse tiedto:$0 */, [[COPY1]](tied-def 3), 13 /* imm */, %bb.2 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY %4 ; CHECK-NEXT: B %bb.3 ; CHECK-NEXT: {{ $}} @@ -493,7 +493,7 @@ define i32 @split_me4() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.2(0x80000000), %bb.1(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32all = COPY %3 ; CHECK-NEXT: B %bb.2 ; CHECK-NEXT: {{ $}} @@ -524,7 +524,7 @@ define i32 @split_me5() { ; CHECK: bb.0.entry: ; CHECK-NEXT: successors: %bb.2(0x80000000), %bb.1(0x00000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1507338 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 + ; CHECK-NEXT: INLINEASM_BR &"", 0 /* attdialect */, 1638410 /* regdef:GPR32common */, def %3, 13 /* imm */, %bb.1 ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32all = COPY %3 ; CHECK-NEXT: B %bb.2 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir index 22cff36afaf7f3e11cf10ae821c1681dd28ef04e..015e8e9ba2b6bb94d97ef66bc7f573c2648df211 100644 --- a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir +++ b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir @@ -193,7 +193,7 @@ body: | liveins: $z0, $z1, $p0, $p1, $w0 renamable $p2 = COPY killed $p0 - renamable $p0 = PTRUE_S 31 + renamable $p0 = PTRUE_S 31, implicit $vg ST1W_IMM killed renamable $z0, renamable $p0, %stack.0.z0.addr, 0 :: (store unknown-size into %ir.z0.addr, align 16) ST1W_IMM killed renamable $z1, renamable $p0, %stack.1.z1.addr, 0 :: (store unknown-size into %ir.z1.addr, align 16) STR_PXI killed renamable $p2, %stack.2.p0.addr, 0 :: (store unknown-size into %ir.p0.addr, align 2) diff --git a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir index 
75917ef32ae2adda4e9f632f2db33b0748247d76..0ea180b20730f2d86120933dc0b580cecb5036f0 100644 --- a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir +++ b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir @@ -111,7 +111,7 @@ body: | STRXui killed renamable $x1, %stack.1, 0, debug-location !8 DBG_VALUE %stack.1, $noreg, !11, !DIExpression(DW_OP_constu, 16, DW_OP_plus, DW_OP_deref), debug-location !8 - renamable $p2 = PTRUE_S 31, debug-location !DILocation(line: 4, column: 1, scope: !5) + renamable $p2 = PTRUE_S 31, implicit $vg, debug-location !DILocation(line: 4, column: 1, scope: !5) ST1W_IMM renamable $z0, renamable $p2, %stack.2, 0, debug-location !DILocation(line: 5, column: 1, scope: !5) DBG_VALUE %stack.2, $noreg, !12, !DIExpression(DW_OP_deref), debug-location !DILocation(line: 5, column: 1, scope: !5) ST1W_IMM renamable $z1, killed renamable $p2, %stack.3, 0, debug-location !DILocation(line: 6, column: 1, scope: !5) diff --git a/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir b/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir index 6fe094cc6cbb4e8b71dbdf0b5ac5ff9e347bb8ea..8255c7dd6f1e4d9e118e209490726932142e6bc2 100644 --- a/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir +++ b/llvm/test/CodeGen/AArch64/emit_fneg_with_non_register_operand.mir @@ -91,10 +91,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[LOADgot:%[0-9]+]]:gpr64common = LOADgot target-flags(aarch64-got) @c ; CHECK-NEXT: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[LOADgot]], 0 :: (dereferenceable load (s64) from @c) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:FPR64 */, def %2, 2147483657 /* reguse tiedto:$0 */, [[LDRDui]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:WSeqPairsClass_with_sube32_in_MatrixIndexGPR32_8_11 */, def %2, 2147483657 /* reguse tiedto:$0 */, [[LDRDui]](tied-def 3) ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY %2 ; CHECK-NEXT: [[LDRDui1:%[0-9]+]]:fpr64 = LDRDui [[LOADgot]], 0 :: (dereferenceable load (s64) from @c) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:FPR64 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[LDRDui1]](tied-def 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 2359306 /* regdef:WSeqPairsClass_with_sube32_in_MatrixIndexGPR32_8_11 */, def %4, 2147483657 /* reguse tiedto:$0 */, [[LDRDui1]](tied-def 3) ; CHECK-NEXT: [[FNEGDr:%[0-9]+]]:fpr64 = FNEGDr %2 ; CHECK-NEXT: nofpexcept FCMPDrr %4, killed [[FNEGDr]], implicit-def $nzcv, implicit $fpcr ; CHECK-NEXT: Bcc 1, %bb.2, implicit $nzcv diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir index 7d7b3ace8a915cdc5a57230a96eb288ebb008dbb..8725bb7061fabef0e5bfccc1cf0072211f6389f7 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir @@ -19,8 +19,8 @@ ; CHECK-NEXT: // implicit-def: $p4 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG - ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload + ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: .cfi_restore z8 diff --git 
a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir index 7c87587c6dc4e2c6b2d127b45d43e6d4b47ce49f..eae547fc7033aa5315a9b2fe864e6c23094a59cc 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir @@ -771,9 +771,9 @@ body: | # CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 -# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0 +# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0 # CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 -# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 +# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 @@ -872,14 +872,14 @@ body: | # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 -# CHECK: $p15 = frame-destroy LDR_PXI $sp, 4 -# CHECK: $p14 = frame-destroy LDR_PXI $sp, 5 -# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 -# CHECK: $p4 = frame-destroy LDR_PXI $sp, 15 # CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2 # CHECK: $z22 = frame-destroy LDR_ZXI $sp, 3 # CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16 # CHECK: $z8 = frame-destroy LDR_ZXI $sp, 17 +# CHECK: $p15 = frame-destroy LDR_PXI $sp, 4 +# CHECK: $p14 = frame-destroy LDR_PXI $sp, 5 +# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 +# CHECK: $p4 = frame-destroy LDR_PXI $sp, 15 # CHECK: $sp = frame-destroy ADDVL_XXI $sp, 18 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 32 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 @@ -1036,14 +1036,14 @@ body: | # CHECK-NEXT: $sp = ANDXri killed $[[TMP]] # CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18 +# CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2 +# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3 +# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16 +# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17 # CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4 # CHECK-NEXT: $p14 = frame-destroy LDR_PXI $sp, 5 # CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 # CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 15 -# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2 -# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3 -# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16 -# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z9 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z10 @@ -1197,10 +1197,10 @@ body: | # CHECK: $sp = frame-destroy ADDVL_XXI $sp, 7 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 -# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 6 -# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7 # CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 1 # CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 +# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 6 +# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7 # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 
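For context on the test added next: the `Uci` and `Ucj` inline-asm constraints restrict a general-purpose operand to the x8-x11 and x12-x15 ranges respectively, which is why the checks below expect w8/x8 and w12/x12. A minimal C++ usage sketch, not part of the patch; the function and variable names are hypothetical:

    // Hypothetical source that Clang would lower to the "@3Uci"/"@3Ucj"
    // constraint strings seen in the IR calls of the test below.
    long add_using_uci(long a) {
      long r;
      // "Uci" forces the input into one of x8-x11.
      asm("add %0, x0, %1" : "=r"(r) : "Uci"(a));
      return r;
    }

    long add_using_ucj(long a) {
      long r;
      // "Ucj" forces the input into one of x12-x15.
      asm("add %0, x0, %1" : "=r"(r) : "Ucj"(a));
      return r;
    }
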
diff --git a/llvm/test/CodeGen/AArch64/inlineasm-Uc-constraint.ll b/llvm/test/CodeGen/AArch64/inlineasm-Uc-constraint.ll new file mode 100644 index 0000000000000000000000000000000000000000..0bee7ea40cc1aeb460e6854bb63c520e43933362 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/inlineasm-Uc-constraint.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc < %s -o - | FileCheck %s + +target triple = "arm64-none-linux-gnu" + +define void @test_constraints_Uci_w(i32 %a) { +; CHECK-LABEL: test_constraints_Uci_w: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: //APP +; CHECK-NEXT: add x0, x0, x8 +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ret + call void asm sideeffect "add x0, x0, $0", "@3Uci,~{x0}"(i32 %a) + ret void +} + +; As test_constraints_Uci_w but ensures non-legal types are also covered. +define void @test_constraints_Uci_w_i8(i8 %a) { +; CHECK-LABEL: test_constraints_Uci_w_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: //APP +; CHECK-NEXT: add x0, x0, x8 +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ret + call void asm sideeffect "add x0, x0, $0", "@3Uci,~{x0}"(i8 %a) + ret void +} + +define void @test_constraints_Uci_x(i64 %a) { +; CHECK-LABEL: test_constraints_Uci_x: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: //APP +; CHECK-NEXT: add x0, x0, x8 +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ret + call void asm sideeffect "add x0, x0, $0", "@3Uci,~{x0}"(i64 %a) + ret void +} + +define void @test_constraint_Ucj_w(i32 %a) { +; CHECK-LABEL: test_constraint_Ucj_w: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: //APP +; CHECK-NEXT: add x0, x0, x12 +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ret + call void asm sideeffect "add x0, x0, $0", "@3Ucj,~{x0}"(i32 %a) + ret void +} + +; As test_constraints_Ucj_w but ensures non-legal types are also covered. 
+define void @test_constraint_Ucj_w_i8(i8 %a) { +; CHECK-LABEL: test_constraint_Ucj_w_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: //APP +; CHECK-NEXT: add x0, x0, x12 +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ret + call void asm sideeffect "add x0, x0, $0", "@3Ucj,~{x0}"(i8 %a) + ret void +} + +define void @test_constraint_Ucj_x(i64 %a) { +; CHECK-LABEL: test_constraint_Ucj_x: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x12, x0 +; CHECK-NEXT: //APP +; CHECK-NEXT: add x0, x0, x12 +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ret + call void asm sideeffect "add x0, x0, $0", "@3Ucj,~{x0}"(i64 %a) + ret void +} diff --git a/llvm/test/CodeGen/AArch64/live-debugvalues-sve.mir b/llvm/test/CodeGen/AArch64/live-debugvalues-sve.mir index 8903ca2b865b98939f83c1d49c37988fc374ad81..612453ab53f4385bc9b5c0978e74da0f8da26b4a 100644 --- a/llvm/test/CodeGen/AArch64/live-debugvalues-sve.mir +++ b/llvm/test/CodeGen/AArch64/live-debugvalues-sve.mir @@ -145,7 +145,7 @@ body: | liveins: $z1 ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp, debug-location !34 - renamable $p0 = PTRUE_S 31, debug-location !34 + renamable $p0 = PTRUE_S 31, implicit $vg, debug-location !34 $x0 = ADDXri %stack.0, 0, 0, debug-location !34 ST1W_IMM renamable $z1, killed renamable $p0, %stack.0, 0, debug-location !34 :: (store unknown-size into %stack.0, align 16) $z0 = COPY renamable $z1, debug-location !34 @@ -157,7 +157,7 @@ body: | $z7 = COPY renamable $z1, debug-location !34 BL @bar, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit $z1, implicit $z2, implicit $z3, implicit $z4, implicit $z5, implicit $z6, implicit $z7, implicit $x0, implicit-def $sp, implicit-def $z0, implicit-def $z1, debug-location !34 ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp, debug-location !34 - renamable $p0 = PTRUE_S 31, debug-location !34 + renamable $p0 = PTRUE_S 31, implicit $vg, debug-location !34 $z3 = IMPLICIT_DEF renamable $z1 = LD1W_IMM renamable $p0, %stack.0, 0, debug-location !34 :: (load unknown-size from %stack.0, align 16) ST1W_IMM renamable $z3, killed renamable $p0, %stack.0, 0 :: (store unknown-size into %stack.0, align 16) diff --git a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir index 1c2fe27cdbc397ab36564bf7b6f197f7774f9450..546f2ec0c4171525932ad22073a151feec4d9c51 100644 --- a/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir +++ b/llvm/test/CodeGen/AArch64/peephole-insvigpr.mir @@ -26,7 +26,6 @@ ret void } - ; The optimization is not applicable when the source is not a virtual register define void @insert_vec_from_gpr(i32 %v, ptr %p) { entry: ret void @@ -488,7 +487,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 ; CHECK-NEXT: [[DEF:%[0-9]+]]:gpr64all = IMPLICIT_DEF ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[DEF]] - ; CHECK-NEXT: INLINEASM &"ldr ${0:s}, $1", 8 /* mayload attdialect */, 2359306 /* regdef:FPR64 */, def %1, 262158 /* mem:m */, killed [[COPY1]] + ; CHECK-NEXT: INLINEASM &"ldr ${0:s}, $1", 8 /* mayload attdialect */, 2359306 /* regdef:WSeqPairsClass_with_sube32_in_MatrixIndexGPR32_8_11 */, def %1, 262158 /* mem:m */, killed [[COPY1]] ; CHECK-NEXT: [[MOVIv2d_ns:%[0-9]+]]:fpr128 = MOVIv2d_ns 0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY [[MOVIv2d_ns]].dsub ; CHECK-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AArch64/preserve.ll b/llvm/test/CodeGen/AArch64/preserve.ll index 924e6f8487ebf0bc4af3db73153ad6c857b6214f..7f57420a5fa1a7e913b704dfed57b99e0d69b99e 
100644 --- a/llvm/test/CodeGen/AArch64/preserve.ll +++ b/llvm/test/CodeGen/AArch64/preserve.ll @@ -4,13 +4,13 @@ target triple = "aarch64-unknown-unknown" declare void @bar1() define preserve_mostcc void @baz() #0 { -; CHECK: baz Clobbered Registers: $ffr $fpcr $nzcv $sp $vg $wsp $za $b0 $b1 $b2 $b3 $b4 $b5 $b6 $b7 $b16 $b17 $b18 $b19 $b20 $b21 $b22 $b23 $b24 $b25 $b26 $b27 $b28 $b29 $b30 $b31 $d0 $d1 $d2 $d3 $d4 $d5 $d6 $d7 $d16 $d17 $d18 $d19 $d20 $d21 $d22 $d23 $d24 $d25 $d26 $d27 $d28 $d29 $d30 $d31 $h0 $h1 $h2 $h3 $h4 $h5 $h6 $h7 $h16 $h17 $h18 $h19 $h20 $h21 $h22 $h23 $h24 $h25 $h26 $h27 $h28 $h29 $h30 $h31 $p0 $p1 $p2 $p3 $p4 $p5 $p6 $p7 $p8 $p9 $p10 $p11 $p12 $p13 $p14 $p15 $q0 $q1 $q2 $q3 $q4 $q5 $q6 $q7 $q8 $q9 $q10 $q11 $q12 $q13 $q14 $q15 $q16 $q17 $q18 $q19 $q20 $q21 $q22 $q23 $q24 $q25 $q26 $q27 $q28 $q29 $q30 $q31 $s0 $s1 $s2 $s3 $s4 $s5 $s6 $s7 $s16 $s17 $s18 $s19 $s20 $s21 $s22 $s23 $s24 $s25 $s26 $s27 $s28 $s29 $s30 $s31 $w0 $w1 $w2 $w3 $w4 $w5 $w6 $w7 $w8 $w16 $w17 $w18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x8 $x16 $x17 $x18 $z0 $z1 $z2 $z3 $z4 $z5 $z6 $z7 $z8 $z9 $z10 $z11 $z12 $z13 $z14 $z15 $z16 $z17 $z18 $z19 $z20 $z21 $z22 $z23 $z24 $z25 $z26 $z27 $z28 $z29 $z30 $z31 $zab0 $zad0 $zad1 $zad2 $zad3 $zad4 $zad5 $zad6 $zad7 $zah0 $zah1 $zaq0 $zaq1 $zaq2 $zaq3 $zaq4 $zaq5 $zaq6 $zaq7 $zaq8 $zaq9 $zaq10 $zaq11 $zaq12 $zaq13 $zaq14 $zaq15 $zas0 $zas1 $zas2 $zas3 $zt0 $z0_hi $z1_hi $z2_hi $z3_hi $z4_hi $z5_hi $z6_hi $z7_hi $z8_hi $z9_hi $z10_hi $z11_hi $z12_hi $z13_hi $z14_hi $z15_hi $z16_hi $z17_hi $z18_hi $z19_hi $z20_hi $z21_hi $z22_hi $z23_hi $z24_hi $z25_hi $z26_hi $z27_hi $z28_hi $z29_hi $z30_hi $z31_hi $d0_d1 $d1_d2 $d2_d3 $d3_d4 $d4_d5 $d5_d6 $d6_d7 $d7_d8 $d15_d16 $d16_d17 $d17_d18 $d18_d19 $d19_d20 $d20_d21 $d21_d22 $d22_d23 $d23_d24 $d24_d25 $d25_d26 $d26_d27 $d27_d28 $d28_d29 $d29_d30 $d30_d31 $d31_d0 $d0_d1_d2_d3 $d1_d2_d3_d4 $d2_d3_d4_d5 $d3_d4_d5_d6 $d4_d5_d6_d7 $d5_d6_d7_d8 $d6_d7_d8_d9 $d7_d8_d9_d10 $d13_d14_d15_d16 $d14_d15_d16_d17 $d15_d16_d17_d18 $d16_d17_d18_d19 $d17_d18_d19_d20 $d18_d19_d20_d21 $d19_d20_d21_d22 $d20_d21_d22_d23 $d21_d22_d23_d24 $d22_d23_d24_d25 $d23_d24_d25_d26 $d24_d25_d26_d27 $d25_d26_d27_d28 $d26_d27_d28_d29 $d27_d28_d29_d30 $d28_d29_d30_d31 $d29_d30_d31_d0 $d30_d31_d0_d1 $d31_d0_d1_d2 $d0_d1_d2 $d1_d2_d3 $d2_d3_d4 $d3_d4_d5 $d4_d5_d6 $d5_d6_d7 $d6_d7_d8 $d7_d8_d9 $d14_d15_d16 $d15_d16_d17 $d16_d17_d18 $d17_d18_d19 $d18_d19_d20 $d19_d20_d21 $d20_d21_d22 $d21_d22_d23 $d22_d23_d24 $d23_d24_d25 $d24_d25_d26 $d25_d26_d27 $d26_d27_d28 $d27_d28_d29 $d28_d29_d30 $d29_d30_d31 $d30_d31_d0 $d31_d0_d1 $p0_p1 $p1_p2 $p2_p3 $p3_p4 $p4_p5 $p5_p6 $p6_p7 $p7_p8 $p8_p9 $p9_p10 $p10_p11 $p11_p12 $p12_p13 $p13_p14 $p14_p15 $p15_p0 $q0_q1 $q1_q2 $q2_q3 $q3_q4 $q4_q5 $q5_q6 $q6_q7 $q7_q8 $q8_q9 $q9_q10 $q10_q11 $q11_q12 $q12_q13 $q13_q14 $q14_q15 $q15_q16 $q16_q17 $q17_q18 $q18_q19 $q19_q20 $q20_q21 $q21_q22 $q22_q23 $q23_q24 $q24_q25 $q25_q26 $q26_q27 $q27_q28 $q28_q29 $q29_q30 $q30_q31 $q31_q0 $q0_q1_q2_q3 $q1_q2_q3_q4 $q2_q3_q4_q5 $q3_q4_q5_q6 $q4_q5_q6_q7 $q5_q6_q7_q8 $q6_q7_q8_q9 $q7_q8_q9_q10 $q8_q9_q10_q11 $q9_q10_q11_q12 $q10_q11_q12_q13 $q11_q12_q13_q14 $q12_q13_q14_q15 $q13_q14_q15_q16 $q14_q15_q16_q17 $q15_q16_q17_q18 $q16_q17_q18_q19 $q17_q18_q19_q20 $q18_q19_q20_q21 $q19_q20_q21_q22 $q20_q21_q22_q23 $q21_q22_q23_q24 $q22_q23_q24_q25 $q23_q24_q25_q26 $q24_q25_q26_q27 $q25_q26_q27_q28 $q26_q27_q28_q29 $q27_q28_q29_q30 $q28_q29_q30_q31 $q29_q30_q31_q0 $q30_q31_q0_q1 $q31_q0_q1_q2 $q0_q1_q2 $q1_q2_q3 $q2_q3_q4 $q3_q4_q5 $q4_q5_q6 $q5_q6_q7 $q6_q7_q8 
$q7_q8_q9 $q8_q9_q10 $q9_q10_q11 $q10_q11_q12 $q11_q12_q13 $q12_q13_q14 $q13_q14_q15 $q14_q15_q16 $q15_q16_q17 $q16_q17_q18 $q17_q18_q19 $q18_q19_q20 $q19_q20_q21 $q20_q21_q22 $q21_q22_q23 $q22_q23_q24 $q23_q24_q25 $q24_q25_q26 $q25_q26_q27 $q26_q27_q28 $q27_q28_q29 $q28_q29_q30 $q29_q30_q31 $q30_q31_q0 $q31_q0_q1 $x0_x1_x2_x3_x4_x5_x6_x7 $x2_x3_x4_x5_x6_x7_x8_x9 $x4_x5_x6_x7_x8_x9_x10_x11 $x6_x7_x8_x9_x10_x11_x12_x13 $x8_x9_x10_x11_x12_x13_x14_x15 $x10_x11_x12_x13_x14_x15_x16_x17 $x12_x13_x14_x15_x16_x17_x18_x19 $x14_x15_x16_x17_x18_x19_x20_x21 $x16_x17_x18_x19_x20_x21_x22_x23 $x18_x19_x20_x21_x22_x23_x24_x25 $w30_wzr $w0_w1 $w2_w3 $w4_w5 $w6_w7 $w8_w9 $w10_w11 $w12_w13 $w14_w15 $w16_w17 $w18_w19 $lr_xzr $x0_x1 $x2_x3 $x4_x5 $x6_x7 $x8_x9 $x10_x11 $x12_x13 $x14_x15 $x16_x17 $x18_x19 $z0_z1 $z1_z2 $z2_z3 $z3_z4 $z4_z5 $z5_z6 $z6_z7 $z7_z8 $z8_z9 $z9_z10 $z10_z11 $z11_z12 $z12_z13 $z13_z14 $z14_z15 $z15_z16 $z16_z17 $z17_z18 $z18_z19 $z19_z20 $z20_z21 $z21_z22 $z22_z23 $z23_z24 $z24_z25 $z25_z26 $z26_z27 $z27_z28 $z28_z29 $z29_z30 $z30_z31 $z31_z0 $z0_z1_z2_z3 $z1_z2_z3_z4 $z2_z3_z4_z5 $z3_z4_z5_z6 $z4_z5_z6_z7 $z5_z6_z7_z8 $z6_z7_z8_z9 $z7_z8_z9_z10 $z8_z9_z10_z11 $z9_z10_z11_z12 $z10_z11_z12_z13 $z11_z12_z13_z14 $z12_z13_z14_z15 $z13_z14_z15_z16 $z14_z15_z16_z17 $z15_z16_z17_z18 $z16_z17_z18_z19 $z17_z18_z19_z20 $z18_z19_z20_z21 $z19_z20_z21_z22 $z20_z21_z22_z23 $z21_z22_z23_z24 $z22_z23_z24_z25 $z23_z24_z25_z26 $z24_z25_z26_z27 $z25_z26_z27_z28 $z26_z27_z28_z29 $z27_z28_z29_z30 $z28_z29_z30_z31 $z29_z30_z31_z0 $z30_z31_z0_z1 $z31_z0_z1_z2 $z0_z1_z2 $z1_z2_z3 $z2_z3_z4 $z3_z4_z5 $z4_z5_z6 $z5_z6_z7 $z6_z7_z8 $z7_z8_z9 $z8_z9_z10 $z9_z10_z11 $z10_z11_z12 $z11_z12_z13 $z12_z13_z14 $z13_z14_z15 $z14_z15_z16 $z15_z16_z17 $z16_z17_z18 $z17_z18_z19 $z18_z19_z20 $z19_z20_z21 $z20_z21_z22 $z21_z22_z23 $z22_z23_z24 $z23_z24_z25 $z24_z25_z26 $z25_z26_z27 $z26_z27_z28 $z27_z28_z29 $z28_z29_z30 $z29_z30_z31 $z30_z31_z0 $z31_z0_z1 $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 $z16_z20_z24_z28 $z17_z21_z25_z29 $z18_z22_z26_z30 $z19_z23_z27_z31 $z0_z4_z8_z12 $z1_z5_z9_z13 $z2_z6_z10_z14 $z3_z7_z11_z15 +; CHECK: baz Clobbered Registers: $ffr $fpcr $nzcv $sp $vg $wsp $za $b0 $b1 $b2 $b3 $b4 $b5 $b6 $b7 $b16 $b17 $b18 $b19 $b20 $b21 $b22 $b23 $b24 $b25 $b26 $b27 $b28 $b29 $b30 $b31 $d0 $d1 $d2 $d3 $d4 $d5 $d6 $d7 $d16 $d17 $d18 $d19 $d20 $d21 $d22 $d23 $d24 $d25 $d26 $d27 $d28 $d29 $d30 $d31 $h0 $h1 $h2 $h3 $h4 $h5 $h6 $h7 $h16 $h17 $h18 $h19 $h20 $h21 $h22 $h23 $h24 $h25 $h26 $h27 $h28 $h29 $h30 $h31 $p0 $p1 $p2 $p3 $p4 $p5 $p6 $p7 $p8 $p9 $p10 $p11 $p12 $p13 $p14 $p15 $pn0 $pn1 $pn2 $pn3 $pn4 $pn5 $pn6 $pn7 $pn8 $pn9 $pn10 $pn11 $pn12 $pn13 $pn14 $pn15 $q0 $q1 $q2 $q3 $q4 $q5 $q6 $q7 $q8 $q9 $q10 $q11 $q12 $q13 $q14 $q15 $q16 $q17 $q18 $q19 $q20 $q21 $q22 $q23 $q24 $q25 $q26 $q27 $q28 $q29 $q30 $q31 $s0 $s1 $s2 $s3 $s4 $s5 $s6 $s7 $s16 $s17 $s18 $s19 $s20 $s21 $s22 $s23 $s24 $s25 $s26 $s27 $s28 $s29 $s30 $s31 $w0 $w1 $w2 $w3 $w4 $w5 $w6 $w7 $w8 $w16 $w17 $w18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x8 $x16 $x17 $x18 $z0 $z1 $z2 $z3 $z4 $z5 $z6 $z7 $z8 $z9 $z10 $z11 $z12 $z13 $z14 $z15 $z16 $z17 $z18 $z19 $z20 $z21 $z22 $z23 $z24 $z25 $z26 $z27 $z28 $z29 $z30 $z31 $zab0 $zad0 $zad1 $zad2 $zad3 $zad4 $zad5 $zad6 $zad7 $zah0 $zah1 $zaq0 $zaq1 $zaq2 $zaq3 $zaq4 $zaq5 $zaq6 $zaq7 $zaq8 $zaq9 $zaq10 $zaq11 $zaq12 $zaq13 $zaq14 $zaq15 $zas0 $zas1 $zas2 $zas3 $zt0 $z0_hi $z1_hi $z2_hi $z3_hi $z4_hi $z5_hi $z6_hi $z7_hi $z8_hi $z9_hi 
$z10_hi $z11_hi $z12_hi $z13_hi $z14_hi $z15_hi $z16_hi $z17_hi $z18_hi $z19_hi $z20_hi $z21_hi $z22_hi $z23_hi $z24_hi $z25_hi $z26_hi $z27_hi $z28_hi $z29_hi $z30_hi $z31_hi $d0_d1 $d1_d2 $d2_d3 $d3_d4 $d4_d5 $d5_d6 $d6_d7 $d7_d8 $d15_d16 $d16_d17 $d17_d18 $d18_d19 $d19_d20 $d20_d21 $d21_d22 $d22_d23 $d23_d24 $d24_d25 $d25_d26 $d26_d27 $d27_d28 $d28_d29 $d29_d30 $d30_d31 $d31_d0 $d0_d1_d2_d3 $d1_d2_d3_d4 $d2_d3_d4_d5 $d3_d4_d5_d6 $d4_d5_d6_d7 $d5_d6_d7_d8 $d6_d7_d8_d9 $d7_d8_d9_d10 $d13_d14_d15_d16 $d14_d15_d16_d17 $d15_d16_d17_d18 $d16_d17_d18_d19 $d17_d18_d19_d20 $d18_d19_d20_d21 $d19_d20_d21_d22 $d20_d21_d22_d23 $d21_d22_d23_d24 $d22_d23_d24_d25 $d23_d24_d25_d26 $d24_d25_d26_d27 $d25_d26_d27_d28 $d26_d27_d28_d29 $d27_d28_d29_d30 $d28_d29_d30_d31 $d29_d30_d31_d0 $d30_d31_d0_d1 $d31_d0_d1_d2 $d0_d1_d2 $d1_d2_d3 $d2_d3_d4 $d3_d4_d5 $d4_d5_d6 $d5_d6_d7 $d6_d7_d8 $d7_d8_d9 $d14_d15_d16 $d15_d16_d17 $d16_d17_d18 $d17_d18_d19 $d18_d19_d20 $d19_d20_d21 $d20_d21_d22 $d21_d22_d23 $d22_d23_d24 $d23_d24_d25 $d24_d25_d26 $d25_d26_d27 $d26_d27_d28 $d27_d28_d29 $d28_d29_d30 $d29_d30_d31 $d30_d31_d0 $d31_d0_d1 $p0_p1 $p1_p2 $p2_p3 $p3_p4 $p4_p5 $p5_p6 $p6_p7 $p7_p8 $p8_p9 $p9_p10 $p10_p11 $p11_p12 $p12_p13 $p13_p14 $p14_p15 $p15_p0 $q0_q1 $q1_q2 $q2_q3 $q3_q4 $q4_q5 $q5_q6 $q6_q7 $q7_q8 $q8_q9 $q9_q10 $q10_q11 $q11_q12 $q12_q13 $q13_q14 $q14_q15 $q15_q16 $q16_q17 $q17_q18 $q18_q19 $q19_q20 $q20_q21 $q21_q22 $q22_q23 $q23_q24 $q24_q25 $q25_q26 $q26_q27 $q27_q28 $q28_q29 $q29_q30 $q30_q31 $q31_q0 $q0_q1_q2_q3 $q1_q2_q3_q4 $q2_q3_q4_q5 $q3_q4_q5_q6 $q4_q5_q6_q7 $q5_q6_q7_q8 $q6_q7_q8_q9 $q7_q8_q9_q10 $q8_q9_q10_q11 $q9_q10_q11_q12 $q10_q11_q12_q13 $q11_q12_q13_q14 $q12_q13_q14_q15 $q13_q14_q15_q16 $q14_q15_q16_q17 $q15_q16_q17_q18 $q16_q17_q18_q19 $q17_q18_q19_q20 $q18_q19_q20_q21 $q19_q20_q21_q22 $q20_q21_q22_q23 $q21_q22_q23_q24 $q22_q23_q24_q25 $q23_q24_q25_q26 $q24_q25_q26_q27 $q25_q26_q27_q28 $q26_q27_q28_q29 $q27_q28_q29_q30 $q28_q29_q30_q31 $q29_q30_q31_q0 $q30_q31_q0_q1 $q31_q0_q1_q2 $q0_q1_q2 $q1_q2_q3 $q2_q3_q4 $q3_q4_q5 $q4_q5_q6 $q5_q6_q7 $q6_q7_q8 $q7_q8_q9 $q8_q9_q10 $q9_q10_q11 $q10_q11_q12 $q11_q12_q13 $q12_q13_q14 $q13_q14_q15 $q14_q15_q16 $q15_q16_q17 $q16_q17_q18 $q17_q18_q19 $q18_q19_q20 $q19_q20_q21 $q20_q21_q22 $q21_q22_q23 $q22_q23_q24 $q23_q24_q25 $q24_q25_q26 $q25_q26_q27 $q26_q27_q28 $q27_q28_q29 $q28_q29_q30 $q29_q30_q31 $q30_q31_q0 $q31_q0_q1 $x0_x1_x2_x3_x4_x5_x6_x7 $x2_x3_x4_x5_x6_x7_x8_x9 $x4_x5_x6_x7_x8_x9_x10_x11 $x6_x7_x8_x9_x10_x11_x12_x13 $x8_x9_x10_x11_x12_x13_x14_x15 $x10_x11_x12_x13_x14_x15_x16_x17 $x12_x13_x14_x15_x16_x17_x18_x19 $x14_x15_x16_x17_x18_x19_x20_x21 $x16_x17_x18_x19_x20_x21_x22_x23 $x18_x19_x20_x21_x22_x23_x24_x25 $w30_wzr $w0_w1 $w2_w3 $w4_w5 $w6_w7 $w8_w9 $w10_w11 $w12_w13 $w14_w15 $w16_w17 $w18_w19 $lr_xzr $x0_x1 $x2_x3 $x4_x5 $x6_x7 $x8_x9 $x10_x11 $x12_x13 $x14_x15 $x16_x17 $x18_x19 $z0_z1 $z1_z2 $z2_z3 $z3_z4 $z4_z5 $z5_z6 $z6_z7 $z7_z8 $z8_z9 $z9_z10 $z10_z11 $z11_z12 $z12_z13 $z13_z14 $z14_z15 $z15_z16 $z16_z17 $z17_z18 $z18_z19 $z19_z20 $z20_z21 $z21_z22 $z22_z23 $z23_z24 $z24_z25 $z25_z26 $z26_z27 $z27_z28 $z28_z29 $z29_z30 $z30_z31 $z31_z0 $z0_z1_z2_z3 $z1_z2_z3_z4 $z2_z3_z4_z5 $z3_z4_z5_z6 $z4_z5_z6_z7 $z5_z6_z7_z8 $z6_z7_z8_z9 $z7_z8_z9_z10 $z8_z9_z10_z11 $z9_z10_z11_z12 $z10_z11_z12_z13 $z11_z12_z13_z14 $z12_z13_z14_z15 $z13_z14_z15_z16 $z14_z15_z16_z17 $z15_z16_z17_z18 $z16_z17_z18_z19 $z17_z18_z19_z20 $z18_z19_z20_z21 $z19_z20_z21_z22 $z20_z21_z22_z23 $z21_z22_z23_z24 $z22_z23_z24_z25 $z23_z24_z25_z26 $z24_z25_z26_z27 $z25_z26_z27_z28 
$z26_z27_z28_z29 $z27_z28_z29_z30 $z28_z29_z30_z31 $z29_z30_z31_z0 $z30_z31_z0_z1 $z31_z0_z1_z2 $z0_z1_z2 $z1_z2_z3 $z2_z3_z4 $z3_z4_z5 $z4_z5_z6 $z5_z6_z7 $z6_z7_z8 $z7_z8_z9 $z8_z9_z10 $z9_z10_z11 $z10_z11_z12 $z11_z12_z13 $z12_z13_z14 $z13_z14_z15 $z14_z15_z16 $z15_z16_z17 $z16_z17_z18 $z17_z18_z19 $z18_z19_z20 $z19_z20_z21 $z20_z21_z22 $z21_z22_z23 $z22_z23_z24 $z23_z24_z25 $z24_z25_z26 $z25_z26_z27 $z26_z27_z28 $z27_z28_z29 $z28_z29_z30 $z29_z30_z31 $z30_z31_z0 $z31_z0_z1 $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 $z16_z20_z24_z28 $z17_z21_z25_z29 $z18_z22_z26_z30 $z19_z23_z27_z31 $z0_z4_z8_z12 $z1_z5_z9_z13 $z2_z6_z10_z14 $z3_z7_z11_z15 call void @bar1() call void @bar2() ret void } define preserve_allcc void @foo() #0 { -; CHECK: foo Clobbered Registers: $ffr $fpcr $nzcv $sp $vg $wsp $za $b0 $b1 $b2 $b3 $b4 $b5 $b6 $b7 $d0 $d1 $d2 $d3 $d4 $d5 $d6 $d7 $h0 $h1 $h2 $h3 $h4 $h5 $h6 $h7 $p0 $p1 $p2 $p3 $p4 $p5 $p6 $p7 $p8 $p9 $p10 $p11 $p12 $p13 $p14 $p15 $q0 $q1 $q2 $q3 $q4 $q5 $q6 $q7 $s0 $s1 $s2 $s3 $s4 $s5 $s6 $s7 $w0 $w1 $w2 $w3 $w4 $w5 $w6 $w7 $w8 $w16 $w17 $w18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x8 $x16 $x17 $x18 $z0 $z1 $z2 $z3 $z4 $z5 $z6 $z7 $z8 $z9 $z10 $z11 $z12 $z13 $z14 $z15 $z16 $z17 $z18 $z19 $z20 $z21 $z22 $z23 $z24 $z25 $z26 $z27 $z28 $z29 $z30 $z31 $zab0 $zad0 $zad1 $zad2 $zad3 $zad4 $zad5 $zad6 $zad7 $zah0 $zah1 $zaq0 $zaq1 $zaq2 $zaq3 $zaq4 $zaq5 $zaq6 $zaq7 $zaq8 $zaq9 $zaq10 $zaq11 $zaq12 $zaq13 $zaq14 $zaq15 $zas0 $zas1 $zas2 $zas3 $zt0 $z0_hi $z1_hi $z2_hi $z3_hi $z4_hi $z5_hi $z6_hi $z7_hi $z8_hi $z9_hi $z10_hi $z11_hi $z12_hi $z13_hi $z14_hi $z15_hi $z16_hi $z17_hi $z18_hi $z19_hi $z20_hi $z21_hi $z22_hi $z23_hi $z24_hi $z25_hi $z26_hi $z27_hi $z28_hi $z29_hi $z30_hi $z31_hi $d0_d1 $d1_d2 $d2_d3 $d3_d4 $d4_d5 $d5_d6 $d6_d7 $d7_d8 $d15_d16 $d16_d17 $d17_d18 $d18_d19 $d19_d20 $d20_d21 $d21_d22 $d22_d23 $d23_d24 $d24_d25 $d25_d26 $d26_d27 $d27_d28 $d28_d29 $d29_d30 $d30_d31 $d31_d0 $d0_d1_d2_d3 $d1_d2_d3_d4 $d2_d3_d4_d5 $d3_d4_d5_d6 $d4_d5_d6_d7 $d5_d6_d7_d8 $d6_d7_d8_d9 $d7_d8_d9_d10 $d13_d14_d15_d16 $d14_d15_d16_d17 $d15_d16_d17_d18 $d16_d17_d18_d19 $d17_d18_d19_d20 $d18_d19_d20_d21 $d19_d20_d21_d22 $d20_d21_d22_d23 $d21_d22_d23_d24 $d22_d23_d24_d25 $d23_d24_d25_d26 $d24_d25_d26_d27 $d25_d26_d27_d28 $d26_d27_d28_d29 $d27_d28_d29_d30 $d28_d29_d30_d31 $d29_d30_d31_d0 $d30_d31_d0_d1 $d31_d0_d1_d2 $d0_d1_d2 $d1_d2_d3 $d2_d3_d4 $d3_d4_d5 $d4_d5_d6 $d5_d6_d7 $d6_d7_d8 $d7_d8_d9 $d14_d15_d16 $d15_d16_d17 $d16_d17_d18 $d17_d18_d19 $d18_d19_d20 $d19_d20_d21 $d20_d21_d22 $d21_d22_d23 $d22_d23_d24 $d23_d24_d25 $d24_d25_d26 $d25_d26_d27 $d26_d27_d28 $d27_d28_d29 $d28_d29_d30 $d29_d30_d31 $d30_d31_d0 $d31_d0_d1 $p0_p1 $p1_p2 $p2_p3 $p3_p4 $p4_p5 $p5_p6 $p6_p7 $p7_p8 $p8_p9 $p9_p10 $p10_p11 $p11_p12 $p12_p13 $p13_p14 $p14_p15 $p15_p0 $q0_q1 $q1_q2 $q2_q3 $q3_q4 $q4_q5 $q5_q6 $q6_q7 $q7_q8 $q8_q9 $q9_q10 $q10_q11 $q11_q12 $q12_q13 $q13_q14 $q14_q15 $q15_q16 $q16_q17 $q17_q18 $q18_q19 $q19_q20 $q20_q21 $q21_q22 $q22_q23 $q23_q24 $q24_q25 $q25_q26 $q26_q27 $q27_q28 $q28_q29 $q29_q30 $q30_q31 $q31_q0 $q0_q1_q2_q3 $q1_q2_q3_q4 $q2_q3_q4_q5 $q3_q4_q5_q6 $q4_q5_q6_q7 $q5_q6_q7_q8 $q6_q7_q8_q9 $q7_q8_q9_q10 $q8_q9_q10_q11 $q9_q10_q11_q12 $q10_q11_q12_q13 $q11_q12_q13_q14 $q12_q13_q14_q15 $q13_q14_q15_q16 $q14_q15_q16_q17 $q15_q16_q17_q18 $q16_q17_q18_q19 $q17_q18_q19_q20 $q18_q19_q20_q21 $q19_q20_q21_q22 $q20_q21_q22_q23 $q21_q22_q23_q24 $q22_q23_q24_q25 $q23_q24_q25_q26 $q24_q25_q26_q27 
$q25_q26_q27_q28 $q26_q27_q28_q29 $q27_q28_q29_q30 $q28_q29_q30_q31 $q29_q30_q31_q0 $q30_q31_q0_q1 $q31_q0_q1_q2 $q0_q1_q2 $q1_q2_q3 $q2_q3_q4 $q3_q4_q5 $q4_q5_q6 $q5_q6_q7 $q6_q7_q8 $q7_q8_q9 $q8_q9_q10 $q9_q10_q11 $q10_q11_q12 $q11_q12_q13 $q12_q13_q14 $q13_q14_q15 $q14_q15_q16 $q15_q16_q17 $q16_q17_q18 $q17_q18_q19 $q18_q19_q20 $q19_q20_q21 $q20_q21_q22 $q21_q22_q23 $q22_q23_q24 $q23_q24_q25 $q24_q25_q26 $q25_q26_q27 $q26_q27_q28 $q27_q28_q29 $q28_q29_q30 $q29_q30_q31 $q30_q31_q0 $q31_q0_q1 $x0_x1_x2_x3_x4_x5_x6_x7 $x2_x3_x4_x5_x6_x7_x8_x9 $x4_x5_x6_x7_x8_x9_x10_x11 $x6_x7_x8_x9_x10_x11_x12_x13 $x8_x9_x10_x11_x12_x13_x14_x15 $x10_x11_x12_x13_x14_x15_x16_x17 $x12_x13_x14_x15_x16_x17_x18_x19 $x14_x15_x16_x17_x18_x19_x20_x21 $x16_x17_x18_x19_x20_x21_x22_x23 $x18_x19_x20_x21_x22_x23_x24_x25 $w30_wzr $w0_w1 $w2_w3 $w4_w5 $w6_w7 $w8_w9 $w10_w11 $w12_w13 $w14_w15 $w16_w17 $w18_w19 $lr_xzr $x0_x1 $x2_x3 $x4_x5 $x6_x7 $x8_x9 $x10_x11 $x12_x13 $x14_x15 $x16_x17 $x18_x19 $z0_z1 $z1_z2 $z2_z3 $z3_z4 $z4_z5 $z5_z6 $z6_z7 $z7_z8 $z8_z9 $z9_z10 $z10_z11 $z11_z12 $z12_z13 $z13_z14 $z14_z15 $z15_z16 $z16_z17 $z17_z18 $z18_z19 $z19_z20 $z20_z21 $z21_z22 $z22_z23 $z23_z24 $z24_z25 $z25_z26 $z26_z27 $z27_z28 $z28_z29 $z29_z30 $z30_z31 $z31_z0 $z0_z1_z2_z3 $z1_z2_z3_z4 $z2_z3_z4_z5 $z3_z4_z5_z6 $z4_z5_z6_z7 $z5_z6_z7_z8 $z6_z7_z8_z9 $z7_z8_z9_z10 $z8_z9_z10_z11 $z9_z10_z11_z12 $z10_z11_z12_z13 $z11_z12_z13_z14 $z12_z13_z14_z15 $z13_z14_z15_z16 $z14_z15_z16_z17 $z15_z16_z17_z18 $z16_z17_z18_z19 $z17_z18_z19_z20 $z18_z19_z20_z21 $z19_z20_z21_z22 $z20_z21_z22_z23 $z21_z22_z23_z24 $z22_z23_z24_z25 $z23_z24_z25_z26 $z24_z25_z26_z27 $z25_z26_z27_z28 $z26_z27_z28_z29 $z27_z28_z29_z30 $z28_z29_z30_z31 $z29_z30_z31_z0 $z30_z31_z0_z1 $z31_z0_z1_z2 $z0_z1_z2 $z1_z2_z3 $z2_z3_z4 $z3_z4_z5 $z4_z5_z6 $z5_z6_z7 $z6_z7_z8 $z7_z8_z9 $z8_z9_z10 $z9_z10_z11 $z10_z11_z12 $z11_z12_z13 $z12_z13_z14 $z13_z14_z15 $z14_z15_z16 $z15_z16_z17 $z16_z17_z18 $z17_z18_z19 $z18_z19_z20 $z19_z20_z21 $z20_z21_z22 $z21_z22_z23 $z22_z23_z24 $z23_z24_z25 $z24_z25_z26 $z25_z26_z27 $z26_z27_z28 $z27_z28_z29 $z28_z29_z30 $z29_z30_z31 $z30_z31_z0 $z31_z0_z1 $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 $z16_z20_z24_z28 $z17_z21_z25_z29 $z18_z22_z26_z30 $z19_z23_z27_z31 $z0_z4_z8_z12 $z1_z5_z9_z13 $z2_z6_z10_z14 $z3_z7_z11_z15 +; CHECK: foo Clobbered Registers: $ffr $fpcr $nzcv $sp $vg $wsp $za $b0 $b1 $b2 $b3 $b4 $b5 $b6 $b7 $d0 $d1 $d2 $d3 $d4 $d5 $d6 $d7 $h0 $h1 $h2 $h3 $h4 $h5 $h6 $h7 $p0 $p1 $p2 $p3 $p4 $p5 $p6 $p7 $p8 $p9 $p10 $p11 $p12 $p13 $p14 $p15 $pn0 $pn1 $pn2 $pn3 $pn4 $pn5 $pn6 $pn7 $pn8 $pn9 $pn10 $pn11 $pn12 $pn13 $pn14 $pn15 $q0 $q1 $q2 $q3 $q4 $q5 $q6 $q7 $s0 $s1 $s2 $s3 $s4 $s5 $s6 $s7 $w0 $w1 $w2 $w3 $w4 $w5 $w6 $w7 $w8 $w16 $w17 $w18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x8 $x16 $x17 $x18 $z0 $z1 $z2 $z3 $z4 $z5 $z6 $z7 $z8 $z9 $z10 $z11 $z12 $z13 $z14 $z15 $z16 $z17 $z18 $z19 $z20 $z21 $z22 $z23 $z24 $z25 $z26 $z27 $z28 $z29 $z30 $z31 $zab0 $zad0 $zad1 $zad2 $zad3 $zad4 $zad5 $zad6 $zad7 $zah0 $zah1 $zaq0 $zaq1 $zaq2 $zaq3 $zaq4 $zaq5 $zaq6 $zaq7 $zaq8 $zaq9 $zaq10 $zaq11 $zaq12 $zaq13 $zaq14 $zaq15 $zas0 $zas1 $zas2 $zas3 $zt0 $z0_hi $z1_hi $z2_hi $z3_hi $z4_hi $z5_hi $z6_hi $z7_hi $z8_hi $z9_hi $z10_hi $z11_hi $z12_hi $z13_hi $z14_hi $z15_hi $z16_hi $z17_hi $z18_hi $z19_hi $z20_hi $z21_hi $z22_hi $z23_hi $z24_hi $z25_hi $z26_hi $z27_hi $z28_hi $z29_hi $z30_hi $z31_hi $d0_d1 $d1_d2 $d2_d3 $d3_d4 $d4_d5 $d5_d6 $d6_d7 $d7_d8 $d15_d16 $d16_d17 
$d17_d18 $d18_d19 $d19_d20 $d20_d21 $d21_d22 $d22_d23 $d23_d24 $d24_d25 $d25_d26 $d26_d27 $d27_d28 $d28_d29 $d29_d30 $d30_d31 $d31_d0 $d0_d1_d2_d3 $d1_d2_d3_d4 $d2_d3_d4_d5 $d3_d4_d5_d6 $d4_d5_d6_d7 $d5_d6_d7_d8 $d6_d7_d8_d9 $d7_d8_d9_d10 $d13_d14_d15_d16 $d14_d15_d16_d17 $d15_d16_d17_d18 $d16_d17_d18_d19 $d17_d18_d19_d20 $d18_d19_d20_d21 $d19_d20_d21_d22 $d20_d21_d22_d23 $d21_d22_d23_d24 $d22_d23_d24_d25 $d23_d24_d25_d26 $d24_d25_d26_d27 $d25_d26_d27_d28 $d26_d27_d28_d29 $d27_d28_d29_d30 $d28_d29_d30_d31 $d29_d30_d31_d0 $d30_d31_d0_d1 $d31_d0_d1_d2 $d0_d1_d2 $d1_d2_d3 $d2_d3_d4 $d3_d4_d5 $d4_d5_d6 $d5_d6_d7 $d6_d7_d8 $d7_d8_d9 $d14_d15_d16 $d15_d16_d17 $d16_d17_d18 $d17_d18_d19 $d18_d19_d20 $d19_d20_d21 $d20_d21_d22 $d21_d22_d23 $d22_d23_d24 $d23_d24_d25 $d24_d25_d26 $d25_d26_d27 $d26_d27_d28 $d27_d28_d29 $d28_d29_d30 $d29_d30_d31 $d30_d31_d0 $d31_d0_d1 $p0_p1 $p1_p2 $p2_p3 $p3_p4 $p4_p5 $p5_p6 $p6_p7 $p7_p8 $p8_p9 $p9_p10 $p10_p11 $p11_p12 $p12_p13 $p13_p14 $p14_p15 $p15_p0 $q0_q1 $q1_q2 $q2_q3 $q3_q4 $q4_q5 $q5_q6 $q6_q7 $q7_q8 $q8_q9 $q9_q10 $q10_q11 $q11_q12 $q12_q13 $q13_q14 $q14_q15 $q15_q16 $q16_q17 $q17_q18 $q18_q19 $q19_q20 $q20_q21 $q21_q22 $q22_q23 $q23_q24 $q24_q25 $q25_q26 $q26_q27 $q27_q28 $q28_q29 $q29_q30 $q30_q31 $q31_q0 $q0_q1_q2_q3 $q1_q2_q3_q4 $q2_q3_q4_q5 $q3_q4_q5_q6 $q4_q5_q6_q7 $q5_q6_q7_q8 $q6_q7_q8_q9 $q7_q8_q9_q10 $q8_q9_q10_q11 $q9_q10_q11_q12 $q10_q11_q12_q13 $q11_q12_q13_q14 $q12_q13_q14_q15 $q13_q14_q15_q16 $q14_q15_q16_q17 $q15_q16_q17_q18 $q16_q17_q18_q19 $q17_q18_q19_q20 $q18_q19_q20_q21 $q19_q20_q21_q22 $q20_q21_q22_q23 $q21_q22_q23_q24 $q22_q23_q24_q25 $q23_q24_q25_q26 $q24_q25_q26_q27 $q25_q26_q27_q28 $q26_q27_q28_q29 $q27_q28_q29_q30 $q28_q29_q30_q31 $q29_q30_q31_q0 $q30_q31_q0_q1 $q31_q0_q1_q2 $q0_q1_q2 $q1_q2_q3 $q2_q3_q4 $q3_q4_q5 $q4_q5_q6 $q5_q6_q7 $q6_q7_q8 $q7_q8_q9 $q8_q9_q10 $q9_q10_q11 $q10_q11_q12 $q11_q12_q13 $q12_q13_q14 $q13_q14_q15 $q14_q15_q16 $q15_q16_q17 $q16_q17_q18 $q17_q18_q19 $q18_q19_q20 $q19_q20_q21 $q20_q21_q22 $q21_q22_q23 $q22_q23_q24 $q23_q24_q25 $q24_q25_q26 $q25_q26_q27 $q26_q27_q28 $q27_q28_q29 $q28_q29_q30 $q29_q30_q31 $q30_q31_q0 $q31_q0_q1 $x0_x1_x2_x3_x4_x5_x6_x7 $x2_x3_x4_x5_x6_x7_x8_x9 $x4_x5_x6_x7_x8_x9_x10_x11 $x6_x7_x8_x9_x10_x11_x12_x13 $x8_x9_x10_x11_x12_x13_x14_x15 $x10_x11_x12_x13_x14_x15_x16_x17 $x12_x13_x14_x15_x16_x17_x18_x19 $x14_x15_x16_x17_x18_x19_x20_x21 $x16_x17_x18_x19_x20_x21_x22_x23 $x18_x19_x20_x21_x22_x23_x24_x25 $w30_wzr $w0_w1 $w2_w3 $w4_w5 $w6_w7 $w8_w9 $w10_w11 $w12_w13 $w14_w15 $w16_w17 $w18_w19 $lr_xzr $x0_x1 $x2_x3 $x4_x5 $x6_x7 $x8_x9 $x10_x11 $x12_x13 $x14_x15 $x16_x17 $x18_x19 $z0_z1 $z1_z2 $z2_z3 $z3_z4 $z4_z5 $z5_z6 $z6_z7 $z7_z8 $z8_z9 $z9_z10 $z10_z11 $z11_z12 $z12_z13 $z13_z14 $z14_z15 $z15_z16 $z16_z17 $z17_z18 $z18_z19 $z19_z20 $z20_z21 $z21_z22 $z22_z23 $z23_z24 $z24_z25 $z25_z26 $z26_z27 $z27_z28 $z28_z29 $z29_z30 $z30_z31 $z31_z0 $z0_z1_z2_z3 $z1_z2_z3_z4 $z2_z3_z4_z5 $z3_z4_z5_z6 $z4_z5_z6_z7 $z5_z6_z7_z8 $z6_z7_z8_z9 $z7_z8_z9_z10 $z8_z9_z10_z11 $z9_z10_z11_z12 $z10_z11_z12_z13 $z11_z12_z13_z14 $z12_z13_z14_z15 $z13_z14_z15_z16 $z14_z15_z16_z17 $z15_z16_z17_z18 $z16_z17_z18_z19 $z17_z18_z19_z20 $z18_z19_z20_z21 $z19_z20_z21_z22 $z20_z21_z22_z23 $z21_z22_z23_z24 $z22_z23_z24_z25 $z23_z24_z25_z26 $z24_z25_z26_z27 $z25_z26_z27_z28 $z26_z27_z28_z29 $z27_z28_z29_z30 $z28_z29_z30_z31 $z29_z30_z31_z0 $z30_z31_z0_z1 $z31_z0_z1_z2 $z0_z1_z2 $z1_z2_z3 $z2_z3_z4 $z3_z4_z5 $z4_z5_z6 $z5_z6_z7 $z6_z7_z8 $z7_z8_z9 $z8_z9_z10 $z9_z10_z11 $z10_z11_z12 $z11_z12_z13 $z12_z13_z14 $z13_z14_z15 
$z14_z15_z16 $z15_z16_z17 $z16_z17_z18 $z17_z18_z19 $z18_z19_z20 $z19_z20_z21 $z20_z21_z22 $z21_z22_z23 $z22_z23_z24 $z23_z24_z25 $z24_z25_z26 $z25_z26_z27 $z26_z27_z28 $z27_z28_z29 $z28_z29_z30 $z29_z30_z31 $z30_z31_z0 $z31_z0_z1 $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 $z16_z20_z24_z28 $z17_z21_z25_z29 $z18_z22_z26_z30 $z19_z23_z27_z31 $z0_z4_z8_z12 $z1_z5_z9_z13 $z2_z6_z10_z14 $z3_z7_z11_z15 call void @bar1() call void @bar2() ret void diff --git a/llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll b/llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll new file mode 100644 index 0000000000000000000000000000000000000000..cd5046a9a64752cea8c8f2c70e4e7d3645685bc0 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll @@ -0,0 +1,120 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mattr=+sme -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK-COALESCER-BARRIER +; RUN: llc -mattr=+sme -stop-after=virtregrewriter < %s | FileCheck %s --check-prefix=CHECK-REGALLOC + +target triple = "aarch64" + +define void @dont_coalesce_args(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind { + ; CHECK-COALESCER-BARRIER-LABEL: name: dont_coalesce_args + ; CHECK-COALESCER-BARRIER: bb.0 (%ir-block.0): + ; CHECK-COALESCER-BARRIER-NEXT: liveins: $q0 + ; CHECK-COALESCER-BARRIER-NEXT: {{ $}} + ; CHECK-COALESCER-BARRIER-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK-COALESCER-BARRIER-NEXT: [[COALESCER_BARRIER_FPR128_:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COPY]] + ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-COALESCER-BARRIER-NEXT: [[DEF:%[0-9]+]]:zpr = IMPLICIT_DEF + ; CHECK-COALESCER-BARRIER-NEXT: [[INSERT_SUBREG:%[0-9]+]]:zpr = INSERT_SUBREG [[DEF]], [[COALESCER_BARRIER_FPR128_]], %subreg.zsub + ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-COALESCER-BARRIER-NEXT: $z0 = COPY [[INSERT_SUBREG]] + ; CHECK-COALESCER-BARRIER-NEXT: BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-COALESCER-BARRIER-NEXT: RET_ReallyLR + ; + ; CHECK-REGALLOC-LABEL: name: dont_coalesce_args + ; CHECK-REGALLOC: bb.0 (%ir-block.0): + ; CHECK-REGALLOC-NEXT: liveins: $q0 + ; CHECK-REGALLOC-NEXT: {{ $}} + ; CHECK-REGALLOC-NEXT: renamable $q0 = COALESCER_BARRIER_FPR128 killed renamable $q0 + ; CHECK-REGALLOC-NEXT: STRQui killed renamable $q0, %stack.0, 0 :: (store (s128) into %stack.0) + ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg + ; CHECK-REGALLOC-NEXT: renamable $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0) + ; CHECK-REGALLOC-NEXT: renamable $q0 = KILL killed renamable $q0, implicit-def $z0 + ; CHECK-REGALLOC-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-REGALLOC-NEXT: BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp + ; CHECK-REGALLOC-NEXT: ADJCALLSTACKUP 0, 0, 
implicit-def dead $sp, implicit $sp
+  ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-REGALLOC-NEXT: RET_ReallyLR
+  %sa = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> %a, i64 0)
+  call void @scalable_args(<vscale x 2 x i64> %sa)
+  ret void
+}
+
+define <2 x i64> @dont_coalesce_res() "aarch64_pstate_sm_body" nounwind {
+  ; CHECK-COALESCER-BARRIER-LABEL: name: dont_coalesce_res
+  ; CHECK-COALESCER-BARRIER: bb.0 (%ir-block.0):
+  ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-COALESCER-BARRIER-NEXT: BL @scalable_res, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0
+  ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-COALESCER-BARRIER-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z0
+  ; CHECK-COALESCER-BARRIER-NEXT: [[COPY1:%[0-9]+]]:fpr128 = COPY [[COPY]].zsub
+  ; CHECK-COALESCER-BARRIER-NEXT: [[COALESCER_BARRIER_FPR128_:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COPY1]]
+  ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $q0, implicit $vg, implicit-def $vg
+  ; CHECK-COALESCER-BARRIER-NEXT: $q0 = COPY [[COALESCER_BARRIER_FPR128_]]
+  ; CHECK-COALESCER-BARRIER-NEXT: RET_ReallyLR implicit $q0
+  ;
+  ; CHECK-REGALLOC-LABEL: name: dont_coalesce_res
+  ; CHECK-REGALLOC: bb.0 (%ir-block.0):
+  ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-REGALLOC-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-REGALLOC-NEXT: BL @scalable_res, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0
+  ; CHECK-REGALLOC-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-REGALLOC-NEXT: renamable $q0 = KILL renamable $q0, implicit killed $z0
+  ; CHECK-REGALLOC-NEXT: renamable $q0 = COALESCER_BARRIER_FPR128 killed renamable $q0
+  ; CHECK-REGALLOC-NEXT: STRQui killed renamable $q0, %stack.0, 0 :: (store (s128) into %stack.0)
+  ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def dead $q0, implicit $vg, implicit-def $vg
+  ; CHECK-REGALLOC-NEXT: $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0)
+  ; CHECK-REGALLOC-NEXT: RET_ReallyLR implicit $q0
+  %sa = call <vscale x 2 x i64> @scalable_res()
+  %res = call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %sa, i64 0)
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @dont_coalesce_arg_that_is_also_res(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
+  ; CHECK-COALESCER-BARRIER-LABEL: name: dont_coalesce_arg_that_is_also_res
+  ; CHECK-COALESCER-BARRIER: bb.0 (%ir-block.0):
+  ; CHECK-COALESCER-BARRIER-NEXT: liveins: $q0
+  ; CHECK-COALESCER-BARRIER-NEXT: {{ $}}
+  ; CHECK-COALESCER-BARRIER-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+  ; CHECK-COALESCER-BARRIER-NEXT: [[COALESCER_BARRIER_FPR128_:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COPY]]
+  ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-COALESCER-BARRIER-NEXT: [[DEF:%[0-9]+]]:zpr = IMPLICIT_DEF
+  ; CHECK-COALESCER-BARRIER-NEXT: [[INSERT_SUBREG:%[0-9]+]]:zpr = INSERT_SUBREG [[DEF]],
[[COALESCER_BARRIER_FPR128_]], %subreg.zsub
+  ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-COALESCER-BARRIER-NEXT: $z0 = COPY [[INSERT_SUBREG]]
+  ; CHECK-COALESCER-BARRIER-NEXT: BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp
+  ; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-COALESCER-BARRIER-NEXT: [[COALESCER_BARRIER_FPR128_1:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COALESCER_BARRIER_FPR128_]]
+  ; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $q0, implicit $vg, implicit-def $vg
+  ; CHECK-COALESCER-BARRIER-NEXT: $q0 = COPY [[COALESCER_BARRIER_FPR128_1]]
+  ; CHECK-COALESCER-BARRIER-NEXT: RET_ReallyLR implicit $q0
+  ;
+  ; CHECK-REGALLOC-LABEL: name: dont_coalesce_arg_that_is_also_res
+  ; CHECK-REGALLOC: bb.0 (%ir-block.0):
+  ; CHECK-REGALLOC-NEXT: liveins: $q0
+  ; CHECK-REGALLOC-NEXT: {{ $}}
+  ; CHECK-REGALLOC-NEXT: renamable $q0 = COALESCER_BARRIER_FPR128 killed renamable $q0
+  ; CHECK-REGALLOC-NEXT: STRQui killed renamable $q0, %stack.0, 0 :: (store (s128) into %stack.0)
+  ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-REGALLOC-NEXT: renamable $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0)
+  ; CHECK-REGALLOC-NEXT: renamable $q0 = KILL killed renamable $q0, implicit-def $z0
+  ; CHECK-REGALLOC-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-REGALLOC-NEXT: BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp
+  ; CHECK-REGALLOC-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+  ; CHECK-REGALLOC-NEXT: renamable $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0)
+  ; CHECK-REGALLOC-NEXT: renamable $q0 = COALESCER_BARRIER_FPR128 killed renamable $q0
+  ; CHECK-REGALLOC-NEXT: STRQui killed renamable $q0, %stack.0, 0 :: (store (s128) into %stack.0)
+  ; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def dead $q0, implicit $vg, implicit-def $vg
+  ; CHECK-REGALLOC-NEXT: $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0)
+  ; CHECK-REGALLOC-NEXT: RET_ReallyLR implicit $q0
+  %sa = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> %a, i64 0)
+  call void @scalable_args(<vscale x 2 x i64> %sa)
+  ret <2 x i64> %a
+}
+
+declare void @scalable_args(<vscale x 2 x i64>) "aarch64_pstate_sm_enabled"
+declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
+
+declare <vscale x 2 x i64> @scalable_res() "aarch64_pstate_sm_enabled"
+declare <2 x i64> @llvm.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64>, i64)
diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ec3c972483b4ca781d1a83f9593c0d3c939a8207
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s | FileCheck %s
+
+; Verify that the following code can be compiled without +sme, because if the
+; function is not entered in streaming-SVE mode at runtime, the codepath leading
+; to the smstop/smstart pair will not be executed either.
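Before the test body itself, a minimal LLVM IR sketch of the property being pinned down (illustrative only; the function names are not from this patch). Because the only smstop/smstart instructions sit behind the __arm_sme_state runtime check, both "llc %s" and "llc -mattr=+sme %s" are expected to compile input of this shape:

  ; Sketch, assuming the attribute spelling used throughout this patch:
  define void @sketch() "aarch64_pstate_sm_compatible" nounwind {
    call void @plain_callee()  ; bracketed by a conditional smstop/smstart pair
    ret void
  }
  declare void @plain_callee()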
+ +target triple = "aarch64" + +define void @streaming_compatible() #0 { +; CHECK-LABEL: streaming_compatible: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbz w19, #0, .LBB0_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: bl non_streaming +; CHECK-NEXT: tbz w19, #0, .LBB0_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @non_streaming() + ret void +} + +declare void @non_streaming() + + +; Verify that COALESCER_BARRIER is also supported without +sme. + +define void @streaming_compatible_arg(float %f) #0 { +; CHECK-LABEL: streaming_compatible_arg: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: tbz w19, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: bl non_streaming +; CHECK-NEXT: tbz w19, #0, .LBB1_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + call void @non_streaming(float %f) + ret void +} + + +attributes #0 = { nounwind "aarch64_pstate_sm_compatible" } diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index 0118d7df25640b86b1d8358f07ca8e269d3cb244..94a711ba1c71332e91a631953b55e467a08bb7ab 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -17,15 +17,15 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline ; CHECK-FISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-FISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-FISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-FISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-FISEL-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-FISEL-NEXT: smstart sm -; CHECK-FISEL-NEXT: ldr d0, [sp, 
#8] // 8-byte Folded Reload +; CHECK-FISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: bl streaming_callee -; CHECK-FISEL-NEXT: str d0, [sp, #88] // 8-byte Folded Spill +; CHECK-FISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-FISEL-NEXT: smstop sm +; CHECK-FISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: adrp x8, .LCPI0_0 ; CHECK-FISEL-NEXT: ldr d0, [x8, :lo12:.LCPI0_0] -; CHECK-FISEL-NEXT: ldr d1, [sp, #88] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: fadd d0, d1, d0 ; CHECK-FISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-FISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload @@ -43,15 +43,15 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline ; CHECK-GISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-GISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-GISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-GISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-GISEL-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-GISEL-NEXT: smstart sm -; CHECK-GISEL-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-GISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: bl streaming_callee -; CHECK-GISEL-NEXT: str d0, [sp, #88] // 8-byte Folded Spill +; CHECK-GISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-GISEL-NEXT: smstop sm -; CHECK-GISEL-NEXT: mov x8, #4631107791820423168 +; CHECK-GISEL-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload +; CHECK-GISEL-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-GISEL-NEXT: fmov d0, x8 -; CHECK-GISEL-NEXT: ldr d1, [sp, #88] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: fadd d0, d1, d0 ; CHECK-GISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-GISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload @@ -68,92 +68,68 @@ entry: define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_enabled" { -; CHECK-FISEL-LABEL: streaming_caller_nonstreaming_callee: -; CHECK-FISEL: // %bb.0: // %entry -; CHECK-FISEL-NEXT: sub sp, sp, #96 -; CHECK-FISEL-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill -; CHECK-FISEL-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill -; CHECK-FISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill -; CHECK-FISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-FISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-FISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill -; CHECK-FISEL-NEXT: smstop sm -; CHECK-FISEL-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload -; CHECK-FISEL-NEXT: bl normal_callee -; CHECK-FISEL-NEXT: str d0, [sp, #88] // 8-byte Folded Spill -; CHECK-FISEL-NEXT: smstart sm -; CHECK-FISEL-NEXT: adrp x8, .LCPI1_0 -; CHECK-FISEL-NEXT: ldr d0, [x8, :lo12:.LCPI1_0] -; CHECK-FISEL-NEXT: ldr d1, [sp, #88] // 8-byte Folded Reload -; CHECK-FISEL-NEXT: fadd d0, d1, d0 -; CHECK-FISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload -; CHECK-FISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-FISEL-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-FISEL-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload -; CHECK-FISEL-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-FISEL-NEXT: add sp, sp, #96 -; CHECK-FISEL-NEXT: ret -; -; CHECK-GISEL-LABEL: streaming_caller_nonstreaming_callee: -; CHECK-GISEL: // %bb.0: // %entry -; CHECK-GISEL-NEXT: sub sp, sp, #96 -; CHECK-GISEL-NEXT: stp d15, d14, [sp, #16] // 16-byte 
Folded Spill -; CHECK-GISEL-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill -; CHECK-GISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill -; CHECK-GISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-GISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-GISEL-NEXT: str d0, [sp, #8] // 8-byte Folded Spill -; CHECK-GISEL-NEXT: smstop sm -; CHECK-GISEL-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload -; CHECK-GISEL-NEXT: bl normal_callee -; CHECK-GISEL-NEXT: str d0, [sp, #88] // 8-byte Folded Spill -; CHECK-GISEL-NEXT: smstart sm -; CHECK-GISEL-NEXT: mov x8, #4631107791820423168 -; CHECK-GISEL-NEXT: fmov d0, x8 -; CHECK-GISEL-NEXT: ldr d1, [sp, #88] // 8-byte Folded Reload -; CHECK-GISEL-NEXT: fadd d0, d1, d0 -; CHECK-GISEL-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload -; CHECK-GISEL-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-GISEL-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-GISEL-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload -; CHECK-GISEL-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-GISEL-NEXT: add sp, sp, #96 -; CHECK-GISEL-NEXT: ret -entry: - %call = call double @normal_callee(double %x) - %add = fadd double %call, 4.200000e+01 - ret double %add -} - -define double @locally_streaming_caller_normal_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_body" { -; CHECK-COMMON-LABEL: locally_streaming_caller_normal_callee: -; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-LABEL: streaming_caller_nonstreaming_callee: +; CHECK-COMMON: // %bb.0: // %entry ; CHECK-COMMON-NEXT: sub sp, sp, #96 ; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: str d0, [sp, #88] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstop sm -; CHECK-COMMON-NEXT: ldr d0, [sp, #88] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: bl normal_callee ; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-COMMON-NEXT: smstart sm -; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 -; CHECK-COMMON-NEXT: fmov d0, x8 ; CHECK-COMMON-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 +; CHECK-COMMON-NEXT: fmov d0, x8 ; CHECK-COMMON-NEXT: fadd d0, d1, d0 -; CHECK-COMMON-NEXT: str d0, [sp] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: smstop sm -; CHECK-COMMON-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload ; CHECK-COMMON-NEXT: add sp, sp, #96 +; CHECK-COMMON-NEXT: ret +entry: + %call = call double @normal_callee(double %x) + %add = fadd double %call, 4.200000e+01 + ret double %add +} + +define double @locally_streaming_caller_normal_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_body" { +; CHECK-COMMON-LABEL: locally_streaming_caller_normal_callee: +; 
CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: sub sp, sp, #112 +; CHECK-COMMON-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: smstop sm +; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: bl normal_callee +; CHECK-COMMON-NEXT: str d0, [sp, #16] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: ldr d1, [sp, #16] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 +; CHECK-COMMON-NEXT: fmov d0, x8 +; CHECK-COMMON-NEXT: fadd d0, d1, d0 +; CHECK-COMMON-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: smstop sm +; CHECK-COMMON-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #112 ; CHECK-COMMON-NEXT: ret %call = call double @normal_callee(double %x); %add = fadd double %call, 4.200000e+01 @@ -223,9 +199,9 @@ define void @normal_call_to_streaming_callee_ptr(ptr %p) nounwind noinline optno ; Check ZA state ; -declare double @za_shared_callee(double) "aarch64_pstate_za_shared" +declare double @za_shared_callee(double) "aarch64_inout_za" -define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_pstate_za_new"{ +define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_new_za"{ ; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee: ; CHECK-COMMON: // %bb.0: // %prelude ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill @@ -236,6 +212,8 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o ; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-COMMON-NEXT: cbz x8, .LBB6_2 ; CHECK-COMMON-NEXT: b .LBB6_1 @@ -245,6 +223,7 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o ; CHECK-COMMON-NEXT: b .LBB6_2 ; CHECK-COMMON-NEXT: .LBB6_2: // %entry ; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: zero {za} ; CHECK-COMMON-NEXT: bl za_shared_callee ; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 ; CHECK-COMMON-NEXT: fmov d1, x8 @@ -259,18 +238,19 @@ entry: ret double %add; } -define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline optnone "aarch64_pstate_za_shared"{ +define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline optnone "aarch64_inout_za"{ ; CHECK-COMMON-LABEL: za_shared_caller_to_za_none_callee: ; CHECK-COMMON: // %bb.0: // %entry ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill
 ; CHECK-COMMON-NEXT: mov x29, sp
 ; CHECK-COMMON-NEXT: sub sp, sp, #16
 ; CHECK-COMMON-NEXT: rdsvl x8, #1
-; CHECK-COMMON-NEXT: mul x8, x8, x8
 ; CHECK-COMMON-NEXT: mov x9, sp
-; CHECK-COMMON-NEXT: subs x9, x9, x8
+; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
 ; CHECK-COMMON-NEXT: mov sp, x9
 ; CHECK-COMMON-NEXT: stur x9, [x29, #-16]
+; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
+; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
 ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
 ; CHECK-COMMON-NEXT: sub x8, x29, #16
 ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8
@@ -298,20 +278,21 @@ entry:
 }
 
 ; Ensure we set up and restore the lazy save correctly for instructions which are lowered to lib calls.
-define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwind {
+define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-LABEL: f128_call_za:
 ; CHECK-COMMON: // %bb.0:
 ; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT: mov x29, sp
 ; CHECK-COMMON-NEXT: sub sp, sp, #16
-; CHECK-COMMON-NEXT: rdsvl x8, #1
-; CHECK-COMMON-NEXT: mov x9, sp
-; CHECK-COMMON-NEXT: mul x8, x8, x8
-; CHECK-COMMON-NEXT: sub x9, x9, x8
-; CHECK-COMMON-NEXT: mov sp, x9
+; CHECK-COMMON-NEXT: mov x8, sp
+; CHECK-COMMON-NEXT: rdsvl x9, #1
+; CHECK-COMMON-NEXT: msub x8, x9, x9, x8
+; CHECK-COMMON-NEXT: mov sp, x8
 ; CHECK-COMMON-NEXT: sub x10, x29, #16
-; CHECK-COMMON-NEXT: stur x9, [x29, #-16]
-; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
+; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
+; CHECK-COMMON-NEXT: stur x8, [x29, #-16]
+; CHECK-COMMON-NEXT: sturh w9, [x29, #-8]
 ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10
 ; CHECK-COMMON-NEXT: bl __addtf3
 ; CHECK-COMMON-NEXT: smstart za
@@ -340,9 +321,9 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
 ; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-COMMON-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill
 ; CHECK-COMMON-NEXT: smstop sm
-; CHECK-COMMON-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
 ; CHECK-COMMON-NEXT: bl __addtf3
 ; CHECK-COMMON-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT: smstart sm
@@ -357,3 +338,103 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
 %res = fadd fp128 %a, %b
 ret fp128 %res
 }
+
+; As above this should use Selection DAG to make sure the libcall is lowered correctly.
+define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
+; CHECK-COMMON-LABEL: frem_call_za:
+; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]!
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: mov x8, sp +; CHECK-COMMON-NEXT: rdsvl x9, #1 +; CHECK-COMMON-NEXT: msub x8, x9, x9, x8 +; CHECK-COMMON-NEXT: mov sp, x8 +; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: sturh w9, [x29, #-8] +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 +; CHECK-COMMON-NEXT: bl fmod +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: cbnz x8, .LBB10_2 +; CHECK-COMMON-NEXT: // %bb.1: +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB10_2: +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret + %res = frem double %a, %b + ret double %res +} + +; As above this should use Selection DAG to make sure the libcall is lowered correctly. +define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounwind { +; CHECK-COMMON-LABEL: frem_call_sm: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: sub sp, sp, #96 +; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: stp s1, s0, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: smstop sm +; CHECK-COMMON-NEXT: ldp s1, s0, [sp, #8] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: bl fmodf +; CHECK-COMMON-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #96 +; CHECK-COMMON-NEXT: ret + %res = frem float %a, %b + ret float %res +} + +; As above this should use Selection DAG to make sure the libcall is lowered correctly. 
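A brief aside before the streaming-compatible variant below (a sketch, not part of the patch): what makes frem the interesting case is that no call instruction exists anywhere in the IR; the call to fmodf only comes into existence when SelectionDAG legalizes frem, so the smstop/smstart bracketing (and, for ZA, the lazy save) must also handle calls created by the compiler itself.

  ; Illustrative input shape; the fmodf call is implied, never written:
  define float @sketch_frem(float %a, float %b) "aarch64_pstate_sm_enabled" nounwind {
    %res = frem float %a, %b   ; legalized to roughly: smstop sm; bl fmodf; smstart sm
    ret float %res
  }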
+define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compatible" nounwind { +; CHECK-COMMON-LABEL: frem_call_sm_compat: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: sub sp, sp, #96 +; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: bl __arm_sme_state +; CHECK-COMMON-NEXT: and x19, x0, #0x1 +; CHECK-COMMON-NEXT: ldr s2, [sp, #8] // 4-byte Folded Reload +; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-COMMON-NEXT: stp s2, s0, [sp, #8] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: tbz w19, #0, .LBB12_2 +; CHECK-COMMON-NEXT: // %bb.1: +; CHECK-COMMON-NEXT: smstop sm +; CHECK-COMMON-NEXT: .LBB12_2: +; CHECK-COMMON-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: bl fmodf +; CHECK-COMMON-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-COMMON-NEXT: tbz w19, #0, .LBB12_4 +; CHECK-COMMON-NEXT: // %bb.3: +; CHECK-COMMON-NEXT: smstart sm +; CHECK-COMMON-NEXT: .LBB12_4: +; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #96 +; CHECK-COMMON-NEXT: ret + %res = frem float %a, %b + ret float %res +} diff --git a/llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes.ll b/llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes.ll new file mode 100644 index 0000000000000000000000000000000000000000..b3aeb1fcc42da1b6253343349d7652e32387ab18 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-disable-rematerialize-with-streaming-mode-changes.ll @@ -0,0 +1,71 @@ +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64" + + +define void @dont_rematerialize_cntd(i32 %N) #0 { +; CHECK-LABEL: dont_rematerialize_cntd: +; CHECK: cntd +; CHECK: smstop sm +; CHECK-NOT: cntd +; CHECK: bl foo +; CHECK: smstart sm +entry: + %cmp2 = icmp sgt i32 %N, 0 + br i1 %cmp2, label %for.body, label %for.cond.cleanup + +for.body: ; preds = %entry, %for.body + %index.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27}"() nounwind + %.tr = call i32 @llvm.vscale.i32() + %conv = shl nuw nsw i32 %.tr, 4 + call void @foo(i32 %conv) + %inc = add nuw nsw i32 %index.03, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void +} + +; This test doesn't strictly make sense, because it passes a scalable predicate +; to a function, which makes little sense if the VL is not the same in/out of +; streaming-SVE mode. If the VL is known to be the same, then we could just as +; well rematerialize the `ptrue` inside the call sequence. 
However, the purpose
+; of this test is more to ensure that the logic works, which may also trigger
+; when the value is not being passed as argument (e.g. when it is hoisted from
+; a loop and placed inside the call sequence).
+;
+; FIXME: This test also exposes another bug, where the 'mul vl' addressing mode
+; is used before/after the smstop. This will be fixed in a future patch.
+define void @dont_rematerialize_ptrue(i32 %N) #0 {
+; CHECK-LABEL: dont_rematerialize_ptrue:
+; CHECK: ptrue [[PTRUE:p[0-9]+]].b
+; CHECK: str [[PTRUE]], [[[SPILL_ADDR:.*]]]
+; CHECK: smstop sm
+; CHECK: ldr p0, [[[SPILL_ADDR]]]
+; CHECK-NOT: ptrue
+; CHECK: bl bar
+; CHECK: smstart sm
+entry:
+  %cmp2 = icmp sgt i32 %N, 0
+  br i1 %cmp2, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+  %index.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27}"() nounwind
+  %ptrue.ins = insertelement <vscale x 16 x i1> poison, i1 1, i32 0
+  %ptrue = shufflevector <vscale x 16 x i1> %ptrue.ins, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  call void @bar(<vscale x 16 x i1> %ptrue)
+  %inc = add nuw nsw i32 %index.03, 1
+  %exitcond.not = icmp eq i32 %inc, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+  ret void
+}
+declare void @foo(i32)
+declare void @bar(<vscale x 16 x i1>)
+declare i32 @llvm.vscale.i32()
+
+attributes #0 = { "aarch64_pstate_sm_enabled" "frame-pointer"="non-leaf" "target-features"="+sme,+sve" }
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
index ccb3975f0c5b422cadce2ef7485948b1c5197a15..985b0581200ff345b59e6696b20b089fb478107b 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
@@ -252,7 +252,7 @@ define void @ldr(ptr %ptr) {
 ; CHECK-NEXT: mov w12, wzr
 ; CHECK-NEXT: ldr za[w12, 0], [x0]
 ; CHECK-NEXT: ret
-  call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr)
+  call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr, i32 0)
   ret void;
 }
 
@@ -264,7 +264,7 @@ define void @ldr_with_off_15(ptr %ptr) {
 ; CHECK-NEXT: ldr za[w12, 0], [x8]
 ; CHECK-NEXT: ret
   %base = getelementptr i8, ptr %ptr, i64 15
-  call void @llvm.aarch64.sme.ldr(i32 15, ptr %base)
+  call void @llvm.aarch64.sme.ldr(i32 15, ptr %base, i32 0)
   ret void;
 }
 
@@ -278,7 +278,7 @@ define void @ldr_with_off_15mulvl(ptr %ptr) {
   %vscale = call i64 @llvm.vscale.i64()
   %mulvl = mul i64 %vscale, 240
   %base = getelementptr i8, ptr %ptr, i64 %mulvl
-  call void @llvm.aarch64.sme.ldr(i32 15, ptr %base)
+  call void @llvm.aarch64.sme.ldr(i32 15, ptr %base, i32 0)
   ret void;
 }
 
@@ -292,23 +292,205 @@ define void @ldr_with_off_16mulvl(ptr %ptr) {
   %vscale = call i64 @llvm.vscale.i64()
   %mulvl = mul i64 %vscale, 256
   %base = getelementptr i8, ptr %ptr, i64 %mulvl
-  call void @llvm.aarch64.sme.ldr(i32 16, ptr %base)
+  call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 0)
   ret void;
 }
 
+define void @ldr_with_off_var(ptr %base, i32 %off) {
+; CHECK-LABEL: ldr_with_off_var:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: sxtw x8, w1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: add w12, w1, #16
+; CHECK-NEXT: madd x8, x9, x8, x0
+; CHECK-NEXT: ldr za[w12, 0], [x8]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 %off)
+  ret void;
+}
+
+define void @ldr_with_off_15imm(ptr %base) {
+; CHECK-LABEL: ldr_with_off_15imm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, #16 // =0x10
+; CHECK-NEXT: ldr za[w12, 15],
[x0, #15, mul vl] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 15) + ret void; +} + +define void @ldr_with_off_16imm(ptr %base) { +; CHECK-LABEL: ldr_with_off_16imm: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov w12, #32 // =0x20 +; CHECK-NEXT: add x8, x0, x8, lsl #4 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 16) + ret void; +} + +define void @ldr_with_off_many_imm(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: ldr_with_off_many_imm: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: ldr za[w12, 1], [x1, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x1, #2, mul vl] +; CHECK-NEXT: ldr za[w12, 3], [x1, #3, mul vl] +; CHECK-NEXT: ldr za[w12, 4], [x1, #4, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 1) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 2) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 3) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 4) + ret void +} + +define void @ldr_with_off_many_imm_15_18(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: ldr_with_off_many_imm_15_18: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add x8, x1, x8, lsl #4 +; CHECK-NEXT: add w13, w0, #16 +; CHECK-NEXT: ldr za[w12, 15], [x1, #15, mul vl] +; CHECK-NEXT: ldr za[w13, 0], [x8] +; CHECK-NEXT: ldr za[w13, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w13, 2], [x8, #2, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 15) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 16) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 17) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 18) + ret void +} + +define void @ldr_with_off_many_imm_16_19(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: ldr_with_off_many_imm_16_19: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #16 +; CHECK-NEXT: add x8, x1, x8, lsl #4 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 16) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 17) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 18) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 19) + ret void +} + +define void @ldr_with_off_many_imm_31_34(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: ldr_with_off_many_imm_31_34: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #16 +; CHECK-NEXT: add x9, x1, x8, lsl #4 +; CHECK-NEXT: add x8, x1, x8, lsl #5 +; CHECK-NEXT: add w13, w0, #32 +; CHECK-NEXT: ldr za[w12, 15], [x9, #15, mul vl] +; CHECK-NEXT: ldr za[w13, 0], [x8] +; CHECK-NEXT: ldr za[w13, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w13, 2], [x8, #2, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 31) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 32) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 33) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 34) + ret void +} + +define void 
@ldr_with_off_many_imm_32_35(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: ldr_with_off_many_imm_32_35: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #32 +; CHECK-NEXT: add x8, x1, x8, lsl #5 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 32) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 33) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 34) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 35) + ret void +} + +define void @ldr_with_off_many_var(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: ldr_with_off_many_var: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtw x8, w2 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: add w12, w0, w2 +; CHECK-NEXT: madd x8, x9, x8, x1 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + %0 = trunc i64 %vnum to i32 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %0) + %1 = add i32 %0, 1 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %1) + %2 = add i32 %0, 2 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %2) + %3 = add i32 %0, 3 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %3) + ret void +} + +define void @ldr_with_off_many_var_high(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: ldr_with_off_many_var_high: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add w8, w2, #32 +; CHECK-NEXT: rdsvl x10, #1 +; CHECK-NEXT: sxtw x9, w8 +; CHECK-NEXT: add w12, w0, w8 +; CHECK-NEXT: madd x9, x10, x9, x1 +; CHECK-NEXT: ldr za[w12, 1], [x9, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x9, #2, mul vl] +; CHECK-NEXT: ldr za[w12, 3], [x9, #3, mul vl] +; CHECK-NEXT: ldr za[w12, 4], [x9, #4, mul vl] +; CHECK-NEXT: ret +entry: + %0 = trunc i64 %vnum to i32 + %1 = add i32 %0, 33 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %1) + %2 = add i32 %0, 34 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %2) + %3 = add i32 %0, 35 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %3) + %4 = add i32 %0, 36 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %4) + ret void +} + ; Ensure that the tile offset is sunk, given that this is likely to be an 'add' ; that's decomposed into a base + offset in ISel. 
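To make that sunk-offset shape concrete before the test itself, here is a hypothetical reduction of the kind of input involved (the actual loop body of the test is elided by the diff context that follows):

  ; Sketch: %off is computed once in the entry block; ISel sinks the add next
  ; to each intrinsic so the slice offset folds into the za[w12, 1] form.
  define void @sink_sketch(<vscale x 4 x i1> %pg, ptr %src, i32 %base, i32 %N) "aarch64_pstate_sm_enabled" {
  entry:
    %off = add i32 %base, 1
    br label %for.body
  for.body:
    %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
    call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i32 0, i32 %off)
    %inc = add i32 %i, 1
    %done = icmp eq i32 %inc, %N
    br i1 %done, label %exit, label %for.body
  exit:
    ret void
  }
  declare void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1>, ptr, i32, i32)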
 define void @test_ld1_sink_tile0_offset_operand(<vscale x 4 x i1> %pg, ptr %src, i32 %base, i32 %N) {
 ; CHECK-LABEL: test_ld1_sink_tile0_offset_operand:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: mov w12, w1
-; CHECK-NEXT: .LBB14_1: // %for.body
+; CHECK-NEXT: .LBB24_1: // %for.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ld1w {za0h.s[w12, 0]}, p0/z, [x0]
 ; CHECK-NEXT: subs w2, w2, #1
 ; CHECK-NEXT: ld1w {za0h.s[w12, 1]}, p0/z, [x0]
 ; CHECK-NEXT: ld1w {za0h.s[w12, 2]}, p0/z, [x0]
-; CHECK-NEXT: b.ne .LBB14_1
+; CHECK-NEXT: b.ne .LBB24_1
 ; CHECK-NEXT: // %bb.2: // %exit
 ; CHECK-NEXT: ret
 entry:
@@ -341,5 +523,5 @@ declare void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1>, ptr, i32, i32)
 declare void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1>, ptr, i32, i32)
 declare void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1>, ptr, i32, i32)
 
-declare void @llvm.aarch64.sme.ldr(i32, ptr)
+declare void @llvm.aarch64.sme.ldr(i32, ptr, i32)
 declare i64 @llvm.vscale.i64()
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
index ddff4c7d3cd3e7a715de8f376ee85f29e145143d..366e24df3e8c975e2668ad8a8ee48c00ed141517 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
@@ -252,7 +252,7 @@ define void @str(ptr %ptr) {
 ; CHECK-NEXT: mov w12, wzr
 ; CHECK-NEXT: str za[w12, 0], [x0]
 ; CHECK-NEXT: ret
-  call void @llvm.aarch64.sme.str(i32 0, ptr %ptr)
+  call void @llvm.aarch64.sme.str(i32 0, ptr %ptr, i32 0)
   ret void;
 }
 
@@ -264,7 +264,7 @@ define void @str_with_off_15(ptr %ptr) {
 ; CHECK-NEXT: str za[w12, 0], [x8]
 ; CHECK-NEXT: ret
   %base = getelementptr i8, ptr %ptr, i64 15
-  call void @llvm.aarch64.sme.str(i32 15, ptr %base)
+  call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 0)
   ret void;
 }
 
@@ -278,7 +278,7 @@ define void @str_with_off_15mulvl(ptr %ptr) {
   %vscale = call i64 @llvm.vscale.i64()
   %mulvl = mul i64 %vscale, 240
   %base = getelementptr i8, ptr %ptr, i64 %mulvl
-  call void @llvm.aarch64.sme.str(i32 15, ptr %base)
+  call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 0)
   ret void;
 }
 
@@ -292,23 +292,210 @@ define void @str_with_off_16mulvl(ptr %ptr) {
   %vscale = call i64 @llvm.vscale.i64()
   %mulvl = mul i64 %vscale, 256
   %base = getelementptr i8, ptr %ptr, i64 %mulvl
-  call void @llvm.aarch64.sme.str(i32 16, ptr %base)
+  call void @llvm.aarch64.sme.str(i32 16, ptr %base, i32 0)
   ret void;
 }
 
+define void @str_with_off_var(ptr %base, i32 %off) {
+; CHECK-LABEL: str_with_off_var:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: sxtw x8, w1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: add w12, w1, #16
+; CHECK-NEXT: madd x8, x9, x8, x0
+; CHECK-NEXT: str za[w12, 0], [x8]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sme.str(i32 16, ptr %base, i32 %off)
+  ret void;
+}
+
+define void @str_with_off_15imm(ptr %ptr) {
+; CHECK-LABEL: str_with_off_15imm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, #15 // =0xf
+; CHECK-NEXT: add x8, x0, #15
+; CHECK-NEXT: str za[w12, 15], [x8, #15, mul vl]
+; CHECK-NEXT: ret
+  %base = getelementptr i8, ptr %ptr, i64 15
+  call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 15)
+  ret void;
+}
+
+define void @str_with_off_16imm(ptr %ptr) {
+; CHECK-LABEL: str_with_off_16imm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov w12, #31 // =0x1f
+; CHECK-NEXT: add x8, x0, x8, lsl #4
+; CHECK-NEXT: add x8, x8, #15
+; CHECK-NEXT: str za[w12, 0], [x8]
+; CHECK-NEXT: ret
+  %base = getelementptr i8, ptr %ptr, i64 15
+  call void @llvm.aarch64.sme.str(i32
15, ptr %base, i32 16) + ret void; +} + +define void @str_with_off_many_imm(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: str_with_off_many_imm: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: str za[w12, 1], [x1, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x1, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x1, #3, mul vl] +; CHECK-NEXT: str za[w12, 4], [x1, #4, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 1) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 2) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 3) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 4) + ret void +} + +define void @str_with_off_many_imm_15_18(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: str_with_off_many_imm_15_18: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: add x8, x1, x8, lsl #4 +; CHECK-NEXT: add w13, w0, #16 +; CHECK-NEXT: str za[w12, 15], [x1, #15, mul vl] +; CHECK-NEXT: str za[w13, 0], [x8] +; CHECK-NEXT: str za[w13, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w13, 2], [x8, #2, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 15) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 16) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 17) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 18) + ret void +} + +define void @str_with_off_many_imm_16_19(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: str_with_off_many_imm_16_19: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #16 +; CHECK-NEXT: add x8, x1, x8, lsl #4 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 16) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 17) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 18) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 19) + ret void +} + +define void @str_with_off_many_imm_31_34(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: str_with_off_many_imm_31_34: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #16 +; CHECK-NEXT: add x9, x1, x8, lsl #4 +; CHECK-NEXT: add x8, x1, x8, lsl #5 +; CHECK-NEXT: add w13, w0, #32 +; CHECK-NEXT: str za[w12, 15], [x9, #15, mul vl] +; CHECK-NEXT: str za[w13, 0], [x8] +; CHECK-NEXT: str za[w13, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w13, 2], [x8, #2, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 31) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 32) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 33) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 34) + ret void +} + +define void @str_with_off_many_imm_32_35(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: str_with_off_many_imm_32_35: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w0, #32 +; CHECK-NEXT: add x8, x1, x8, lsl #5 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] +; 
CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 32) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 33) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 34) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 35) + ret void +} + +define void @str_with_off_many_var(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: str_with_off_many_var: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtw x8, w2 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: add w12, w0, w2 +; CHECK-NEXT: madd x8, x9, x8, x1 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + %0 = trunc i64 %vnum to i32 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %0) + %1 = add i32 %0, 1 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %1) + %2 = add i32 %0, 2 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %2) + %3 = add i32 %0, 3 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %3) + ret void +} + +define void @str_with_off_many_var_high(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: str_with_off_many_var_high: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add w8, w2, #32 +; CHECK-NEXT: rdsvl x10, #1 +; CHECK-NEXT: sxtw x9, w8 +; CHECK-NEXT: add w12, w0, w8 +; CHECK-NEXT: madd x9, x10, x9, x1 +; CHECK-NEXT: str za[w12, 1], [x9, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x9, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x9, #3, mul vl] +; CHECK-NEXT: str za[w12, 4], [x9, #4, mul vl] +; CHECK-NEXT: ret +entry: + %0 = trunc i64 %vnum to i32 + %1 = add i32 %0, 33 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %1) + %2 = add i32 %0, 34 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %2) + %3 = add i32 %0, 35 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %3) + %4 = add i32 %0, 36 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %4) + ret void +} + + ; Ensure that the tile offset is sunk, given that this is likely to be an 'add' ; that's decomposed into a base + offset in ISel. 
 define void @test_sink_tile0_offset_operand(<vscale x 4 x i1> %pg, ptr %src, i32 %base, i32 %N) {
 ; CHECK-LABEL: test_sink_tile0_offset_operand:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: mov w12, w1
-; CHECK-NEXT: .LBB14_1: // %for.body
+; CHECK-NEXT: .LBB24_1: // %for.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: st1w {za0h.s[w12, 0]}, p0, [x0]
 ; CHECK-NEXT: subs w2, w2, #1
 ; CHECK-NEXT: st1w {za0h.s[w12, 1]}, p0, [x0]
 ; CHECK-NEXT: st1w {za0h.s[w12, 2]}, p0, [x0]
-; CHECK-NEXT: b.ne .LBB14_1
+; CHECK-NEXT: b.ne .LBB24_1
 ; CHECK-NEXT: // %bb.2: // %exit
 ; CHECK-NEXT: ret
 entry:
@@ -340,5 +527,5 @@ declare void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1>, ptr, i32, i32)
 declare void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1>, ptr, i32, i32)
 declare void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1>, ptr, i32, i32)
 
-declare void @llvm.aarch64.sme.str(i32, ptr)
+declare void @llvm.aarch64.sme.str(i32, ptr, i32)
 declare i64 @llvm.vscale.i64()
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll
new file mode 100644
index 0000000000000000000000000000000000000000..65e50842d5d78fcc07898417b458bd35fde9ce76
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 -mattr=+sme --pass-remarks-analysis=sme -o /dev/null < %s 2>&1 | FileCheck %s
+
+declare void @private_za_callee()
+declare float @llvm.cos.f32(float)
+
+define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
+; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_1_callee' to 'private_za_callee' sets up a lazy save for ZA
+  call void @private_za_callee()
+  ret void
+}
+
+define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
+; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA
+  call void @private_za_callee()
+; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA
+  call void @private_za_callee()
+  ret void
+}
+
+define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inout_za" {
+; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_expanded_intrinsic' to 'cosf' sets up a lazy save for ZA
+  %res = call float @llvm.cos.f32(float %a)
+  ret float %res
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 00c1c9d66c3e656a9c473a92df61d3928838f62f..a3c3089d54250ccf9c501714aab9f9abb7721191 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -5,20 +5,21 @@ declare void @private_za_callee()
 declare float @llvm.cos.f32(float)
 
 ; Test lazy-save mechanism for a single callee.
-define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" {
+define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
 ; CHECK-LABEL: test_lazy_save_1_callee:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: stp x29, x30, [sp, #-16]!
// 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za @@ -37,19 +38,20 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" { } ; Test lazy-save mechanism for multiple callees. -define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" { +define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-LABEL: test_lazy_save_2_callees: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill ; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mul x19, x8, x8 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: sub x8, x8, x19 +; CHECK-NEXT: rdsvl x19, #1 +; CHECK-NEXT: msub x8, x19, x19, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur x8, [x29, #-16] ; CHECK-NEXT: sturh w19, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x20 @@ -83,20 +85,21 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" { } ; Test a call of an intrinsic that gets expanded to a library call. -define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_pstate_za_shared" { +define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inout_za" { ; CHECK-LABEL: test_lazy_save_expanded_intrinsic: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: smstart za @@ -115,7 +118,7 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_psta } ; Test a combination of streaming-compatible -> normal call with lazy-save. -define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_pstate_za_shared" "aarch64_pstate_sm_compatible" { +define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za" "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: test_lazy_save_and_conditional_smstart: ; CHECK: // %bb.0: ; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill @@ -126,23 +129,24 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_pstate_z ; CHECK-NEXT: add x29, sp, #64 ; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #80 -; CHECK-NEXT: stur x9, [x29, #-80] -; CHECK-NEXT: sturh w8, [x29, #-72] +; CHECK-NEXT: stur wzr, [x29, #-68] +; CHECK-NEXT: sturh wzr, [x29, #-70] +; CHECK-NEXT: stur x8, [x29, #-80] +; CHECK-NEXT: sturh w9, [x29, #-72] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 -; CHECK-NEXT: tbz x19, #0, .LBB3_2 +; CHECK-NEXT: tbz w19, #0, .LBB3_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB3_2: ; CHECK-NEXT: bl private_za_callee -; CHECK-NEXT: tbz x19, #0, .LBB3_4 +; CHECK-NEXT: tbz w19, #0, .LBB3_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB3_4: diff --git a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll index 54ef5fd432755f98ebe387b5547fc096b0d90d04..04d26902c536a41edc830a7f742594345e23c312 100644 --- a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll +++ b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll @@ -1,9 +1,9 @@ ; RUN: opt -S -mtriple=aarch64-linux-gnu -aarch64-sme-abi %s | FileCheck %s ; RUN: opt -S -mtriple=aarch64-linux-gnu -aarch64-sme-abi -aarch64-sme-abi %s | FileCheck %s -declare void @shared_za_callee() "aarch64_pstate_za_shared" +declare void @shared_za_callee() "aarch64_inout_za" -define void @private_za() "aarch64_pstate_za_new" { +define void @private_za() "aarch64_new_za" { ; CHECK-LABEL: @private_za( ; CHECK-NEXT: prelude: ; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() @@ -15,6 +15,7 @@ define void @private_za() "aarch64_pstate_za_new" { ; CHECK-NEXT: br label [[TMP0]] ; CHECK: 0: ; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.zero(i32 255) ; CHECK-NEXT: call void @shared_za_callee() ; CHECK-NEXT: call void @llvm.aarch64.sme.za.disable() ; CHECK-NEXT: ret void @@ -23,7 +24,7 @@ define void @private_za() "aarch64_pstate_za_new" { ret void } -define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) "aarch64_pstate_za_new" { +define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) "aarch64_new_za" { ; CHECK-LABEL: @private_za_multiple_exit( ; CHECK-NEXT: prelude: ; CHECK-NEXT: [[TPIDR2:%.*]] = call i64 @llvm.aarch64.sme.get.tpidr2() @@ -35,6 +36,7 @@ define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) "aarch64_pstate_ ; CHECK-NEXT: br label [[ENTRY]] ; CHECK: entry: ; CHECK-NEXT: call void @llvm.aarch64.sme.za.enable() +; CHECK-NEXT: call void @llvm.aarch64.sme.zero(i32 255) ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[COND:%.*]], 1 ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_ELSE:%.*]], label [[IF_END:%.*]] ; CHECK: if.else: @@ -60,4 +62,4 @@ if.end: } ; CHECK: declare void @__arm_tpidr2_save() #[[ATTR:[0-9]+]] -; CHECK: attributes #[[ATTR]] = { "aarch64_pstate_sm_compatible" "aarch64_pstate_za_preserved" } +; CHECK: attributes #[[ATTR]] = { "aarch64_pstate_sm_compatible" } diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll 
b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b702c71001a6eaecd3050b09ad45f0790a630c11
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
@@ -0,0 +1,1641 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-unknown-eabi-elf"
+
+; This test verifies that call arguments and results are not coalesced
+; with SVE vector registers by the coalescer, such that no 'mul vl'
+; ldr/str pairs are generated in the streaming-mode-changing call
+; sequence.
+
+;
+; Scalar arguments
+;
+
+define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl use_i8
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: st1b { z0.b }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %vec = insertelement <vscale x 16 x i8> poison, i8 %arg, i32 0
+  call void @use_i8(i8 %arg)
+  store <vscale x 16 x i8> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl use_i16
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: st1h { z0.h }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %vec = insertelement <vscale x 8 x i16> poison, i16 %arg, i32 0
+  call void @use_i16(i16 %arg)
+  store <vscale x 8 x i16> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl use_i32
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %vec = insertelement <vscale x 4 x i32> poison, i32 %arg, i32 0
+  call void @use_i32(i32 %arg)
+  store <vscale x 4 x i32> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl use_i64
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %vec = insertelement <vscale x 2 x i64> poison, i64 %arg, i32 0
+  call void @use_i64(i64 %arg)
+  store <vscale x 2 x i64> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
+; CHECK-NEXT: bl use_f16
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1h { z0.h }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %vec = insertelement <vscale x 8 x half> poison, half %arg, i32 0
+  call void @use_f16(half %arg)
+  store <vscale x 8 x half> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
+; CHECK-NEXT: bl use_f32
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1w { z0.s }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %vec = insertelement <vscale x 4 x float> poison, float %arg, i32 0
+  call void @use_f32(float %arg)
+  store <vscale x 4 x float> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: bl use_f64
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1d { z0.d }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %vec = insertelement <vscale x 2 x double> poison, double %arg, i32 0
+  call void @use_f64(double %arg)
+  store <vscale x 2 x double> %vec, ptr %ptr
+  ret void
+}
+
+
+;
+; Single-element vector arguments
+;
+
+define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_v1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: bl use_v16i8
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1b { z0.b }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %elt = extractelement <1 x i8> %arg, i32 0
+  %vec = insertelement <vscale x 16 x i8> poison, i8 %elt, i32 0
+  call void @use_v16i8(<1 x i8> %arg)
+  store <vscale x 16 x i8> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_v1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: bl use_v8i16
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1h { z0.h }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %elt = extractelement <1 x i16> %arg, i32 0
+  %vec = insertelement <vscale x 8 x i16> poison, i16 %elt, i32 0
+  call void @use_v8i16(<1 x i16> %arg)
+  store <vscale x 8 x i16> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_v1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: bl use_v4i32
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1w { z0.s }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %elt = extractelement <1 x i32> %arg, i32 0
+  %vec = insertelement <vscale x 4 x i32> poison, i32 %elt, i32 0
+  call void @use_v4i32(<1 x i32> %arg)
+  store <vscale x 4 x i32> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: bl use_v2i64
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1d { z0.d }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %elt = extractelement <1 x i64> %arg, i32 0
+  %vec = insertelement <vscale x 2 x i64> poison, i64 %elt, i32 0
+  call void @use_v2i64(<1 x i64> %arg)
+  store <vscale x 2 x i64> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_v1f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
+; CHECK-NEXT: bl use_v8f16
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1h { z0.h }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %elt = extractelement <1 x half> %arg, i32 0
+  %vec = insertelement <vscale x 8 x half> poison, half %elt, i32 0
+  call void @use_v8f16(<1 x half> %arg)
+  store <vscale x 8 x half> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_v1f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: bl use_v4f32
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1w { z0.s }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %elt = extractelement <1 x float> %arg, i32 0
+  %vec = insertelement <vscale x 4 x float> poison, float %elt, i32 0
+  call void @use_v4f32(<1 x float> %arg)
+  store <vscale x 4 x float> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_v1f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: bl use_v2f64
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1d { z0.d }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %elt = extractelement <1 x double> %arg, i32 0
+  %vec = insertelement <vscale x 2 x double> poison, double %elt, i32 0
+  call void @use_v2f64(<1 x double> %arg)
+  store <vscale x 2 x double> %vec, ptr %ptr
+  ret void
+}
+
+;
+; Full vector arguments
+;
+
+define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: bl use_v16i8
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1b { z0.b }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %vec = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> poison, <16 x i8> %arg, i64 0)
+  call void @use_v16i8(<16 x i8> %arg)
+  store <vscale x 16 x i8> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: bl use_v8i16
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1h { z0.h }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %vec = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> poison, <8 x i16> %arg, i64 0)
+  call void @use_v8i16(<8 x i16> %arg)
+  store <vscale x 8 x i16> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: bl use_v4i32
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1w { z0.s }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %vec = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> %arg, i64 0)
+  call void @use_v4i32(<4 x i32> %arg)
+  store <vscale x 4 x i32> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: bl use_v2i64
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1d { z0.d }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %vec = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> %arg, i64 0)
+  call void @use_v2i64(<2 x i64> %arg)
+  store <vscale x 2 x i64> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_v8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: bl use_v8f16
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1h { z0.h }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %vec = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %arg, i64 0)
+  call void @use_v8f16(<8 x half> %arg)
+  store <vscale x 8 x half> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_v8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: bl use_v8bf16
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1h { z0.h }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %vec = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> poison, <8 x bfloat> %arg, i64 0)
+  call void @use_v8bf16(<8 x bfloat> %arg)
+  store <vscale x 8 x bfloat> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: bl use_v4f32
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1d { z0.d }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %vec = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %arg, i64 0)
+  call void @use_v4f32(<4 x float> %arg)
+  store <vscale x 4 x float> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_v2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: bl use_v2f64
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
+; CHECK-NEXT: st1d { z0.d }, p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %vec = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> poison, <2 x double> %arg, i64 0)
+  call void @use_v2f64(<2 x double> %arg)
+  store <vscale x 2 x double> %vec, ptr %ptr
+  ret void
+}
+
+;
+; <8 x i1> type will need type promotion.
+;
+define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_arg_v8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: and z1.b, z1.b, #0x1
+; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: str p0, [x8, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: bl use_v8i1
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: ldr p0, [x8, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: str p0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %vec = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.v8i1(<vscale x 8 x i1> poison, <8 x i1> %arg, i64 0)
+  call void @use_v8i1(<8 x i1> %arg)
+  store <vscale x 8 x i1> %vec, ptr %ptr
+  ret void
+}
+
+;
+; Scalar return values
+;
+
+define void @dont_coalesce_res_i8(ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_res_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl get_i8
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: st1b { z0.b }, p0, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %res = call i8 @get_i8()
+  %vec = insertelement <vscale x 16 x i8> poison, i8 %res, i32 0
+  store <vscale x 16 x i8> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_res_i16(ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_res_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl get_i16
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: st1h { z0.h }, p0, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %res = call i16 @get_i16()
+  %vec = insertelement <vscale x 8 x i16> poison, i16 %res, i32 0
+  store <vscale x 8 x i16> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_res_i32(ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_res_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl get_i32
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: st1w { z0.s }, p0, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %res = call i32 @get_i32()
+  %vec = insertelement <vscale x 4 x i32> poison, i32 %res, i32 0
+  store <vscale x 4 x i32> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_res_i64(ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_res_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl get_i64
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: st1d { z0.d }, p0, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+  %res = call i64 @get_i64()
+  %vec = insertelement <vscale x 2 x i64> poison, i64 %res, i32 0
+  store <vscale x 2 x i64> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_res_f16(ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_res_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl get_f16
+; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: st1h { z0.h }, p0, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: ret
+  %res = call half @get_f16()
+  %vec = insertelement <vscale x 8 x half> poison, half %res, i32 0
+  store <vscale x 8 x half> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_res_f32(ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_res_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl get_f32
+; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT: st1w { z0.s }, p0, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: ret
+  %res = call float @get_f32()
+  %vec = insertelement <vscale x 4 x float> poison, float %res, i32 0
+  store <vscale x 4 x float> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_res_f64(ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_res_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl get_f64
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: st1d { z0.d }, p0, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: ret
+  %res = call double @get_f64()
+  %vec = insertelement <vscale x 2 x double> poison, double %res, i32 0
+  store <vscale x 2 x double> %vec, ptr %ptr
+  ret void
+}
+
+;
+; Single-element vector result values
+;
+
+define void @dont_coalesce_res_v1i8(ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_res_v1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl get_v1i8
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: st1b { z0.b }, p0, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: ret
+  %res = call <1 x i8> @get_v1i8()
+  %elt = extractelement <1 x i8> %res, i32 0
+  %vec = insertelement <vscale x 16 x i8> poison, i8 %elt, i32 0
+  store <vscale x 16 x i8> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_res_v1i16(ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_res_v1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl get_v1i16
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: st1h { z0.h }, p0, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: ret
+  %res = call <1 x i16> @get_v1i16()
+  %elt = extractelement <1 x i16> %res, i32 0
+  %vec = insertelement <vscale x 8 x i16> poison, i16 %elt, i32 0
+  store <vscale x 8 x i16> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_res_v1i32(ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_res_v1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl get_v1i32
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: st1w { z0.s }, p0, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: ret
+  %res = call <1 x i32> @get_v1i32()
+  %elt = extractelement <1 x i32> %res, i32 0
+  %vec = insertelement <vscale x 4 x i32> poison, i32 %elt, i32 0
+  store <vscale x 4 x i32> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_res_v1i64(ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_res_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl get_v1i64
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: st1d { z0.d }, p0, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: ret
+  %res = call <1 x i64> @get_v1i64()
+  %elt = extractelement <1 x i64> %res, i32 0
+  %vec = insertelement <vscale x 2 x i64> poison, i64 %elt, i32 0
+  store <vscale x 2 x i64> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_res_v1f16(ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_res_v1f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl get_v1f16
+; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT: st1h { z0.h }, p0, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: ret
+  %res = call <1 x half> @get_v1f16()
+  %elt = extractelement <1 x half> %res, i32 0
+  %vec = insertelement <vscale x 8 x half> poison, half %elt, i32 0
+  store <vscale x 8 x half> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_res_v1f32(ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_res_v1f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl get_v1f32
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: st1w { z0.s }, p0, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: ret
+  %res = call <1 x float> @get_v1f32()
+  %elt = extractelement <1 x float> %res, i32 0
+  %vec = insertelement <vscale x 4 x float> poison, float %elt, i32 0
+  store <vscale x 4 x float> %vec, ptr %ptr
+  ret void
+}
+
+define void @dont_coalesce_res_v1f64(ptr %ptr) #0 {
+; CHECK-LABEL: dont_coalesce_res_v1f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl get_v1f64
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: st1d { z0.d }, p0, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: ret
+  %res = call <1 x double> @get_v1f64()
+  %elt = extractelement <1 x double> %res, i32 0
+  %vec = insertelement <vscale x 2 x double> poison, double %elt, i32 0
+  store <vscale x 2 x double> %vec, ptr %ptr
+  ret void
+}
+
+;
+; Full vector result values
+;
+
+define void @dont_coalesce_res_v16i8(ptr %ptr) #0 {
{ +; CHECK-LABEL: dont_coalesce_res_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v16i8 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <16 x i8> @get_v16i8() + %vec = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> poison, <16 x i8> %res, i64 0) + store <vscale x 16 x i8> %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v8i16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v8i16 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <8 x i16> @get_v8i16() + %vec = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> poison, <8 x i16> %res, i64 0) + store <vscale x 8 x i16> %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v4i32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v4i32 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +;
CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <4 x i32> @get_v4i32() + %vec = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> %res, i64 0) + store <vscale x 4 x i32> %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v2i64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v2i64 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <2 x i64> @get_v2i64() + %vec = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> %res, i64 0) + store <vscale x 2 x i64> %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v8f16(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v8f16 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <8 x half> @get_v8f16() + %vec = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %res, i64 0) + store <vscale x 8 x half> %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v4f32(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v4f32 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +;
CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <4 x float> @get_v4f32() + %vec = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %res, i64 0) + store <vscale x 4 x float> %vec, ptr %ptr + ret void +} + +define void @dont_coalesce_res_v2f64(ptr %ptr) #0 { +; CHECK-LABEL: dont_coalesce_res_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl get_v2f64 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <2 x double> @get_v2f64() + %vec = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> poison, <2 x double> %res, i64 0) + store <vscale x 2 x double> %vec, ptr %ptr + ret void +} + +declare half @get_f16() +declare float @get_f32() +declare double @get_f64() +declare <1 x half> @get_v1f16() +declare <1 x float> @get_v1f32() +declare <1 x double> @get_v1f64() +declare <8 x half> @get_v8f16() +declare <4 x float> @get_v4f32() +declare <2 x double> @get_v2f64() + +declare i8 @get_i8() +declare i16 @get_i16() +declare i32 @get_i32() +declare i64 @get_i64() +declare <1 x i8> @get_v1i8() +declare <1 x i16> @get_v1i16() +declare <1 x i32> @get_v1i32() +declare <1 x i64> @get_v1i64() +declare <16 x i8> @get_v16i8() +declare <8 x i16> @get_v8i16() +declare <4 x i32> @get_v4i32() +declare <2 x i64> @get_v2i64() + +declare void @use_f16(half) +declare void @use_f32(float) +declare void @use_f64(double) +declare void @use_v1f16(<1 x half>) +declare void @use_v1f32(<1 x float>) +declare void @use_v1f64(<1 x double>) +declare void @use_v8f16(<8 x half>) +declare void @use_v8bf16(<8 x bfloat>) +declare void @use_v4f32(<4 x float>) +declare void @use_v2f64(<2 x double>) + +declare void @use_i8(i8) +declare void @use_i16(i16) +declare void @use_i32(i32) +declare void @use_i64(i64) +declare void @use_v1i8(<1 x i8>) +declare void @use_v1i16(<1 x i16>) +declare void @use_v1i32(<1 x i32>) +declare void @use_v1i64(<1 x i64>) +declare void @use_v16i8(<16 x i8>) +declare void @use_v8i16(<8 x i16>) +declare void @use_v4i32(<4 x i32>) +declare void @use_v2i64(<2 x i64>) +declare void @use_v8i1(<8 x i1>) + +declare <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.v8i1(<vscale x 8 x i1>, <8 x i1>, i64) +declare
<vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64) +declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64) +declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64) +declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64) +declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half>, <8 x half>, i64) +declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64) +declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64) +declare <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double>, <2 x double>, i64) + +attributes #0 = { nounwind "aarch64_pstate_sm_enabled" "target-features"="+sve,+sme" } diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index 47d465cc320b5c491f04d8855006f424ca0e019c..49d7ae006bb1cf5d6da18969c9bee7595529187b 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -4,20 +4,21 @@ declare void @private_za_callee() ; Ensure that we don't use tail call optimization when a lazy-save is required. -define void @disable_tailcallopt() "aarch64_pstate_za_shared" nounwind { +define void @disable_tailcallopt() "aarch64_inout_za" nounwind { ; CHECK-LABEL: disable_tailcallopt: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za @@ -36,20 +37,21 @@ define void @disable_tailcallopt() "aarch64_pstate_za_shared" nounwind { } ; Ensure we set up and restore the lazy save correctly for instructions which are lowered to lib calls -define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwind { +define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-LABEL: f128_call_za: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]!
// 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: msub x8, x9, x9, x8 +; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur x9, [x29, #-16] -; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: smstart za diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll new file mode 100644 index 0000000000000000000000000000000000000000..a50e44892c614416f083ee0c6ed17818d2ca9ac1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll @@ -0,0 +1,128 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s + +declare void @normal_callee(); +declare void @streaming_callee() "aarch64_pstate_sm_enabled"; +declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"; + +define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: sm_body_sm_compatible_simple: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: tbnz w8, #0, .LBB0_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: fmov s0, wzr +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: tbnz w8, #0, .LBB0_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + ret float zeroinitializer +} + +define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: sm_body_caller_sm_compatible_caller_normal_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbnz w19, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: tbnz w19, #0, .LBB1_4 +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @normal_callee() + ret void +} + +; Function Attrs: nounwind uwtable vscale_range(1,16) +define void @streaming_body_and_streaming_compatible_interface_multi_basic_block(i32 noundef %x) "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: streaming_body_and_streaming_compatible_interface_multi_basic_block: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: and x19, x0, #0x1 +; CHECK-NEXT: tbnz w19, #0, .LBB2_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB2_2: // %entry +; CHECK-NEXT: cbz w8, .LBB2_6 +; CHECK-NEXT: // %bb.3: // %if.else +; CHECK-NEXT: bl streaming_compatible_callee +; CHECK-NEXT: tbnz w19, #0, .LBB2_5 +; CHECK-NEXT: // %bb.4: // %if.else +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB2_5: // %if.else +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_6: // %if.then +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: tbnz w19, #0, .LBB2_8 +; CHECK-NEXT: // %bb.7: // %if.then +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB2_8: // %if.then +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret +entry: + %cmp = icmp eq i32 %x, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + tail call void @normal_callee() + br label %return + +if.else: ; preds = %entry + tail call void @streaming_compatible_callee() + br label %return + +return: ; preds = %if.else, %if.then + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll index 
7561e75d04efa0f9427682d14fd618b7b77cc6a9..384e7dc94f785f2d77281cccbc0d9d3edc2977ae 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s declare void @normal_callee(); declare void @streaming_callee() "aarch64_pstate_sm_enabled"; @@ -237,25 +237,31 @@ declare void @use_ptr(ptr) "aarch64_pstate_sm_compatible" define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_body" { ; CHECK-LABEL: call_to_intrinsic_without_chain: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill -; CHECK-NEXT: str d0, [sp, #72] // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldr d0, [sp, #72] // 8-byte Folded Reload +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl cos -; CHECK-NEXT: str d0, [sp, #72] // 8-byte Folded Spill +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d0, [sp, #72] // 8-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret entry: %0 = call fast double @llvm.cos.f64(double %x) @@ -306,3 +312,24 @@ for.cond.cleanup: ret float %add } + +define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: disable_tailcallopt: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: bl streaming_compatible_callee +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + tail call void @streaming_compatible_callee(); + ret void; +} diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index 214a5ce38f276bbb22e24ce46723a619b239c7f9..d1c542bb1c5ce696199aa033e630731da99db35b 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -43,12 +43,12 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp ; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 -; CHECK-NEXT: tbz x19, #0, .LBB1_2 +; CHECK-NEXT: tbz w19, #0, .LBB1_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: bl normal_callee -; CHECK-NEXT: tbz x19, #0, .LBB1_4 +; CHECK-NEXT: tbz w19, #0, .LBB1_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB1_4: @@ -79,12 +79,12 @@ define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_c ; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x19, x0, #0x1 -; CHECK-NEXT: tbnz x19, #0, .LBB2_2 +; CHECK-NEXT: tbnz w19, #0, .LBB2_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: bl streaming_callee -; CHECK-NEXT: tbnz x19, #0, .LBB2_4 +; CHECK-NEXT: tbnz w19, #0, .LBB2_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB2_4: @@ -129,30 +129,37 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: and x19, x0, #0x1 -; CHECK-NEXT: tbz x19, #0, .LBB4_2 +; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: tbz w19, #0, .LBB4_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm ; CHECK-NEXT: .LBB4_2: -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl normal_callee_vec_arg -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: tbz x19, #0, 
.LBB4_4 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: tbz w19, #0, .LBB4_4 ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB4_4: -; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z1, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload @@ -204,14 +211,14 @@ define @streaming_compatible_with_scalable_vectors( @streaming_compatible_with_scalable_vectors( @streaming_compatible_with_scalable_vectors( @streaming_compatible_with_predicate_vectors( @streaming_compatible_with_predicate_vectors( @streaming_compatible_with_predicate_vectors(&1 | FileCheck %s + +declare void @normal_callee() +declare void @streaming_callee() "aarch64_pstate_sm_enabled" +declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible" + +; CHECK: remark: <unknown>:0:0: call from 'normal_caller_streaming_callee' to 'streaming_callee' requires a streaming mode transition +define void @normal_caller_streaming_callee() nounwind { + call void @streaming_callee() + ret void; +} + +; CHECK: remark: <unknown>:0:0: call from 'streaming_caller_normal_callee' to 'normal_callee' requires a streaming mode transition +define void @streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_enabled" { + call void @normal_callee() + ret void; +} + +; CHECK-NOT: streaming_caller_streaming_callee +define void @streaming_caller_streaming_callee() nounwind "aarch64_pstate_sm_enabled" { + call void @streaming_callee() + ret void; +} + +; CHECK-NOT: streaming_caller_streaming_compatible_callee +define void @streaming_caller_streaming_compatible_callee() nounwind "aarch64_pstate_sm_enabled" { + call void @streaming_compatible_callee() + ret void; +} + +; CHECK: remark: <unknown>:0:0: call from 'call_to_function_pointer_streaming_enabled' to 'unknown callee' requires a streaming mode transition +define void @call_to_function_pointer_streaming_enabled(ptr %p) nounwind { + call void %p() "aarch64_pstate_sm_enabled" + ret void +} + +; CHECK: remark: <unknown>:0:0: call from 'smstart_clobber_simdfp' to 'streaming_callee' requires a streaming mode transition +define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind { + call void @streaming_callee() + ret <4 x i32> %x; +} + +; CHECK: remark: <unknown>:0:0: call from 'smstart_clobber_sve' to 'streaming_callee' requires a streaming mode transition +define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind { + call void @streaming_callee() + ret <vscale x 4 x i32> %x; +} + +; CHECK: remark: <unknown>:0:0: call from 'smstart_clobber_sve_duplicate' to 'streaming_callee' requires a streaming mode transition +; CHECK: remark: <unknown>:0:0: call from 'smstart_clobber_sve_duplicate' to 'streaming_callee' requires a streaming mode transition +define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x) nounwind { + call void @streaming_callee() + call void @streaming_callee() + ret <vscale x 4 x i32> %x; +} + +; CHECK: remark: <unknown>:0:0: call from 'call_to_intrinsic_without_chain' to 'cos' requires a streaming mode transition +define double @call_to_intrinsic_without_chain(double %x) nounwind
"aarch64_pstate_sm_enabled" { +entry: + %res = call fast double @llvm.cos.f64(double %x) + %res.fadd = fadd fast double %res, %x + ret double %res.fadd +} + +declare double @llvm.cos.f64(double) + +; CHECK: remark: :0:0: call from 'disable_tailcallopt' to 'streaming_callee' requires a streaming mode transition +define void @disable_tailcallopt() nounwind { + tail call void @streaming_callee() + ret void; +} + +; CHECK: remark: :0:0: call from 'call_to_non_streaming_pass_sve_objects' to 'foo' requires a streaming mode transition +define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone %ptr) #0 { +entry: + %Data1 = alloca , align 16 + %Data2 = alloca , align 16 + %Data3 = alloca , align 16 + %0 = tail call i64 @llvm.aarch64.sme.cntsb() + call void @foo(ptr noundef nonnull %Data1, ptr noundef nonnull %Data2, ptr noundef nonnull %Data3, i64 noundef %0) + %1 = load , ptr %Data1, align 16 + %vecext = extractelement %1, i64 0 + ret i8 %vecext +} + +declare i64 @llvm.aarch64.sme.cntsb() + +declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef) + +attributes #0 = { nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" } diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll index 2f4c907c8724777e70f4e541b9edb735ea560532..9270a6392c41074e54dfe763622c0cb0808e0542 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s ; This file tests the following combinations related to streaming-enabled functions: ; [ ] N -> S (Normal -> Streaming) @@ -187,18 +187,6 @@ define @smstart_clobber_sve( %x) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -215,6 +203,18 @@ define @smstart_clobber_sve( %x) #0 { ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded 
Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -267,18 +267,6 @@ define @smstart_clobber_sve_duplicate( %x) ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -295,6 +283,18 @@ define @smstart_clobber_sve_duplicate( %x) ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -313,20 +313,19 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta ; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: str d0, [sp, #88] // 8-byte Folded Spill +; CHECK-NEXT: stp d0, d0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm -; CHECK-NEXT: ldr d0, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldr d0, [sp] // 8-byte Folded Reload ; CHECK-NEXT: bl cos -; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; 
CHECK-NEXT: str d0, [sp] // 8-byte Folded Spill ; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldp d1, d0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: fadd d0, d1, d0 ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr d0, [sp, #88] // 8-byte Folded Reload -; CHECK-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload -; CHECK-NEXT: fadd d0, d1, d0 ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret entry: @@ -369,15 +368,11 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: addvl x9, sp, #2 -; CHECK-NEXT: addvl x10, sp, #1 -; CHECK-NEXT: mov x11, sp +; CHECK-NEXT: rdsvl x3, #1 +; CHECK-NEXT: addvl x0, sp, #2 +; CHECK-NEXT: addvl x1, sp, #1 +; CHECK-NEXT: mov x2, sp ; CHECK-NEXT: smstop sm -; CHECK-NEXT: mov x0, x9 -; CHECK-NEXT: mov x1, x10 -; CHECK-NEXT: mov x2, x11 -; CHECK-NEXT: mov x3, x8 ; CHECK-NEXT: bl foo ; CHECK-NEXT: smstart sm ; CHECK-NEXT: ptrue p0.b @@ -401,9 +396,38 @@ entry: ret i8 %vecext } +define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) #1 { +; CHECK-LABEL: call_to_non_streaming_pass_args: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: stp s1, s0, [sp, #24] // 8-byte Folded Spill +; CHECK-NEXT: stp d3, d2, [sp, #8] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d3, d2, [sp, #8] // 16-byte Folded Reload +; CHECK-NEXT: ldp s1, s0, [sp, #24] // 8-byte Folded Reload +; CHECK-NEXT: bl bar +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #112 +; CHECK-NEXT: ret +entry: + call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) + ret void +} + declare i64 @llvm.aarch64.sme.cntsb() declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef) +declare void @bar(ptr noundef, i64 noundef, i64 noundef, i32 noundef, i32 noundef, float noundef, float noundef, double noundef, double noundef) attributes #0 = { nounwind "target-features"="+sve" } attributes #1 = { nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" } diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll new file mode 100644 index 
0000000000000000000000000000000000000000..0e6de90289e90a7aae49182c916d094fe628079f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll @@ -0,0 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s | FileCheck %s + +target triple = "aarch64" + +; This function would normally scavenge a stackslot from the callee-save +; area, which would lead to spilling 's0' to that stackslot before the +; smstop and filling it with 'addvl + <offset>' after the smstop because +; the frame-pointer is not available. +; This would not be valid, since the vector-length has changed so 'addvl' +; cannot be used. This is testing that the stackslot-scavenging is disabled +; when there are any streaming-mode-changing call-sequences in the +; function. +define void @test_no_stackslot_scavenging(float %f) #0 { +; CHECK-LABEL: test_no_stackslot_scavenging: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x24, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload +; CHECK-NEXT: bl use_f +; CHECK-NEXT: smstart sm +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ldp x30, x24, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + %ptr = alloca <vscale x 16 x i8> + call void asm sideeffect "", "~{x24}"() nounwind + call void @use_f(float %f) + ret void +} + +declare void @use_f(float) + +attributes #0 = { nounwind "target-features"="+sme" "aarch64_pstate_sm_enabled" } diff --git a/llvm/test/CodeGen/AArch64/sme-write-vg.ll b/llvm/test/CodeGen/AArch64/sme-write-vg.ll new file mode 100644 index 0000000000000000000000000000000000000000..577606d45484f13b0a72c11f72f03c956184be4d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-write-vg.ll @@ -0,0 +1,24 @@ +; RUN: llc -mattr=+sme -stop-after=finalize-isel < %s | FileCheck %s + +target triple = "aarch64" + +; Check that we don't define VG for 'smstart za' and 'smstop za' +define void @smstart_za() "aarch64_new_za" nounwind { + ; CHECK-LABEL: name: smstart_za + ; CHECK-NOT: implicit-def {{[^,]*}}$vg + ret void +} + +; Check that we do define VG for 'smstart sm' and 'smstop sm' +define void @smstart_sm() nounwind { + ; CHECK-LABEL: name: smstart_sm + ; CHECK: MSRpstatesvcrImm1 1, 1, + ; CHECK-SAME: implicit-def {{[^,]*}}$vg + ; CHECK: MSRpstatesvcrImm1 1, 0, + ; CHECK-SAME: implicit-def {{[^,]*}}$vg + call void @require_sm() + ret void +} + +declare void @require_sm() "aarch64_pstate_sm_enabled" +declare void @require_za() "aarch64_inout_za" diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir index
951dbc72defc82a0c9f7bb03528281250f612018..08d55f07b7c4bf89dcbde14d0d635e0f071880b6 100644 --- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir +++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir @@ -7,12 +7,14 @@ target triple = "aarch64--linux-gnu" define aarch64_sve_vector_pcs void @spills_fills_stack_id_ppr() #0 { entry: unreachable } + define aarch64_sve_vector_pcs void @spills_fills_stack_id_pnr() #1 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr3() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4() #0 { entry: unreachable } attributes #0 = { nounwind "target-features"="+sve" } + attributes #1 = { nounwind "target-features"="+sve2p1" } ... --- @@ -59,6 +61,49 @@ body: | RET_ReallyLR ... --- +name: spills_fills_stack_id_pnr +tracksRegLiveness: true +registers: + - { id: 0, class: pnr } +stack: +liveins: + - { reg: '$pn0', virtual-reg: '%0' } +body: | + bb.0.entry: + liveins: $pn0 + + ; CHECK-LABEL: name: spills_fills_stack_id_pnr + ; CHECK: stack: + ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2 + ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '' + + ; EXPAND-LABEL: name: spills_fills_stack_id_pnr + ; EXPAND: STR_PXI $p0, $sp, 7 + ; EXPAND: $p0 = LDR_PXI $sp, 7, implicit-def $pn0 + + %0:pnr = COPY $pn0 + + $pn0 = IMPLICIT_DEF + $pn1 = IMPLICIT_DEF + $pn2 = IMPLICIT_DEF + $pn3 = IMPLICIT_DEF + $pn4 = IMPLICIT_DEF + $pn5 = IMPLICIT_DEF + $pn6 = IMPLICIT_DEF + $pn7 = IMPLICIT_DEF + $pn8 = IMPLICIT_DEF + $pn9 = IMPLICIT_DEF + $pn10 = IMPLICIT_DEF + $pn11 = IMPLICIT_DEF + $pn12 = IMPLICIT_DEF + $pn13 = IMPLICIT_DEF + $pn14 = IMPLICIT_DEF + $pn15 = IMPLICIT_DEF + + $pn0 = COPY %0 + RET_ReallyLR +... 
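+# A minimal sketch of the spill/fill pattern exercised above, assuming the
+# same pnr register class and STR_PXI/LDR_PXI expansion shown in the CHECK
+# and EXPAND lines: keeping one PNR value live while $pn0-$pn15 are all
+# clobbered leaves the allocator no predicate register, so the value must
+# round-trip through a 2-byte scalable-vector spill slot.
+#
+#   %0:pnr = COPY $pn0    # value kept live across the clobbers
+#   $pn0 = IMPLICIT_DEF   # ...likewise $pn1 through $pn15...
+#   $pn0 = COPY %0        # filled back from the scalable spill slot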
+--- name: spills_fills_stack_id_zpr tracksRegLiveness: true registers: diff --git a/llvm/test/CodeGen/AArch64/sve-alloca.ll b/llvm/test/CodeGen/AArch64/sve-alloca.ll index 90eed07c242bf6833b883b0b3672fb4bb03f76db..565311d8e3f8fe456d626a990d29a3bbed395fd9 100644 --- a/llvm/test/CodeGen/AArch64/sve-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-alloca.ll @@ -66,18 +66,6 @@ define void @foo( %dst, i1 %cond) { ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: bl bar ; CHECK-NEXT: addvl sp, x29, #-18 -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -94,6 +82,18 @@ define void @foo( %dst, i1 %cond) { ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x28, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll index a97649523565dc0f3fdb7053ee49cbdb5b68f68d..c551fcf4da70dc3160c32277f793114406dbca69 100644 --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -567,18 +567,6 @@ define @sve_caller_non_sve_callee_high_range( @sve_caller_non_sve_callee_high_range( @sve_ret_caller_non_sve_callee_high_range() { ; CHECK-NEXT: addvl x0, sp, #1 ; CHECK-NEXT: bl non_sve_callee_high_range ; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr 
p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -687,6 +675,18 @@ define @sve_ret_caller_non_sve_callee_high_range() { ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir b/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir index 3fbb7889c8b79b83b9162388f7299e017722583d..6063c8dfc792c955fec2d147a4c8f5a7dbaabed1 100644 --- a/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir +++ b/llvm/test/CodeGen/AArch64/sve-localstackalloc.mir @@ -48,7 +48,7 @@ body: | %2:gpr32 = COPY $w0 %1:zpr = COPY $z1 %0:zpr = COPY $z0 - %5:ppr_3b = PTRUE_B 31 + %5:ppr_3b = PTRUE_B 31, implicit $vg %6:gpr64sp = ADDXri %stack.0, 0, 0 ST1B_IMM %1, %5, %6, 1 :: (store unknown-size, align 16) ST1B_IMM %0, %5, %stack.0, 0 :: (store unknown-size into %stack.0, align 16) diff --git a/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir b/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir index b76fe7821b6c691a410e76648105f6499819084c..8395a7619fbb467bc9a52ddcd48933d52d7128ed 100644 --- a/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir +++ b/llvm/test/CodeGen/AArch64/sve-pfalse-machine-cse.mir @@ -11,15 +11,15 @@ body: | ; CHECK: liveins: $p0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:ppr = COPY $p0 - ; CHECK-NEXT: [[PFALSE:%[0-9]+]]:ppr = PFALSE + ; CHECK-NEXT: [[PFALSE:%[0-9]+]]:ppr = PFALSE implicit $vg ; CHECK-NEXT: [[UZP1_PPP_B:%[0-9]+]]:ppr = UZP1_PPP_B [[COPY]], [[PFALSE]] ; CHECK-NEXT: [[UZP1_PPP_B1:%[0-9]+]]:ppr = UZP1_PPP_B killed [[UZP1_PPP_B]], [[PFALSE]] ; CHECK-NEXT: $p0 = COPY [[UZP1_PPP_B1]] ; CHECK-NEXT: RET_ReallyLR implicit $p0 %0:ppr = COPY $p0 - %2:ppr = PFALSE + %2:ppr = PFALSE implicit $vg %3:ppr = UZP1_PPP_B %0, %2 - %4:ppr = PFALSE + %4:ppr = PFALSE implicit $vg %5:ppr = UZP1_PPP_B killed %3, %4 $p0 = COPY %5 RET_ReallyLR implicit $p0 diff --git 
a/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir b/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir index df0e50de4d1a7a8523c0e73c33160415f9fe089b..ae70f91a4ec6414c25f78c513fd09e671d6254a5 100644 --- a/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir +++ b/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir @@ -26,7 +26,7 @@ body: | name: expand_mls_to_msb body: | bb.0: - renamable $p0 = PTRUE_B 31 + renamable $p0 = PTRUE_B 31, implicit $vg renamable $z0 = MLS_ZPZZZ_B_UNDEF killed renamable $p0, killed renamable $z2, killed renamable $z0, killed renamable $z1 RET_ReallyLR implicit $z0 ... @@ -36,7 +36,7 @@ body: | name: expand_mla_to_mad body: | bb.0: - renamable $p0 = PTRUE_B 31 + renamable $p0 = PTRUE_B 31, implicit $vg renamable $z0 = MLA_ZPZZZ_B_UNDEF killed renamable $p0, killed renamable $z2, killed renamable $z0, killed renamable $z1 RET_ReallyLR implicit $z0 ... diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir index 81318aa5c2a58da84c29221bbb44f08096d9c0ee..5169113697dc603c3f8417f1fe7130b81631df51 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir @@ -174,7 +174,7 @@ body: | %1:zpr = COPY $z0 %0:ppr_3b = COPY $p0 %2:ppr = CMPEQ_PPzZI_B %0, %1, 0, implicit-def dead $nzcv - %3:ppr = PTRUE_B 31 + %3:ppr = PTRUE_B 31, implicit $vg PTEST_PP killed %3, killed %2, implicit-def $nzcv %4:gpr32 = COPY $wzr %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv @@ -409,14 +409,14 @@ body: | ; CHECK-LABEL: name: cmpeq_imm_nxv16i8_ptest_not_all_active ; CHECK: %2:ppr = CMPEQ_PPzZI_B %0, %1, 0, implicit-def dead $nzcv - ; CHECK-NEXT: %3:ppr = PTRUE_B 0 + ; CHECK-NEXT: %3:ppr = PTRUE_B 0, implicit $vg ; CHECK-NEXT: PTEST_PP killed %3, killed %2, implicit-def $nzcv ; CHECK-NEXT: %4:gpr32 = COPY $wzr ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:zpr = COPY $z0 %0:ppr_3b = COPY $p0 %2:ppr = CMPEQ_PPzZI_B %0, %1, 0, implicit-def dead $nzcv - %3:ppr = PTRUE_B 0 + %3:ppr = PTRUE_B 0, implicit $vg PTEST_PP killed %3, killed %2, implicit-def $nzcv %4:gpr32 = COPY $wzr %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv @@ -446,14 +446,14 @@ body: | ; CHECK-LABEL: name: cmpeq_imm_nxv16i8_ptest_of_halfs ; CHECK: %2:ppr = CMPEQ_PPzZI_B %0, %1, 0, implicit-def dead $nzcv - ; CHECK-NEXT: %3:ppr = PTRUE_H 31 + ; CHECK-NEXT: %3:ppr = PTRUE_H 31, implicit $vg ; CHECK-NEXT: PTEST_PP killed %3, killed %2, implicit-def $nzcv ; CHECK-NEXT: %4:gpr32 = COPY $wzr ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:zpr = COPY $z0 %0:ppr_3b = COPY $p0 %2:ppr = CMPEQ_PPzZI_B %0, %1, 0, implicit-def dead $nzcv - %3:ppr = PTRUE_H 31 + %3:ppr = PTRUE_H 31, implicit $vg PTEST_PP killed %3, killed %2, implicit-def $nzcv %4:gpr32 = COPY $wzr %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilege.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilege.mir index 8f7467d99154e3285d3b1dee155aa32889737bfe..c1d9dfff73447ce19e91785eafaeda48bd6ad740 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilege.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilege.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEGE_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -63,7 
+63,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEGE_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -98,7 +98,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEGE_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -133,7 +133,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEGE_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -168,7 +168,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEGE_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -203,7 +203,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEGE_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -238,7 +238,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEGE_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -273,7 +273,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEGE_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 0 + %2:ppr = PTRUE_B 0, implicit $vg %3:ppr = WHILEGE_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -353,7 +353,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %3:ppr = WHILEGE_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -393,7 +393,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %3:ppr = WHILEGE_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -433,7 +433,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %3:ppr = WHILEGE_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilegt.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilegt.mir index 217d984560e36c472cf64e848c701290e49b0380..c6df21f85db77354f3f924059e16972d31302a86 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilegt.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilegt.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr =
PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEGT_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -63,7 +63,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEGT_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -98,7 +98,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEGT_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -133,7 +133,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEGT_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -168,7 +168,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEGT_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -203,7 +203,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEGT_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -238,7 +238,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEGT_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -273,7 +273,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEGT_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 1 + %2:ppr = PTRUE_H 1, implicit $vg %3:ppr = WHILEGT_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -353,7 +353,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEGT_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -393,7 +393,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %3:ppr = WHILEGT_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -433,7 +433,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %3:ppr = WHILEGT_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehi.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehi.mir index
8d6f466c6b735d1c0ae6b25d1283aaa9bd1ef6c7..7d8aed3c325a01863c90b48f5fd245eeabdbc048 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehi.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehi.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEHI_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -63,7 +63,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEHI_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -98,7 +98,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEHI_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -133,7 +133,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEHI_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -168,7 +168,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEHI_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -203,7 +203,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEHI_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -238,7 +238,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEHI_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -273,7 +273,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEHI_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 29 + %2:ppr = PTRUE_S 29, implicit $vg %3:ppr = WHILEHI_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -353,7 +353,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEHI_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -393,7 +393,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %3:ppr = WHILEHI_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -433,7 +433,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %3:ppr = WHILEHI_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, 
implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehs.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehs.mir index da76a30f843b7ae15ead69a1874387467efcb4b5..f4dbfbc3db1cab61218c155c28455af1f51f0634 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehs.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilehs.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEHS_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -63,7 +63,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEHS_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -98,7 +98,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEHS_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -133,7 +133,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEHS_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -168,7 +168,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEHS_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -203,7 +203,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEHS_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -238,7 +238,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEHS_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -273,7 +273,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEHS_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 30 + %2:ppr = PTRUE_D 30, implicit $vg %3:ppr = WHILEHS_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -353,7 +353,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEHS_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -393,7 +393,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %3:ppr = WHILEHS_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -433,7 +433,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 
%0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %3:ppr = WHILEHS_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilele.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilele.mir index 32954d593c1ddd21eb2dc180684f9707d455c608..dc2265490cb55e4c81c2c92937e798407393728e 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilele.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilele.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELE_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -63,7 +63,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELE_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -98,7 +98,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILELE_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -133,7 +133,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILELE_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -168,7 +168,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILELE_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -203,7 +203,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILELE_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -238,7 +238,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILELE_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -273,7 +273,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILELE_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 7 + %2:ppr = PTRUE_B 7, implicit $vg %3:ppr = WHILELE_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -353,7 +353,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %3:ppr = WHILELE_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -393,7 +393,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %3:ppr = WHILELE_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed 
%2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -433,7 +433,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %3:ppr = WHILELE_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir index cca0ab8ef210b99577dc1bdd2f040bed94b0fefe..4d66e3e57da8b39390ef70b1ffebf06ec7db543b 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelo.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELO_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -63,7 +63,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELO_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -98,7 +98,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILELO_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -133,7 +133,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILELO_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -168,7 +168,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILELO_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -203,7 +203,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILELO_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -238,7 +238,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILELO_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -273,7 +273,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILELO_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 6 + %2:ppr = PTRUE_H 6, implicit $vg %3:ppr = WHILELO_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -353,7 +353,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELO_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -393,7 +393,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv 
%1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %3:ppr = WHILELO_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -433,7 +433,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %3:ppr = WHILELO_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilels.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilels.mir index 4bae3a1986f451b05556ceeb4e57d80f22a1fc4f..ea02f8c70ef86c68d3dfed7f140650636b28f78a 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilels.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilels.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELS_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -63,7 +63,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELS_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -98,7 +98,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILELS_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -133,7 +133,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILELS_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -168,7 +168,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILELS_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -203,7 +203,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILELS_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -238,7 +238,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILELS_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -273,7 +273,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILELS_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 5 + %2:ppr = PTRUE_S 5, implicit $vg %3:ppr = WHILELS_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -353,7 +353,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELS_PWW_S %0, %1, implicit-def dead 
$nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -393,7 +393,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %3:ppr = WHILELS_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -433,7 +433,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %3:ppr = WHILELS_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelt.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelt.mir index 3c6a9e21b4c6c15afe88f29b0b4102507abdbb97..d08781f203e328043c7eaf7080c954e5c9a0c841 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelt.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilelt.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELT_PWW_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -63,7 +63,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELT_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -98,7 +98,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILELT_PWW_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -133,7 +133,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILELT_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -168,7 +168,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILELT_PWW_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -203,7 +203,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILELT_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -238,7 +238,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILELT_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -273,7 +273,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILELT_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -313,7 +313,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_D 4 + %2:ppr = PTRUE_D 4, implicit $vg %3:ppr = WHILELT_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -353,7 +353,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, 
implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILELT_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -393,7 +393,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %3:ppr = WHILELT_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -433,7 +433,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr32 = COPY $w1 %0:gpr32 = COPY $w0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %3:ppr = WHILELT_PWW_D %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilerw.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilerw.mir index 27cdf593df776f6635a01d17b74fdfbbf6d49074..d800009b9537f3d79bd1f4b0672161f8eecfec92 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilerw.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilerw.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILERW_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -65,7 +65,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILERW_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -100,7 +100,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILERW_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -135,7 +135,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILERW_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -175,7 +175,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 0 + %2:ppr = PTRUE_B 0, implicit $vg %3:ppr = WHILERW_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -215,7 +215,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %3:ppr = WHILERW_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -255,7 +255,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %3:ppr = WHILERW_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -295,7 +295,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %3:ppr = WHILERW_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git 
a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilewr.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilewr.mir index 3b49b1ec2c804578d7bf18990e8dbb8524954493..9f8b7c3197ecf2286653a0894a48c34abb1f032e 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilewr.mir +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-whilewr.mir @@ -30,7 +30,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 31 + %2:ppr = PTRUE_B 31, implicit $vg %3:ppr = WHILEWR_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -65,7 +65,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %4:ppr = WHILEWR_PXX_H %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -100,7 +100,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %4:ppr = WHILEWR_PXX_S %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -135,7 +135,7 @@ body: | ; CHECK-NOT: PTEST %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %4:ppr = WHILEWR_PXX_D %0, %1, implicit-def dead $nzcv PTEST_PP %2, %4, implicit-def $nzcv %6:gpr32 = COPY $wzr @@ -175,7 +175,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_B 0 + %2:ppr = PTRUE_B 0, implicit $vg %3:ppr = WHILEWR_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -215,7 +215,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_H 31 + %2:ppr = PTRUE_H 31, implicit $vg %3:ppr = WHILEWR_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -255,7 +255,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_S 31 + %2:ppr = PTRUE_S 31, implicit $vg %3:ppr = WHILEWR_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr @@ -295,7 +295,7 @@ body: | ; CHECK-NEXT: %5:gpr32 = CSINCWr %4, $wzr, 0, implicit $nzcv %1:gpr64 = COPY $x1 %0:gpr64 = COPY $x0 - %2:ppr = PTRUE_D 31 + %2:ppr = PTRUE_D 31, implicit $vg %3:ppr = WHILEWR_PXX_B %0, %1, implicit-def dead $nzcv PTEST_PP killed %2, killed %3, implicit-def $nzcv %4:gpr32 = COPY $wzr diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index 0bd767cd43655794686e1e85edb5e990e35d8f10..4ae4e6538703caebbf68546b9a52633632d448e6 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -1122,3 +1122,201 @@ define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) { store <4 x double> %res, ptr %b ret void } + +define half @scvtf_i16_f16(ptr %0) { +; CHECK-LABEL: scvtf_i16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrsh w8, [x0] +; CHECK-NEXT: scvtf h0, w8 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = sitofp i16 %2 to half + ret half %3 +} + +define float @scvtf_i16_f32(ptr %0) { +; CHECK-LABEL: scvtf_i16_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrsh w8, 
[x0] +; CHECK-NEXT: scvtf s0, w8 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = sitofp i16 %2 to float + ret float %3 +} + +define double @scvtf_i16_f64(ptr %0) { +; CHECK-LABEL: scvtf_i16_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrsh w8, [x0] +; CHECK-NEXT: scvtf d0, w8 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = sitofp i16 %2 to double + ret double %3 +} + +define half @scvtf_i32_f16(ptr %0) { +; CHECK-LABEL: scvtf_i32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: scvtf h0, w8 +; CHECK-NEXT: ret + %2 = load i32, ptr %0, align 64 + %3 = sitofp i32 %2 to half + ret half %3 +} + +define float @scvtf_i32_f32(ptr %0) { +; CHECK-LABEL: scvtf_i32_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: scvtf s0, w8 +; CHECK-NEXT: ret + %2 = load i32, ptr %0, align 64 + %3 = sitofp i32 %2 to float + ret float %3 +} + +define double @scvtf_i32_f64(ptr %0) { +; CHECK-LABEL: scvtf_i32_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: scvtf d0, w8 +; CHECK-NEXT: ret + %2 = load i32, ptr %0, align 64 + %3 = sitofp i32 %2 to double + ret double %3 +} + +define half @scvtf_i64_f16(ptr %0) { +; CHECK-LABEL: scvtf_i64_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: scvtf h0, x8 +; CHECK-NEXT: ret + %2 = load i64, ptr %0, align 64 + %3 = sitofp i64 %2 to half + ret half %3 +} + +define float @scvtf_i64_f32(ptr %0) { +; CHECK-LABEL: scvtf_i64_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: scvtf s0, x8 +; CHECK-NEXT: ret + %2 = load i64, ptr %0, align 64 + %3 = sitofp i64 %2 to float + ret float %3 +} + +define double @scvtf_i64_f64(ptr %0) { +; CHECK-LABEL: scvtf_i64_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: scvtf d0, x8 +; CHECK-NEXT: ret + %2 = load i64, ptr %0, align 64 + %3 = sitofp i64 %2 to double + ret double %3 +} + +define half @ucvtf_i16_f16(ptr %0) { +; CHECK-LABEL: ucvtf_i16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ucvtf h0, w8 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = uitofp i16 %2 to half + ret half %3 +} + +define float @ucvtf_i16_f32(ptr %0) { +; CHECK-LABEL: ucvtf_i16_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ucvtf s0, s0 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = uitofp i16 %2 to float + ret float %3 +} + +define double @ucvtf_i16_f64(ptr %0) { +; CHECK-LABEL: ucvtf_i16_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ucvtf d0, d0 +; CHECK-NEXT: ret + %2 = load i16, ptr %0, align 64 + %3 = uitofp i16 %2 to double + ret double %3 +} + +define half @ucvtf_i32_f16(ptr %0) { +; CHECK-LABEL: ucvtf_i32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: ucvtf h0, w8 +; CHECK-NEXT: ret + %2 = load i32, ptr %0, align 64 + %3 = uitofp i32 %2 to half + ret half %3 +} + +define float @ucvtf_i32_f32(ptr %0) { +; CHECK-LABEL: ucvtf_i32_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: ucvtf s0, w8 +; CHECK-NEXT: ret + %2 = load i32, ptr %0, align 64 + %3 = uitofp i32 %2 to float + ret float %3 +} + +define double @ucvtf_i32_f64(ptr %0) { +; CHECK-LABEL: ucvtf_i32_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ucvtf d0, d0 +; CHECK-NEXT: ret + %2 = load i32, ptr %0, align 64 + %3 = uitofp i32 %2 to double + ret double %3 +} + +define half @ucvtf_i64_f16(ptr %0) { +; CHECK-LABEL: ucvtf_i64_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: ucvtf h0, x8 +; CHECK-NEXT: ret + %2 = load 
i64, ptr %0, align 64 + %3 = uitofp i64 %2 to half + ret half %3 +} + +define float @ucvtf_i64_f32(ptr %0) { +; CHECK-LABEL: ucvtf_i64_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: ucvtf s0, x8 +; CHECK-NEXT: ret + %2 = load i64, ptr %0, align 64 + %3 = uitofp i64 %2 to float + ret float %3 +} + +define double @ucvtf_i64_f64(ptr %0) { +; CHECK-LABEL: ucvtf_i64_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: ucvtf d0, x8 +; CHECK-NEXT: ret + %2 = load i64, ptr %0, align 64 + %3 = uitofp i64 %2 to double + ret double %3 +} diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll index e746770e29a2f5105ac4edec3f65871b2e022680..050c7353b97d9129223e9666491906982549555b 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll @@ -335,6 +335,58 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) { ret <4 x double> %load } +define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) { +; CHECK-LABEL: masked_load_zext_v3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: strh w3, [sp, #12] +; CHECK-NEXT: strh w2, [sp, #10] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: strh w1, [sp, #8] +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: ldr d1, [sp, #8] +; CHECK-NEXT: and z0.d, z1.d, z0.d +; CHECK-NEXT: lsl z0.h, z0.h, #15 +; CHECK-NEXT: asr z0.h, z0.h, #15 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer) + %extend = zext <3 x i16> %load_value to <3 x i32> + ret <3 x i32> %extend; +} + +define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) { +; CHECK-LABEL: masked_load_sext_v3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: strh w3, [sp, #12] +; CHECK-NEXT: strh w2, [sp, #10] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: strh w1, [sp, #8] +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI14_0] +; CHECK-NEXT: ldr d1, [sp, #8] +; CHECK-NEXT: and z0.d, z1.d, z0.d +; CHECK-NEXT: lsl z0.h, z0.h, #15 +; CHECK-NEXT: asr z0.h, z0.h, #15 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer) + %extend = sext <3 x i16> %load_value to <3 x i32> + ret <3 x i32> %extend; +} + declare <4 x i8> @llvm.masked.load.v4i8(ptr, i32, <4 x i1>, <4 x i8>) declare <8 x i8> @llvm.masked.load.v8i8(ptr, i32, <8 x i1>, <8 x i8>) declare <16 x i8> @llvm.masked.load.v16i8(ptr, i32, <16 x i1>, <16 x i8>) @@ -351,3 +403,5 @@ declare <8 x float> @llvm.masked.load.v8f32(ptr, i32, <8 x i1>, <8 x float>) declare <2 x double> @llvm.masked.load.v2f64(ptr, i32, <2 x i1>, <2 x double>) declare <4 x double> @llvm.masked.load.v4f64(ptr, i32, <4 x i1>, <4 x double>) + +declare <3 x i16> 
@llvm.masked.load.v3i16.p0(ptr, i32, <3 x i1>, <3 x i16>) diff --git a/llvm/test/CodeGen/AArch64/sve-tailcall.ll b/llvm/test/CodeGen/AArch64/sve-tailcall.ll index 58135e44fa912dbf2861f062604ddc2a1e15d694..4ddf007768fd2c0a04e58ee3715ace0d7156df89 100644 --- a/llvm/test/CodeGen/AArch64/sve-tailcall.ll +++ b/llvm/test/CodeGen/AArch64/sve-tailcall.ll @@ -83,18 +83,6 @@ define i32 @sve_caller_non_sve_callee( %arg) nounwind { ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: bl non_sve_callee -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -111,6 +99,18 @@ define i32 @sve_caller_non_sve_callee( %arg) nounwind { ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -158,18 +158,6 @@ define i32 @sve_caller_non_sve_callee_fastcc( %arg) nounwind { ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: bl non_sve_callee -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] 
// 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -186,6 +174,18 @@ define i32 @sve_caller_non_sve_callee_fastcc( %arg) nounwind { ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/unwind-preserved.ll b/llvm/test/CodeGen/AArch64/unwind-preserved.ll index f3c4d217e6fcaa1d060f4579b5857426d2a34858..822be14faaeb1f83e51e89926ebbf0d892b63a9c 100644 --- a/llvm/test/CodeGen/AArch64/unwind-preserved.ll +++ b/llvm/test/CodeGen/AArch64/unwind-preserved.ll @@ -63,18 +63,6 @@ define @invoke_callee_may_throw_sve( %v) uw ; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -91,6 +79,18 @@ define @invoke_callee_may_throw_sve( %v) uw ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload 
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: .cfi_restore z8 @@ -112,18 +112,6 @@ define @invoke_callee_may_throw_sve( %v) uw ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -140,6 +128,18 @@ define @invoke_callee_may_throw_sve( %v) uw ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: .cfi_restore z8 @@ -215,18 +215,6 @@ define @invoke_callee_may_throw_sve( %v) uw ; GISEL-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: addvl sp, sp, #2 ; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG -; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 
2-byte Folded Reload -; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -243,6 +231,18 @@ define @invoke_callee_may_throw_sve( %v) uw ; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; GISEL-NEXT: addvl sp, sp, #18 ; GISEL-NEXT: .cfi_def_cfa wsp, 16 ; GISEL-NEXT: .cfi_restore z8 @@ -264,18 +264,6 @@ define @invoke_callee_may_throw_sve( %v) uw ; GISEL-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; GISEL-NEXT: addvl sp, sp, #2 ; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG -; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -292,6 +280,18 @@ define @invoke_callee_may_throw_sve( %v) uw ; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p12, [sp, 
#7, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; GISEL-NEXT:    addvl sp, sp, #18
 ; GISEL-NEXT:    .cfi_def_cfa wsp, 16
 ; GISEL-NEXT:    .cfi_restore z8
diff --git a/llvm/test/MC/AArch64/SME/directives-negative.s b/llvm/test/MC/AArch64/SME/directives-negative.s
index 3df90b3016869ee39ef24f0545daa6cf04ef8b8f..123c3a383d71efc31e43c8e6594c8cbdc089b622 100644
--- a/llvm/test/MC/AArch64/SME/directives-negative.s
+++ b/llvm/test/MC/AArch64/SME/directives-negative.s
@@ -2,9 +2,9 @@
 .arch_extension sme
 .arch_extension nosme
-smstart
+zero {za}
 // CHECK: error: instruction requires: sme
-// CHECK-NEXT: smstart
+// CHECK-NEXT: zero {za}
 
 .arch_extension sme-f64f64
 .arch_extension nosme-f64f64
@@ -20,9 +20,9 @@ addha za0.d, p0/m, p0/m, z0.d
 .arch armv9-a+sme
 .arch armv9-a+nosme
-smstart
+zero {za}
 // CHECK: error: instruction requires: sme
-// CHECK-NEXT: smstart
+// CHECK-NEXT: zero {za}
 
 .arch armv9-a+sme-f64f64
 .arch armv9-a+nosme-f64f64
diff --git a/llvm/test/MC/AArch64/SME/smstart.s b/llvm/test/MC/AArch64/SME/smstart.s
index 1628a279bb3535d01936c55ab067139f928f2c5c..07a696d28df8c8185cf0317e4d0f9c66221aa6ac 100644
--- a/llvm/test/MC/AArch64/SME/smstart.s
+++ b/llvm/test/MC/AArch64/SME/smstart.s
@@ -1,38 +1,28 @@
 // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s \
 // RUN:   | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:   | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme < %s \
 // RUN:   | llvm-objdump -d --mattr=+sme - | FileCheck %s --check-prefix=CHECK-INST
 // RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme < %s \
-// RUN:   | llvm-objdump -d --mattr=-sme - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// RUN:   | llvm-objdump -d --mattr=-sme - | FileCheck %s --check-prefix=CHECK-INST
 
 smstart
 // CHECK-INST: smstart
 // CHECK-ENCODING: [0x7f,0x47,0x03,0xd5]
-// CHECK-ERROR: instruction requires: sme
-// CHECK-UNKNOWN: d503477f msr S0_3_C4_C7_3, xzr
 
 smstart sm
 // CHECK-INST: smstart sm
 // CHECK-ENCODING: [0x7f,0x43,0x03,0xd5]
-// CHECK-ERROR: instruction requires: sme
-// CHECK-UNKNOWN: d503437f msr S0_3_C4_C3_3, xzr
 
 smstart za
 // CHECK-INST: smstart za
 // CHECK-ENCODING: [0x7f,0x45,0x03,0xd5]
-// CHECK-ERROR: instruction requires: sme
-// CHECK-UNKNOWN: d503457f msr S0_3_C4_C5_3, xzr
 
 smstart SM
 // CHECK-INST: smstart sm
 // CHECK-ENCODING: [0x7f,0x43,0x03,0xd5]
-// CHECK-ERROR: instruction requires: sme
-// CHECK-UNKNOWN: d503437f msr S0_3_C4_C3_3, xzr
 
 smstart ZA
 // CHECK-INST: smstart za
 // CHECK-ENCODING: [0x7f,0x45,0x03,0xd5]
-// CHECK-ERROR: instruction requires: sme
-// CHECK-UNKNOWN: d503457f msr S0_3_C4_C5_3, xzr
diff --git a/llvm/test/MC/AArch64/SME/smstop.s b/llvm/test/MC/AArch64/SME/smstop.s
index b7c21d939e9a9db44c865f0bcc141f64ca9ad10c..df9d1118d96ffffbc3ae91398558986dbefe9d12 100644
--- a/llvm/test/MC/AArch64/SME/smstop.s
+++ b/llvm/test/MC/AArch64/SME/smstop.s
@@ -1,38 +1,28 @@
 // RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s \
 // RUN:   | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:   | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme < %s \
 // RUN:   | llvm-objdump -d --mattr=+sme - | FileCheck %s --check-prefix=CHECK-INST
 // RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme < %s \
-// RUN:   | llvm-objdump -d --mattr=-sme - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// RUN:   | llvm-objdump -d --mattr=-sme - | FileCheck %s --check-prefix=CHECK-INST
 
 smstop
 // CHECK-INST: smstop
 // CHECK-ENCODING: [0x7f,0x46,0x03,0xd5]
-// CHECK-ERROR: instruction requires: sme
-// CHECK-UNKNOWN: d503467f msr S0_3_C4_C6_3, xzr
 
 smstop sm
 // CHECK-INST: smstop sm
 // CHECK-ENCODING: [0x7f,0x42,0x03,0xd5]
-// CHECK-ERROR: instruction requires: sme
-// CHECK-UNKNOWN: d503427f msr S0_3_C4_C2_3, xzr
 
 smstop za
 // CHECK-INST: smstop za
 // CHECK-ENCODING: [0x7f,0x44,0x03,0xd5]
-// CHECK-ERROR: instruction requires: sme
-// CHECK-UNKNOWN: d503447f msr S0_3_C4_C4_3, xzr
 
 smstop SM
 // CHECK-INST: smstop sm
 // CHECK-ENCODING: [0x7f,0x42,0x03,0xd5]
-// CHECK-ERROR: instruction requires: sme
-// CHECK-UNKNOWN: d503427f msr S0_3_C4_C2_3, xzr
 
 smstop ZA
 // CHECK-INST: smstop za
 // CHECK-ENCODING: [0x7f,0x44,0x03,0xd5]
-// CHECK-ERROR: instruction requires: sme
-// CHECK-UNKNOWN: d503447f msr S0_3_C4_C4_3, xzr
diff --git a/llvm/test/MC/AArch64/SME/system-regs.s b/llvm/test/MC/AArch64/SME/system-regs.s
index 48033d026c44bde5b28693fde41de3b22a049bf8..093b64a70fb9316f47b09d40dfb5fe9bf5293466 100644
--- a/llvm/test/MC/AArch64/SME/system-regs.s
+++ b/llvm/test/MC/AArch64/SME/system-regs.s
@@ -119,37 +119,37 @@ msr SVCRSM, #0
 // CHECK-INST: smstop sm
 // CHECK-ENCODING: [0x7f,0x42,0x03,0xd5]
 // CHECK-ERROR: expected writable system register or pstate
-// CHECK-UNKNOWN: d503427f msr S0_3_C4_C2_3, xzr
+// CHECK-UNKNOWN: d503427f smstop sm
 
 msr SVCRSM, #1
 // CHECK-INST: smstart
 // CHECK-ENCODING: [0x7f,0x43,0x03,0xd5]
 // CHECK-ERROR: expected writable system register or pstate
-// CHECK-UNKNOWN: d503437f msr S0_3_C4_C3_3, xzr
+// CHECK-UNKNOWN: d503437f smstart
 
 msr SVCRZA, #0
 // CHECK-INST: smstop za
 // CHECK-ENCODING: [0x7f,0x44,0x03,0xd5]
 // CHECK-ERROR: expected writable system register or pstate
-// CHECK-UNKNOWN: d503447f msr S0_3_C4_C4_3, xzr
+// CHECK-UNKNOWN: d503447f smstop za
 
 msr SVCRZA, #1
 // CHECK-INST: smstart za
 // CHECK-ENCODING: [0x7f,0x45,0x03,0xd5]
 // CHECK-ERROR: expected writable system register or pstate
-// CHECK-UNKNOWN: d503457f msr S0_3_C4_C5_3, xzr
+// CHECK-UNKNOWN: d503457f smstart za
 
 msr SVCRSMZA, #0
 // CHECK-INST: smstop
 // CHECK-ENCODING: [0x7f,0x46,0x03,0xd5]
 // CHECK-ERROR: expected writable system register or pstate
-// CHECK-UNKNOWN: d503467f msr S0_3_C4_C6_3, xzr
+// CHECK-UNKNOWN: d503467f smstop
 
 msr SVCRSMZA, #1
 // CHECK-INST: smstart
 // CHECK-ENCODING: [0x7f,0x47,0x03,0xd5]
 // CHECK-ERROR: expected writable system register or pstate
-// CHECK-UNKNOWN: d503477f msr S0_3_C4_C7_3, xzr
+// CHECK-UNKNOWN: d503477f smstart
 
 msr TPIDR2_EL0, x3
 // CHECK-INST: msr TPIDR2_EL0, x3
diff --git a/llvm/test/MC/AArch64/SVE/cntp-diagnostics.s b/llvm/test/MC/AArch64/SVE/cntp-diagnostics.s
index 9c4173f50dee0193eff34676d313edc4d9578c0f..99b38289ade736f1088745c6ffd3486fa1646485 100644
--- a/llvm/test/MC/AArch64/SVE/cntp-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE/cntp-diagnostics.s
@@ -1,7 +1,7 @@
 // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve 2>&1 < %s| FileCheck %s
 
 cntp sp
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 // CHECK-NEXT: cntp sp
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE/predicate-as-counter-aliases.s b/llvm/test/MC/AArch64/SVE/predicate-as-counter-aliases.s
index bca2cf913ff64a2818ed1e1e3e2e184675236417..fd6c59888e097d3d5085907f3e2863fcdd9d7887 100644
--- a/llvm/test/MC/AArch64/SVE/predicate-as-counter-aliases.s
+++ b/llvm/test/MC/AArch64/SVE/predicate-as-counter-aliases.s
@@ -1,50 +1,50 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2p1 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve < %s \
 // RUN:   | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2 < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s \
 // RUN:   | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
 // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p1 < %s \
-// RUN:   | llvm-objdump --no-print-imm-hex -d --mattr=+sve2p1 - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2p1 < %s \
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve < %s \
+// RUN:   | llvm-objdump --no-print-imm-hex -d --mattr=+sve - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve < %s \
 // RUN:   | llvm-objdump --no-print-imm-hex -d --mattr=-sve - | FileCheck %s --check-prefix=CHECK-UNKNOWN
 
 ldr pn0, [x0]
 // CHECK-INST: ldr p0, [x0]
 // CHECK-ENCODING: [0x00,0x00,0x80,0x85]
-// CHECK-ERROR: instruction requires: sme2 or sve2p1
+// CHECK-ERROR: instruction requires: sve or sme
 // CHECK-UNKNOWN: 85800000
 
 ldr pn5, [x10, #255, mul vl]
 // CHECK-INST: ldr p5, [x10, #255, mul vl]
 // CHECK-ENCODING: [0x45,0x1d,0x9f,0x85]
-// CHECK-ERROR: instruction requires: sme2 or sve2p1
+// CHECK-ERROR: instruction requires: sve or sme
 // CHECK-UNKNOWN: 859f1d45
 
 str pn0, [x0]
 // CHECK-INST: str p0, [x0]
 // CHECK-ENCODING: [0x00,0x00,0x80,0xe5]
-// CHECK-ERROR: instruction requires: sme2 or sve2p1
+// CHECK-ERROR: instruction requires: sve or sme
 // CHECK-UNKNOWN: e5800000
 
 str pn5, [x10, #255, mul vl]
 // CHECK-INST: str p5, [x10, #255, mul vl]
 // CHECK-ENCODING: [0x45,0x1d,0x9f,0xe5]
-// CHECK-ERROR: instruction requires: sme2 or sve2p1
+// CHECK-ERROR: instruction requires: sve or sme
 // CHECK-UNKNOWN: e59f1d45
 
 mov pn0.b, pn0.b
 // CHECK-INST: mov p0.b, p0.b
 // CHECK-ENCODING: [0x00,0x40,0x80,0x25]
-// CHECK-ERROR: instruction requires: sme2 or sve2p1
+// CHECK-ERROR: instruction requires: sve or sme
 // CHECK-UNKNOWN: 25804000
 
 pfalse pn15.b
 // CHECK-INST: pfalse p15.b
 // CHECK-ENCODING: [0x0f,0xe4,0x18,0x25]
-// CHECK-ERROR: instruction requires: sme2 or sve2p1
+// CHECK-ERROR: instruction requires: sve or sme
 // CHECK-UNKNOWN: 2518e40f
diff --git a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll
index 4e7074ec75dd63ee5263734090056d588be37f71..580fabfd00cb449ad3124c27b09778ebd145f9f4 100644
--- a/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll
+++ b/llvm/test/Transforms/Inline/AArch64/sme-pstatesm-attrs.ll
@@ -1,45 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
 ; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+sme -S -passes=inline | FileCheck %s
 
-declare void @inlined_body() "aarch64_pstate_sm_compatible";
+declare i32 @llvm.vscale.i32()
 
-;
-; Define some functions that will be called by the functions below.
-; These just call a '...body()' function. If we see the call to one of
-; these functions being replaced by '...body()', then we know it has been
-; inlined.
-;
+; Define some functions that merely call llvm.vscale.i32(), which will be called
+; by the other functions below. If we see the call to one of these functions
+; being replaced by 'llvm.vscale()', then we know it has been inlined.
 
-define void @normal_callee() {
+define i32 @normal_callee() {
+; CHECK-LABEL: define i32 @normal_callee
+; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES]]
+;
 entry:
-  call void @inlined_body()
-  ret void
+  %res = call i32 @llvm.vscale.i32()
+  ret i32 %res
 }
 
-define void @streaming_callee() "aarch64_pstate_sm_enabled" {
+define i32 @streaming_callee() "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: define i32 @streaming_callee
+; CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES]]
+;
 entry:
-  call void @inlined_body()
-  ret void
+  %res = call i32 @llvm.vscale.i32()
+  ret i32 %res
 }
 
-define void @locally_streaming_callee() "aarch64_pstate_sm_body" {
+define i32 @locally_streaming_callee() "aarch64_pstate_sm_body" {
+; CHECK-LABEL: define i32 @locally_streaming_callee
+; CHECK-SAME: () #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES]]
+;
 entry:
-  call void @inlined_body()
-  ret void
+  %res = call i32 @llvm.vscale.i32()
+  ret i32 %res
 }
 
-define void @streaming_compatible_callee() "aarch64_pstate_sm_compatible" {
+define i32 @streaming_compatible_callee() "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: define i32 @streaming_compatible_callee
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES]]
+;
 entry:
-  call void @inlined_body()
-  ret void
+  %res = call i32 @llvm.vscale.i32()
+  ret i32 %res
 }
 
-define void @streaming_compatible_locally_streaming_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
+declare i32 @llvm.vscale()
+
+define i32 @streaming_compatible_locally_streaming_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
+; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_callee
+; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES]]
+;
 entry:
-  call void @inlined_body()
-  ret void
+  %res = call i32 @llvm.vscale()
+  ret i32 %res
 }
 
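;
; (Illustration, not part of the test: with the scheme above, a successful
; inline shows up in the FileCheck output as the intrinsic call appearing
; directly in the caller. A minimal sketch, assuming the same -passes=inline
; pipeline and a hypothetical @caller:
;
;   define i32 @caller() {                      ; before inlining
;   entry:
;     %res = call i32 @normal_callee()
;     ret i32 %res
;   }
;
;   define i32 @caller() {                      ; after inlining
;   entry:
;     %res.i = call i32 @llvm.vscale.i32()
;     ret i32 %res.i
;   }
; )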
-;
 ; Now test that inlining only happens when their streaming modes match.
 ; Test for a number of combinations, where:
 ; N   Normal-interface (PSTATE.SM=0 on entry/exit)
@@ -57,12 +86,16 @@ entry:
 ; [ ] N -> SC
 ; [ ] N -> N + B
 ; [ ] N -> SC + B
-define void @normal_caller_normal_callee_inline() {
-; CHECK-LABEL: @normal_caller_normal_callee_inline(
-; CHECK: call void @inlined_body()
+define i32 @normal_caller_normal_callee_inline() {
+; CHECK-LABEL: define i32 @normal_caller_normal_callee_inline
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES_I]]
+;
 entry:
-  call void @normal_callee()
-  ret void
+  %res = call i32 @normal_callee()
+  ret i32 %res
 }
 
 ; [ ] N -> N
@@ -70,12 +103,16 @@ entry:
 ; [ ] N -> SC
 ; [ ] N -> N + B
 ; [ ] N -> SC + B
-define void @normal_caller_streaming_callee_dont_inline() {
-; CHECK-LABEL: @normal_caller_streaming_callee_dont_inline(
-; CHECK: call void @streaming_callee()
+define i32 @normal_caller_streaming_callee_dont_inline() {
+; CHECK-LABEL: define i32 @normal_caller_streaming_callee_dont_inline
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @streaming_callee()
+; CHECK-NEXT:    ret i32 [[RES]]
+;
 entry:
-  call void @streaming_callee()
-  ret void
+  %res = call i32 @streaming_callee()
+  ret i32 %res
 }
 
 ; [ ] N -> N
@@ -83,12 +120,16 @@ entry:
 ; [x] N -> SC
 ; [ ] N -> N + B
 ; [ ] N -> SC + B
-define void @normal_caller_streaming_compatible_callee_inline() {
-; CHECK-LABEL: @normal_caller_streaming_compatible_callee_inline(
-; CHECK: call void @inlined_body()
+define i32 @normal_caller_streaming_compatible_callee_inline() {
+; CHECK-LABEL: define i32 @normal_caller_streaming_compatible_callee_inline
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES_I]]
+;
 entry:
-  call void @streaming_compatible_callee()
-  ret void
+  %res = call i32 @streaming_compatible_callee()
+  ret i32 %res
 }
 
 ; [ ] N -> N
@@ -96,12 +137,16 @@ entry:
 ; [ ] N -> SC
 ; [x] N -> N + B
 ; [ ] N -> SC + B
-define void @normal_caller_locally_streaming_callee_dont_inline() {
-; CHECK-LABEL: @normal_caller_locally_streaming_callee_dont_inline(
-; CHECK: call void @locally_streaming_callee()
+define i32 @normal_caller_locally_streaming_callee_dont_inline() {
+; CHECK-LABEL: define i32 @normal_caller_locally_streaming_callee_dont_inline
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @locally_streaming_callee()
+; CHECK-NEXT:    ret i32 [[RES]]
+;
 entry:
-  call void @locally_streaming_callee()
-  ret void
+  %res = call i32 @locally_streaming_callee()
  ret i32 %res
 }
 
 ; [ ] N -> N
@@ -109,12 +154,16 @@ entry:
 ; [ ] N -> SC
 ; [ ] N -> N + B
 ; [x] N -> SC + B
-define void @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline() {
-; CHECK-LABEL: @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline(
-; CHECK: call void @streaming_compatible_locally_streaming_callee()
+define i32 @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline() {
+; CHECK-LABEL: define i32 @normal_caller_streaming_compatible_locally_streaming_callee_dont_inline
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @streaming_compatible_locally_streaming_callee()
+; CHECK-NEXT:    ret i32 [[RES]]
+;
 entry:
-  call void @streaming_compatible_locally_streaming_callee()
-  ret void
+  %res = call i32 @streaming_compatible_locally_streaming_callee()
+  ret i32 %res
 }
 
 ; [x] S -> N
@@ -122,12 +171,16 @@ entry:
 ; [ ] S -> SC
 ; [ ] S -> N + B
 ; [ ] S -> SC + B
-define void @streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_enabled" {
-; CHECK-LABEL: @streaming_caller_normal_callee_dont_inline(
-; CHECK: call void @normal_callee()
+define i32 @streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: define i32 @streaming_caller_normal_callee_dont_inline
+; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @normal_callee()
+; CHECK-NEXT:    ret i32 [[RES]]
+;
 entry:
-  call void @normal_callee()
-  ret void
+  %res = call i32 @normal_callee()
+  ret i32 %res
 }
 
 ; [ ] S -> N
@@ -135,12 +188,16 @@ entry:
 ; [ ] S -> SC
 ; [ ] S -> N + B
 ; [ ] S -> SC + B
-define void @streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_enabled" {
-; CHECK-LABEL: @streaming_caller_streaming_callee_inline(
-; CHECK: call void @inlined_body()
+define i32 @streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: define i32 @streaming_caller_streaming_callee_inline
+; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES_I]]
+;
 entry:
-  call void @streaming_callee()
-  ret void
+  %res = call i32 @streaming_callee()
+  ret i32 %res
 }
 
 ; [ ] S -> N
@@ -148,12 +205,16 @@ entry:
 ; [x] S -> SC
 ; [ ] S -> N + B
 ; [ ] S -> SC + B
-define void @streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_enabled" {
-; CHECK-LABEL: @streaming_caller_streaming_compatible_callee_inline(
-; CHECK: call void @inlined_body()
+define i32 @streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: define i32 @streaming_caller_streaming_compatible_callee_inline
+; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES_I]]
+;
 entry:
-  call void @streaming_compatible_callee()
-  ret void
+  %res = call i32 @streaming_compatible_callee()
+  ret i32 %res
 }
 
 ; [ ] S -> N
@@ -161,12 +222,16 @@ entry:
 ; [ ] S -> SC
 ; [x] S -> N + B
 ; [ ] S -> SC + B
-define void @streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" {
-; CHECK-LABEL: @streaming_caller_locally_streaming_callee_inline(
-; CHECK: call void @inlined_body()
+define i32 @streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: define i32 @streaming_caller_locally_streaming_callee_inline
+; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES_I]]
+;
 entry:
-  call void @locally_streaming_callee()
-  ret void
+  %res = call i32 @locally_streaming_callee()
+  ret i32 %res
 }
 
 ; [ ] S -> N
@@ -174,12 +239,16 @@ entry:
 ; [ ] S -> SC
 ; [ ] S -> N + B
 ; [x] S -> SC + B
-define void @streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" {
-; CHECK-LABEL: @streaming_caller_streaming_compatible_locally_streaming_callee_inline(
-; CHECK: call void @inlined_body()
+define i32 @streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: define i32 @streaming_caller_streaming_compatible_locally_streaming_callee_inline
+; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES_I]]
+;
 entry:
-  call void @streaming_compatible_locally_streaming_callee()
-  ret void
+  %res = call i32 @streaming_compatible_locally_streaming_callee()
+  ret i32 %res
 }
 
 ; [x] N + B -> N
@@ -187,12 +256,16 @@ entry:
 ; [ ] N + B -> SC
 ; [ ] N + B -> N + B
 ; [ ] N + B -> SC + B
-define void @locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_body" {
-; CHECK-LABEL: @locally_streaming_caller_normal_callee_dont_inline(
-; CHECK: call void @normal_callee()
+define i32 @locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_body" {
+; CHECK-LABEL: define i32 @locally_streaming_caller_normal_callee_dont_inline
+; CHECK-SAME: () #[[ATTR3]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @normal_callee()
+; CHECK-NEXT:    ret i32 [[RES]]
+;
 entry:
-  call void @normal_callee()
-  ret void
+  %res = call i32 @normal_callee()
+  ret i32 %res
 }
 
 ; [ ] N + B -> N
@@ -200,12 +273,16 @@ entry:
 ; [ ] N + B -> SC
 ; [ ] N + B -> N + B
 ; [ ] N + B -> SC + B
-define void @locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_body" {
-; CHECK-LABEL: @locally_streaming_caller_streaming_callee_inline(
-; CHECK: call void @inlined_body()
+define i32 @locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_body" {
+; CHECK-LABEL: define i32 @locally_streaming_caller_streaming_callee_inline
+; CHECK-SAME: () #[[ATTR3]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES_I]]
+;
 entry:
-  call void @streaming_callee()
-  ret void
+  %res = call i32 @streaming_callee()
+  ret i32 %res
 }
 
 ; [ ] N + B -> N
@@ -213,12 +290,16 @@ entry:
 ; [x] N + B -> SC
 ; [ ] N + B -> N + B
 ; [ ] N + B -> SC + B
-define void @locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_body" {
-; CHECK-LABEL: @locally_streaming_caller_streaming_compatible_callee_inline(
-; CHECK: call void @inlined_body()
+define i32 @locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_body" {
+; CHECK-LABEL: define i32 @locally_streaming_caller_streaming_compatible_callee_inline
+; CHECK-SAME: () #[[ATTR3]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES_I]]
+;
 entry:
-  call void @streaming_compatible_callee()
-  ret void
+  %res = call i32 @streaming_compatible_callee()
+  ret i32 %res
 }
 
 ; [ ] N + B -> N
@@ -226,12 +307,16 @@ entry:
 ; [ ] N + B -> SC
 ; [x] N + B -> N + B
 ; [ ] N + B -> SC + B
-define void @locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_body" {
-; CHECK-LABEL: @locally_streaming_caller_locally_streaming_callee_inline(
-; CHECK: call void @inlined_body()
+define i32 @locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_body" {
+; CHECK-LABEL: define i32 @locally_streaming_caller_locally_streaming_callee_inline
+; CHECK-SAME: () #[[ATTR3]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES_I]]
+;
 entry:
-  call void @locally_streaming_callee()
-  ret void
+  %res = call i32 @locally_streaming_callee()
+  ret i32 %res
 }
 
 ; [ ] N + B -> N
@@ -239,12 +324,16 @@ entry:
 ; [ ] N + B -> SC
 ; [ ] N + B -> N + B
 ; [x] N + B -> SC + B
-define void @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_body" {
-; CHECK-LABEL: @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline(
-; CHECK: call void @inlined_body()
+define i32 @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline() "aarch64_pstate_sm_body" {
+; CHECK-LABEL: define i32 @locally_streaming_caller_streaming_compatible_locally_streaming_callee_inline
+; CHECK-SAME: () #[[ATTR3]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES_I]]
+;
 entry:
-  call void @streaming_compatible_locally_streaming_callee()
-  ret void
+  %res = call i32 @streaming_compatible_locally_streaming_callee()
+  ret i32 %res
 }
 
 ; [x] SC -> N
@@ -252,12 +341,16 @@ entry:
 ; [ ] SC -> SC
 ; [ ] SC -> N + B
 ; [ ] SC -> SC + B
-define void @streaming_compatible_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" {
-; CHECK-LABEL: @streaming_compatible_caller_normal_callee_dont_inline(
-; CHECK: call void @normal_callee()
+define i32 @streaming_compatible_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: define i32 @streaming_compatible_caller_normal_callee_dont_inline
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @normal_callee()
+; CHECK-NEXT:    ret i32 [[RES]]
+;
 entry:
-  call void @normal_callee()
-  ret void
+  %res = call i32 @normal_callee()
+  ret i32 %res
 }
 
 ; [ ] SC -> N
@@ -265,12 +358,16 @@ entry:
 ; [ ] SC -> SC
 ; [ ] SC -> N + B
 ; [ ] SC -> SC + B
-define void @streaming_compatible_caller_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" {
-; CHECK-LABEL: @streaming_compatible_caller_streaming_callee_dont_inline(
-; CHECK: call void @streaming_callee()
+define i32 @streaming_compatible_caller_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: define i32 @streaming_compatible_caller_streaming_callee_dont_inline
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @streaming_callee()
+; CHECK-NEXT:    ret i32 [[RES]]
+;
 entry:
-  call void @streaming_callee()
-  ret void
+  %res = call i32 @streaming_callee()
+  ret i32 %res
 }
 
 ; [ ] SC -> N
@@ -278,12 +375,16 @@ entry:
 ; [x] SC -> SC
 ; [ ] SC -> N + B
 ; [ ] SC -> SC + B
-define void @streaming_compatible_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" {
-; CHECK-LABEL: @streaming_compatible_caller_streaming_compatible_callee_inline(
-; CHECK: call void @inlined_body()
+define i32 @streaming_compatible_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: define i32 @streaming_compatible_caller_streaming_compatible_callee_inline
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES_I]]
+;
 entry:
-  call void @streaming_compatible_callee()
-  ret void
+  %res = call i32 @streaming_compatible_callee()
+  ret i32 %res
 }
 
 ; [ ] SC -> N
@@ -291,12 +392,16 @@ entry:
 ; [ ] SC -> SC
 ; [x] SC -> N + B
 ; [ ] SC -> SC + B
-define void @streaming_compatible_caller_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" {
-; CHECK-LABEL: @streaming_compatible_caller_locally_streaming_callee_dont_inline(
-; CHECK: call void @locally_streaming_callee()
+define i32 @streaming_compatible_caller_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: define i32 @streaming_compatible_caller_locally_streaming_callee_dont_inline
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @locally_streaming_callee()
+; CHECK-NEXT:    ret i32 [[RES]]
+;
 entry:
-  call void @locally_streaming_callee()
-  ret void
+  %res = call i32 @locally_streaming_callee()
+  ret i32 %res
 }
 
 ; [ ] SC -> N
@@ -304,24 +409,32 @@ entry:
 ; [ ] SC -> SC
 ; [ ] SC -> N + B
 ; [x] SC -> SC + B
-define void @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" {
-; CHECK-LABEL: @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline(
-; CHECK: call void @streaming_compatible_locally_streaming_callee()
+define i32 @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline() "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: define i32 @streaming_compatible_caller_streaming_compatible_locally_streaming_callee_dont_inline
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @streaming_compatible_locally_streaming_callee()
+; CHECK-NEXT:    ret i32 [[RES]]
+;
 entry:
-  call void @streaming_compatible_locally_streaming_callee()
-  ret void
+  %res = call i32 @streaming_compatible_locally_streaming_callee()
+  ret i32 %res
 }
 
 ; [x] SC + B -> N
 ; [ ] SC + B -> S
 ; [ ] SC + B -> SC
 ; [ ] SC + B -> N + B
 ; [ ] SC + B -> SC + B
-define void @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
-; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline(
-; CHECK: call void @normal_callee()
+define i32 @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
+; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_normal_callee_dont_inline
+; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @normal_callee()
+; CHECK-NEXT:    ret i32 [[RES]]
+;
 entry:
-  call void @normal_callee()
-  ret void
+  %res = call i32 @normal_callee()
+  ret i32 %res
 }
 
 ; [ ] SC + B -> N
@@ -329,12 +442,16 @@ entry:
 ; [ ] SC + B -> SC
 ; [ ] SC + B -> N + B
 ; [ ] SC + B -> SC + B
-define void @streaming_compatible_locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
-; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_streaming_callee_inline(
-; CHECK: call void @inlined_body()
+define i32 @streaming_compatible_locally_streaming_caller_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
+; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_streaming_callee_inline
+; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES_I]]
+;
 entry:
-  call void @streaming_callee()
-  ret void
+  %res = call i32 @streaming_callee()
+  ret i32 %res
 }
 
 ; [ ] SC + B -> N
@@ -342,12 +459,16 @@ entry:
 ; [x] SC + B -> SC
 ; [ ] SC + B -> N + B
 ; [ ] SC + B -> SC + B
-define void @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
-; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline(
-; CHECK: call void @inlined_body()
+define i32 @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
+; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_streaming_compatible_callee_inline
+; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES_I]]
+;
 entry:
-  call void @streaming_compatible_callee()
-  ret void
+  %res = call i32 @streaming_compatible_callee()
+  ret i32 %res
 }
 
 ; [ ] SC + B -> N
@@ -355,12 +476,16 @@ entry:
 ; [ ] SC + B -> SC
 ; [x] SC + B -> N + B
 ; [ ] SC + B -> SC + B
-define void @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
-; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline(
-; CHECK: call void @inlined_body()
+define i32 @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
+; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_locally_streaming_callee_inline
+; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES_I]]
+;
 entry:
-  call void @locally_streaming_callee()
-  ret void
+  %res = call i32 @locally_streaming_callee()
+  ret i32 %res
 }
 
 ; [ ] SC + B -> N
@@ -368,10 +493,92 @@ entry:
 ; [ ] SC + B -> SC
 ; [ ] SC + B -> N + B
 ; [x] SC + B -> SC + B
-define void @streaming_compatible_locally_streaming_caller_and_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
-; CHECK-LABEL: @streaming_compatible_locally_streaming_caller_and_callee_inline(
-; CHECK: call void @inlined_body()
+define i32 @streaming_compatible_locally_streaming_caller_and_callee_inline() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" {
+; CHECK-LABEL: define i32 @streaming_compatible_locally_streaming_caller_and_callee_inline
+; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES_I:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    ret i32 [[RES_I]]
+;
 entry:
-  call void @streaming_compatible_locally_streaming_callee()
+  %res = call i32 @streaming_compatible_locally_streaming_callee()
+  ret i32 %res
+}
+
+define void @normal_callee_with_inlineasm() {
+; CHECK-LABEL: define void @normal_callee_with_inlineasm
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void asm sideeffect "; inlineasm", ""()
   ret void
 }
+
+define void @streaming_caller_normal_callee_with_inlineasm_dont_inline() "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: define void @streaming_caller_normal_callee_with_inlineasm_dont_inline
+; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @normal_callee_with_inlineasm()
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @normal_callee_with_inlineasm()
+  ret void
+}
+
+define i64 @normal_callee_with_intrinsic_call() {
+; CHECK-LABEL: define i64 @normal_callee_with_intrinsic_call
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @llvm.aarch64.sve.cntb(i32 4)
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+entry:
+  %res = call i64 @llvm.aarch64.sve.cntb(i32 4)
+  ret i64 %res
+}
+
+define i64 @streaming_caller_normal_callee_with_intrinsic_call_dont_inline() "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: define i64 @streaming_caller_normal_callee_with_intrinsic_call_dont_inline
+; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @normal_callee_with_intrinsic_call()
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+entry:
+  %res = call i64 @normal_callee_with_intrinsic_call()
+  ret i64 %res
+}
+
+declare i64 @llvm.aarch64.sve.cntb(i32)
+
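;
; (Illustration, not part of the test: the "dont_inline" cases in this group
; rest on the assumption that a callee body is only safe to inline if it
; behaves the same under the caller's streaming mode. Inline asm is opaque to
; that check, and an intrinsic such as llvm.aarch64.sve.cntb can return a
; different value once PSTATE.SM changes the effective vector length, so the
; inliner is conservative and leaves such calls in place. The same reasoning
; is assumed for the SME support-routine cases below, since routines like
; __arm_sme_state read the current streaming state directly.)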
+define i64 @normal_callee_call_sme_state() {
+; CHECK-LABEL: define i64 @normal_callee_call_sme_state
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call { i64, i64 } @__arm_sme_state()
+; CHECK-NEXT:    [[RES_0:%.*]] = extractvalue { i64, i64 } [[RES]], 0
+; CHECK-NEXT:    ret i64 [[RES_0]]
+;
+entry:
+  %res = call {i64, i64} @__arm_sme_state()
+  %res.0 = extractvalue {i64, i64} %res, 0
  ret i64 %res.0
+}
+
+declare {i64, i64} @__arm_sme_state()
+
+define i64 @streaming_caller_normal_callee_call_sme_state_dont_inline() "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: define i64 @streaming_caller_normal_callee_call_sme_state_dont_inline
+; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = call i64 @normal_callee_call_sme_state()
+; CHECK-NEXT:    ret i64 [[RES]]
+;
+entry:
+  %res = call i64 @normal_callee_call_sme_state()
+  ret i64 %res
+}
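;
; (Illustration, not part of the patch: the ZA-state tests that follow use
; the renamed attributes. As a minimal sketch of the naming scheme, a
; function that sets up a fresh ZA context is marked "aarch64_new_za", while
; one that reads and updates its caller's ZA state is marked
; "aarch64_inout_za". The function names here are hypothetical:
;
;   declare void @setup_fresh_za() "aarch64_new_za"
;   declare void @update_callers_za() "aarch64_inout_za"
; )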
diff --git a/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll b/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll
index bdc9e637fe90c2d1f12c5da0811925ee0c940bf4..816492768cc0f5a146777735b3282080815d39bc 100644
--- a/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll
+++ b/llvm/test/Transforms/Inline/AArch64/sme-pstateza-attrs.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
 ; RUN: opt -mtriple=aarch64-unknown-linux-gnu -mattr=+sme -S -passes=inline < %s | FileCheck %s
 
 declare void @inlined_body()
@@ -10,18 +11,35 @@ declare void @inlined_body()
 ;
 
 define void @nonza_callee() {
+; CHECK-LABEL: define void @nonza_callee
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @inlined_body()
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @inlined_body()
   ret void
 }
 
-define void @shared_za_callee() "aarch64_pstate_za_shared" {
+define void @shared_za_callee() "aarch64_inout_za" {
+; CHECK-LABEL: define void @shared_za_callee
+; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @inlined_body()
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @inlined_body()
   ret void
 }
 
-define void @new_za_callee() "aarch64_pstate_za_new" {
+define void @new_za_callee() "aarch64_new_za" {
+; CHECK-LABEL: define void @new_za_callee
+; CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    call void @inlined_body()
+; CHECK-NEXT:    ret void
+;
   call void @inlined_body()
   ret void
 }
@@ -37,8 +55,12 @@ define void @new_za_callee() "aarch64_pstate_za_new"
 ; [ ] N -> S (This combination is invalid)
 ; [ ] N -> Z
 define void @nonza_caller_nonza_callee_inline() {
-; CHECK-LABEL: @nonza_caller_nonza_callee_inline(
-; CHECK: call void @inlined_body()
+; CHECK-LABEL: define void @nonza_caller_nonza_callee_inline
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @inlined_body()
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @nonza_callee()
   ret void
@@ -48,8 +70,12 @@ entry:
 ; [ ] N -> S (This combination is invalid)
 ; [x] N -> Z
 define void @nonza_caller_new_za_callee_dont_inline() {
-; CHECK-LABEL: @nonza_caller_new_za_callee_dont_inline(
-; CHECK: call void @new_za_callee()
+; CHECK-LABEL: define void @nonza_caller_new_za_callee_dont_inline
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @new_za_callee()
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @new_za_callee()
   ret void
@@ -58,9 +84,13 @@ entry:
 ; [x] Z -> N
 ; [ ] Z -> S
 ; [ ] Z -> Z
-define void @new_za_caller_nonza_callee_dont_inline() "aarch64_pstate_za_new" {
-; CHECK-LABEL: @new_za_caller_nonza_callee_dont_inline(
-; CHECK: call void @nonza_callee()
+define void @new_za_caller_nonza_callee_inline() "aarch64_new_za" {
+; CHECK-LABEL: define void @new_za_caller_nonza_callee_inline
+; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @inlined_body()
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @nonza_callee()
   ret void
@@ -69,9 +99,13 @@ entry:
 ; [ ] Z -> N
 ; [x] Z -> S
 ; [ ] Z -> Z
-define void @new_za_caller_shared_za_callee_inline() "aarch64_pstate_za_new" {
-; CHECK-LABEL: @new_za_caller_shared_za_callee_inline(
-; CHECK: call void @inlined_body()
+define void @new_za_caller_shared_za_callee_inline() "aarch64_new_za" {
+; CHECK-LABEL: define void @new_za_caller_shared_za_callee_inline
+; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @inlined_body()
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @shared_za_callee()
   ret void
@@ -80,9 +114,13 @@ entry:
 ; [ ] Z -> N
 ; [ ] Z -> S
 ; [x] Z -> Z
-define void @new_za_caller_new_za_callee_dont_inline() "aarch64_pstate_za_new" {
-; CHECK-LABEL: @new_za_caller_new_za_callee_dont_inline(
-; CHECK: call void @new_za_callee()
+define void @new_za_caller_new_za_callee_dont_inline() "aarch64_new_za" {
+; CHECK-LABEL: define void @new_za_caller_new_za_callee_dont_inline
+; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @new_za_callee()
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @new_za_callee()
   ret void
@@ -91,9 +129,13 @@ entry:
 ; [x] Z -> N
 ; [ ] Z -> S
 ; [ ] Z -> Z
-define void @shared_za_caller_nonza_callee_dont_inline() "aarch64_pstate_za_shared" {
-; CHECK-LABEL: @shared_za_caller_nonza_callee_dont_inline(
-; CHECK: call void @nonza_callee()
+define void @shared_za_caller_nonza_callee_inline() "aarch64_inout_za" {
+; CHECK-LABEL: define void @shared_za_caller_nonza_callee_inline
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @inlined_body()
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @nonza_callee()
   ret void
@@ -102,9 +144,13 @@ entry:
 ; [ ] S -> N
 ; [x] S -> Z
 ; [ ] S -> S
-define void @shared_za_caller_new_za_callee_dont_inline() "aarch64_pstate_za_shared" {
-; CHECK-LABEL: @shared_za_caller_new_za_callee_dont_inline(
-; CHECK: call void @new_za_callee()
+define void @shared_za_caller_new_za_callee_dont_inline() "aarch64_inout_za" {
+; CHECK-LABEL: define void @shared_za_caller_new_za_callee_dont_inline
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @new_za_callee()
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @new_za_callee()
   ret void
@@ -113,10 +159,78 @@ entry:
 ; [ ] S -> N
 ; [ ] S -> Z
 ; [x] S -> S
-define void @shared_za_caller_shared_za_callee_inline() "aarch64_pstate_za_shared" {
-; CHECK-LABEL: @shared_za_caller_shared_za_callee_inline(
-; CHECK: call void @inlined_body()
+define void @shared_za_caller_shared_za_callee_inline() "aarch64_inout_za" {
+; CHECK-LABEL: define void @shared_za_caller_shared_za_callee_inline
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @inlined_body()
+; CHECK-NEXT:    ret void
+;
 entry:
   call void @shared_za_callee()
   ret void
 }
+
+define void @private_za_callee_call_za_disable() {
+; CHECK-LABEL: define void @private_za_callee_call_za_disable
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:    call void @__arm_za_disable()
+; CHECK-NEXT:    ret void
+;
+  call void @__arm_za_disable()
+  ret void
+}
+
+define void @shared_za_caller_private_za_callee_call_za_disable() "aarch64_inout_za" {
+; CHECK-LABEL: define void @shared_za_caller_private_za_callee_call_za_disable
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:    call void @private_za_callee_call_za_disable()
+; CHECK-NEXT:    ret void
+;
+  call void @private_za_callee_call_za_disable()
+  ret void
+}
+
+define void @private_za_callee_call_tpidr2_save() {
+; CHECK-LABEL: define void @private_za_callee_call_tpidr2_save
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:    call void @__arm_tpidr2_save()
+; CHECK-NEXT:    ret void
+;
+  call void @__arm_tpidr2_save()
+  ret void
+}
+
+define void @shared_za_caller_private_za_callee_call_tpidr2_save_dont_inline() "aarch64_inout_za" {
+; CHECK-LABEL: define void @shared_za_caller_private_za_callee_call_tpidr2_save_dont_inline
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:    call void @private_za_callee_call_tpidr2_save()
+; CHECK-NEXT:    ret void
+;
+  call void @private_za_callee_call_tpidr2_save()
+  ret void
+}
+
+define void @private_za_callee_call_tpidr2_restore(ptr %ptr) {
+; CHECK-LABEL: define void @private_za_callee_call_tpidr2_restore
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @__arm_tpidr2_restore(ptr [[PTR]])
+; CHECK-NEXT:    ret void
+;
+  call void @__arm_tpidr2_restore(ptr %ptr)
+  ret void
+}
+
+define void @shared_za_caller_private_za_callee_call_tpidr2_restore_dont_inline(ptr %ptr) "aarch64_inout_za" {
+; CHECK-LABEL: define void @shared_za_caller_private_za_callee_call_tpidr2_restore_dont_inline
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    call void @private_za_callee_call_tpidr2_restore(ptr [[PTR]])
+; CHECK-NEXT:    ret void
+;
+  call void @private_za_callee_call_tpidr2_restore(ptr %ptr)
+  ret void
+}
+
+declare void @__arm_za_disable()
+declare void @__arm_tpidr2_save()
+declare void @__arm_tpidr2_restore(ptr)
diff --git a/llvm/test/Verifier/sme-attributes.ll b/llvm/test/Verifier/sme-attributes.ll
index 7f788cfd09f07276277483447ed0120c9c4aaddf..c992cd7adcd88c02953a6098aed1b21c03ef1dc9 100644
--- a/llvm/test/Verifier/sme-attributes.ll
+++ b/llvm/test/Verifier/sme-attributes.ll
@@ -3,8 +3,32 @@
 declare void @sm_attrs() "aarch64_pstate_sm_enabled" "aarch64_pstate_sm_compatible";
 ; CHECK: Attributes 'aarch64_pstate_sm_enabled and aarch64_pstate_sm_compatible' are incompatible!
 
-declare void @za_preserved() "aarch64_pstate_za_new" "aarch64_pstate_za_preserved";
-; CHECK: Attributes 'aarch64_pstate_za_new and aarch64_pstate_za_preserved' are incompatible!
+declare void @za_new_preserved() "aarch64_new_za" "aarch64_preserves_za";
+; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive
 
-declare void @za_shared() "aarch64_pstate_za_new" "aarch64_pstate_za_shared";
-; CHECK: Attributes 'aarch64_pstate_za_new and aarch64_pstate_za_shared' are incompatible!
+declare void @za_new_in() "aarch64_new_za" "aarch64_in_za";
+; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive
+
+declare void @za_new_inout() "aarch64_new_za" "aarch64_inout_za";
+; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive
+
+declare void @za_new_out() "aarch64_new_za" "aarch64_out_za";
+; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive
+
+declare void @za_preserved_in() "aarch64_preserves_za" "aarch64_in_za";
+; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive
+
+declare void @za_preserved_inout() "aarch64_preserves_za" "aarch64_inout_za";
+; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive
+
+declare void @za_preserved_out() "aarch64_preserves_za" "aarch64_out_za";
+; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive
+
+declare void @za_in_inout() "aarch64_in_za" "aarch64_inout_za";
+; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive
+
+declare void @za_in_out() "aarch64_in_za" "aarch64_out_za";
+; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive
+
+declare void @za_inout_out() "aarch64_inout_za" "aarch64_out_za";
+; CHECK: Attributes 'aarch64_new_za', 'aarch64_in_za', 'aarch64_out_za', 'aarch64_inout_za' and 'aarch64_preserves_za' are mutually exclusive
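;
; (Illustration, not part of the test: the cases above pair two ZA-state
; markers and expect the verifier to reject each pair. The accepted form is
; exactly one marker per function; these hypothetical declarations would
; verify cleanly:
;
;   declare void @za_in_only() "aarch64_in_za"
;   declare void @za_preserves_only() "aarch64_preserves_za"
; )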
diff --git a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp
index 233c75a138ae8051e682c7254d99eac7f7294831..ab96ee147a9ae5d79b531c2205d5cb24e2c692ae 100644
--- a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp
+++ b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp
@@ -38,25 +38,25 @@ TEST(SMEAttributes, Constructors) {
                   ->getFunction("foo"))
                  .hasStreamingCompatibleInterface());
 
-  ASSERT_TRUE(SA(*parseIR("declare void @foo() \"aarch64_pstate_za_shared\"")
-                  ->getFunction("foo"))
-                 .hasSharedZAInterface());
-
-  ASSERT_TRUE(SA(*parseIR("declare void @foo() \"aarch64_pstate_za_new\"")
+  ASSERT_TRUE(
+      SA(*parseIR("declare void @foo() \"aarch64_in_za\"")->getFunction("foo"))
+          .isInZA());
+  ASSERT_TRUE(
+      SA(*parseIR("declare void @foo() \"aarch64_out_za\"")->getFunction("foo"))
+          .isOutZA());
+  ASSERT_TRUE(SA(*parseIR("declare void @foo() \"aarch64_inout_za\"")
                   ->getFunction("foo"))
-                 .hasNewZAInterface());
-
-  ASSERT_TRUE(SA(*parseIR("declare void @foo() \"aarch64_pstate_za_preserved\"")
+                  .isInOutZA());
+  ASSERT_TRUE(SA(*parseIR("declare void @foo() \"aarch64_preserves_za\"")
                   ->getFunction("foo"))
-                 .preservesZA());
+                  .isPreservesZA());
+  ASSERT_TRUE(
+      SA(*parseIR("declare void @foo() \"aarch64_new_za\"")->getFunction("foo"))
+          .isNewZA());
 
   // Invalid combinations.
   EXPECT_DEBUG_DEATH(SA(SA::SM_Enabled | SA::SM_Compatible),
                      "SM_Enabled and SM_Compatible are mutually exclusive");
-  EXPECT_DEBUG_DEATH(SA(SA::ZA_New | SA::ZA_Shared),
-                     "ZA_New and ZA_Shared are mutually exclusive");
-  EXPECT_DEBUG_DEATH(SA(SA::ZA_New | SA::ZA_Preserved),
-                     "ZA_New and ZA_Preserved are mutually exclusive");
 
   // Test that the set() methods equally check validity.
   EXPECT_DEBUG_DEATH(SA(SA::SM_Enabled).set(SA::SM_Compatible),
@@ -79,22 +79,69 @@ TEST(SMEAttributes, Basics) {
   ASSERT_TRUE(SA(SA::SM_Compatible | SA::SM_Body).hasStreamingBody());
   ASSERT_FALSE(SA(SA::SM_Compatible | SA::SM_Body).hasNonStreamingInterface());
 
-  // Test PSTATE.ZA interfaces.
-  ASSERT_FALSE(SA(SA::ZA_Shared).hasPrivateZAInterface());
-  ASSERT_TRUE(SA(SA::ZA_Shared).hasSharedZAInterface());
-  ASSERT_TRUE(SA(SA::ZA_Shared).hasZAState());
-  ASSERT_FALSE(SA(SA::ZA_Shared).preservesZA());
-  ASSERT_TRUE(SA(SA::ZA_Shared | SA::ZA_Preserved).preservesZA());
-
-  ASSERT_TRUE(SA(SA::ZA_New).hasPrivateZAInterface());
-  ASSERT_TRUE(SA(SA::ZA_New).hasNewZAInterface());
-  ASSERT_TRUE(SA(SA::ZA_New).hasZAState());
-  ASSERT_FALSE(SA(SA::ZA_New).preservesZA());
-
-  ASSERT_TRUE(SA(SA::Normal).hasPrivateZAInterface());
-  ASSERT_FALSE(SA(SA::Normal).hasNewZAInterface());
+  // Test ZA State interfaces
+  SA ZA_In = SA(SA::encodeZAState(SA::StateValue::In));
+  ASSERT_TRUE(ZA_In.isInZA());
+  ASSERT_FALSE(ZA_In.isOutZA());
+  ASSERT_FALSE(ZA_In.isInOutZA());
+  ASSERT_FALSE(ZA_In.isPreservesZA());
+  ASSERT_FALSE(ZA_In.isNewZA());
+  ASSERT_TRUE(ZA_In.sharesZA());
+  ASSERT_TRUE(ZA_In.hasZAState());
+  ASSERT_TRUE(ZA_In.hasSharedZAInterface());
+  ASSERT_FALSE(ZA_In.hasPrivateZAInterface());
+
+  SA ZA_Out = SA(SA::encodeZAState(SA::StateValue::Out));
+  ASSERT_TRUE(ZA_Out.isOutZA());
+  ASSERT_FALSE(ZA_Out.isInZA());
+  ASSERT_FALSE(ZA_Out.isInOutZA());
+  ASSERT_FALSE(ZA_Out.isPreservesZA());
+  ASSERT_FALSE(ZA_Out.isNewZA());
+  ASSERT_TRUE(ZA_Out.sharesZA());
+  ASSERT_TRUE(ZA_Out.hasZAState());
+  ASSERT_TRUE(ZA_Out.hasSharedZAInterface());
+  ASSERT_FALSE(ZA_Out.hasPrivateZAInterface());
+
+  SA ZA_InOut = SA(SA::encodeZAState(SA::StateValue::InOut));
+  ASSERT_TRUE(ZA_InOut.isInOutZA());
+  ASSERT_FALSE(ZA_InOut.isInZA());
+  ASSERT_FALSE(ZA_InOut.isOutZA());
+  ASSERT_FALSE(ZA_InOut.isPreservesZA());
+  ASSERT_FALSE(ZA_InOut.isNewZA());
+  ASSERT_TRUE(ZA_InOut.sharesZA());
+  ASSERT_TRUE(ZA_InOut.hasZAState());
+  ASSERT_TRUE(ZA_InOut.hasSharedZAInterface());
+  ASSERT_FALSE(ZA_InOut.hasPrivateZAInterface());
+
+  SA ZA_Preserved = SA(SA::encodeZAState(SA::StateValue::Preserved));
+  ASSERT_TRUE(ZA_Preserved.isPreservesZA());
+  ASSERT_FALSE(ZA_Preserved.isInZA());
+  ASSERT_FALSE(ZA_Preserved.isOutZA());
+  ASSERT_FALSE(ZA_Preserved.isInOutZA());
+  ASSERT_FALSE(ZA_Preserved.isNewZA());
+  ASSERT_TRUE(ZA_Preserved.sharesZA());
+  ASSERT_TRUE(ZA_Preserved.hasZAState());
+  ASSERT_TRUE(ZA_Preserved.hasSharedZAInterface());
+  ASSERT_FALSE(ZA_Preserved.hasPrivateZAInterface());
+
+  SA ZA_New = SA(SA::encodeZAState(SA::StateValue::New));
+  ASSERT_TRUE(ZA_New.isNewZA());
+  ASSERT_FALSE(ZA_New.isInZA());
+  ASSERT_FALSE(ZA_New.isOutZA());
+  ASSERT_FALSE(ZA_New.isInOutZA());
+  ASSERT_FALSE(ZA_New.isPreservesZA());
+  ASSERT_FALSE(ZA_New.sharesZA());
+  ASSERT_TRUE(ZA_New.hasZAState());
+  ASSERT_FALSE(ZA_New.hasSharedZAInterface());
+  ASSERT_TRUE(ZA_New.hasPrivateZAInterface());
+
+  ASSERT_FALSE(SA(SA::Normal).isInZA());
+  ASSERT_FALSE(SA(SA::Normal).isOutZA());
+  ASSERT_FALSE(SA(SA::Normal).isInOutZA());
+  ASSERT_FALSE(SA(SA::Normal).isPreservesZA());
+  ASSERT_FALSE(SA(SA::Normal).isNewZA());
+  ASSERT_FALSE(SA(SA::Normal).sharesZA());
   ASSERT_FALSE(SA(SA::Normal).hasZAState());
-  ASSERT_FALSE(SA(SA::Normal).preservesZA());
 }
 
 TEST(SMEAttributes, Transitions) {
@@ -102,86 +149,51 @@ TEST(SMEAttributes, Transitions) {
   ASSERT_FALSE(SA(SA::Normal).requiresSMChange(SA(SA::Normal)));
   // Normal -> Normal + LocallyStreaming
   ASSERT_FALSE(SA(SA::Normal).requiresSMChange(SA(SA::Normal | SA::SM_Body)));
-  ASSERT_EQ(*SA(SA::Normal)
-                 .requiresSMChange(SA(SA::Normal | SA::SM_Body),
-                                   /*BodyOverridesInterface=*/true),
-            true);
 
   // Normal -> Streaming
-  ASSERT_EQ(*SA(SA::Normal).requiresSMChange(SA(SA::SM_Enabled)), true);
+  ASSERT_TRUE(SA(SA::Normal).requiresSMChange(SA(SA::SM_Enabled)));
   // Normal -> Streaming + LocallyStreaming
-  ASSERT_EQ(*SA(SA::Normal).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body)),
-            true);
-  ASSERT_EQ(*SA(SA::Normal)
-                 .requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body),
-                                   /*BodyOverridesInterface=*/true),
-            true);
+  ASSERT_TRUE(
+      SA(SA::Normal).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body)));
   // Normal -> Streaming-compatible
   ASSERT_FALSE(SA(SA::Normal).requiresSMChange(SA(SA::SM_Compatible)));
   // Normal -> Streaming-compatible + LocallyStreaming
   ASSERT_FALSE(
       SA(SA::Normal).requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body)));
-  ASSERT_EQ(*SA(SA::Normal)
-                 .requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body),
-                                   /*BodyOverridesInterface=*/true),
-            true);
 
   // Streaming -> Normal
-  ASSERT_EQ(*SA(SA::SM_Enabled).requiresSMChange(SA(SA::Normal)), false);
+  ASSERT_TRUE(SA(SA::SM_Enabled).requiresSMChange(SA(SA::Normal)));
   // Streaming -> Normal + LocallyStreaming
-  ASSERT_EQ(*SA(SA::SM_Enabled).requiresSMChange(SA(SA::Normal | SA::SM_Body)),
-            false);
-  ASSERT_FALSE(SA(SA::SM_Enabled)
-                   .requiresSMChange(SA(SA::Normal | SA::SM_Body),
-                                     /*BodyOverridesInterface=*/true));
+  ASSERT_TRUE(
      SA(SA::SM_Enabled).requiresSMChange(SA(SA::Normal | SA::SM_Body)));
 
   // Streaming -> Streaming
   ASSERT_FALSE(SA(SA::SM_Enabled).requiresSMChange(SA(SA::SM_Enabled)));
   // Streaming -> Streaming + LocallyStreaming
   ASSERT_FALSE(
       SA(SA::SM_Enabled).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body)));
-  ASSERT_FALSE(SA(SA::SM_Enabled)
-                   .requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body),
-                                     /*BodyOverridesInterface=*/true));
 
   // Streaming -> Streaming-compatible
   ASSERT_FALSE(SA(SA::SM_Enabled).requiresSMChange(SA(SA::SM_Compatible)));
   // Streaming -> Streaming-compatible + LocallyStreaming
   ASSERT_FALSE(
       SA(SA::SM_Enabled).requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body)));
-  ASSERT_FALSE(SA(SA::SM_Enabled)
-                   .requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body),
-                                     /*BodyOverridesInterface=*/true));
 
   // Streaming-compatible -> Normal
-  ASSERT_EQ(*SA(SA::SM_Compatible).requiresSMChange(SA(SA::Normal)), false);
-  ASSERT_EQ(
-      *SA(SA::SM_Compatible).requiresSMChange(SA(SA::Normal | SA::SM_Body)),
-      false);
-  ASSERT_EQ(*SA(SA::SM_Compatible)
-                 .requiresSMChange(SA(SA::Normal | SA::SM_Body),
-                                   /*BodyOverridesInterface=*/true),
-            true);
+  ASSERT_TRUE(SA(SA::SM_Compatible).requiresSMChange(SA(SA::Normal)));
+  ASSERT_TRUE(
+      SA(SA::SM_Compatible).requiresSMChange(SA(SA::Normal | SA::SM_Body)));
 
   // Streaming-compatible -> Streaming
-  ASSERT_EQ(*SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Enabled)), true);
+  ASSERT_TRUE(SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Enabled)));
   // Streaming-compatible -> Streaming + LocallyStreaming
-  ASSERT_EQ(
-      *SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body)),
-      true);
-  ASSERT_EQ(*SA(SA::SM_Compatible)
-                 .requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body),
-                                   /*BodyOverridesInterface=*/true),
-            true);
+  ASSERT_TRUE(
+      SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body)));
 
   // Streaming-compatible -> Streaming-compatible
   ASSERT_FALSE(SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Compatible)));
   // Streaming-compatible -> Streaming-compatible + LocallyStreaming
   ASSERT_FALSE(SA(SA::SM_Compatible)
                    .requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body)));
-  ASSERT_EQ(*SA(SA::SM_Compatible)
-                 .requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body),
-                                   /*BodyOverridesInterface=*/true),
-            true);
 }
diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
index bad86c2c984a75a5504eae3326fb84463d0561f0..e465ae7e00f376c0344606080c58ca67044bfad4 100644
--- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
@@ -22,11 +22,11 @@ clang_tablegen("arm_sve") {
   output_name = "arm_sve.h"
 }
 
-# Generate arm_sme_draft_spec_subject_to_change.h
-clang_tablegen("arm_sme_draft_spec_subject_to_change") {
+# Generate arm_sme.h
+clang_tablegen("arm_sme") {
   args = [ "-gen-arm-sme-header" ]
   td_file = "//clang/include/clang/Basic/arm_sme.td"
-  output_name = "arm_sme_draft_spec_subject_to_change.h"
+  output_name = "arm_sme.h"
 }
 
 # Generate arm_bf16.h
@@ -65,7 +65,7 @@ copy("tablegen_headers") {
     ":arm_fp16",
     ":arm_mve",
    ":arm_neon",
-    ":arm_sme_draft_spec_subject_to_change",
+    ":arm_sme",
    ":arm_sve",
    ":riscv_vector",
  ]
diff --git a/mlir/test/Target/LLVMIR/arm-sme.mlir b/mlir/test/Target/LLVMIR/arm-sme.mlir
index 7beec1f61aa92363cea3709048aeaa97e60ea07d..ebda250bb30273b4e8d4ceb1c98ced61410525f7 100644
--- a/mlir/test/Target/LLVMIR/arm-sme.mlir
+++ b/mlir/test/Target/LLVMIR/arm-sme.mlir
@@ -222,7 +222,7 @@ llvm.func @arm_sme_store(%nxv1i1 : vector<[1]xi1>,
   "arm_sme.intr.st1b.vert"(%nxv16i1, %p8, %c0, %c0) : (vector<[16]xi1>, !llvm.ptr, i32, i32) -> ()
 
   // CHECK: call void @llvm.aarch64.sme.str
-  "arm_sme.intr.str"(%c0, %p8) : (i32, !llvm.ptr) -> ()
+  "arm_sme.intr.str"(%c0, %ptr, %c0) : (i32, !llvm.ptr, i32) -> ()
 
   llvm.return
 }
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index 037719a51dd1e8be69fb3c4d68eae4eb2c29b2ef..56a4a72fece629e861cb86c76a4cd8f075bb38ae 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -1625,14 +1625,14 @@ gentbl(
 )
 
 gentbl(
-    name = "headers_arm_sme_draft_spec_subject_to_change_gen",
+    name = "headers_arm_sme_gen",
     copts = [
         "-Wno-implicit-fallthrough",
         "-Wno-error=frame-larger-than=",
     ],
     tbl_outs = [(
         "-gen-arm-sme-header",
-        "lib/Headers/arm_sme_draft_spec_subject_to_change.h",
+        "lib/Headers/arm_sme.h",
     )],
     tblgen = ":clang-tblgen",
     td_file = "include/clang/Basic/arm_sme.td",
@@ -1673,7 +1673,7 @@ builtin_headers = glob(
         "lib/Headers/arm_mve.h",
         "lib/Headers/arm_neon.h",
         "lib/Headers/arm_sve.h",
-        "lib/Headers/arm_sme_draft_spec_subject_to_change.h",
+        "lib/Headers/arm_sme.h",
         "lib/Headers/arm_bf16.h",
         "lib/Headers/module.modulemap",
         "lib/Headers/riscv_vector.h",