From 999e35dab3bf7a8b4ff2679d4bc443dcd77d348c Mon Sep 17 00:00:00 2001
From: chen zheng
Date: Tue, 1 Jul 2025 12:40:41 +0800
Subject: [PATCH] [SME][matrix_type] lower matrix_type with ARM SME/SVE
 instructions

---
 clang/include/clang/AST/Type.h                |  15 +
 clang/lib/CodeGen/CGBuiltin.cpp               |  16 +-
 clang/lib/CodeGen/CGExprScalar.cpp            |  43 +-
 clang/lib/CodeGen/Targets/AArch64.cpp         |   8 +
 clang/lib/Driver/ToolChain.cpp                |   4 +
 .../aarch64-sme-matmul.c                      | 572 ++++++++++++++++++
 .../aarch64-sme-transpose.c                   | 337 +++++++++++
 compiler-rt/lib/builtins/CMakeLists.txt       |  47 ++
 .../lib/builtins/aarch64/matrix/add_float2.c  |  18 +
 .../lib/builtins/aarch64/matrix/add_float4.c  |  18 +
 .../lib/builtins/aarch64/matrix/add_float8.c  |  18 +
 .../lib/builtins/aarch64/matrix/add_int1.c    |  18 +
 .../lib/builtins/aarch64/matrix/add_int2.c    |  18 +
 .../lib/builtins/aarch64/matrix/add_int4.c    |  18 +
 .../lib/builtins/aarch64/matrix/add_int8.c    |  18 +
 .../lib/builtins/aarch64/matrix/add_uint1.c   |  18 +
 .../lib/builtins/aarch64/matrix/add_uint2.c   |  18 +
 .../lib/builtins/aarch64/matrix/add_uint4.c   |  18 +
 .../lib/builtins/aarch64/matrix/add_uint8.c   |  18 +
 .../builtins/aarch64/matrix/matmul_float2.c   |  19 +
 .../builtins/aarch64/matrix/matmul_float4.c   |  17 +
 .../builtins/aarch64/matrix/matmul_float8.c   |  18 +
 .../lib/builtins/aarch64/matrix/matmul_int1.c |  19 +
 .../lib/builtins/aarch64/matrix/matmul_int2.c |  19 +
 .../lib/builtins/aarch64/matrix/matmul_int4.c |  19 +
 .../lib/builtins/aarch64/matrix/matmul_int8.c |  19 +
 .../builtins/aarch64/matrix/matmul_uint1.c    |  19 +
 .../builtins/aarch64/matrix/matmul_uint2.c    |  20 +
 .../builtins/aarch64/matrix/matmul_uint4.c    |  19 +
 .../builtins/aarch64/matrix/matmul_uint8.c    |  19 +
 .../lib/builtins/aarch64/matrix/sme_acle.h    | 136 +++++
 .../lib/builtins/aarch64/matrix/sub_float2.c  |  18 +
 .../lib/builtins/aarch64/matrix/sub_float4.c  |  18 +
 .../lib/builtins/aarch64/matrix/sub_float8.c  |  18 +
 .../lib/builtins/aarch64/matrix/sub_int1.c    |  18 +
 .../lib/builtins/aarch64/matrix/sub_int2.c    |  18 +
 .../lib/builtins/aarch64/matrix/sub_int4.c    |  18 +
 .../lib/builtins/aarch64/matrix/sub_int8.c    |  18 +
 .../lib/builtins/aarch64/matrix/sub_uint1.c   |  18 +
 .../lib/builtins/aarch64/matrix/sub_uint2.c   |  18 +
 .../lib/builtins/aarch64/matrix/sub_uint4.c   |  18 +
 .../lib/builtins/aarch64/matrix/sub_uint8.c   |  18 +
 .../aarch64/matrix/transpose_float2.c         |  16 +
 .../aarch64/matrix/transpose_float4.c         |  16 +
 .../aarch64/matrix/transpose_float8.c         |  16 +
 .../builtins/aarch64/matrix/transpose_int1.c  |  17 +
 .../builtins/aarch64/matrix/transpose_int2.c  |  16 +
 .../builtins/aarch64/matrix/transpose_int4.c  |  16 +
 .../builtins/aarch64/matrix/transpose_int8.c  |  17 +
 .../builtins/Unit/aarch64/sme-matrix-add.c    | 172 ++++++
 .../builtins/Unit/aarch64/sme-matrix-matmul.c | 189 ++++++
 .../builtins/Unit/aarch64/sme-matrix-sub.c    | 173 ++++++
 .../Unit/aarch64/sme-matrix-transpose.c       | 113 ++++
 compiler-rt/test/lit.common.cfg.py            |   6 +
 llvm/include/llvm/IR/MatrixBuilder.h          | 155 +++++
 llvm/lib/CodeGen/MachineScheduler.cpp         |   9 +
 llvm/lib/CodeGen/ScheduleDAGInstrs.cpp        |  14 +-
 llvm/lib/Passes/PassBuilderPipelines.cpp      |   4 +-
 58 files changed, 2713 insertions(+), 17 deletions(-)
 create mode 100644 clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-matmul.c
 create mode 100644 clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-transpose.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_float2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_float4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_float8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_int1.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_int2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_int4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_int8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_uint1.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_uint2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_uint4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_uint8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_float2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_float4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_float8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_int1.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_int2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_int4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_int8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_uint1.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_uint2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_uint4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_uint8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sme_acle.h
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_float2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_float4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_float8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_int1.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_int2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_int4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_int8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_uint1.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_uint2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_uint4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_uint8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/transpose_float2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/transpose_float4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/transpose_float8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/transpose_int1.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/transpose_int2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/transpose_int4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/transpose_int8.c
 create mode 100644 compiler-rt/test/builtins/Unit/aarch64/sme-matrix-add.c
 create mode 100644 compiler-rt/test/builtins/Unit/aarch64/sme-matrix-matmul.c
 create mode 100644 compiler-rt/test/builtins/Unit/aarch64/sme-matrix-sub.c
 create mode 100644 compiler-rt/test/builtins/Unit/aarch64/sme-matrix-transpose.c

diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 9a711030cff9..435e45ee0eb1 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -4176,6 +4176,21 @@ public:
            (T->isRealType() && !T->isBooleanType() && !T->isEnumeralType());
   }
 
+  static bool isValidTypeForSME(QualType T) {
+    if (!isValidElementType(T))
+      return false;
+
+    if (!isa<BuiltinType>(T))
+      return false;
+
+    // AArch64 cannot do vector operations like fma/add/sub on __bf16.
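+    // Fall back to the generic (non-SME) matrix lowering for __bf16.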
+    if (T->isBFloat16Type())
+      return false;
+
+    return cast<BuiltinType>(T)->isFloatingPoint() ||
+           cast<BuiltinType>(T)->isInteger();
+  }
+
   bool isSugared() const { return false; }
   QualType desugar() const { return QualType(this, 0); }
 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 86d47054615e..4cb0659288a2 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3976,12 +3976,18 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
         *this, E, llvm::Intrinsic::vector_reduce_and, "rdx.and"));
 
   case Builtin::BI__builtin_matrix_transpose: {
-    auto *MatrixTy = E->getArg(0)->getType()->castAs<ConstantMatrixType>();
-    Value *MatValue = EmitScalarExpr(E->getArg(0));
+    auto *MatrixValue = E->getArg(0);
+    auto *MatrixTy = MatrixValue->getType()->castAs<ConstantMatrixType>();
+    Value *MatValue = EmitScalarExpr(MatrixValue);
     MatrixBuilder MB(Builder);
-    Value *Result = MB.CreateMatrixTranspose(MatValue, MatrixTy->getNumRows(),
-                                             MatrixTy->getNumColumns());
-    return RValue::get(Result);
+
+    if (!getContext().getTargetInfo().hasFeature("sme") ||
+        !MatrixType::isValidTypeForSME(MatrixTy->getElementType()))
+      return RValue::get(MB.CreateMatrixTranspose(
+          MatValue, MatrixTy->getNumRows(), MatrixTy->getNumColumns()));
+
+    return RValue::get(MB.CreateSMEMatrixTranspose(
+        MatValue, MatrixTy->getNumRows(), MatrixTy->getNumColumns()));
   }
 
   case Builtin::BI__builtin_matrix_column_major_load: {

diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 6e212e74676e..38ee8670dcc1 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -764,10 +764,23 @@ public:
     auto *RHSMatTy = dyn_cast<ConstantMatrixType>(
         BO->getRHS()->getType().getCanonicalType());
     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, Ops.FPFeatures);
-    if (LHSMatTy && RHSMatTy)
-      return MB.CreateMatrixMultiply(Ops.LHS, Ops.RHS, LHSMatTy->getNumRows(),
-                                     LHSMatTy->getNumColumns(),
-                                     RHSMatTy->getNumColumns());
+    if (LHSMatTy && RHSMatTy) {
+      // Note that SME only has non-widening MOPA for float32 and float64,
+      // so only these two types have native SME matmul operations. For the
+      // other types an SVE version is used, which should still beat the
+      // default NEON or scalar lowering.
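+      // The SME path below is emitted as a libcall into compiler-rt (e.g.
+      // __sme_matmul_float4), which is why the driver forces compiler-rt
+      // as the runtime library for -fenable-matrix on AArch64.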
+      auto Ty = LHSMatTy->getElementType();
+      if (!CGF.getContext().getTargetInfo().hasFeature("sme") ||
+          !MatrixType::isValidTypeForSME(Ty))
+        return MB.CreateMatrixMultiply(
+            Ops.LHS, Ops.RHS, LHSMatTy->getNumRows(),
+            LHSMatTy->getNumColumns(), RHSMatTy->getNumColumns());
+      assert(isa<BuiltinType>(Ty) && "SME types should be BuiltinType.");
+      return MB.CreateSMEMatrixMultiply(
+          Ops.LHS, Ops.RHS, LHSMatTy->getNumRows(), LHSMatTy->getNumColumns(),
+          RHSMatTy->getNumColumns(),
+          cast<BuiltinType>(Ty)->isSignedInteger());
+    }
 
     return MB.CreateScalarMultiply(Ops.LHS, Ops.RHS);
   }
 
@@ -4170,7 +4183,16 @@ Value *ScalarExprEmitter::EmitAdd(const BinOpInfo &op) {
   if (op.Ty->isConstantMatrixType()) {
     llvm::MatrixBuilder MB(Builder);
     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, op.FPFeatures);
-    return MB.CreateAdd(op.LHS, op.RHS);
+
+    auto *MatTy = cast<ConstantMatrixType>(op.E->getType().getCanonicalType());
+    auto Ty = MatTy->getElementType();
+    if (!CGF.getContext().getTargetInfo().hasFeature("sme") ||
+        !MatrixType::isValidTypeForSME(Ty))
+      return MB.CreateAdd(op.LHS, op.RHS);
+    assert(isa<BuiltinType>(Ty) && "SME types should be BuiltinType.");
+    return MB.CreateSMEMatrixBinOp(
+        op.LHS, op.RHS, MatTy->getNumRows(), MatTy->getNumColumns(),
+        cast<BuiltinType>(Ty)->isSignedInteger(), "add");
   }
 
   if (op.Ty->isUnsignedIntegerType() &&
@@ -4326,7 +4348,16 @@ Value *ScalarExprEmitter::EmitSub(const BinOpInfo &op) {
   if (op.Ty->isConstantMatrixType()) {
     llvm::MatrixBuilder MB(Builder);
     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, op.FPFeatures);
-    return MB.CreateSub(op.LHS, op.RHS);
+    auto *MatTy =
+        cast<ConstantMatrixType>(op.E->getType().getCanonicalType());
+    auto Ty = MatTy->getElementType();
+    if (!CGF.getContext().getTargetInfo().hasFeature("sme") ||
+        !MatrixType::isValidTypeForSME(Ty))
+      return MB.CreateSub(op.LHS, op.RHS);
+    assert(isa<BuiltinType>(Ty) && "SME types should be BuiltinType.");
+    return MB.CreateSMEMatrixBinOp(
+        op.LHS, op.RHS, MatTy->getNumRows(), MatTy->getNumColumns(),
+        cast<BuiltinType>(Ty)->isSignedInteger(), "sub");
   }
 
   if (op.Ty->isUnsignedIntegerType() &&

diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 97381f673c28..029967da5681 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -294,6 +294,10 @@ AArch64ABIInfo::classifyArgumentType(QualType Ty, bool IsVariadic,
   if (isIllegalVectorType(Ty))
     return coerceIllegalVector(Ty);
 
+  // Always pass the matrix type via memory.
+  if (Ty->isMatrixType())
+    return getNaturalAlignIndirect(Ty, false);
+
   if (!isAggregateTypeForABI(Ty)) {
     // Treat an enum type as its underlying type.
     if (const EnumType *EnumTy = Ty->getAs<EnumType>())
@@ -393,6 +397,10 @@ ABIArgInfo AArch64ABIInfo::classifyReturnType(QualType RetTy,
     return coerceIllegalVector(RetTy);
   }
 
+  // Always return the matrix type via memory.
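+  // As on the argument side, keeping matrices in memory lets the
+  // pointer-based compiler-rt SME helpers operate on them directly.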
+  if (RetTy->isMatrixType())
+    return getNaturalAlignIndirect(RetTy);
+
   // Large vector types should be returned via memory.
   if (RetTy->isVectorType() && getContext().getTypeSize(RetTy) > 128)
     return getNaturalAlignIndirect(RetTy);

diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 20a555afb809..7941b5ba9250 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1112,6 +1112,10 @@ ToolChain::RuntimeLibType ToolChain::GetRuntimeLibType(
     runtimeLibType = GetDefaultRuntimeLibType();
   }
 
+  const llvm::Triple::ArchType Arch = getArch();
+  if (Arch == llvm::Triple::aarch64 && Args.hasArg(options::OPT_fenable_matrix))
+    runtimeLibType = ToolChain::RLT_CompilerRT;
+
   return *runtimeLibType;
 }

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-matmul.c b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-matmul.c
new file mode 100644
index 000000000000..fd4e4f068dee
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-matmul.c
@@ -0,0 +1,572 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -O1 -Werror -emit-llvm -fenable-matrix -o - %s | FileCheck %s -check-prefix=SME
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -sme -O1 -Werror -emit-llvm -fenable-matrix -o - -x c++ %s | FileCheck %s -check-prefix=NOSME
+
+#define M 1
+#define K 2
+#define N 3
+
+typedef __bf16 mbf16_t1 __attribute__((matrix_type(M, K)));
+typedef __fp16 mfp16_t1 __attribute__((matrix_type(M, K)));
+typedef float mfloat_t1 __attribute__((matrix_type(M, K)));
+typedef double mdouble_t1 __attribute__((matrix_type(M, K)));
+typedef signed char mschar_t1 __attribute__((matrix_type(M, K)));
+typedef unsigned char muchar_t1 __attribute__((matrix_type(M, K)));
+typedef signed short msshort_t1 __attribute__((matrix_type(M, K)));
+typedef unsigned short mushort_t1 __attribute__((matrix_type(M, K)));
+typedef signed int msint_t1 __attribute__((matrix_type(M, K)));
+typedef unsigned int muint_t1 __attribute__((matrix_type(M, K)));
+typedef signed long long msllong_t1 __attribute__((matrix_type(M, K)));
+typedef unsigned long long mullong_t1 __attribute__((matrix_type(M, K)));
+
+typedef __bf16 mbf16_t2 __attribute__((matrix_type(K, N)));
+typedef __fp16 mfp16_t2 __attribute__((matrix_type(K, N)));
+typedef float mfloat_t2 __attribute__((matrix_type(K, N)));
+typedef double mdouble_t2 __attribute__((matrix_type(K, N)));
+typedef signed char mschar_t2 __attribute__((matrix_type(K, N)));
+typedef unsigned char muchar_t2 __attribute__((matrix_type(K, N)));
+typedef signed short msshort_t2 __attribute__((matrix_type(K, N)));
+typedef unsigned short mushort_t2 __attribute__((matrix_type(K, N)));
+typedef signed int msint_t2 __attribute__((matrix_type(K, N)));
+typedef unsigned int muint_t2 __attribute__((matrix_type(K, N)));
+typedef signed long long msllong_t2 __attribute__((matrix_type(K, N)));
+typedef unsigned long long mullong_t2 __attribute__((matrix_type(K, N)));
+
+typedef __bf16 mbf16_t __attribute__((matrix_type(M, N)));
+typedef __fp16 mfp16_t __attribute__((matrix_type(M, N)));
+typedef float mfloat_t __attribute__((matrix_type(M, N)));
+typedef double mdouble_t __attribute__((matrix_type(M, N)));
+typedef signed char mschar_t __attribute__((matrix_type(M, N)));
+typedef unsigned char muchar_t __attribute__((matrix_type(M, N)));
+typedef signed short msshort_t __attribute__((matrix_type(M, N)));
+typedef unsigned short mushort_t __attribute__((matrix_type(M, N)));
+typedef
signed int msint_t __attribute__((matrix_type(M, N))); +typedef unsigned int muint_t __attribute__((matrix_type(M, N))); +typedef signed long long msllong_t __attribute__((matrix_type(M, N))); +typedef unsigned long long mullong_t __attribute__((matrix_type(M, N))); + +// SME-LABEL: define dso_local void @f_bf16( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x bfloat]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[A_UNPACK:%.*]] = load bfloat, ptr [[TMP0]], align 2, !tbaa [[TBAA2:![0-9]+]] +// SME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// SME-NEXT: [[A_UNPACK4:%.*]] = load bfloat, ptr [[A_ELT3]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: [[B_UNPACK:%.*]] = load bfloat, ptr [[TMP1]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 2 +// SME-NEXT: [[B_UNPACK7:%.*]] = load bfloat, ptr [[B_ELT6]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// SME-NEXT: [[B_UNPACK9:%.*]] = load bfloat, ptr [[B_ELT8]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 6 +// SME-NEXT: [[B_UNPACK11:%.*]] = load bfloat, ptr [[B_ELT10]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// SME-NEXT: [[B_UNPACK13:%.*]] = load bfloat, ptr [[B_ELT12]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 10 +// SME-NEXT: [[B_UNPACK15:%.*]] = load bfloat, ptr [[B_ELT14]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x bfloat> poison, bfloat [[A_UNPACK]], i64 0 +// SME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x bfloat> [[A_ADDR_SROA_0_0_VEC_INSERT]], bfloat [[A_UNPACK4]], i64 1 +// SME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x bfloat> poison, bfloat [[B_UNPACK]], i64 0 +// SME-NEXT: [[B_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_0_VEC_INSERT]], bfloat [[B_UNPACK7]], i64 1 +// SME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_2_VEC_INSERT]], bfloat [[B_UNPACK9]], i64 2 +// SME-NEXT: [[B_ADDR_SROA_0_6_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_4_VEC_INSERT]], bfloat [[B_UNPACK11]], i64 3 +// SME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_6_VEC_INSERT]], bfloat [[B_UNPACK13]], i64 4 +// SME-NEXT: [[B_ADDR_SROA_0_10_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_8_VEC_INSERT]], bfloat [[B_UNPACK15]], i64 5 +// SME-NEXT: [[TMP2:%.*]] = tail call <3 x bfloat> @llvm.matrix.multiply.v3bf16.v2bf16.v6bf16(<2 x bfloat> [[A_ADDR_SROA_0_2_VEC_INSERT]], <6 x bfloat> [[B_ADDR_SROA_0_10_VEC_INSERT]], i32 1, i32 2, i32 3) +// SME-NEXT: store <3 x bfloat> [[TMP2]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z6f_bf16u11matrix_typeILm1ELm2Eu6__bf16Eu11matrix_typeILm2ELm3Eu6__bf16E( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x bfloat]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// 
NOSME-NEXT: [[A_UNPACK:%.*]] = load bfloat, ptr [[TMP0]], align 2, !tbaa [[TBAA2:![0-9]+]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load bfloat, ptr [[A_ELT3]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load bfloat, ptr [[TMP1]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 2 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load bfloat, ptr [[B_ELT6]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load bfloat, ptr [[B_ELT8]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 6 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load bfloat, ptr [[B_ELT10]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load bfloat, ptr [[B_ELT12]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 10 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load bfloat, ptr [[B_ELT14]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x bfloat> poison, bfloat [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x bfloat> [[A_ADDR_SROA_0_0_VEC_INSERT]], bfloat [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x bfloat> poison, bfloat [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_0_VEC_INSERT]], bfloat [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_2_VEC_INSERT]], bfloat [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_6_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_4_VEC_INSERT]], bfloat [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_6_VEC_INSERT]], bfloat [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_10_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_8_VEC_INSERT]], bfloat [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x bfloat> @llvm.matrix.multiply.v3bf16.v2bf16.v6bf16(<2 x bfloat> [[A_ADDR_SROA_0_2_VEC_INSERT]], <6 x bfloat> [[B_ADDR_SROA_0_10_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x bfloat> [[TMP2]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +mbf16_t f_bf16(mbf16_t1 a, mbf16_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_fp16( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x half]) align 2 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x half>, align 8 +// SME-NEXT: call void @__sme_matmul_float2(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4:[0-9]+]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x half>, ptr [[TMP2]], align 8 +// SME-NEXT: store <3 x half> [[TMP3]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z6f_fp16u11matrix_typeILm1ELm2EDhEu11matrix_typeILm2ELm3EDhE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x half]) align 2 [[AGG_RESULT:%.*]], ptr 
nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load half, ptr [[TMP0]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load half, ptr [[A_ELT3]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load half, ptr [[TMP1]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 2 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load half, ptr [[B_ELT6]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load half, ptr [[B_ELT8]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 6 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load half, ptr [[B_ELT10]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load half, ptr [[B_ELT12]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 10 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load half, ptr [[B_ELT14]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x half> poison, half [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x half> [[A_ADDR_SROA_0_0_VEC_INSERT]], half [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x half> poison, half [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <6 x half> [[B_ADDR_SROA_0_0_VEC_INSERT]], half [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x half> [[B_ADDR_SROA_0_2_VEC_INSERT]], half [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_6_VEC_INSERT:%.*]] = insertelement <6 x half> [[B_ADDR_SROA_0_4_VEC_INSERT]], half [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x half> [[B_ADDR_SROA_0_6_VEC_INSERT]], half [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_10_VEC_INSERT:%.*]] = insertelement <6 x half> [[B_ADDR_SROA_0_8_VEC_INSERT]], half [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x half> @llvm.matrix.multiply.v3f16.v2f16.v6f16(<2 x half> [[A_ADDR_SROA_0_2_VEC_INSERT]], <6 x half> [[B_ADDR_SROA_0_10_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x half> [[TMP2]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +mfp16_t f_fp16(mfp16_t1 a, mfp16_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_float( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x float]) align 4 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x float>, align 16 +// SME-NEXT: call void @__sme_matmul_float4(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x float>, ptr [[TMP2]], align 16 +// SME-NEXT: store <3 x float> [[TMP3]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z7f_floatu11matrix_typeILm1ELm2EfEu11matrix_typeILm2ELm3EfE( +// NOSME-SAME: ptr dead_on_unwind noalias 
nocapture writable writeonly sret([3 x float]) align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load float, ptr [[TMP0]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load float, ptr [[A_ELT3]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load float, ptr [[B_ELT6]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load float, ptr [[B_ELT8]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load float, ptr [[B_ELT10]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load float, ptr [[B_ELT12]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load float, ptr [[B_ELT14]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x float> poison, float [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[A_ADDR_SROA_0_0_VEC_INSERT]], float [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x float> poison, float [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x float> [[B_ADDR_SROA_0_0_VEC_INSERT]], float [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x float> [[B_ADDR_SROA_0_4_VEC_INSERT]], float [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_12_VEC_INSERT:%.*]] = insertelement <6 x float> [[B_ADDR_SROA_0_8_VEC_INSERT]], float [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_16_VEC_INSERT:%.*]] = insertelement <6 x float> [[B_ADDR_SROA_0_12_VEC_INSERT]], float [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_20_VEC_INSERT:%.*]] = insertelement <6 x float> [[B_ADDR_SROA_0_16_VEC_INSERT]], float [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x float> @llvm.matrix.multiply.v3f32.v2f32.v6f32(<2 x float> [[A_ADDR_SROA_0_4_VEC_INSERT]], <6 x float> [[B_ADDR_SROA_0_20_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x float> [[TMP2]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +mfloat_t f_float(mfloat_t1 a, mfloat_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_double( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x double]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x double>, align 32 +// SME-NEXT: call void @__sme_matmul_float8(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x double>, ptr [[TMP2]], align 32 +// SME-NEXT: store <3 x double> [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define 
dso_local void @_Z8f_doubleu11matrix_typeILm1ELm2EdEu11matrix_typeILm2ELm3EdE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x double]) align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load double, ptr [[TMP0]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load double, ptr [[A_ELT3]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load double, ptr [[TMP1]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load double, ptr [[B_ELT6]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load double, ptr [[B_ELT8]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 24 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load double, ptr [[B_ELT10]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load double, ptr [[B_ELT12]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load double, ptr [[B_ELT14]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x double> poison, double [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x double> [[A_ADDR_SROA_0_0_VEC_INSERT]], double [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x double> poison, double [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x double> [[B_ADDR_SROA_0_0_VEC_INSERT]], double [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_16_VEC_INSERT:%.*]] = insertelement <6 x double> [[B_ADDR_SROA_0_8_VEC_INSERT]], double [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_24_VEC_INSERT:%.*]] = insertelement <6 x double> [[B_ADDR_SROA_0_16_VEC_INSERT]], double [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_32_VEC_INSERT:%.*]] = insertelement <6 x double> [[B_ADDR_SROA_0_24_VEC_INSERT]], double [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_40_VEC_INSERT:%.*]] = insertelement <6 x double> [[B_ADDR_SROA_0_32_VEC_INSERT]], double [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x double> @llvm.matrix.multiply.v3f64.v2f64.v6f64(<2 x double> [[A_ADDR_SROA_0_8_VEC_INSERT]], <6 x double> [[B_ADDR_SROA_0_40_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x double> [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +mdouble_t f_double(mdouble_t1 a, mdouble_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_signed_char( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i8]) align 1 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x i8>, align 4 +// SME-NEXT: call void @__sme_matmul_int1(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x i8>, ptr [[TMP2]], align 
4 +// SME-NEXT: store <3 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 1, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z13f_signed_charu11matrix_typeILm1ELm2EaEu11matrix_typeILm2ELm3EaE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i8]) align 1 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i8, ptr [[TMP0]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 1 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load i8, ptr [[A_ELT3]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load i8, ptr [[TMP1]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 1 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load i8, ptr [[B_ELT6]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 2 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load i8, ptr [[B_ELT8]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 3 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load i8, ptr [[B_ELT10]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load i8, ptr [[B_ELT12]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 5 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load i8, ptr [[B_ELT14]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i8> poison, i8 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_1_VEC_INSERT:%.*]] = insertelement <2 x i8> [[A_ADDR_SROA_0_0_VEC_INSERT]], i8 [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x i8> poison, i8 [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_1_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_0_VEC_INSERT]], i8 [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_1_VEC_INSERT]], i8 [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_3_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_2_VEC_INSERT]], i8 [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_3_VEC_INSERT]], i8 [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_5_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_4_VEC_INSERT]], i8 [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x i8> @llvm.matrix.multiply.v3i8.v2i8.v6i8(<2 x i8> [[A_ADDR_SROA_0_1_VEC_INSERT]], <6 x i8> [[B_ADDR_SROA_0_5_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +mschar_t f_signed_char(mschar_t1 a, mschar_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_unsigned_char( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i8]) align 1 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x i8>, align 4 +// SME-NEXT: call void @__sme_matmul_uint1(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x i8>, ptr 
[[TMP2]], align 4 +// SME-NEXT: store <3 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 1, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z15f_unsigned_charu11matrix_typeILm1ELm2EhEu11matrix_typeILm2ELm3EhE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i8]) align 1 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i8, ptr [[TMP0]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 1 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load i8, ptr [[A_ELT3]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load i8, ptr [[TMP1]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 1 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load i8, ptr [[B_ELT6]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 2 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load i8, ptr [[B_ELT8]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 3 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load i8, ptr [[B_ELT10]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load i8, ptr [[B_ELT12]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 5 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load i8, ptr [[B_ELT14]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i8> poison, i8 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_1_VEC_INSERT:%.*]] = insertelement <2 x i8> [[A_ADDR_SROA_0_0_VEC_INSERT]], i8 [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x i8> poison, i8 [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_1_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_0_VEC_INSERT]], i8 [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_1_VEC_INSERT]], i8 [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_3_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_2_VEC_INSERT]], i8 [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_3_VEC_INSERT]], i8 [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_5_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_4_VEC_INSERT]], i8 [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x i8> @llvm.matrix.multiply.v3i8.v2i8.v6i8(<2 x i8> [[A_ADDR_SROA_0_1_VEC_INSERT]], <6 x i8> [[B_ADDR_SROA_0_5_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +muchar_t f_unsigned_char(muchar_t1 a, muchar_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_signed_short( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i16]) align 2 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x i16>, align 8 +// SME-NEXT: call void @__sme_matmul_int2(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] 
= load <3 x i16>, ptr [[TMP2]], align 8 +// SME-NEXT: store <3 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z14f_signed_shortu11matrix_typeILm1ELm2EsEu11matrix_typeILm2ELm3EsE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i16]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i16, ptr [[TMP0]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load i16, ptr [[A_ELT3]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load i16, ptr [[TMP1]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 2 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load i16, ptr [[B_ELT6]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load i16, ptr [[B_ELT8]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 6 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load i16, ptr [[B_ELT10]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load i16, ptr [[B_ELT12]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 10 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load i16, ptr [[B_ELT14]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x i16> [[A_ADDR_SROA_0_0_VEC_INSERT]], i16 [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x i16> poison, i16 [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_0_VEC_INSERT]], i16 [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_2_VEC_INSERT]], i16 [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_6_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_4_VEC_INSERT]], i16 [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_6_VEC_INSERT]], i16 [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_10_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_8_VEC_INSERT]], i16 [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x i16> @llvm.matrix.multiply.v3i16.v2i16.v6i16(<2 x i16> [[A_ADDR_SROA_0_2_VEC_INSERT]], <6 x i16> [[B_ADDR_SROA_0_10_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +msshort_t f_signed_short(msshort_t1 a, msshort_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_unsigned_short( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i16]) align 2 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x i16>, align 8 +// SME-NEXT: call void @__sme_matmul_uint2(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr 
[[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x i16>, ptr [[TMP2]], align 8 +// SME-NEXT: store <3 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z16f_unsigned_shortu11matrix_typeILm1ELm2EtEu11matrix_typeILm2ELm3EtE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i16]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i16, ptr [[TMP0]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load i16, ptr [[A_ELT3]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load i16, ptr [[TMP1]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 2 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load i16, ptr [[B_ELT6]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load i16, ptr [[B_ELT8]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 6 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load i16, ptr [[B_ELT10]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load i16, ptr [[B_ELT12]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 10 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load i16, ptr [[B_ELT14]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x i16> [[A_ADDR_SROA_0_0_VEC_INSERT]], i16 [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x i16> poison, i16 [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_0_VEC_INSERT]], i16 [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_2_VEC_INSERT]], i16 [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_6_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_4_VEC_INSERT]], i16 [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_6_VEC_INSERT]], i16 [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_10_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_8_VEC_INSERT]], i16 [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x i16> @llvm.matrix.multiply.v3i16.v2i16.v6i16(<2 x i16> [[A_ADDR_SROA_0_2_VEC_INSERT]], <6 x i16> [[B_ADDR_SROA_0_10_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +mushort_t f_unsigned_short(mushort_t1 a, mushort_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_signed_int( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i32]) align 4 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x i32>, align 16 +// SME-NEXT: call 
void @__sme_matmul_int4(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x i32>, ptr [[TMP2]], align 16 +// SME-NEXT: store <3 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z12f_signed_intu11matrix_typeILm1ELm2EiEu11matrix_typeILm2ELm3EiE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i32]) align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load i32, ptr [[A_ELT3]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load i32, ptr [[B_ELT6]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load i32, ptr [[B_ELT8]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load i32, ptr [[B_ELT10]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load i32, ptr [[B_ELT12]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load i32, ptr [[B_ELT14]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[A_ADDR_SROA_0_0_VEC_INSERT]], i32 [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x i32> poison, i32 [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_0_VEC_INSERT]], i32 [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_4_VEC_INSERT]], i32 [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_12_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_8_VEC_INSERT]], i32 [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_16_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_12_VEC_INSERT]], i32 [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_20_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_16_VEC_INSERT]], i32 [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x i32> @llvm.matrix.multiply.v3i32.v2i32.v6i32(<2 x i32> [[A_ADDR_SROA_0_4_VEC_INSERT]], <6 x i32> [[B_ADDR_SROA_0_20_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +msint_t f_signed_int(msint_t1 a, msint_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_unsigned_int( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i32]) align 4 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: 
[[TMP2:%.*]] = alloca <3 x i32>, align 16 +// SME-NEXT: call void @__sme_matmul_uint4(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x i32>, ptr [[TMP2]], align 16 +// SME-NEXT: store <3 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z14f_unsigned_intu11matrix_typeILm1ELm2EjEu11matrix_typeILm2ELm3EjE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i32]) align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load i32, ptr [[A_ELT3]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load i32, ptr [[B_ELT6]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load i32, ptr [[B_ELT8]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load i32, ptr [[B_ELT10]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load i32, ptr [[B_ELT12]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load i32, ptr [[B_ELT14]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[A_ADDR_SROA_0_0_VEC_INSERT]], i32 [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x i32> poison, i32 [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_0_VEC_INSERT]], i32 [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_4_VEC_INSERT]], i32 [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_12_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_8_VEC_INSERT]], i32 [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_16_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_12_VEC_INSERT]], i32 [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_20_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_16_VEC_INSERT]], i32 [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x i32> @llvm.matrix.multiply.v3i32.v2i32.v6i32(<2 x i32> [[A_ADDR_SROA_0_4_VEC_INSERT]], <6 x i32> [[B_ADDR_SROA_0_20_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +muint_t f_unsigned_int(muint_t1 a, muint_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_signed_long_long( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i64]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) 
local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x i64>, align 32 +// SME-NEXT: call void @__sme_matmul_int8(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x i64>, ptr [[TMP2]], align 32 +// SME-NEXT: store <3 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z18f_signed_long_longu11matrix_typeILm1ELm2ExEu11matrix_typeILm2ELm3ExE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i64]) align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i64, ptr [[TMP0]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load i64, ptr [[A_ELT3]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load i64, ptr [[TMP1]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load i64, ptr [[B_ELT6]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load i64, ptr [[B_ELT8]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 24 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load i64, ptr [[B_ELT10]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load i64, ptr [[B_ELT12]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load i64, ptr [[B_ELT14]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[A_ADDR_SROA_0_0_VEC_INSERT]], i64 [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x i64> poison, i64 [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_0_VEC_INSERT]], i64 [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_16_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_8_VEC_INSERT]], i64 [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_24_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_16_VEC_INSERT]], i64 [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_32_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_24_VEC_INSERT]], i64 [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_40_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_32_VEC_INSERT]], i64 [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x i64> @llvm.matrix.multiply.v3i64.v2i64.v6i64(<2 x i64> [[A_ADDR_SROA_0_8_VEC_INSERT]], <6 x i64> [[B_ADDR_SROA_0_40_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +msllong_t f_signed_long_long(msllong_t1 a, msllong_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_unsigned_long_long( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x 
i64]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x i64>, align 32 +// SME-NEXT: call void @__sme_matmul_uint8(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x i64>, ptr [[TMP2]], align 32 +// SME-NEXT: store <3 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z20f_unsigned_long_longu11matrix_typeILm1ELm2EyEu11matrix_typeILm2ELm3EyE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i64]) align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i64, ptr [[TMP0]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load i64, ptr [[A_ELT3]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load i64, ptr [[TMP1]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load i64, ptr [[B_ELT6]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load i64, ptr [[B_ELT8]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 24 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load i64, ptr [[B_ELT10]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load i64, ptr [[B_ELT12]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load i64, ptr [[B_ELT14]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[A_ADDR_SROA_0_0_VEC_INSERT]], i64 [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x i64> poison, i64 [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_0_VEC_INSERT]], i64 [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_16_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_8_VEC_INSERT]], i64 [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_24_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_16_VEC_INSERT]], i64 [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_32_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_24_VEC_INSERT]], i64 [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_40_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_32_VEC_INSERT]], i64 [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x i64> @llvm.matrix.multiply.v3i64.v2i64.v6i64(<2 x i64> [[A_ADDR_SROA_0_8_VEC_INSERT]], <6 x i64> [[B_ADDR_SROA_0_40_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +mullong_t f_unsigned_long_long(mullong_t1 a, mullong_t2 b) { + return a * b; +} diff --git 
a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-transpose.c b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-transpose.c new file mode 100644 index 000000000000..06fc018bef32 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-transpose.c @@ -0,0 +1,337 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -O1 -Werror -emit-llvm -fenable-matrix -o - %s | FileCheck %s -check-prefix=SME +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -sme -O1 -Werror -emit-llvm -fenable-matrix -o - -x c++ %s | FileCheck %s -check-prefix=NOSME + +#define M 1 +#define K 2 + +typedef __bf16 mbf16_t __attribute__((matrix_type(M, K))); +typedef __fp16 mfp16_t __attribute__((matrix_type(M, K))); +typedef float mfloat_t __attribute__((matrix_type(M, K))); +typedef double mdouble_t __attribute__((matrix_type(M, K))); +typedef signed char mschar_t __attribute__((matrix_type(M, K))); +typedef unsigned char muchar_t __attribute__((matrix_type(M, K))); +typedef signed short msshort_t __attribute__((matrix_type(M, K))); +typedef unsigned short mushort_t __attribute__((matrix_type(M, K))); +typedef signed int msint_t __attribute__((matrix_type(M, K))); +typedef unsigned int muint_t __attribute__((matrix_type(M, K))); +typedef signed long long msllong_t __attribute__((matrix_type(M, K))); +typedef unsigned long long mullong_t __attribute__((matrix_type(M, K))); + +typedef __bf16 tran_mbf16_t __attribute__((matrix_type(K, M))); +typedef __fp16 tran_mfp16_t __attribute__((matrix_type(K, M))); +typedef float tran_mfloat_t __attribute__((matrix_type(K, M))); +typedef double tran_mdouble_t __attribute__((matrix_type(K, M))); +typedef signed char tran_mschar_t __attribute__((matrix_type(K, M))); +typedef unsigned char tran_muchar_t __attribute__((matrix_type(K, M))); +typedef signed short tran_msshort_t __attribute__((matrix_type(K, M))); +typedef unsigned short tran_mushort_t __attribute__((matrix_type(K, M))); +typedef signed int tran_msint_t __attribute__((matrix_type(K, M))); +typedef unsigned int tran_muint_t __attribute__((matrix_type(K, M))); +typedef signed long long tran_msllong_t __attribute__((matrix_type(K, M))); +typedef unsigned long long tran_mullong_t __attribute__((matrix_type(K, M))); + +// SME-LABEL: define dso_local void @f_bf16( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x bfloat]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[A_UNPACK:%.*]] = load bfloat, ptr [[TMP0]], align 2, !tbaa [[TBAA2:![0-9]+]] +// SME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// SME-NEXT: [[A_UNPACK3:%.*]] = load bfloat, ptr [[A_ELT2]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x bfloat> poison, bfloat [[A_UNPACK]], i64 0 +// SME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x bfloat> [[A_ADDR_SROA_0_0_VEC_INSERT]], bfloat [[A_UNPACK3]], i64 1 +// SME-NEXT: [[TMP1:%.*]] = tail call <2 x bfloat> @llvm.matrix.transpose.v2bf16(<2 x bfloat> [[A_ADDR_SROA_0_2_VEC_INSERT]], i32 1, i32 2) +// SME-NEXT: store <2 x bfloat> [[TMP1]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z6f_bf16u11matrix_typeILm1ELm2Eu6__bf16E( +// 
NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x bfloat]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load bfloat, ptr [[TMP0]], align 2, !tbaa [[TBAA2:![0-9]+]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load bfloat, ptr [[A_ELT2]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x bfloat> poison, bfloat [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x bfloat> [[A_ADDR_SROA_0_0_VEC_INSERT]], bfloat [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x bfloat> @llvm.matrix.transpose.v2bf16(<2 x bfloat> [[A_ADDR_SROA_0_2_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x bfloat> [[TMP1]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_mbf16_t f_bf16(mbf16_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_fp16( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x half]) align 2 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x half>, align 4 +// SME-NEXT: call void @__sme_transpose_float2(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4:[0-9]+]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[TMP1]], align 4 +// SME-NEXT: store <2 x half> [[TMP2]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z6f_fp16u11matrix_typeILm1ELm2EDhE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x half]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load half, ptr [[TMP0]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load half, ptr [[A_ELT2]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x half> poison, half [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x half> [[A_ADDR_SROA_0_0_VEC_INSERT]], half [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x half> @llvm.matrix.transpose.v2f16(<2 x half> [[A_ADDR_SROA_0_2_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x half> [[TMP1]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_mfp16_t f_fp16(mfp16_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_float( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x float]) align 4 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x float>, align 8 +// SME-NEXT: call void @__sme_transpose_float4(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[TMP1]], align 8 +// SME-NEXT: store <2 x float> [[TMP2]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z7f_floatu11matrix_typeILm1ELm2EfE( +// NOSME-SAME: ptr dead_on_unwind noalias 
nocapture writable writeonly sret([2 x float]) align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load float, ptr [[TMP0]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load float, ptr [[A_ELT2]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x float> poison, float [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[A_ADDR_SROA_0_0_VEC_INSERT]], float [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x float> @llvm.matrix.transpose.v2f32(<2 x float> [[A_ADDR_SROA_0_4_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x float> [[TMP1]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_mfloat_t f_float(mfloat_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_double( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x double]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x double>, align 16 +// SME-NEXT: call void @__sme_transpose_float8(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[TMP1]], align 16 +// SME-NEXT: store <2 x double> [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z8f_doubleu11matrix_typeILm1ELm2EdE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x double]) align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load double, ptr [[TMP0]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load double, ptr [[A_ELT2]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x double> poison, double [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x double> [[A_ADDR_SROA_0_0_VEC_INSERT]], double [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.matrix.transpose.v2f64(<2 x double> [[A_ADDR_SROA_0_8_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x double> [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_mdouble_t f_double(mdouble_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_signed_char( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i8]) align 1 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x i8>, align 2 +// SME-NEXT: call void @__sme_transpose_int1(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[TMP1]], align 2 +// SME-NEXT: store <2 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 1, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z13f_signed_charu11matrix_typeILm1ELm2EaE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x 
i8]) align 1 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i8, ptr [[TMP0]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 1 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load i8, ptr [[A_ELT2]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i8> poison, i8 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_1_VEC_INSERT:%.*]] = insertelement <2 x i8> [[A_ADDR_SROA_0_0_VEC_INSERT]], i8 [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x i8> @llvm.matrix.transpose.v2i8(<2 x i8> [[A_ADDR_SROA_0_1_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_mschar_t f_signed_char(mschar_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_unsigned_char( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i8]) align 1 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x i8>, align 2 +// SME-NEXT: call void @__sme_transpose_int1(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[TMP1]], align 2 +// SME-NEXT: store <2 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 1, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z15f_unsigned_charu11matrix_typeILm1ELm2EhE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i8]) align 1 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i8, ptr [[TMP0]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 1 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load i8, ptr [[A_ELT2]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i8> poison, i8 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_1_VEC_INSERT:%.*]] = insertelement <2 x i8> [[A_ADDR_SROA_0_0_VEC_INSERT]], i8 [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x i8> @llvm.matrix.transpose.v2i8(<2 x i8> [[A_ADDR_SROA_0_1_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_muchar_t f_unsigned_char(muchar_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_signed_short( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i16]) align 2 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x i16>, align 4 +// SME-NEXT: call void @__sme_transpose_int2(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[TMP1]], align 4 +// SME-NEXT: store <2 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z14f_signed_shortu11matrix_typeILm1ELm2EsE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i16]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr 
#[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i16, ptr [[TMP0]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load i16, ptr [[A_ELT2]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x i16> [[A_ADDR_SROA_0_0_VEC_INSERT]], i16 [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x i16> @llvm.matrix.transpose.v2i16(<2 x i16> [[A_ADDR_SROA_0_2_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_msshort_t f_signed_short(msshort_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_unsigned_short( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i16]) align 2 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x i16>, align 4 +// SME-NEXT: call void @__sme_transpose_int2(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[TMP1]], align 4 +// SME-NEXT: store <2 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z16f_unsigned_shortu11matrix_typeILm1ELm2EtE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i16]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i16, ptr [[TMP0]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load i16, ptr [[A_ELT2]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x i16> [[A_ADDR_SROA_0_0_VEC_INSERT]], i16 [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x i16> @llvm.matrix.transpose.v2i16(<2 x i16> [[A_ADDR_SROA_0_2_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_mushort_t f_unsigned_short(mushort_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_signed_int( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i32]) align 4 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +// SME-NEXT: call void @__sme_transpose_int4(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +// SME-NEXT: store <2 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z12f_signed_intu11matrix_typeILm1ELm2EiE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i32]) align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: 
[[A_UNPACK:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load i32, ptr [[A_ELT2]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[A_ADDR_SROA_0_0_VEC_INSERT]], i32 [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.matrix.transpose.v2i32(<2 x i32> [[A_ADDR_SROA_0_4_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_msint_t f_signed_int(msint_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_unsigned_int( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i32]) align 4 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +// SME-NEXT: call void @__sme_transpose_int4(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +// SME-NEXT: store <2 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z14f_unsigned_intu11matrix_typeILm1ELm2EjE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i32]) align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load i32, ptr [[A_ELT2]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[A_ADDR_SROA_0_0_VEC_INSERT]], i32 [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.matrix.transpose.v2i32(<2 x i32> [[A_ADDR_SROA_0_4_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_muint_t f_unsigned_int(muint_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_signed_long_long( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i64]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x i64>, align 16 +// SME-NEXT: call void @__sme_transpose_int8(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[TMP1]], align 16 +// SME-NEXT: store <2 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z18f_signed_long_longu11matrix_typeILm1ELm2ExE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i64]) align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i64, ptr [[TMP0]], align 8, !tbaa [[TBAA2]] +// 
NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load i64, ptr [[A_ELT2]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[A_ADDR_SROA_0_0_VEC_INSERT]], i64 [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.matrix.transpose.v2i64(<2 x i64> [[A_ADDR_SROA_0_8_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_msllong_t f_signed_long_long(msllong_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_unsigned_long_long( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i64]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x i64>, align 16 +// SME-NEXT: call void @__sme_transpose_int8(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[TMP1]], align 16 +// SME-NEXT: store <2 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z20f_unsigned_long_longu11matrix_typeILm1ELm2EyE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i64]) align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i64, ptr [[TMP0]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load i64, ptr [[A_ELT2]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[A_ADDR_SROA_0_0_VEC_INSERT]], i64 [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.matrix.transpose.v2i64(<2 x i64> [[A_ADDR_SROA_0_8_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_mullong_t f_unsigned_long_long(mullong_t a) { + return __builtin_matrix_transpose(a); +} diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index e0b2d08c2077..79a2a81a259f 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -569,6 +569,53 @@ set(aarch64_SOURCES aarch64/fp_mode.c ) +if (COMPILER_RT_HAS_AARCH64_SME) + list(APPEND aarch64_SOURCES + aarch64/matrix/add_float2.c + aarch64/matrix/add_float4.c + aarch64/matrix/add_float8.c + aarch64/matrix/add_int1.c + aarch64/matrix/add_int2.c + aarch64/matrix/add_int4.c + aarch64/matrix/add_int8.c + aarch64/matrix/add_uint1.c + aarch64/matrix/add_uint2.c + aarch64/matrix/add_uint4.c + aarch64/matrix/add_uint8.c + aarch64/matrix/matmul_float2.c + aarch64/matrix/matmul_float4.c + aarch64/matrix/matmul_float8.c + aarch64/matrix/matmul_int1.c + aarch64/matrix/matmul_int2.c + aarch64/matrix/matmul_int4.c + aarch64/matrix/matmul_int8.c + aarch64/matrix/matmul_uint1.c + aarch64/matrix/matmul_uint2.c + aarch64/matrix/matmul_uint4.c + aarch64/matrix/matmul_uint8.c + 
aarch64/matrix/sub_float2.c
+    aarch64/matrix/sub_float4.c
+    aarch64/matrix/sub_float8.c
+    aarch64/matrix/sub_int1.c
+    aarch64/matrix/sub_int2.c
+    aarch64/matrix/sub_int4.c
+    aarch64/matrix/sub_int8.c
+    aarch64/matrix/sub_uint1.c
+    aarch64/matrix/sub_uint2.c
+    aarch64/matrix/sub_uint4.c
+    aarch64/matrix/sub_uint8.c
+    aarch64/matrix/transpose_float2.c
+    aarch64/matrix/transpose_float4.c
+    aarch64/matrix/transpose_float8.c
+    aarch64/matrix/transpose_int1.c
+    aarch64/matrix/transpose_int2.c
+    aarch64/matrix/transpose_int4.c
+    aarch64/matrix/transpose_int8.c
+  )
+else()
+  message(STATUS "AArch64 SME matrix routines disabled")
+endif()
+
 if (COMPILER_RT_HAS_AARCH64_SME)
   if (NOT COMPILER_RT_DISABLE_AARCH64_FMV AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
     list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-libc-mem-routines.S aarch64/sme-abi-init.c aarch64/sme-abi-vg.c aarch64/sme-libc-routines.c)
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_float2.c b/compiler-rt/lib/builtins/aarch64/matrix/add_float2.c
new file mode 100644
index 000000000000..49c054958cf1
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_float2.c
@@ -0,0 +1,18 @@
+//==- builtins/aarch64/matrix/add_float2.c - sme matrix operations -*- C -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void __sme_add_float2(__fp16 *dst, __fp16 *lhs,
+                                                      __fp16 *rhs, unsigned row,
+                                                      unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, h, float16, 16, f, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_float4.c b/compiler-rt/lib/builtins/aarch64/matrix/add_float4.c
new file mode 100644
index 000000000000..3e6c7a759153
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_float4.c
@@ -0,0 +1,18 @@
+//==- builtins/aarch64/matrix/add_float4.c - sme matrix operations -*- C -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void __sme_add_float4(float *dst, float *lhs,
+                                                      float *rhs, unsigned row,
+                                                      unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, w, float32, 32, f, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_float8.c b/compiler-rt/lib/builtins/aarch64/matrix/add_float8.c
new file mode 100644
index 000000000000..c75fc929a89d
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_float8.c
@@ -0,0 +1,18 @@
+//==- builtins/aarch64/matrix/add_float8.c - sme matrix operations -*- C -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void __sme_add_float8(double *dst, double *lhs,
+                                                      double *rhs, unsigned row,
+                                                      unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, d, float64, 64, f, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_int1.c b/compiler-rt/lib/builtins/aarch64/matrix/add_int1.c
new file mode 100644
index 000000000000..e5ed762d68f6
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_int1.c
@@ -0,0 +1,18 @@
+//===- builtins/aarch64/matrix/add_int1.c - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_add_int1(signed char *dst, signed char *lhs, signed char *rhs,
+               unsigned row, unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, b, int8, 8, s, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_int2.c b/compiler-rt/lib/builtins/aarch64/matrix/add_int2.c
new file mode 100644
index 000000000000..7c21bb27151d
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_int2.c
@@ -0,0 +1,18 @@
+//===- builtins/aarch64/matrix/add_int2.c - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_add_int2(signed short *dst, signed short *lhs, signed short *rhs,
+               unsigned row, unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, h, int16, 16, s, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_int4.c b/compiler-rt/lib/builtins/aarch64/matrix/add_int4.c
new file mode 100644
index 000000000000..e5ab80f0b2db
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_int4.c
@@ -0,0 +1,18 @@
+//===- builtins/aarch64/matrix/add_int4.c - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_add_int4(signed int *dst, signed int *lhs, signed int *rhs, unsigned row,
+               unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, w, int32, 32, s, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_int8.c b/compiler-rt/lib/builtins/aarch64/matrix/add_int8.c
new file mode 100644
index 000000000000..60bf63e4e3b5
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_int8.c
@@ -0,0 +1,18 @@
+//===- builtins/aarch64/matrix/add_int8.c - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_add_int8(signed long long *dst, signed long long *lhs,
+               signed long long *rhs, unsigned row, unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, d, int64, 64, s, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_uint1.c b/compiler-rt/lib/builtins/aarch64/matrix/add_uint1.c
new file mode 100644
index 000000000000..44b451e6aa48
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_uint1.c
@@ -0,0 +1,18 @@
+//==- builtins/aarch64/matrix/add_uint1.c - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_add_uint1(unsigned char *dst, unsigned char *lhs, unsigned char *rhs,
+                unsigned row, unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, b, uint8, 8, u, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_uint2.c b/compiler-rt/lib/builtins/aarch64/matrix/add_uint2.c
new file mode 100644
index 000000000000..f4545e1baeef
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_uint2.c
@@ -0,0 +1,18 @@
+//==- builtins/aarch64/matrix/add_uint2.c - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_add_uint2(unsigned short *dst, unsigned short *lhs, unsigned short *rhs,
+                unsigned row, unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, h, uint16, 16, u, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_uint4.c b/compiler-rt/lib/builtins/aarch64/matrix/add_uint4.c
new file mode 100644
index 000000000000..31a4af9fe071
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_uint4.c
@@ -0,0 +1,18 @@
+//==- builtins/aarch64/matrix/add_uint4.c - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_add_uint4(unsigned int *dst, unsigned int *lhs, unsigned int *rhs,
+                unsigned row, unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, w, uint32, 32, u, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_uint8.c b/compiler-rt/lib/builtins/aarch64/matrix/add_uint8.c
new file mode 100644
index 000000000000..c2c0f2106c46
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_uint8.c
@@ -0,0 +1,18 @@
+//==- builtins/aarch64/matrix/add_uint8.c - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_add_uint8(unsigned long long *dst, unsigned long long *lhs,
+                unsigned long long *rhs, unsigned row, unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, d, uint64, 64, u, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_float2.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_float2.c
new file mode 100644
index 000000000000..cee21c2ab2eb
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_float2.c
@@ -0,0 +1,19 @@
+//= builtins/aarch64/matrix/matmul_float2.c - sme matrix operations -*- C -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening FMOPA instruction for the __fp16
+// type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_float2(__fp16 *dst, __fp16 *lhs, __fp16 *rhs, unsigned lhs_row,
+                    unsigned lhs_column, unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, h,
+                      float16, 16, f);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_float4.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_float4.c
new file mode 100644
index 000000000000..396768e5073b
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_float4.c
@@ -0,0 +1,17 @@
+//= builtins/aarch64/matrix/matmul_float4.c - sme matrix operations -*- C -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+__attribute__((target("+sme,+sve"))) __arm_new("za") void __sme_matmul_float4(
+    float *dst, float *lhs, float *rhs, unsigned lhs_row, unsigned lhs_column,
+    unsigned rhs_column) __arm_streaming {
+  BREAK_SME_ACLE_MATMUL_SME(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, w,
+                            float32, 32, 0b00010001);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_float8.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_float8.c
new file mode 100644
index 000000000000..983427dbb200
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_float8.c
@@ -0,0 +1,18 @@
+//= builtins/aarch64/matrix/matmul_float8.c - sme matrix operations -*- C -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+__attribute__((target("+sme,+sve,+sme-f64f64"))) __arm_new(
+    "za") void __sme_matmul_float8(double *dst, double *lhs, double *rhs,
+                                   unsigned lhs_row, unsigned lhs_column,
+                                   unsigned rhs_column) __arm_streaming {
+  BREAK_SME_ACLE_MATMUL_SME(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, d,
+                            float64, 64, 0b00000001);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_int1.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_int1.c
new file mode 100644
index 000000000000..a5fd9a59186e
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_int1.c
@@ -0,0 +1,19 @@
+//=- builtins/aarch64/matrix/matmul_int1.c - sme matrix operations -*- C -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening outer-product (MOPA) instruction for
+// the i8 type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_int1(signed char *dst, signed char *lhs, signed char *rhs,
+                  unsigned lhs_row, unsigned lhs_column, unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, b, int8,
+                      8, s);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_int2.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_int2.c
new file mode 100644
index 000000000000..5d070095a43c
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_int2.c
@@ -0,0 +1,19 @@
+//=- builtins/aarch64/matrix/matmul_int2.c - sme matrix operations -*- C -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening outer-product (MOPA) instruction for
+// the i16 type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_int2(signed short *dst, signed short *lhs, signed short *rhs,
+                  unsigned lhs_row, unsigned lhs_column, unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, h, int16,
+                      16, s);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_int4.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_int4.c
new file mode 100644
index 000000000000..81a59c319026
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_int4.c
@@ -0,0 +1,19 @@
+//=- builtins/aarch64/matrix/matmul_int4.c - sme matrix operations -*- C -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening outer-product (MOPA) instruction for
+// the i32 type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_int4(signed int *dst, signed int *lhs, signed int *rhs,
+                  unsigned lhs_row, unsigned lhs_column, unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, w, int32,
+                      32, s);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_int8.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_int8.c
new file mode 100644
index 000000000000..84d83061738c
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_int8.c
@@ -0,0 +1,19 @@
+//=- builtins/aarch64/matrix/matmul_int8.c - sme matrix operations -*- C -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening outer-product (MOPA) instruction for
+// the i64 type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_int8(int64_t *dst, int64_t *lhs, int64_t *rhs, unsigned lhs_row,
+                  unsigned lhs_column, unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, d, int64,
+                      64, s);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint1.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint1.c
new file mode 100644
index 000000000000..f5873f22cdb3
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint1.c
@@ -0,0 +1,19 @@
+//=- builtins/aarch64/matrix/matmul_uint1.c - sme matrix operations -*- C -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening outer-product (MOPA) instruction for
+// the zero-extended (unsigned) i8 type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_uint1(unsigned char *dst, unsigned char *lhs, unsigned char *rhs,
+                   unsigned lhs_row, unsigned lhs_column, unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, b, uint8,
+                      8, u);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint2.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint2.c
new file mode 100644
index 000000000000..5962a02a2e5d
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint2.c
@@ -0,0 +1,20 @@
+//=- builtins/aarch64/matrix/matmul_uint2.c - sme matrix operations -*- C -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening outer-product (MOPA) instruction for
+// the zero-extended (unsigned) i16 type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_uint2(unsigned short *dst, unsigned short *lhs,
+                   unsigned short *rhs, unsigned lhs_row, unsigned lhs_column,
+                   unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, h, uint16,
+                      16, u);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint4.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint4.c
new file mode 100644
index 000000000000..0fd33ea44fb2
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint4.c
@@ -0,0 +1,19 @@
+//=- builtins/aarch64/matrix/matmul_uint4.c - sme matrix operations -*- C -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening outer-product (MOPA) instruction for
+// the zero-extended (unsigned) i32 type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_uint4(unsigned int *dst, unsigned int *lhs, unsigned int *rhs,
+                   unsigned lhs_row, unsigned lhs_column, unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, w, uint32,
+                      32, u);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint8.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint8.c
new file mode 100644
index 000000000000..c4a9a010bf61
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint8.c
@@ -0,0 +1,19 @@
+//=- builtins/aarch64/matrix/matmul_uint8.c - sme matrix operations -*- C -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening outer-product (MOPA) instruction for
+// the zero-extended (unsigned) i64 type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_uint8(uint64_t *dst, uint64_t *lhs, uint64_t *rhs,
+                   unsigned lhs_row, unsigned lhs_column, unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, d, uint64,
+                      64, u);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sme_acle.h b/compiler-rt/lib/builtins/aarch64/matrix/sme_acle.h
new file mode 100644
index 000000000000..fb40593ad89d
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/sme_acle.h
@@ -0,0 +1,136 @@
+//===- builtins/aarch64/matrix/sme_acle.h - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This file implements the runtime SME routines for matrix_type operations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _SME_ACLE_H_
+#define _SME_ACLE_H_
+
+#include <arm_sme.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#define SVCNT(x) svcnt##x
+#define VectorType(x) sv##x##_t
+#define SVWHILELT(x) svwhilelt_b##x##_u32
+#define SMZERO svzero_mask_za
+#define SMLD1H(x) svld1_hor_za##x
+#define SMST1V(x) svst1_ver_za##x
+#define SVLD1_FLOAT(x) svld1_f##x
+#define SMMOPA_FLOAT(x) svmopa_za##x##_f##x##_m
+#define SMEXTRACTV_FLOAT(x) svread_ver_za##x##_f##x##_m
+
+#define SVDUP(type, bit) svdup_##type##bit
+#define SVLD1(type, bit) svld1_##type##bit
+#define SVMLA(type, bit) svmla_##type##bit##_x
+#define SVST1(type, bit) svst1_##type##bit
+#define SVBINOP(type, bit, op) sv##op##_##type##bit##_x
+
+// There are no add/sub (or similar) binary operations on ZA tiles, so we use
+// the SVE versions instead.
+#define SME_ACLE_BINOP_SVE(matA, matB, matC, M, N, svcnt_type, vec_type, bit, \
+                           func_type, op_type) \
+  do { \
+    uint64_t vscale = SVCNT(svcnt_type)(); \
+    VectorType(vec_type) src1, src2, res; \
+    svbool_t p; \
+    for (size_t i = 0; i < M; i += vscale) { \
+      p = SVWHILELT(bit)(i, M); \
+      for (size_t j = 0; j < N; j++) { \
+        src1 = SVLD1(func_type, bit)(p, matA + j * M + i); \
+        src2 = SVLD1(func_type, bit)(p, matB + j * M + i); \
+        res = SVBINOP(func_type, bit, op_type)(p, src1, src2); \
+        SVST1(func_type, bit)(p, matC + j * M + i, res); \
+      } \
+    } \
+  } while (0)
+
+// SME only provides non-widening matmul (outer-product) instructions for
+// float4 and float8; for every other element type we use the SVE version.
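+//
+// For reference, SME_ACLE_MATMUL_SVE below is a predicated, strip-mined SVE
+// rendering of the following scalar column-major matmul, where element (i, j)
+// of an M-row matrix lives at index j * M + i (explanatory sketch only):
+//
+//   for (size_t j = 0; j < N; j++)      // C (MxN) = A (MxK) * B (KxN)
+//     for (size_t i = 0; i < M; i++) {
+//       T acc = 0;
+//       for (size_t k = 0; k < K; k++)
+//         acc += A[k * M + i] * B[j * K + k];
+//       C[j * M + i] = acc;
+//     }
+//
+// The SME path (BREAK_SME_ACLE_MATMUL_SME further below) computes the same
+// product blockwise, accumulating vscale x vscale tiles in ZA via FMOPA.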
+#define SME_ACLE_MATMUL_SVE(matA, matB, matC, M, K, N, svcnt_type, vec_type, \
+                            bit, func_type) \
+  do { \
+    uint64_t vscale = SVCNT(svcnt_type)(); \
+    VectorType(vec_type) src1, src2, acc; \
+    svbool_t p; \
+    for (size_t j = 0; j < N; j++) \
+      for (size_t i = 0; i < M; i += vscale) { \
+        acc = SVDUP(func_type, bit)(0); \
+        p = SVWHILELT(bit)(i, M); \
+        for (size_t k = 0; k < K; ++k) { \
+          src1 = SVDUP(func_type, bit)(matB[j * K + k]); \
+          src2 = SVLD1(func_type, bit)(p, matA + k * M + i); \
+          acc = SVMLA(func_type, bit)(p, acc, src1, src2); \
+        } \
+        SVST1(func_type, bit)(p, matC + j * M + i, acc); \
+      } \
+  } while (0)
+
+// matrix_type in clang is column-major, so we can just reuse Fortran's matmul
+// version.
+#define BREAK_SME_ACLE_MATMUL_SME(matA, matB, matC, M, K, N, svcnt_type, \
+                                  vec_type, bit, zero_mask) \
+  do { \
+    uint64_t vscale = SVCNT(svcnt_type)(); \
+    svbool_t pm, pn, pk; \
+    VectorType(vec_type) src1, src2; \
+    for (size_t i = 0; i < M; i += vscale) { \
+      pm = SVWHILELT(bit)(i, M); \
+      for (size_t j = 0; j < N; j += vscale) { \
+        pn = SVWHILELT(bit)(j, N); \
+        SMZERO(zero_mask); \
+        for (size_t k = 0; k < K; k += vscale) { \
+          pk = SVWHILELT(bit)(k, K); \
+          for (size_t t = 0; t < vscale; t++) { \
+            if (j + t == N) \
+              break; \
+            SMLD1H(bit)(1, t, pk, matB + (j + t) * K + k); \
+          } \
+          for (size_t t = 0; t < vscale; t++) { \
+            if (k + t == K) \
+              break; \
+            src2 = SMEXTRACTV_FLOAT(bit)(src2, pn, 1, t); \
+            src1 = SVLD1_FLOAT(bit)(pm, matA + (k + t) * M + i); \
+            SMMOPA_FLOAT(bit)(0, pm, pn, src1, src2); \
+          } \
+        } \
+        for (size_t t = 0; t < vscale; t++) { \
+          if (j + t == N) \
+            break; \
+          SMST1V(bit)(0, t, pm, matC + (j + t) * M + i); \
+        } \
+      } \
+    } \
+  } while (0)
+
+// matrix_type in clang is column-major, so we can just reuse Fortran's
+// transpose version.
+#define BREAK_SME_ACLE_TRANSPOSE(matA, M, N, ans, svcnt_type, bit, zero_mask) \
+  do { \
+    uint64_t vscale = SVCNT(svcnt_type)(); \
+    svbool_t pm, pn; \
+    for (size_t i = 0; i < M; i += vscale) { \
+      pm = SVWHILELT(bit)(i, M); \
+      for (size_t j = 0; j < N; j += vscale) { \
+        pn = SVWHILELT(bit)(j, N); \
+        SMZERO(zero_mask); \
+        for (size_t t = 0; t < vscale; t++) { \
+          if (j + t == N) \
+            break; \
+          SMLD1H(bit)(0, t, pm, matA + (j + t) * M + i); \
+        } \
+        for (size_t t = 0; t < vscale; t++) { \
+          if (i + t == M) \
+            break; \
+          SMST1V(bit)(0, t, pn, ans + (i + t) * N + j); \
+        } \
+      } \
+    } \
+  } while (0)
+
+#endif
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_float2.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_float2.c
new file mode 100644
index 000000000000..6105b0479c03
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_float2.c
@@ -0,0 +1,18 @@
+//==- builtins/aarch64/matrix/sub_float2.c - sme matrix operations -*- C -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native sub instruction for the matrix type, so we
+// use the SVE version.
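+//
+// For reference, SME_ACLE_BINOP_SVE used here is a predicated SVE rendering of
+// this scalar column-major elementwise loop (explanatory sketch; `op` is add
+// or sub depending on the routine):
+//
+//   for (size_t j = 0; j < column; j++)
+//     for (size_t i = 0; i < row; i++)
+//       dst[j * row + i] = lhs[j * row + i] op rhs[j * row + i];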
+__attribute__((target("+sve"))) void __sme_sub_float2(__fp16 *dst, __fp16 *lhs, + __fp16 *rhs, unsigned row, + unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, h, float16, 16, f, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_float4.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_float4.c new file mode 100644 index 000000000000..3a57ffbe2801 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_float4.c @@ -0,0 +1,18 @@ +//==- builtins/arrch64/matrix/sub_float4.c - sme matrix operations -*- C -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. +__attribute__((target("+sve"))) void __sme_sub_float4(float *dst, float *lhs, + float *rhs, unsigned row, + unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, w, float32, 32, f, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_float8.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_float8.c new file mode 100644 index 000000000000..29e5b232b10a --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_float8.c @@ -0,0 +1,18 @@ +//==- builtins/arrch64/matrix/sub_float8.c - sme matrix operations -*- C -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. +__attribute__((target("+sve"))) void __sme_sub_float8(double *dst, double *lhs, + double *rhs, unsigned row, + unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, d, float64, 64, f, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_int1.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_int1.c new file mode 100644 index 000000000000..66834c072895 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_int1.c @@ -0,0 +1,18 @@ +//===- builtins/arrch64/matrix/sub_int1.c - sme matrix operations -*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. 
+__attribute__((target("+sve"))) void +__sme_sub_int1(signed char *dst, signed char *lhs, signed char *rhs, + unsigned row, unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, b, int8, 8, s, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_int2.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_int2.c new file mode 100644 index 000000000000..3c33f889b791 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_int2.c @@ -0,0 +1,18 @@ +//===- builtins/arrch64/matrix/sub_int2.c - sme matrix operations -*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. +__attribute__((target("+sve"))) void +__sme_sub_int2(signed short *dst, signed short *lhs, signed short *rhs, + unsigned row, unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, h, int16, 16, s, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_int4.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_int4.c new file mode 100644 index 000000000000..06361835dfc9 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_int4.c @@ -0,0 +1,18 @@ +//===- builtins/arrch64/matrix/sub_int4.c - sme matrix operations -*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. +__attribute__((target("+sve"))) void +__sme_sub_int4(signed int *dst, signed int *lhs, signed int *rhs, unsigned row, + unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, w, int32, 32, s, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_int8.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_int8.c new file mode 100644 index 000000000000..404f22d39a37 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_int8.c @@ -0,0 +1,18 @@ +//===- builtins/arrch64/matrix/sub_int8.c - sme matrix operations -*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. 
+__attribute__((target("+sve"))) void +__sme_sub_int8(signed long long *dst, signed long long *lhs, + signed long long *rhs, unsigned row, unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, d, int64, 64, s, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_uint1.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_uint1.c new file mode 100644 index 000000000000..5370c8024cff --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_uint1.c @@ -0,0 +1,18 @@ +//==- builtins/arrch64/matrix/sub_uint1.c - sme matrix operations -*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. +__attribute__((target("+sve"))) void +__sme_sub_uint1(unsigned char *dst, unsigned char *lhs, unsigned char *rhs, + unsigned row, unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, b, uint8, 8, u, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_uint2.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_uint2.c new file mode 100644 index 000000000000..e2f108ac36ba --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_uint2.c @@ -0,0 +1,18 @@ +//==- builtins/arrch64/matrix/sub_uint2.c - sme matrix operations -*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. +__attribute__((target("+sve"))) void +__sme_sub_uint2(unsigned short *dst, unsigned short *lhs, unsigned short *rhs, + unsigned row, unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, h, uint16, 16, u, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_uint4.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_uint4.c new file mode 100644 index 000000000000..9b2bc83164cb --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_uint4.c @@ -0,0 +1,18 @@ +//==- builtins/arrch64/matrix/sub_uint4.c - sme matrix operations -*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. 
+__attribute__((target("+sve"))) void +__sme_sub_uint4(unsigned int *dst, unsigned int *lhs, unsigned int *rhs, + unsigned row, unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, w, uint32, 32, u, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_uint8.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_uint8.c new file mode 100644 index 000000000000..da8824af7b85 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_uint8.c @@ -0,0 +1,18 @@ +//==- builtins/arrch64/matrix/sub_uint8.c - sme matrix operations -*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. +__attribute__((target("+sve"))) void +__sme_sub_uint8(unsigned long long *dst, unsigned long long *lhs, + unsigned long long *rhs, unsigned row, unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, d, uint64, 64, u, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/transpose_float2.c b/compiler-rt/lib/builtins/aarch64/matrix/transpose_float2.c new file mode 100644 index 000000000000..832258df2dfb --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/transpose_float2.c @@ -0,0 +1,16 @@ +//==- builtins/arrch64/matrix/transpose_float2.c - sme matrix operations C -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +__attribute__((target("+sme,+sve"))) __arm_new( + "za") void __sme_transpose_float2(__fp16 *dst, __fp16 *src, unsigned row, + unsigned column) __arm_streaming { + BREAK_SME_ACLE_TRANSPOSE(src, row, column, dst, h, 16, 0b01010101); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/transpose_float4.c b/compiler-rt/lib/builtins/aarch64/matrix/transpose_float4.c new file mode 100644 index 000000000000..94ea3f5c7016 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/transpose_float4.c @@ -0,0 +1,16 @@ +//==- builtins/arrch64/matrix/transpose_float4.c - sme matrix operations C -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +__attribute__((target("+sme,+sve"))) __arm_new( + "za") void __sme_transpose_float4(float *dst, float *src, unsigned row, + unsigned column) __arm_streaming { + BREAK_SME_ACLE_TRANSPOSE(src, row, column, dst, w, 32, 0b00010001); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/transpose_float8.c b/compiler-rt/lib/builtins/aarch64/matrix/transpose_float8.c new file mode 100644 index 000000000000..8be09a2f0d41 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/transpose_float8.c @@ -0,0 +1,16 @@ +//==- builtins/arrch64/matrix/transpose_float8.c - sme matrix operations C -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+__attribute__((target("+sme,+sve"))) __arm_new(
+    "za") void __sme_transpose_float8(double *dst, double *src, unsigned row,
+                                      unsigned column) __arm_streaming {
+  BREAK_SME_ACLE_TRANSPOSE(src, row, column, dst, d, 64, 0b00000001);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/transpose_int1.c b/compiler-rt/lib/builtins/aarch64/matrix/transpose_int1.c
new file mode 100644
index 000000000000..89be7c9582f4
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/transpose_int1.c
@@ -0,0 +1,17 @@
+//===- builtins/aarch64/matrix/transpose_int1.c - sme matrix operations C -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// For transpose, signed or unsigned does not matter.
+__attribute__((target("+sme,+sve"))) __arm_new("za") void __sme_transpose_int1(
+    signed char *dst, signed char *src, unsigned row,
+    unsigned column) __arm_streaming {
+  BREAK_SME_ACLE_TRANSPOSE(src, row, column, dst, b, 8, 0b11111111);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/transpose_int2.c b/compiler-rt/lib/builtins/aarch64/matrix/transpose_int2.c
new file mode 100644
index 000000000000..d7ab08592b5b
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/transpose_int2.c
@@ -0,0 +1,16 @@
+//===- builtins/aarch64/matrix/transpose_int2.c - sme matrix operations C -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// For transpose, signed or unsigned does not matter.
+__attribute__((target("+sme,+sve"))) __arm_new("za") void __sme_transpose_int2(
+    short *dst, short *src, unsigned row, unsigned column) __arm_streaming {
+  BREAK_SME_ACLE_TRANSPOSE(src, row, column, dst, h, 16, 0b01010101);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/transpose_int4.c b/compiler-rt/lib/builtins/aarch64/matrix/transpose_int4.c
new file mode 100644
index 000000000000..cc415709dfcb
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/transpose_int4.c
@@ -0,0 +1,16 @@
+//===- builtins/aarch64/matrix/transpose_int4.c - sme matrix operations C -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// For transpose, signed or unsigned does not matter.
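+// Illustrative only (not part of the build): per vscale x vscale block, the
+// macro loads source columns into horizontal slices of ZA tile 0 and stores
+// vertical slices back out, which transposes the block in one pass:
+//   svld1_hor_za32(0, t, pm, src + (j + t) * row + i);    // column j+t in
+//   svst1_ver_za32(0, t, pn, dst + (i + t) * column + j); // column i+t out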
+__attribute__((target("+sme,+sve"))) __arm_new("za") void __sme_transpose_int4( + int *dst, int *src, unsigned row, unsigned column) __arm_streaming { + BREAK_SME_ACLE_TRANSPOSE(src, row, column, dst, w, 32, 0b00010001); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/transpose_int8.c b/compiler-rt/lib/builtins/aarch64/matrix/transpose_int8.c new file mode 100644 index 000000000000..2ce680d0b5b0 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/transpose_int8.c @@ -0,0 +1,17 @@ +//===- builtins/arrch64/matrix/transpose_int8.c - sme matrix operations C -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// For transpose, signed or unsigned does not matter. +__attribute__((target("+sme,+sve"))) __arm_new("za") void __sme_transpose_int8( + long long *dst, long long *src, unsigned row, + unsigned column) __arm_streaming { + BREAK_SME_ACLE_TRANSPOSE(src, row, column, dst, d, 64, 0b00000001); + return; +} diff --git a/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-add.c b/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-add.c new file mode 100644 index 000000000000..18a7b96e0f9a --- /dev/null +++ b/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-add.c @@ -0,0 +1,172 @@ +// REQUIRES: aarch64-target-arch,aarch64_sme_run + +// RUN: %clang_builtins -march=armv9.1-a+sme -fenable-matrix %s %librt -o %t +// RUN: %run %t 2>&1 | FileCheck %s +// RUN: %clang_builtins -march=armv9.1-a+sme -fenable-matrix %s %librt -O3 -o %t_smeopt +// RUN: %run %t_smeopt 2>&1 | FileCheck %s +// RUN: %clang_builtins -O3 -march=armv9.1-a -fenable-matrix %s %librt -o %t_nosmeopt +// RUN: %run %t_nosmeopt 2>&1 | FileCheck %s + +#include + +#define M 2 +#define N 4 + +typedef __bf16 m_tbf16 __attribute__((matrix_type(M, N))); + +typedef float m_tfloat __attribute__((matrix_type(M, N))); + +typedef double m_tdouble __attribute__((matrix_type(M, N))); + +typedef signed int m_tint __attribute__((matrix_type(M, N))); + +typedef unsigned long long m_tull __attribute__((matrix_type(M, N))); + +int main() { + m_tbf16 a; + m_tbf16 b; + float v = 0.0; + + // Input matrix 1: + // 0.00 1.00 2.00 3.00 + // 4.00 5.00 6.00 7.00 + // + // Input matrix 2: + // 8.00 9.00 10.00 11.00 + // 12.00 13.00 14.00 15.00 + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + a[i][j] = v++; + + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + b[i][j] = v++; + + m_tbf16 c = a + b; + + // CHECK: 8.00 10.00 12.00 14.00 + // CHECK-NEXT: 16.00 18.00 20.00 22.00 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%.2f ", (float)c[i][j]); + printf("\n"); + } + + m_tfloat af; + m_tfloat bf; + float vf = 0.0; + + // Input matrix 1: + // 0.00 1.00 2.00 3.00 + // 4.00 5.00 6.00 7.00 + // + // Input matrix 2: + // 8.00 9.00 10.00 11.00 + // 12.00 13.00 14.00 15.00 + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + af[i][j] = vf++; + + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + bf[i][j] = vf++; + + m_tfloat cf = af + bf; + + // CHECK: 8.00 10.00 12.00 14.00 + // CHECK-NEXT: 16.00 18.00 20.00 22.00 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%.2f ", cf[i][j]); + printf("\n"); + } + + m_tdouble ad; + m_tdouble bd; + double vd = 0.0; + + // Input matrix 1: + // 0.00 1.00 
+  // 4.00 5.00 6.00 7.00
+  //
+  // Input matrix 2:
+  // 8.00 9.00 10.00 11.00
+  // 12.00 13.00 14.00 15.00
+  for (int i = 0; i < M; i++)
+    for (int j = 0; j < N; j++)
+      ad[i][j] = vd++;
+
+  for (int i = 0; i < M; i++)
+    for (int j = 0; j < N; j++)
+      bd[i][j] = vd++;
+
+  m_tdouble cd = ad + bd;
+
+  // CHECK: 8.00 10.00 12.00 14.00
+  // CHECK-NEXT: 16.00 18.00 20.00 22.00
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < N; j++)
+      printf("%.2f ", cd[i][j]);
+    printf("\n");
+  }
+
+  m_tint ai;
+  m_tint bi;
+  int vi = 0;
+
+  // Input matrix 1:
+  // 0 1 2 3
+  // 4 5 6 7
+  //
+  // Input matrix 2:
+  // 8 9 10 11
+  // 12 13 14 15
+  for (int i = 0; i < M; i++)
+    for (int j = 0; j < N; j++)
+      ai[i][j] = vi++;
+
+  for (int i = 0; i < M; i++)
+    for (int j = 0; j < N; j++)
+      bi[i][j] = vi++;
+
+  m_tint ci = ai + bi;
+
+  // CHECK: 8 10 12 14
+  // CHECK-NEXT: 16 18 20 22
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < N; j++)
+      printf("%d ", ci[i][j]);
+    printf("\n");
+  }
+
+  m_tull au;
+  m_tull bu;
+  unsigned long long int vu = 0;
+
+  // Input matrix 1:
+  // 0 1 2 3
+  // 4 5 6 7
+  //
+  // Input matrix 2:
+  // 8 9 10 11
+  // 12 13 14 15
+  for (int i = 0; i < M; i++)
+    for (int j = 0; j < N; j++)
+      au[i][j] = vu++;
+
+  for (int i = 0; i < M; i++)
+    for (int j = 0; j < N; j++)
+      bu[i][j] = vu++;
+
+  m_tull cu = au + bu;
+
+  // CHECK: 8 10 12 14
+  // CHECK-NEXT: 16 18 20 22
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < N; j++)
+      printf("%llu ", cu[i][j]);
+    printf("\n");
+  }
+
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-matmul.c b/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-matmul.c
new file mode 100644
index 000000000000..b69c79918d67
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-matmul.c
@@ -0,0 +1,189 @@
+// REQUIRES: aarch64-target-arch,aarch64_sme_run

+// RUN: %clang_builtins -march=armv9.1-a+sme -fenable-matrix %s %librt -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+// RUN: %clang_builtins -march=armv9.1-a+sme -fenable-matrix %s %librt -O3 -o %t_smeopt
+// RUN: %run %t_smeopt 2>&1 | FileCheck %s
+// RUN: %clang_builtins -O3 -march=armv9.1-a -fenable-matrix %s %librt -o %t_nosmeopt
+// RUN: %run %t_nosmeopt 2>&1 | FileCheck %s
+
+#include <stdio.h>
+
+// First matrix is 2x3. Second matrix is 3x4. Result matrix is 2x4.
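+// The expected results below are plain row-by-column dot products, e.g.
+// result[0][0] = 0*6 + 1*10 + 2*14 = 38 and
+// result[1][0] = 3*6 + 4*10 + 5*14 = 128.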
+#define M 2 +#define K 3 +#define N 4 + +typedef __bf16 m1_tbf16 __attribute__((matrix_type(M, K))); +typedef __bf16 m2_tbf16 __attribute__((matrix_type(K, N))); +typedef __bf16 mr_tbf16 __attribute__((matrix_type(M, N))); + +typedef float m1_tfloat __attribute__((matrix_type(M, K))); +typedef float m2_tfloat __attribute__((matrix_type(K, N))); +typedef float mr_tfloat __attribute__((matrix_type(M, N))); + +typedef double m1_tdouble __attribute__((matrix_type(M, K))); +typedef double m2_tdouble __attribute__((matrix_type(K, N))); +typedef double mr_tdouble __attribute__((matrix_type(M, N))); + +typedef signed int m1_tint __attribute__((matrix_type(M, K))); +typedef signed int m2_tint __attribute__((matrix_type(K, N))); +typedef signed int mr_tint __attribute__((matrix_type(M, N))); + +typedef unsigned long long m1_tull __attribute__((matrix_type(M, K))); +typedef unsigned long long m2_tull __attribute__((matrix_type(K, N))); +typedef unsigned long long mr_tull __attribute__((matrix_type(M, N))); + +int main() { + m1_tbf16 a; + m2_tbf16 b; + float v = 0.0; + + // Input matrix 1: + // 0.00 1.00 2.00 + // 3.00 4.00 5.00 + // + // Input matrix 2: + // 6.00 7.00 8.00 9.00 + // 10.00 11.00 12.00 13.00 + // 14.00 15.00 16.00 17.00 + for (int i = 0; i < M; i++) + for (int j = 0; j < K; j++) + a[i][j] = v++; + + for (int i = 0; i < K; i++) + for (int j = 0; j < N; j++) + b[i][j] = v++; + + mr_tbf16 c = a * b; + + // CHECK: 38.00 41.00 44.00 47.00 + // CHECK-NEXT: 128.00 140.00 152.00 164.00 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%.2f ", (float)c[i][j]); + printf("\n"); + } + + m1_tfloat af; + m2_tfloat bf; + float vf = 0.0; + + // Input matrix 1: + // 0.00 1.00 2.00 + // 3.00 4.00 5.00 + // + // Input matrix 2: + // 6.00 7.00 8.00 9.00 + // 10.00 11.00 12.00 13.00 + // 14.00 15.00 16.00 17.00 + for (int i = 0; i < M; i++) + for (int j = 0; j < K; j++) + af[i][j] = vf++; + + for (int i = 0; i < K; i++) + for (int j = 0; j < N; j++) + bf[i][j] = vf++; + + mr_tfloat cf = af * bf; + + // CHECK: 38.00 41.00 44.00 47.00 + // CHECK-NEXT: 128.00 140.00 152.00 164.00 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%.2f ", cf[i][j]); + printf("\n"); + } + + m1_tdouble ad; + m2_tdouble bd; + double vd = 0.0; + + // Input matrix 1: + // 0.00 1.00 2.00 + // 3.00 4.00 5.00 + // + // Input matrix 2: + // 6.00 7.00 8.00 9.00 + // 10.00 11.00 12.00 13.00 + // 14.00 15.00 16.00 17.00 + for (int i = 0; i < M; i++) + for (int j = 0; j < K; j++) + ad[i][j] = vd++; + + for (int i = 0; i < K; i++) + for (int j = 0; j < N; j++) + bd[i][j] = vd++; + + mr_tdouble cd = ad * bd; + + // CHECK: 38.00 41.00 44.00 47.00 + // CHECK-NEXT: 128.00 140.00 152.00 164.00 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%.2f ", cd[i][j]); + printf("\n"); + } + + m1_tint ai; + m2_tint bi; + int vi = 0; + + // Input matrix 1: + // 0 1 2 + // 3 4 5 + // + // Input matrix 2: + // 6 7 8 9 + // 10 11 12 13 + // 14 15 16 17 + for (int i = 0; i < M; i++) + for (int j = 0; j < K; j++) + ai[i][j] = vi++; + + for (int i = 0; i < K; i++) + for (int j = 0; j < N; j++) + bi[i][j] = vi++; + + mr_tint ci = ai * bi; + + // CHECK: 38 41 44 47 + // CHECK-NEXT: 128 140 152 164 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%d ", ci[i][j]); + printf("\n"); + } + + m1_tull au; + m2_tull bu; + unsigned long long int vu = 0; + + // Input matrix 1: + // 0 1 2 + // 3 4 5 + // + // Input matrix 2: + // 6 7 8 9 + // 10 11 12 13 + // 14 15 16 17 + for (int i = 
0; i < M; i++) + for (int j = 0; j < K; j++) + au[i][j] = vu++; + + for (int i = 0; i < K; i++) + for (int j = 0; j < N; j++) + bu[i][j] = vu++; + + mr_tull cu = au * bu; + + // CHECK: 38 41 44 47 + // CHECK-NEXT: 128 140 152 164 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%lld ", cu[i][j]); + printf("\n"); + } + + return 0; +} diff --git a/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-sub.c b/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-sub.c new file mode 100644 index 000000000000..01de075a8ca9 --- /dev/null +++ b/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-sub.c @@ -0,0 +1,173 @@ +// REQUIRES: aarch64-target-arch,aarch64_sme_run + +// RUN: %clang_builtins -march=armv9.1-a+sme -fenable-matrix %s %librt -o %t +// RUN: %run %t 2>&1 | FileCheck %s +// RUN: %clang_builtins -march=armv9.1-a+sme -fenable-matrix %s %librt -O3 -o %t_smeopt +// RUN: %run %t_smeopt 2>&1 | FileCheck %s +// RUN: %clang_builtins -O3 -march=armv9.1-a -fenable-matrix %s %librt -o %t_nosmeopt +// RUN: %run %t_nosmeopt 2>&1 | FileCheck %s + +#include + +#define M 2 +#define N 4 + +typedef __bf16 m_tbf16 __attribute__((matrix_type(M, N))); + +typedef float m_tfloat __attribute__((matrix_type(M, N))); + +typedef double m_tdouble __attribute__((matrix_type(M, N))); + +typedef signed int m_tint __attribute__((matrix_type(M, N))); + +typedef unsigned long long m_tull __attribute__((matrix_type(M, N))); + +int main() { + m_tbf16 a; + m_tbf16 b; + float v = 0.0; + + // Input matrix 1: + // 0.00 1.00 2.00 3.00 + // 4.00 5.00 6.00 7.00 + // + // Input matrix 2: + // 8.00 9.00 10.00 11.00 + // 12.00 13.00 14.00 15.00 + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + a[i][j] = v++; + + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + b[i][j] = v++; + + m_tbf16 c = a - b; + + // CHECK: -8.00 -8.00 -8.00 -8.00 + // CHECK-NEXT: -8.00 -8.00 -8.00 -8.00 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%.2f ", (float)c[i][j]); + printf("\n"); + } + + m_tfloat af; + m_tfloat bf; + float vf = 0.0; + + // Input matrix 1: + // 0.00 1.00 2.00 3.00 + // 4.00 5.00 6.00 7.00 + // + // Input matrix 2: + // 8.00 9.00 10.00 11.00 + // 12.00 13.00 14.00 15.00 + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + af[i][j] = vf++; + + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + bf[i][j] = vf++; + + m_tfloat cf = af - bf; + + // CHECK: -8.00 -8.00 -8.00 -8.00 + // CHECK-NEXT: -8.00 -8.00 -8.00 -8.00 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%.2f ", cf[i][j]); + printf("\n"); + } + + m_tdouble ad; + m_tdouble bd; + double vd = 0.0; + + // Input matrix 1: + // 0.00 1.00 2.00 3.00 + // 4.00 5.00 6.00 7.00 + // + // Input matrix 2: + // 8.00 9.00 10.00 11.00 + // 12.00 13.00 14.00 15.00 + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + ad[i][j] = vd++; + + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + bd[i][j] = vd++; + + m_tdouble cd = ad - bd; + + // CHECK: -8.00 -8.00 -8.00 -8.00 + // CHECK-NEXT: -8.00 -8.00 -8.00 -8.00 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%.2f ", cd[i][j]); + printf("\n"); + } + + m_tint ai; + m_tint bi; + int vi = 0; + + // Input matrix 1: + // 0 1 2 3 + // 4 5 6 7 + // + // Input matrix 2: + // 8 9 10 11 + // 12 13 14 15 + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + ai[i][j] = vi++; + + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + bi[i][j] = vi++; + + m_tint ci = ai - bi; + + // CHECK: -8 -8 -8 -8 + // 
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < N; j++)
+      printf("%d ", ci[i][j]);
+    printf("\n");
+  }
+
+  m_tull au;
+  m_tull bu;
+  unsigned long long int vu = 0;
+
+  // Input matrix 1:
+  // 0 1 2 3
+  // 4 5 6 7
+  //
+  // Input matrix 2:
+  // 0 1 2 3
+  // 4 5 6 7
+  for (int i = 0; i < M; i++)
+    for (int j = 0; j < N; j++)
+      au[i][j] = vu++;
+
+  vu = 0;
+  for (int i = 0; i < M; i++)
+    for (int j = 0; j < N; j++)
+      bu[i][j] = vu++;
+
+  m_tull cu = au - bu;
+
+  // CHECK: 0 0 0 0
+  // CHECK-NEXT: 0 0 0 0
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < N; j++)
+      printf("%llu ", cu[i][j]);
+    printf("\n");
+  }
+
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-transpose.c b/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-transpose.c
new file mode 100644
index 000000000000..53f67e25f715
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-transpose.c
@@ -0,0 +1,113 @@
+// REQUIRES: aarch64-target-arch,aarch64_sme_run

+// RUN: %clang_builtins -march=armv9.1-a+sme -fenable-matrix %s %librt -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+// RUN: %clang_builtins -march=armv9.1-a+sme -fenable-matrix %s %librt -O3 -o %t_smeopt
+// RUN: %run %t_smeopt 2>&1 | FileCheck %s
+// RUN: %clang_builtins -O3 -march=armv9.1-a -fenable-matrix %s %librt -o %t_nosmeopt
+// RUN: %run %t_nosmeopt 2>&1 | FileCheck %s
+
+#include <stdio.h>
+
+#define ROW 2
+#define COL 3
+
+typedef __bf16 m_t __attribute__((matrix_type(ROW, COL)));
+typedef __bf16 mt_t __attribute__((matrix_type(COL, ROW)));
+
+typedef double md_t __attribute__((matrix_type(ROW, COL)));
+typedef double mdt_t __attribute__((matrix_type(COL, ROW)));
+
+typedef signed char mc_t __attribute__((matrix_type(ROW, COL)));
+typedef signed char mct_t __attribute__((matrix_type(COL, ROW)));
+
+typedef unsigned int mui_t __attribute__((matrix_type(ROW, COL)));
+typedef unsigned int muit_t __attribute__((matrix_type(COL, ROW)));
+
+int main() {
+  m_t a;
+  float v = 0.0;
+
+  // Input matrix:
+  // 0.00 1.00 2.00
+  // 3.00 4.00 5.00
+  for (int i = 0; i < ROW; i++)
+    for (int j = 0; j < COL; j++)
+      a[i][j] = v++;
+
+  mt_t b = __builtin_matrix_transpose(a);
+
+  // CHECK: 0.00 3.00
+  // CHECK-NEXT: 1.00 4.00
+  // CHECK-NEXT: 2.00 5.00
+  for (int i = 0; i < COL; i++) {
+    for (int j = 0; j < ROW; j++)
+      printf("%.2f ", (float)b[i][j]);
+    printf("\n");
+  }
+
+  md_t ad;
+  double vd = 1.0;
+
+  // Input matrix:
+  // 1.00 2.00 3.00
+  // 4.00 5.00 6.00
+  for (int i = 0; i < ROW; i++)
+    for (int j = 0; j < COL; j++)
+      ad[i][j] = vd++;
+
+  mdt_t bd = __builtin_matrix_transpose(ad);
+
+  // CHECK: 1.00 4.00
+  // CHECK-NEXT: 2.00 5.00
+  // CHECK-NEXT: 3.00 6.00
+  for (int i = 0; i < COL; i++) {
+    for (int j = 0; j < ROW; j++)
+      printf("%.2f ", bd[i][j]);
+    printf("\n");
+  }
+
+  mc_t ac;
+  signed char vc = 5;
+
+  // Input matrix:
+  // 5 6 7
+  // 8 9 10
+  for (int i = 0; i < ROW; i++)
+    for (int j = 0; j < COL; j++)
+      ac[i][j] = vc++;
+
+  mct_t bc = __builtin_matrix_transpose(ac);
+
+  // CHECK: 5 8
+  // CHECK-NEXT: 6 9
+  // CHECK-NEXT: 7 10
+  for (int i = 0; i < COL; i++) {
+    for (int j = 0; j < ROW; j++)
+      printf("%d ", bc[i][j]);
+    printf("\n");
+  }
+
+  mui_t aui;
+  unsigned int vui = 10;
+
+  // Input matrix:
+  // 10 11 12
+  // 13 14 15
+  for (int i = 0; i < ROW; i++)
+    for (int j = 0; j < COL; j++)
+      aui[i][j] = vui++;
+
+  muit_t bui = __builtin_matrix_transpose(aui);
+
+  // CHECK: 10 13
+  // CHECK-NEXT: 11 14
+  // CHECK-NEXT: 12 15
+  for (int i = 0; i < COL; i++) {
+    for (int j = 0; j < ROW; j++)
+      printf("%u ", bui[i][j]);
+    printf("\n");
+  }
+
+  return 0;
+}
diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py
index d4b1e1d71d3c..d0b38288212f 100644
--- a/compiler-rt/test/lit.common.cfg.py
+++ b/compiler-rt/test/lit.common.cfg.py
@@ -1010,3 +1010,9 @@ if config.compiler_id == "GNU":
 # llvm.
 config.substitutions.append(("%crt_src", config.compiler_rt_src_root))
 config.substitutions.append(("%llvm_src", config.llvm_src_root))
+
+if config.host_os in ["Linux"] and config.target_triple.startswith("aarch64"):
+    output = subprocess.check_output(["/usr/bin/lscpu"])
+    sme_indicator = b"smef64f64"
+    if re.search(sme_indicator, output):
+        config.available_features.add("aarch64_sme_run")
diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h
index dbf2cfb7c5e9..a2489f3e6f3b 100644
--- a/llvm/include/llvm/IR/MatrixBuilder.h
+++ b/llvm/include/llvm/IR/MatrixBuilder.h
@@ -34,6 +34,36 @@ class MatrixBuilder {
   IRBuilderBase &B;
   Module *getModule() { return B.GetInsertBlock()->getParent()->getParent(); }

+  Value *getExistingLocation(Value *V) {
+    // If V is a load, find the location it loads from so it can be reused.
+    if (!isa<LoadInst>(V))
+      return nullptr;
+
+    // We can further optimize if the load address is an alloca with only two
+    // uses: one is the store which initializes the alloca and the other is
+    // the load V itself. The store must store a value that was itself loaded
+    // from an address; that address is the memory location we need. This
+    // normally happens in the function entry, so we don't do a recursive
+    // search here.
+    Value *Addr = cast<LoadInst>(V)->getPointerOperand();
+    if (!isa<AllocaInst>(Addr) || !Addr->hasNUses(2))
+      return Addr;
+
+    Value *AnotherUse = *Addr->user_begin();
+    if (AnotherUse == V)
+      AnotherUse = *(++Addr->user_begin());
+
+    if (!isa<StoreInst>(AnotherUse))
+      return Addr;
+
+    // The stored value must itself be a load; its source is the location we
+    // want.
+    Value *StoredValue = cast<StoreInst>(AnotherUse)->getValueOperand();
+    if (!isa<LoadInst>(StoredValue))
+      return Addr;
+
+    return cast<LoadInst>(StoredValue)->getPointerOperand();
+  }
+
   std::pair<Value *, Value *>
   splatScalarOperandIfNeeded(Value *LHS, Value *RHS) {
     assert((LHS->getType()->isVectorTy() || RHS->getType()->isVectorTy()) &&
@@ -105,6 +135,44 @@ public:
     return Call;
   }

+  Value *CreateSMEMatrixTranspose(Value *Matrix, unsigned Rows,
+                                  unsigned Columns) {
+    auto *OpType = cast<FixedVectorType>(Matrix->getType());
+    auto *ElemType = OpType->getElementType();
+    auto *ReturnType = FixedVectorType::get(ElemType, Rows * Columns);
+
+    std::string FuncName = "__sme_transpose_";
+    FuncName += ElemType->isIntegerTy() ? "int" : "float";
+    FuncName += std::to_string(ElemType->getScalarSizeInBits() / 8);
+
+    // %a.addr = alloca [6 x i16]
+    // %a = load [6 x i16], ptr %0
+    // store [6 x i16] %a, ptr %a.addr
+    // %1 = load <6 x i16>, ptr %a.addr
+    //
+    // If we need a stack slot for Matrix here, we can just reuse %0; there is
+    // no need to allocate new memory.
+    Value *MemPara = getExistingLocation(Matrix);
+    if (!MemPara) {
+      MemPara = B.CreateAlloca(OpType);
+      B.CreateStore(Matrix, MemPara);
+    }
+
+    // FIXME: optimize the memory for the return address too.
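+    // The emitted sequence is then roughly (illustrative, for a 2x3 float
+    // matrix):
+    //   %ret = alloca <6 x float>
+    //   call void @__sme_transpose_float4(ptr %ret, ptr %src, i32 2, i32 3)
+    //   %res = load <6 x float>, ptr %ret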
+    Value *MemForRet = B.CreateAlloca(ReturnType);
+
+    Value *Ops[] = {MemForRet, MemPara, B.getInt32(Rows), B.getInt32(Columns)};
+    FunctionCallee Func = getModule()->getOrInsertFunction(
+        FuncName, FunctionType::get(B.getVoidTy(),
+                                    {B.getPtrTy(), B.getPtrTy(), B.getInt32Ty(),
+                                     B.getInt32Ty()},
+                                    false));
+
+    B.CreateCall(Func, Ops);
+    (cast<Function>(Func.getCallee()))->addFnAttr("aarch64_pstate_sm_enabled");
+    return B.CreateLoad(ReturnType, MemForRet);
+  }
+
   /// Create a llvm.matrix.transpose call, transposing \p Matrix with \p Rows
   /// rows and \p Columns columns.
   CallInst *CreateMatrixTranspose(Value *Matrix, unsigned Rows,
@@ -121,6 +189,93 @@ public:
     return B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name);
   }

+  Value *CreateSMEMatrixMultiply(Value *LHS, Value *RHS, unsigned LHSRows,
+                                 unsigned LHSColumns, unsigned RHSColumns,
+                                 bool IsSigned) {
+    auto *ElemType = (cast<FixedVectorType>(LHS->getType()))->getElementType();
+
+    auto *LHSType = FixedVectorType::get(ElemType, LHSRows * LHSColumns);
+    auto *RHSType = FixedVectorType::get(ElemType, LHSColumns * RHSColumns);
+    auto *ReturnType = FixedVectorType::get(ElemType, LHSRows * RHSColumns);
+
+    std::string FuncName = "__sme_matmul_";
+    FuncName += ElemType->isIntegerTy() ? (IsSigned ? "int" : "uint") : "float";
+    FuncName += std::to_string(ElemType->getScalarSizeInBits() / 8);
+
+    // First check if we can reuse some existing memory.
+    Value *MemForLHS = getExistingLocation(LHS);
+    if (!MemForLHS) {
+      MemForLHS = B.CreateAlloca(LHSType);
+      B.CreateStore(LHS, MemForLHS);
+    }
+
+    Value *MemForRHS = getExistingLocation(RHS);
+    if (!MemForRHS) {
+      MemForRHS = B.CreateAlloca(RHSType);
+      B.CreateStore(RHS, MemForRHS);
+    }
+
+    Value *MemForRet = B.CreateAlloca(ReturnType);
+
+    Value *Ops[] = {MemForRet,
+                    MemForLHS,
+                    MemForRHS,
+                    B.getInt32(LHSRows),
+                    B.getInt32(LHSColumns),
+                    B.getInt32(RHSColumns)};
+    FunctionCallee Func = getModule()->getOrInsertFunction(
+        FuncName,
+        FunctionType::get(B.getVoidTy(),
+                          {B.getPtrTy(), B.getPtrTy(), B.getPtrTy(),
+                           B.getInt32Ty(), B.getInt32Ty(), B.getInt32Ty()},
+                          false));
+
+    B.CreateCall(Func, Ops);
+    (cast<Function>(Func.getCallee()))->addFnAttr("aarch64_pstate_sm_enabled");
+    return B.CreateLoad(ReturnType, MemForRet);
+  }
+
+  // Matrix binary operations where the two matrices have the same shape, such
+  // as add and sub.
+  Value *CreateSMEMatrixBinOp(Value *LHS, Value *RHS, unsigned Rows,
+                              unsigned Columns, bool IsSigned,
+                              StringRef OpName) {
+    auto *ElemType = (cast<FixedVectorType>(LHS->getType()))->getElementType();
+
+    auto *Type = FixedVectorType::get(ElemType, Rows * Columns);
+
+    std::string FuncName = (StringRef("__sme_") + OpName + "_").str();
+    FuncName += ElemType->isIntegerTy() ? (IsSigned ? "int" : "uint") : "float";
+    FuncName += std::to_string(ElemType->getScalarSizeInBits() / 8);
+
+    // First check if we can reuse some existing memory.
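+    // E.g. if LHS was just reloaded from an alloca that only forwards an
+    // earlier load, getExistingLocation returns the original pointer and no
+    // copy is made; otherwise LHS is spilled to a fresh stack slot here.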
+    Value *MemForLHS = getExistingLocation(LHS);
+    if (!MemForLHS) {
+      MemForLHS = B.CreateAlloca(Type);
+      B.CreateStore(LHS, MemForLHS);
+    }
+
+    Value *MemForRHS = getExistingLocation(RHS);
+    if (!MemForRHS) {
+      MemForRHS = B.CreateAlloca(Type);
+      B.CreateStore(RHS, MemForRHS);
+    }
+
+    Value *MemForRet = B.CreateAlloca(Type);
+
+    Value *Ops[] = {MemForRet, MemForLHS, MemForRHS, B.getInt32(Rows),
+                    B.getInt32(Columns)};
+    FunctionCallee Func = getModule()->getOrInsertFunction(
+        FuncName, FunctionType::get(B.getVoidTy(),
+                                    {B.getPtrTy(), B.getPtrTy(), B.getPtrTy(),
+                                     B.getInt32Ty(), B.getInt32Ty()},
+                                    false));
+
+    B.CreateCall(Func, Ops);
+    (cast<Function>(Func.getCallee()))->addFnAttr("aarch64_pstate_sm_enabled");
+    return B.CreateLoad(Type, MemForRet);
+  }
+
   /// Create a llvm.matrix.multiply call, multiplying matrixes \p LHS and \p
   /// RHS.
   CallInst *CreateMatrixMultiply(Value *LHS, Value *RHS, unsigned LHSRows,
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index a8a17101b9c9..935a8f1cb394 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -193,6 +193,12 @@ static cl::opt<unsigned>
     MIResourceCutOff("misched-resource-cutoff", cl::Hidden,
                      cl::desc("Number of intervals to track"), cl::init(10));

+extern cl::opt<bool> EnableMatrix;
+
+static cl::opt<unsigned>
+    BigBasicBlock("schedule-big-basic-block", cl::Hidden, cl::init(200),
+                  cl::desc("The limit to use while scheduling a region"));
+
 // DAG subtrees must have at least this many nodes.
 static const unsigned MinSubtreeSize = 8;

@@ -636,6 +642,9 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
       MachineBasicBlock::iterator RegionEnd = R.RegionEnd;
       unsigned NumRegionInstrs = R.NumRegionInstrs;

+      if (EnableMatrix && NumRegionInstrs > BigBasicBlock)
+        continue;
+
       // Notify the scheduler of the region, even if we may skip scheduling
       // it. Perhaps it still needs to be bundled.
       Scheduler.enterRegion(&*MBB, I, RegionEnd, NumRegionInstrs);
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index 68dece6cf73e..7f8b56d8ae03 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -79,6 +79,10 @@ static cl::opt<unsigned>
     HugeRegion("dag-maps-huge-region", cl::Hidden,
                "prior to scheduling, at which point a trade-off "
                "is made to avoid excessive compile time."));

+cl::opt<bool>
+    EnableMatrix("enable-matrix", cl::init(false), cl::Hidden,
+                 cl::desc("Enable lowering of the matrix intrinsics"));
+
 static cl::opt<unsigned> ReductionSize(
     "dag-maps-reduction-size", cl::Hidden,
     cl::desc("A huge scheduling region will have maps reduced by this many "
@@ -93,8 +97,11 @@ static cl::opt<bool> SchedPrintCycles(
     "misched-print-cycles", cl::Hidden, cl::init(false),

 static unsigned getReductionSize() {
   // Always reduce a huge region with half of the elements, except
   // when user sets this number explicitly.
-  if (ReductionSize.getNumOccurrences() == 0)
+  if (ReductionSize.getNumOccurrences() == 0) {
+    if (EnableMatrix)
+      return HugeRegion / 20;
     return HugeRegion / 2;
+  }
   return ReductionSize;
 }

@@ -1010,11 +1017,12 @@ void ScheduleDAGInstrs::buildSchedGraph(AAResults *AA,
     }

     // Reduce maps if they grow huge.
-    if (Stores.size() + Loads.size() >= HugeRegion) {
+    unsigned RegionSize = EnableMatrix ? (HugeRegion / 10) : HugeRegion;
+    if (Stores.size() + Loads.size() >= RegionSize) {
       LLVM_DEBUG(dbgs() << "Reducing Stores and Loads maps.\n";);
       reduceHugeMemNodeMaps(Stores, Loads, getReductionSize());
     }
-    if (NonAliasStores.size() + NonAliasLoads.size() >= HugeRegion) {
+    if (NonAliasStores.size() + NonAliasLoads.size() >= RegionSize) {
       LLVM_DEBUG(
           dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n";);
       reduceHugeMemNodeMaps(NonAliasStores, NonAliasLoads, getReductionSize());
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 6f36bdad780a..03389905ef57 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -275,9 +275,7 @@ static cl::opt<bool> EnableOrderFileInstrumentation(
     "enable-order-file-instrumentation", cl::init(false), cl::Hidden,
     cl::desc("Enable order file instrumentation (default = off)"));

-static cl::opt<bool>
-    EnableMatrix("enable-matrix", cl::init(false), cl::Hidden,
-                 cl::desc("Enable lowering of the matrix intrinsics"));
+extern cl::opt<bool> EnableMatrix;

 static cl::opt<bool> EnableConstraintElimination(
     "enable-constraint-elimination", cl::init(true), cl::Hidden,
-- 
Gitee