From 999e35dab3bf7a8b4ff2679d4bc443dcd77d348c Mon Sep 17 00:00:00 2001
From: chen zheng
Date: Tue, 1 Jul 2025 12:40:41 +0800
Subject: [PATCH] [SME][matrix_type] lower matrix_type with ARM SME/SVE
 instructions

---
 clang/include/clang/AST/Type.h                |  15 +
 clang/lib/CodeGen/CGBuiltin.cpp               |  16 +-
 clang/lib/CodeGen/CGExprScalar.cpp            |  43 +-
 clang/lib/CodeGen/Targets/AArch64.cpp         |   8 +
 clang/lib/Driver/ToolChain.cpp                |   4 +
 .../aarch64-sme-matmul.c                      | 572 ++++++++++++++++++
 .../aarch64-sme-transpose.c                   | 337 +++++++++++
 compiler-rt/lib/builtins/CMakeLists.txt       |  47 ++
 .../lib/builtins/aarch64/matrix/add_float2.c  |  18 +
 .../lib/builtins/aarch64/matrix/add_float4.c  |  18 +
 .../lib/builtins/aarch64/matrix/add_float8.c  |  18 +
 .../lib/builtins/aarch64/matrix/add_int1.c    |  18 +
 .../lib/builtins/aarch64/matrix/add_int2.c    |  18 +
 .../lib/builtins/aarch64/matrix/add_int4.c    |  18 +
 .../lib/builtins/aarch64/matrix/add_int8.c    |  18 +
 .../lib/builtins/aarch64/matrix/add_uint1.c   |  18 +
 .../lib/builtins/aarch64/matrix/add_uint2.c   |  18 +
 .../lib/builtins/aarch64/matrix/add_uint4.c   |  18 +
 .../lib/builtins/aarch64/matrix/add_uint8.c   |  18 +
 .../builtins/aarch64/matrix/matmul_float2.c   |  19 +
 .../builtins/aarch64/matrix/matmul_float4.c   |  17 +
 .../builtins/aarch64/matrix/matmul_float8.c   |  18 +
 .../lib/builtins/aarch64/matrix/matmul_int1.c |  19 +
 .../lib/builtins/aarch64/matrix/matmul_int2.c |  19 +
 .../lib/builtins/aarch64/matrix/matmul_int4.c |  19 +
 .../lib/builtins/aarch64/matrix/matmul_int8.c |  19 +
 .../builtins/aarch64/matrix/matmul_uint1.c    |  19 +
 .../builtins/aarch64/matrix/matmul_uint2.c    |  20 +
 .../builtins/aarch64/matrix/matmul_uint4.c    |  19 +
 .../builtins/aarch64/matrix/matmul_uint8.c    |  19 +
 .../lib/builtins/aarch64/matrix/sme_acle.h    | 136 +++++
 .../lib/builtins/aarch64/matrix/sub_float2.c  |  18 +
 .../lib/builtins/aarch64/matrix/sub_float4.c  |  18 +
 .../lib/builtins/aarch64/matrix/sub_float8.c  |  18 +
 .../lib/builtins/aarch64/matrix/sub_int1.c    |  18 +
 .../lib/builtins/aarch64/matrix/sub_int2.c    |  18 +
 .../lib/builtins/aarch64/matrix/sub_int4.c    |  18 +
 .../lib/builtins/aarch64/matrix/sub_int8.c    |  18 +
 .../lib/builtins/aarch64/matrix/sub_uint1.c   |  18 +
 .../lib/builtins/aarch64/matrix/sub_uint2.c   |  18 +
 .../lib/builtins/aarch64/matrix/sub_uint4.c   |  18 +
 .../lib/builtins/aarch64/matrix/sub_uint8.c   |  18 +
 .../aarch64/matrix/transpose_float2.c         |  16 +
 .../aarch64/matrix/transpose_float4.c         |  16 +
 .../aarch64/matrix/transpose_float8.c         |  16 +
 .../builtins/aarch64/matrix/transpose_int1.c  |  17 +
 .../builtins/aarch64/matrix/transpose_int2.c  |  16 +
 .../builtins/aarch64/matrix/transpose_int4.c  |  16 +
 .../builtins/aarch64/matrix/transpose_int8.c  |  17 +
 .../builtins/Unit/aarch64/sme-matrix-add.c    | 172 ++++++
 .../builtins/Unit/aarch64/sme-matrix-matmul.c | 189 ++++++
 .../builtins/Unit/aarch64/sme-matrix-sub.c    | 173 ++++++
 .../Unit/aarch64/sme-matrix-transpose.c       | 113 ++++
 compiler-rt/test/lit.common.cfg.py            |   6 +
 llvm/include/llvm/IR/MatrixBuilder.h          | 155 +++++
 llvm/lib/CodeGen/MachineScheduler.cpp         |   9 +
 llvm/lib/CodeGen/ScheduleDAGInstrs.cpp        |  14 +-
 llvm/lib/Passes/PassBuilderPipelines.cpp      |   4 +-
 58 files changed, 2713 insertions(+), 17 deletions(-)
 create mode 100644 clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-matmul.c
 create mode 100644 clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-transpose.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_float2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_float4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_float8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_int1.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_int2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_int4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_int8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_uint1.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_uint2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_uint4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/add_uint8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_float2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_float4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_float8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_int1.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_int2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_int4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_int8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_uint1.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_uint2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_uint4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/matmul_uint8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sme_acle.h
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_float2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_float4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_float8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_int1.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_int2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_int4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_int8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_uint1.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_uint2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_uint4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/sub_uint8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/transpose_float2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/transpose_float4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/transpose_float8.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/transpose_int1.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/transpose_int2.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/transpose_int4.c
 create mode 100644 compiler-rt/lib/builtins/aarch64/matrix/transpose_int8.c
 create mode 100644 compiler-rt/test/builtins/Unit/aarch64/sme-matrix-add.c
 create mode 100644 compiler-rt/test/builtins/Unit/aarch64/sme-matrix-matmul.c
 create mode 100644 compiler-rt/test/builtins/Unit/aarch64/sme-matrix-sub.c
 create mode 100644 compiler-rt/test/builtins/Unit/aarch64/sme-matrix-transpose.c

diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 9a711030cff9..435e45ee0eb1 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -4176,6 +4176,21 @@ public:
            (T->isRealType() && !T->isBooleanType() && !T->isEnumeralType());
   }
 
+  static bool isValidTypeForSME(QualType T) {
+    if (!isValidElementType(T))
+      return false;
+
+    if (!isa<BuiltinType>(T))
+      return false;
+
+    // AArch64 cannot do vector operations like fma/add/sub on __bf16.
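+    // Fall back to the generic (non-SME) matrix lowering for __bf16.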
+    if (T->isBFloat16Type())
+      return false;
+
+    return cast<BuiltinType>(T)->isFloatingPoint() ||
+           cast<BuiltinType>(T)->isInteger();
+  }
+
   bool isSugared() const { return false; }
   QualType desugar() const { return QualType(this, 0); }
 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 86d47054615e..4cb0659288a2 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3976,12 +3976,18 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
         *this, E, llvm::Intrinsic::vector_reduce_and, "rdx.and"));
 
   case Builtin::BI__builtin_matrix_transpose: {
-    auto *MatrixTy = E->getArg(0)->getType()->castAs<ConstantMatrixType>();
-    Value *MatValue = EmitScalarExpr(E->getArg(0));
+    auto *MatrixValue = E->getArg(0);
+    auto *MatrixTy = MatrixValue->getType()->castAs<ConstantMatrixType>();
+    Value *MatValue = EmitScalarExpr(MatrixValue);
     MatrixBuilder MB(Builder);
-    Value *Result = MB.CreateMatrixTranspose(MatValue, MatrixTy->getNumRows(),
-                                             MatrixTy->getNumColumns());
-    return RValue::get(Result);
+
+    if (!getContext().getTargetInfo().hasFeature("sme") ||
+        !MatrixType::isValidTypeForSME(MatrixTy->getElementType()))
+      return RValue::get(MB.CreateMatrixTranspose(
+          MatValue, MatrixTy->getNumRows(), MatrixTy->getNumColumns()));
+
+    return RValue::get(MB.CreateSMEMatrixTranspose(
+        MatValue, MatrixTy->getNumRows(), MatrixTy->getNumColumns()));
   }
 
   case Builtin::BI__builtin_matrix_column_major_load: {

diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index 6e212e74676e..38ee8670dcc1 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -764,10 +764,23 @@ public:
     auto *RHSMatTy = dyn_cast<ConstantMatrixType>(
         BO->getRHS()->getType().getCanonicalType());
     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, Ops.FPFeatures);
-    if (LHSMatTy && RHSMatTy)
-      return MB.CreateMatrixMultiply(Ops.LHS, Ops.RHS, LHSMatTy->getNumRows(),
-                                     LHSMatTy->getNumColumns(),
-                                     RHSMatTy->getNumColumns());
+    if (LHSMatTy && RHSMatTy) {
+      // Note that SME only has non-widening MOPA for float32 and float64,
+      // so only these two types have native SME matmul operations. For the
+      // other types an SVE version is used, which should still beat the
+      // default NEON or scalar lowering.
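+      // The SME path below is emitted as a libcall into compiler-rt (e.g.
+      // __sme_matmul_float4), which is why the driver forces compiler-rt
+      // as the runtime library for -fenable-matrix on AArch64.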
+      auto Ty = LHSMatTy->getElementType();
+      if (!CGF.getContext().getTargetInfo().hasFeature("sme") ||
+          !MatrixType::isValidTypeForSME(Ty))
+        return MB.CreateMatrixMultiply(
+            Ops.LHS, Ops.RHS, LHSMatTy->getNumRows(),
+            LHSMatTy->getNumColumns(), RHSMatTy->getNumColumns());
+      assert(isa<BuiltinType>(Ty) && "SME types should be BuiltinType.");
+      return MB.CreateSMEMatrixMultiply(
+          Ops.LHS, Ops.RHS, LHSMatTy->getNumRows(), LHSMatTy->getNumColumns(),
+          RHSMatTy->getNumColumns(),
+          cast<BuiltinType>(Ty)->isSignedInteger());
+    }
 
     return MB.CreateScalarMultiply(Ops.LHS, Ops.RHS);
   }
 
@@ -4170,7 +4183,16 @@ Value *ScalarExprEmitter::EmitAdd(const BinOpInfo &op) {
   if (op.Ty->isConstantMatrixType()) {
     llvm::MatrixBuilder MB(Builder);
     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, op.FPFeatures);
-    return MB.CreateAdd(op.LHS, op.RHS);
+
+    auto *MatTy = cast<ConstantMatrixType>(op.E->getType().getCanonicalType());
+    auto Ty = MatTy->getElementType();
+    if (!CGF.getContext().getTargetInfo().hasFeature("sme") ||
+        !MatrixType::isValidTypeForSME(Ty))
+      return MB.CreateAdd(op.LHS, op.RHS);
+    assert(isa<BuiltinType>(Ty) && "SME types should be BuiltinType.");
+    return MB.CreateSMEMatrixBinOp(
+        op.LHS, op.RHS, MatTy->getNumRows(), MatTy->getNumColumns(),
+        cast<BuiltinType>(Ty)->isSignedInteger(), "add");
   }
 
   if (op.Ty->isUnsignedIntegerType() &&
@@ -4326,7 +4348,16 @@ Value *ScalarExprEmitter::EmitSub(const BinOpInfo &op) {
   if (op.Ty->isConstantMatrixType()) {
     llvm::MatrixBuilder MB(Builder);
     CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, op.FPFeatures);
-    return MB.CreateSub(op.LHS, op.RHS);
+    auto *MatTy =
+        cast<ConstantMatrixType>(op.E->getType().getCanonicalType());
+    auto Ty = MatTy->getElementType();
+    if (!CGF.getContext().getTargetInfo().hasFeature("sme") ||
+        !MatrixType::isValidTypeForSME(Ty))
+      return MB.CreateSub(op.LHS, op.RHS);
+    assert(isa<BuiltinType>(Ty) && "SME types should be BuiltinType.");
+    return MB.CreateSMEMatrixBinOp(
+        op.LHS, op.RHS, MatTy->getNumRows(), MatTy->getNumColumns(),
+        cast<BuiltinType>(Ty)->isSignedInteger(), "sub");
   }
 
   if (op.Ty->isUnsignedIntegerType() &&

diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 97381f673c28..029967da5681 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -294,6 +294,10 @@ AArch64ABIInfo::classifyArgumentType(QualType Ty, bool IsVariadic,
   if (isIllegalVectorType(Ty))
     return coerceIllegalVector(Ty);
 
+  // Always pass the matrix type via memory.
+  if (Ty->isMatrixType())
+    return getNaturalAlignIndirect(Ty, false);
+
   if (!isAggregateTypeForABI(Ty)) {
     // Treat an enum type as its underlying type.
     if (const EnumType *EnumTy = Ty->getAs<EnumType>())
@@ -393,6 +397,10 @@ ABIArgInfo AArch64ABIInfo::classifyReturnType(QualType RetTy,
     return coerceIllegalVector(RetTy);
   }
 
+  // Always return the matrix type via memory.
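+  // As on the argument side, keeping matrices in memory lets the
+  // pointer-based compiler-rt SME helpers operate on them directly.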
+  if (RetTy->isMatrixType())
+    return getNaturalAlignIndirect(RetTy);
+
   // Large vector types should be returned via memory.
   if (RetTy->isVectorType() && getContext().getTypeSize(RetTy) > 128)
     return getNaturalAlignIndirect(RetTy);

diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 20a555afb809..7941b5ba9250 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1112,6 +1112,10 @@ ToolChain::RuntimeLibType ToolChain::GetRuntimeLibType(
     runtimeLibType = GetDefaultRuntimeLibType();
   }
 
+  const llvm::Triple::ArchType Arch = getArch();
+  if (Arch == llvm::Triple::aarch64 && Args.hasArg(options::OPT_fenable_matrix))
+    runtimeLibType = ToolChain::RLT_CompilerRT;
+
   return *runtimeLibType;
 }

diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-matmul.c b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-matmul.c
new file mode 100644
index 000000000000..fd4e4f068dee
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-matmul.c
@@ -0,0 +1,572 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -O1 -Werror -emit-llvm -fenable-matrix -o - %s | FileCheck %s -check-prefix=SME
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -sme -O1 -Werror -emit-llvm -fenable-matrix -o - -x c++ %s | FileCheck %s -check-prefix=NOSME
+
+#define M 1
+#define K 2
+#define N 3
+
+typedef __bf16 mbf16_t1 __attribute__((matrix_type(M, K)));
+typedef __fp16 mfp16_t1 __attribute__((matrix_type(M, K)));
+typedef float mfloat_t1 __attribute__((matrix_type(M, K)));
+typedef double mdouble_t1 __attribute__((matrix_type(M, K)));
+typedef signed char mschar_t1 __attribute__((matrix_type(M, K)));
+typedef unsigned char muchar_t1 __attribute__((matrix_type(M, K)));
+typedef signed short msshort_t1 __attribute__((matrix_type(M, K)));
+typedef unsigned short mushort_t1 __attribute__((matrix_type(M, K)));
+typedef signed int msint_t1 __attribute__((matrix_type(M, K)));
+typedef unsigned int muint_t1 __attribute__((matrix_type(M, K)));
+typedef signed long long msllong_t1 __attribute__((matrix_type(M, K)));
+typedef unsigned long long mullong_t1 __attribute__((matrix_type(M, K)));
+
+typedef __bf16 mbf16_t2 __attribute__((matrix_type(K, N)));
+typedef __fp16 mfp16_t2 __attribute__((matrix_type(K, N)));
+typedef float mfloat_t2 __attribute__((matrix_type(K, N)));
+typedef double mdouble_t2 __attribute__((matrix_type(K, N)));
+typedef signed char mschar_t2 __attribute__((matrix_type(K, N)));
+typedef unsigned char muchar_t2 __attribute__((matrix_type(K, N)));
+typedef signed short msshort_t2 __attribute__((matrix_type(K, N)));
+typedef unsigned short mushort_t2 __attribute__((matrix_type(K, N)));
+typedef signed int msint_t2 __attribute__((matrix_type(K, N)));
+typedef unsigned int muint_t2 __attribute__((matrix_type(K, N)));
+typedef signed long long msllong_t2 __attribute__((matrix_type(K, N)));
+typedef unsigned long long mullong_t2 __attribute__((matrix_type(K, N)));
+
+typedef __bf16 mbf16_t __attribute__((matrix_type(M, N)));
+typedef __fp16 mfp16_t __attribute__((matrix_type(M, N)));
+typedef float mfloat_t __attribute__((matrix_type(M, N)));
+typedef double mdouble_t __attribute__((matrix_type(M, N)));
+typedef signed char mschar_t __attribute__((matrix_type(M, N)));
+typedef unsigned char muchar_t __attribute__((matrix_type(M, N)));
+typedef signed short msshort_t __attribute__((matrix_type(M, N)));
+typedef unsigned short mushort_t __attribute__((matrix_type(M, N)));
+typedef
signed int msint_t __attribute__((matrix_type(M, N))); +typedef unsigned int muint_t __attribute__((matrix_type(M, N))); +typedef signed long long msllong_t __attribute__((matrix_type(M, N))); +typedef unsigned long long mullong_t __attribute__((matrix_type(M, N))); + +// SME-LABEL: define dso_local void @f_bf16( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x bfloat]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[A_UNPACK:%.*]] = load bfloat, ptr [[TMP0]], align 2, !tbaa [[TBAA2:![0-9]+]] +// SME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// SME-NEXT: [[A_UNPACK4:%.*]] = load bfloat, ptr [[A_ELT3]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: [[B_UNPACK:%.*]] = load bfloat, ptr [[TMP1]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 2 +// SME-NEXT: [[B_UNPACK7:%.*]] = load bfloat, ptr [[B_ELT6]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// SME-NEXT: [[B_UNPACK9:%.*]] = load bfloat, ptr [[B_ELT8]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 6 +// SME-NEXT: [[B_UNPACK11:%.*]] = load bfloat, ptr [[B_ELT10]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// SME-NEXT: [[B_UNPACK13:%.*]] = load bfloat, ptr [[B_ELT12]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 10 +// SME-NEXT: [[B_UNPACK15:%.*]] = load bfloat, ptr [[B_ELT14]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x bfloat> poison, bfloat [[A_UNPACK]], i64 0 +// SME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x bfloat> [[A_ADDR_SROA_0_0_VEC_INSERT]], bfloat [[A_UNPACK4]], i64 1 +// SME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x bfloat> poison, bfloat [[B_UNPACK]], i64 0 +// SME-NEXT: [[B_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_0_VEC_INSERT]], bfloat [[B_UNPACK7]], i64 1 +// SME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_2_VEC_INSERT]], bfloat [[B_UNPACK9]], i64 2 +// SME-NEXT: [[B_ADDR_SROA_0_6_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_4_VEC_INSERT]], bfloat [[B_UNPACK11]], i64 3 +// SME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_6_VEC_INSERT]], bfloat [[B_UNPACK13]], i64 4 +// SME-NEXT: [[B_ADDR_SROA_0_10_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_8_VEC_INSERT]], bfloat [[B_UNPACK15]], i64 5 +// SME-NEXT: [[TMP2:%.*]] = tail call <3 x bfloat> @llvm.matrix.multiply.v3bf16.v2bf16.v6bf16(<2 x bfloat> [[A_ADDR_SROA_0_2_VEC_INSERT]], <6 x bfloat> [[B_ADDR_SROA_0_10_VEC_INSERT]], i32 1, i32 2, i32 3) +// SME-NEXT: store <3 x bfloat> [[TMP2]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z6f_bf16u11matrix_typeILm1ELm2Eu6__bf16Eu11matrix_typeILm2ELm3Eu6__bf16E( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x bfloat]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// 
NOSME-NEXT: [[A_UNPACK:%.*]] = load bfloat, ptr [[TMP0]], align 2, !tbaa [[TBAA2:![0-9]+]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load bfloat, ptr [[A_ELT3]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load bfloat, ptr [[TMP1]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 2 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load bfloat, ptr [[B_ELT6]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load bfloat, ptr [[B_ELT8]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 6 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load bfloat, ptr [[B_ELT10]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load bfloat, ptr [[B_ELT12]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 10 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load bfloat, ptr [[B_ELT14]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x bfloat> poison, bfloat [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x bfloat> [[A_ADDR_SROA_0_0_VEC_INSERT]], bfloat [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x bfloat> poison, bfloat [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_0_VEC_INSERT]], bfloat [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_2_VEC_INSERT]], bfloat [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_6_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_4_VEC_INSERT]], bfloat [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_6_VEC_INSERT]], bfloat [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_10_VEC_INSERT:%.*]] = insertelement <6 x bfloat> [[B_ADDR_SROA_0_8_VEC_INSERT]], bfloat [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x bfloat> @llvm.matrix.multiply.v3bf16.v2bf16.v6bf16(<2 x bfloat> [[A_ADDR_SROA_0_2_VEC_INSERT]], <6 x bfloat> [[B_ADDR_SROA_0_10_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x bfloat> [[TMP2]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +mbf16_t f_bf16(mbf16_t1 a, mbf16_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_fp16( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x half]) align 2 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x half>, align 8 +// SME-NEXT: call void @__sme_matmul_float2(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4:[0-9]+]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x half>, ptr [[TMP2]], align 8 +// SME-NEXT: store <3 x half> [[TMP3]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z6f_fp16u11matrix_typeILm1ELm2EDhEu11matrix_typeILm2ELm3EDhE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x half]) align 2 [[AGG_RESULT:%.*]], ptr 
nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load half, ptr [[TMP0]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load half, ptr [[A_ELT3]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load half, ptr [[TMP1]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 2 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load half, ptr [[B_ELT6]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load half, ptr [[B_ELT8]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 6 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load half, ptr [[B_ELT10]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load half, ptr [[B_ELT12]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 10 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load half, ptr [[B_ELT14]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x half> poison, half [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x half> [[A_ADDR_SROA_0_0_VEC_INSERT]], half [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x half> poison, half [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <6 x half> [[B_ADDR_SROA_0_0_VEC_INSERT]], half [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x half> [[B_ADDR_SROA_0_2_VEC_INSERT]], half [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_6_VEC_INSERT:%.*]] = insertelement <6 x half> [[B_ADDR_SROA_0_4_VEC_INSERT]], half [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x half> [[B_ADDR_SROA_0_6_VEC_INSERT]], half [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_10_VEC_INSERT:%.*]] = insertelement <6 x half> [[B_ADDR_SROA_0_8_VEC_INSERT]], half [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x half> @llvm.matrix.multiply.v3f16.v2f16.v6f16(<2 x half> [[A_ADDR_SROA_0_2_VEC_INSERT]], <6 x half> [[B_ADDR_SROA_0_10_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x half> [[TMP2]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +mfp16_t f_fp16(mfp16_t1 a, mfp16_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_float( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x float]) align 4 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x float>, align 16 +// SME-NEXT: call void @__sme_matmul_float4(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x float>, ptr [[TMP2]], align 16 +// SME-NEXT: store <3 x float> [[TMP3]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z7f_floatu11matrix_typeILm1ELm2EfEu11matrix_typeILm2ELm3EfE( +// NOSME-SAME: ptr dead_on_unwind noalias 
nocapture writable writeonly sret([3 x float]) align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load float, ptr [[TMP0]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load float, ptr [[A_ELT3]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load float, ptr [[TMP1]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load float, ptr [[B_ELT6]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load float, ptr [[B_ELT8]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load float, ptr [[B_ELT10]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load float, ptr [[B_ELT12]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load float, ptr [[B_ELT14]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x float> poison, float [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[A_ADDR_SROA_0_0_VEC_INSERT]], float [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x float> poison, float [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x float> [[B_ADDR_SROA_0_0_VEC_INSERT]], float [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x float> [[B_ADDR_SROA_0_4_VEC_INSERT]], float [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_12_VEC_INSERT:%.*]] = insertelement <6 x float> [[B_ADDR_SROA_0_8_VEC_INSERT]], float [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_16_VEC_INSERT:%.*]] = insertelement <6 x float> [[B_ADDR_SROA_0_12_VEC_INSERT]], float [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_20_VEC_INSERT:%.*]] = insertelement <6 x float> [[B_ADDR_SROA_0_16_VEC_INSERT]], float [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x float> @llvm.matrix.multiply.v3f32.v2f32.v6f32(<2 x float> [[A_ADDR_SROA_0_4_VEC_INSERT]], <6 x float> [[B_ADDR_SROA_0_20_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x float> [[TMP2]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +mfloat_t f_float(mfloat_t1 a, mfloat_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_double( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x double]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x double>, align 32 +// SME-NEXT: call void @__sme_matmul_float8(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x double>, ptr [[TMP2]], align 32 +// SME-NEXT: store <3 x double> [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define 
dso_local void @_Z8f_doubleu11matrix_typeILm1ELm2EdEu11matrix_typeILm2ELm3EdE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x double]) align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load double, ptr [[TMP0]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load double, ptr [[A_ELT3]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load double, ptr [[TMP1]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load double, ptr [[B_ELT6]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load double, ptr [[B_ELT8]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 24 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load double, ptr [[B_ELT10]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load double, ptr [[B_ELT12]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load double, ptr [[B_ELT14]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x double> poison, double [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x double> [[A_ADDR_SROA_0_0_VEC_INSERT]], double [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x double> poison, double [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x double> [[B_ADDR_SROA_0_0_VEC_INSERT]], double [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_16_VEC_INSERT:%.*]] = insertelement <6 x double> [[B_ADDR_SROA_0_8_VEC_INSERT]], double [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_24_VEC_INSERT:%.*]] = insertelement <6 x double> [[B_ADDR_SROA_0_16_VEC_INSERT]], double [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_32_VEC_INSERT:%.*]] = insertelement <6 x double> [[B_ADDR_SROA_0_24_VEC_INSERT]], double [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_40_VEC_INSERT:%.*]] = insertelement <6 x double> [[B_ADDR_SROA_0_32_VEC_INSERT]], double [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x double> @llvm.matrix.multiply.v3f64.v2f64.v6f64(<2 x double> [[A_ADDR_SROA_0_8_VEC_INSERT]], <6 x double> [[B_ADDR_SROA_0_40_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x double> [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +mdouble_t f_double(mdouble_t1 a, mdouble_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_signed_char( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i8]) align 1 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x i8>, align 4 +// SME-NEXT: call void @__sme_matmul_int1(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x i8>, ptr [[TMP2]], align 
4 +// SME-NEXT: store <3 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 1, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z13f_signed_charu11matrix_typeILm1ELm2EaEu11matrix_typeILm2ELm3EaE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i8]) align 1 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i8, ptr [[TMP0]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 1 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load i8, ptr [[A_ELT3]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load i8, ptr [[TMP1]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 1 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load i8, ptr [[B_ELT6]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 2 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load i8, ptr [[B_ELT8]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 3 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load i8, ptr [[B_ELT10]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load i8, ptr [[B_ELT12]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 5 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load i8, ptr [[B_ELT14]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i8> poison, i8 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_1_VEC_INSERT:%.*]] = insertelement <2 x i8> [[A_ADDR_SROA_0_0_VEC_INSERT]], i8 [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x i8> poison, i8 [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_1_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_0_VEC_INSERT]], i8 [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_1_VEC_INSERT]], i8 [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_3_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_2_VEC_INSERT]], i8 [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_3_VEC_INSERT]], i8 [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_5_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_4_VEC_INSERT]], i8 [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x i8> @llvm.matrix.multiply.v3i8.v2i8.v6i8(<2 x i8> [[A_ADDR_SROA_0_1_VEC_INSERT]], <6 x i8> [[B_ADDR_SROA_0_5_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +mschar_t f_signed_char(mschar_t1 a, mschar_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_unsigned_char( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i8]) align 1 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x i8>, align 4 +// SME-NEXT: call void @__sme_matmul_uint1(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x i8>, ptr 
[[TMP2]], align 4 +// SME-NEXT: store <3 x i8> [[TMP3]], ptr [[AGG_RESULT]], align 1, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z15f_unsigned_charu11matrix_typeILm1ELm2EhEu11matrix_typeILm2ELm3EhE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i8]) align 1 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i8, ptr [[TMP0]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 1 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load i8, ptr [[A_ELT3]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load i8, ptr [[TMP1]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 1 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load i8, ptr [[B_ELT6]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 2 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load i8, ptr [[B_ELT8]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 3 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load i8, ptr [[B_ELT10]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load i8, ptr [[B_ELT12]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 5 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load i8, ptr [[B_ELT14]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i8> poison, i8 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_1_VEC_INSERT:%.*]] = insertelement <2 x i8> [[A_ADDR_SROA_0_0_VEC_INSERT]], i8 [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x i8> poison, i8 [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_1_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_0_VEC_INSERT]], i8 [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_1_VEC_INSERT]], i8 [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_3_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_2_VEC_INSERT]], i8 [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_3_VEC_INSERT]], i8 [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_5_VEC_INSERT:%.*]] = insertelement <6 x i8> [[B_ADDR_SROA_0_4_VEC_INSERT]], i8 [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x i8> @llvm.matrix.multiply.v3i8.v2i8.v6i8(<2 x i8> [[A_ADDR_SROA_0_1_VEC_INSERT]], <6 x i8> [[B_ADDR_SROA_0_5_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +muchar_t f_unsigned_char(muchar_t1 a, muchar_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_signed_short( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i16]) align 2 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x i16>, align 8 +// SME-NEXT: call void @__sme_matmul_int2(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] 
= load <3 x i16>, ptr [[TMP2]], align 8 +// SME-NEXT: store <3 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z14f_signed_shortu11matrix_typeILm1ELm2EsEu11matrix_typeILm2ELm3EsE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i16]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i16, ptr [[TMP0]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load i16, ptr [[A_ELT3]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load i16, ptr [[TMP1]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 2 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load i16, ptr [[B_ELT6]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load i16, ptr [[B_ELT8]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 6 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load i16, ptr [[B_ELT10]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load i16, ptr [[B_ELT12]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 10 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load i16, ptr [[B_ELT14]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x i16> [[A_ADDR_SROA_0_0_VEC_INSERT]], i16 [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x i16> poison, i16 [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_0_VEC_INSERT]], i16 [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_2_VEC_INSERT]], i16 [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_6_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_4_VEC_INSERT]], i16 [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_6_VEC_INSERT]], i16 [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_10_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_8_VEC_INSERT]], i16 [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x i16> @llvm.matrix.multiply.v3i16.v2i16.v6i16(<2 x i16> [[A_ADDR_SROA_0_2_VEC_INSERT]], <6 x i16> [[B_ADDR_SROA_0_10_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +msshort_t f_signed_short(msshort_t1 a, msshort_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_unsigned_short( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i16]) align 2 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x i16>, align 8 +// SME-NEXT: call void @__sme_matmul_uint2(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr 
[[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x i16>, ptr [[TMP2]], align 8 +// SME-NEXT: store <3 x i16> [[TMP3]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z16f_unsigned_shortu11matrix_typeILm1ELm2EtEu11matrix_typeILm2ELm3EtE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i16]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i16, ptr [[TMP0]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load i16, ptr [[A_ELT3]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load i16, ptr [[TMP1]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 2 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load i16, ptr [[B_ELT6]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load i16, ptr [[B_ELT8]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 6 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load i16, ptr [[B_ELT10]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load i16, ptr [[B_ELT12]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 10 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load i16, ptr [[B_ELT14]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x i16> [[A_ADDR_SROA_0_0_VEC_INSERT]], i16 [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x i16> poison, i16 [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_0_VEC_INSERT]], i16 [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_2_VEC_INSERT]], i16 [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_6_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_4_VEC_INSERT]], i16 [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_6_VEC_INSERT]], i16 [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_10_VEC_INSERT:%.*]] = insertelement <6 x i16> [[B_ADDR_SROA_0_8_VEC_INSERT]], i16 [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x i16> @llvm.matrix.multiply.v3i16.v2i16.v6i16(<2 x i16> [[A_ADDR_SROA_0_2_VEC_INSERT]], <6 x i16> [[B_ADDR_SROA_0_10_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +mushort_t f_unsigned_short(mushort_t1 a, mushort_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_signed_int( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i32]) align 4 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x i32>, align 16 +// SME-NEXT: call 
void @__sme_matmul_int4(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x i32>, ptr [[TMP2]], align 16 +// SME-NEXT: store <3 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z12f_signed_intu11matrix_typeILm1ELm2EiEu11matrix_typeILm2ELm3EiE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i32]) align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load i32, ptr [[A_ELT3]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load i32, ptr [[B_ELT6]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load i32, ptr [[B_ELT8]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load i32, ptr [[B_ELT10]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load i32, ptr [[B_ELT12]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load i32, ptr [[B_ELT14]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[A_ADDR_SROA_0_0_VEC_INSERT]], i32 [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x i32> poison, i32 [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_0_VEC_INSERT]], i32 [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_4_VEC_INSERT]], i32 [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_12_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_8_VEC_INSERT]], i32 [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_16_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_12_VEC_INSERT]], i32 [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_20_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_16_VEC_INSERT]], i32 [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x i32> @llvm.matrix.multiply.v3i32.v2i32.v6i32(<2 x i32> [[A_ADDR_SROA_0_4_VEC_INSERT]], <6 x i32> [[B_ADDR_SROA_0_20_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +msint_t f_signed_int(msint_t1 a, msint_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_unsigned_int( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i32]) align 4 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: 
[[TMP2:%.*]] = alloca <3 x i32>, align 16 +// SME-NEXT: call void @__sme_matmul_uint4(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x i32>, ptr [[TMP2]], align 16 +// SME-NEXT: store <3 x i32> [[TMP3]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z14f_unsigned_intu11matrix_typeILm1ELm2EjEu11matrix_typeILm2ELm3EjE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i32]) align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load i32, ptr [[A_ELT3]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load i32, ptr [[B_ELT6]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load i32, ptr [[B_ELT8]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load i32, ptr [[B_ELT10]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load i32, ptr [[B_ELT12]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load i32, ptr [[B_ELT14]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[A_ADDR_SROA_0_0_VEC_INSERT]], i32 [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x i32> poison, i32 [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_0_VEC_INSERT]], i32 [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_4_VEC_INSERT]], i32 [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_12_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_8_VEC_INSERT]], i32 [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_16_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_12_VEC_INSERT]], i32 [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_20_VEC_INSERT:%.*]] = insertelement <6 x i32> [[B_ADDR_SROA_0_16_VEC_INSERT]], i32 [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x i32> @llvm.matrix.multiply.v3i32.v2i32.v6i32(<2 x i32> [[A_ADDR_SROA_0_4_VEC_INSERT]], <6 x i32> [[B_ADDR_SROA_0_20_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +muint_t f_unsigned_int(muint_t1 a, muint_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_signed_long_long( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i64]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) 
local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x i64>, align 32 +// SME-NEXT: call void @__sme_matmul_int8(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x i64>, ptr [[TMP2]], align 32 +// SME-NEXT: store <3 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z18f_signed_long_longu11matrix_typeILm1ELm2ExEu11matrix_typeILm2ELm3ExE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i64]) align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i64, ptr [[TMP0]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load i64, ptr [[A_ELT3]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load i64, ptr [[TMP1]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load i64, ptr [[B_ELT6]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load i64, ptr [[B_ELT8]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 24 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load i64, ptr [[B_ELT10]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load i64, ptr [[B_ELT12]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load i64, ptr [[B_ELT14]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[A_ADDR_SROA_0_0_VEC_INSERT]], i64 [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x i64> poison, i64 [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_0_VEC_INSERT]], i64 [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_16_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_8_VEC_INSERT]], i64 [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_24_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_16_VEC_INSERT]], i64 [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_32_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_24_VEC_INSERT]], i64 [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_40_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_32_VEC_INSERT]], i64 [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x i64> @llvm.matrix.multiply.v3i64.v2i64.v6i64(<2 x i64> [[A_ADDR_SROA_0_8_VEC_INSERT]], <6 x i64> [[B_ADDR_SROA_0_40_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +msllong_t f_signed_long_long(msllong_t1 a, msllong_t2 b) { + return a * b; +} + +// SME-LABEL: define dso_local void @f_unsigned_long_long( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x 
i64]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP2:%.*]] = alloca <3 x i64>, align 32 +// SME-NEXT: call void @__sme_matmul_uint8(ptr nonnull [[TMP2]], ptr [[TMP0]], ptr [[TMP1]], i32 1, i32 2, i32 3) #[[ATTR4]] +// SME-NEXT: [[TMP3:%.*]] = load <3 x i64>, ptr [[TMP2]], align 32 +// SME-NEXT: store <3 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z20f_unsigned_long_longu11matrix_typeILm1ELm2EyEu11matrix_typeILm2ELm3EyE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([3 x i64]) align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]], ptr nocapture noundef readonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i64, ptr [[TMP0]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// NOSME-NEXT: [[A_UNPACK4:%.*]] = load i64, ptr [[A_ELT3]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_UNPACK:%.*]] = load i64, ptr [[TMP1]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 +// NOSME-NEXT: [[B_UNPACK7:%.*]] = load i64, ptr [[B_ELT6]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +// NOSME-NEXT: [[B_UNPACK9:%.*]] = load i64, ptr [[B_ELT8]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 24 +// NOSME-NEXT: [[B_UNPACK11:%.*]] = load i64, ptr [[B_ELT10]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +// NOSME-NEXT: [[B_UNPACK13:%.*]] = load i64, ptr [[B_ELT12]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[B_ELT14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +// NOSME-NEXT: [[B_UNPACK15:%.*]] = load i64, ptr [[B_ELT14]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[A_ADDR_SROA_0_0_VEC_INSERT]], i64 [[A_UNPACK4]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <6 x i64> poison, i64 [[B_UNPACK]], i64 0 +// NOSME-NEXT: [[B_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_0_VEC_INSERT]], i64 [[B_UNPACK7]], i64 1 +// NOSME-NEXT: [[B_ADDR_SROA_0_16_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_8_VEC_INSERT]], i64 [[B_UNPACK9]], i64 2 +// NOSME-NEXT: [[B_ADDR_SROA_0_24_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_16_VEC_INSERT]], i64 [[B_UNPACK11]], i64 3 +// NOSME-NEXT: [[B_ADDR_SROA_0_32_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_24_VEC_INSERT]], i64 [[B_UNPACK13]], i64 4 +// NOSME-NEXT: [[B_ADDR_SROA_0_40_VEC_INSERT:%.*]] = insertelement <6 x i64> [[B_ADDR_SROA_0_32_VEC_INSERT]], i64 [[B_UNPACK15]], i64 5 +// NOSME-NEXT: [[TMP2:%.*]] = tail call <3 x i64> @llvm.matrix.multiply.v3i64.v2i64.v6i64(<2 x i64> [[A_ADDR_SROA_0_8_VEC_INSERT]], <6 x i64> [[B_ADDR_SROA_0_40_VEC_INSERT]], i32 1, i32 2, i32 3) +// NOSME-NEXT: store <3 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +mullong_t f_unsigned_long_long(mullong_t1 a, mullong_t2 b) { + return a * b; +} diff --git 
a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-transpose.c b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-transpose.c new file mode 100644 index 000000000000..06fc018bef32 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-transpose.c @@ -0,0 +1,337 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -O1 -Werror -emit-llvm -fenable-matrix -o - %s | FileCheck %s -check-prefix=SME +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -sme -O1 -Werror -emit-llvm -fenable-matrix -o - -x c++ %s | FileCheck %s -check-prefix=NOSME + +#define M 1 +#define K 2 + +typedef __bf16 mbf16_t __attribute__((matrix_type(M, K))); +typedef __fp16 mfp16_t __attribute__((matrix_type(M, K))); +typedef float mfloat_t __attribute__((matrix_type(M, K))); +typedef double mdouble_t __attribute__((matrix_type(M, K))); +typedef signed char mschar_t __attribute__((matrix_type(M, K))); +typedef unsigned char muchar_t __attribute__((matrix_type(M, K))); +typedef signed short msshort_t __attribute__((matrix_type(M, K))); +typedef unsigned short mushort_t __attribute__((matrix_type(M, K))); +typedef signed int msint_t __attribute__((matrix_type(M, K))); +typedef unsigned int muint_t __attribute__((matrix_type(M, K))); +typedef signed long long msllong_t __attribute__((matrix_type(M, K))); +typedef unsigned long long mullong_t __attribute__((matrix_type(M, K))); + +typedef __bf16 tran_mbf16_t __attribute__((matrix_type(K, M))); +typedef __fp16 tran_mfp16_t __attribute__((matrix_type(K, M))); +typedef float tran_mfloat_t __attribute__((matrix_type(K, M))); +typedef double tran_mdouble_t __attribute__((matrix_type(K, M))); +typedef signed char tran_mschar_t __attribute__((matrix_type(K, M))); +typedef unsigned char tran_muchar_t __attribute__((matrix_type(K, M))); +typedef signed short tran_msshort_t __attribute__((matrix_type(K, M))); +typedef unsigned short tran_mushort_t __attribute__((matrix_type(K, M))); +typedef signed int tran_msint_t __attribute__((matrix_type(K, M))); +typedef unsigned int tran_muint_t __attribute__((matrix_type(K, M))); +typedef signed long long tran_msllong_t __attribute__((matrix_type(K, M))); +typedef unsigned long long tran_mullong_t __attribute__((matrix_type(K, M))); + +// SME-LABEL: define dso_local void @f_bf16( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x bfloat]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[A_UNPACK:%.*]] = load bfloat, ptr [[TMP0]], align 2, !tbaa [[TBAA2:![0-9]+]] +// SME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// SME-NEXT: [[A_UNPACK3:%.*]] = load bfloat, ptr [[A_ELT2]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x bfloat> poison, bfloat [[A_UNPACK]], i64 0 +// SME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x bfloat> [[A_ADDR_SROA_0_0_VEC_INSERT]], bfloat [[A_UNPACK3]], i64 1 +// SME-NEXT: [[TMP1:%.*]] = tail call <2 x bfloat> @llvm.matrix.transpose.v2bf16(<2 x bfloat> [[A_ADDR_SROA_0_2_VEC_INSERT]], i32 1, i32 2) +// SME-NEXT: store <2 x bfloat> [[TMP1]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z6f_bf16u11matrix_typeILm1ELm2Eu6__bf16E( +// 
NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x bfloat]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load bfloat, ptr [[TMP0]], align 2, !tbaa [[TBAA2:![0-9]+]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load bfloat, ptr [[A_ELT2]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x bfloat> poison, bfloat [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x bfloat> [[A_ADDR_SROA_0_0_VEC_INSERT]], bfloat [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x bfloat> @llvm.matrix.transpose.v2bf16(<2 x bfloat> [[A_ADDR_SROA_0_2_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x bfloat> [[TMP1]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_mbf16_t f_bf16(mbf16_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_fp16( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x half]) align 2 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x half>, align 4 +// SME-NEXT: call void @__sme_transpose_float2(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4:[0-9]+]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[TMP1]], align 4 +// SME-NEXT: store <2 x half> [[TMP2]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z6f_fp16u11matrix_typeILm1ELm2EDhE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x half]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load half, ptr [[TMP0]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load half, ptr [[A_ELT2]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x half> poison, half [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x half> [[A_ADDR_SROA_0_0_VEC_INSERT]], half [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x half> @llvm.matrix.transpose.v2f16(<2 x half> [[A_ADDR_SROA_0_2_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x half> [[TMP1]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_mfp16_t f_fp16(mfp16_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_float( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x float]) align 4 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x float>, align 8 +// SME-NEXT: call void @__sme_transpose_float4(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[TMP1]], align 8 +// SME-NEXT: store <2 x float> [[TMP2]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z7f_floatu11matrix_typeILm1ELm2EfE( +// NOSME-SAME: ptr dead_on_unwind noalias 
nocapture writable writeonly sret([2 x float]) align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load float, ptr [[TMP0]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load float, ptr [[A_ELT2]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x float> poison, float [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[A_ADDR_SROA_0_0_VEC_INSERT]], float [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x float> @llvm.matrix.transpose.v2f32(<2 x float> [[A_ADDR_SROA_0_4_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x float> [[TMP1]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_mfloat_t f_float(mfloat_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_double( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x double]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x double>, align 16 +// SME-NEXT: call void @__sme_transpose_float8(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[TMP1]], align 16 +// SME-NEXT: store <2 x double> [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z8f_doubleu11matrix_typeILm1ELm2EdE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x double]) align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load double, ptr [[TMP0]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load double, ptr [[A_ELT2]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x double> poison, double [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x double> [[A_ADDR_SROA_0_0_VEC_INSERT]], double [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.matrix.transpose.v2f64(<2 x double> [[A_ADDR_SROA_0_8_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x double> [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_mdouble_t f_double(mdouble_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_signed_char( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i8]) align 1 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x i8>, align 2 +// SME-NEXT: call void @__sme_transpose_int1(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[TMP1]], align 2 +// SME-NEXT: store <2 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 1, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z13f_signed_charu11matrix_typeILm1ELm2EaE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x 
i8]) align 1 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i8, ptr [[TMP0]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 1 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load i8, ptr [[A_ELT2]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i8> poison, i8 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_1_VEC_INSERT:%.*]] = insertelement <2 x i8> [[A_ADDR_SROA_0_0_VEC_INSERT]], i8 [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x i8> @llvm.matrix.transpose.v2i8(<2 x i8> [[A_ADDR_SROA_0_1_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_mschar_t f_signed_char(mschar_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_unsigned_char( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i8]) align 1 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x i8>, align 2 +// SME-NEXT: call void @__sme_transpose_int1(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[TMP1]], align 2 +// SME-NEXT: store <2 x i8> [[TMP2]], ptr [[AGG_RESULT]], align 1, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z15f_unsigned_charu11matrix_typeILm1ELm2EhE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i8]) align 1 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i8, ptr [[TMP0]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 1 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load i8, ptr [[A_ELT2]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i8> poison, i8 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_1_VEC_INSERT:%.*]] = insertelement <2 x i8> [[A_ADDR_SROA_0_0_VEC_INSERT]], i8 [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x i8> @llvm.matrix.transpose.v2i8(<2 x i8> [[A_ADDR_SROA_0_1_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x i8> [[TMP1]], ptr [[AGG_RESULT]], align 1, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_muchar_t f_unsigned_char(muchar_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_signed_short( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i16]) align 2 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x i16>, align 4 +// SME-NEXT: call void @__sme_transpose_int2(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[TMP1]], align 4 +// SME-NEXT: store <2 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z14f_signed_shortu11matrix_typeILm1ELm2EsE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i16]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr 
#[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i16, ptr [[TMP0]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load i16, ptr [[A_ELT2]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x i16> [[A_ADDR_SROA_0_0_VEC_INSERT]], i16 [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x i16> @llvm.matrix.transpose.v2i16(<2 x i16> [[A_ADDR_SROA_0_2_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_msshort_t f_signed_short(msshort_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_unsigned_short( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i16]) align 2 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x i16>, align 4 +// SME-NEXT: call void @__sme_transpose_int2(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[TMP1]], align 4 +// SME-NEXT: store <2 x i16> [[TMP2]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z16f_unsigned_shortu11matrix_typeILm1ELm2EtE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i16]) align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i16, ptr [[TMP0]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 2 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load i16, ptr [[A_ELT2]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i16> poison, i16 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_2_VEC_INSERT:%.*]] = insertelement <2 x i16> [[A_ADDR_SROA_0_0_VEC_INSERT]], i16 [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x i16> @llvm.matrix.transpose.v2i16(<2 x i16> [[A_ADDR_SROA_0_2_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x i16> [[TMP1]], ptr [[AGG_RESULT]], align 2, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_mushort_t f_unsigned_short(mushort_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_signed_int( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i32]) align 4 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +// SME-NEXT: call void @__sme_transpose_int4(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +// SME-NEXT: store <2 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z12f_signed_intu11matrix_typeILm1ELm2EiE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i32]) align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: 
[[A_UNPACK:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load i32, ptr [[A_ELT2]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[A_ADDR_SROA_0_0_VEC_INSERT]], i32 [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.matrix.transpose.v2i32(<2 x i32> [[A_ADDR_SROA_0_4_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_msint_t f_signed_int(msint_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_unsigned_int( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i32]) align 4 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x i32>, align 8 +// SME-NEXT: call void @__sme_transpose_int4(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8 +// SME-NEXT: store <2 x i32> [[TMP2]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z14f_unsigned_intu11matrix_typeILm1ELm2EjE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i32]) align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load i32, ptr [[A_ELT2]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_4_VEC_INSERT:%.*]] = insertelement <2 x i32> [[A_ADDR_SROA_0_0_VEC_INSERT]], i32 [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.matrix.transpose.v2i32(<2 x i32> [[A_ADDR_SROA_0_4_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x i32> [[TMP1]], ptr [[AGG_RESULT]], align 4, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_muint_t f_unsigned_int(muint_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_signed_long_long( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i64]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x i64>, align 16 +// SME-NEXT: call void @__sme_transpose_int8(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[TMP1]], align 16 +// SME-NEXT: store <2 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z18f_signed_long_longu11matrix_typeILm1ELm2ExE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i64]) align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i64, ptr [[TMP0]], align 8, !tbaa [[TBAA2]] +// 
NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load i64, ptr [[A_ELT2]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[A_ADDR_SROA_0_0_VEC_INSERT]], i64 [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.matrix.transpose.v2i64(<2 x i64> [[A_ADDR_SROA_0_8_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_msllong_t f_signed_long_long(msllong_t a) { + return __builtin_matrix_transpose(a); +} + +// SME-LABEL: define dso_local void @f_unsigned_long_long( +// SME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i64]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SME-NEXT: [[ENTRY:.*:]] +// SME-NEXT: [[TMP1:%.*]] = alloca <2 x i64>, align 16 +// SME-NEXT: call void @__sme_transpose_int8(ptr nonnull [[TMP1]], ptr [[TMP0]], i32 1, i32 2) #[[ATTR4]] +// SME-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[TMP1]], align 16 +// SME-NEXT: store <2 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// SME-NEXT: ret void +// +// NOSME-LABEL: define dso_local void @_Z20f_unsigned_long_longu11matrix_typeILm1ELm2EyE( +// NOSME-SAME: ptr dead_on_unwind noalias nocapture writable writeonly sret([2 x i64]) align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// NOSME-NEXT: [[ENTRY:.*:]] +// NOSME-NEXT: [[A_UNPACK:%.*]] = load i64, ptr [[TMP0]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ELT2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +// NOSME-NEXT: [[A_UNPACK3:%.*]] = load i64, ptr [[A_ELT2]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: [[A_ADDR_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A_UNPACK]], i64 0 +// NOSME-NEXT: [[A_ADDR_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[A_ADDR_SROA_0_0_VEC_INSERT]], i64 [[A_UNPACK3]], i64 1 +// NOSME-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.matrix.transpose.v2i64(<2 x i64> [[A_ADDR_SROA_0_8_VEC_INSERT]], i32 1, i32 2) +// NOSME-NEXT: store <2 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 8, !tbaa [[TBAA2]] +// NOSME-NEXT: ret void +// +tran_mullong_t f_unsigned_long_long(mullong_t a) { + return __builtin_matrix_transpose(a); +} diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index e0b2d08c2077..79a2a81a259f 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -569,6 +569,53 @@ set(aarch64_SOURCES aarch64/fp_mode.c ) +if (COMPILER_RT_HAS_AARCH64_SME) + list(APPEND aarch64_SOURCES + aarch64/matrix/add_float2.c + aarch64/matrix/add_float4.c + aarch64/matrix/add_float8.c + aarch64/matrix/add_int1.c + aarch64/matrix/add_int2.c + aarch64/matrix/add_int4.c + aarch64/matrix/add_int8.c + aarch64/matrix/add_uint1.c + aarch64/matrix/add_uint2.c + aarch64/matrix/add_uint4.c + aarch64/matrix/add_uint8.c + aarch64/matrix/matmul_float2.c + aarch64/matrix/matmul_float4.c + aarch64/matrix/matmul_float8.c + aarch64/matrix/matmul_int1.c + aarch64/matrix/matmul_int2.c + aarch64/matrix/matmul_int4.c + aarch64/matrix/matmul_int8.c + aarch64/matrix/matmul_uint1.c + aarch64/matrix/matmul_uint2.c + aarch64/matrix/matmul_uint4.c + aarch64/matrix/matmul_uint8.c + 
aarch64/matrix/sub_float2.c
+    aarch64/matrix/sub_float4.c
+    aarch64/matrix/sub_float8.c
+    aarch64/matrix/sub_int1.c
+    aarch64/matrix/sub_int2.c
+    aarch64/matrix/sub_int4.c
+    aarch64/matrix/sub_int8.c
+    aarch64/matrix/sub_uint1.c
+    aarch64/matrix/sub_uint2.c
+    aarch64/matrix/sub_uint4.c
+    aarch64/matrix/sub_uint8.c
+    aarch64/matrix/transpose_float2.c
+    aarch64/matrix/transpose_float4.c
+    aarch64/matrix/transpose_float8.c
+    aarch64/matrix/transpose_int1.c
+    aarch64/matrix/transpose_int2.c
+    aarch64/matrix/transpose_int4.c
+    aarch64/matrix/transpose_int8.c
+  )
+else()
+  message(STATUS "AArch64 SME matrix routines disabled")
+endif()
+
 if (COMPILER_RT_HAS_AARCH64_SME)
   if (NOT COMPILER_RT_DISABLE_AARCH64_FMV AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
     list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-libc-mem-routines.S aarch64/sme-abi-init.c aarch64/sme-abi-vg.c aarch64/sme-libc-routines.c)
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_float2.c b/compiler-rt/lib/builtins/aarch64/matrix/add_float2.c
new file mode 100644
index 000000000000..49c054958cf1
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_float2.c
@@ -0,0 +1,18 @@
+//==- builtins/aarch64/matrix/add_float2.c - sme matrix operations -*- C -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void __sme_add_float2(__fp16 *dst, __fp16 *lhs,
+                                                      __fp16 *rhs, unsigned row,
+                                                      unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, h, float16, 16, f, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_float4.c b/compiler-rt/lib/builtins/aarch64/matrix/add_float4.c
new file mode 100644
index 000000000000..3e6c7a759153
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_float4.c
@@ -0,0 +1,18 @@
+//==- builtins/aarch64/matrix/add_float4.c - sme matrix operations -*- C -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void __sme_add_float4(float *dst, float *lhs,
+                                                      float *rhs, unsigned row,
+                                                      unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, w, float32, 32, f, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_float8.c b/compiler-rt/lib/builtins/aarch64/matrix/add_float8.c
new file mode 100644
index 000000000000..c75fc929a89d
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_float8.c
@@ -0,0 +1,18 @@
+//==- builtins/aarch64/matrix/add_float8.c - sme matrix operations -*- C -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void __sme_add_float8(double *dst, double *lhs,
+                                                      double *rhs, unsigned row,
+                                                      unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, d, float64, 64, f, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_int1.c b/compiler-rt/lib/builtins/aarch64/matrix/add_int1.c
new file mode 100644
index 000000000000..e5ed762d68f6
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_int1.c
@@ -0,0 +1,18 @@
+//===- builtins/aarch64/matrix/add_int1.c - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_add_int1(signed char *dst, signed char *lhs, signed char *rhs,
+               unsigned row, unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, b, int8, 8, s, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_int2.c b/compiler-rt/lib/builtins/aarch64/matrix/add_int2.c
new file mode 100644
index 000000000000..7c21bb27151d
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_int2.c
@@ -0,0 +1,18 @@
+//===- builtins/aarch64/matrix/add_int2.c - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_add_int2(signed short *dst, signed short *lhs, signed short *rhs,
+               unsigned row, unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, h, int16, 16, s, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_int4.c b/compiler-rt/lib/builtins/aarch64/matrix/add_int4.c
new file mode 100644
index 000000000000..e5ab80f0b2db
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_int4.c
@@ -0,0 +1,18 @@
+//===- builtins/aarch64/matrix/add_int4.c - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_add_int4(signed int *dst, signed int *lhs, signed int *rhs, unsigned row,
+               unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, w, int32, 32, s, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_int8.c b/compiler-rt/lib/builtins/aarch64/matrix/add_int8.c
new file mode 100644
index 000000000000..60bf63e4e3b5
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_int8.c
@@ -0,0 +1,18 @@
+//===- builtins/aarch64/matrix/add_int8.c - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_add_int8(signed long long *dst, signed long long *lhs,
+               signed long long *rhs, unsigned row, unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, d, int64, 64, s, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_uint1.c b/compiler-rt/lib/builtins/aarch64/matrix/add_uint1.c
new file mode 100644
index 000000000000..44b451e6aa48
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_uint1.c
@@ -0,0 +1,18 @@
+//==- builtins/aarch64/matrix/add_uint1.c - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_add_uint1(unsigned char *dst, unsigned char *lhs, unsigned char *rhs,
+                unsigned row, unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, b, uint8, 8, u, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_uint2.c b/compiler-rt/lib/builtins/aarch64/matrix/add_uint2.c
new file mode 100644
index 000000000000..f4545e1baeef
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_uint2.c
@@ -0,0 +1,18 @@
+//==- builtins/aarch64/matrix/add_uint2.c - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_add_uint2(unsigned short *dst, unsigned short *lhs, unsigned short *rhs,
+                unsigned row, unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, h, uint16, 16, u, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_uint4.c b/compiler-rt/lib/builtins/aarch64/matrix/add_uint4.c
new file mode 100644
index 000000000000..31a4af9fe071
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_uint4.c
@@ -0,0 +1,18 @@
+//==- builtins/aarch64/matrix/add_uint4.c - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_add_uint4(unsigned int *dst, unsigned int *lhs, unsigned int *rhs,
+                unsigned row, unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, w, uint32, 32, u, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/add_uint8.c b/compiler-rt/lib/builtins/aarch64/matrix/add_uint8.c
new file mode 100644
index 000000000000..c2c0f2106c46
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/add_uint8.c
@@ -0,0 +1,18 @@
+//==- builtins/aarch64/matrix/add_uint8.c - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native add instruction for the matrix type, so we
+// use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_add_uint8(unsigned long long *dst, unsigned long long *lhs,
+                unsigned long long *rhs, unsigned row, unsigned column) {
+  SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, d, uint64, 64, u, add);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_float2.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_float2.c
new file mode 100644
index 000000000000..cee21c2ab2eb
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_float2.c
@@ -0,0 +1,19 @@
+//= builtins/aarch64/matrix/matmul_float2.c - sme matrix operations -*- C -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening FMOPA instruction for the __fp16
+// type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_float2(__fp16 *dst, __fp16 *lhs, __fp16 *rhs, unsigned lhs_row,
+                    unsigned lhs_column, unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, h,
+                      float16, 16, f);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_float4.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_float4.c
new file mode 100644
index 000000000000..396768e5073b
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_float4.c
@@ -0,0 +1,17 @@
+//= builtins/aarch64/matrix/matmul_float4.c - sme matrix operations -*- C -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+__attribute__((target("+sme,+sve"))) __arm_new("za") void __sme_matmul_float4(
+    float *dst, float *lhs, float *rhs, unsigned lhs_row, unsigned lhs_column,
+    unsigned rhs_column) __arm_streaming {
+  BREAK_SME_ACLE_MATMUL_SME(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, w,
+                            float32, 32, 0b00010001);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_float8.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_float8.c
new file mode 100644
index 000000000000..983427dbb200
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_float8.c
@@ -0,0 +1,18 @@
+//= builtins/aarch64/matrix/matmul_float8.c - sme matrix operations -*- C -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+__attribute__((target("+sme,+sve,+sme-f64f64"))) __arm_new(
+    "za") void __sme_matmul_float8(double *dst, double *lhs, double *rhs,
+                                   unsigned lhs_row, unsigned lhs_column,
+                                   unsigned rhs_column) __arm_streaming {
+  BREAK_SME_ACLE_MATMUL_SME(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, d,
+                            float64, 64, 0b00000001);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_int1.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_int1.c
new file mode 100644
index 000000000000..a5fd9a59186e
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_int1.c
@@ -0,0 +1,19 @@
+//=- builtins/aarch64/matrix/matmul_int1.c - sme matrix operations -*- C -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening outer-product (MOPA) instruction for
+// the i8 type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_int1(signed char *dst, signed char *lhs, signed char *rhs,
+                  unsigned lhs_row, unsigned lhs_column, unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, b, int8,
+                      8, s);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_int2.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_int2.c
new file mode 100644
index 000000000000..5d070095a43c
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_int2.c
@@ -0,0 +1,19 @@
+//=- builtins/aarch64/matrix/matmul_int2.c - sme matrix operations -*- C -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening outer-product (MOPA) instruction for
+// the i16 type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_int2(signed short *dst, signed short *lhs, signed short *rhs,
+                  unsigned lhs_row, unsigned lhs_column, unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, h, int16,
+                      16, s);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_int4.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_int4.c
new file mode 100644
index 000000000000..81a59c319026
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_int4.c
@@ -0,0 +1,19 @@
+//=- builtins/aarch64/matrix/matmul_int4.c - sme matrix operations -*- C -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening outer-product (MOPA) instruction for
+// the i32 type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_int4(signed int *dst, signed int *lhs, signed int *rhs,
+                  unsigned lhs_row, unsigned lhs_column, unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, w, int32,
+                      32, s);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_int8.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_int8.c
new file mode 100644
index 000000000000..84d83061738c
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_int8.c
@@ -0,0 +1,19 @@
+//=- builtins/aarch64/matrix/matmul_int8.c - sme matrix operations -*- C -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening outer-product (MOPA) instruction for
+// the i64 type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_int8(int64_t *dst, int64_t *lhs, int64_t *rhs, unsigned lhs_row,
+                  unsigned lhs_column, unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, d, int64,
+                      64, s);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint1.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint1.c
new file mode 100644
index 000000000000..f5873f22cdb3
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint1.c
@@ -0,0 +1,19 @@
+//=- builtins/aarch64/matrix/matmul_uint1.c - sme matrix operations -*- C -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening outer-product (MOPA) instruction for
+// the zero-extended (unsigned) i8 type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_uint1(unsigned char *dst, unsigned char *lhs, unsigned char *rhs,
+                   unsigned lhs_row, unsigned lhs_column, unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, b, uint8,
+                      8, u);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint2.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint2.c
new file mode 100644
index 000000000000..5962a02a2e5d
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint2.c
@@ -0,0 +1,20 @@
+//=- builtins/aarch64/matrix/matmul_uint2.c - sme matrix operations -*- C -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening outer-product (MOPA) instruction for
+// the zero-extended (unsigned) i16 type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_uint2(unsigned short *dst, unsigned short *lhs,
+                   unsigned short *rhs, unsigned lhs_row, unsigned lhs_column,
+                   unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, h, uint16,
+                      16, u);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint4.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint4.c
new file mode 100644
index 000000000000..0fd33ea44fb2
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint4.c
@@ -0,0 +1,19 @@
+//=- builtins/aarch64/matrix/matmul_uint4.c - sme matrix operations -*- C -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening outer-product (MOPA) instruction for
+// the zero-extended (unsigned) i32 type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_uint4(unsigned int *dst, unsigned int *lhs, unsigned int *rhs,
+                   unsigned lhs_row, unsigned lhs_column, unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, w, uint32,
+                      32, u);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint8.c b/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint8.c
new file mode 100644
index 000000000000..c4a9a010bf61
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/matmul_uint8.c
@@ -0,0 +1,19 @@
+//=- builtins/aarch64/matrix/matmul_uint8.c - sme matrix operations -*- C -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a non-widening outer-product (MOPA) instruction for
+// the zero-extended (unsigned) i64 type, so we use the SVE version.
+__attribute__((target("+sve"))) void
+__sme_matmul_uint8(uint64_t *dst, uint64_t *lhs, uint64_t *rhs,
+                   unsigned lhs_row, unsigned lhs_column, unsigned rhs_column) {
+  SME_ACLE_MATMUL_SVE(lhs, rhs, dst, lhs_row, lhs_column, rhs_column, d, uint64,
+                      64, u);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sme_acle.h b/compiler-rt/lib/builtins/aarch64/matrix/sme_acle.h
new file mode 100644
index 000000000000..fb40593ad89d
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/sme_acle.h
@@ -0,0 +1,136 @@
+//===- builtins/aarch64/matrix/sme_acle.h - sme matrix operations -*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This file implements the runtime SME routines for matrix_type operations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _SME_ACLE_H_
+#define _SME_ACLE_H_
+
+#include <arm_sme.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#define SVCNT(x) svcnt##x
+#define VectorType(x) sv##x##_t
+#define SVWHILELT(x) svwhilelt_b##x##_u32
+#define SMZERO svzero_mask_za
+#define SMLD1H(x) svld1_hor_za##x
+#define SMST1V(x) svst1_ver_za##x
+#define SVLD1_FLOAT(x) svld1_f##x
+#define SMMOPA_FLOAT(x) svmopa_za##x##_f##x##_m
+#define SMEXTRACTV_FLOAT(x) svread_ver_za##x##_f##x##_m
+
+#define SVDUP(type, bit) svdup_##type##bit
+#define SVLD1(type, bit) svld1_##type##bit
+#define SVMLA(type, bit) svmla_##type##bit##_x
+#define SVST1(type, bit) svst1_##type##bit
+#define SVBINOP(type, bit, op) sv##op##_##type##bit##_x
+
+// There are no add/sub (or similar) binary operations on ZA tiles, so we use
+// the SVE versions instead.
+#define SME_ACLE_BINOP_SVE(matA, matB, matC, M, N, svcnt_type, vec_type, bit, \
+                           func_type, op_type) \
+  do { \
+    uint64_t vscale = SVCNT(svcnt_type)(); \
+    VectorType(vec_type) src1, src2, res; \
+    svbool_t p; \
+    for (size_t i = 0; i < M; i += vscale) { \
+      p = SVWHILELT(bit)(i, M); \
+      for (size_t j = 0; j < N; j++) { \
+        src1 = SVLD1(func_type, bit)(p, matA + j * M + i); \
+        src2 = SVLD1(func_type, bit)(p, matB + j * M + i); \
+        res = SVBINOP(func_type, bit, op_type)(p, src1, src2); \
+        SVST1(func_type, bit)(p, matC + j * M + i, res); \
+      } \
+    } \
+  } while (0)
+
+// SME only provides non-widening matmul (outer-product) instructions for
+// float4 and float8; for every other element type we use the SVE version.
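+//
+// For reference, SME_ACLE_MATMUL_SVE below is a predicated, strip-mined SVE
+// rendering of the following scalar column-major matmul, where element (i, j)
+// of an M-row matrix lives at index j * M + i (explanatory sketch only):
+//
+//   for (size_t j = 0; j < N; j++)      // C (MxN) = A (MxK) * B (KxN)
+//     for (size_t i = 0; i < M; i++) {
+//       T acc = 0;
+//       for (size_t k = 0; k < K; k++)
+//         acc += A[k * M + i] * B[j * K + k];
+//       C[j * M + i] = acc;
+//     }
+//
+// The SME path (BREAK_SME_ACLE_MATMUL_SME further below) computes the same
+// product blockwise, accumulating vscale x vscale tiles in ZA via FMOPA.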
+#define SME_ACLE_MATMUL_SVE(matA, matB, matC, M, K, N, svcnt_type, vec_type, \
+                            bit, func_type) \
+  do { \
+    uint64_t vscale = SVCNT(svcnt_type)(); \
+    VectorType(vec_type) src1, src2, acc; \
+    svbool_t p; \
+    for (size_t j = 0; j < N; j++) \
+      for (size_t i = 0; i < M; i += vscale) { \
+        acc = SVDUP(func_type, bit)(0); \
+        p = SVWHILELT(bit)(i, M); \
+        for (size_t k = 0; k < K; ++k) { \
+          src1 = SVDUP(func_type, bit)(matB[j * K + k]); \
+          src2 = SVLD1(func_type, bit)(p, matA + k * M + i); \
+          acc = SVMLA(func_type, bit)(p, acc, src1, src2); \
+        } \
+        SVST1(func_type, bit)(p, matC + j * M + i, acc); \
+      } \
+  } while (0)
+
+// matrix_type in clang is column-major, so we can just reuse Fortran's matmul
+// version.
+#define BREAK_SME_ACLE_MATMUL_SME(matA, matB, matC, M, K, N, svcnt_type, \
+                                  vec_type, bit, zero_mask) \
+  do { \
+    uint64_t vscale = SVCNT(svcnt_type)(); \
+    svbool_t pm, pn, pk; \
+    VectorType(vec_type) src1, src2; \
+    for (size_t i = 0; i < M; i += vscale) { \
+      pm = SVWHILELT(bit)(i, M); \
+      for (size_t j = 0; j < N; j += vscale) { \
+        pn = SVWHILELT(bit)(j, N); \
+        SMZERO(zero_mask); \
+        for (size_t k = 0; k < K; k += vscale) { \
+          pk = SVWHILELT(bit)(k, K); \
+          for (size_t t = 0; t < vscale; t++) { \
+            if (j + t == N) \
+              break; \
+            SMLD1H(bit)(1, t, pk, matB + (j + t) * K + k); \
+          } \
+          for (size_t t = 0; t < vscale; t++) { \
+            if (k + t == K) \
+              break; \
+            src2 = SMEXTRACTV_FLOAT(bit)(src2, pn, 1, t); \
+            src1 = SVLD1_FLOAT(bit)(pm, matA + (k + t) * M + i); \
+            SMMOPA_FLOAT(bit)(0, pm, pn, src1, src2); \
+          } \
+        } \
+        for (size_t t = 0; t < vscale; t++) { \
+          if (j + t == N) \
+            break; \
+          SMST1V(bit)(0, t, pm, matC + (j + t) * M + i); \
+        } \
+      } \
+    } \
+  } while (0)
+
+// matrix_type in clang is column-major, so we can just reuse Fortran's
+// transpose version.
+#define BREAK_SME_ACLE_TRANSPOSE(matA, M, N, ans, svcnt_type, bit, zero_mask) \
+  do { \
+    uint64_t vscale = SVCNT(svcnt_type)(); \
+    svbool_t pm, pn; \
+    for (size_t i = 0; i < M; i += vscale) { \
+      pm = SVWHILELT(bit)(i, M); \
+      for (size_t j = 0; j < N; j += vscale) { \
+        pn = SVWHILELT(bit)(j, N); \
+        SMZERO(zero_mask); \
+        for (size_t t = 0; t < vscale; t++) { \
+          if (j + t == N) \
+            break; \
+          SMLD1H(bit)(0, t, pm, matA + (j + t) * M + i); \
+        } \
+        for (size_t t = 0; t < vscale; t++) { \
+          if (i + t == M) \
+            break; \
+          SMST1V(bit)(0, t, pn, ans + (i + t) * N + j); \
+        } \
+      } \
+    } \
+  } while (0)
+
+#endif
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_float2.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_float2.c
new file mode 100644
index 000000000000..6105b0479c03
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_float2.c
@@ -0,0 +1,18 @@
+//==- builtins/aarch64/matrix/sub_float2.c - sme matrix operations -*- C -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// Note: SME does not have a native sub instruction for the matrix type, so we
+// use the SVE version.
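+//
+// For reference, SME_ACLE_BINOP_SVE used here is a predicated SVE rendering of
+// this scalar column-major elementwise loop (explanatory sketch; `op` is add
+// or sub depending on the routine):
+//
+//   for (size_t j = 0; j < column; j++)
+//     for (size_t i = 0; i < row; i++)
+//       dst[j * row + i] = lhs[j * row + i] op rhs[j * row + i];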
+__attribute__((target("+sve"))) void __sme_sub_float2(__fp16 *dst, __fp16 *lhs, + __fp16 *rhs, unsigned row, + unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, h, float16, 16, f, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_float4.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_float4.c new file mode 100644 index 000000000000..3a57ffbe2801 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_float4.c @@ -0,0 +1,18 @@ +//==- builtins/arrch64/matrix/sub_float4.c - sme matrix operations -*- C -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. +__attribute__((target("+sve"))) void __sme_sub_float4(float *dst, float *lhs, + float *rhs, unsigned row, + unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, w, float32, 32, f, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_float8.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_float8.c new file mode 100644 index 000000000000..29e5b232b10a --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_float8.c @@ -0,0 +1,18 @@ +//==- builtins/arrch64/matrix/sub_float8.c - sme matrix operations -*- C -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. +__attribute__((target("+sve"))) void __sme_sub_float8(double *dst, double *lhs, + double *rhs, unsigned row, + unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, d, float64, 64, f, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_int1.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_int1.c new file mode 100644 index 000000000000..66834c072895 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_int1.c @@ -0,0 +1,18 @@ +//===- builtins/arrch64/matrix/sub_int1.c - sme matrix operations -*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. 
+__attribute__((target("+sve"))) void +__sme_sub_int1(signed char *dst, signed char *lhs, signed char *rhs, + unsigned row, unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, b, int8, 8, s, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_int2.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_int2.c new file mode 100644 index 000000000000..3c33f889b791 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_int2.c @@ -0,0 +1,18 @@ +//===- builtins/arrch64/matrix/sub_int2.c - sme matrix operations -*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. +__attribute__((target("+sve"))) void +__sme_sub_int2(signed short *dst, signed short *lhs, signed short *rhs, + unsigned row, unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, h, int16, 16, s, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_int4.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_int4.c new file mode 100644 index 000000000000..06361835dfc9 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_int4.c @@ -0,0 +1,18 @@ +//===- builtins/arrch64/matrix/sub_int4.c - sme matrix operations -*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. +__attribute__((target("+sve"))) void +__sme_sub_int4(signed int *dst, signed int *lhs, signed int *rhs, unsigned row, + unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, w, int32, 32, s, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_int8.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_int8.c new file mode 100644 index 000000000000..404f22d39a37 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_int8.c @@ -0,0 +1,18 @@ +//===- builtins/arrch64/matrix/sub_int8.c - sme matrix operations -*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. 
+__attribute__((target("+sve"))) void +__sme_sub_int8(signed long long *dst, signed long long *lhs, + signed long long *rhs, unsigned row, unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, d, int64, 64, s, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_uint1.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_uint1.c new file mode 100644 index 000000000000..5370c8024cff --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_uint1.c @@ -0,0 +1,18 @@ +//==- builtins/arrch64/matrix/sub_uint1.c - sme matrix operations -*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. +__attribute__((target("+sve"))) void +__sme_sub_uint1(unsigned char *dst, unsigned char *lhs, unsigned char *rhs, + unsigned row, unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, b, uint8, 8, u, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_uint2.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_uint2.c new file mode 100644 index 000000000000..e2f108ac36ba --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_uint2.c @@ -0,0 +1,18 @@ +//==- builtins/arrch64/matrix/sub_uint2.c - sme matrix operations -*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. +__attribute__((target("+sve"))) void +__sme_sub_uint2(unsigned short *dst, unsigned short *lhs, unsigned short *rhs, + unsigned row, unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, h, uint16, 16, u, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_uint4.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_uint4.c new file mode 100644 index 000000000000..9b2bc83164cb --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_uint4.c @@ -0,0 +1,18 @@ +//==- builtins/arrch64/matrix/sub_uint4.c - sme matrix operations -*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. 
+__attribute__((target("+sve"))) void +__sme_sub_uint4(unsigned int *dst, unsigned int *lhs, unsigned int *rhs, + unsigned row, unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, w, uint32, 32, u, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/sub_uint8.c b/compiler-rt/lib/builtins/aarch64/matrix/sub_uint8.c new file mode 100644 index 000000000000..da8824af7b85 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/sub_uint8.c @@ -0,0 +1,18 @@ +//==- builtins/arrch64/matrix/sub_uint8.c - sme matrix operations -*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// Note: SME does not have native sub instruction for matrix type, we use SVE +// version. +__attribute__((target("+sve"))) void +__sme_sub_uint8(unsigned long long *dst, unsigned long long *lhs, + unsigned long long *rhs, unsigned row, unsigned column) { + SME_ACLE_BINOP_SVE(lhs, rhs, dst, row, column, d, uint64, 64, u, sub); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/transpose_float2.c b/compiler-rt/lib/builtins/aarch64/matrix/transpose_float2.c new file mode 100644 index 000000000000..832258df2dfb --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/transpose_float2.c @@ -0,0 +1,16 @@ +//==- builtins/arrch64/matrix/transpose_float2.c - sme matrix operations C -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +__attribute__((target("+sme,+sve"))) __arm_new( + "za") void __sme_transpose_float2(__fp16 *dst, __fp16 *src, unsigned row, + unsigned column) __arm_streaming { + BREAK_SME_ACLE_TRANSPOSE(src, row, column, dst, h, 16, 0b01010101); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/transpose_float4.c b/compiler-rt/lib/builtins/aarch64/matrix/transpose_float4.c new file mode 100644 index 000000000000..94ea3f5c7016 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/transpose_float4.c @@ -0,0 +1,16 @@ +//==- builtins/arrch64/matrix/transpose_float4.c - sme matrix operations C -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +__attribute__((target("+sme,+sve"))) __arm_new( + "za") void __sme_transpose_float4(float *dst, float *src, unsigned row, + unsigned column) __arm_streaming { + BREAK_SME_ACLE_TRANSPOSE(src, row, column, dst, w, 32, 0b00010001); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/transpose_float8.c b/compiler-rt/lib/builtins/aarch64/matrix/transpose_float8.c new file mode 100644 index 000000000000..8be09a2f0d41 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/transpose_float8.c @@ -0,0 +1,16 @@ +//==- builtins/arrch64/matrix/transpose_float8.c - sme matrix operations C -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+__attribute__((target("+sme,+sve"))) __arm_new(
+    "za") void __sme_transpose_float8(double *dst, double *src, unsigned row,
+                                      unsigned column) __arm_streaming {
+  BREAK_SME_ACLE_TRANSPOSE(src, row, column, dst, d, 64, 0b00000001);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/transpose_int1.c b/compiler-rt/lib/builtins/aarch64/matrix/transpose_int1.c
new file mode 100644
index 000000000000..89be7c9582f4
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/transpose_int1.c
@@ -0,0 +1,17 @@
+//===- builtins/aarch64/matrix/transpose_int1.c - sme matrix operations C -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// For transpose, signed or unsigned does not matter.
+__attribute__((target("+sme,+sve"))) __arm_new("za") void __sme_transpose_int1(
+    signed char *dst, signed char *src, unsigned row,
+    unsigned column) __arm_streaming {
+  BREAK_SME_ACLE_TRANSPOSE(src, row, column, dst, b, 8, 0b11111111);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/transpose_int2.c b/compiler-rt/lib/builtins/aarch64/matrix/transpose_int2.c
new file mode 100644
index 000000000000..d7ab08592b5b
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/transpose_int2.c
@@ -0,0 +1,16 @@
+//===- builtins/aarch64/matrix/transpose_int2.c - sme matrix operations C -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// For transpose, signed or unsigned does not matter.
+__attribute__((target("+sme,+sve"))) __arm_new("za") void __sme_transpose_int2(
+    short *dst, short *src, unsigned row, unsigned column) __arm_streaming {
+  BREAK_SME_ACLE_TRANSPOSE(src, row, column, dst, h, 16, 0b01010101);
+  return;
+}
diff --git a/compiler-rt/lib/builtins/aarch64/matrix/transpose_int4.c b/compiler-rt/lib/builtins/aarch64/matrix/transpose_int4.c
new file mode 100644
index 000000000000..cc415709dfcb
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/matrix/transpose_int4.c
@@ -0,0 +1,16 @@
+//===- builtins/aarch64/matrix/transpose_int4.c - sme matrix operations C -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "sme_acle.h"
+
+// For transpose, signed or unsigned does not matter.
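+// Illustrative only (not part of the build): per vscale x vscale block, the
+// macro loads source columns into horizontal slices of ZA tile 0 and stores
+// vertical slices back out, which transposes the block in one pass:
+//   svld1_hor_za32(0, t, pm, src + (j + t) * row + i);    // column j+t in
+//   svst1_ver_za32(0, t, pn, dst + (i + t) * column + j); // column i+t out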
+__attribute__((target("+sme,+sve"))) __arm_new("za") void __sme_transpose_int4( + int *dst, int *src, unsigned row, unsigned column) __arm_streaming { + BREAK_SME_ACLE_TRANSPOSE(src, row, column, dst, w, 32, 0b00010001); + return; +} diff --git a/compiler-rt/lib/builtins/aarch64/matrix/transpose_int8.c b/compiler-rt/lib/builtins/aarch64/matrix/transpose_int8.c new file mode 100644 index 000000000000..2ce680d0b5b0 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/matrix/transpose_int8.c @@ -0,0 +1,17 @@ +//===- builtins/arrch64/matrix/transpose_int8.c - sme matrix operations C -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sme_acle.h" + +// For transpose, signed or unsigned does not matter. +__attribute__((target("+sme,+sve"))) __arm_new("za") void __sme_transpose_int8( + long long *dst, long long *src, unsigned row, + unsigned column) __arm_streaming { + BREAK_SME_ACLE_TRANSPOSE(src, row, column, dst, d, 64, 0b00000001); + return; +} diff --git a/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-add.c b/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-add.c new file mode 100644 index 000000000000..18a7b96e0f9a --- /dev/null +++ b/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-add.c @@ -0,0 +1,172 @@ +// REQUIRES: aarch64-target-arch,aarch64_sme_run + +// RUN: %clang_builtins -march=armv9.1-a+sme -fenable-matrix %s %librt -o %t +// RUN: %run %t 2>&1 | FileCheck %s +// RUN: %clang_builtins -march=armv9.1-a+sme -fenable-matrix %s %librt -O3 -o %t_smeopt +// RUN: %run %t_smeopt 2>&1 | FileCheck %s +// RUN: %clang_builtins -O3 -march=armv9.1-a -fenable-matrix %s %librt -o %t_nosmeopt +// RUN: %run %t_nosmeopt 2>&1 | FileCheck %s + +#include + +#define M 2 +#define N 4 + +typedef __bf16 m_tbf16 __attribute__((matrix_type(M, N))); + +typedef float m_tfloat __attribute__((matrix_type(M, N))); + +typedef double m_tdouble __attribute__((matrix_type(M, N))); + +typedef signed int m_tint __attribute__((matrix_type(M, N))); + +typedef unsigned long long m_tull __attribute__((matrix_type(M, N))); + +int main() { + m_tbf16 a; + m_tbf16 b; + float v = 0.0; + + // Input matrix 1: + // 0.00 1.00 2.00 3.00 + // 4.00 5.00 6.00 7.00 + // + // Input matrix 2: + // 8.00 9.00 10.00 11.00 + // 12.00 13.00 14.00 15.00 + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + a[i][j] = v++; + + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + b[i][j] = v++; + + m_tbf16 c = a + b; + + // CHECK: 8.00 10.00 12.00 14.00 + // CHECK-NEXT: 16.00 18.00 20.00 22.00 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%.2f ", (float)c[i][j]); + printf("\n"); + } + + m_tfloat af; + m_tfloat bf; + float vf = 0.0; + + // Input matrix 1: + // 0.00 1.00 2.00 3.00 + // 4.00 5.00 6.00 7.00 + // + // Input matrix 2: + // 8.00 9.00 10.00 11.00 + // 12.00 13.00 14.00 15.00 + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + af[i][j] = vf++; + + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + bf[i][j] = vf++; + + m_tfloat cf = af + bf; + + // CHECK: 8.00 10.00 12.00 14.00 + // CHECK-NEXT: 16.00 18.00 20.00 22.00 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%.2f ", cf[i][j]); + printf("\n"); + } + + m_tdouble ad; + m_tdouble bd; + double vd = 0.0; + + // Input matrix 1: + // 0.00 1.00 
+  // 4.00 5.00 6.00 7.00
+  //
+  // Input matrix 2:
+  // 8.00 9.00 10.00 11.00
+  // 12.00 13.00 14.00 15.00
+  for (int i = 0; i < M; i++)
+    for (int j = 0; j < N; j++)
+      ad[i][j] = vd++;
+
+  for (int i = 0; i < M; i++)
+    for (int j = 0; j < N; j++)
+      bd[i][j] = vd++;
+
+  m_tdouble cd = ad + bd;
+
+  // CHECK: 8.00 10.00 12.00 14.00
+  // CHECK-NEXT: 16.00 18.00 20.00 22.00
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < N; j++)
+      printf("%.2f ", cd[i][j]);
+    printf("\n");
+  }
+
+  m_tint ai;
+  m_tint bi;
+  int vi = 0;
+
+  // Input matrix 1:
+  // 0 1 2 3
+  // 4 5 6 7
+  //
+  // Input matrix 2:
+  // 8 9 10 11
+  // 12 13 14 15
+  for (int i = 0; i < M; i++)
+    for (int j = 0; j < N; j++)
+      ai[i][j] = vi++;
+
+  for (int i = 0; i < M; i++)
+    for (int j = 0; j < N; j++)
+      bi[i][j] = vi++;
+
+  m_tint ci = ai + bi;
+
+  // CHECK: 8 10 12 14
+  // CHECK-NEXT: 16 18 20 22
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < N; j++)
+      printf("%d ", ci[i][j]);
+    printf("\n");
+  }
+
+  m_tull au;
+  m_tull bu;
+  unsigned long long int vu = 0;
+
+  // Input matrix 1:
+  // 0 1 2 3
+  // 4 5 6 7
+  //
+  // Input matrix 2:
+  // 8 9 10 11
+  // 12 13 14 15
+  for (int i = 0; i < M; i++)
+    for (int j = 0; j < N; j++)
+      au[i][j] = vu++;
+
+  for (int i = 0; i < M; i++)
+    for (int j = 0; j < N; j++)
+      bu[i][j] = vu++;
+
+  m_tull cu = au + bu;
+
+  // CHECK: 8 10 12 14
+  // CHECK-NEXT: 16 18 20 22
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < N; j++)
+      printf("%llu ", cu[i][j]);
+    printf("\n");
+  }
+
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-matmul.c b/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-matmul.c
new file mode 100644
index 000000000000..b69c79918d67
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-matmul.c
@@ -0,0 +1,189 @@
+// REQUIRES: aarch64-target-arch,aarch64_sme_run

+// RUN: %clang_builtins -march=armv9.1-a+sme -fenable-matrix %s %librt -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+// RUN: %clang_builtins -march=armv9.1-a+sme -fenable-matrix %s %librt -O3 -o %t_smeopt
+// RUN: %run %t_smeopt 2>&1 | FileCheck %s
+// RUN: %clang_builtins -O3 -march=armv9.1-a -fenable-matrix %s %librt -o %t_nosmeopt
+// RUN: %run %t_nosmeopt 2>&1 | FileCheck %s
+
+#include <stdio.h>
+
+// First matrix is 2x3. Second matrix is 3x4. Result matrix is 2x4.
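+// The expected results below are plain row-by-column dot products, e.g.
+// result[0][0] = 0*6 + 1*10 + 2*14 = 38 and
+// result[1][0] = 3*6 + 4*10 + 5*14 = 128.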
+#define M 2 +#define K 3 +#define N 4 + +typedef __bf16 m1_tbf16 __attribute__((matrix_type(M, K))); +typedef __bf16 m2_tbf16 __attribute__((matrix_type(K, N))); +typedef __bf16 mr_tbf16 __attribute__((matrix_type(M, N))); + +typedef float m1_tfloat __attribute__((matrix_type(M, K))); +typedef float m2_tfloat __attribute__((matrix_type(K, N))); +typedef float mr_tfloat __attribute__((matrix_type(M, N))); + +typedef double m1_tdouble __attribute__((matrix_type(M, K))); +typedef double m2_tdouble __attribute__((matrix_type(K, N))); +typedef double mr_tdouble __attribute__((matrix_type(M, N))); + +typedef signed int m1_tint __attribute__((matrix_type(M, K))); +typedef signed int m2_tint __attribute__((matrix_type(K, N))); +typedef signed int mr_tint __attribute__((matrix_type(M, N))); + +typedef unsigned long long m1_tull __attribute__((matrix_type(M, K))); +typedef unsigned long long m2_tull __attribute__((matrix_type(K, N))); +typedef unsigned long long mr_tull __attribute__((matrix_type(M, N))); + +int main() { + m1_tbf16 a; + m2_tbf16 b; + float v = 0.0; + + // Input matrix 1: + // 0.00 1.00 2.00 + // 3.00 4.00 5.00 + // + // Input matrix 2: + // 6.00 7.00 8.00 9.00 + // 10.00 11.00 12.00 13.00 + // 14.00 15.00 16.00 17.00 + for (int i = 0; i < M; i++) + for (int j = 0; j < K; j++) + a[i][j] = v++; + + for (int i = 0; i < K; i++) + for (int j = 0; j < N; j++) + b[i][j] = v++; + + mr_tbf16 c = a * b; + + // CHECK: 38.00 41.00 44.00 47.00 + // CHECK-NEXT: 128.00 140.00 152.00 164.00 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%.2f ", (float)c[i][j]); + printf("\n"); + } + + m1_tfloat af; + m2_tfloat bf; + float vf = 0.0; + + // Input matrix 1: + // 0.00 1.00 2.00 + // 3.00 4.00 5.00 + // + // Input matrix 2: + // 6.00 7.00 8.00 9.00 + // 10.00 11.00 12.00 13.00 + // 14.00 15.00 16.00 17.00 + for (int i = 0; i < M; i++) + for (int j = 0; j < K; j++) + af[i][j] = vf++; + + for (int i = 0; i < K; i++) + for (int j = 0; j < N; j++) + bf[i][j] = vf++; + + mr_tfloat cf = af * bf; + + // CHECK: 38.00 41.00 44.00 47.00 + // CHECK-NEXT: 128.00 140.00 152.00 164.00 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%.2f ", cf[i][j]); + printf("\n"); + } + + m1_tdouble ad; + m2_tdouble bd; + double vd = 0.0; + + // Input matrix 1: + // 0.00 1.00 2.00 + // 3.00 4.00 5.00 + // + // Input matrix 2: + // 6.00 7.00 8.00 9.00 + // 10.00 11.00 12.00 13.00 + // 14.00 15.00 16.00 17.00 + for (int i = 0; i < M; i++) + for (int j = 0; j < K; j++) + ad[i][j] = vd++; + + for (int i = 0; i < K; i++) + for (int j = 0; j < N; j++) + bd[i][j] = vd++; + + mr_tdouble cd = ad * bd; + + // CHECK: 38.00 41.00 44.00 47.00 + // CHECK-NEXT: 128.00 140.00 152.00 164.00 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%.2f ", cd[i][j]); + printf("\n"); + } + + m1_tint ai; + m2_tint bi; + int vi = 0; + + // Input matrix 1: + // 0 1 2 + // 3 4 5 + // + // Input matrix 2: + // 6 7 8 9 + // 10 11 12 13 + // 14 15 16 17 + for (int i = 0; i < M; i++) + for (int j = 0; j < K; j++) + ai[i][j] = vi++; + + for (int i = 0; i < K; i++) + for (int j = 0; j < N; j++) + bi[i][j] = vi++; + + mr_tint ci = ai * bi; + + // CHECK: 38 41 44 47 + // CHECK-NEXT: 128 140 152 164 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%d ", ci[i][j]); + printf("\n"); + } + + m1_tull au; + m2_tull bu; + unsigned long long int vu = 0; + + // Input matrix 1: + // 0 1 2 + // 3 4 5 + // + // Input matrix 2: + // 6 7 8 9 + // 10 11 12 13 + // 14 15 16 17 + for (int i = 
0; i < M; i++) + for (int j = 0; j < K; j++) + au[i][j] = vu++; + + for (int i = 0; i < K; i++) + for (int j = 0; j < N; j++) + bu[i][j] = vu++; + + mr_tull cu = au * bu; + + // CHECK: 38 41 44 47 + // CHECK-NEXT: 128 140 152 164 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%lld ", cu[i][j]); + printf("\n"); + } + + return 0; +} diff --git a/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-sub.c b/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-sub.c new file mode 100644 index 000000000000..01de075a8ca9 --- /dev/null +++ b/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-sub.c @@ -0,0 +1,173 @@ +// REQUIRES: aarch64-target-arch,aarch64_sme_run + +// RUN: %clang_builtins -march=armv9.1-a+sme -fenable-matrix %s %librt -o %t +// RUN: %run %t 2>&1 | FileCheck %s +// RUN: %clang_builtins -march=armv9.1-a+sme -fenable-matrix %s %librt -O3 -o %t_smeopt +// RUN: %run %t_smeopt 2>&1 | FileCheck %s +// RUN: %clang_builtins -O3 -march=armv9.1-a -fenable-matrix %s %librt -o %t_nosmeopt +// RUN: %run %t_nosmeopt 2>&1 | FileCheck %s + +#include + +#define M 2 +#define N 4 + +typedef __bf16 m_tbf16 __attribute__((matrix_type(M, N))); + +typedef float m_tfloat __attribute__((matrix_type(M, N))); + +typedef double m_tdouble __attribute__((matrix_type(M, N))); + +typedef signed int m_tint __attribute__((matrix_type(M, N))); + +typedef unsigned long long m_tull __attribute__((matrix_type(M, N))); + +int main() { + m_tbf16 a; + m_tbf16 b; + float v = 0.0; + + // Input matrix 1: + // 0.00 1.00 2.00 3.00 + // 4.00 5.00 6.00 7.00 + // + // Input matrix 2: + // 8.00 9.00 10.00 11.00 + // 12.00 13.00 14.00 15.00 + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + a[i][j] = v++; + + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + b[i][j] = v++; + + m_tbf16 c = a - b; + + // CHECK: -8.00 -8.00 -8.00 -8.00 + // CHECK-NEXT: -8.00 -8.00 -8.00 -8.00 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%.2f ", (float)c[i][j]); + printf("\n"); + } + + m_tfloat af; + m_tfloat bf; + float vf = 0.0; + + // Input matrix 1: + // 0.00 1.00 2.00 3.00 + // 4.00 5.00 6.00 7.00 + // + // Input matrix 2: + // 8.00 9.00 10.00 11.00 + // 12.00 13.00 14.00 15.00 + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + af[i][j] = vf++; + + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + bf[i][j] = vf++; + + m_tfloat cf = af - bf; + + // CHECK: -8.00 -8.00 -8.00 -8.00 + // CHECK-NEXT: -8.00 -8.00 -8.00 -8.00 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%.2f ", cf[i][j]); + printf("\n"); + } + + m_tdouble ad; + m_tdouble bd; + double vd = 0.0; + + // Input matrix 1: + // 0.00 1.00 2.00 3.00 + // 4.00 5.00 6.00 7.00 + // + // Input matrix 2: + // 8.00 9.00 10.00 11.00 + // 12.00 13.00 14.00 15.00 + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + ad[i][j] = vd++; + + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + bd[i][j] = vd++; + + m_tdouble cd = ad - bd; + + // CHECK: -8.00 -8.00 -8.00 -8.00 + // CHECK-NEXT: -8.00 -8.00 -8.00 -8.00 + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) + printf("%.2f ", cd[i][j]); + printf("\n"); + } + + m_tint ai; + m_tint bi; + int vi = 0; + + // Input matrix 1: + // 0 1 2 3 + // 4 5 6 7 + // + // Input matrix 2: + // 8 9 10 11 + // 12 13 14 15 + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + ai[i][j] = vi++; + + for (int i = 0; i < M; i++) + for (int j = 0; j < N; j++) + bi[i][j] = vi++; + + m_tint ci = ai - bi; + + // CHECK: -8 -8 -8 -8 + // 
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < N; j++)
+      printf("%d ", ci[i][j]);
+    printf("\n");
+  }
+
+  m_tull au;
+  m_tull bu;
+  unsigned long long int vu = 0;
+
+  // Input matrix 1:
+  // 0 1 2 3
+  // 4 5 6 7
+  //
+  // Input matrix 2:
+  // 0 1 2 3
+  // 4 5 6 7
+  for (int i = 0; i < M; i++)
+    for (int j = 0; j < N; j++)
+      au[i][j] = vu++;
+
+  vu = 0;
+  for (int i = 0; i < M; i++)
+    for (int j = 0; j < N; j++)
+      bu[i][j] = vu++;
+
+  m_tull cu = au - bu;
+
+  // CHECK: 0 0 0 0
+  // CHECK-NEXT: 0 0 0 0
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < N; j++)
+      printf("%llu ", cu[i][j]);
+    printf("\n");
+  }
+
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-transpose.c b/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-transpose.c
new file mode 100644
index 000000000000..53f67e25f715
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/aarch64/sme-matrix-transpose.c
@@ -0,0 +1,113 @@
+// REQUIRES: aarch64-target-arch,aarch64_sme_run

+// RUN: %clang_builtins -march=armv9.1-a+sme -fenable-matrix %s %librt -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+// RUN: %clang_builtins -march=armv9.1-a+sme -fenable-matrix %s %librt -O3 -o %t_smeopt
+// RUN: %run %t_smeopt 2>&1 | FileCheck %s
+// RUN: %clang_builtins -O3 -march=armv9.1-a -fenable-matrix %s %librt -o %t_nosmeopt
+// RUN: %run %t_nosmeopt 2>&1 | FileCheck %s
+
+#include <stdio.h>
+
+#define ROW 2
+#define COL 3
+
+typedef __bf16 m_t __attribute__((matrix_type(ROW, COL)));
+typedef __bf16 mt_t __attribute__((matrix_type(COL, ROW)));
+
+typedef double md_t __attribute__((matrix_type(ROW, COL)));
+typedef double mdt_t __attribute__((matrix_type(COL, ROW)));
+
+typedef signed char mc_t __attribute__((matrix_type(ROW, COL)));
+typedef signed char mct_t __attribute__((matrix_type(COL, ROW)));
+
+typedef unsigned int mui_t __attribute__((matrix_type(ROW, COL)));
+typedef unsigned int muit_t __attribute__((matrix_type(COL, ROW)));
+
+int main() {
+  m_t a;
+  float v = 0.0;
+
+  // Input matrix:
+  // 0.00 1.00 2.00
+  // 3.00 4.00 5.00
+  for (int i = 0; i < ROW; i++)
+    for (int j = 0; j < COL; j++)
+      a[i][j] = v++;
+
+  mt_t b = __builtin_matrix_transpose(a);
+
+  // CHECK: 0.00 3.00
+  // CHECK-NEXT: 1.00 4.00
+  // CHECK-NEXT: 2.00 5.00
+  for (int i = 0; i < COL; i++) {
+    for (int j = 0; j < ROW; j++)
+      printf("%.2f ", (float)b[i][j]);
+    printf("\n");
+  }
+
+  md_t ad;
+  double vd = 1.0;
+
+  // Input matrix:
+  // 1.00 2.00 3.00
+  // 4.00 5.00 6.00
+  for (int i = 0; i < ROW; i++)
+    for (int j = 0; j < COL; j++)
+      ad[i][j] = vd++;
+
+  mdt_t bd = __builtin_matrix_transpose(ad);
+
+  // CHECK: 1.00 4.00
+  // CHECK-NEXT: 2.00 5.00
+  // CHECK-NEXT: 3.00 6.00
+  for (int i = 0; i < COL; i++) {
+    for (int j = 0; j < ROW; j++)
+      printf("%.2f ", bd[i][j]);
+    printf("\n");
+  }
+
+  mc_t ac;
+  signed char vc = 5;
+
+  // Input matrix:
+  // 5 6 7
+  // 8 9 10
+  for (int i = 0; i < ROW; i++)
+    for (int j = 0; j < COL; j++)
+      ac[i][j] = vc++;
+
+  mct_t bc = __builtin_matrix_transpose(ac);
+
+  // CHECK: 5 8
+  // CHECK-NEXT: 6 9
+  // CHECK-NEXT: 7 10
+  for (int i = 0; i < COL; i++) {
+    for (int j = 0; j < ROW; j++)
+      printf("%d ", bc[i][j]);
+    printf("\n");
+  }
+
+  mui_t aui;
+  unsigned int vui = 10;
+
+  // Input matrix:
+  // 10 11 12
+  // 13 14 15
+  for (int i = 0; i < ROW; i++)
+    for (int j = 0; j < COL; j++)
+      aui[i][j] = vui++;
+
+  muit_t bui = __builtin_matrix_transpose(aui);
+
+  // CHECK: 10 13
+  // CHECK-NEXT: 11 14
+  // CHECK-NEXT: 12 15
+  for (int i = 0; i < COL; i++) {
+    for (int j = 0; j < ROW; j++)
+      printf("%u ", bui[i][j]);
+    printf("\n");
+  }
+
+  return 0;
+}
diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py
index d4b1e1d71d3c..d0b38288212f 100644
--- a/compiler-rt/test/lit.common.cfg.py
+++ b/compiler-rt/test/lit.common.cfg.py
@@ -1010,3 +1010,9 @@ if config.compiler_id == "GNU":
 # llvm.
 config.substitutions.append(("%crt_src", config.compiler_rt_src_root))
 config.substitutions.append(("%llvm_src", config.llvm_src_root))
+
+if config.host_os in ["Linux"] and config.target_triple.startswith("aarch64"):
+    output = subprocess.check_output(["/usr/bin/lscpu"])
+    sme_indicator = b"smef64f64"
+    if re.search(sme_indicator, output):
+        config.available_features.add("aarch64_sme_run")
diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h
index dbf2cfb7c5e9..a2489f3e6f3b 100644
--- a/llvm/include/llvm/IR/MatrixBuilder.h
+++ b/llvm/include/llvm/IR/MatrixBuilder.h
@@ -34,6 +34,36 @@ class MatrixBuilder {
   IRBuilderBase &B;
   Module *getModule() { return B.GetInsertBlock()->getParent()->getParent(); }

+  Value *getExistingLocation(Value *V) {
+    // If V is a load, find the location it loads from so it can be reused.
+    if (!isa<LoadInst>(V))
+      return nullptr;
+
+    // We can further optimize if the load address is an alloca with only two
+    // uses: one is the store which initializes the alloca and the other is
+    // the load V itself. The store must store a value that was itself loaded
+    // from an address; that address is the memory location we need. This
+    // normally happens in the function entry, so we don't do a recursive
+    // search here.
+    Value *Addr = cast<LoadInst>(V)->getPointerOperand();
+    if (!isa<AllocaInst>(Addr) || !Addr->hasNUses(2))
+      return Addr;
+
+    Value *AnotherUse = *Addr->user_begin();
+    if (AnotherUse == V)
+      AnotherUse = *(++Addr->user_begin());
+
+    if (!isa<StoreInst>(AnotherUse))
+      return Addr;
+
+    // The stored value must itself be a load; its source is the location we
+    // want.
+    Value *StoredValue = cast<StoreInst>(AnotherUse)->getValueOperand();
+    if (!isa<LoadInst>(StoredValue))
+      return Addr;
+
+    return cast<LoadInst>(StoredValue)->getPointerOperand();
+  }
+
   std::pair<Value *, Value *>
   splatScalarOperandIfNeeded(Value *LHS, Value *RHS) {
     assert((LHS->getType()->isVectorTy() || RHS->getType()->isVectorTy()) &&
@@ -105,6 +135,44 @@ public:
     return Call;
   }

+  Value *CreateSMEMatrixTranspose(Value *Matrix, unsigned Rows,
+                                  unsigned Columns) {
+    auto *OpType = cast<FixedVectorType>(Matrix->getType());
+    auto *ElemType = OpType->getElementType();
+    auto *ReturnType = FixedVectorType::get(ElemType, Rows * Columns);
+
+    std::string FuncName = "__sme_transpose_";
+    FuncName += ElemType->isIntegerTy() ? "int" : "float";
+    FuncName += std::to_string(ElemType->getScalarSizeInBits() / 8);
+
+    // %a.addr = alloca [6 x i16]
+    // %a = load [6 x i16], ptr %0
+    // store [6 x i16] %a, ptr %a.addr
+    // %1 = load <6 x i16>, ptr %a.addr
+    //
+    // If we need a stack slot for Matrix here, we can just reuse %0; there is
+    // no need to allocate new memory.
+    Value *MemPara = getExistingLocation(Matrix);
+    if (!MemPara) {
+      MemPara = B.CreateAlloca(OpType);
+      B.CreateStore(Matrix, MemPara);
+    }
+
+    // FIXME: optimize the memory for the return address too.
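+    // The emitted sequence is then roughly (illustrative, for a 2x3 float
+    // matrix):
+    //   %ret = alloca <6 x float>
+    //   call void @__sme_transpose_float4(ptr %ret, ptr %src, i32 2, i32 3)
+    //   %res = load <6 x float>, ptr %ret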
+    Value *MemForRet = B.CreateAlloca(ReturnType);
+
+    Value *Ops[] = {MemForRet, MemPara, B.getInt32(Rows), B.getInt32(Columns)};
+    FunctionCallee Func = getModule()->getOrInsertFunction(
+        FuncName, FunctionType::get(B.getVoidTy(),
+                                    {B.getPtrTy(), B.getPtrTy(), B.getInt32Ty(),
+                                     B.getInt32Ty()},
+                                    false));
+
+    B.CreateCall(Func, Ops);
+    (cast<Function>(Func.getCallee()))->addFnAttr("aarch64_pstate_sm_enabled");
+    return B.CreateLoad(ReturnType, MemForRet);
+  }
+
   /// Create a llvm.matrix.transpose call, transposing \p Matrix with \p Rows
   /// rows and \p Columns columns.
   CallInst *CreateMatrixTranspose(Value *Matrix, unsigned Rows,
@@ -121,6 +189,93 @@ public:
     return B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name);
   }

+  Value *CreateSMEMatrixMultiply(Value *LHS, Value *RHS, unsigned LHSRows,
+                                 unsigned LHSColumns, unsigned RHSColumns,
+                                 bool IsSigned) {
+    auto *ElemType = (cast<FixedVectorType>(LHS->getType()))->getElementType();
+
+    auto *LHSType = FixedVectorType::get(ElemType, LHSRows * LHSColumns);
+    auto *RHSType = FixedVectorType::get(ElemType, LHSColumns * RHSColumns);
+    auto *ReturnType = FixedVectorType::get(ElemType, LHSRows * RHSColumns);
+
+    std::string FuncName = "__sme_matmul_";
+    FuncName += ElemType->isIntegerTy() ? (IsSigned ? "int" : "uint") : "float";
+    FuncName += std::to_string(ElemType->getScalarSizeInBits() / 8);
+
+    // First check if we can reuse some existing memory.
+    Value *MemForLHS = getExistingLocation(LHS);
+    if (!MemForLHS) {
+      MemForLHS = B.CreateAlloca(LHSType);
+      B.CreateStore(LHS, MemForLHS);
+    }
+
+    Value *MemForRHS = getExistingLocation(RHS);
+    if (!MemForRHS) {
+      MemForRHS = B.CreateAlloca(RHSType);
+      B.CreateStore(RHS, MemForRHS);
+    }
+
+    Value *MemForRet = B.CreateAlloca(ReturnType);
+
+    Value *Ops[] = {MemForRet,
+                    MemForLHS,
+                    MemForRHS,
+                    B.getInt32(LHSRows),
+                    B.getInt32(LHSColumns),
+                    B.getInt32(RHSColumns)};
+    FunctionCallee Func = getModule()->getOrInsertFunction(
+        FuncName,
+        FunctionType::get(B.getVoidTy(),
+                          {B.getPtrTy(), B.getPtrTy(), B.getPtrTy(),
+                           B.getInt32Ty(), B.getInt32Ty(), B.getInt32Ty()},
+                          false));
+
+    B.CreateCall(Func, Ops);
+    (cast<Function>(Func.getCallee()))->addFnAttr("aarch64_pstate_sm_enabled");
+    return B.CreateLoad(ReturnType, MemForRet);
+  }
+
+  // Matrix binary operations where the two matrices have the same shape, such
+  // as add and sub.
+  Value *CreateSMEMatrixBinOp(Value *LHS, Value *RHS, unsigned Rows,
+                              unsigned Columns, bool IsSigned,
+                              StringRef OpName) {
+    auto *ElemType = (cast<FixedVectorType>(LHS->getType()))->getElementType();
+
+    auto *Type = FixedVectorType::get(ElemType, Rows * Columns);
+
+    std::string FuncName = (StringRef("__sme_") + OpName + "_").str();
+    FuncName += ElemType->isIntegerTy() ? (IsSigned ? "int" : "uint") : "float";
+    FuncName += std::to_string(ElemType->getScalarSizeInBits() / 8);
+
+    // First check if we can reuse some existing memory.
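+    // E.g. if LHS was just reloaded from an alloca that only forwards an
+    // earlier load, getExistingLocation returns the original pointer and no
+    // copy is made; otherwise LHS is spilled to a fresh stack slot here.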
+    Value *MemForLHS = getExistingLocation(LHS);
+    if (!MemForLHS) {
+      MemForLHS = B.CreateAlloca(Type);
+      B.CreateStore(LHS, MemForLHS);
+    }
+
+    Value *MemForRHS = getExistingLocation(RHS);
+    if (!MemForRHS) {
+      MemForRHS = B.CreateAlloca(Type);
+      B.CreateStore(RHS, MemForRHS);
+    }
+
+    Value *MemForRet = B.CreateAlloca(Type);
+
+    Value *Ops[] = {MemForRet, MemForLHS, MemForRHS, B.getInt32(Rows),
+                    B.getInt32(Columns)};
+    FunctionCallee Func = getModule()->getOrInsertFunction(
+        FuncName, FunctionType::get(B.getVoidTy(),
+                                    {B.getPtrTy(), B.getPtrTy(), B.getPtrTy(),
+                                     B.getInt32Ty(), B.getInt32Ty()},
+                                    false));
+
+    B.CreateCall(Func, Ops);
+    (cast<Function>(Func.getCallee()))->addFnAttr("aarch64_pstate_sm_enabled");
+    return B.CreateLoad(Type, MemForRet);
+  }
+
   /// Create a llvm.matrix.multiply call, multiplying matrixes \p LHS and \p
   /// RHS.
   CallInst *CreateMatrixMultiply(Value *LHS, Value *RHS, unsigned LHSRows,
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index a8a17101b9c9..935a8f1cb394 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -193,6 +193,12 @@ static cl::opt<unsigned>
     MIResourceCutOff("misched-resource-cutoff", cl::Hidden,
                      cl::desc("Number of intervals to track"), cl::init(10));

+extern cl::opt<bool> EnableMatrix;
+
+static cl::opt<unsigned>
+    BigBasicBlock("schedule-big-basic-block", cl::Hidden, cl::init(200),
+                  cl::desc("The limit to use while scheduling a region"));
+
 // DAG subtrees must have at least this many nodes.
 static const unsigned MinSubtreeSize = 8;

@@ -636,6 +642,9 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
       MachineBasicBlock::iterator RegionEnd = R.RegionEnd;
       unsigned NumRegionInstrs = R.NumRegionInstrs;

+      if (EnableMatrix && NumRegionInstrs > BigBasicBlock)
+        continue;
+
       // Notify the scheduler of the region, even if we may skip scheduling
       // it. Perhaps it still needs to be bundled.
       Scheduler.enterRegion(&*MBB, I, RegionEnd, NumRegionInstrs);
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index 68dece6cf73e..7f8b56d8ae03 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -79,6 +79,10 @@ static cl::opt<unsigned>
     HugeRegion("dag-maps-huge-region", cl::Hidden,
                "prior to scheduling, at which point a trade-off "
                "is made to avoid excessive compile time."));

+cl::opt<bool>
+    EnableMatrix("enable-matrix", cl::init(false), cl::Hidden,
+                 cl::desc("Enable lowering of the matrix intrinsics"));
+
 static cl::opt<unsigned> ReductionSize(
     "dag-maps-reduction-size", cl::Hidden,
     cl::desc("A huge scheduling region will have maps reduced by this many "
@@ -93,8 +97,11 @@ static cl::opt<bool> SchedPrintCycles(
     "misched-print-cycles", cl::Hidden, cl::init(false),

 static unsigned getReductionSize() {
   // Always reduce a huge region with half of the elements, except
   // when user sets this number explicitly.
-  if (ReductionSize.getNumOccurrences() == 0)
+  if (ReductionSize.getNumOccurrences() == 0) {
+    if (EnableMatrix)
+      return HugeRegion / 20;
     return HugeRegion / 2;
+  }
   return ReductionSize;
 }

@@ -1010,11 +1017,12 @@ void ScheduleDAGInstrs::buildSchedGraph(AAResults *AA,
     }

     // Reduce maps if they grow huge.
-    if (Stores.size() + Loads.size() >= HugeRegion) {
+    unsigned RegionSize = EnableMatrix ? (HugeRegion / 10) : HugeRegion;
+    if (Stores.size() + Loads.size() >= RegionSize) {
       LLVM_DEBUG(dbgs() << "Reducing Stores and Loads maps.\n";);
       reduceHugeMemNodeMaps(Stores, Loads, getReductionSize());
     }
-    if (NonAliasStores.size() + NonAliasLoads.size() >= HugeRegion) {
+    if (NonAliasStores.size() + NonAliasLoads.size() >= RegionSize) {
       LLVM_DEBUG(
           dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n";);
       reduceHugeMemNodeMaps(NonAliasStores, NonAliasLoads, getReductionSize());
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 6f36bdad780a..03389905ef57 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -275,9 +275,7 @@ static cl::opt<bool> EnableOrderFileInstrumentation(
     "enable-order-file-instrumentation", cl::init(false), cl::Hidden,
     cl::desc("Enable order file instrumentation (default = off)"));

-static cl::opt<bool>
-    EnableMatrix("enable-matrix", cl::init(false), cl::Hidden,
-                 cl::desc("Enable lowering of the matrix intrinsics"));
+extern cl::opt<bool> EnableMatrix;

 static cl::opt<bool> EnableConstraintElimination(
     "enable-constraint-elimination", cl::init(true), cl::Hidden,
-- 
Gitee