From acfb50b03b0be3eda5282f26bad34ffc18595b30 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Fri, 8 Sep 2023 10:54:35 -0700
Subject: [PATCH 01/20] [Clang][LoongArch] Generate _mcount instead of mcount
 (#65657)

When building the LoongArch Linux kernel without
`CONFIG_DYNAMIC_FTRACE`, the build fails to link because the mcount
symbol is `mcount`, not `_mcount` like GCC generates and the kernel
expects:

```
ld.lld: error: undefined symbol: mcount
>>> referenced by version.c
>>>               init/version.o:(early_hostname) in archive vmlinux.a
>>> referenced by do_mounts.c
>>>               init/do_mounts.o:(rootfs_init_fs_context) in archive vmlinux.a
>>> referenced by main.c
>>>               init/main.o:(__traceiter_initcall_level) in archive vmlinux.a
>>> referenced 97011 more times
>>> did you mean: _mcount
>>> defined in: vmlinux.a(arch/loongarch/kernel/mcount.o)
```

Set `MCountName` in `LoongArchTargetInfo` to `_mcount`, which resolves
the build failure.

(cherry picked from commit cc2b09bee017147527e7bd1eb5272f4f70a7b900)
---
 clang/lib/Basic/Targets/LoongArch.h | 1 +
 clang/test/CodeGen/mcount.c         | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/clang/lib/Basic/Targets/LoongArch.h b/clang/lib/Basic/Targets/LoongArch.h
index 8f4150b2539d..3313102492cb 100644
--- a/clang/lib/Basic/Targets/LoongArch.h
+++ b/clang/lib/Basic/Targets/LoongArch.h
@@ -40,6 +40,7 @@ public:
     LongDoubleWidth = 128;
     LongDoubleAlign = 128;
     LongDoubleFormat = &llvm::APFloat::IEEEquad();
+    MCountName = "_mcount";
     SuitableAlign = 128;
     WCharType = SignedInt;
     WIntType = UnsignedInt;
diff --git a/clang/test/CodeGen/mcount.c b/clang/test/CodeGen/mcount.c
index 8f994ab4e754..bdd609c1dfc5 100644
--- a/clang/test/CodeGen/mcount.c
+++ b/clang/test/CodeGen/mcount.c
@@ -7,6 +7,8 @@
 // RUN: %clang_cc1 -pg -triple x86_64-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-DOUBLE-PREFIXED,NO-MCOUNT1 %s
 // RUN: %clang_cc1 -pg -triple arm-netbsd-eabi -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-DOUBLE-PREFIXED,NO-MCOUNT1 %s
 // RUN: %clang_cc1 -pg -triple aarch64-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-DOUBLE-PREFIXED,NO-MCOUNT1 %s
+// RUN: %clang_cc1 -pg -triple loongarch32 -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
+// RUN: %clang_cc1 -pg -triple loongarch64 -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
 // RUN: %clang_cc1 -pg -triple mips-netbsd -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-DOUBLE-PREFIXED,NO-MCOUNT1 %s
 // RUN: %clang_cc1 -pg -triple mips-unknown-gnu-linux -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
 // RUN: %clang_cc1 -pg -triple mipsel-unknown-gnu-linux -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK-PREFIXED,NO-MCOUNT1 %s
-- 
Gitee


From 6f3143e1ad0bb759b7519af81994ed3c71dcf52b Mon Sep 17 00:00:00 2001
From: wanglei <wanglei@loongson.cn>
Date: Fri, 20 Oct 2023 10:44:55 +0800
Subject: [PATCH 02/20] [LoongArch] Fix td pattern for CACOP LDPTE and LDDIR

The immediate argument should be a target constant (`timm`).

(cherry picked from commit 47826b3f148996767ebd2c67ee41c329cb364fef)
---
 llvm/lib/Target/LoongArch/LoongArchInstrInfo.td | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index b2c4bb812ba5..166379d7d592 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -1857,9 +1857,9 @@ defm : PseudoBinPat<"atomic_load_xor_32", PseudoAtomicLoadXor32>;
 /// Intrinsics
 
 def : Pat<(int_loongarch_cacop_d timm:$op, i64:$rj, timm:$imm12),
-          (CACOP uimm5:$op, GPR:$rj, simm12:$imm12)>;
+          (CACOP timm:$op, GPR:$rj, timm:$imm12)>;
 def : Pat<(int_loongarch_cacop_w i32:$op, i32:$rj, i32:$imm12),
-          (CACOP uimm5:$op, GPR:$rj, simm12:$imm12)>;
+          (CACOP timm:$op, GPR:$rj, timm:$imm12)>;
 def : Pat<(loongarch_dbar uimm15:$imm15), (DBAR uimm15:$imm15)>;
 def : Pat<(loongarch_ibar uimm15:$imm15), (IBAR uimm15:$imm15)>;
 def : Pat<(loongarch_break uimm15:$imm15), (BREAK uimm15:$imm15)>;
@@ -2023,9 +2023,9 @@ def : Pat<(int_loongarch_asrtle_d GPR:$rj, GPR:$rk),
 def : Pat<(int_loongarch_asrtgt_d GPR:$rj, GPR:$rk),
           (ASRTGT_D GPR:$rj, GPR:$rk)>;
 def : Pat<(int_loongarch_lddir_d GPR:$rj, timm:$imm8),
-          (LDDIR GPR:$rj, uimm8:$imm8)>;
+          (LDDIR GPR:$rj, timm:$imm8)>;
 def : Pat<(int_loongarch_ldpte_d GPR:$rj, timm:$imm8),
-          (LDPTE GPR:$rj, uimm8:$imm8)>;
+          (LDPTE GPR:$rj, timm:$imm8)>;
 } // Predicates = [IsLA64]
 
 //===----------------------------------------------------------------------===//
-- 
Gitee


From a841576d7a53d3d8fd61aa854af7d9c2dd204536 Mon Sep 17 00:00:00 2001
From: Weining Lu <luweining@loongson.cn>
Date: Thu, 26 Oct 2023 11:50:28 +0800
Subject: [PATCH 03/20] [LoongArch][test] Add some ABI regression tests for
 empty struct. NFC

How empty structs (not as fields of container struct) are passed in C++
is not explicitly documented in psABI. This patch adds some tests
showing the current handling of clang. Some of the results are different
from gcc. Following patch(es) will try to fix the mismatch.

(cherry picked from commit 8149066fa532d82ff62a0629d5a9fab6bd4da768)
---
 .../LoongArch/abi-lp64d-empty-structs.c       | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c b/clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c
index fb90bf556c19..d0daafac336e 100644
--- a/clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c
+++ b/clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c
@@ -81,9 +81,62 @@ struct s8 test_s8(struct s8 a) {
   return a;
 }
 
+/// Note: Below tests check how empty structs are passed while above tests check
+/// empty structs as fields of container struct are ignored when flattening
+/// structs to examine whether the container structs can be passed via FARs.
+
 // CHECK-C: define{{.*}} void @test_s9()
 // CHECK-CXX: define{{.*}} i64 @_Z7test_s92s9(i64 {{.*}})
 struct s9 { struct empty e; };
 struct s9 test_s9(struct s9 a) {
   return a;
 }
+
+// CHECK-C: define{{.*}} void @test_s10()
+// CHECK-CXX: define{{.*}} void @_Z8test_s103s10()
+struct s10 { };
+struct s10 test_s10(struct s10 a) {
+  return a;
+}
+
+// CHECK-C: define{{.*}} void @test_s11()
+// CHECK-CXX: define{{.*}} i64 @_Z8test_s113s11(i64 {{.*}})
+struct s11 { struct { } s; };
+struct s11 test_s11(struct s11 a) {
+  return a;
+}
+
+// CHECK-C: define{{.*}} void @test_s12()
+// CHECK-CXX: define{{.*}} void @_Z8test_s123s12()
+struct s12 { int i[0]; };
+struct s12 test_s12(struct s12 a) {
+  return a;
+}
+
+// CHECK-C: define{{.*}} void @test_s13()
+// CHECK-CXX: define{{.*}} void @_Z8test_s133s13()
+struct s13 { struct { } s[0]; };
+struct s13 test_s13(struct s13 a) {
+  return a;
+}
+
+// CHECK-C: define{{.*}} void @test_s14()
+// CHECK-CXX: define{{.*}} i64 @_Z8test_s143s14(i64 {{.*}})
+struct s14 { struct { } s[1]; };
+struct s14 test_s14(struct s14 a) {
+  return a;
+}
+
+// CHECK-C: define{{.*}} void @test_s15()
+// CHECK-CXX: define{{.*}} void @_Z8test_s153s15()
+struct s15 { int : 0; };
+struct s15 test_s15(struct s15 a) {
+  return a;
+}
+
+// CHECK-C: define{{.*}} void @test_s16()
+// CHECK-CXX: define{{.*}} void @_Z8test_s163s16()
+struct s16 { int : 1; };
+struct s16 test_s16(struct s16 a) {
+  return a;
+}
-- 
Gitee


From 6248fa0fc405952a8b907624c27b2dd1ee86a962 Mon Sep 17 00:00:00 2001
From: Lu Weining <luweining@loongson.cn>
Date: Tue, 31 Oct 2023 21:18:06 +0800
Subject: [PATCH 04/20] [LoongArch] Fix ABI mismatch with gcc/g++ about empty
 structs passing (#70320)

How empty structs (not as fields of container struct) are passed in C++
is not explicitly documented in psABI. However, this patch fixes the
mismatch with g++.

Note that the unnamed bitfield case `struct { int : 1; }` in C is also
fixed. Previously clang regarded it as an empty struct and then ignored
it when passing. Now the size of the struct is counted; since its size
is not 0, clang will not ignore it even in C.

While https://reviews.llvm.org/D156116 fixed the handling of empty
struct when considering eligibility of the container struct for the FP
calling convention ('flattening'), this patch fixes the handling of
passing the empty struct itself.

Fix https://github.com/llvm/llvm-project/issues/70319

(cherry picked from commit 9ca6bf3fb7b7df373723b3275730f101f9ff816b)
---
 clang/lib/CodeGen/Targets/LoongArch.cpp                | 10 ++++++----
 clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c |  8 ++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/LoongArch.cpp b/clang/lib/CodeGen/Targets/LoongArch.cpp
index 7483bf6d6d1e..bc508a99da9c 100644
--- a/clang/lib/CodeGen/Targets/LoongArch.cpp
+++ b/clang/lib/CodeGen/Targets/LoongArch.cpp
@@ -308,12 +308,14 @@ ABIArgInfo LoongArchABIInfo::classifyArgumentType(QualType Ty, bool IsFixed,
                                            CGCXXABI::RAA_DirectInMemory);
   }
 
-  // Ignore empty structs/unions.
-  if (isEmptyRecord(getContext(), Ty, true))
-    return ABIArgInfo::getIgnore();
-
   uint64_t Size = getContext().getTypeSize(Ty);
 
+  // Ignore empty struct or union whose size is zero, e.g. `struct { }` in C or
+  // `struct { int a[0]; }` in C++. In C++, `struct { }` is empty but it's size
+  // is 1 byte and g++ doesn't ignore it; clang++ matches this behaviour.
+  if (isEmptyRecord(getContext(), Ty, true) && Size == 0)
+    return ABIArgInfo::getIgnore();
+
   // Pass floating point values via FARs if possible.
   if (IsFixed && Ty->isFloatingType() && !Ty->isComplexType() &&
       FRLen >= Size && FARsLeft) {
diff --git a/clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c b/clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c
index d0daafac336e..281b7b15841a 100644
--- a/clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c
+++ b/clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c
@@ -93,7 +93,7 @@ struct s9 test_s9(struct s9 a) {
 }
 
 // CHECK-C: define{{.*}} void @test_s10()
-// CHECK-CXX: define{{.*}} void @_Z8test_s103s10()
+// CHECK-CXX: define{{.*}} i64 @_Z8test_s103s10(i64 {{.*}})
 struct s10 { };
 struct s10 test_s10(struct s10 a) {
   return a;
@@ -128,14 +128,14 @@ struct s14 test_s14(struct s14 a) {
 }
 
 // CHECK-C: define{{.*}} void @test_s15()
-// CHECK-CXX: define{{.*}} void @_Z8test_s153s15()
+// CHECK-CXX: define{{.*}} i64 @_Z8test_s153s15(i64 {{.*}})
 struct s15 { int : 0; };
 struct s15 test_s15(struct s15 a) {
   return a;
 }
 
-// CHECK-C: define{{.*}} void @test_s16()
-// CHECK-CXX: define{{.*}} void @_Z8test_s163s16()
+// CHECK-C: define{{.*}} i64 @test_s16(i64 {{.*}})
+// CHECK-CXX: define{{.*}} i64 @_Z8test_s163s16(i64 {{.*}})
 struct s16 { int : 1; };
 struct s16 test_s16(struct s16 a) {
   return a;
-- 
Gitee


From 028d0d88cd73c724f954577dc90cbbc2873a6832 Mon Sep 17 00:00:00 2001
From: Weining Lu <luweining@loongson.cn>
Date: Thu, 2 Nov 2023 09:29:43 +0800
Subject: [PATCH 05/20] [LoongArch] Pre-commit test for issue #70890

(cherry picked from commit 749083b91f31f370cf64831d3e7e6215b6d51442)
---
 .../LoongArch/abi-lp64d-empty-unions.c        | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 clang/test/CodeGen/LoongArch/abi-lp64d-empty-unions.c

diff --git a/clang/test/CodeGen/LoongArch/abi-lp64d-empty-unions.c b/clang/test/CodeGen/LoongArch/abi-lp64d-empty-unions.c
new file mode 100644
index 000000000000..b0607425336e
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/abi-lp64d-empty-unions.c
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +f -target-feature +d -target-abi lp64d -emit-llvm %s -o - | \
+// RUN:   FileCheck --check-prefix=CHECK-C %s
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +f -target-feature +d -target-abi lp64d -emit-llvm %s -o - -x c++ | \
+// RUN:   FileCheck --check-prefix=CHECK-CXX %s
+
+#include <stdint.h>
+
+// CHECK-C: define{{.*}} void @test1()
+// CHECK-CXX: define{{.*}} i64 @_Z5test12u1(i64{{[^,]*}})
+union u1 { };
+union u1 test1(union u1 a) {
+  return a;
+}
+
+struct s1 {
+  union u1 u;
+  int i;
+  float f;
+};
+
+// CHECK-C: define{{.*}} { i32, float } @test2(i32{{[^,]*}}, float{{[^,]*}})
+/// FIXME: This doesn't match g++.
+// CHECK-CXX: define{{.*}} { i32, float } @_Z5test22s1(i32{{[^,]*}}, float{{[^,]*}})
+struct s1 test2(struct s1 a) {
+  return a;
+}
-- 
Gitee


From 8c4371c0e53635a23852d0dc7025b4c48495277b Mon Sep 17 00:00:00 2001
From: Lu Weining <luweining@loongson.cn>
Date: Sat, 4 Nov 2023 10:04:37 +0800
Subject: [PATCH 06/20] [LoongArch] Fix ABI mismatch with g++ when handling
 empty unions (#71025)

In g++, empty unions are not ignored like empty structs when flattening
structs to examine whether the structs can be passed via FARs in C++.
This patch aligns clang++ with g++.

Fix https://github.com/llvm/llvm-project/issues/70890.

(cherry picked from commit 4253fdc2c462da61cc0deb74a43265665720c828)
---
 clang/lib/CodeGen/Targets/LoongArch.cpp                | 7 ++++---
 clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c | 2 +-
 clang/test/CodeGen/LoongArch/abi-lp64d-empty-unions.c  | 3 +--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/LoongArch.cpp b/clang/lib/CodeGen/Targets/LoongArch.cpp
index bc508a99da9c..63b9a1fdb988 100644
--- a/clang/lib/CodeGen/Targets/LoongArch.cpp
+++ b/clang/lib/CodeGen/Targets/LoongArch.cpp
@@ -170,10 +170,11 @@ bool LoongArchABIInfo::detectFARsEligibleStructHelper(
     // copy constructor are not eligible for the FP calling convention.
     if (getRecordArgABI(Ty, CGT.getCXXABI()))
       return false;
-    if (isEmptyRecord(getContext(), Ty, true, true))
-      return true;
     const RecordDecl *RD = RTy->getDecl();
-    // Unions aren't eligible unless they're empty (which is caught above).
+    if (isEmptyRecord(getContext(), Ty, true, true) &&
+        (!RD->isUnion() || !isa<CXXRecordDecl>(RD)))
+      return true;
+    // Unions aren't eligible unless they're empty in C (which is caught above).
     if (RD->isUnion())
       return false;
     const ASTRecordLayout &Layout = getContext().getASTRecordLayout(RD);
diff --git a/clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c b/clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c
index 281b7b15841a..2f7596f0ebdc 100644
--- a/clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c
+++ b/clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c
@@ -3,7 +3,7 @@
 // RUN: %clang_cc1 -triple loongarch64 -target-feature +f -target-feature +d -target-abi lp64d -emit-llvm %s -o - -x c++ | \
 // RUN:   FileCheck --check-prefix=CHECK-CXX %s
 
-// Fields containing empty structs or unions are ignored when flattening
+// Fields containing empty structs are ignored when flattening
 // structs to examine whether the structs can be passed via FARs, even in C++.
 // But there is an exception that non-zero-length array of empty structures are
 // not ignored in C++. These rules are not documented in psABI <https://www.github.com/loongson/la-abi-specs>
diff --git a/clang/test/CodeGen/LoongArch/abi-lp64d-empty-unions.c b/clang/test/CodeGen/LoongArch/abi-lp64d-empty-unions.c
index b0607425336e..363e37efb646 100644
--- a/clang/test/CodeGen/LoongArch/abi-lp64d-empty-unions.c
+++ b/clang/test/CodeGen/LoongArch/abi-lp64d-empty-unions.c
@@ -19,8 +19,7 @@ struct s1 {
 };
 
 // CHECK-C: define{{.*}} { i32, float } @test2(i32{{[^,]*}}, float{{[^,]*}})
-/// FIXME: This doesn't match g++.
-// CHECK-CXX: define{{.*}} { i32, float } @_Z5test22s1(i32{{[^,]*}}, float{{[^,]*}})
+// CHECK-CXX: define{{.*}} [2 x i64] @_Z5test22s1([2 x i64]{{[^,]*}})
 struct s1 test2(struct s1 a) {
   return a;
 }
-- 
Gitee


From 8e855955a009ec398b9f7da88e980dae9d20c420 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Tue, 14 Nov 2023 00:43:40 -0800
Subject: [PATCH 07/20] [Driver] Default LoongArch to
 -fno-direct-access-external-data for non-PIC (#72221)

For -fno-pic, if an extern variable is defined in a DSO, a copy
relocation will be needed. However, loongarch*-linux does not and will
not support copy relocations.

Change Driver to default to -fno-direct-access-external-data for
LoongArch && non-PIC.
Keep Frontend conditions unchanged (-fdirect-access-external-data ||
-fno-direct-access-external-data && PIC>0 => direct access).

Fix #71645

(cherry picked from commit 47eeee297775347cbdb7624d6a766c2a3eec4a59)
---
 clang/lib/Driver/ToolChains/Clang.cpp            | 7 ++++++-
 clang/test/Driver/fdirect-access-external-data.c | 6 ++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 6b5930990f11..b21aeaee7f5a 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5632,10 +5632,15 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // defaults to -fno-direct-access-external-data. Pass the option if different
   // from the default.
   if (Arg *A = Args.getLastArg(options::OPT_fdirect_access_external_data,
-                               options::OPT_fno_direct_access_external_data))
+                               options::OPT_fno_direct_access_external_data)) {
     if (A->getOption().matches(options::OPT_fdirect_access_external_data) !=
         (PICLevel == 0))
       A->render(Args, CmdArgs);
+  } else if (PICLevel == 0 && Triple.isLoongArch()) {
+    // Some targets default to -fno-direct-access-external-data even for
+    // -fno-pic.
+    CmdArgs.push_back("-fno-direct-access-external-data");
+  }
 
   if (Args.hasFlag(options::OPT_fno_plt, options::OPT_fplt, false)) {
     CmdArgs.push_back("-fno-plt");
diff --git a/clang/test/Driver/fdirect-access-external-data.c b/clang/test/Driver/fdirect-access-external-data.c
index f132b1b088af..a6da776e6977 100644
--- a/clang/test/Driver/fdirect-access-external-data.c
+++ b/clang/test/Driver/fdirect-access-external-data.c
@@ -9,6 +9,12 @@
 // RUN: %clang -### -c -target aarch64 %s -fpic 2>&1 | FileCheck %s --check-prefix=DEFAULT
 // RUN: %clang -### -c -target aarch64 %s -fpic -fdirect-access-external-data 2>&1 | FileCheck %s --check-prefix=DIRECT
 
+/// loongarch* targets default to -fno-direct-access-external-data even for -fno-pic.
+// RUN: %clang -### -c --target=loongarch64 -fno-pic %s 2>&1 | FileCheck %s --check-prefix=INDIRECT
+// RUN: %clang -### -c --target=loongarch64 -fpie %s 2>&1 | FileCheck %s --check-prefix=DEFAULT
+// RUN: %clang -### -c --target=loongarch32 -fno-pic -fdirect-access-external-data %s 2>&1 | FileCheck %s --check-prefix=DEFAULT
+// RUN: %clang -### -c --target=loongarch32 -fpie -fdirect-access-external-data %s 2>&1 | FileCheck %s --check-prefix=DIRECT
+
 // DEFAULT-NOT: direct-access-external-data"
 // DIRECT:      "-fdirect-access-external-data"
 // INDIRECT:    "-fno-direct-access-external-data"
-- 
Gitee


From d90b85e94180543fd1789f9e26d7931f2329069b Mon Sep 17 00:00:00 2001
From: ZhaoQi <zhaoqi01@loongson.cn>
Date: Fri, 10 Nov 2023 15:54:33 +0800
Subject: [PATCH 08/20] [LoongArch][MC] Refine MCInstrAnalysis based on
 registers used (#71276)

MCInstrAnalysis can return properties of instructions (e.g., isCall(),
isBranch(), ...) based on the information that MCInstrDesc gets from
*InstrInfo*.td files. This information is based on opcodes only, but
JIRL can have different properties depending on the registers used.

So this patch refines several MCInstrAnalysis methods: isTerminator,
isCall, isReturn, isBranch, isUnconditionalBranch and isIndirectBranch.

This patch also allows BOLT, which will be supported on LoongArch later,
to get the right instruction information.

(cherry picked from commit f7d784709673ca185f6fb0633fd53c72e81f2ae1)
---
 .../MCTargetDesc/LoongArchMCTargetDesc.cpp    |  76 +++++++++++++
 .../unittests/Target/LoongArch/CMakeLists.txt |   1 +
 .../Target/LoongArch/MCInstrAnalysisTest.cpp  | 107 ++++++++++++++++++
 3 files changed, 184 insertions(+)
 create mode 100644 llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp

diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
index 942e667bc261..d580c3457fec 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
@@ -104,6 +104,82 @@ public:
 
     return false;
   }
+
+  bool isTerminator(const MCInst &Inst) const override {
+    if (MCInstrAnalysis::isTerminator(Inst))
+      return true;
+
+    switch (Inst.getOpcode()) {
+    default:
+      return false;
+    case LoongArch::JIRL:
+      return Inst.getOperand(0).getReg() == LoongArch::R0;
+    }
+  }
+
+  bool isCall(const MCInst &Inst) const override {
+    if (MCInstrAnalysis::isCall(Inst))
+      return true;
+
+    switch (Inst.getOpcode()) {
+    default:
+      return false;
+    case LoongArch::JIRL:
+      return Inst.getOperand(0).getReg() != LoongArch::R0;
+    }
+  }
+
+  bool isReturn(const MCInst &Inst) const override {
+    if (MCInstrAnalysis::isReturn(Inst))
+      return true;
+
+    switch (Inst.getOpcode()) {
+    default:
+      return false;
+    case LoongArch::JIRL:
+      return Inst.getOperand(0).getReg() == LoongArch::R0 &&
+             Inst.getOperand(1).getReg() == LoongArch::R1;
+    }
+  }
+
+  bool isBranch(const MCInst &Inst) const override {
+    if (MCInstrAnalysis::isBranch(Inst))
+      return true;
+
+    switch (Inst.getOpcode()) {
+    default:
+      return false;
+    case LoongArch::JIRL:
+      return Inst.getOperand(0).getReg() == LoongArch::R0 &&
+             Inst.getOperand(1).getReg() != LoongArch::R1;
+    }
+  }
+
+  bool isUnconditionalBranch(const MCInst &Inst) const override {
+    if (MCInstrAnalysis::isUnconditionalBranch(Inst))
+      return true;
+
+    switch (Inst.getOpcode()) {
+    default:
+      return false;
+    case LoongArch::JIRL:
+      return Inst.getOperand(0).getReg() == LoongArch::R0 &&
+             Inst.getOperand(1).getReg() != LoongArch::R1;
+    }
+  }
+
+  bool isIndirectBranch(const MCInst &Inst) const override {
+    if (MCInstrAnalysis::isIndirectBranch(Inst))
+      return true;
+
+    switch (Inst.getOpcode()) {
+    default:
+      return false;
+    case LoongArch::JIRL:
+      return Inst.getOperand(0).getReg() == LoongArch::R0 &&
+             Inst.getOperand(1).getReg() != LoongArch::R1;
+    }
+  }
 };
 
 } // end namespace
diff --git a/llvm/unittests/Target/LoongArch/CMakeLists.txt b/llvm/unittests/Target/LoongArch/CMakeLists.txt
index fef4f8e15461..e6f8ec073721 100644
--- a/llvm/unittests/Target/LoongArch/CMakeLists.txt
+++ b/llvm/unittests/Target/LoongArch/CMakeLists.txt
@@ -20,6 +20,7 @@ set(LLVM_LINK_COMPONENTS
 
 add_llvm_target_unittest(LoongArchTests
   InstSizes.cpp
+  MCInstrAnalysisTest.cpp
   )
 
 set_property(TARGET LoongArchTests PROPERTY FOLDER "Tests/UnitTests/TargetTests")
diff --git a/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp b/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp
new file mode 100644
index 000000000000..6a208d274a0d
--- /dev/null
+++ b/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp
@@ -0,0 +1,107 @@
+//===- MCInstrAnalysisTest.cpp - LoongArchMCInstrAnalysis unit tests ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "MCTargetDesc/LoongArchMCTargetDesc.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+
+#include "gtest/gtest.h"
+
+#include <memory>
+
+using namespace llvm;
+
+namespace {
+
+class InstrAnalysisTest : public testing::TestWithParam<const char *> {
+protected:
+  std::unique_ptr<const MCInstrInfo> Info;
+  std::unique_ptr<const MCInstrAnalysis> Analysis;
+
+  static void SetUpTestSuite() {
+    LLVMInitializeLoongArchTargetInfo();
+    LLVMInitializeLoongArchTarget();
+    LLVMInitializeLoongArchTargetMC();
+  }
+
+  InstrAnalysisTest() {
+    std::string Error;
+    const Target *TheTarget =
+        TargetRegistry::lookupTarget(Triple::normalize(GetParam()), Error);
+    Info = std::unique_ptr<const MCInstrInfo>(TheTarget->createMCInstrInfo());
+    Analysis = std::unique_ptr<const MCInstrAnalysis>(
+        TheTarget->createMCInstrAnalysis(Info.get()));
+  }
+};
+
+} // namespace
+
+static MCInst beq() {
+  return MCInstBuilder(LoongArch::BEQ)
+      .addReg(LoongArch::R0)
+      .addReg(LoongArch::R1)
+      .addImm(32);
+}
+
+static MCInst bl() { return MCInstBuilder(LoongArch::BL).addImm(32); }
+
+static MCInst jirl(unsigned RD, unsigned RJ = LoongArch::R10) {
+  return MCInstBuilder(LoongArch::JIRL).addReg(RD).addReg(RJ).addImm(16);
+}
+
+TEST_P(InstrAnalysisTest, IsTerminator) {
+  EXPECT_TRUE(Analysis->isTerminator(beq()));
+  EXPECT_FALSE(Analysis->isTerminator(bl()));
+  EXPECT_TRUE(Analysis->isTerminator(jirl(LoongArch::R0)));
+  EXPECT_FALSE(Analysis->isTerminator(jirl(LoongArch::R5)));
+}
+
+TEST_P(InstrAnalysisTest, IsCall) {
+  EXPECT_FALSE(Analysis->isCall(beq()));
+  EXPECT_TRUE(Analysis->isCall(bl()));
+  EXPECT_TRUE(Analysis->isCall(jirl(LoongArch::R1)));
+  EXPECT_FALSE(Analysis->isCall(jirl(LoongArch::R0)));
+}
+
+TEST_P(InstrAnalysisTest, IsReturn) {
+  EXPECT_FALSE(Analysis->isReturn(beq()));
+  EXPECT_FALSE(Analysis->isReturn(bl()));
+  EXPECT_TRUE(Analysis->isReturn(jirl(LoongArch::R0, LoongArch::R1)));
+  EXPECT_FALSE(Analysis->isReturn(jirl(LoongArch::R0)));
+  EXPECT_FALSE(Analysis->isReturn(jirl(LoongArch::R1)));
+}
+
+TEST_P(InstrAnalysisTest, IsBranch) {
+  EXPECT_TRUE(Analysis->isBranch(beq()));
+  EXPECT_FALSE(Analysis->isBranch(bl()));
+  EXPECT_TRUE(Analysis->isBranch(jirl(LoongArch::R0)));
+  EXPECT_FALSE(Analysis->isBranch(jirl(LoongArch::R1)));
+  EXPECT_FALSE(Analysis->isBranch(jirl(LoongArch::R0, LoongArch::R1)));
+}
+
+TEST_P(InstrAnalysisTest, IsUnconditionalBranch) {
+  EXPECT_FALSE(Analysis->isUnconditionalBranch(beq()));
+  EXPECT_FALSE(Analysis->isUnconditionalBranch(bl()));
+  EXPECT_TRUE(Analysis->isUnconditionalBranch(jirl(LoongArch::R0)));
+  EXPECT_FALSE(Analysis->isUnconditionalBranch(jirl(LoongArch::R1)));
+  EXPECT_FALSE(
+      Analysis->isUnconditionalBranch(jirl(LoongArch::R0, LoongArch::R1)));
+}
+
+TEST_P(InstrAnalysisTest, IsIndirectBranch) {
+  EXPECT_FALSE(Analysis->isIndirectBranch(beq()));
+  EXPECT_FALSE(Analysis->isIndirectBranch(bl()));
+  EXPECT_TRUE(Analysis->isIndirectBranch(jirl(LoongArch::R0)));
+  EXPECT_FALSE(Analysis->isIndirectBranch(jirl(LoongArch::R1)));
+  EXPECT_FALSE(Analysis->isIndirectBranch(jirl(LoongArch::R0, LoongArch::R1)));
+}
+
+INSTANTIATE_TEST_SUITE_P(LA32And64, InstrAnalysisTest,
+                         testing::Values("loongarch32", "loongarch64"));
-- 
Gitee


From 4d3ba0892d66b21f6a8a72f1d787e42a64be8867 Mon Sep 17 00:00:00 2001
From: ZhaoQi <zhaoqi01@loongson.cn>
Date: Wed, 15 Nov 2023 11:12:30 +0800
Subject: [PATCH 09/20] [LoongArch][NFC] Pre-commit MCInstrAnalysis tests for
 instruction 'b' (#71903)

The tests for 'b' that are marked with FIXME are currently incorrect;
the following patch will fix them.

(cherry picked from commit f6c4bb07eaa94bcd5d02ba7a46850225b6ed50d4)
---
 .../Target/LoongArch/MCInstrAnalysisTest.cpp   | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp b/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp
index 6a208d274a0d..6e1919fc2261 100644
--- a/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp
+++ b/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp
@@ -50,6 +50,8 @@ static MCInst beq() {
       .addImm(32);
 }
 
+static MCInst b() { return MCInstBuilder(LoongArch::B).addImm(32); }
+
 static MCInst bl() { return MCInstBuilder(LoongArch::BL).addImm(32); }
 
 static MCInst jirl(unsigned RD, unsigned RJ = LoongArch::R10) {
@@ -58,6 +60,7 @@ static MCInst jirl(unsigned RD, unsigned RJ = LoongArch::R10) {
 
 TEST_P(InstrAnalysisTest, IsTerminator) {
   EXPECT_TRUE(Analysis->isTerminator(beq()));
+  EXPECT_TRUE(Analysis->isTerminator(b()));
   EXPECT_FALSE(Analysis->isTerminator(bl()));
   EXPECT_TRUE(Analysis->isTerminator(jirl(LoongArch::R0)));
   EXPECT_FALSE(Analysis->isTerminator(jirl(LoongArch::R5)));
@@ -65,6 +68,7 @@ TEST_P(InstrAnalysisTest, IsTerminator) {
 
 TEST_P(InstrAnalysisTest, IsCall) {
   EXPECT_FALSE(Analysis->isCall(beq()));
+  EXPECT_FALSE(Analysis->isCall(b()));
   EXPECT_TRUE(Analysis->isCall(bl()));
   EXPECT_TRUE(Analysis->isCall(jirl(LoongArch::R1)));
   EXPECT_FALSE(Analysis->isCall(jirl(LoongArch::R0)));
@@ -72,6 +76,7 @@ TEST_P(InstrAnalysisTest, IsCall) {
 
 TEST_P(InstrAnalysisTest, IsReturn) {
   EXPECT_FALSE(Analysis->isReturn(beq()));
+  EXPECT_FALSE(Analysis->isReturn(b()));
   EXPECT_FALSE(Analysis->isReturn(bl()));
   EXPECT_TRUE(Analysis->isReturn(jirl(LoongArch::R0, LoongArch::R1)));
   EXPECT_FALSE(Analysis->isReturn(jirl(LoongArch::R0)));
@@ -80,14 +85,26 @@ TEST_P(InstrAnalysisTest, IsReturn) {
 
 TEST_P(InstrAnalysisTest, IsBranch) {
   EXPECT_TRUE(Analysis->isBranch(beq()));
+  EXPECT_TRUE(Analysis->isBranch(b()));
   EXPECT_FALSE(Analysis->isBranch(bl()));
   EXPECT_TRUE(Analysis->isBranch(jirl(LoongArch::R0)));
   EXPECT_FALSE(Analysis->isBranch(jirl(LoongArch::R1)));
   EXPECT_FALSE(Analysis->isBranch(jirl(LoongArch::R0, LoongArch::R1)));
 }
 
+TEST_P(InstrAnalysisTest, IsConditionalBranch) {
+  EXPECT_TRUE(Analysis->isConditionalBranch(beq()));
+  // FIXME: Instr 'b' is not a ConditionalBranch, so the analysis here is
+  // wrong. The following patch will fix it.
+  EXPECT_TRUE(Analysis->isConditionalBranch(b()));
+  EXPECT_FALSE(Analysis->isConditionalBranch(bl()));
+}
+
 TEST_P(InstrAnalysisTest, IsUnconditionalBranch) {
   EXPECT_FALSE(Analysis->isUnconditionalBranch(beq()));
+  // FIXME: Instr 'b' is an UnconditionalBranch, so the analysis here is
+  // wrong. The following patch will fix it.
+  EXPECT_FALSE(Analysis->isUnconditionalBranch(b()));
   EXPECT_FALSE(Analysis->isUnconditionalBranch(bl()));
   EXPECT_TRUE(Analysis->isUnconditionalBranch(jirl(LoongArch::R0)));
   EXPECT_FALSE(Analysis->isUnconditionalBranch(jirl(LoongArch::R1)));
@@ -97,6 +114,7 @@ TEST_P(InstrAnalysisTest, IsUnconditionalBranch) {
 
 TEST_P(InstrAnalysisTest, IsIndirectBranch) {
   EXPECT_FALSE(Analysis->isIndirectBranch(beq()));
+  EXPECT_FALSE(Analysis->isIndirectBranch(b()));
   EXPECT_FALSE(Analysis->isIndirectBranch(bl()));
   EXPECT_TRUE(Analysis->isIndirectBranch(jirl(LoongArch::R0)));
   EXPECT_FALSE(Analysis->isIndirectBranch(jirl(LoongArch::R1)));
-- 
Gitee


From 034d4087be71c54248fff1bf7eae66291671776a Mon Sep 17 00:00:00 2001
From: ZhaoQi <zhaoqi01@loongson.cn>
Date: Thu, 16 Nov 2023 14:01:58 +0800
Subject: [PATCH 10/20] [LoongArch] Set isBarrier to true for instruction 'b'
 (#72339)

The instruction "b offs26" is an unconditional branch in LoongArch. Set
isBarrier to 1 in tablegen for it, so that MCInstrAnalysis can classify
it correctly.

Fixes https://github.com/llvm/llvm-project/pull/71903.

(cherry picked from commit 42a4d5e8cab1537515d92ed56d6e17b673ed352f)
---
 llvm/lib/Target/LoongArch/LoongArchInstrInfo.td         | 1 +
 llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp | 8 ++------
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index 166379d7d592..05ae36a9781d 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -586,6 +586,7 @@ class Br_I26<bits<32> op>
     : FmtI26<op, (outs), (ins simm26_b:$imm26), "$imm26"> {
   let isBranch = 1;
   let isTerminator = 1;
+  let isBarrier = 1;
 }
 } // hasSideEffects = 0, mayLoad = 0, mayStore = 0
 
diff --git a/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp b/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp
index 6e1919fc2261..468ee79615d6 100644
--- a/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp
+++ b/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp
@@ -94,17 +94,13 @@ TEST_P(InstrAnalysisTest, IsBranch) {
 
 TEST_P(InstrAnalysisTest, IsConditionalBranch) {
   EXPECT_TRUE(Analysis->isConditionalBranch(beq()));
-  // FIXME: Instr 'b' is not a ConditionalBranch, so the analysis here is
-  // wrong. The following patch will fix it.
-  EXPECT_TRUE(Analysis->isConditionalBranch(b()));
+  EXPECT_FALSE(Analysis->isConditionalBranch(b()));
   EXPECT_FALSE(Analysis->isConditionalBranch(bl()));
 }
 
 TEST_P(InstrAnalysisTest, IsUnconditionalBranch) {
   EXPECT_FALSE(Analysis->isUnconditionalBranch(beq()));
-  // FIXME: Instr 'b' is an UnconditionalBranch, so the analysis here is
-  // wrong. The following patch will fix it.
-  EXPECT_FALSE(Analysis->isUnconditionalBranch(b()));
+  EXPECT_TRUE(Analysis->isUnconditionalBranch(b()));
   EXPECT_FALSE(Analysis->isUnconditionalBranch(bl()));
   EXPECT_TRUE(Analysis->isUnconditionalBranch(jirl(LoongArch::R0)));
   EXPECT_FALSE(Analysis->isUnconditionalBranch(jirl(LoongArch::R1)));
-- 
Gitee


From 701109dc419b8d07cd5254268d848dee1278b9ad Mon Sep 17 00:00:00 2001
From: ZhaoQi <zhaoqi01@loongson.cn>
Date: Tue, 21 Nov 2023 08:34:52 +0800
Subject: [PATCH 11/20] [LoongArch][MC] Pre-commit tests for instr bl fixupkind
 testing (#72826)

This patch tests whether the fixupkind for bl can be returned correctly.
When BL has target-flags(loongarch-call), there is no error. Without
this flag, however, an assertion failure is triggered, so the test is
tagged as "Expectedly Failed" for now, until the following patch fixes
it.

(cherry picked from commit 2ca028ce7c6de5f1350440012355a65383b8729a)
---
 .../CodeGen/LoongArch/test_bl_fixupkind.mir   | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir

diff --git a/llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir b/llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir
new file mode 100644
index 000000000000..2c1d41be7711
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir
@@ -0,0 +1,66 @@
+## Tagged as "Expectedly Failed" until the following patch fix it
+# XFAIL: *
+# RUN: llc --mtriple=loongarch64 --filetype=obj %s -o - | \
+# RUN: llvm-objdump -d - | FileCheck %s
+
+# REQUIRES: asserts
+
+## Check that bl can get fixupkind correctly.
+## When BL has target-flags(loongarch-call), there is no error. But without
+## this flag, an assertion error will appear:
+## Assertion `FixupKind != LoongArch::fixup_loongarch_invalid && "Unhandled expression!"' failed.
+
+--- |
+  target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
+  target triple = "loongarch64"
+  
+  define dso_local void @test_bl_fixupkind_with_flag() {
+  ; CHECK-LABEL: test_bl_fixupkind_with_flag
+  ; CHECK:         addi.d $sp, $sp, -16
+  ; CHECK-NEXT:    st.d $ra, $sp, 8
+  ; CHECK-NEXT:    bl 0 <test_bl_fixupkind_with_flag+0x8>
+  ; CHECK-NEXT:    ld.d $ra, $sp, 8
+  ; CHECK-NEXT:    addi.d $sp, $sp, 16
+  ; CHECK-NEXT:    ret
+  entry:
+    call void @foo()
+    ret void
+  }
+  
+  define dso_local void @test_bl_fixupkind_without_flag() {
+  ; CHECK-LABEL: test_bl_fixupkind_without_flag
+  ; CHECK:         addi.d $sp, $sp, -16
+  ; CHECK-NEXT:    st.d $ra, $sp, 8
+  ; CHECK-NEXT:    bl 0 <test_bl_fixupkind_without_flag+0x8>
+  ; CHECK-NEXT:    ld.d $ra, $sp, 8
+  ; CHECK-NEXT:    addi.d $sp, $sp, 16
+  ; CHECK-NEXT:    ret
+  entry:
+    call void @foo()
+    ret void
+  }
+  
+  declare dso_local void @foo(...)
+...
+---
+name:            test_bl_fixupkind_with_flag
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $r3, implicit $r3
+    BL target-flags(loongarch-call) @foo, csr_ilp32d_lp64d, implicit-def $r1, implicit-def dead $r1, implicit-def $r3
+    ADJCALLSTACKUP 0, 0, implicit-def dead $r3, implicit $r3
+    PseudoRET
+
+...
+---
+name:            test_bl_fixupkind_without_flag
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $r3, implicit $r3
+    BL @foo, csr_ilp32d_lp64d, implicit-def $r1, implicit-def dead $r1, implicit-def $r3
+    ADJCALLSTACKUP 0, 0, implicit-def dead $r3, implicit $r3
+    PseudoRET
+
+...
-- 
Gitee


From a5bf03107b8738b0fab521d7718bed863056134b Mon Sep 17 00:00:00 2001
From: ZhaoQi <zhaoqi01@loongson.cn>
Date: Tue, 21 Nov 2023 19:00:29 +0800
Subject: [PATCH 12/20] [LoongArch][MC] Support to get the FixupKind for BL
 (#72938)

Previously, bolt could not get the FixupKind for BL correctly, because
bolt cannot get target-flags for BL. Add support for this in
MCCodeEmitter.

Fixes https://github.com/llvm/llvm-project/pull/72826.

(cherry picked from commit 775d2f3201cf7fb657aaf58d1b37c130bd9eb8f9)
---
 .../LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp     | 1 +
 llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir         | 8 ++------
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
index 08c0820cb862..09d92ac9aa3a 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
@@ -263,6 +263,7 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO,
       FixupKind = LoongArch::fixup_loongarch_b21;
       break;
     case LoongArch::B:
+    case LoongArch::BL:
       FixupKind = LoongArch::fixup_loongarch_b26;
       break;
     }
diff --git a/llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir b/llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir
index 2c1d41be7711..70cd5fb8d7eb 100644
--- a/llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir
+++ b/llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir
@@ -1,14 +1,10 @@
-## Tagged as "Expectedly Failed" until the following patch fix it
-# XFAIL: *
 # RUN: llc --mtriple=loongarch64 --filetype=obj %s -o - | \
 # RUN: llvm-objdump -d - | FileCheck %s
 
 # REQUIRES: asserts
 
-## Check that bl can get fixupkind correctly.
-## When BL has target-flags(loongarch-call), there is no error. But without
-## this flag, an assertion error will appear:
-## Assertion `FixupKind != LoongArch::fixup_loongarch_invalid && "Unhandled expression!"' failed.
+## Check that bl can get fixupkind correctly, whether BL contains
+## target-flags(loongarch-call) or not.
 
 --- |
   target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
-- 
Gitee


From 20421e57af53d963a95c6c318f71f9399d241188 Mon Sep 17 00:00:00 2001
From: ZhaoQi <zhaoqi01@loongson.cn>
Date: Thu, 23 Nov 2023 16:38:41 +0800
Subject: [PATCH 13/20] [LoongArch][MC] Modify branch evaluation for
 MCInstrAnalysis (#73205)

Function evaluateBranch() is used to compute the target address of a
given branch instruction and returns true on success. But the target
address of an indirect branch cannot be computed by simply adding the
immediate to the instruction address, so rule it out and just return
false.

This patch also adds objdump tests which capture the current state of
support for printing branch targets. Without this patch, the result of
"jirl $zero, $a0, 4" is "jirl $zero, $a0, 4 <foo+0x64>". This is
obviously incorrect, because this instruction represents an indirect
branch whose target address depends on both the register value and the
imm. After this patch, no target is printed for it, which is correct
despite the loss of detail.

(cherry picked from commit 1c68c4c57a65a67963264878bc4646be8b58854c)
---
 .../MCTargetDesc/LoongArchMCTargetDesc.cpp    |  3 +-
 .../llvm-objdump/ELF/LoongArch/branches.s     | 76 +++++++++++++++++++
 .../llvm-objdump/ELF/LoongArch/lit.local.cfg  |  2 +
 3 files changed, 80 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/tools/llvm-objdump/ELF/LoongArch/branches.s
 create mode 100644 llvm/test/tools/llvm-objdump/ELF/LoongArch/lit.local.cfg

diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
index d580c3457fec..a4e6a09863e6 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
@@ -97,7 +97,8 @@ public:
   bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
                       uint64_t &Target) const override {
     unsigned NumOps = Inst.getNumOperands();
-    if (isBranch(Inst) || Inst.getOpcode() == LoongArch::BL) {
+    if ((isBranch(Inst) && !isIndirectBranch(Inst)) ||
+        Inst.getOpcode() == LoongArch::BL) {
       Target = Addr + Inst.getOperand(NumOps - 1).getImm();
       return true;
     }
diff --git a/llvm/test/tools/llvm-objdump/ELF/LoongArch/branches.s b/llvm/test/tools/llvm-objdump/ELF/LoongArch/branches.s
new file mode 100644
index 000000000000..8cb00aef9954
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/ELF/LoongArch/branches.s
@@ -0,0 +1,76 @@
+# RUN: llvm-mc --triple=loongarch32 --filetype=obj < %s | \
+# RUN:   llvm-objdump -d --no-show-raw-insn - | FileCheck %s
+# RUN: llvm-mc --triple=loongarch64 --filetype=obj < %s | \
+# RUN:   llvm-objdump -d --no-show-raw-insn - | FileCheck %s
+
+# CHECK-LABEL: <foo>:
+foo:
+# CHECK: beq $a0, $a1, 108 <foo+0x6c>
+beq $a0, $a1, .Llocal
+# CHECK: bne $a0, $a1, 104 <foo+0x6c>
+bne $a0, $a1, .Llocal
+# CHECK: blt $a0, $a1, 100 <foo+0x6c>
+blt $a0, $a1, .Llocal
+# CHECK: bltu $a0, $a1, 96 <foo+0x6c>
+bltu $a0, $a1, .Llocal
+# CHECK: bge $a0, $a1, 92 <foo+0x6c>
+bge $a0, $a1, .Llocal
+# CHECK: bgeu $a0, $a1, 88 <foo+0x6c>
+bgeu $a0, $a1, .Llocal
+# CHECK: beqz $a0, 84 <foo+0x6c>
+beqz $a0, .Llocal
+# CHECK: bnez $a0, 80 <foo+0x6c>
+bnez $a0, .Llocal
+# CHECK: bceqz $fcc6, 76 <foo+0x6c>
+bceqz $fcc6, .Llocal
+# CHECK: bcnez $fcc6, 72 <foo+0x6c>
+bcnez $fcc6, .Llocal
+
+# CHECK: beq $a0, $a1, 76 <bar>
+beq $a0, $a1, bar
+# CHECK: bne $a0, $a1, 72 <bar>
+bne $a0, $a1, bar
+# CHECK: blt $a0, $a1, 68 <bar>
+blt $a0, $a1, bar
+# CHECK: bltu $a0, $a1, 64 <bar>
+bltu $a0, $a1, bar
+# CHECK: bge $a0, $a1, 60 <bar>
+bge $a0, $a1, bar
+# CHECK: bgeu $a0, $a1, 56 <bar>
+bgeu $a0, $a1, bar
+# CHECK: beqz $a0, 52 <bar>
+beqz $a0, bar
+# CHECK: bnez $a0, 48 <bar>
+bnez $a0, bar
+# CHECK: bceqz $fcc6, 44 <bar>
+bceqz $fcc6, bar
+# CHECK: bcnez $fcc6, 40 <bar>
+bcnez $fcc6, bar
+
+# CHECK: b 28 <foo+0x6c>
+b .Llocal
+# CHECK: b 32 <bar>
+b bar
+
+# CHECK: bl 20 <foo+0x6c>
+bl .Llocal
+# CHECK: bl 24 <bar>
+bl bar
+
+# CHECK: jirl $zero, $a0, 4{{$}}
+jirl $zero, $a0, 4
+# CHECK: jirl $ra, $a0, 4{{$}}
+jirl $ra, $a0, 4
+# CHECK: ret
+ret
+
+.Llocal:
+# CHECK: 6c: nop
+# CHECK: nop
+nop
+nop
+
+# CHECK-LABEL: <bar>:
+bar:
+# CHECK: 74: nop
+nop
diff --git a/llvm/test/tools/llvm-objdump/ELF/LoongArch/lit.local.cfg b/llvm/test/tools/llvm-objdump/ELF/LoongArch/lit.local.cfg
new file mode 100644
index 000000000000..cc24278acbb4
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/ELF/LoongArch/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "LoongArch" in config.root.targets:
+    config.unsupported = True
-- 
Gitee


From 0fe85205a8637c6671f423cddd41b712085232ac Mon Sep 17 00:00:00 2001
From: hev <wangrui@loongson.cn>
Date: Thu, 23 Nov 2023 15:15:26 +0800
Subject: [PATCH 14/20] [LoongArch] Precommit a test for smul with overflow
 (NFC) (#73212)

(cherry picked from commit 7414c0db962f8a5029fd44c3e0bc93d9ce20be71)
---
 .../CodeGen/LoongArch/smul-with-overflow.ll   | 118 ++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 llvm/test/CodeGen/LoongArch/smul-with-overflow.ll

diff --git a/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll b/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll
new file mode 100644
index 000000000000..a53e77e5aa4b
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch32 < %s | FileCheck %s --check-prefix=LA32
+; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s --check-prefix=LA64
+
+define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
+; LA32-LABEL: smuloi64:
+; LA32:       # %bb.0:
+; LA32-NEXT:    addi.w $sp, $sp, -16
+; LA32-NEXT:    .cfi_def_cfa_offset 16
+; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32-NEXT:    .cfi_offset 1, -4
+; LA32-NEXT:    .cfi_offset 22, -8
+; LA32-NEXT:    move $fp, $a4
+; LA32-NEXT:    st.w $zero, $sp, 4
+; LA32-NEXT:    addi.w $a4, $sp, 4
+; LA32-NEXT:    bl %plt(__mulodi4)
+; LA32-NEXT:    st.w $a1, $fp, 4
+; LA32-NEXT:    st.w $a0, $fp, 0
+; LA32-NEXT:    ld.w $a0, $sp, 4
+; LA32-NEXT:    sltu $a0, $zero, $a0
+; LA32-NEXT:    ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 16
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: smuloi64:
+; LA64:       # %bb.0:
+; LA64-NEXT:    mul.d $a3, $a0, $a1
+; LA64-NEXT:    st.d $a3, $a2, 0
+; LA64-NEXT:    mulh.d $a0, $a0, $a1
+; LA64-NEXT:    srai.d $a1, $a3, 63
+; LA64-NEXT:    xor $a0, $a0, $a1
+; LA64-NEXT:    sltu $a0, $zero, $a0
+; LA64-NEXT:    ret
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, ptr %res
+  ret i1 %obit
+}
+
+define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
+; LA32-LABEL: smuloi128:
+; LA32:       # %bb.0:
+; LA32-NEXT:    addi.w $sp, $sp, -64
+; LA32-NEXT:    .cfi_def_cfa_offset 64
+; LA32-NEXT:    st.w $ra, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT:    .cfi_offset 1, -4
+; LA32-NEXT:    .cfi_offset 22, -8
+; LA32-NEXT:    move $fp, $a2
+; LA32-NEXT:    st.w $zero, $sp, 52
+; LA32-NEXT:    ld.w $a2, $a1, 12
+; LA32-NEXT:    st.w $a2, $sp, 12
+; LA32-NEXT:    ld.w $a2, $a1, 8
+; LA32-NEXT:    st.w $a2, $sp, 8
+; LA32-NEXT:    ld.w $a2, $a1, 4
+; LA32-NEXT:    st.w $a2, $sp, 4
+; LA32-NEXT:    ld.w $a1, $a1, 0
+; LA32-NEXT:    st.w $a1, $sp, 0
+; LA32-NEXT:    ld.w $a1, $a0, 12
+; LA32-NEXT:    st.w $a1, $sp, 28
+; LA32-NEXT:    ld.w $a1, $a0, 8
+; LA32-NEXT:    st.w $a1, $sp, 24
+; LA32-NEXT:    ld.w $a1, $a0, 4
+; LA32-NEXT:    st.w $a1, $sp, 20
+; LA32-NEXT:    ld.w $a0, $a0, 0
+; LA32-NEXT:    st.w $a0, $sp, 16
+; LA32-NEXT:    addi.w $a0, $sp, 32
+; LA32-NEXT:    addi.w $a1, $sp, 16
+; LA32-NEXT:    addi.w $a2, $sp, 0
+; LA32-NEXT:    addi.w $a3, $sp, 52
+; LA32-NEXT:    bl %plt(__muloti4)
+; LA32-NEXT:    ld.w $a0, $sp, 44
+; LA32-NEXT:    st.w $a0, $fp, 12
+; LA32-NEXT:    ld.w $a0, $sp, 40
+; LA32-NEXT:    st.w $a0, $fp, 8
+; LA32-NEXT:    ld.w $a0, $sp, 36
+; LA32-NEXT:    st.w $a0, $fp, 4
+; LA32-NEXT:    ld.w $a0, $sp, 32
+; LA32-NEXT:    st.w $a0, $fp, 0
+; LA32-NEXT:    ld.w $a0, $sp, 52
+; LA32-NEXT:    sltu $a0, $zero, $a0
+; LA32-NEXT:    ld.w $fp, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 64
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: smuloi128:
+; LA64:       # %bb.0:
+; LA64-NEXT:    addi.d $sp, $sp, -32
+; LA64-NEXT:    .cfi_def_cfa_offset 32
+; LA64-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT:    .cfi_offset 1, -8
+; LA64-NEXT:    .cfi_offset 22, -16
+; LA64-NEXT:    move $fp, $a4
+; LA64-NEXT:    st.d $zero, $sp, 8
+; LA64-NEXT:    addi.d $a4, $sp, 8
+; LA64-NEXT:    bl %plt(__muloti4)
+; LA64-NEXT:    st.d $a1, $fp, 8
+; LA64-NEXT:    st.d $a0, $fp, 0
+; LA64-NEXT:    ld.d $a0, $sp, 8
+; LA64-NEXT:    sltu $a0, $zero, $a0
+; LA64-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 32
+; LA64-NEXT:    ret
+  %t = call {i128, i1} @llvm.smul.with.overflow.i128(i128 %v1, i128 %v2)
+  %val = extractvalue {i128, i1} %t, 0
+  %obit = extractvalue {i128, i1} %t, 1
+  store i128 %val, ptr %res
+  ret i1 %obit
+}
+
+declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone
+declare {i128, i1} @llvm.smul.with.overflow.i128(i128, i128) nounwind readnone
-- 
Gitee


From e29ff285726046ec46c9005c67ba992e3efc8ace Mon Sep 17 00:00:00 2001
From: hev <wangrui@loongson.cn>
Date: Thu, 23 Nov 2023 19:34:50 +0800
Subject: [PATCH 15/20] [LoongArch] Disable mulodi4 and muloti4 libcalls
 (#73199)

These library functions only exist in compiler-rt, not libgcc, so code
that calls them would fail to link unless we were linking with
compiler-rt.

Fixes https://github.com/ClangBuiltLinux/linux/issues/1958

(cherry picked from commit 0d9f557b6c36da3aa92daff4c0d37ea821d7ae1e)
---
 .../LoongArch/LoongArchISelLowering.cpp       |   5 +
 .../CodeGen/LoongArch/smul-with-overflow.ll   | 463 +++++++++++++++---
 2 files changed, 397 insertions(+), 71 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index f7eacd56c542..ed106cb766bc 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -152,8 +152,13 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
 
     // Set libcalls.
     setLibcallName(RTLIB::MUL_I128, nullptr);
+    // The MULO libcall is not part of libgcc, only compiler-rt.
+    setLibcallName(RTLIB::MULO_I64, nullptr);
   }
 
+  // The MULO libcall is not part of libgcc, only compiler-rt.
+  setLibcallName(RTLIB::MULO_I128, nullptr);
+
   static const ISD::CondCode FPCCToExpand[] = {
       ISD::SETOGT, ISD::SETOGE, ISD::SETUGT, ISD::SETUGE,
       ISD::SETGE,  ISD::SETNE,  ISD::SETGT};
diff --git a/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll b/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll
index a53e77e5aa4b..6cba4108d63c 100644
--- a/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll
@@ -5,23 +5,53 @@
 define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; LA32-LABEL: smuloi64:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    addi.w $sp, $sp, -16
-; LA32-NEXT:    .cfi_def_cfa_offset 16
-; LA32-NEXT:    st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $fp, $sp, 8 # 4-byte Folded Spill
-; LA32-NEXT:    .cfi_offset 1, -4
-; LA32-NEXT:    .cfi_offset 22, -8
-; LA32-NEXT:    move $fp, $a4
-; LA32-NEXT:    st.w $zero, $sp, 4
-; LA32-NEXT:    addi.w $a4, $sp, 4
-; LA32-NEXT:    bl %plt(__mulodi4)
-; LA32-NEXT:    st.w $a1, $fp, 4
-; LA32-NEXT:    st.w $a0, $fp, 0
-; LA32-NEXT:    ld.w $a0, $sp, 4
+; LA32-NEXT:    srai.w $a5, $a1, 31
+; LA32-NEXT:    mul.w $a6, $a2, $a5
+; LA32-NEXT:    mulh.wu $a7, $a2, $a5
+; LA32-NEXT:    add.w $a7, $a7, $a6
+; LA32-NEXT:    mul.w $a5, $a3, $a5
+; LA32-NEXT:    add.w $a5, $a7, $a5
+; LA32-NEXT:    srai.w $a7, $a3, 31
+; LA32-NEXT:    mul.w $t0, $a7, $a1
+; LA32-NEXT:    mulh.wu $t1, $a7, $a0
+; LA32-NEXT:    add.w $t0, $t1, $t0
+; LA32-NEXT:    mul.w $a7, $a7, $a0
+; LA32-NEXT:    add.w $t0, $t0, $a7
+; LA32-NEXT:    add.w $a5, $t0, $a5
+; LA32-NEXT:    mulh.wu $t0, $a0, $a2
+; LA32-NEXT:    mul.w $t1, $a1, $a2
+; LA32-NEXT:    add.w $t0, $t1, $t0
+; LA32-NEXT:    sltu $t1, $t0, $t1
+; LA32-NEXT:    mulh.wu $t2, $a1, $a2
+; LA32-NEXT:    add.w $t1, $t2, $t1
+; LA32-NEXT:    mul.w $t2, $a0, $a3
+; LA32-NEXT:    add.w $t0, $t2, $t0
+; LA32-NEXT:    sltu $t2, $t0, $t2
+; LA32-NEXT:    mulh.wu $t3, $a0, $a3
+; LA32-NEXT:    add.w $t2, $t3, $t2
+; LA32-NEXT:    add.w $a6, $a7, $a6
+; LA32-NEXT:    sltu $a7, $a6, $a7
+; LA32-NEXT:    add.w $a5, $a5, $a7
+; LA32-NEXT:    mul.w $a0, $a0, $a2
+; LA32-NEXT:    mul.w $a2, $a1, $a3
+; LA32-NEXT:    mulh.wu $a1, $a1, $a3
+; LA32-NEXT:    add.w $a3, $t1, $t2
+; LA32-NEXT:    sltu $a7, $a3, $t1
+; LA32-NEXT:    add.w $a1, $a1, $a7
+; LA32-NEXT:    st.w $a0, $a4, 0
+; LA32-NEXT:    add.w $a0, $a2, $a3
+; LA32-NEXT:    sltu $a2, $a0, $a2
+; LA32-NEXT:    add.w $a1, $a1, $a2
+; LA32-NEXT:    st.w $t0, $a4, 4
+; LA32-NEXT:    add.w $a1, $a1, $a5
+; LA32-NEXT:    add.w $a2, $a0, $a6
+; LA32-NEXT:    sltu $a0, $a2, $a0
+; LA32-NEXT:    add.w $a0, $a1, $a0
+; LA32-NEXT:    srai.w $a1, $t0, 31
+; LA32-NEXT:    xor $a0, $a0, $a1
+; LA32-NEXT:    xor $a1, $a2, $a1
+; LA32-NEXT:    or $a0, $a1, $a0
 ; LA32-NEXT:    sltu $a0, $zero, $a0
-; LA32-NEXT:    ld.w $fp, $sp, 8 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $ra, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: smuloi64:
@@ -43,69 +73,360 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
 define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; LA32-LABEL: smuloi128:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    addi.w $sp, $sp, -64
-; LA32-NEXT:    .cfi_def_cfa_offset 64
-; LA32-NEXT:    st.w $ra, $sp, 60 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $fp, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT:    addi.w $sp, $sp, -96
+; LA32-NEXT:    .cfi_def_cfa_offset 96
+; LA32-NEXT:    st.w $ra, $sp, 92 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 88 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s0, $sp, 84 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s1, $sp, 80 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s2, $sp, 76 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s3, $sp, 72 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s4, $sp, 68 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s5, $sp, 64 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s6, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s7, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s8, $sp, 52 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    .cfi_offset 22, -8
-; LA32-NEXT:    move $fp, $a2
-; LA32-NEXT:    st.w $zero, $sp, 52
-; LA32-NEXT:    ld.w $a2, $a1, 12
-; LA32-NEXT:    st.w $a2, $sp, 12
-; LA32-NEXT:    ld.w $a2, $a1, 8
-; LA32-NEXT:    st.w $a2, $sp, 8
-; LA32-NEXT:    ld.w $a2, $a1, 4
-; LA32-NEXT:    st.w $a2, $sp, 4
-; LA32-NEXT:    ld.w $a1, $a1, 0
-; LA32-NEXT:    st.w $a1, $sp, 0
-; LA32-NEXT:    ld.w $a1, $a0, 12
-; LA32-NEXT:    st.w $a1, $sp, 28
-; LA32-NEXT:    ld.w $a1, $a0, 8
-; LA32-NEXT:    st.w $a1, $sp, 24
-; LA32-NEXT:    ld.w $a1, $a0, 4
-; LA32-NEXT:    st.w $a1, $sp, 20
-; LA32-NEXT:    ld.w $a0, $a0, 0
-; LA32-NEXT:    st.w $a0, $sp, 16
-; LA32-NEXT:    addi.w $a0, $sp, 32
-; LA32-NEXT:    addi.w $a1, $sp, 16
-; LA32-NEXT:    addi.w $a2, $sp, 0
-; LA32-NEXT:    addi.w $a3, $sp, 52
-; LA32-NEXT:    bl %plt(__muloti4)
-; LA32-NEXT:    ld.w $a0, $sp, 44
-; LA32-NEXT:    st.w $a0, $fp, 12
-; LA32-NEXT:    ld.w $a0, $sp, 40
-; LA32-NEXT:    st.w $a0, $fp, 8
-; LA32-NEXT:    ld.w $a0, $sp, 36
-; LA32-NEXT:    st.w $a0, $fp, 4
-; LA32-NEXT:    ld.w $a0, $sp, 32
-; LA32-NEXT:    st.w $a0, $fp, 0
-; LA32-NEXT:    ld.w $a0, $sp, 52
+; LA32-NEXT:    .cfi_offset 23, -12
+; LA32-NEXT:    .cfi_offset 24, -16
+; LA32-NEXT:    .cfi_offset 25, -20
+; LA32-NEXT:    .cfi_offset 26, -24
+; LA32-NEXT:    .cfi_offset 27, -28
+; LA32-NEXT:    .cfi_offset 28, -32
+; LA32-NEXT:    .cfi_offset 29, -36
+; LA32-NEXT:    .cfi_offset 30, -40
+; LA32-NEXT:    .cfi_offset 31, -44
+; LA32-NEXT:    st.w $a2, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT:    ld.w $a6, $a1, 0
+; LA32-NEXT:    ld.w $a7, $a0, 0
+; LA32-NEXT:    mulh.wu $a3, $a7, $a6
+; LA32-NEXT:    ld.w $a5, $a0, 4
+; LA32-NEXT:    mul.w $a4, $a5, $a6
+; LA32-NEXT:    add.w $a3, $a4, $a3
+; LA32-NEXT:    sltu $a4, $a3, $a4
+; LA32-NEXT:    mulh.wu $t0, $a5, $a6
+; LA32-NEXT:    add.w $a4, $t0, $a4
+; LA32-NEXT:    ld.w $t0, $a1, 4
+; LA32-NEXT:    mul.w $t1, $a7, $t0
+; LA32-NEXT:    add.w $a3, $t1, $a3
+; LA32-NEXT:    st.w $a3, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT:    sltu $t1, $a3, $t1
+; LA32-NEXT:    mulh.wu $t2, $a7, $t0
+; LA32-NEXT:    add.w $t1, $t2, $t1
+; LA32-NEXT:    ld.w $t4, $a0, 12
+; LA32-NEXT:    ld.w $t2, $a0, 8
+; LA32-NEXT:    ld.w $t3, $a1, 8
+; LA32-NEXT:    mulh.wu $a0, $t2, $t3
+; LA32-NEXT:    mul.w $t5, $t4, $t3
+; LA32-NEXT:    add.w $a0, $t5, $a0
+; LA32-NEXT:    sltu $t5, $a0, $t5
+; LA32-NEXT:    mulh.wu $t6, $t4, $t3
+; LA32-NEXT:    add.w $t5, $t6, $t5
+; LA32-NEXT:    ld.w $t7, $a1, 12
+; LA32-NEXT:    mul.w $a1, $t2, $t7
+; LA32-NEXT:    add.w $a0, $a1, $a0
+; LA32-NEXT:    st.w $a0, $sp, 48 # 4-byte Folded Spill
+; LA32-NEXT:    sltu $a1, $a0, $a1
+; LA32-NEXT:    mulh.wu $t6, $t2, $t7
+; LA32-NEXT:    add.w $t6, $t6, $a1
+; LA32-NEXT:    srai.w $s7, $t4, 31
+; LA32-NEXT:    mul.w $a1, $s7, $t7
+; LA32-NEXT:    mulh.wu $t8, $s7, $t3
+; LA32-NEXT:    add.w $t8, $t8, $a1
+; LA32-NEXT:    mulh.wu $fp, $a6, $s7
+; LA32-NEXT:    mul.w $s6, $t0, $s7
+; LA32-NEXT:    add.w $s8, $s6, $fp
+; LA32-NEXT:    mul.w $a1, $a6, $s7
+; LA32-NEXT:    add.w $ra, $a1, $s8
+; LA32-NEXT:    sltu $s0, $ra, $a1
+; LA32-NEXT:    add.w $a0, $fp, $s0
+; LA32-NEXT:    add.w $a3, $a4, $t1
+; LA32-NEXT:    st.w $a3, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT:    sltu $a4, $a3, $a4
+; LA32-NEXT:    mulh.wu $t1, $a5, $t0
+; LA32-NEXT:    add.w $a3, $t1, $a4
+; LA32-NEXT:    st.w $a3, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT:    srai.w $s4, $t7, 31
+; LA32-NEXT:    mul.w $fp, $a7, $s4
+; LA32-NEXT:    mulh.wu $a4, $a7, $s4
+; LA32-NEXT:    add.w $s1, $a4, $fp
+; LA32-NEXT:    sltu $s0, $s1, $fp
+; LA32-NEXT:    add.w $s5, $a4, $s0
+; LA32-NEXT:    mul.w $a4, $s7, $t3
+; LA32-NEXT:    add.w $t8, $t8, $a4
+; LA32-NEXT:    add.w $s0, $ra, $t8
+; LA32-NEXT:    add.w $a3, $a1, $a4
+; LA32-NEXT:    st.w $a3, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT:    sltu $a4, $a3, $a1
+; LA32-NEXT:    add.w $a3, $s0, $a4
+; LA32-NEXT:    st.w $a3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT:    add.w $s3, $t5, $t6
+; LA32-NEXT:    sltu $a4, $s3, $t5
+; LA32-NEXT:    mulh.wu $t5, $t4, $t7
+; LA32-NEXT:    add.w $a3, $t5, $a4
+; LA32-NEXT:    st.w $a3, $sp, 16 # 4-byte Folded Spill
+; LA32-NEXT:    mul.w $a4, $a7, $a6
+; LA32-NEXT:    st.w $a4, $a2, 0
+; LA32-NEXT:    sltu $a4, $s8, $s6
+; LA32-NEXT:    mulh.wu $t5, $t0, $s7
+; LA32-NEXT:    add.w $a4, $t5, $a4
+; LA32-NEXT:    add.w $t1, $a4, $a0
+; LA32-NEXT:    sltu $a4, $t1, $a4
+; LA32-NEXT:    add.w $s2, $t5, $a4
+; LA32-NEXT:    mulh.wu $a4, $a7, $t3
+; LA32-NEXT:    mul.w $t5, $a5, $t3
+; LA32-NEXT:    add.w $a4, $t5, $a4
+; LA32-NEXT:    sltu $t5, $a4, $t5
+; LA32-NEXT:    mulh.wu $t6, $a5, $t3
+; LA32-NEXT:    add.w $a3, $t6, $t5
+; LA32-NEXT:    mul.w $t6, $a7, $t7
+; LA32-NEXT:    add.w $t5, $t6, $a4
+; LA32-NEXT:    sltu $a4, $t5, $t6
+; LA32-NEXT:    mulh.wu $t6, $a7, $t7
+; LA32-NEXT:    add.w $a4, $t6, $a4
+; LA32-NEXT:    mulh.wu $t6, $t2, $a6
+; LA32-NEXT:    mul.w $s7, $t4, $a6
+; LA32-NEXT:    add.w $t6, $s7, $t6
+; LA32-NEXT:    sltu $s7, $t6, $s7
+; LA32-NEXT:    mulh.wu $s8, $t4, $a6
+; LA32-NEXT:    add.w $a0, $s8, $s7
+; LA32-NEXT:    mul.w $s7, $t2, $t0
+; LA32-NEXT:    add.w $t6, $s7, $t6
+; LA32-NEXT:    sltu $s7, $t6, $s7
+; LA32-NEXT:    mulh.wu $s8, $t2, $t0
+; LA32-NEXT:    add.w $a2, $s8, $s7
+; LA32-NEXT:    mul.w $s8, $a5, $s4
+; LA32-NEXT:    add.w $s7, $s1, $s8
+; LA32-NEXT:    add.w $s1, $s7, $ra
+; LA32-NEXT:    add.w $a1, $fp, $a1
+; LA32-NEXT:    st.w $a1, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT:    sltu $ra, $a1, $fp
+; LA32-NEXT:    add.w $a1, $s1, $ra
+; LA32-NEXT:    st.w $a1, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT:    xor $s0, $a1, $s7
+; LA32-NEXT:    sltui $s0, $s0, 1
+; LA32-NEXT:    sltu $a1, $a1, $s7
+; LA32-NEXT:    masknez $s1, $a1, $s0
+; LA32-NEXT:    maskeqz $s0, $ra, $s0
+; LA32-NEXT:    add.w $t1, $s6, $t1
+; LA32-NEXT:    sltu $s6, $t1, $s6
+; LA32-NEXT:    add.w $s2, $s2, $s6
+; LA32-NEXT:    add.w $a2, $a0, $a2
+; LA32-NEXT:    sltu $a0, $a2, $a0
+; LA32-NEXT:    mulh.wu $s6, $t4, $t0
+; LA32-NEXT:    add.w $t8, $s6, $a0
+; LA32-NEXT:    add.w $a4, $a3, $a4
+; LA32-NEXT:    sltu $a3, $a4, $a3
+; LA32-NEXT:    mulh.wu $s6, $a5, $t7
+; LA32-NEXT:    add.w $a3, $s6, $a3
+; LA32-NEXT:    mul.w $s6, $t4, $t7
+; LA32-NEXT:    mul.w $t7, $a5, $t7
+; LA32-NEXT:    mul.w $ra, $t4, $t0
+; LA32-NEXT:    mul.w $t0, $a5, $t0
+; LA32-NEXT:    mul.w $t4, $t4, $s4
+; LA32-NEXT:    mul.w $a7, $a7, $t3
+; LA32-NEXT:    mul.w $a6, $t2, $a6
+; LA32-NEXT:    mul.w $t3, $t2, $t3
+; LA32-NEXT:    mul.w $a0, $t2, $s4
+; LA32-NEXT:    mulh.wu $t2, $t2, $s4
+; LA32-NEXT:    mulh.wu $a5, $s4, $a5
+; LA32-NEXT:    sltu $s4, $s7, $s8
+; LA32-NEXT:    add.w $s4, $a5, $s4
+; LA32-NEXT:    add.w $s4, $s5, $s4
+; LA32-NEXT:    sltu $s5, $s4, $s5
+; LA32-NEXT:    add.w $s5, $a5, $s5
+; LA32-NEXT:    ld.w $a1, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT:    add.w $a1, $t0, $a1
+; LA32-NEXT:    sltu $a5, $a1, $t0
+; LA32-NEXT:    ld.w $t0, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT:    add.w $t0, $t0, $a5
+; LA32-NEXT:    or $s0, $s0, $s1
+; LA32-NEXT:    add.w $a4, $t7, $a4
+; LA32-NEXT:    sltu $a5, $a4, $t7
+; LA32-NEXT:    add.w $t7, $a3, $a5
+; LA32-NEXT:    add.w $s1, $ra, $a2
+; LA32-NEXT:    sltu $a2, $s1, $ra
+; LA32-NEXT:    add.w $t8, $t8, $a2
+; LA32-NEXT:    add.w $a5, $s6, $s3
+; LA32-NEXT:    sltu $a2, $a5, $s6
+; LA32-NEXT:    ld.w $a3, $sp, 16 # 4-byte Folded Reload
+; LA32-NEXT:    add.w $a2, $a3, $a2
+; LA32-NEXT:    ld.w $s6, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $a3, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT:    st.w $a3, $s6, 4
+; LA32-NEXT:    ld.w $a3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT:    add.w $a3, $s2, $a3
+; LA32-NEXT:    ld.w $s2, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT:    add.w $s2, $t1, $s2
+; LA32-NEXT:    sltu $t1, $s2, $t1
+; LA32-NEXT:    add.w $a3, $a3, $t1
+; LA32-NEXT:    add.w $t1, $s8, $s4
+; LA32-NEXT:    sltu $s3, $t1, $s8
+; LA32-NEXT:    add.w $s3, $s5, $s3
+; LA32-NEXT:    add.w $t2, $t2, $a0
+; LA32-NEXT:    add.w $t2, $t2, $t4
+; LA32-NEXT:    add.w $t2, $t2, $s7
+; LA32-NEXT:    add.w $t4, $a0, $fp
+; LA32-NEXT:    sltu $a0, $t4, $a0
+; LA32-NEXT:    add.w $a0, $t2, $a0
+; LA32-NEXT:    add.w $a0, $s3, $a0
+; LA32-NEXT:    add.w $t2, $t1, $t4
+; LA32-NEXT:    sltu $t1, $t2, $t1
+; LA32-NEXT:    add.w $a0, $a0, $t1
+; LA32-NEXT:    add.w $a0, $a0, $a3
+; LA32-NEXT:    add.w $t1, $t2, $s2
+; LA32-NEXT:    sltu $a3, $t1, $t2
+; LA32-NEXT:    add.w $a0, $a0, $a3
+; LA32-NEXT:    add.w $a3, $t6, $t0
+; LA32-NEXT:    add.w $a1, $a6, $a1
+; LA32-NEXT:    sltu $a6, $a1, $a6
+; LA32-NEXT:    add.w $t0, $a3, $a6
+; LA32-NEXT:    add.w $a1, $a7, $a1
+; LA32-NEXT:    sltu $a7, $a1, $a7
+; LA32-NEXT:    add.w $a3, $t5, $t0
+; LA32-NEXT:    add.w $a3, $a3, $a7
+; LA32-NEXT:    sltu $t2, $a3, $t5
+; LA32-NEXT:    xor $t4, $a3, $t5
+; LA32-NEXT:    sltui $t4, $t4, 1
+; LA32-NEXT:    masknez $t2, $t2, $t4
+; LA32-NEXT:    maskeqz $a7, $a7, $t4
+; LA32-NEXT:    st.w $a1, $s6, 8
+; LA32-NEXT:    or $a1, $a7, $t2
+; LA32-NEXT:    sltu $a7, $t0, $t6
+; LA32-NEXT:    xor $t0, $t0, $t6
+; LA32-NEXT:    sltui $t0, $t0, 1
+; LA32-NEXT:    masknez $a7, $a7, $t0
+; LA32-NEXT:    maskeqz $a6, $a6, $t0
+; LA32-NEXT:    or $a6, $a6, $a7
+; LA32-NEXT:    add.w $a6, $s1, $a6
+; LA32-NEXT:    sltu $a7, $a6, $s1
+; LA32-NEXT:    add.w $a7, $t8, $a7
+; LA32-NEXT:    add.w $a1, $a4, $a1
+; LA32-NEXT:    sltu $a4, $a1, $a4
+; LA32-NEXT:    add.w $a4, $t7, $a4
+; LA32-NEXT:    add.w $t0, $t1, $s0
+; LA32-NEXT:    sltu $t1, $t0, $t1
+; LA32-NEXT:    add.w $a0, $a0, $t1
+; LA32-NEXT:    st.w $a3, $s6, 12
+; LA32-NEXT:    add.w $a1, $a6, $a1
+; LA32-NEXT:    sltu $a6, $a1, $a6
+; LA32-NEXT:    add.w $a4, $a7, $a4
+; LA32-NEXT:    add.w $a4, $a4, $a6
+; LA32-NEXT:    sltu $t1, $a4, $a7
+; LA32-NEXT:    xor $a7, $a4, $a7
+; LA32-NEXT:    sltui $a7, $a7, 1
+; LA32-NEXT:    masknez $t1, $t1, $a7
+; LA32-NEXT:    maskeqz $a6, $a6, $a7
+; LA32-NEXT:    or $a6, $a6, $t1
+; LA32-NEXT:    add.w $a6, $a5, $a6
+; LA32-NEXT:    sltu $a5, $a6, $a5
+; LA32-NEXT:    add.w $a2, $a2, $a5
+; LA32-NEXT:    ld.w $t1, $sp, 48 # 4-byte Folded Reload
+; LA32-NEXT:    add.w $a4, $t1, $a4
+; LA32-NEXT:    add.w $a1, $t3, $a1
+; LA32-NEXT:    sltu $a5, $a1, $t3
+; LA32-NEXT:    add.w $a4, $a4, $a5
+; LA32-NEXT:    sltu $a7, $a4, $t1
+; LA32-NEXT:    xor $t1, $a4, $t1
+; LA32-NEXT:    sltui $t1, $t1, 1
+; LA32-NEXT:    masknez $a7, $a7, $t1
+; LA32-NEXT:    maskeqz $a5, $a5, $t1
+; LA32-NEXT:    or $a5, $a5, $a7
+; LA32-NEXT:    add.w $a5, $a6, $a5
+; LA32-NEXT:    sltu $a6, $a5, $a6
+; LA32-NEXT:    add.w $a2, $a2, $a6
+; LA32-NEXT:    add.w $a0, $a2, $a0
+; LA32-NEXT:    add.w $a2, $a5, $t0
+; LA32-NEXT:    sltu $a5, $a2, $a5
+; LA32-NEXT:    add.w $a0, $a0, $a5
+; LA32-NEXT:    ld.w $a5, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT:    add.w $a5, $a1, $a5
+; LA32-NEXT:    sltu $a1, $a5, $a1
+; LA32-NEXT:    ld.w $a6, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT:    add.w $a6, $a4, $a6
+; LA32-NEXT:    add.w $a6, $a6, $a1
+; LA32-NEXT:    sltu $a7, $a6, $a4
+; LA32-NEXT:    xor $a4, $a6, $a4
+; LA32-NEXT:    sltui $a4, $a4, 1
+; LA32-NEXT:    masknez $a7, $a7, $a4
+; LA32-NEXT:    maskeqz $a1, $a1, $a4
+; LA32-NEXT:    or $a1, $a1, $a7
+; LA32-NEXT:    add.w $a1, $a2, $a1
+; LA32-NEXT:    sltu $a2, $a1, $a2
+; LA32-NEXT:    add.w $a0, $a0, $a2
+; LA32-NEXT:    srai.w $a2, $a3, 31
+; LA32-NEXT:    xor $a3, $a6, $a2
+; LA32-NEXT:    xor $a0, $a0, $a2
+; LA32-NEXT:    or $a0, $a3, $a0
+; LA32-NEXT:    xor $a3, $a5, $a2
+; LA32-NEXT:    xor $a1, $a1, $a2
+; LA32-NEXT:    or $a1, $a3, $a1
+; LA32-NEXT:    or $a0, $a1, $a0
 ; LA32-NEXT:    sltu $a0, $zero, $a0
-; LA32-NEXT:    ld.w $fp, $sp, 56 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $ra, $sp, 60 # 4-byte Folded Reload
-; LA32-NEXT:    addi.w $sp, $sp, 64
+; LA32-NEXT:    ld.w $s8, $sp, 52 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s7, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s6, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s5, $sp, 64 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s4, $sp, 68 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s3, $sp, 72 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s2, $sp, 76 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s1, $sp, 80 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s0, $sp, 84 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $fp, $sp, 88 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 92 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 96
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: smuloi128:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -32
-; LA64-NEXT:    .cfi_def_cfa_offset 32
-; LA64-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
-; LA64-NEXT:    .cfi_offset 1, -8
-; LA64-NEXT:    .cfi_offset 22, -16
-; LA64-NEXT:    move $fp, $a4
-; LA64-NEXT:    st.d $zero, $sp, 8
-; LA64-NEXT:    addi.d $a4, $sp, 8
-; LA64-NEXT:    bl %plt(__muloti4)
-; LA64-NEXT:    st.d $a1, $fp, 8
-; LA64-NEXT:    st.d $a0, $fp, 0
-; LA64-NEXT:    ld.d $a0, $sp, 8
+; LA64-NEXT:    srai.d $a5, $a1, 63
+; LA64-NEXT:    mul.d $a6, $a2, $a5
+; LA64-NEXT:    mulh.du $a7, $a2, $a5
+; LA64-NEXT:    add.d $a7, $a7, $a6
+; LA64-NEXT:    mul.d $a5, $a3, $a5
+; LA64-NEXT:    add.d $a5, $a7, $a5
+; LA64-NEXT:    srai.d $a7, $a3, 63
+; LA64-NEXT:    mul.d $t0, $a7, $a1
+; LA64-NEXT:    mulh.du $t1, $a7, $a0
+; LA64-NEXT:    add.d $t0, $t1, $t0
+; LA64-NEXT:    mul.d $a7, $a7, $a0
+; LA64-NEXT:    add.d $t0, $t0, $a7
+; LA64-NEXT:    add.d $a5, $t0, $a5
+; LA64-NEXT:    mulh.du $t0, $a0, $a2
+; LA64-NEXT:    mul.d $t1, $a1, $a2
+; LA64-NEXT:    add.d $t0, $t1, $t0
+; LA64-NEXT:    sltu $t1, $t0, $t1
+; LA64-NEXT:    mulh.du $t2, $a1, $a2
+; LA64-NEXT:    add.d $t1, $t2, $t1
+; LA64-NEXT:    mul.d $t2, $a0, $a3
+; LA64-NEXT:    add.d $t0, $t2, $t0
+; LA64-NEXT:    sltu $t2, $t0, $t2
+; LA64-NEXT:    mulh.du $t3, $a0, $a3
+; LA64-NEXT:    add.d $t2, $t3, $t2
+; LA64-NEXT:    add.d $a6, $a7, $a6
+; LA64-NEXT:    sltu $a7, $a6, $a7
+; LA64-NEXT:    add.d $a5, $a5, $a7
+; LA64-NEXT:    mul.d $a0, $a0, $a2
+; LA64-NEXT:    mul.d $a2, $a1, $a3
+; LA64-NEXT:    mulh.du $a1, $a1, $a3
+; LA64-NEXT:    add.d $a3, $t1, $t2
+; LA64-NEXT:    sltu $a7, $a3, $t1
+; LA64-NEXT:    add.d $a1, $a1, $a7
+; LA64-NEXT:    st.d $a0, $a4, 0
+; LA64-NEXT:    add.d $a0, $a2, $a3
+; LA64-NEXT:    sltu $a2, $a0, $a2
+; LA64-NEXT:    add.d $a1, $a1, $a2
+; LA64-NEXT:    st.d $t0, $a4, 8
+; LA64-NEXT:    add.d $a1, $a1, $a5
+; LA64-NEXT:    add.d $a2, $a0, $a6
+; LA64-NEXT:    sltu $a0, $a2, $a0
+; LA64-NEXT:    add.d $a0, $a1, $a0
+; LA64-NEXT:    srai.d $a1, $t0, 63
+; LA64-NEXT:    xor $a0, $a0, $a1
+; LA64-NEXT:    xor $a1, $a2, $a1
+; LA64-NEXT:    or $a0, $a1, $a0
 ; LA64-NEXT:    sltu $a0, $zero, $a0
-; LA64-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 32
 ; LA64-NEXT:    ret
   %t = call {i128, i1} @llvm.smul.with.overflow.i128(i128 %v1, i128 %v2)
   %val = extractvalue {i128, i1} %t, 0
-- 
Gitee


From 01ced6193e2abfbd50fbd9d40066cf27f9f9067b Mon Sep 17 00:00:00 2001
From: wanglei <wanglei@loongson.cn>
Date: Wed, 29 Nov 2023 15:21:21 +0800
Subject: [PATCH 16/20] [LoongArch] Fix pattern for FNMSUB_{S/D} instructions
 (#73742)

```
when a=c=-0.0, b=0.0:
-(a * b + (-c)) = -0.0
-a * b + c = 0.0
(fneg (fma a, b, (-c))) != (fma (fneg a), b, c)
```

See https://reviews.llvm.org/D90901 for a similar discussion on X86.

(cherry picked from commit 5e7e0d603204ede803323a825318e365a87f73e9)
---
 .../LoongArch/LoongArchFloat32InstrInfo.td    |   8 +-
 .../LoongArch/LoongArchFloat64InstrInfo.td    |   6 +-
 llvm/test/CodeGen/LoongArch/double-fma.ll     | 259 ++++++++++++++++--
 llvm/test/CodeGen/LoongArch/float-fma.ll      | 259 ++++++++++++++++--
 4 files changed, 483 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
index 826db54febd3..65120c083f49 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
@@ -294,8 +294,12 @@ def : Pat<(fneg (fma FPR32:$fj, FPR32:$fk, FPR32:$fa)),
 def : Pat<(fma_nsz (fneg FPR32:$fj), FPR32:$fk, (fneg FPR32:$fa)),
           (FNMADD_S FPR32:$fj, FPR32:$fk, FPR32:$fa)>;
 
-// fnmsub.s: -fj * fk + fa
-def : Pat<(fma (fneg FPR32:$fj), FPR32:$fk, FPR32:$fa),
+// fnmsub.s: -(fj * fk - fa)
+def : Pat<(fneg (fma FPR32:$fj, FPR32:$fk, (fneg FPR32:$fa))),
+          (FNMSUB_S FPR32:$fj, FPR32:$fk, FPR32:$fa)>;
+
+// fnmsub.s: -fj * fk + fa (the nsz flag on the FMA)
+def : Pat<(fma_nsz (fneg FPR32:$fj), FPR32:$fk, FPR32:$fa),
           (FNMSUB_S FPR32:$fj, FPR32:$fk, FPR32:$fa)>;
 } // Predicates = [HasBasicF]
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
index 5118474725b6..437c1e4d7be2 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
@@ -256,7 +256,11 @@ def : Pat<(fma_nsz (fneg FPR64:$fj), FPR64:$fk, (fneg FPR64:$fa)),
           (FNMADD_D FPR64:$fj, FPR64:$fk, FPR64:$fa)>;
 
 // fnmsub.d: -(fj * fk - fa)
-def : Pat<(fma (fneg FPR64:$fj), FPR64:$fk, FPR64:$fa),
+def : Pat<(fneg (fma FPR64:$fj, FPR64:$fk, (fneg FPR64:$fa))),
+          (FNMSUB_D FPR64:$fj, FPR64:$fk, FPR64:$fa)>;
+
+// fnmsub.d: -fj * fk + fa (the nsz flag on the FMA)
+def : Pat<(fma_nsz (fneg FPR64:$fj), FPR64:$fk, FPR64:$fa),
           (FNMSUB_D FPR64:$fj, FPR64:$fk, FPR64:$fa)>;
 } // Predicates = [HasBasicD]
 
diff --git a/llvm/test/CodeGen/LoongArch/double-fma.ll b/llvm/test/CodeGen/LoongArch/double-fma.ll
index 6dd628479433..58d20c62a668 100644
--- a/llvm/test/CodeGen/LoongArch/double-fma.ll
+++ b/llvm/test/CodeGen/LoongArch/double-fma.ll
@@ -236,13 +236,15 @@ define double @fnmsub_d(double %a, double %b, double %c) nounwind {
 ; LA32-CONTRACT-ON-LABEL: fnmsub_d:
 ; LA32-CONTRACT-ON:       # %bb.0:
 ; LA32-CONTRACT-ON-NEXT:    fmul.d $fa0, $fa0, $fa1
-; LA32-CONTRACT-ON-NEXT:    fsub.d $fa0, $fa2, $fa0
+; LA32-CONTRACT-ON-NEXT:    fsub.d $fa0, $fa0, $fa2
+; LA32-CONTRACT-ON-NEXT:    fneg.d $fa0, $fa0
 ; LA32-CONTRACT-ON-NEXT:    ret
 ;
 ; LA32-CONTRACT-OFF-LABEL: fnmsub_d:
 ; LA32-CONTRACT-OFF:       # %bb.0:
 ; LA32-CONTRACT-OFF-NEXT:    fmul.d $fa0, $fa0, $fa1
-; LA32-CONTRACT-OFF-NEXT:    fsub.d $fa0, $fa2, $fa0
+; LA32-CONTRACT-OFF-NEXT:    fsub.d $fa0, $fa0, $fa2
+; LA32-CONTRACT-OFF-NEXT:    fneg.d $fa0, $fa0
 ; LA32-CONTRACT-OFF-NEXT:    ret
 ;
 ; LA64-CONTRACT-FAST-LABEL: fnmsub_d:
@@ -253,12 +255,98 @@ define double @fnmsub_d(double %a, double %b, double %c) nounwind {
 ; LA64-CONTRACT-ON-LABEL: fnmsub_d:
 ; LA64-CONTRACT-ON:       # %bb.0:
 ; LA64-CONTRACT-ON-NEXT:    fmul.d $fa0, $fa0, $fa1
-; LA64-CONTRACT-ON-NEXT:    fsub.d $fa0, $fa2, $fa0
+; LA64-CONTRACT-ON-NEXT:    fsub.d $fa0, $fa0, $fa2
+; LA64-CONTRACT-ON-NEXT:    fneg.d $fa0, $fa0
 ; LA64-CONTRACT-ON-NEXT:    ret
 ;
 ; LA64-CONTRACT-OFF-LABEL: fnmsub_d:
 ; LA64-CONTRACT-OFF:       # %bb.0:
 ; LA64-CONTRACT-OFF-NEXT:    fmul.d $fa0, $fa0, $fa1
+; LA64-CONTRACT-OFF-NEXT:    fsub.d $fa0, $fa0, $fa2
+; LA64-CONTRACT-OFF-NEXT:    fneg.d $fa0, $fa0
+; LA64-CONTRACT-OFF-NEXT:    ret
+  %negc = fneg double %c
+  %mul = fmul double %a, %b
+  %add = fadd double %mul, %negc
+  %neg = fneg double %add
+  ret double %neg
+}
+
+define double @fnmsub_d_nsz(double %a, double %b, double %c) nounwind {
+; LA32-CONTRACT-FAST-LABEL: fnmsub_d_nsz:
+; LA32-CONTRACT-FAST:       # %bb.0:
+; LA32-CONTRACT-FAST-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-FAST-NEXT:    ret
+;
+; LA32-CONTRACT-ON-LABEL: fnmsub_d_nsz:
+; LA32-CONTRACT-ON:       # %bb.0:
+; LA32-CONTRACT-ON-NEXT:    fmul.d $fa0, $fa0, $fa1
+; LA32-CONTRACT-ON-NEXT:    fsub.d $fa0, $fa2, $fa0
+; LA32-CONTRACT-ON-NEXT:    ret
+;
+; LA32-CONTRACT-OFF-LABEL: fnmsub_d_nsz:
+; LA32-CONTRACT-OFF:       # %bb.0:
+; LA32-CONTRACT-OFF-NEXT:    fmul.d $fa0, $fa0, $fa1
+; LA32-CONTRACT-OFF-NEXT:    fsub.d $fa0, $fa2, $fa0
+; LA32-CONTRACT-OFF-NEXT:    ret
+;
+; LA64-CONTRACT-FAST-LABEL: fnmsub_d_nsz:
+; LA64-CONTRACT-FAST:       # %bb.0:
+; LA64-CONTRACT-FAST-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-FAST-NEXT:    ret
+;
+; LA64-CONTRACT-ON-LABEL: fnmsub_d_nsz:
+; LA64-CONTRACT-ON:       # %bb.0:
+; LA64-CONTRACT-ON-NEXT:    fmul.d $fa0, $fa0, $fa1
+; LA64-CONTRACT-ON-NEXT:    fsub.d $fa0, $fa2, $fa0
+; LA64-CONTRACT-ON-NEXT:    ret
+;
+; LA64-CONTRACT-OFF-LABEL: fnmsub_d_nsz:
+; LA64-CONTRACT-OFF:       # %bb.0:
+; LA64-CONTRACT-OFF-NEXT:    fmul.d $fa0, $fa0, $fa1
+; LA64-CONTRACT-OFF-NEXT:    fsub.d $fa0, $fa2, $fa0
+; LA64-CONTRACT-OFF-NEXT:    ret
+  %nega = fneg nsz double %a
+  %mul = fmul nsz double %nega, %b
+  %add = fadd nsz double %mul, %c
+  ret double %add
+}
+
+;; Check that fnmsub.d is not emitted.
+define double @not_fnmsub_d(double %a, double %b, double %c) nounwind {
+; LA32-CONTRACT-FAST-LABEL: not_fnmsub_d:
+; LA32-CONTRACT-FAST:       # %bb.0:
+; LA32-CONTRACT-FAST-NEXT:    fneg.d $fa0, $fa0
+; LA32-CONTRACT-FAST-NEXT:    fmadd.d $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-FAST-NEXT:    ret
+;
+; LA32-CONTRACT-ON-LABEL: not_fnmsub_d:
+; LA32-CONTRACT-ON:       # %bb.0:
+; LA32-CONTRACT-ON-NEXT:    fmul.d $fa0, $fa0, $fa1
+; LA32-CONTRACT-ON-NEXT:    fsub.d $fa0, $fa2, $fa0
+; LA32-CONTRACT-ON-NEXT:    ret
+;
+; LA32-CONTRACT-OFF-LABEL: not_fnmsub_d:
+; LA32-CONTRACT-OFF:       # %bb.0:
+; LA32-CONTRACT-OFF-NEXT:    fmul.d $fa0, $fa0, $fa1
+; LA32-CONTRACT-OFF-NEXT:    fsub.d $fa0, $fa2, $fa0
+; LA32-CONTRACT-OFF-NEXT:    ret
+;
+; LA64-CONTRACT-FAST-LABEL: not_fnmsub_d:
+; LA64-CONTRACT-FAST:       # %bb.0:
+; LA64-CONTRACT-FAST-NEXT:    fneg.d $fa0, $fa0
+; LA64-CONTRACT-FAST-NEXT:    fmadd.d $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-FAST-NEXT:    ret
+;
+; LA64-CONTRACT-ON-LABEL: not_fnmsub_d:
+; LA64-CONTRACT-ON:       # %bb.0:
+; LA64-CONTRACT-ON-NEXT:    fmul.d $fa0, $fa0, $fa1
+; LA64-CONTRACT-ON-NEXT:    fsub.d $fa0, $fa2, $fa0
+; LA64-CONTRACT-ON-NEXT:    ret
+;
+; LA64-CONTRACT-OFF-LABEL: not_fnmsub_d:
+; LA64-CONTRACT-OFF:       # %bb.0:
+; LA64-CONTRACT-OFF-NEXT:    fmul.d $fa0, $fa0, $fa1
 ; LA64-CONTRACT-OFF-NEXT:    fsub.d $fa0, $fa2, $fa0
 ; LA64-CONTRACT-OFF-NEXT:    ret
   %nega = fneg double %a
@@ -483,6 +571,86 @@ define double @contract_fnmsub_d(double %a, double %b, double %c) nounwind {
 ; LA64-CONTRACT-OFF-LABEL: contract_fnmsub_d:
 ; LA64-CONTRACT-OFF:       # %bb.0:
 ; LA64-CONTRACT-OFF-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-OFF-NEXT:    ret
+  %negc = fneg contract double %c
+  %mul = fmul contract double %a, %b
+  %add = fadd contract double %mul, %negc
+  %neg = fneg contract double %add
+  ret double %neg
+}
+
+define double @contract_fnmsub_d_nsz(double %a, double %b, double %c) nounwind {
+; LA32-CONTRACT-FAST-LABEL: contract_fnmsub_d_nsz:
+; LA32-CONTRACT-FAST:       # %bb.0:
+; LA32-CONTRACT-FAST-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-FAST-NEXT:    ret
+;
+; LA32-CONTRACT-ON-LABEL: contract_fnmsub_d_nsz:
+; LA32-CONTRACT-ON:       # %bb.0:
+; LA32-CONTRACT-ON-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-ON-NEXT:    ret
+;
+; LA32-CONTRACT-OFF-LABEL: contract_fnmsub_d_nsz:
+; LA32-CONTRACT-OFF:       # %bb.0:
+; LA32-CONTRACT-OFF-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-OFF-NEXT:    ret
+;
+; LA64-CONTRACT-FAST-LABEL: contract_fnmsub_d_nsz:
+; LA64-CONTRACT-FAST:       # %bb.0:
+; LA64-CONTRACT-FAST-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-FAST-NEXT:    ret
+;
+; LA64-CONTRACT-ON-LABEL: contract_fnmsub_d_nsz:
+; LA64-CONTRACT-ON:       # %bb.0:
+; LA64-CONTRACT-ON-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-ON-NEXT:    ret
+;
+; LA64-CONTRACT-OFF-LABEL: contract_fnmsub_d_nsz:
+; LA64-CONTRACT-OFF:       # %bb.0:
+; LA64-CONTRACT-OFF-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-OFF-NEXT:    ret
+  %nega = fneg contract nsz double %a
+  %mul = fmul contract nsz double %nega, %b
+  %add = fadd contract nsz double %mul, %c
+  ret double %add
+}
+
+;; Check that fnmsub.d is not emitted.
+define double @not_contract_fnmsub_d(double %a, double %b, double %c) nounwind {
+; LA32-CONTRACT-FAST-LABEL: not_contract_fnmsub_d:
+; LA32-CONTRACT-FAST:       # %bb.0:
+; LA32-CONTRACT-FAST-NEXT:    fneg.d $fa0, $fa0
+; LA32-CONTRACT-FAST-NEXT:    fmadd.d $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-FAST-NEXT:    ret
+;
+; LA32-CONTRACT-ON-LABEL: not_contract_fnmsub_d:
+; LA32-CONTRACT-ON:       # %bb.0:
+; LA32-CONTRACT-ON-NEXT:    fneg.d $fa0, $fa0
+; LA32-CONTRACT-ON-NEXT:    fmadd.d $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-ON-NEXT:    ret
+;
+; LA32-CONTRACT-OFF-LABEL: not_contract_fnmsub_d:
+; LA32-CONTRACT-OFF:       # %bb.0:
+; LA32-CONTRACT-OFF-NEXT:    fneg.d $fa0, $fa0
+; LA32-CONTRACT-OFF-NEXT:    fmadd.d $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-OFF-NEXT:    ret
+;
+; LA64-CONTRACT-FAST-LABEL: not_contract_fnmsub_d:
+; LA64-CONTRACT-FAST:       # %bb.0:
+; LA64-CONTRACT-FAST-NEXT:    fneg.d $fa0, $fa0
+; LA64-CONTRACT-FAST-NEXT:    fmadd.d $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-FAST-NEXT:    ret
+;
+; LA64-CONTRACT-ON-LABEL: not_contract_fnmsub_d:
+; LA64-CONTRACT-ON:       # %bb.0:
+; LA64-CONTRACT-ON-NEXT:    fneg.d $fa0, $fa0
+; LA64-CONTRACT-ON-NEXT:    fmadd.d $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-ON-NEXT:    ret
+;
+; LA64-CONTRACT-OFF-LABEL: not_contract_fnmsub_d:
+; LA64-CONTRACT-OFF:       # %bb.0:
+; LA64-CONTRACT-OFF-NEXT:    fneg.d $fa0, $fa0
+; LA64-CONTRACT-OFF-NEXT:    fmadd.d $fa0, $fa0, $fa1, $fa2
 ; LA64-CONTRACT-OFF-NEXT:    ret
   %nega = fneg contract double %a
   %mul = fmul contract double %nega, %b
@@ -592,8 +760,8 @@ define double @fnmadd_d_intrinsics(double %a, double %b, double %c) nounwind {
 ; LA64-CONTRACT-OFF-NEXT:    fnmadd.d $fa0, $fa0, $fa1, $fa2
 ; LA64-CONTRACT-OFF-NEXT:    ret
   %fma = call double @llvm.fma.f64(double %a, double %b, double %c)
-  %neg = fneg double %fma
-  ret double %neg
+  %negfma = fneg double %fma
+  ret double %negfma
 }
 
 define double @fnmadd_d_nsz_intrinsics(double %a, double %b, double %c) nounwind {
@@ -704,44 +872,87 @@ define double @fnmsub_d_intrinsics(double %a, double %b, double %c) nounwind {
 ; LA64-CONTRACT-OFF-LABEL: fnmsub_d_intrinsics:
 ; LA64-CONTRACT-OFF:       # %bb.0:
 ; LA64-CONTRACT-OFF-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-OFF-NEXT:    ret
+  %negc = fneg double %c
+  %fma = call double @llvm.fma.f64(double %a, double %b, double %negc)
+  %negfma = fneg double %fma
+  ret double %negfma
+}
+
+define double @fnmsub_d_nsz_intrinsics(double %a, double %b, double %c) nounwind {
+; LA32-CONTRACT-FAST-LABEL: fnmsub_d_nsz_intrinsics:
+; LA32-CONTRACT-FAST:       # %bb.0:
+; LA32-CONTRACT-FAST-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-FAST-NEXT:    ret
+;
+; LA32-CONTRACT-ON-LABEL: fnmsub_d_nsz_intrinsics:
+; LA32-CONTRACT-ON:       # %bb.0:
+; LA32-CONTRACT-ON-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-ON-NEXT:    ret
+;
+; LA32-CONTRACT-OFF-LABEL: fnmsub_d_nsz_intrinsics:
+; LA32-CONTRACT-OFF:       # %bb.0:
+; LA32-CONTRACT-OFF-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-OFF-NEXT:    ret
+;
+; LA64-CONTRACT-FAST-LABEL: fnmsub_d_nsz_intrinsics:
+; LA64-CONTRACT-FAST:       # %bb.0:
+; LA64-CONTRACT-FAST-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-FAST-NEXT:    ret
+;
+; LA64-CONTRACT-ON-LABEL: fnmsub_d_nsz_intrinsics:
+; LA64-CONTRACT-ON:       # %bb.0:
+; LA64-CONTRACT-ON-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-ON-NEXT:    ret
+;
+; LA64-CONTRACT-OFF-LABEL: fnmsub_d_nsz_intrinsics:
+; LA64-CONTRACT-OFF:       # %bb.0:
+; LA64-CONTRACT-OFF-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
 ; LA64-CONTRACT-OFF-NEXT:    ret
   %nega = fneg double %a
-  %fma = call double @llvm.fma.f64(double %nega, double %b, double %c)
+  %fma = call nsz double @llvm.fma.f64(double %nega, double %b, double %c)
   ret double %fma
 }
 
-define double @fnmsub_d_swap_intrinsics(double %a, double %b, double %c) nounwind {
-; LA32-CONTRACT-FAST-LABEL: fnmsub_d_swap_intrinsics:
+;; Check that fnmsub.d is not emitted.
+define double @not_fnmsub_d_intrinsics(double %a, double %b, double %c) nounwind {
+; LA32-CONTRACT-FAST-LABEL: not_fnmsub_d_intrinsics:
 ; LA32-CONTRACT-FAST:       # %bb.0:
-; LA32-CONTRACT-FAST-NEXT:    fnmsub.d $fa0, $fa1, $fa0, $fa2
+; LA32-CONTRACT-FAST-NEXT:    fneg.d $fa0, $fa0
+; LA32-CONTRACT-FAST-NEXT:    fmadd.d $fa0, $fa0, $fa1, $fa2
 ; LA32-CONTRACT-FAST-NEXT:    ret
 ;
-; LA32-CONTRACT-ON-LABEL: fnmsub_d_swap_intrinsics:
+; LA32-CONTRACT-ON-LABEL: not_fnmsub_d_intrinsics:
 ; LA32-CONTRACT-ON:       # %bb.0:
-; LA32-CONTRACT-ON-NEXT:    fnmsub.d $fa0, $fa1, $fa0, $fa2
+; LA32-CONTRACT-ON-NEXT:    fneg.d $fa0, $fa0
+; LA32-CONTRACT-ON-NEXT:    fmadd.d $fa0, $fa0, $fa1, $fa2
 ; LA32-CONTRACT-ON-NEXT:    ret
 ;
-; LA32-CONTRACT-OFF-LABEL: fnmsub_d_swap_intrinsics:
+; LA32-CONTRACT-OFF-LABEL: not_fnmsub_d_intrinsics:
 ; LA32-CONTRACT-OFF:       # %bb.0:
-; LA32-CONTRACT-OFF-NEXT:    fnmsub.d $fa0, $fa1, $fa0, $fa2
+; LA32-CONTRACT-OFF-NEXT:    fneg.d $fa0, $fa0
+; LA32-CONTRACT-OFF-NEXT:    fmadd.d $fa0, $fa0, $fa1, $fa2
 ; LA32-CONTRACT-OFF-NEXT:    ret
 ;
-; LA64-CONTRACT-FAST-LABEL: fnmsub_d_swap_intrinsics:
+; LA64-CONTRACT-FAST-LABEL: not_fnmsub_d_intrinsics:
 ; LA64-CONTRACT-FAST:       # %bb.0:
-; LA64-CONTRACT-FAST-NEXT:    fnmsub.d $fa0, $fa1, $fa0, $fa2
+; LA64-CONTRACT-FAST-NEXT:    fneg.d $fa0, $fa0
+; LA64-CONTRACT-FAST-NEXT:    fmadd.d $fa0, $fa0, $fa1, $fa2
 ; LA64-CONTRACT-FAST-NEXT:    ret
 ;
-; LA64-CONTRACT-ON-LABEL: fnmsub_d_swap_intrinsics:
+; LA64-CONTRACT-ON-LABEL: not_fnmsub_d_intrinsics:
 ; LA64-CONTRACT-ON:       # %bb.0:
-; LA64-CONTRACT-ON-NEXT:    fnmsub.d $fa0, $fa1, $fa0, $fa2
+; LA64-CONTRACT-ON-NEXT:    fneg.d $fa0, $fa0
+; LA64-CONTRACT-ON-NEXT:    fmadd.d $fa0, $fa0, $fa1, $fa2
 ; LA64-CONTRACT-ON-NEXT:    ret
 ;
-; LA64-CONTRACT-OFF-LABEL: fnmsub_d_swap_intrinsics:
+; LA64-CONTRACT-OFF-LABEL: not_fnmsub_d_intrinsics:
 ; LA64-CONTRACT-OFF:       # %bb.0:
-; LA64-CONTRACT-OFF-NEXT:    fnmsub.d $fa0, $fa1, $fa0, $fa2
+; LA64-CONTRACT-OFF-NEXT:    fneg.d $fa0, $fa0
+; LA64-CONTRACT-OFF-NEXT:    fmadd.d $fa0, $fa0, $fa1, $fa2
 ; LA64-CONTRACT-OFF-NEXT:    ret
-  %negb = fneg double %b
-  %fma = call double @llvm.fma.f64(double %a, double %negb, double %c)
+  %nega = fneg double %a
+  %fma = call double @llvm.fma.f64(double %nega, double %b, double %c)
   ret double %fma
 }
 
@@ -882,6 +1093,8 @@ define double @fnmsub_d_contract(double %a, double %b, double %c) nounwind {
 ; LA64-CONTRACT-OFF-NEXT:    fnmsub.d $fa0, $fa0, $fa1, $fa2
 ; LA64-CONTRACT-OFF-NEXT:    ret
   %mul = fmul contract double %a, %b
-  %sub = fsub contract double %c, %mul
-  ret double %sub
+  %negc = fneg contract double %c
+  %add = fadd contract double %negc, %mul
+  %negadd = fneg contract double %add
+  ret double %negadd
 }
diff --git a/llvm/test/CodeGen/LoongArch/float-fma.ll b/llvm/test/CodeGen/LoongArch/float-fma.ll
index 54dc56784006..c236255d971a 100644
--- a/llvm/test/CodeGen/LoongArch/float-fma.ll
+++ b/llvm/test/CodeGen/LoongArch/float-fma.ll
@@ -236,13 +236,15 @@ define float @fnmsub_s(float %a, float %b, float %c) nounwind {
 ; LA32-CONTRACT-ON-LABEL: fnmsub_s:
 ; LA32-CONTRACT-ON:       # %bb.0:
 ; LA32-CONTRACT-ON-NEXT:    fmul.s $fa0, $fa0, $fa1
-; LA32-CONTRACT-ON-NEXT:    fsub.s $fa0, $fa2, $fa0
+; LA32-CONTRACT-ON-NEXT:    fsub.s $fa0, $fa0, $fa2
+; LA32-CONTRACT-ON-NEXT:    fneg.s $fa0, $fa0
 ; LA32-CONTRACT-ON-NEXT:    ret
 ;
 ; LA32-CONTRACT-OFF-LABEL: fnmsub_s:
 ; LA32-CONTRACT-OFF:       # %bb.0:
 ; LA32-CONTRACT-OFF-NEXT:    fmul.s $fa0, $fa0, $fa1
-; LA32-CONTRACT-OFF-NEXT:    fsub.s $fa0, $fa2, $fa0
+; LA32-CONTRACT-OFF-NEXT:    fsub.s $fa0, $fa0, $fa2
+; LA32-CONTRACT-OFF-NEXT:    fneg.s $fa0, $fa0
 ; LA32-CONTRACT-OFF-NEXT:    ret
 ;
 ; LA64-CONTRACT-FAST-LABEL: fnmsub_s:
@@ -253,12 +255,98 @@ define float @fnmsub_s(float %a, float %b, float %c) nounwind {
 ; LA64-CONTRACT-ON-LABEL: fnmsub_s:
 ; LA64-CONTRACT-ON:       # %bb.0:
 ; LA64-CONTRACT-ON-NEXT:    fmul.s $fa0, $fa0, $fa1
-; LA64-CONTRACT-ON-NEXT:    fsub.s $fa0, $fa2, $fa0
+; LA64-CONTRACT-ON-NEXT:    fsub.s $fa0, $fa0, $fa2
+; LA64-CONTRACT-ON-NEXT:    fneg.s $fa0, $fa0
 ; LA64-CONTRACT-ON-NEXT:    ret
 ;
 ; LA64-CONTRACT-OFF-LABEL: fnmsub_s:
 ; LA64-CONTRACT-OFF:       # %bb.0:
 ; LA64-CONTRACT-OFF-NEXT:    fmul.s $fa0, $fa0, $fa1
+; LA64-CONTRACT-OFF-NEXT:    fsub.s $fa0, $fa0, $fa2
+; LA64-CONTRACT-OFF-NEXT:    fneg.s $fa0, $fa0
+; LA64-CONTRACT-OFF-NEXT:    ret
+  %negc = fneg float %c
+  %mul = fmul float %a, %b
+  %add = fadd float %mul, %negc
+  %neg = fneg float %add
+  ret float %neg
+}
+
+define float @fnmsub_s_nsz(float %a, float %b, float %c) nounwind {
+; LA32-CONTRACT-FAST-LABEL: fnmsub_s_nsz:
+; LA32-CONTRACT-FAST:       # %bb.0:
+; LA32-CONTRACT-FAST-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-FAST-NEXT:    ret
+;
+; LA32-CONTRACT-ON-LABEL: fnmsub_s_nsz:
+; LA32-CONTRACT-ON:       # %bb.0:
+; LA32-CONTRACT-ON-NEXT:    fmul.s $fa0, $fa0, $fa1
+; LA32-CONTRACT-ON-NEXT:    fsub.s $fa0, $fa2, $fa0
+; LA32-CONTRACT-ON-NEXT:    ret
+;
+; LA32-CONTRACT-OFF-LABEL: fnmsub_s_nsz:
+; LA32-CONTRACT-OFF:       # %bb.0:
+; LA32-CONTRACT-OFF-NEXT:    fmul.s $fa0, $fa0, $fa1
+; LA32-CONTRACT-OFF-NEXT:    fsub.s $fa0, $fa2, $fa0
+; LA32-CONTRACT-OFF-NEXT:    ret
+;
+; LA64-CONTRACT-FAST-LABEL: fnmsub_s_nsz:
+; LA64-CONTRACT-FAST:       # %bb.0:
+; LA64-CONTRACT-FAST-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-FAST-NEXT:    ret
+;
+; LA64-CONTRACT-ON-LABEL: fnmsub_s_nsz:
+; LA64-CONTRACT-ON:       # %bb.0:
+; LA64-CONTRACT-ON-NEXT:    fmul.s $fa0, $fa0, $fa1
+; LA64-CONTRACT-ON-NEXT:    fsub.s $fa0, $fa2, $fa0
+; LA64-CONTRACT-ON-NEXT:    ret
+;
+; LA64-CONTRACT-OFF-LABEL: fnmsub_s_nsz:
+; LA64-CONTRACT-OFF:       # %bb.0:
+; LA64-CONTRACT-OFF-NEXT:    fmul.s $fa0, $fa0, $fa1
+; LA64-CONTRACT-OFF-NEXT:    fsub.s $fa0, $fa2, $fa0
+; LA64-CONTRACT-OFF-NEXT:    ret
+  %nega = fneg nsz float %a
+  %mul = fmul nsz float %nega, %b
+  %add = fadd nsz float %mul, %c
+  ret float %add
+}
+
+;; Check that fnmsub.s is not emitted.
+define float @not_fnmsub_s(float %a, float %b, float %c) nounwind {
+; LA32-CONTRACT-FAST-LABEL: not_fnmsub_s:
+; LA32-CONTRACT-FAST:       # %bb.0:
+; LA32-CONTRACT-FAST-NEXT:    fneg.s $fa0, $fa0
+; LA32-CONTRACT-FAST-NEXT:    fmadd.s $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-FAST-NEXT:    ret
+;
+; LA32-CONTRACT-ON-LABEL: not_fnmsub_s:
+; LA32-CONTRACT-ON:       # %bb.0:
+; LA32-CONTRACT-ON-NEXT:    fmul.s $fa0, $fa0, $fa1
+; LA32-CONTRACT-ON-NEXT:    fsub.s $fa0, $fa2, $fa0
+; LA32-CONTRACT-ON-NEXT:    ret
+;
+; LA32-CONTRACT-OFF-LABEL: not_fnmsub_s:
+; LA32-CONTRACT-OFF:       # %bb.0:
+; LA32-CONTRACT-OFF-NEXT:    fmul.s $fa0, $fa0, $fa1
+; LA32-CONTRACT-OFF-NEXT:    fsub.s $fa0, $fa2, $fa0
+; LA32-CONTRACT-OFF-NEXT:    ret
+;
+; LA64-CONTRACT-FAST-LABEL: not_fnmsub_s:
+; LA64-CONTRACT-FAST:       # %bb.0:
+; LA64-CONTRACT-FAST-NEXT:    fneg.s $fa0, $fa0
+; LA64-CONTRACT-FAST-NEXT:    fmadd.s $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-FAST-NEXT:    ret
+;
+; LA64-CONTRACT-ON-LABEL: not_fnmsub_s:
+; LA64-CONTRACT-ON:       # %bb.0:
+; LA64-CONTRACT-ON-NEXT:    fmul.s $fa0, $fa0, $fa1
+; LA64-CONTRACT-ON-NEXT:    fsub.s $fa0, $fa2, $fa0
+; LA64-CONTRACT-ON-NEXT:    ret
+;
+; LA64-CONTRACT-OFF-LABEL: not_fnmsub_s:
+; LA64-CONTRACT-OFF:       # %bb.0:
+; LA64-CONTRACT-OFF-NEXT:    fmul.s $fa0, $fa0, $fa1
 ; LA64-CONTRACT-OFF-NEXT:    fsub.s $fa0, $fa2, $fa0
 ; LA64-CONTRACT-OFF-NEXT:    ret
   %nega = fneg float %a
@@ -483,6 +571,86 @@ define float @contract_fnmsub_s(float %a, float %b, float %c) nounwind {
 ; LA64-CONTRACT-OFF-LABEL: contract_fnmsub_s:
 ; LA64-CONTRACT-OFF:       # %bb.0:
 ; LA64-CONTRACT-OFF-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-OFF-NEXT:    ret
+  %negc = fneg contract float %c
+  %mul = fmul contract float %a, %b
+  %add = fadd contract float %mul, %negc
+  %neg = fneg contract float %add
+  ret float %neg
+}
+
+define float @contract_fnmsub_s_nsz(float %a, float %b, float %c) nounwind {
+; LA32-CONTRACT-FAST-LABEL: contract_fnmsub_s_nsz:
+; LA32-CONTRACT-FAST:       # %bb.0:
+; LA32-CONTRACT-FAST-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-FAST-NEXT:    ret
+;
+; LA32-CONTRACT-ON-LABEL: contract_fnmsub_s_nsz:
+; LA32-CONTRACT-ON:       # %bb.0:
+; LA32-CONTRACT-ON-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-ON-NEXT:    ret
+;
+; LA32-CONTRACT-OFF-LABEL: contract_fnmsub_s_nsz:
+; LA32-CONTRACT-OFF:       # %bb.0:
+; LA32-CONTRACT-OFF-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-OFF-NEXT:    ret
+;
+; LA64-CONTRACT-FAST-LABEL: contract_fnmsub_s_nsz:
+; LA64-CONTRACT-FAST:       # %bb.0:
+; LA64-CONTRACT-FAST-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-FAST-NEXT:    ret
+;
+; LA64-CONTRACT-ON-LABEL: contract_fnmsub_s_nsz:
+; LA64-CONTRACT-ON:       # %bb.0:
+; LA64-CONTRACT-ON-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-ON-NEXT:    ret
+;
+; LA64-CONTRACT-OFF-LABEL: contract_fnmsub_s_nsz:
+; LA64-CONTRACT-OFF:       # %bb.0:
+; LA64-CONTRACT-OFF-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-OFF-NEXT:    ret
+  %nega = fneg contract nsz float %a
+  %mul = fmul contract nsz float %nega, %b
+  %add = fadd contract nsz float %mul, %c
+  ret float %add
+}
+
+;; Check that fnmsub.s is not emitted.
+define float @not_contract_fnmsub_s(float %a, float %b, float %c) nounwind {
+; LA32-CONTRACT-FAST-LABEL: not_contract_fnmsub_s:
+; LA32-CONTRACT-FAST:       # %bb.0:
+; LA32-CONTRACT-FAST-NEXT:    fneg.s $fa0, $fa0
+; LA32-CONTRACT-FAST-NEXT:    fmadd.s $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-FAST-NEXT:    ret
+;
+; LA32-CONTRACT-ON-LABEL: not_contract_fnmsub_s:
+; LA32-CONTRACT-ON:       # %bb.0:
+; LA32-CONTRACT-ON-NEXT:    fneg.s $fa0, $fa0
+; LA32-CONTRACT-ON-NEXT:    fmadd.s $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-ON-NEXT:    ret
+;
+; LA32-CONTRACT-OFF-LABEL: not_contract_fnmsub_s:
+; LA32-CONTRACT-OFF:       # %bb.0:
+; LA32-CONTRACT-OFF-NEXT:    fneg.s $fa0, $fa0
+; LA32-CONTRACT-OFF-NEXT:    fmadd.s $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-OFF-NEXT:    ret
+;
+; LA64-CONTRACT-FAST-LABEL: not_contract_fnmsub_s:
+; LA64-CONTRACT-FAST:       # %bb.0:
+; LA64-CONTRACT-FAST-NEXT:    fneg.s $fa0, $fa0
+; LA64-CONTRACT-FAST-NEXT:    fmadd.s $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-FAST-NEXT:    ret
+;
+; LA64-CONTRACT-ON-LABEL: not_contract_fnmsub_s:
+; LA64-CONTRACT-ON:       # %bb.0:
+; LA64-CONTRACT-ON-NEXT:    fneg.s $fa0, $fa0
+; LA64-CONTRACT-ON-NEXT:    fmadd.s $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-ON-NEXT:    ret
+;
+; LA64-CONTRACT-OFF-LABEL: not_contract_fnmsub_s:
+; LA64-CONTRACT-OFF:       # %bb.0:
+; LA64-CONTRACT-OFF-NEXT:    fneg.s $fa0, $fa0
+; LA64-CONTRACT-OFF-NEXT:    fmadd.s $fa0, $fa0, $fa1, $fa2
 ; LA64-CONTRACT-OFF-NEXT:    ret
   %nega = fneg contract float %a
   %mul = fmul contract float %nega, %b
@@ -592,8 +760,8 @@ define float @fnmadd_s_intrinsics(float %a, float %b, float %c) nounwind {
 ; LA64-CONTRACT-OFF-NEXT:    fnmadd.s $fa0, $fa0, $fa1, $fa2
 ; LA64-CONTRACT-OFF-NEXT:    ret
   %fma = call float @llvm.fma.f64(float %a, float %b, float %c)
-  %neg = fneg float %fma
-  ret float %neg
+  %negfma = fneg float %fma
+  ret float %negfma
 }
 
 define float @fnmadd_s_nsz_intrinsics(float %a, float %b, float %c) nounwind {
@@ -704,44 +872,87 @@ define float @fnmsub_s_intrinsics(float %a, float %b, float %c) nounwind {
 ; LA64-CONTRACT-OFF-LABEL: fnmsub_s_intrinsics:
 ; LA64-CONTRACT-OFF:       # %bb.0:
 ; LA64-CONTRACT-OFF-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-OFF-NEXT:    ret
+  %negc = fneg float %c
+  %fma = call float @llvm.fma.f64(float %a, float %b, float %negc)
+  %negfma = fneg float %fma
+  ret float %negfma
+}
+
+define float @fnmsub_s_nsz_intrinsics(float %a, float %b, float %c) nounwind {
+; LA32-CONTRACT-FAST-LABEL: fnmsub_s_nsz_intrinsics:
+; LA32-CONTRACT-FAST:       # %bb.0:
+; LA32-CONTRACT-FAST-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-FAST-NEXT:    ret
+;
+; LA32-CONTRACT-ON-LABEL: fnmsub_s_nsz_intrinsics:
+; LA32-CONTRACT-ON:       # %bb.0:
+; LA32-CONTRACT-ON-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-ON-NEXT:    ret
+;
+; LA32-CONTRACT-OFF-LABEL: fnmsub_s_nsz_intrinsics:
+; LA32-CONTRACT-OFF:       # %bb.0:
+; LA32-CONTRACT-OFF-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
+; LA32-CONTRACT-OFF-NEXT:    ret
+;
+; LA64-CONTRACT-FAST-LABEL: fnmsub_s_nsz_intrinsics:
+; LA64-CONTRACT-FAST:       # %bb.0:
+; LA64-CONTRACT-FAST-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-FAST-NEXT:    ret
+;
+; LA64-CONTRACT-ON-LABEL: fnmsub_s_nsz_intrinsics:
+; LA64-CONTRACT-ON:       # %bb.0:
+; LA64-CONTRACT-ON-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
+; LA64-CONTRACT-ON-NEXT:    ret
+;
+; LA64-CONTRACT-OFF-LABEL: fnmsub_s_nsz_intrinsics:
+; LA64-CONTRACT-OFF:       # %bb.0:
+; LA64-CONTRACT-OFF-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
 ; LA64-CONTRACT-OFF-NEXT:    ret
   %nega = fneg float %a
-  %fma = call float @llvm.fma.f64(float %nega, float %b, float %c)
+  %fma = call nsz float @llvm.fma.f64(float %nega, float %b, float %c)
   ret float %fma
 }
 
-define float @fnmsub_s_swap_intrinsics(float %a, float %b, float %c) nounwind {
-; LA32-CONTRACT-FAST-LABEL: fnmsub_s_swap_intrinsics:
+;; Check that fnmsub.s is not emitted.
+define float @not_fnmsub_s_intrinsics(float %a, float %b, float %c) nounwind {
+; LA32-CONTRACT-FAST-LABEL: not_fnmsub_s_intrinsics:
 ; LA32-CONTRACT-FAST:       # %bb.0:
-; LA32-CONTRACT-FAST-NEXT:    fnmsub.s $fa0, $fa1, $fa0, $fa2
+; LA32-CONTRACT-FAST-NEXT:    fneg.s $fa0, $fa0
+; LA32-CONTRACT-FAST-NEXT:    fmadd.s $fa0, $fa0, $fa1, $fa2
 ; LA32-CONTRACT-FAST-NEXT:    ret
 ;
-; LA32-CONTRACT-ON-LABEL: fnmsub_s_swap_intrinsics:
+; LA32-CONTRACT-ON-LABEL: not_fnmsub_s_intrinsics:
 ; LA32-CONTRACT-ON:       # %bb.0:
-; LA32-CONTRACT-ON-NEXT:    fnmsub.s $fa0, $fa1, $fa0, $fa2
+; LA32-CONTRACT-ON-NEXT:    fneg.s $fa0, $fa0
+; LA32-CONTRACT-ON-NEXT:    fmadd.s $fa0, $fa0, $fa1, $fa2
 ; LA32-CONTRACT-ON-NEXT:    ret
 ;
-; LA32-CONTRACT-OFF-LABEL: fnmsub_s_swap_intrinsics:
+; LA32-CONTRACT-OFF-LABEL: not_fnmsub_s_intrinsics:
 ; LA32-CONTRACT-OFF:       # %bb.0:
-; LA32-CONTRACT-OFF-NEXT:    fnmsub.s $fa0, $fa1, $fa0, $fa2
+; LA32-CONTRACT-OFF-NEXT:    fneg.s $fa0, $fa0
+; LA32-CONTRACT-OFF-NEXT:    fmadd.s $fa0, $fa0, $fa1, $fa2
 ; LA32-CONTRACT-OFF-NEXT:    ret
 ;
-; LA64-CONTRACT-FAST-LABEL: fnmsub_s_swap_intrinsics:
+; LA64-CONTRACT-FAST-LABEL: not_fnmsub_s_intrinsics:
 ; LA64-CONTRACT-FAST:       # %bb.0:
-; LA64-CONTRACT-FAST-NEXT:    fnmsub.s $fa0, $fa1, $fa0, $fa2
+; LA64-CONTRACT-FAST-NEXT:    fneg.s $fa0, $fa0
+; LA64-CONTRACT-FAST-NEXT:    fmadd.s $fa0, $fa0, $fa1, $fa2
 ; LA64-CONTRACT-FAST-NEXT:    ret
 ;
-; LA64-CONTRACT-ON-LABEL: fnmsub_s_swap_intrinsics:
+; LA64-CONTRACT-ON-LABEL: not_fnmsub_s_intrinsics:
 ; LA64-CONTRACT-ON:       # %bb.0:
-; LA64-CONTRACT-ON-NEXT:    fnmsub.s $fa0, $fa1, $fa0, $fa2
+; LA64-CONTRACT-ON-NEXT:    fneg.s $fa0, $fa0
+; LA64-CONTRACT-ON-NEXT:    fmadd.s $fa0, $fa0, $fa1, $fa2
 ; LA64-CONTRACT-ON-NEXT:    ret
 ;
-; LA64-CONTRACT-OFF-LABEL: fnmsub_s_swap_intrinsics:
+; LA64-CONTRACT-OFF-LABEL: not_fnmsub_s_intrinsics:
 ; LA64-CONTRACT-OFF:       # %bb.0:
-; LA64-CONTRACT-OFF-NEXT:    fnmsub.s $fa0, $fa1, $fa0, $fa2
+; LA64-CONTRACT-OFF-NEXT:    fneg.s $fa0, $fa0
+; LA64-CONTRACT-OFF-NEXT:    fmadd.s $fa0, $fa0, $fa1, $fa2
 ; LA64-CONTRACT-OFF-NEXT:    ret
-  %negb = fneg float %b
-  %fma = call float @llvm.fma.f64(float %a, float %negb, float %c)
+  %nega = fneg float %a
+  %fma = call float @llvm.fma.f64(float %nega, float %b, float %c)
   ret float %fma
 }
 
@@ -882,6 +1093,8 @@ define float @fnmsub_s_contract(float %a, float %b, float %c) nounwind {
 ; LA64-CONTRACT-OFF-NEXT:    fnmsub.s $fa0, $fa0, $fa1, $fa2
 ; LA64-CONTRACT-OFF-NEXT:    ret
   %mul = fmul contract float %a, %b
-  %sub = fsub contract float %c, %mul
-  ret float %sub
+  %negc = fneg contract float %c
+  %add = fadd contract float %negc, %mul
+  %negadd = fneg contract float %add
+  ret float %negadd
 }
-- 
Gitee


From 7a3bd125d9c1d0265b265ce238a88d0d4550e5a0 Mon Sep 17 00:00:00 2001
From: Weining Lu <luweining@loongson.cn>
Date: Wed, 3 Jan 2024 13:59:12 +0800
Subject: [PATCH 17/20] [LoongArch] Fix the processor series mask

Refer to the PRID_SERIES_MASK definition in the Linux kernel:
arch/loongarch/include/asm/cpu.h.

(cherry picked from commit 7e186d366d6c7def0543acc255931f617e76dff0)
---
 llvm/lib/TargetParser/Host.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 81309280a44b..d11dc605e188 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1462,7 +1462,8 @@ StringRef sys::getHostCPUName() {
   // Use processor id to detect cpu name.
   uint32_t processor_id;
   __asm__("cpucfg %[prid], $zero\n\t" : [prid] "=r"(processor_id));
-  switch (processor_id & 0xff00) {
+  // Refer PRID_SERIES_MASK in linux kernel: arch/loongarch/include/asm/cpu.h.
+  switch (processor_id & 0xf000) {
   case 0xc000: // Loongson 64bit, 4-issue
     return "la464";
   // TODO: Others.
-- 
Gitee


From 3634ac4cbc475509c46521f5b8a3fcbeca6d06c7 Mon Sep 17 00:00:00 2001
From: wanglei <wanglei@loongson.cn>
Date: Mon, 11 Mar 2024 08:59:17 +0800
Subject: [PATCH 18/20] [LoongArch] Make sure that the LoongArchISD::BSTRINS
 node uses the correct `MSB` value (#84454)

The `MSB` must not be greater than `GRLen`. Without this patch, newly
added test cases will crash with LoongArch32, resulting in a 'cannot
select' error.

(cherry picked from commit edd4c6c6dca4c556de22b2ab73d5bfc02d28e59b)
(cherry picked from commit d77c5c3830d925b3795e2f1535a6568399fe6626)
---
 llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp |  4 +++-
 llvm/test/CodeGen/LoongArch/bstrins_w.ll            | 13 +++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index ed106cb766bc..5affaf37ad5a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2310,7 +2310,9 @@ Retry:
     return DAG.getNode(
         LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0),
         DAG.getConstant(CN1->getSExtValue() >> MaskIdx0, DL, ValTy),
-        DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT),
+        DAG.getConstant(ValBits == 32 ? (MaskIdx0 + (MaskLen0 & 31) - 1)
+                                      : (MaskIdx0 + MaskLen0 - 1),
+                        DL, GRLenVT),
         DAG.getConstant(MaskIdx0, DL, GRLenVT));
   }
 
diff --git a/llvm/test/CodeGen/LoongArch/bstrins_w.ll b/llvm/test/CodeGen/LoongArch/bstrins_w.ll
index dfbe000841cd..e008caacad2a 100644
--- a/llvm/test/CodeGen/LoongArch/bstrins_w.ll
+++ b/llvm/test/CodeGen/LoongArch/bstrins_w.ll
@@ -145,6 +145,19 @@ define i32 @pat5(i32 %a) nounwind {
   ret i32 %or
 }
 
+;; The high bits of `const` are zero.
+define i32 @pat5_high_zeros(i32 %a) nounwind {
+; CHECK-LABEL: pat5_high_zeros:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lu12i.w $a1, 1
+; CHECK-NEXT:    ori $a1, $a1, 564
+; CHECK-NEXT:    bstrins.w $a0, $a1, 31, 16
+; CHECK-NEXT:    ret
+  %and = and i32 %a, 65535      ; 0x0000ffff
+  %or = or i32 %and, 305397760  ; 0x12340000
+  ret i32 %or
+}
+
 ;; Pattern 6: a = b | ((c & mask) << shamt)
 ;; In this testcase b is 0x10000002, but in fact we do not require b being a
 ;; constant. As long as all positions in b to be overwritten by the incoming
-- 
Gitee


From 29409970a5c68e20022a05457127102a66abfead Mon Sep 17 00:00:00 2001
From: wanglei <wanglei@loongson.cn>
Date: Tue, 5 Mar 2024 19:44:28 +0800
Subject: [PATCH 19/20] [Clang][LoongArch] Precommit test for fixing wrong return
 value type of __iocsrrd_h. NFC

(cherry picked from commit aeda1a6e800e0dd6c91c0332b4db95094ad5b301)
(cherry picked from commit a9ba36c7e7d7fa076f201843e3b826b6c6d7f5ef)
---
 clang/test/CodeGen/LoongArch/intrinsic-la32.c | 29 ++++++++++++++-----
 clang/test/CodeGen/LoongArch/intrinsic-la64.c | 21 ++++++++++++--
 2 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/clang/test/CodeGen/LoongArch/intrinsic-la32.c b/clang/test/CodeGen/LoongArch/intrinsic-la32.c
index 93d54f511a9c..6a8d99880be3 100644
--- a/clang/test/CodeGen/LoongArch/intrinsic-la32.c
+++ b/clang/test/CodeGen/LoongArch/intrinsic-la32.c
@@ -169,8 +169,8 @@ unsigned int cpucfg(unsigned int a) {
 
 // LA32-LABEL: @rdtime(
 // LA32-NEXT:  entry:
-// LA32-NEXT:    [[TMP0:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimeh.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1:[0-9]+]], !srcloc !2
-// LA32-NEXT:    [[TMP1:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimel.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc !3
+// LA32-NEXT:    [[TMP0:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimeh.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1:[0-9]+]], !srcloc [[META2:![0-9]+]]
+// LA32-NEXT:    [[TMP1:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimel.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc [[META3:![0-9]+]]
 // LA32-NEXT:    ret void
 //
 void rdtime() {
@@ -201,13 +201,28 @@ void loongarch_movgr2fcsr(int a) {
   __builtin_loongarch_movgr2fcsr(1, a);
 }
 
-// CHECK-LABEL: @cacop_w(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.loongarch.cacop.w(i32 1, i32 [[A:%.*]], i32 1024)
-// CHECK-NEXT:    tail call void @llvm.loongarch.cacop.w(i32 1, i32 [[A]], i32 1024)
-// CHECK-NEXT:    ret void
+// LA32-LABEL: @cacop_w(
+// LA32-NEXT:  entry:
+// LA32-NEXT:    tail call void @llvm.loongarch.cacop.w(i32 1, i32 [[A:%.*]], i32 1024)
+// LA32-NEXT:    tail call void @llvm.loongarch.cacop.w(i32 1, i32 [[A]], i32 1024)
+// LA32-NEXT:    ret void
 //
 void cacop_w(unsigned long int a) {
   __cacop_w(1, a, 1024);
   __builtin_loongarch_cacop_w(1, a, 1024);
 }
+
+// LA32-LABEL: @iocsrrd_h_result(
+// LA32-NEXT:  entry:
+// LA32-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A:%.*]])
+// LA32-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A]])
+// LA32-NEXT:    [[CONV2:%.*]] = and i32 [[TMP0]], 255
+// LA32-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], [[CONV2]]
+// LA32-NEXT:    [[CONV4:%.*]] = trunc i32 [[ADD]] to i16
+// LA32-NEXT:    ret i16 [[CONV4]]
+//
+unsigned short iocsrrd_h_result(unsigned int a) {
+  unsigned short b = __iocsrrd_h(a);
+  unsigned short c = __builtin_loongarch_iocsrrd_h(a);
+  return b+c;
+}
diff --git a/clang/test/CodeGen/LoongArch/intrinsic-la64.c b/clang/test/CodeGen/LoongArch/intrinsic-la64.c
index a740882eef54..48b6a7a3d227 100644
--- a/clang/test/CodeGen/LoongArch/intrinsic-la64.c
+++ b/clang/test/CodeGen/LoongArch/intrinsic-la64.c
@@ -387,7 +387,7 @@ unsigned int cpucfg(unsigned int a) {
 
 // CHECK-LABEL: @rdtime_d(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call { i64, i64 } asm sideeffect "rdtime.d $0, $1\0A\09", "=&r,=&r"() #[[ATTR1:[0-9]+]], !srcloc !2
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { i64, i64 } asm sideeffect "rdtime.d $0, $1\0A\09", "=&r,=&r"() #[[ATTR1:[0-9]+]], !srcloc [[META2:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
 void rdtime_d() {
@@ -396,8 +396,8 @@ void rdtime_d() {
 
 // CHECK-LABEL: @rdtime(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimeh.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc !3
-// CHECK-NEXT:    [[TMP1:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimel.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc !4
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimeh.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc [[META3:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call { i32, i32 } asm sideeffect "rdtimel.w $0, $1\0A\09", "=&r,=&r"() #[[ATTR1]], !srcloc [[META4:![0-9]+]]
 // CHECK-NEXT:    ret void
 //
 void rdtime() {
@@ -427,3 +427,18 @@ void loongarch_movgr2fcsr(int a) {
   __movgr2fcsr(1, a);
   __builtin_loongarch_movgr2fcsr(1, a);
 }
+
+// CHECK-LABEL: @iocsrrd_h_result(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A]])
+// CHECK-NEXT:    [[CONV2:%.*]] = and i32 [[TMP0]], 255
+// CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], [[CONV2]]
+// CHECK-NEXT:    [[CONV4:%.*]] = trunc i32 [[ADD]] to i16
+// CHECK-NEXT:    ret i16 [[CONV4]]
+//
+unsigned short iocsrrd_h_result(unsigned int a) {
+  unsigned short b = __iocsrrd_h(a);
+  unsigned short c = __builtin_loongarch_iocsrrd_h(a);
+  return b+c;
+}
-- 
Gitee


From 47425dfdd1582ec652aba1c289f3a80fe25c1a8c Mon Sep 17 00:00:00 2001
From: wanglei <wanglei@loongson.cn>
Date: Wed, 6 Mar 2024 10:03:28 +0800
Subject: [PATCH 20/20] [Clang][LoongArch] Fix wrong return value type of
 __iocsrrd_h (#84100)

Related:
https://gcc.gnu.org/pipermail/gcc-patches/2024-February/645016.html
(cherry picked from commit 2f479b811274fede36535e34ecb545ac22e399c3)
(cherry picked from commit 9b9aee16d4dcf1b4af49988ebd7918fa4ce77e44)
---
 clang/lib/Headers/larchintrin.h               | 2 +-
 clang/test/CodeGen/LoongArch/intrinsic-la32.c | 8 ++++----
 clang/test/CodeGen/LoongArch/intrinsic-la64.c | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/clang/lib/Headers/larchintrin.h b/clang/lib/Headers/larchintrin.h
index c5c533ee0b8c..24dd29ce91ff 100644
--- a/clang/lib/Headers/larchintrin.h
+++ b/clang/lib/Headers/larchintrin.h
@@ -156,7 +156,7 @@ extern __inline unsigned char
   return (unsigned char)__builtin_loongarch_iocsrrd_b((unsigned int)_1);
 }
 
-extern __inline unsigned char
+extern __inline unsigned short
     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     __iocsrrd_h(unsigned int _1) {
   return (unsigned short)__builtin_loongarch_iocsrrd_h((unsigned int)_1);
diff --git a/clang/test/CodeGen/LoongArch/intrinsic-la32.c b/clang/test/CodeGen/LoongArch/intrinsic-la32.c
index 6a8d99880be3..eb3f8cbe7ac4 100644
--- a/clang/test/CodeGen/LoongArch/intrinsic-la32.c
+++ b/clang/test/CodeGen/LoongArch/intrinsic-la32.c
@@ -215,11 +215,11 @@ void cacop_w(unsigned long int a) {
 // LA32-LABEL: @iocsrrd_h_result(
 // LA32-NEXT:  entry:
 // LA32-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A:%.*]])
+// LA32-NEXT:    [[CONV_I:%.*]] = trunc i32 [[TMP0]] to i16
 // LA32-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A]])
-// LA32-NEXT:    [[CONV2:%.*]] = and i32 [[TMP0]], 255
-// LA32-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], [[CONV2]]
-// LA32-NEXT:    [[CONV4:%.*]] = trunc i32 [[ADD]] to i16
-// LA32-NEXT:    ret i16 [[CONV4]]
+// LA32-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// LA32-NEXT:    [[CONV3:%.*]] = add i16 [[TMP2]], [[CONV_I]]
+// LA32-NEXT:    ret i16 [[CONV3]]
 //
 unsigned short iocsrrd_h_result(unsigned int a) {
   unsigned short b = __iocsrrd_h(a);
diff --git a/clang/test/CodeGen/LoongArch/intrinsic-la64.c b/clang/test/CodeGen/LoongArch/intrinsic-la64.c
index 48b6a7a3d227..50ec358f546e 100644
--- a/clang/test/CodeGen/LoongArch/intrinsic-la64.c
+++ b/clang/test/CodeGen/LoongArch/intrinsic-la64.c
@@ -431,11 +431,11 @@ void loongarch_movgr2fcsr(int a) {
 // CHECK-LABEL: @iocsrrd_h_result(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A:%.*]])
+// CHECK-NEXT:    [[CONV_I:%.*]] = trunc i32 [[TMP0]] to i16
 // CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.loongarch.iocsrrd.h(i32 [[A]])
-// CHECK-NEXT:    [[CONV2:%.*]] = and i32 [[TMP0]], 255
-// CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], [[CONV2]]
-// CHECK-NEXT:    [[CONV4:%.*]] = trunc i32 [[ADD]] to i16
-// CHECK-NEXT:    ret i16 [[CONV4]]
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    [[CONV3:%.*]] = add i16 [[TMP2]], [[CONV_I]]
+// CHECK-NEXT:    ret i16 [[CONV3]]
 //
 unsigned short iocsrrd_h_result(unsigned int a) {
   unsigned short b = __iocsrrd_h(a);
-- 
Gitee