From 0cae10595a7521e2c430c605c1f830570b3c9682 Mon Sep 17 00:00:00 2001
From: Lu Weining <luweining@loongson.cn>
Date: Thu, 30 Nov 2023 14:08:45 +0800
Subject: [PATCH 01/23] [Driver] Support -mcmodel= for LoongArch (#72514)

7e42545 rejects unsupported mcmodel options, but normal/medium/extreme
should be supported models for LoongArch according to the [GCC
documentation](https://gcc.gnu.org/onlinedocs/gcc/LoongArch-Options.html).

The mappings among `gcc`, `clang driver`, `clang cc1` and `LLVM (i.e.
llc --code-model=)` are:

|     gcc      |  clang driver |  clang cc1   |    LLVM    |
| -------------  | ------------------ | ----------------- | -------------- |
| normal    |      normal     |   small        |    small    |
| medium  |     medium     |   medium    |    medium |
| extreme  |     extreme     |  large         |     large     |

(cherry picked from commit 1296d20adfb0978afe38d67efab9818079d870ca)
---
 clang/lib/Driver/ToolChains/Clang.cpp | 38 ++++++++++++++++++++-------
 clang/test/Driver/mcmodel.c           | 15 +++++++++++
 2 files changed, 44 insertions(+), 9 deletions(-)
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index fac4f03d6193..4e5f689498d6 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5773,18 +5773,38 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
 
   if (Arg *A = Args.getLastArg(options::OPT_mcmodel_EQ)) {
     StringRef CM = A->getValue();
-    if (CM == "small" || CM == "kernel" || CM == "medium" || CM == "large" ||
-        CM == "tiny") {
-      if (Triple.isOSAIX() && CM == "medium")
-        CmdArgs.push_back("-mcmodel=large");
-      else if (Triple.isAArch64() && (CM == "kernel" || CM == "medium"))
+    if (Triple.isLoongArch()) {
+      bool Ok = false;
+      if (CM == "extreme" &&
+          Args.hasFlagNoClaim(options::OPT_fplt, options::OPT_fno_plt, false))
+        D.Diag(diag::err_drv_argument_not_allowed_with)
+            << A->getAsString(Args) << "-fplt";
+      Ok = CM == "normal" || CM == "medium" || CM == "extreme";
+      // Convert to LLVM recognizable names.
+      if (Ok) {
+        CM = llvm::StringSwitch<StringRef>(CM)
+                 .Case("normal", "small")
+                 .Case("extreme", "large")
+                 .Default(CM);
+        CmdArgs.push_back(Args.MakeArgString("-mcmodel=" + CM));
+      } else {
         D.Diag(diag::err_drv_invalid_argument_to_option)
             << CM << A->getOption().getName();
-      else
-        A->render(Args, CmdArgs);
+      }
     } else {
-      D.Diag(diag::err_drv_invalid_argument_to_option)
-          << CM << A->getOption().getName();
+      if (CM == "small" || CM == "kernel" || CM == "medium" || CM == "large" ||
+          CM == "tiny") {
+        if (Triple.isOSAIX() && CM == "medium")
+          CmdArgs.push_back("-mcmodel=large");
+        else if (Triple.isAArch64() && (CM == "kernel" || CM == "medium"))
+          D.Diag(diag::err_drv_invalid_argument_to_option)
+              << CM << A->getOption().getName();
+        else
+          A->render(Args, CmdArgs);
+      } else {
+        D.Diag(diag::err_drv_invalid_argument_to_option)
+            << CM << A->getOption().getName();
+      }
     }
   }
 
diff --git a/clang/test/Driver/mcmodel.c b/clang/test/Driver/mcmodel.c
index 63b432036159..4aada126cf06 100644
--- a/clang/test/Driver/mcmodel.c
+++ b/clang/test/Driver/mcmodel.c
@@ -8,6 +8,14 @@
 // RUN: not %clang -c -mcmodel=lager %s 2>&1 | FileCheck --check-prefix=INVALID %s
 // RUN: not %clang -c --target=aarch64 -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=AARCH64-MEDIUM %s
 // RUN: not %clang -c --target=aarch64 -mcmodel=kernel %s 2>&1 | FileCheck --check-prefix=AARCH64-KERNEL %s
+// RUN: %clang --target=loongarch64 -### -S -mcmodel=normal %s 2>&1 | FileCheck --check-prefix=SMALL %s
+// RUN: %clang --target=loongarch64 -### -S -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=MEDIUM %s
+// RUN: %clang --target=loongarch64 -### -S -mcmodel=extreme %s 2>&1 | FileCheck --check-prefix=LARGE %s
+// RUN: not %clang -c --target=loongarch64 -mcmodel=tiny %s 2>&1 | FileCheck --check-prefix=ERR-LOONGARCH64-TINY %s
+// RUN: not %clang -c --target=loongarch64 -mcmodel=small %s 2>&1 | FileCheck --check-prefix=ERR-LOONGARCH64-SMALL %s
+// RUN: not %clang -c --target=loongarch64 -mcmodel=kernel %s 2>&1 | FileCheck --check-prefix=ERR-LOONGARCH64-KERNEL %s
+// RUN: not %clang -c --target=loongarch64 -mcmodel=large %s 2>&1 | FileCheck --check-prefix=ERR-LOONGARCH64-LARGE %s
+// RUN: not %clang -c --target=loongarch64 -mcmodel=extreme -fplt %s 2>&1 | FileCheck --check-prefix=ERR-LOONGARCH64-PLT-EXTREME %s
 
 // TINY: "-mcmodel=tiny"
 // SMALL: "-mcmodel=small"
@@ -20,3 +28,10 @@
 
 // AARCH64-MEDIUM: error: invalid argument 'medium' to -mcmodel=
 // AARCH64-KERNEL: error: invalid argument 'kernel' to -mcmodel=
+
+// ERR-LOONGARCH64-TINY:   error: invalid argument 'tiny' to -mcmodel=
+// ERR-LOONGARCH64-SMALL:  error: invalid argument 'small' to -mcmodel=
+// ERR-LOONGARCH64-KERNEL: error: invalid argument 'kernel' to -mcmodel=
+// ERR-LOONGARCH64-LARGE:  error: invalid argument 'large' to -mcmodel=
+
+// ERR-LOONGARCH64-PLT-EXTREME: error: invalid argument '-mcmodel=extreme' not allowed with '-fplt'
-- 
Gitee


From 53a624f1fbb2d1f837070b400812e8bddf66fd3d Mon Sep 17 00:00:00 2001
From: Lu Weining <luweining@loongson.cn>
Date: Tue, 5 Dec 2023 09:20:48 +0800
Subject: [PATCH 02/23] [BinaryFormat][LoongArch] Define psABI v2.20 relocs for
 R_LARCH_CALL36(#73345)

R_LARCH_CALL36 was designed for function calls in the medium code model,
where the 2 instructions (pcaddu18i + jirl) must be adjacent.

(cherry picked from commit c3a9c905fbc486add75e16218fe58a04b7b6c282)
---
 llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def      | 6 ++++++
 .../tools/llvm-readobj/ELF/reloc-types-loongarch64.test     | 2 ++
 llvm/unittests/Object/ELFTest.cpp                           | 2 ++
 3 files changed, 10 insertions(+)

diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def
index 02bce3c71712..c4393432677b 100644
--- a/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def
+++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def
@@ -118,3 +118,9 @@ ELF_RELOC(R_LARCH_SUB6,        106)
 ELF_RELOC(R_LARCH_ADD_ULEB128, 107)
 ELF_RELOC(R_LARCH_SUB_ULEB128, 108)
 ELF_RELOC(R_LARCH_64_PCREL,    109)
+
+// Relocs added in ELF for the LoongArch™ Architecture v20231102, part of the
+// v2.20 LoongArch ABI specs.
+//
+// Spec addition: https://github.com/loongson/la-abi-specs/pull/4
+ELF_RELOC(R_LARCH_CALL36, 110)
diff --git a/llvm/test/tools/llvm-readobj/ELF/reloc-types-loongarch64.test b/llvm/test/tools/llvm-readobj/ELF/reloc-types-loongarch64.test
index e32dc893fa79..88ff7fa405ed 100644
--- a/llvm/test/tools/llvm-readobj/ELF/reloc-types-loongarch64.test
+++ b/llvm/test/tools/llvm-readobj/ELF/reloc-types-loongarch64.test
@@ -102,6 +102,7 @@
 # CHECK: Type: R_LARCH_ADD_ULEB128 (107)
 # CHECK: Type: R_LARCH_SUB_ULEB128 (108)
 # CHECK: Type: R_LARCH_64_PCREL (109)
+# CHECK: Type: R_LARCH_CALL36 (110)
 
 --- !ELF
 FileHeader:
@@ -211,3 +212,4 @@ Sections:
       - Type: R_LARCH_ADD_ULEB128
       - Type: R_LARCH_SUB_ULEB128
       - Type: R_LARCH_64_PCREL
+      - Type: R_LARCH_CALL36
diff --git a/llvm/unittests/Object/ELFTest.cpp b/llvm/unittests/Object/ELFTest.cpp
index 50b1df124a4a..ed851dde4c00 100644
--- a/llvm/unittests/Object/ELFTest.cpp
+++ b/llvm/unittests/Object/ELFTest.cpp
@@ -251,6 +251,8 @@ TEST(ELFTest, getELFRelocationTypeNameForLoongArch) {
             getELFRelocationTypeName(EM_LOONGARCH, R_LARCH_SUB_ULEB128));
   EXPECT_EQ("R_LARCH_64_PCREL",
             getELFRelocationTypeName(EM_LOONGARCH, R_LARCH_64_PCREL));
+  EXPECT_EQ("R_LARCH_CALL36",
+            getELFRelocationTypeName(EM_LOONGARCH, R_LARCH_CALL36));
 }
 
 TEST(ELFTest, getELFRelativeRelocationType) {
-- 
Gitee


From 2b870b1f213f2d645f4fa685371fbefea09b2969 Mon Sep 17 00:00:00 2001
From: Lu Weining <luweining@loongson.cn>
Date: Mon, 25 Dec 2023 17:40:48 +0800
Subject: [PATCH 03/23] [lld][LoongArch] Support the R_LARCH_CALL36 relocation
 type (#73346)

R_LARCH_CALL36 was designed for function calls in the medium code model,
where the 2 instructions (pcaddu18i + jirl) must be adjacent. It is expected
to replace the current medium code model implementation, i.e.
R_LARCH_PCALA_{HI20,LO12} on pcalau12i + jirl.

See https://github.com/loongson/la-abi-specs/pull/3 for more details.

(cherry picked from commit 88548df0fc08364bd03148c936e36f0bb07dde8a)
---
 lld/ELF/Arch/LoongArch.cpp      | 20 ++++++++++
 lld/test/ELF/loongarch-call36.s | 69 +++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 lld/test/ELF/loongarch-call36.s

diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index 160fab4aeba9..72d9c6838e31 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -479,6 +479,7 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s,
   case R_LARCH_B16:
   case R_LARCH_B21:
   case R_LARCH_B26:
+  case R_LARCH_CALL36:
     return R_PLT_PC;
   case R_LARCH_GOT_PC_HI20:
   case R_LARCH_GOT64_PC_LO20:
@@ -607,6 +608,25 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel,
     write32le(loc, setD10k16(read32le(loc), val >> 2));
     return;
 
+  case R_LARCH_CALL36: {
+    // This relocation is designed for adjancent pcaddu18i+jirl pairs that
+    // are patched in one time. Because of sign extension of these insns'
+    // immediate fields, the relocation range is [-128G - 0x20000, +128G -
+    // 0x20000) (of course must be 4-byte aligned).
+    if (((int64_t)val + 0x20000) != llvm::SignExtend64(val + 0x20000, 38))
+      reportRangeError(loc, rel, Twine(val), llvm::minIntN(38) - 0x20000,
+                       llvm::maxIntN(38) - 0x20000);
+    checkAlignment(loc, val, 4, rel);
+    // Since jirl performs sign extension on the offset immediate, adds (1<<17)
+    // to original val to get the correct hi20.
+    uint32_t hi20 = extractBits(val + (1 << 17), 37, 18);
+    // Despite the name, the lower part is actually 18 bits with 4-byte aligned.
+    uint32_t lo16 = extractBits(val, 17, 2);
+    write32le(loc, setJ20(read32le(loc), hi20));
+    write32le(loc + 4, setK16(read32le(loc + 4), lo16));
+    return;
+  }
+
   // Relocs intended for `addi`, `ld` or `st`.
   case R_LARCH_PCALA_LO12:
     // We have to again inspect the insn word to handle the R_LARCH_PCALA_LO12
diff --git a/lld/test/ELF/loongarch-call36.s b/lld/test/ELF/loongarch-call36.s
new file mode 100644
index 000000000000..2d25a2ac64ed
--- /dev/null
+++ b/lld/test/ELF/loongarch-call36.s
@@ -0,0 +1,69 @@
+# REQUIRES: loongarch
+
+# RUN: rm -rf %t && split-file %s %t
+# RUN: llvm-mc --filetype=obj --triple=loongarch64-unknown-elf %t/a.s -o %t/a.o
+
+# RUN: ld.lld %t/a.o --section-start=.text=0x20010 --section-start=.sec.foo=0x60020 -o %t/exe1
+# RUN: llvm-objdump --no-show-raw-insn -d %t/exe1 | FileCheck --match-full-lines %s --check-prefix=EXE1
+## hi20 = target - pc + (1 << 17) >> 18 = 0x60020 - 0x20010 + 0x20000 >> 18 = 1
+## lo18 = target - pc & (1 << 18) - 1 = 0x60020 - 0x20010 & 0x3ffff = 16
+# EXE1:      20010: pcaddu18i $t0, 1
+# EXE1-NEXT: 20014: jirl $zero, $t0, 16
+
+# RUN: ld.lld %t/a.o --section-start=.text=0x20010 --section-start=.sec.foo=0x40020 -o %t/exe2
+# RUN: llvm-objdump --no-show-raw-insn -d %t/exe2 | FileCheck --match-full-lines %s --check-prefix=EXE2
+## hi20 = target - pc + (1 << 17) >> 18 = 0x40020 - 0x20010 + 0x20000 >> 18 = 1
+## lo18 = target - pc & (1 << 18) - 1 = 0x40020 - 0x20010 & 0x3ffff = -131056
+# EXE2:      20010: pcaddu18i $t0, 1
+# EXE2-NEXT: 20014: jirl $zero, $t0, -131056
+
+# RUN: ld.lld %t/a.o -shared -T %t/a.t -o %t/a.so
+# RUN: llvm-readelf -x .got.plt %t/a.so | FileCheck --check-prefix=GOTPLT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/a.so | FileCheck --check-prefix=SO %s
+## PLT should be present in this case.
+# SO:    Disassembly of section .plt:
+# SO:    <.plt>:
+##       foo@plt:
+# SO:    1234520:  pcaddu12i $t3, 64{{$}}
+# SO-NEXT:         ld.d $t3, $t3, 544{{$}}
+# SO-NEXT:         jirl $t1, $t3, 0
+# SO-NEXT:         nop
+
+# SO:   Disassembly of section .text:
+# SO:   <_start>:
+## hi20 = foo@plt - pc + (1 << 17) >> 18 = 0x1234520 - 0x1274670 + 0x20000 >> 18 = -1
+## lo18 = foo@plt - pc & (1 << 18) - 1 = 0x1234520 - 0x1274670 & 0x3ffff = -336
+# SO-NEXT: pcaddu18i $t0, -1{{$}}
+# SO-NEXT: jirl $zero, $t0, -336{{$}}
+
+# GOTPLT:      section '.got.plt':
+# GOTPLT-NEXT: 0x01274730 00000000 00000000 00000000 00000000
+# GOTPLT-NEXT: 0x01274740 00452301 00000000
+
+# RUN: not ld.lld %t/a.o --section-start=.text=0x20000 --section-start=.sec.foo=0x2000020000 -o /dev/null 2>&1 | \
+# RUN:   FileCheck -DFILE=%t/a.o --check-prefix=ERROR-RANGE %s
+# ERROR-RANGE: error: [[FILE]]:(.text+0x0): relocation R_LARCH_CALL36 out of range: 137438953472 is not in [-137439084544, 137438822399]; references 'foo'
+
+## Impossible case in reality becasue all LoongArch instructions are fixed 4-bytes long.
+# RUN: not ld.lld %t/a.o --section-start=.text=0x20000 --section-start=.sec.foo=0x40001 -o /dev/null 2>&1 | \
+# RUN:   FileCheck -DFILE=%t/a.o --check-prefix=ERROR-ALIGN %s
+# ERROR-ALIGN: error: [[FILE]]:(.text+0x0): improper alignment for relocation R_LARCH_CALL36: 0x20001 is not aligned to 4 bytes
+
+#--- a.t
+SECTIONS {
+ .plt   0x1234500: { *(.plt) }
+ .text  0x1274670: { *(.text) }
+}
+
+#--- a.s
+.text
+.global _start
+_start:
+  .reloc ., R_LARCH_CALL36, foo
+  pcaddu18i $t0, 0
+  jirl      $zero, $t0, 0
+
+.section .sec.foo,"ax"
+.global foo
+foo:
+  ret
-- 
Gitee


From 6accc3e17550f87c2e5154fdee4056e21f680542 Mon Sep 17 00:00:00 2001
From: Weining Lu <luweining@loongson.cn>
Date: Mon, 25 Dec 2023 18:28:19 +0800
Subject: [PATCH 04/23] [lld][test][LoongArch] Remove the test for
 R_LARCH_CALL36 range checking

Several buildbots report:
ld.lld: error: failed to open /dev/null: Cannot allocate memory

For example:
- https://lab.llvm.org/buildbot/#/builders/184/builds/8842
- https://lab.llvm.org/buildbot/#/builders/247/builds/12559

(cherry picked from commit 0fbc728dba97149e530cfb7f2ada0283c398a7ce)
---
 lld/test/ELF/loongarch-call36.s | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/lld/test/ELF/loongarch-call36.s b/lld/test/ELF/loongarch-call36.s
index 2d25a2ac64ed..0a00adacbd6a 100644
--- a/lld/test/ELF/loongarch-call36.s
+++ b/lld/test/ELF/loongarch-call36.s
@@ -40,10 +40,6 @@
 # GOTPLT-NEXT: 0x01274730 00000000 00000000 00000000 00000000
 # GOTPLT-NEXT: 0x01274740 00452301 00000000
 
-# RUN: not ld.lld %t/a.o --section-start=.text=0x20000 --section-start=.sec.foo=0x2000020000 -o /dev/null 2>&1 | \
-# RUN:   FileCheck -DFILE=%t/a.o --check-prefix=ERROR-RANGE %s
-# ERROR-RANGE: error: [[FILE]]:(.text+0x0): relocation R_LARCH_CALL36 out of range: 137438953472 is not in [-137439084544, 137438822399]; references 'foo'
-
 ## Impossible case in reality becasue all LoongArch instructions are fixed 4-bytes long.
 # RUN: not ld.lld %t/a.o --section-start=.text=0x20000 --section-start=.sec.foo=0x40001 -o /dev/null 2>&1 | \
 # RUN:   FileCheck -DFILE=%t/a.o --check-prefix=ERROR-ALIGN %s
-- 
Gitee


From be0c0cd979b6f4e2d778ca16d96a3e465a3ac4dc Mon Sep 17 00:00:00 2001
From: Weining Lu <luweining@loongson.cn>
Date: Mon, 25 Dec 2023 22:41:09 +0800
Subject: [PATCH 05/23] Revert "[lld][test][LoongArch] Remove the test for
 R_LARCH_CALL36 range checking"

This reverts commit 0fbc728dba97149e530cfb7f2ada0283c398a7ce.

In 88548df0fc08, both the .sec.foo and .text sections used the same
section flags and hence shared one segment, making the output file
too large. This broke many buildbots.

Now assign section .sec.foo different flags ("awx") from .text ("ax")
so that both sections get their own segment.

(cherry picked from commit 6452395561eaae59e38f1df84f5413dffdb9169f)
---
 lld/test/ELF/loongarch-call36.s | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lld/test/ELF/loongarch-call36.s b/lld/test/ELF/loongarch-call36.s
index 0a00adacbd6a..b593fdf1f604 100644
--- a/lld/test/ELF/loongarch-call36.s
+++ b/lld/test/ELF/loongarch-call36.s
@@ -40,6 +40,10 @@
 # GOTPLT-NEXT: 0x01274730 00000000 00000000 00000000 00000000
 # GOTPLT-NEXT: 0x01274740 00452301 00000000
 
+# RUN: not ld.lld %t/a.o --section-start=.text=0x20000 --section-start=.sec.foo=0x2000020000 -o /dev/null 2>&1 | \
+# RUN:   FileCheck -DFILE=%t/a.o --check-prefix=ERROR-RANGE %s
+# ERROR-RANGE: error: [[FILE]]:(.text+0x0): relocation R_LARCH_CALL36 out of range: 137438953472 is not in [-137439084544, 137438822399]; references 'foo'
+
 ## Impossible case in reality becasue all LoongArch instructions are fixed 4-bytes long.
 # RUN: not ld.lld %t/a.o --section-start=.text=0x20000 --section-start=.sec.foo=0x40001 -o /dev/null 2>&1 | \
 # RUN:   FileCheck -DFILE=%t/a.o --check-prefix=ERROR-ALIGN %s
@@ -59,7 +63,7 @@ _start:
   pcaddu18i $t0, 0
   jirl      $zero, $t0, 0
 
-.section .sec.foo,"ax"
+.section .sec.foo,"awx"
 .global foo
 foo:
   ret
-- 
Gitee


From a8ed0f26220bbacb2c485a392f79ac4b271d73af Mon Sep 17 00:00:00 2001
From: wanglei <wanglei@loongson.cn>
Date: Tue, 2 Jan 2024 10:55:02 +0800
Subject: [PATCH 06/23] [LoongArch] Emit function call code sequence as
 `PCADDU18I+JIRL` in medium code model

According to the description of the psABI v2.20:
https://github.com/loongson/la-abi-specs/releases/tag/v2.20, adjustments
are made to the function call instructions under the medium code model.

In addition, the AsmParser now supports parsing the call36 and
tail36 macro instructions.

(cherry picked from commit 2cf420d5b846a4733ef0ef7c8ed0ae0bfd1c6772)
---
 .../AsmParser/LoongArchAsmParser.cpp          | 61 +++++++++++++++++++
 .../LoongArch/LoongArchExpandPseudoInsts.cpp  | 29 ++++-----
 .../Target/LoongArch/LoongArchInstrInfo.td    | 23 ++++++-
 .../Target/LoongArch/LoongArchMCInstLower.cpp |  3 +
 .../LoongArch/LoongArchTargetMachine.cpp      |  4 +-
 .../MCTargetDesc/LoongArchBaseInfo.h          |  1 +
 .../MCTargetDesc/LoongArchELFObjectWriter.cpp |  2 +
 .../MCTargetDesc/LoongArchFixupKinds.h        |  3 +
 .../MCTargetDesc/LoongArchMCCodeEmitter.cpp   |  3 +
 .../MCTargetDesc/LoongArchMCExpr.cpp          |  3 +
 .../LoongArch/MCTargetDesc/LoongArchMCExpr.h  |  1 +
 llvm/test/CodeGen/LoongArch/code-models.ll    | 12 ++--
 .../MC/LoongArch/Basic/Integer/invalid64.s    |  2 +-
 llvm/test/MC/LoongArch/Macros/macros-call.s   |  9 +++
 .../MC/LoongArch/Relocations/relocations.s    |  5 ++
 15 files changed, 133 insertions(+), 28 deletions(-)
 create mode 100644 llvm/test/MC/LoongArch/Macros/macros-call.s

diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index a132e645c864..f908e5bc63d3 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -122,6 +122,10 @@ class LoongArchAsmParser : public MCTargetAsmParser {
   // Helper to emit pseudo instruction "li.w/d $rd, $imm".
   void emitLoadImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
 
+  // Helper to emit pseudo instruction "call36 sym" or "tail36 $rj, sym".
+  void emitFuncCall36(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                      bool IsTailCall);
+
 public:
   enum LoongArchMatchResultTy {
     Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY,
@@ -401,6 +405,22 @@ public:
                      IsValidKind;
   }
 
+  bool isSImm20pcaddu18i() const {
+    if (!isImm())
+      return false;
+
+    int64_t Imm;
+    LoongArchMCExpr::VariantKind VK = LoongArchMCExpr::VK_LoongArch_None;
+    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+    bool IsValidKind = VK == LoongArchMCExpr::VK_LoongArch_None ||
+                       VK == LoongArchMCExpr::VK_LoongArch_CALL36;
+
+    return IsConstantImm
+               ? isInt<20>(Imm) && IsValidKind
+               : LoongArchAsmParser::classifySymbolRef(getImm(), VK) &&
+                     IsValidKind;
+  }
+
   bool isSImm21lsl2() const {
     if (!isImm())
       return false;
@@ -1111,6 +1131,35 @@ void LoongArchAsmParser::emitLoadImm(MCInst &Inst, SMLoc IDLoc,
   }
 }
 
+void LoongArchAsmParser::emitFuncCall36(MCInst &Inst, SMLoc IDLoc,
+                                        MCStreamer &Out, bool IsTailCall) {
+  // call36 sym
+  // expands to:
+  //   pcaddu18i $ra, %call36(sym)
+  //   jirl      $ra, $ra, 0
+  //
+  // tail36 $rj, sym
+  // expands to:
+  //   pcaddu18i $rj, %call36(sym)
+  //   jirl      $r0, $rj, 0
+  unsigned ScratchReg =
+      IsTailCall ? Inst.getOperand(0).getReg() : (unsigned)LoongArch::R1;
+  const MCExpr *Sym =
+      IsTailCall ? Inst.getOperand(1).getExpr() : Inst.getOperand(0).getExpr();
+  const LoongArchMCExpr *LE = LoongArchMCExpr::create(
+      Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, getContext());
+
+  Out.emitInstruction(
+      MCInstBuilder(LoongArch::PCADDU18I).addReg(ScratchReg).addExpr(LE),
+      getSTI());
+  Out.emitInstruction(
+      MCInstBuilder(LoongArch::JIRL)
+          .addReg(IsTailCall ? (unsigned)LoongArch::R0 : ScratchReg)
+          .addReg(ScratchReg)
+          .addImm(0),
+      getSTI());
+}
+
 bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
                                             OperandVector &Operands,
                                             MCStreamer &Out) {
@@ -1159,6 +1208,12 @@ bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
   case LoongArch::PseudoLI_D:
     emitLoadImm(Inst, IDLoc, Out);
     return false;
+  case LoongArch::PseudoCALL36:
+    emitFuncCall36(Inst, IDLoc, Out, /*IsTailCall=*/false);
+    return false;
+  case LoongArch::PseudoTAIL36:
+    emitFuncCall36(Inst, IDLoc, Out, /*IsTailCall=*/true);
+    return false;
   }
   Out.emitInstruction(Inst, getSTI());
   return false;
@@ -1440,6 +1495,12 @@ bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
         /*Upper=*/(1 << 19) - 1,
         "operand must be a symbol with modifier (e.g. %pc_hi20) or an integer "
         "in the range");
+  case Match_InvalidSImm20pcaddu18i:
+    return generateImmOutOfRangeError(
+        Operands, ErrorInfo, /*Lower=*/-(1 << 19),
+        /*Upper=*/(1 << 19) - 1,
+        "operand must be a symbol with modifier (e.g. %call36) or an integer "
+        "in the range");
   case Match_InvalidSImm21lsl2:
     return generateImmOutOfRangeError(
         Operands, ErrorInfo, /*Lower=*/-(1 << 22), /*Upper=*/(1 << 22) - 4,
diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
index 72c1f1cec198..8eda2dcc1633 100644
--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
@@ -458,11 +458,11 @@ bool LoongArchPreRAExpandPseudo::expandFunctionCALL(
   }
   case CodeModel::Medium: {
     // CALL:
-    // pcalau12i  $ra, %pc_hi20(func)
-    // jirl       $ra, $ra, %pc_lo12(func)
+    // pcaddu18i $ra, %call36(func)
+    // jirl      $ra, $ra, 0
     // TAIL:
-    // pcalau12i  $scratch, %pc_hi20(func)
-    // jirl       $r0, $scratch, %pc_lo12(func)
+    // pcaddu18i $scratch, %call36(func)
+    // jirl      $r0, $scratch, 0
     Opcode =
         IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
     Register ScratchReg =
@@ -470,18 +470,15 @@ bool LoongArchPreRAExpandPseudo::expandFunctionCALL(
             ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
             : LoongArch::R1;
     MachineInstrBuilder MIB =
-        BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), ScratchReg);
-    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg);
-    if (Func.isSymbol()) {
-      const char *FnName = Func.getSymbolName();
-      MIB.addExternalSymbol(FnName, LoongArchII::MO_PCREL_HI);
-      CALL.addExternalSymbol(FnName, LoongArchII::MO_PCREL_LO);
-      break;
-    }
-    assert(Func.isGlobal() && "Expected a GlobalValue at this time");
-    const GlobalValue *GV = Func.getGlobal();
-    MIB.addGlobalAddress(GV, 0, LoongArchII::MO_PCREL_HI);
-    CALL.addGlobalAddress(GV, 0, LoongArchII::MO_PCREL_LO);
+        BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg);
+
+    CALL =
+        BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0);
+
+    if (Func.isSymbol())
+      MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36);
+    else
+      MIB.addDisp(Func, 0, LoongArchII::MO_CALL36);
     break;
   }
   case CodeModel::Large: {
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index ab1890556814..67de5f7afd78 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -351,6 +351,10 @@ def simm20_lu32id : SImm20Operand {
   let ParserMatchClass = SImmAsmOperand<20, "lu32id">;
 }
 
+def simm20_pcaddu18i : SImm20Operand {
+  let ParserMatchClass = SImmAsmOperand<20, "pcaddu18i">;
+}
+
 def simm21_lsl2 : Operand<OtherVT> {
   let ParserMatchClass = SImmAsmOperand<21, "lsl2">;
   let EncoderMethod = "getImmOpValueAsr<2>";
@@ -772,7 +776,7 @@ def LU32I_D : Fmt1RI20<0x16000000, (outs GPR:$dst),
                        "$rd, $imm20">;
 }
 def LU52I_D : ALU_2RI12<0x03000000, simm12_lu52id>;
-def PCADDU18I : ALU_1RI20<0x1e000000, simm20>;
+def PCADDU18I : ALU_1RI20<0x1e000000, simm20_pcaddu18i>;
 def MUL_D     : ALU_3R<0x001d8000>;
 def MULH_D    : ALU_3R<0x001e0000>;
 def MULH_DU   : ALU_3R<0x001e8000>;
@@ -1324,7 +1328,7 @@ def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)),
           (PseudoBRIND GPR:$rj, simm16_lsl2:$imm16)>;
 
 let isCall = 1, Defs = [R1] in
-def PseudoCALL : Pseudo<(outs), (ins simm26_symbol:$func)>;
+def PseudoCALL : Pseudo<(outs), (ins bare_symbol:$func)>;
 
 def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>;
 def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
@@ -1344,7 +1348,7 @@ def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>,
                 PseudoInstExpansion<(JIRL R0, R1, 0)>;
 
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
-def PseudoTAIL : Pseudo<(outs), (ins simm26_symbol:$dst)>;
+def PseudoTAIL : Pseudo<(outs), (ins bare_symbol:$dst)>;
 
 def : Pat<(loongarch_tail (iPTR tglobaladdr:$dst)),
           (PseudoTAIL tglobaladdr:$dst)>;
@@ -1367,6 +1371,19 @@ def PseudoJIRL_TAIL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>,
                       PseudoInstExpansion<(JIRL R0, GPR:$rj,
                                            simm16_lsl2:$imm16)>;
 
+/// call36/taill36 macro instructions
+let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, isAsmParserOnly = 1,
+    Defs = [R1], Size = 8, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in
+def PseudoCALL36 : Pseudo<(outs), (ins bare_symbol:$dst), [],
+                          "call36", "$dst">,
+                   Requires<[IsLA64]>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3],
+    isCodeGenOnly = 0, isAsmParserOnly = 1, Size = 8, hasSideEffects = 0,
+    mayStore = 0, mayLoad = 0 in
+def PseudoTAIL36 : Pseudo<(outs), (ins GPR:$tmp, bare_symbol:$dst), [],
+                          "tail36", "$tmp, $dst">,
+                   Requires<[IsLA64]>;
+
 /// Load address (la*) macro instructions.
 
 // Define isCodeGenOnly = 0 to expose them to tablegened assembly parser.
diff --git a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
index 5daa9481c907..98ad49f25e3f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
@@ -95,6 +95,9 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
   case LoongArchII::MO_GD_PC_HI:
     Kind = LoongArchMCExpr::VK_LoongArch_TLS_GD_PC_HI20;
     break;
+  case LoongArchII::MO_CALL36:
+    Kind = LoongArchMCExpr::VK_LoongArch_CALL36;
+    break;
     // TODO: Handle more target-flags.
   }
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index d0a4e9375048..0efc5e6ebb99 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -63,11 +63,11 @@ getEffectiveLoongArchCodeModel(const Triple &TT,
 
   switch (*CM) {
   case CodeModel::Small:
-  case CodeModel::Medium:
     return *CM;
+  case CodeModel::Medium:
   case CodeModel::Large:
     if (!TT.isArch64Bit())
-      report_fatal_error("Large code model requires LA64");
+      report_fatal_error("Medium/Large code model requires LA64");
     return *CM;
   default:
     report_fatal_error(
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
index cee6dad1f095..0692cb92b694 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
@@ -47,6 +47,7 @@ enum {
   MO_IE_PC64_HI,
   MO_LD_PC_HI,
   MO_GD_PC_HI,
+  MO_CALL36
   // TODO: Add more flags.
 };
 } // end namespace LoongArchII
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
index e60b9c2cfd97..0a52380dd2cd 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
@@ -90,6 +90,8 @@ unsigned LoongArchELFObjectWriter::getRelocType(MCContext &Ctx,
     return ELF::R_LARCH_TLS_LE64_LO20;
   case LoongArch::fixup_loongarch_tls_le64_hi12:
     return ELF::R_LARCH_TLS_LE64_HI12;
+  case LoongArch::fixup_loongarch_call36:
+    return ELF::R_LARCH_CALL36;
     // TODO: Handle more fixup-kinds.
   }
 }
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
index 78414408f21f..0d19d2b0fb1f 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
@@ -111,6 +111,9 @@ enum Fixups {
   fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX,
   // Generate an R_LARCH_ALIGN which indicates the linker may fixup align here.
   fixup_loongarch_align = FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN,
+  // 36-bit fixup corresponding to %call36(foo) for a pair instructions:
+  // pcaddu18i+jirl.
+  fixup_loongarch_call36 = FirstLiteralRelocationKind + ELF::R_LARCH_CALL36,
 };
 } // end namespace LoongArch
 } // end namespace llvm
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
index 09d92ac9aa3a..7c4fe9674d4e 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
@@ -241,6 +241,9 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO,
     case LoongArchMCExpr::VK_LoongArch_TLS_GD_HI20:
       FixupKind = LoongArch::fixup_loongarch_tls_gd_hi20;
       break;
+    case LoongArchMCExpr::VK_LoongArch_CALL36:
+      FixupKind = LoongArch::fixup_loongarch_call36;
+      break;
     }
   } else if (Kind == MCExpr::SymbolRef &&
              cast<MCSymbolRefExpr>(Expr)->getKind() ==
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
index 82c992b1cc8c..8ca8876a19b9 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
@@ -138,6 +138,8 @@ StringRef LoongArchMCExpr::getVariantKindName(VariantKind Kind) {
     return "gd_pc_hi20";
   case VK_LoongArch_TLS_GD_HI20:
     return "gd_hi20";
+  case VK_LoongArch_CALL36:
+    return "call36";
   }
 }
 
@@ -180,6 +182,7 @@ LoongArchMCExpr::getVariantKindForName(StringRef name) {
       .Case("ld_hi20", VK_LoongArch_TLS_LD_HI20)
       .Case("gd_pc_hi20", VK_LoongArch_TLS_GD_PC_HI20)
       .Case("gd_hi20", VK_LoongArch_TLS_GD_HI20)
+      .Case("call36", VK_LoongArch_CALL36)
       .Default(VK_LoongArch_Invalid);
 }
 
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
index 93251f824103..bd828116d7fa 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
@@ -61,6 +61,7 @@ public:
     VK_LoongArch_TLS_LD_HI20,
     VK_LoongArch_TLS_GD_PC_HI20,
     VK_LoongArch_TLS_GD_HI20,
+    VK_LoongArch_CALL36,
     VK_LoongArch_Invalid // Must be the last item.
   };
 
diff --git a/llvm/test/CodeGen/LoongArch/code-models.ll b/llvm/test/CodeGen/LoongArch/code-models.ll
index c610f645a06a..7c6f46d5e926 100644
--- a/llvm/test/CodeGen/LoongArch/code-models.ll
+++ b/llvm/test/CodeGen/LoongArch/code-models.ll
@@ -23,8 +23,8 @@ define i32 @call_globaladdress(i32 %a) nounwind {
 ; MEDIUM:       # %bb.0:
 ; MEDIUM-NEXT:    addi.d $sp, $sp, -16
 ; MEDIUM-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
-; MEDIUM-NEXT:    pcalau12i $ra, %pc_hi20(callee)
-; MEDIUM-NEXT:    jirl $ra, $ra, %pc_lo12(callee)
+; MEDIUM-NEXT:    pcaddu18i $ra, %call36(callee)
+; MEDIUM-NEXT:    jirl $ra, $ra, 0
 ; MEDIUM-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; MEDIUM-NEXT:    addi.d $sp, $sp, 16
 ; MEDIUM-NEXT:    ret
@@ -68,8 +68,8 @@ define void @call_external_sym(ptr %dst) {
 ; MEDIUM-NEXT:    .cfi_offset 1, -8
 ; MEDIUM-NEXT:    ori $a2, $zero, 1000
 ; MEDIUM-NEXT:    move $a1, $zero
-; MEDIUM-NEXT:    pcalau12i $ra, %pc_hi20(memset)
-; MEDIUM-NEXT:    jirl $ra, $ra, %pc_lo12(memset)
+; MEDIUM-NEXT:    pcaddu18i $ra, %call36(memset)
+; MEDIUM-NEXT:    jirl $ra, $ra, 0
 ; MEDIUM-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; MEDIUM-NEXT:    addi.d $sp, $sp, 16
 ; MEDIUM-NEXT:    ret
@@ -105,8 +105,8 @@ define i32 @caller_tail(i32 %i) nounwind {
 ;
 ; MEDIUM-LABEL: caller_tail:
 ; MEDIUM:       # %bb.0: # %entry
-; MEDIUM-NEXT:    pcalau12i $a1, %pc_hi20(callee_tail)
-; MEDIUM-NEXT:    jirl $zero, $a1, %pc_lo12(callee_tail)
+; MEDIUM-NEXT:    pcaddu18i $a1, %call36(callee_tail)
+; MEDIUM-NEXT:    jr $a1
 ;
 ; LARGE-LABEL: caller_tail:
 ; LARGE:       # %bb.0: # %entry
diff --git a/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s b/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s
index acddca9432a6..1c1c658ad440 100644
--- a/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s
+++ b/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s
@@ -65,7 +65,7 @@ addu16i.d $a0, $a0, 32768
 
 ## simm20
 pcaddu18i $a0, 0x80000
-# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-524288, 524287]
+# CHECK: :[[#@LINE-1]]:16: error: operand must be a symbol with modifier (e.g. %call36) or an integer in the range [-524288, 524287]
 
 ## simm20_lu32id
 lu32i.d $a0, 0x80000
diff --git a/llvm/test/MC/LoongArch/Macros/macros-call.s b/llvm/test/MC/LoongArch/Macros/macros-call.s
new file mode 100644
index 000000000000..a648a3978038
--- /dev/null
+++ b/llvm/test/MC/LoongArch/Macros/macros-call.s
@@ -0,0 +1,9 @@
+# RUN: llvm-mc --triple=loongarch64 %s | FileCheck %s
+
+call36 sym_call
+# CHECK:      pcaddu18i $ra, %call36(sym_call)
+# CHECK-NEXT: jirl $ra, $ra, 0
+
+tail36 $t0, sym_tail
+# CHECK:      pcaddu18i $t0, %call36(sym_tail)
+# CHECK-NEXT: jr $t0
diff --git a/llvm/test/MC/LoongArch/Relocations/relocations.s b/llvm/test/MC/LoongArch/Relocations/relocations.s
index 042cc93470a1..bec71e103893 100644
--- a/llvm/test/MC/LoongArch/Relocations/relocations.s
+++ b/llvm/test/MC/LoongArch/Relocations/relocations.s
@@ -218,3 +218,8 @@ lu12i.w $t1, %gd_hi20(foo)
 # RELOC: R_LARCH_TLS_GD_HI20 foo 0x0
 # INSTR: lu12i.w $t1, %gd_hi20(foo)
 # FIXUP: fixup A - offset: 0, value: %gd_hi20(foo), kind: FK_NONE
+
+pcaddu18i $t1, %call36(foo)
+# RELOC: R_LARCH_CALL36 foo 0x0
+# INSTR: pcaddu18i $t1, %call36(foo)
+# FIXUP: fixup A - offset: 0, value: %call36(foo), kind: FK_NONE
-- 
Gitee


From d59688f326d8f915ffc5db80b40c9b99d9f95470 Mon Sep 17 00:00:00 2001
From: wanglei <wanglei@loongson.cn>
Date: Tue, 2 Jan 2024 10:57:40 +0800
Subject: [PATCH 07/23] [LoongArch] Pre-commit test for #76555. NFC

(cherry picked from commit 3d6fc35b9071009c5ef37f879a12982c6a54db60)
---
 .../LoongArch/psabi-restricted-scheduling.ll  | 172 ++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll

diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
new file mode 100644
index 000000000000..150a935d7bf8
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc --mtriple=loongarch64 --code-model=medium --post-RA-scheduler=0 < %s \
+; RUN:     | FileCheck %s --check-prefix=MEDIUM_NO_SCH
+; RUN: llc --mtriple=loongarch64 --code-model=medium --post-RA-scheduler=1 < %s \
+; RUN:     | FileCheck %s --check-prefix=MEDIUM_SCH
+; RUN: llc --mtriple=loongarch64 --code-model=large --post-RA-scheduler=0 < %s \
+; RUN:     | FileCheck %s --check-prefix=LARGE_NO_SCH
+; RUN: llc --mtriple=loongarch64 --code-model=large --post-RA-scheduler=1 < %s \
+; RUN:     | FileCheck %s --check-prefix=LARGE_SCH
+
+;; FIXME: According to the description of the psABI v2.30, the code sequences
+;; of `PseudoLA*_LARGE` instruction and Medium code model's function call must
+;; be adjacent.
+
+@g = dso_local global i64 zeroinitializer, align 4
+@G = global i64 zeroinitializer, align 4
+@gd = external thread_local global i64
+@ld = external thread_local(localdynamic) global i64
+@ie = external thread_local(initialexec) global i64
+
+declare ptr @bar(i64)
+
+define void @foo() nounwind {
+; MEDIUM_NO_SCH-LABEL: foo:
+; MEDIUM_NO_SCH:       # %bb.0:
+; MEDIUM_NO_SCH-NEXT:    addi.d $sp, $sp, -16
+; MEDIUM_NO_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
+; MEDIUM_NO_SCH-NEXT:    ld.d $a0, $a0, %got_pc_lo12(G)
+; MEDIUM_NO_SCH-NEXT:    ld.d $a0, $a0, 0
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
+; MEDIUM_NO_SCH-NEXT:    addi.d $a0, $a0, %pc_lo12(g)
+; MEDIUM_NO_SCH-NEXT:    ld.d $a0, $a0, 0
+; MEDIUM_NO_SCH-NEXT:    ori $a0, $zero, 1
+; MEDIUM_NO_SCH-NEXT:    pcaddu18i $ra, %call36(bar)
+; MEDIUM_NO_SCH-NEXT:    jirl $ra, $ra, 0
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
+; MEDIUM_NO_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(gd)
+; MEDIUM_NO_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ld)
+; MEDIUM_NO_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(ld)
+; MEDIUM_NO_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
+; MEDIUM_NO_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(ie)
+; MEDIUM_NO_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; MEDIUM_NO_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; MEDIUM_NO_SCH-NEXT:    addi.d $sp, $sp, 16
+; MEDIUM_NO_SCH-NEXT:    ret
+;
+; MEDIUM_SCH-LABEL: foo:
+; MEDIUM_SCH:       # %bb.0:
+; MEDIUM_SCH-NEXT:    addi.d $sp, $sp, -16
+; MEDIUM_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; MEDIUM_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
+; MEDIUM_SCH-NEXT:    pcaddu18i $ra, %call36(bar)
+; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, %got_pc_lo12(G)
+; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, 0
+; MEDIUM_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
+; MEDIUM_SCH-NEXT:    addi.d $a0, $a0, %pc_lo12(g)
+; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, 0
+; MEDIUM_SCH-NEXT:    ori $a0, $zero, 1
+; MEDIUM_SCH-NEXT:    jirl $ra, $ra, 0
+; MEDIUM_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
+; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(gd)
+; MEDIUM_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; MEDIUM_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ld)
+; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(ld)
+; MEDIUM_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; MEDIUM_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
+; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(ie)
+; MEDIUM_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; MEDIUM_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; MEDIUM_SCH-NEXT:    addi.d $sp, $sp, 16
+; MEDIUM_SCH-NEXT:    ret
+;
+; LARGE_NO_SCH-LABEL: foo:
+; LARGE_NO_SCH:       # %bb.0:
+; LARGE_NO_SCH-NEXT:    addi.d $sp, $sp, -16
+; LARGE_NO_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
+; LARGE_NO_SCH-NEXT:    addi.d $a1, $zero, %got_pc_lo12(G)
+; LARGE_NO_SCH-NEXT:    lu32i.d $a1, %got64_pc_lo20(G)
+; LARGE_NO_SCH-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(G)
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a1, $a0
+; LARGE_NO_SCH-NEXT:    ld.d $a0, $a0, 0
+; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
+; LARGE_NO_SCH-NEXT:    addi.d $a1, $zero, %pc_lo12(g)
+; LARGE_NO_SCH-NEXT:    lu32i.d $a1, %pc64_lo20(g)
+; LARGE_NO_SCH-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g)
+; LARGE_NO_SCH-NEXT:    add.d $a0, $a1, $a0
+; LARGE_NO_SCH-NEXT:    ld.d $a0, $a0, 0
+; LARGE_NO_SCH-NEXT:    ori $a0, $zero, 1
+; LARGE_NO_SCH-NEXT:    pcalau12i $a1, %got_pc_hi20(bar)
+; LARGE_NO_SCH-NEXT:    addi.d $ra, $zero, %got_pc_lo12(bar)
+; LARGE_NO_SCH-NEXT:    lu32i.d $ra, %got64_pc_lo20(bar)
+; LARGE_NO_SCH-NEXT:    lu52i.d $ra, $ra, %got64_pc_hi12(bar)
+; LARGE_NO_SCH-NEXT:    ldx.d $ra, $ra, $a1
+; LARGE_NO_SCH-NEXT:    jirl $ra, $ra, 0
+; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
+; LARGE_NO_SCH-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(gd)
+; LARGE_NO_SCH-NEXT:    lu32i.d $a1, %ie64_pc_lo20(gd)
+; LARGE_NO_SCH-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(gd)
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a1, $a0
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ld)
+; LARGE_NO_SCH-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ld)
+; LARGE_NO_SCH-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ld)
+; LARGE_NO_SCH-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ld)
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a1, $a0
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
+; LARGE_NO_SCH-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ie)
+; LARGE_NO_SCH-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ie)
+; LARGE_NO_SCH-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a1, $a0
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; LARGE_NO_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LARGE_NO_SCH-NEXT:    addi.d $sp, $sp, 16
+; LARGE_NO_SCH-NEXT:    ret
+;
+; LARGE_SCH-LABEL: foo:
+; LARGE_SCH:       # %bb.0:
+; LARGE_SCH-NEXT:    addi.d $sp, $sp, -16
+; LARGE_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LARGE_SCH-NEXT:    addi.d $a1, $zero, %got_pc_lo12(G)
+; LARGE_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
+; LARGE_SCH-NEXT:    addi.d $ra, $zero, %got_pc_lo12(bar)
+; LARGE_SCH-NEXT:    lu32i.d $a1, %got64_pc_lo20(G)
+; LARGE_SCH-NEXT:    lu32i.d $ra, %got64_pc_lo20(bar)
+; LARGE_SCH-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(G)
+; LARGE_SCH-NEXT:    lu52i.d $ra, $ra, %got64_pc_hi12(bar)
+; LARGE_SCH-NEXT:    ldx.d $a0, $a1, $a0
+; LARGE_SCH-NEXT:    addi.d $a1, $zero, %pc_lo12(g)
+; LARGE_SCH-NEXT:    lu32i.d $a1, %pc64_lo20(g)
+; LARGE_SCH-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g)
+; LARGE_SCH-NEXT:    ld.d $a0, $a0, 0
+; LARGE_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
+; LARGE_SCH-NEXT:    add.d $a0, $a1, $a0
+; LARGE_SCH-NEXT:    pcalau12i $a1, %got_pc_hi20(bar)
+; LARGE_SCH-NEXT:    ld.d $a0, $a0, 0
+; LARGE_SCH-NEXT:    ldx.d $ra, $ra, $a1
+; LARGE_SCH-NEXT:    ori $a0, $zero, 1
+; LARGE_SCH-NEXT:    jirl $ra, $ra, 0
+; LARGE_SCH-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(gd)
+; LARGE_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
+; LARGE_SCH-NEXT:    lu32i.d $a1, %ie64_pc_lo20(gd)
+; LARGE_SCH-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(gd)
+; LARGE_SCH-NEXT:    ldx.d $a0, $a1, $a0
+; LARGE_SCH-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ld)
+; LARGE_SCH-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ld)
+; LARGE_SCH-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ld)
+; LARGE_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; LARGE_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ld)
+; LARGE_SCH-NEXT:    ldx.d $a0, $a1, $a0
+; LARGE_SCH-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ie)
+; LARGE_SCH-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ie)
+; LARGE_SCH-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
+; LARGE_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; LARGE_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
+; LARGE_SCH-NEXT:    ldx.d $a0, $a1, $a0
+; LARGE_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; LARGE_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LARGE_SCH-NEXT:    addi.d $sp, $sp, 16
+; LARGE_SCH-NEXT:    ret
+  %V = load volatile i64, ptr @G
+  %v = load volatile i64, ptr @g
+  call void @bar(i64 1)
+  %v_gd = load volatile i64, ptr @gd
+  %v_ld = load volatile i64, ptr @ld
+  %v_ie = load volatile i64, ptr @ie
+  ret void
+}
-- 
Gitee


From 1248440ab618fcffada7fa29eed71bc04945c3ec Mon Sep 17 00:00:00 2001
From: Weining Lu <luweining@loongson.cn>
Date: Tue, 25 Jun 2024 09:52:17 +0800
Subject: [PATCH 08/23] [LoongArch][test] Remove the FIXME in
 psabi-restricted-scheduling.ll which has been addressed by #76555

(cherry picked from commit 7ea63b9db4198688873036f3b0b81f9124076f7a)
---
 llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
index 150a935d7bf8..a515939b9c2b 100644
--- a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
+++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
@@ -8,10 +8,6 @@
 ; RUN: llc --mtriple=loongarch64 --code-model=large --post-RA-scheduler=1 < %s \
 ; RUN:     | FileCheck %s --check-prefix=LARGE_SCH
 
-;; FIXME: According to the description of the psABI v2.30, the code sequences
-;; of `PseudoLA*_LARGE` instruction and Medium code model's function call must
-;; be adjacent.
-
 @g = dso_local global i64 zeroinitializer, align 4
 @G = global i64 zeroinitializer, align 4
 @gd = external thread_local global i64
-- 
Gitee


From 0e86ae628414dac6d7ef2eaccc8655d790595f9f Mon Sep 17 00:00:00 2001
From: wanglei <wanglei@loongson.cn>
Date: Tue, 2 Jan 2024 10:57:15 +0800
Subject: [PATCH 09/23] [LoongArch] Reimplement the expansion of
 PseudoLA*_LARGE instructions (#76555)

According to the psABI v2.30 specification
(https://github.com/loongson/la-abi-specs/releases/tag/v2.30), move the
expansion of the relevant pseudo-instructions from the
`LoongArchPreRAExpandPseudo` pass to the `LoongArchExpandPseudo` pass. This
ensures that the code sequences of `PseudoLA*_LARGE` instructions and of the
medium code model's function calls stay adjacent and are not reordered by
instruction scheduling.

(cherry picked from commit c56a5e895a96fec4292e9333d998cfa88770432a)
---
 .../LoongArch/LoongArchExpandPseudoInsts.cpp  | 519 +++++++++---------
 .../LoongArch/LoongArchISelLowering.cpp       |  24 +-
 .../Target/LoongArch/LoongArchISelLowering.h  |   4 +
 .../Target/LoongArch/LoongArchInstrInfo.td    |  83 ++-
 llvm/test/CodeGen/LoongArch/code-models.ll    |  36 +-
 llvm/test/CodeGen/LoongArch/expand-call.ll    |   2 +-
 llvm/test/CodeGen/LoongArch/global-address.ll |  32 +-
 .../LoongArch/psabi-restricted-scheduling.ll  | 102 ++--
 llvm/test/CodeGen/LoongArch/tls-models.ll     |  68 +--
 9 files changed, 487 insertions(+), 383 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
index 8eda2dcc1633..f977f176066a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
@@ -62,43 +62,24 @@ private:
                                MachineBasicBlock::iterator &NextMBBI,
                                unsigned FlagsHi, unsigned SecondOpcode,
                                unsigned FlagsLo);
-  bool expandLargeAddressLoad(MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI,
-                              unsigned LastOpcode, unsigned IdentifyingMO);
-  bool expandLargeAddressLoad(MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI,
-                              unsigned LastOpcode, unsigned IdentifyingMO,
-                              const MachineOperand &Symbol, Register DestReg,
-                              bool EraseFromParent);
   bool expandLoadAddressPcrel(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI,
-                              bool Large = false);
+                              MachineBasicBlock::iterator &NextMBBI);
   bool expandLoadAddressGot(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MBBI,
-                            MachineBasicBlock::iterator &NextMBBI,
-                            bool Large = false);
+                            MachineBasicBlock::iterator &NextMBBI);
   bool expandLoadAddressTLSLE(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
                               MachineBasicBlock::iterator &NextMBBI);
   bool expandLoadAddressTLSIE(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI,
-                              bool Large = false);
+                              MachineBasicBlock::iterator &NextMBBI);
   bool expandLoadAddressTLSLD(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI,
-                              bool Large = false);
+                              MachineBasicBlock::iterator &NextMBBI);
   bool expandLoadAddressTLSGD(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI,
-                              bool Large = false);
-  bool expandFunctionCALL(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MBBI,
-                          MachineBasicBlock::iterator &NextMBBI,
-                          bool IsTailCall);
+                              MachineBasicBlock::iterator &NextMBBI);
 };
 
 char LoongArchPreRAExpandPseudo::ID = 0;
@@ -131,30 +112,16 @@ bool LoongArchPreRAExpandPseudo::expandMI(
   switch (MBBI->getOpcode()) {
   case LoongArch::PseudoLA_PCREL:
     return expandLoadAddressPcrel(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoLA_PCREL_LARGE:
-    return expandLoadAddressPcrel(MBB, MBBI, NextMBBI, /*Large=*/true);
   case LoongArch::PseudoLA_GOT:
     return expandLoadAddressGot(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoLA_GOT_LARGE:
-    return expandLoadAddressGot(MBB, MBBI, NextMBBI, /*Large=*/true);
   case LoongArch::PseudoLA_TLS_LE:
     return expandLoadAddressTLSLE(MBB, MBBI, NextMBBI);
   case LoongArch::PseudoLA_TLS_IE:
     return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoLA_TLS_IE_LARGE:
-    return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI, /*Large=*/true);
   case LoongArch::PseudoLA_TLS_LD:
     return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoLA_TLS_LD_LARGE:
-    return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI, /*Large=*/true);
   case LoongArch::PseudoLA_TLS_GD:
     return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoLA_TLS_GD_LARGE:
-    return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI, /*Large=*/true);
-  case LoongArch::PseudoCALL:
-    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false);
-  case LoongArch::PseudoTAIL:
-    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true);
   }
   return false;
 }
@@ -187,118 +154,9 @@ bool LoongArchPreRAExpandPseudo::expandPcalau12iInstPair(
   return true;
 }
 
-bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
-    unsigned IdentifyingMO) {
-  MachineInstr &MI = *MBBI;
-  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO,
-                                MI.getOperand(2), MI.getOperand(0).getReg(),
-                                true);
-}
-
-bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
-    unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg,
-    bool EraseFromParent) {
-  // Code Sequence:
-  //
-  // Part1: pcalau12i  $scratch, %MO1(sym)
-  // Part0: addi.d     $dest, $zero, %MO0(sym)
-  // Part2: lu32i.d    $dest, %MO2(sym)
-  // Part3: lu52i.d    $dest, $dest, %MO3(sym)
-  // Fin:   LastOpcode $dest, $dest, $scratch
-
-  unsigned MO0, MO1, MO2, MO3;
-  switch (IdentifyingMO) {
-  default:
-    llvm_unreachable("unsupported identifying MO");
-  case LoongArchII::MO_PCREL_LO:
-    MO0 = IdentifyingMO;
-    MO1 = LoongArchII::MO_PCREL_HI;
-    MO2 = LoongArchII::MO_PCREL64_LO;
-    MO3 = LoongArchII::MO_PCREL64_HI;
-    break;
-  case LoongArchII::MO_GOT_PC_HI:
-  case LoongArchII::MO_LD_PC_HI:
-  case LoongArchII::MO_GD_PC_HI:
-    // These cases relocate just like the GOT case, except for Part1.
-    MO0 = LoongArchII::MO_GOT_PC_LO;
-    MO1 = IdentifyingMO;
-    MO2 = LoongArchII::MO_GOT_PC64_LO;
-    MO3 = LoongArchII::MO_GOT_PC64_HI;
-    break;
-  case LoongArchII::MO_IE_PC_LO:
-    MO0 = IdentifyingMO;
-    MO1 = LoongArchII::MO_IE_PC_HI;
-    MO2 = LoongArchII::MO_IE_PC64_LO;
-    MO3 = LoongArchII::MO_IE_PC64_HI;
-    break;
-  }
-
-  MachineFunction *MF = MBB.getParent();
-  MachineInstr &MI = *MBBI;
-  DebugLoc DL = MI.getDebugLoc();
-
-  assert(MF->getSubtarget<LoongArchSubtarget>().is64Bit() &&
-         "Large code model requires LA64");
-
-  Register TmpPart1 =
-      MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
-  Register TmpPart0 =
-      DestReg.isVirtual()
-          ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
-          : DestReg;
-  Register TmpParts02 =
-      DestReg.isVirtual()
-          ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
-          : DestReg;
-  Register TmpParts023 =
-      DestReg.isVirtual()
-          ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
-          : DestReg;
-
-  auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), TmpPart1);
-  auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), TmpPart0)
-                   .addReg(LoongArch::R0);
-  auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), TmpParts02)
-                   // "rj" is needed due to InstrInfo pattern requirement.
-                   .addReg(TmpPart0, RegState::Kill);
-  auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), TmpParts023)
-                   .addReg(TmpParts02, RegState::Kill);
-  BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg)
-      .addReg(TmpParts023)
-      .addReg(TmpPart1, RegState::Kill);
-
-  if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) {
-    const char *SymName = Symbol.getSymbolName();
-    Part0.addExternalSymbol(SymName, MO0);
-    Part1.addExternalSymbol(SymName, MO1);
-    Part2.addExternalSymbol(SymName, MO2);
-    Part3.addExternalSymbol(SymName, MO3);
-  } else {
-    Part0.addDisp(Symbol, 0, MO0);
-    Part1.addDisp(Symbol, 0, MO1);
-    Part2.addDisp(Symbol, 0, MO2);
-    Part3.addDisp(Symbol, 0, MO3);
-  }
-
-  if (EraseFromParent)
-    MI.eraseFromParent();
-
-  return true;
-}
-
 bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, bool Large) {
-  if (Large)
-    // Emit the 5-insn large address load sequence with the `%pc` family of
-    // relocs.
-    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
-                                  LoongArchII::MO_PCREL_LO);
-
+    MachineBasicBlock::iterator &NextMBBI) {
   // Code Sequence:
   // pcalau12i $rd, %pc_hi20(sym)
   // addi.w/d $rd, $rd, %pc_lo12(sym)
@@ -311,13 +169,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel(
 
 bool LoongArchPreRAExpandPseudo::expandLoadAddressGot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, bool Large) {
-  if (Large)
-    // Emit the 5-insn large address load sequence with the `%got_pc` family
-    // of relocs, loading the result from GOT with `ldx.d` in the end.
-    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
-                                  LoongArchII::MO_GOT_PC_HI);
-
+    MachineBasicBlock::iterator &NextMBBI) {
   // Code Sequence:
   // pcalau12i $rd, %got_pc_hi20(sym)
   // ld.w/d $rd, $rd, %got_pc_lo12(sym)
@@ -378,13 +230,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLE(
 
 bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, bool Large) {
-  if (Large)
-    // Emit the 5-insn large address load sequence with the `%ie_pc` family
-    // of relocs, loading the result with `ldx.d` in the end.
-    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
-                                  LoongArchII::MO_IE_PC_LO);
-
+    MachineBasicBlock::iterator &NextMBBI) {
   // Code Sequence:
   // pcalau12i $rd, %ie_pc_hi20(sym)
   // ld.w/d $rd, $rd, %ie_pc_lo12(sym)
@@ -397,13 +243,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE(
 
 bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, bool Large) {
-  if (Large)
-    // Emit the 5-insn large address load sequence with the `%got_pc` family
-    // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`.
-    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
-                                  LoongArchII::MO_LD_PC_HI);
-
+    MachineBasicBlock::iterator &NextMBBI) {
   // Code Sequence:
   // pcalau12i $rd, %ld_pc_hi20(sym)
   // addi.w/d $rd, $rd, %got_pc_lo12(sym)
@@ -416,13 +256,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD(
 
 bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, bool Large) {
-  if (Large)
-    // Emit the 5-insn large address load sequence with the `%got_pc` family
-    // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`.
-    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
-                                  LoongArchII::MO_GD_PC_HI);
-
+    MachineBasicBlock::iterator &NextMBBI) {
   // Code Sequence:
   // pcalau12i $rd, %gd_pc_hi20(sym)
   // addi.w/d $rd, $rd, %got_pc_lo12(sym)
@@ -433,85 +267,6 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD(
                                  SecondOpcode, LoongArchII::MO_GOT_PC_LO);
 }
 
-bool LoongArchPreRAExpandPseudo::expandFunctionCALL(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) {
-  MachineFunction *MF = MBB.getParent();
-  MachineInstr &MI = *MBBI;
-  DebugLoc DL = MI.getDebugLoc();
-  const MachineOperand &Func = MI.getOperand(0);
-  MachineInstrBuilder CALL;
-  unsigned Opcode;
-
-  switch (MF->getTarget().getCodeModel()) {
-  default:
-    report_fatal_error("Unsupported code model");
-    break;
-  case CodeModel::Small: {
-    // CALL:
-    // bl func
-    // TAIL:
-    // b func
-    Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL;
-    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func);
-    break;
-  }
-  case CodeModel::Medium: {
-    // CALL:
-    // pcaddu18i $ra, %call36(func)
-    // jirl      $ra, $ra, 0
-    // TAIL:
-    // pcaddu18i $scratch, %call36(func)
-    // jirl      $r0, $scratch, 0
-    Opcode =
-        IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
-    Register ScratchReg =
-        IsTailCall
-            ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
-            : LoongArch::R1;
-    MachineInstrBuilder MIB =
-        BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg);
-
-    CALL =
-        BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0);
-
-    if (Func.isSymbol())
-      MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36);
-    else
-      MIB.addDisp(Func, 0, LoongArchII::MO_CALL36);
-    break;
-  }
-  case CodeModel::Large: {
-    // Emit the 5-insn large address load sequence, either directly or
-    // indirectly in case of going through the GOT, then JIRL_TAIL or
-    // JIRL_CALL to $addr.
-    Opcode =
-        IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
-    Register AddrReg =
-        IsTailCall
-            ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
-            : LoongArch::R1;
-
-    bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal();
-    unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO;
-    unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D;
-    expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg,
-                           false);
-    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0);
-    break;
-  }
-  }
-
-  // Transfer implicit operands.
-  CALL.copyImplicitOps(MI);
-
-  // Transfer MI flags.
-  CALL.setMIFlags(MI.getFlags());
-
-  MI.eraseFromParent();
-  return true;
-}
-
 class LoongArchExpandPseudo : public MachineFunctionPass {
 public:
   const LoongArchInstrInfo *TII;
@@ -533,6 +288,35 @@ private:
                 MachineBasicBlock::iterator &NextMBBI);
   bool expandCopyCFR(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                      MachineBasicBlock::iterator &NextMBBI);
+  bool expandLargeAddressLoad(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MBBI,
+                              MachineBasicBlock::iterator &NextMBBI,
+                              unsigned LastOpcode, unsigned IdentifyingMO);
+  bool expandLargeAddressLoad(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MBBI,
+                              MachineBasicBlock::iterator &NextMBBI,
+                              unsigned LastOpcode, unsigned IdentifyingMO,
+                              const MachineOperand &Symbol, Register DestReg,
+                              bool EraseFromParent);
+  bool expandLoadAddressPcrelLarge(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   MachineBasicBlock::iterator &NextMBBI);
+  bool expandLoadAddressGotLarge(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 MachineBasicBlock::iterator &NextMBBI);
+  bool expandLoadAddressTLSIELarge(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   MachineBasicBlock::iterator &NextMBBI);
+  bool expandLoadAddressTLSLDLarge(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   MachineBasicBlock::iterator &NextMBBI);
+  bool expandLoadAddressTLSGDLarge(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   MachineBasicBlock::iterator &NextMBBI);
+  bool expandFunctionCALL(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI,
+                          MachineBasicBlock::iterator &NextMBBI,
+                          bool IsTailCall);
 };
 
 char LoongArchExpandPseudo::ID = 0;
@@ -567,6 +351,24 @@ bool LoongArchExpandPseudo::expandMI(MachineBasicBlock &MBB,
   switch (MBBI->getOpcode()) {
   case LoongArch::PseudoCopyCFR:
     return expandCopyCFR(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_PCREL_LARGE:
+    return expandLoadAddressPcrelLarge(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_GOT_LARGE:
+    return expandLoadAddressGotLarge(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_TLS_IE_LARGE:
+    return expandLoadAddressTLSIELarge(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_TLS_LD_LARGE:
+    return expandLoadAddressTLSLDLarge(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_TLS_GD_LARGE:
+    return expandLoadAddressTLSGDLarge(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoCALL:
+  case LoongArch::PseudoCALL_MEDIUM:
+  case LoongArch::PseudoCALL_LARGE:
+    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false);
+  case LoongArch::PseudoTAIL:
+  case LoongArch::PseudoTAIL_MEDIUM:
+  case LoongArch::PseudoTAIL_LARGE:
+    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true);
   }
 
   return false;
@@ -625,6 +427,213 @@ bool LoongArchExpandPseudo::expandCopyCFR(
   return true;
 }
 
+bool LoongArchExpandPseudo::expandLargeAddressLoad(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
+    unsigned IdentifyingMO) {
+  // Operand 0 is the destination and operand 2 the symbol; consume the pseudo.
+  const MachineOperand &Sym = MBBI->getOperand(2);
+  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO,
+                                Sym, MBBI->getOperand(0).getReg(), true);
+}
+
+bool LoongArchExpandPseudo::expandLargeAddressLoad(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
+    unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg,
+    bool EraseFromParent) {
+  // Emit, before MBBI, the large-code-model sequence (clobbering $t8):
+  //   Part1: pcalau12i  $dst, %MO1(sym)
+  //   Part0: addi.d     $t8, $zero, %MO0(sym)
+  //   Part2: lu32i.d    $t8, %MO2(sym)
+  //   Part3: lu52i.d    $t8, $t8, %MO3(sym)
+  //   Fin:   LastOpcode $dst, $t8, $dst
+  // Erases MI only if EraseFromParent; never touches NextMBBI; returns true.
+
+  unsigned MO0, MO1, MO2, MO3;
+  switch (IdentifyingMO) {
+  default:
+    llvm_unreachable("unsupported identifying MO");
+  case LoongArchII::MO_PCREL_LO:
+    MO0 = IdentifyingMO;
+    MO1 = LoongArchII::MO_PCREL_HI;
+    MO2 = LoongArchII::MO_PCREL64_LO;
+    MO3 = LoongArchII::MO_PCREL64_HI;
+    break;
+  case LoongArchII::MO_GOT_PC_HI:
+  case LoongArchII::MO_LD_PC_HI:
+  case LoongArchII::MO_GD_PC_HI:
+    // These cases relocate just like the GOT case, except for Part1.
+    MO0 = LoongArchII::MO_GOT_PC_LO;
+    MO1 = IdentifyingMO;
+    MO2 = LoongArchII::MO_GOT_PC64_LO;
+    MO3 = LoongArchII::MO_GOT_PC64_HI;
+    break;
+  case LoongArchII::MO_IE_PC_LO:
+    MO0 = IdentifyingMO;
+    MO1 = LoongArchII::MO_IE_PC_HI;
+    MO2 = LoongArchII::MO_IE_PC64_LO;
+    MO3 = LoongArchII::MO_IE_PC64_HI;
+    break;
+  }
+
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL = MI.getDebugLoc();
+  Register ScratchReg = LoongArch::R20; // $t8
+
+  // Query the function inside the assert: no unused local in NDEBUG builds.
+  assert(MBB.getParent()->getSubtarget<LoongArchSubtarget>().is64Bit() &&
+         "Large code model requires LA64");
+
+  auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), DestReg);
+  auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), ScratchReg)
+                   .addReg(LoongArch::R0);
+  auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), ScratchReg)
+                   // "rj" is needed due to InstrInfo pattern requirement.
+                   .addReg(ScratchReg);
+  auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), ScratchReg)
+                   .addReg(ScratchReg);
+  BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg)
+      .addReg(ScratchReg)
+      .addReg(DestReg);
+
+  if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) {
+    const char *SymName = Symbol.getSymbolName();
+    Part0.addExternalSymbol(SymName, MO0);
+    Part1.addExternalSymbol(SymName, MO1);
+    Part2.addExternalSymbol(SymName, MO2);
+    Part3.addExternalSymbol(SymName, MO3);
+  } else {
+    Part0.addDisp(Symbol, 0, MO0);
+    Part1.addDisp(Symbol, 0, MO1);
+    Part2.addDisp(Symbol, 0, MO2);
+    Part3.addDisp(Symbol, 0, MO3);
+  }
+
+  if (EraseFromParent)
+    MI.eraseFromParent();
+
+  return true;
+}
+
+bool LoongArchExpandPseudo::expandLoadAddressPcrelLarge(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI) {
+  // Large address load via the `%pc` reloc family, finished by `add.d`.
+  const unsigned FinalOpc = LoongArch::ADD_D;
+  const unsigned KeyMO = LoongArchII::MO_PCREL_LO;
+  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, FinalOpc, KeyMO);
+}
+
+bool LoongArchExpandPseudo::expandLoadAddressGotLarge(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI) {
+  // Large address load via the `%got_pc` relocs; `ldx.d` reads the GOT slot.
+  const unsigned FinalOpc = LoongArch::LDX_D;
+  const unsigned KeyMO = LoongArchII::MO_GOT_PC_HI;
+  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, FinalOpc, KeyMO);
+}
+
+bool LoongArchExpandPseudo::expandLoadAddressTLSIELarge(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI) {
+  // Large address load via the `%ie_pc` relocs; `ldx.d` loads the result.
+  const unsigned FinalOpc = LoongArch::LDX_D;
+  const unsigned KeyMO = LoongArchII::MO_IE_PC_LO;
+  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, FinalOpc, KeyMO);
+}
+
+bool LoongArchExpandPseudo::expandLoadAddressTLSLDLarge(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI) {
+  // `%got_pc` relocs, except `pcalau12i` takes `%ld_pc_hi20`; ends in `add.d`.
+  const unsigned FinalOpc = LoongArch::ADD_D;
+  const unsigned KeyMO = LoongArchII::MO_LD_PC_HI;
+  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, FinalOpc, KeyMO);
+}
+
+bool LoongArchExpandPseudo::expandLoadAddressTLSGDLarge(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI) {
+  // `%got_pc` relocs, except `pcalau12i` takes `%gd_pc_hi20`; ends in `add.d`.
+  const unsigned FinalOpc = LoongArch::ADD_D;
+  const unsigned KeyMO = LoongArchII::MO_GD_PC_HI;
+  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, FinalOpc, KeyMO);
+}
+
+bool LoongArchExpandPseudo::expandFunctionCALL(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) {
+  // Lower a direct call / tail-call pseudo to the sequence demanded by the
+  // target's code model, inserting before MBBI and then erasing the pseudo.
+  MachineInstr &Pseudo = *MBBI;
+  DebugLoc Loc = Pseudo.getDebugLoc();
+  const MachineOperand &Callee = Pseudo.getOperand(0);
+  const CodeModel::Model CM = MBB.getParent()->getTarget().getCodeModel();
+
+  // Register-indirect jump shared by the medium and large sequences.
+  const unsigned JirlOpc =
+      IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
+  MachineInstrBuilder Jump;
+
+  if (CM == CodeModel::Small) {
+    // CALL:
+    // bl func
+    // TAIL:
+    // b func
+    const unsigned BranchOpc =
+        IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL;
+    Jump = BuildMI(MBB, MBBI, Loc, TII->get(BranchOpc)).add(Callee);
+  } else if (CM == CodeModel::Medium) {
+    // CALL:
+    // pcaddu18i  $ra, %call36(func)
+    // jirl       $ra, $ra, 0
+    // TAIL:
+    // pcaddu18i  $t8, %call36(func)
+    // jr         $t8
+    const Register TargetReg = IsTailCall ? LoongArch::R20 : LoongArch::R1;
+
+    MachineInstrBuilder PcAdd =
+        BuildMI(MBB, MBBI, Loc, TII->get(LoongArch::PCADDU18I), TargetReg);
+    if (Callee.isSymbol())
+      PcAdd.addExternalSymbol(Callee.getSymbolName(), LoongArchII::MO_CALL36);
+    else
+      PcAdd.addDisp(Callee, 0, LoongArchII::MO_CALL36);
+
+    Jump =
+        BuildMI(MBB, MBBI, Loc, TII->get(JirlOpc)).addReg(TargetReg).addImm(0);
+  } else if (CM == CodeModel::Large) {
+    // Materialise the callee address with the 5-insn large address load
+    // sequence, going through the GOT for a global callee that is not
+    // DSO-local, then JIRL_TAIL or JIRL_CALL to that address.
+    const Register TargetReg = IsTailCall ? LoongArch::R19 : LoongArch::R1;
+    const bool ViaGOT = Callee.isGlobal() && !Callee.getGlobal()->isDSOLocal();
+    unsigned KeyMO = LoongArchII::MO_PCREL_LO;
+    unsigned LastOpc = LoongArch::ADD_D;
+    if (ViaGOT) {
+      KeyMO = LoongArchII::MO_GOT_PC_HI;
+      LastOpc = LoongArch::LDX_D;
+    }
+
+    expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpc, KeyMO, Callee,
+                           TargetReg, /*EraseFromParent=*/false);
+    Jump =
+        BuildMI(MBB, MBBI, Loc, TII->get(JirlOpc)).addReg(TargetReg).addImm(0);
+  } else {
+    // No call sequence is defined here for any other code model.
+    report_fatal_error("Unsupported code model");
+  }
+
+  // Carry the pseudo's implicit operands over to the final jump.
+  Jump.copyImplicitOps(Pseudo);
+
+  // Likewise for its MI flags.
+  Jump.setMIFlags(Pseudo.getFlags());
+
+  Pseudo.eraseFromParent();
+  return true;
+}
+
 } // end namespace
 
 INITIALIZE_PASS(LoongArchPreRAExpandPseudo, "loongarch-prera-expand-pseudo",
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 4fc2b4709840..df1b17649b7d 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -3389,8 +3389,12 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
 
     // TODO: Add more target-dependent nodes later.
     NODE_NAME_CASE(CALL)
+    NODE_NAME_CASE(CALL_MEDIUM)
+    NODE_NAME_CASE(CALL_LARGE)
     NODE_NAME_CASE(RET)
     NODE_NAME_CASE(TAIL)
+    NODE_NAME_CASE(TAIL_MEDIUM)
+    NODE_NAME_CASE(TAIL_LARGE)
     NODE_NAME_CASE(SLL_W)
     NODE_NAME_CASE(SRA_W)
     NODE_NAME_CASE(SRL_W)
@@ -4248,15 +4252,31 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // Emit the call.
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  unsigned Op;
+  switch (DAG.getTarget().getCodeModel()) {
+  default:
+    report_fatal_error("Unsupported code model");
+  case CodeModel::Small:
+    Op = IsTailCall ? LoongArchISD::TAIL : LoongArchISD::CALL;
+    break;
+  case CodeModel::Medium:
+    assert(Subtarget.is64Bit() && "Medium code model requires LA64");
+    Op = IsTailCall ? LoongArchISD::TAIL_MEDIUM : LoongArchISD::CALL_MEDIUM;
+    break;
+  case CodeModel::Large:
+    assert(Subtarget.is64Bit() && "Large code model requires LA64");
+    Op = IsTailCall ? LoongArchISD::TAIL_LARGE : LoongArchISD::CALL_LARGE;
+    break;
+  }
 
   if (IsTailCall) {
     MF.getFrameInfo().setHasTailCall();
-    SDValue Ret = DAG.getNode(LoongArchISD::TAIL, DL, NodeTys, Ops);
+    SDValue Ret = DAG.getNode(Op, DL, NodeTys, Ops);
     DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
     return Ret;
   }
 
-  Chain = DAG.getNode(LoongArchISD::CALL, DL, NodeTys, Ops);
+  Chain = DAG.getNode(Op, DL, NodeTys, Ops);
   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
   Glue = Chain.getValue(1);
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 2c9826a13237..a2ed149f4bb7 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -28,8 +28,12 @@ enum NodeType : unsigned {
 
   // TODO: add more LoongArchISDs
   CALL,
+  CALL_MEDIUM,
+  CALL_LARGE,
   RET,
   TAIL,
+  TAIL_MEDIUM,
+  TAIL_LARGE,
 
   // 32-bit shifts, directly matching the semantics of the named LoongArch
   // instructions.
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index 67de5f7afd78..ecd0c2b71b85 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -69,6 +69,18 @@ def loongarch_ret : SDNode<"LoongArchISD::RET", SDTNone,
 def loongarch_tail : SDNode<"LoongArchISD::TAIL", SDT_LoongArchCall,
                             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                              SDNPVariadic]>;
+def loongarch_call_medium : SDNode<"LoongArchISD::CALL_MEDIUM", SDT_LoongArchCall,
+                                   [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                                    SDNPVariadic]>;
+def loongarch_tail_medium : SDNode<"LoongArchISD::TAIL_MEDIUM", SDT_LoongArchCall,
+                                   [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                                    SDNPVariadic]>;
+def loongarch_call_large : SDNode<"LoongArchISD::CALL_LARGE", SDT_LoongArchCall,
+                                  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                                   SDNPVariadic]>;
+def loongarch_tail_large : SDNode<"LoongArchISD::TAIL_LARGE", SDT_LoongArchCall,
+                                  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                                   SDNPVariadic]>;
 def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>;
 def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>;
 def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>;
@@ -1327,16 +1339,43 @@ def : Pat<(brind GPR:$rj), (PseudoBRIND GPR:$rj, 0)>;
 def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)),
           (PseudoBRIND GPR:$rj, simm16_lsl2:$imm16)>;
 
+// Function call with 'Small' code model.
 let isCall = 1, Defs = [R1] in
 def PseudoCALL : Pseudo<(outs), (ins bare_symbol:$func)>;
 
 def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>;
 def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
 
+// Function call with 'Medium' code model; expanded to pcaddu18i + jirl.
+let isCall = 1, Defs = [R1, R20], Size = 8 in // Size covers both insns.
+def PseudoCALL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$func)>;
+
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_call_medium tglobaladdr:$func),
+          (PseudoCALL_MEDIUM tglobaladdr:$func)>;
+def : Pat<(loongarch_call_medium texternalsym:$func),
+          (PseudoCALL_MEDIUM texternalsym:$func)>;
+} // Predicates = [IsLA64]
+
+// Function call with 'Large' code model; 5-insn address load, then jirl.
+let isCall = 1, Defs = [R1, R20], Size = 24 in // $t8 (R20) is the scratch.
+def PseudoCALL_LARGE: Pseudo<(outs), (ins bare_symbol:$func)>;
+
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_call_large tglobaladdr:$func),
+          (PseudoCALL_LARGE tglobaladdr:$func)>;
+def : Pat<(loongarch_call_large texternalsym:$func),
+          (PseudoCALL_LARGE texternalsym:$func)>;
+} // Predicates = [IsLA64]
+
 let isCall = 1, Defs = [R1] in
 def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rj),
                                 [(loongarch_call GPR:$rj)]>,
                          PseudoInstExpansion<(JIRL R1, GPR:$rj, 0)>;
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_call_medium GPR:$rj), (PseudoCALLIndirect GPR:$rj)>;
+def : Pat<(loongarch_call_large GPR:$rj), (PseudoCALLIndirect GPR:$rj)>;
+}
 
 let isCall = 1, hasSideEffects = 0, mayStore = 0, mayLoad = 0, Defs = [R1] in
 def PseudoJIRL_CALL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>,
@@ -1347,6 +1386,7 @@ let isBarrier = 1, isReturn = 1, isTerminator = 1 in
 def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>,
                 PseudoInstExpansion<(JIRL R0, R1, 0)>;
 
+// Tail call with 'Small' code model.
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
 def PseudoTAIL : Pseudo<(outs), (ins bare_symbol:$dst)>;
 
@@ -1355,10 +1395,38 @@ def : Pat<(loongarch_tail (iPTR tglobaladdr:$dst)),
 def : Pat<(loongarch_tail (iPTR texternalsym:$dst)),
           (PseudoTAIL texternalsym:$dst)>;
 
+// Tail call with 'Medium' code model; expanded to pcaddu18i $t8 + jr $t8.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+    Uses = [R3], Defs = [R20], Size = 8 in // R20 is $t8, the jump scratch.
+def PseudoTAIL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$dst)>;
+
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_tail_medium (iPTR tglobaladdr:$dst)),
+          (PseudoTAIL_MEDIUM tglobaladdr:$dst)>;
+def : Pat<(loongarch_tail_medium (iPTR texternalsym:$dst)),
+          (PseudoTAIL_MEDIUM texternalsym:$dst)>;
+} // Predicates = [IsLA64]
+
+// Tail call with 'Large' code model; 5-insn address load, then jr.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+    Uses = [R3], Defs = [R19, R20], Size = 24 in // R19 = $t7, R20 = $t8.
+def PseudoTAIL_LARGE : Pseudo<(outs), (ins bare_symbol:$dst)>;
+
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_tail_large (iPTR tglobaladdr:$dst)),
+          (PseudoTAIL_LARGE tglobaladdr:$dst)>;
+def : Pat<(loongarch_tail_large (iPTR texternalsym:$dst)),
+          (PseudoTAIL_LARGE texternalsym:$dst)>;
+} // Predicates = [IsLA64]
+
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
 def PseudoTAILIndirect : Pseudo<(outs), (ins GPRT:$rj),
                                 [(loongarch_tail GPRT:$rj)]>,
                          PseudoInstExpansion<(JIRL R0, GPR:$rj, 0)>;
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_tail_medium GPR:$rj), (PseudoTAILIndirect GPR:$rj)>;
+def : Pat<(loongarch_tail_large GPR:$rj), (PseudoTAILIndirect GPR:$rj)>;
+}
 
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
     hasSideEffects = 0, mayStore = 0, mayLoad = 0, Uses = [R3] in
@@ -1396,6 +1464,7 @@ def PseudoLA_ABS_LARGE : Pseudo<(outs GPR:$dst),
                                 "la.abs", "$dst, $src">;
 def PseudoLA_PCREL : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
                             "la.pcrel", "$dst, $src">;
+let Defs = [R20], Size = 20 in
 def PseudoLA_PCREL_LARGE : Pseudo<(outs GPR:$dst),
                                   (ins GPR:$tmp, bare_symbol:$src), [],
                                   "la.pcrel", "$dst, $tmp, $src">,
@@ -1407,28 +1476,30 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
     isAsmParserOnly = 1 in {
 def PseudoLA_GOT : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
                           "la.got", "$dst, $src">;
+def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+                             "la.tls.ie", "$dst, $src">;
+def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+                             "la.tls.ld", "$dst, $src">;
+def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+                             "la.tls.gd", "$dst, $src">;
+let Defs = [R20], Size = 20 in {
 def PseudoLA_GOT_LARGE : Pseudo<(outs GPR:$dst),
                                 (ins GPR:$tmp, bare_symbol:$src), [],
                                 "la.got", "$dst, $tmp, $src">,
                          Requires<[IsLA64]>;
-def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
-                             "la.tls.ie", "$dst, $src">;
 def PseudoLA_TLS_IE_LARGE : Pseudo<(outs GPR:$dst),
                                    (ins GPR:$tmp, bare_symbol:$src), [],
                                    "la.tls.ie", "$dst, $tmp, $src">,
                             Requires<[IsLA64]>;
-def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
-                             "la.tls.ld", "$dst, $src">;
 def PseudoLA_TLS_LD_LARGE : Pseudo<(outs GPR:$dst),
                                    (ins GPR:$tmp, bare_symbol:$src), [],
                                    "la.tls.ld", "$dst, $tmp, $src">,
                             Requires<[IsLA64]>;
-def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
-                             "la.tls.gd", "$dst, $src">;
 def PseudoLA_TLS_GD_LARGE : Pseudo<(outs GPR:$dst),
                                    (ins GPR:$tmp, bare_symbol:$src), [],
                                    "la.tls.gd", "$dst, $tmp, $src">,
                             Requires<[IsLA64]>;
+} // Defs = [R20], Size = 20
 }
 
 // Load address inst alias: "la", "la.global" and "la.local".
diff --git a/llvm/test/CodeGen/LoongArch/code-models.ll b/llvm/test/CodeGen/LoongArch/code-models.ll
index 7c6f46d5e926..f93c31670928 100644
--- a/llvm/test/CodeGen/LoongArch/code-models.ll
+++ b/llvm/test/CodeGen/LoongArch/code-models.ll
@@ -33,11 +33,11 @@ define i32 @call_globaladdress(i32 %a) nounwind {
 ; LARGE:       # %bb.0:
 ; LARGE-NEXT:    addi.d $sp, $sp, -16
 ; LARGE-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LARGE-NEXT:    pcalau12i $a1, %got_pc_hi20(callee)
-; LARGE-NEXT:    addi.d $ra, $zero, %got_pc_lo12(callee)
-; LARGE-NEXT:    lu32i.d $ra, %got64_pc_lo20(callee)
-; LARGE-NEXT:    lu52i.d $ra, $ra, %got64_pc_hi12(callee)
-; LARGE-NEXT:    ldx.d $ra, $ra, $a1
+; LARGE-NEXT:    pcalau12i $ra, %got_pc_hi20(callee)
+; LARGE-NEXT:    addi.d $t8, $zero, %got_pc_lo12(callee)
+; LARGE-NEXT:    lu32i.d $t8, %got64_pc_lo20(callee)
+; LARGE-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(callee)
+; LARGE-NEXT:    ldx.d $ra, $t8, $ra
 ; LARGE-NEXT:    jirl $ra, $ra, 0
 ; LARGE-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LARGE-NEXT:    addi.d $sp, $sp, 16
@@ -82,11 +82,11 @@ define void @call_external_sym(ptr %dst) {
 ; LARGE-NEXT:    .cfi_offset 1, -8
 ; LARGE-NEXT:    ori $a2, $zero, 1000
 ; LARGE-NEXT:    move $a1, $zero
-; LARGE-NEXT:    pcalau12i $a3, %pc_hi20(memset)
-; LARGE-NEXT:    addi.d $ra, $zero, %pc_lo12(memset)
-; LARGE-NEXT:    lu32i.d $ra, %pc64_lo20(memset)
-; LARGE-NEXT:    lu52i.d $ra, $ra, %pc64_hi12(memset)
-; LARGE-NEXT:    add.d $ra, $ra, $a3
+; LARGE-NEXT:    pcalau12i $ra, %pc_hi20(memset)
+; LARGE-NEXT:    addi.d $t8, $zero, %pc_lo12(memset)
+; LARGE-NEXT:    lu32i.d $t8, %pc64_lo20(memset)
+; LARGE-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(memset)
+; LARGE-NEXT:    add.d $ra, $t8, $ra
 ; LARGE-NEXT:    jirl $ra, $ra, 0
 ; LARGE-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LARGE-NEXT:    addi.d $sp, $sp, 16
@@ -105,17 +105,17 @@ define i32 @caller_tail(i32 %i) nounwind {
 ;
 ; MEDIUM-LABEL: caller_tail:
 ; MEDIUM:       # %bb.0: # %entry
-; MEDIUM-NEXT:    pcaddu18i $a1, %call36(callee_tail)
-; MEDIUM-NEXT:    jr $a1
+; MEDIUM-NEXT:    pcaddu18i $t8, %call36(callee_tail)
+; MEDIUM-NEXT:    jr $t8
 ;
 ; LARGE-LABEL: caller_tail:
 ; LARGE:       # %bb.0: # %entry
-; LARGE-NEXT:    pcalau12i $a1, %got_pc_hi20(callee_tail)
-; LARGE-NEXT:    addi.d $a2, $zero, %got_pc_lo12(callee_tail)
-; LARGE-NEXT:    lu32i.d $a2, %got64_pc_lo20(callee_tail)
-; LARGE-NEXT:    lu52i.d $a2, $a2, %got64_pc_hi12(callee_tail)
-; LARGE-NEXT:    ldx.d $a1, $a2, $a1
-; LARGE-NEXT:    jr $a1
+; LARGE-NEXT:    pcalau12i $t7, %got_pc_hi20(callee_tail)
+; LARGE-NEXT:    addi.d $t8, $zero, %got_pc_lo12(callee_tail)
+; LARGE-NEXT:    lu32i.d $t8, %got64_pc_lo20(callee_tail)
+; LARGE-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(callee_tail)
+; LARGE-NEXT:    ldx.d $t7, $t8, $t7
+; LARGE-NEXT:    jr $t7
 entry:
   %r = tail call i32 @callee_tail(i32 %i)
   ret i32 %r
diff --git a/llvm/test/CodeGen/LoongArch/expand-call.ll b/llvm/test/CodeGen/LoongArch/expand-call.ll
index 86bf4292665b..e0d179f92de6 100644
--- a/llvm/test/CodeGen/LoongArch/expand-call.ll
+++ b/llvm/test/CodeGen/LoongArch/expand-call.ll
@@ -1,6 +1,6 @@
 ; RUN: llc --mtriple=loongarch64 --stop-before loongarch-prera-expand-pseudo \
 ; RUN:     --verify-machineinstrs < %s | FileCheck %s --check-prefix=NOEXPAND
-; RUN: llc --mtriple=loongarch64 --stop-after loongarch-prera-expand-pseudo \
+; RUN: llc --mtriple=loongarch64 --stop-before machine-opt-remark-emitter \
 ; RUN:     --verify-machineinstrs < %s | FileCheck %s --check-prefix=EXPAND
 
 declare void @callee()
diff --git a/llvm/test/CodeGen/LoongArch/global-address.ll b/llvm/test/CodeGen/LoongArch/global-address.ll
index a8f0ef648aa7..d32a17f488b1 100644
--- a/llvm/test/CodeGen/LoongArch/global-address.ll
+++ b/llvm/test/CodeGen/LoongArch/global-address.ll
@@ -53,32 +53,32 @@ define void @foo() nounwind {
 ; LA64LARGENOPIC-LABEL: foo:
 ; LA64LARGENOPIC:       # %bb.0:
 ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
-; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %got_pc_lo12(G)
-; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %got64_pc_lo20(G)
-; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(G)
-; LA64LARGENOPIC-NEXT:    ldx.d $a0, $a1, $a0
+; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %got_pc_lo12(G)
+; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %got64_pc_lo20(G)
+; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(G)
+; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
 ; LA64LARGENOPIC-NEXT:    ld.w $a0, $a0, 0
 ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %pc_hi20(g)
-; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %pc_lo12(g)
-; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %pc64_lo20(g)
-; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g)
-; LA64LARGENOPIC-NEXT:    add.d $a0, $a1, $a0
+; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %pc_lo12(g)
+; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %pc64_lo20(g)
+; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(g)
+; LA64LARGENOPIC-NEXT:    add.d $a0, $t8, $a0
 ; LA64LARGENOPIC-NEXT:    ld.w $a0, $a0, 0
 ; LA64LARGENOPIC-NEXT:    ret
 ;
 ; LA64LARGEPIC-LABEL: foo:
 ; LA64LARGEPIC:       # %bb.0:
 ; LA64LARGEPIC-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
-; LA64LARGEPIC-NEXT:    addi.d $a1, $zero, %got_pc_lo12(G)
-; LA64LARGEPIC-NEXT:    lu32i.d $a1, %got64_pc_lo20(G)
-; LA64LARGEPIC-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(G)
-; LA64LARGEPIC-NEXT:    ldx.d $a0, $a1, $a0
+; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %got_pc_lo12(G)
+; LA64LARGEPIC-NEXT:    lu32i.d $t8, %got64_pc_lo20(G)
+; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(G)
+; LA64LARGEPIC-NEXT:    ldx.d $a0, $t8, $a0
 ; LA64LARGEPIC-NEXT:    ld.w $a0, $a0, 0
 ; LA64LARGEPIC-NEXT:    pcalau12i $a0, %pc_hi20(.Lg$local)
-; LA64LARGEPIC-NEXT:    addi.d $a1, $zero, %pc_lo12(.Lg$local)
-; LA64LARGEPIC-NEXT:    lu32i.d $a1, %pc64_lo20(.Lg$local)
-; LA64LARGEPIC-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(.Lg$local)
-; LA64LARGEPIC-NEXT:    add.d $a0, $a1, $a0
+; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %pc_lo12(.Lg$local)
+; LA64LARGEPIC-NEXT:    lu32i.d $t8, %pc64_lo20(.Lg$local)
+; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(.Lg$local)
+; LA64LARGEPIC-NEXT:    add.d $a0, $t8, $a0
 ; LA64LARGEPIC-NEXT:    ld.w $a0, $a0, 0
 ; LA64LARGEPIC-NEXT:    ret
   %V = load volatile i32, ptr @G
diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
index a515939b9c2b..474436a0126b 100644
--- a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
+++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
@@ -48,13 +48,13 @@ define void @foo() nounwind {
 ; MEDIUM_SCH-NEXT:    addi.d $sp, $sp, -16
 ; MEDIUM_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
 ; MEDIUM_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
-; MEDIUM_SCH-NEXT:    pcaddu18i $ra, %call36(bar)
 ; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, %got_pc_lo12(G)
 ; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, 0
 ; MEDIUM_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
 ; MEDIUM_SCH-NEXT:    addi.d $a0, $a0, %pc_lo12(g)
 ; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, 0
 ; MEDIUM_SCH-NEXT:    ori $a0, $zero, 1
+; MEDIUM_SCH-NEXT:    pcaddu18i $ra, %call36(bar)
 ; MEDIUM_SCH-NEXT:    jirl $ra, $ra, 0
 ; MEDIUM_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
 ; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(gd)
@@ -74,41 +74,41 @@ define void @foo() nounwind {
 ; LARGE_NO_SCH-NEXT:    addi.d $sp, $sp, -16
 ; LARGE_NO_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
 ; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
-; LARGE_NO_SCH-NEXT:    addi.d $a1, $zero, %got_pc_lo12(G)
-; LARGE_NO_SCH-NEXT:    lu32i.d $a1, %got64_pc_lo20(G)
-; LARGE_NO_SCH-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(G)
-; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a1, $a0
+; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %got_pc_lo12(G)
+; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %got64_pc_lo20(G)
+; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(G)
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $t8, $a0
 ; LARGE_NO_SCH-NEXT:    ld.d $a0, $a0, 0
 ; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
-; LARGE_NO_SCH-NEXT:    addi.d $a1, $zero, %pc_lo12(g)
-; LARGE_NO_SCH-NEXT:    lu32i.d $a1, %pc64_lo20(g)
-; LARGE_NO_SCH-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g)
-; LARGE_NO_SCH-NEXT:    add.d $a0, $a1, $a0
+; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %pc_lo12(g)
+; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %pc64_lo20(g)
+; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(g)
+; LARGE_NO_SCH-NEXT:    add.d $a0, $t8, $a0
 ; LARGE_NO_SCH-NEXT:    ld.d $a0, $a0, 0
 ; LARGE_NO_SCH-NEXT:    ori $a0, $zero, 1
-; LARGE_NO_SCH-NEXT:    pcalau12i $a1, %got_pc_hi20(bar)
-; LARGE_NO_SCH-NEXT:    addi.d $ra, $zero, %got_pc_lo12(bar)
-; LARGE_NO_SCH-NEXT:    lu32i.d $ra, %got64_pc_lo20(bar)
-; LARGE_NO_SCH-NEXT:    lu52i.d $ra, $ra, %got64_pc_hi12(bar)
-; LARGE_NO_SCH-NEXT:    ldx.d $ra, $ra, $a1
+; LARGE_NO_SCH-NEXT:    pcalau12i $ra, %got_pc_hi20(bar)
+; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %got_pc_lo12(bar)
+; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %got64_pc_lo20(bar)
+; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(bar)
+; LARGE_NO_SCH-NEXT:    ldx.d $ra, $t8, $ra
 ; LARGE_NO_SCH-NEXT:    jirl $ra, $ra, 0
 ; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
-; LARGE_NO_SCH-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(gd)
-; LARGE_NO_SCH-NEXT:    lu32i.d $a1, %ie64_pc_lo20(gd)
-; LARGE_NO_SCH-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(gd)
-; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a1, $a0
+; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(gd)
+; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(gd)
+; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(gd)
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $t8, $a0
 ; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a0, $tp
 ; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ld)
-; LARGE_NO_SCH-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ld)
-; LARGE_NO_SCH-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ld)
-; LARGE_NO_SCH-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ld)
-; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a1, $a0
+; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ld)
+; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ld)
+; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ld)
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $t8, $a0
 ; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a0, $tp
 ; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
-; LARGE_NO_SCH-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ie)
-; LARGE_NO_SCH-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ie)
-; LARGE_NO_SCH-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
-; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a1, $a0
+; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
+; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
+; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $t8, $a0
 ; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a0, $tp
 ; LARGE_NO_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LARGE_NO_SCH-NEXT:    addi.d $sp, $sp, 16
@@ -118,42 +118,42 @@ define void @foo() nounwind {
 ; LARGE_SCH:       # %bb.0:
 ; LARGE_SCH-NEXT:    addi.d $sp, $sp, -16
 ; LARGE_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LARGE_SCH-NEXT:    addi.d $a1, $zero, %got_pc_lo12(G)
 ; LARGE_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
-; LARGE_SCH-NEXT:    addi.d $ra, $zero, %got_pc_lo12(bar)
-; LARGE_SCH-NEXT:    lu32i.d $a1, %got64_pc_lo20(G)
-; LARGE_SCH-NEXT:    lu32i.d $ra, %got64_pc_lo20(bar)
-; LARGE_SCH-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(G)
-; LARGE_SCH-NEXT:    lu52i.d $ra, $ra, %got64_pc_hi12(bar)
-; LARGE_SCH-NEXT:    ldx.d $a0, $a1, $a0
-; LARGE_SCH-NEXT:    addi.d $a1, $zero, %pc_lo12(g)
-; LARGE_SCH-NEXT:    lu32i.d $a1, %pc64_lo20(g)
-; LARGE_SCH-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g)
+; LARGE_SCH-NEXT:    addi.d $t8, $zero, %got_pc_lo12(G)
+; LARGE_SCH-NEXT:    lu32i.d $t8, %got64_pc_lo20(G)
+; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(G)
+; LARGE_SCH-NEXT:    ldx.d $a0, $t8, $a0
 ; LARGE_SCH-NEXT:    ld.d $a0, $a0, 0
 ; LARGE_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
-; LARGE_SCH-NEXT:    add.d $a0, $a1, $a0
-; LARGE_SCH-NEXT:    pcalau12i $a1, %got_pc_hi20(bar)
+; LARGE_SCH-NEXT:    addi.d $t8, $zero, %pc_lo12(g)
+; LARGE_SCH-NEXT:    lu32i.d $t8, %pc64_lo20(g)
+; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(g)
+; LARGE_SCH-NEXT:    add.d $a0, $t8, $a0
 ; LARGE_SCH-NEXT:    ld.d $a0, $a0, 0
-; LARGE_SCH-NEXT:    ldx.d $ra, $ra, $a1
 ; LARGE_SCH-NEXT:    ori $a0, $zero, 1
+; LARGE_SCH-NEXT:    pcalau12i $ra, %got_pc_hi20(bar)
+; LARGE_SCH-NEXT:    addi.d $t8, $zero, %got_pc_lo12(bar)
+; LARGE_SCH-NEXT:    lu32i.d $t8, %got64_pc_lo20(bar)
+; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(bar)
+; LARGE_SCH-NEXT:    ldx.d $ra, $t8, $ra
 ; LARGE_SCH-NEXT:    jirl $ra, $ra, 0
-; LARGE_SCH-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(gd)
 ; LARGE_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
-; LARGE_SCH-NEXT:    lu32i.d $a1, %ie64_pc_lo20(gd)
-; LARGE_SCH-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(gd)
-; LARGE_SCH-NEXT:    ldx.d $a0, $a1, $a0
-; LARGE_SCH-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ld)
-; LARGE_SCH-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ld)
-; LARGE_SCH-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ld)
+; LARGE_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(gd)
+; LARGE_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(gd)
+; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(gd)
+; LARGE_SCH-NEXT:    ldx.d $a0, $t8, $a0
 ; LARGE_SCH-NEXT:    ldx.d $a0, $a0, $tp
 ; LARGE_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ld)
-; LARGE_SCH-NEXT:    ldx.d $a0, $a1, $a0
-; LARGE_SCH-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ie)
-; LARGE_SCH-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ie)
-; LARGE_SCH-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
+; LARGE_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ld)
+; LARGE_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ld)
+; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ld)
+; LARGE_SCH-NEXT:    ldx.d $a0, $t8, $a0
 ; LARGE_SCH-NEXT:    ldx.d $a0, $a0, $tp
 ; LARGE_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
-; LARGE_SCH-NEXT:    ldx.d $a0, $a1, $a0
+; LARGE_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
+; LARGE_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
+; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
+; LARGE_SCH-NEXT:    ldx.d $a0, $t8, $a0
 ; LARGE_SCH-NEXT:    ldx.d $a0, $a0, $tp
 ; LARGE_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LARGE_SCH-NEXT:    addi.d $sp, $sp, 16
diff --git a/llvm/test/CodeGen/LoongArch/tls-models.ll b/llvm/test/CodeGen/LoongArch/tls-models.ll
index a2a3792a6a54..3994df1da716 100644
--- a/llvm/test/CodeGen/LoongArch/tls-models.ll
+++ b/llvm/test/CodeGen/LoongArch/tls-models.ll
@@ -45,15 +45,15 @@ define ptr @f1() nounwind {
 ; LA64LARGEPIC-NEXT:    addi.d $sp, $sp, -16
 ; LA64LARGEPIC-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
 ; LA64LARGEPIC-NEXT:    pcalau12i $a0, %gd_pc_hi20(unspecified)
-; LA64LARGEPIC-NEXT:    addi.d $a1, $zero, %got_pc_lo12(unspecified)
-; LA64LARGEPIC-NEXT:    lu32i.d $a1, %got64_pc_lo20(unspecified)
-; LA64LARGEPIC-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(unspecified)
-; LA64LARGEPIC-NEXT:    add.d $a0, $a1, $a0
-; LA64LARGEPIC-NEXT:    pcalau12i $a1, %pc_hi20(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    lu32i.d $ra, %pc64_lo20(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    add.d $ra, $ra, $a1
+; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %got_pc_lo12(unspecified)
+; LA64LARGEPIC-NEXT:    lu32i.d $t8, %got64_pc_lo20(unspecified)
+; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(unspecified)
+; LA64LARGEPIC-NEXT:    add.d $a0, $t8, $a0
+; LA64LARGEPIC-NEXT:    pcalau12i $ra, %pc_hi20(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    lu32i.d $t8, %pc64_lo20(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    add.d $ra, $t8, $ra
 ; LA64LARGEPIC-NEXT:    jirl $ra, $ra, 0
 ; LA64LARGEPIC-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LA64LARGEPIC-NEXT:    addi.d $sp, $sp, 16
@@ -76,10 +76,10 @@ define ptr @f1() nounwind {
 ; LA64LARGENOPIC-LABEL: f1:
 ; LA64LARGENOPIC:       # %bb.0: # %entry
 ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %ie_pc_hi20(unspecified)
-; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(unspecified)
-; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %ie64_pc_lo20(unspecified)
-; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(unspecified)
-; LA64LARGENOPIC-NEXT:    ldx.d $a0, $a1, $a0
+; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(unspecified)
+; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %ie64_pc_lo20(unspecified)
+; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(unspecified)
+; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
 ; LA64LARGENOPIC-NEXT:    add.d $a0, $a0, $tp
 ; LA64LARGENOPIC-NEXT:    ret
 entry:
@@ -116,15 +116,15 @@ define ptr @f2() nounwind {
 ; LA64LARGEPIC-NEXT:    addi.d $sp, $sp, -16
 ; LA64LARGEPIC-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
 ; LA64LARGEPIC-NEXT:    pcalau12i $a0, %ld_pc_hi20(ld)
-; LA64LARGEPIC-NEXT:    addi.d $a1, $zero, %got_pc_lo12(ld)
-; LA64LARGEPIC-NEXT:    lu32i.d $a1, %got64_pc_lo20(ld)
-; LA64LARGEPIC-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(ld)
-; LA64LARGEPIC-NEXT:    add.d $a0, $a1, $a0
-; LA64LARGEPIC-NEXT:    pcalau12i $a1, %pc_hi20(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    lu32i.d $ra, %pc64_lo20(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    add.d $ra, $ra, $a1
+; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %got_pc_lo12(ld)
+; LA64LARGEPIC-NEXT:    lu32i.d $t8, %got64_pc_lo20(ld)
+; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(ld)
+; LA64LARGEPIC-NEXT:    add.d $a0, $t8, $a0
+; LA64LARGEPIC-NEXT:    pcalau12i $ra, %pc_hi20(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    lu32i.d $t8, %pc64_lo20(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    add.d $ra, $t8, $ra
 ; LA64LARGEPIC-NEXT:    jirl $ra, $ra, 0
 ; LA64LARGEPIC-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LA64LARGEPIC-NEXT:    addi.d $sp, $sp, 16
@@ -147,10 +147,10 @@ define ptr @f2() nounwind {
 ; LA64LARGENOPIC-LABEL: f2:
 ; LA64LARGENOPIC:       # %bb.0: # %entry
 ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %ie_pc_hi20(ld)
-; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ld)
-; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ld)
-; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ld)
-; LA64LARGENOPIC-NEXT:    ldx.d $a0, $a1, $a0
+; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ld)
+; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ld)
+; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ld)
+; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
 ; LA64LARGENOPIC-NEXT:    add.d $a0, $a0, $tp
 ; LA64LARGENOPIC-NEXT:    ret
 entry:
@@ -177,10 +177,10 @@ define ptr @f3() nounwind {
 ; LA64LARGEPIC-LABEL: f3:
 ; LA64LARGEPIC:       # %bb.0: # %entry
 ; LA64LARGEPIC-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
-; LA64LARGEPIC-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ie)
-; LA64LARGEPIC-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ie)
-; LA64LARGEPIC-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
-; LA64LARGEPIC-NEXT:    ldx.d $a0, $a1, $a0
+; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
+; LA64LARGEPIC-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
+; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
+; LA64LARGEPIC-NEXT:    ldx.d $a0, $t8, $a0
 ; LA64LARGEPIC-NEXT:    add.d $a0, $a0, $tp
 ; LA64LARGEPIC-NEXT:    ret
 ;
@@ -201,10 +201,10 @@ define ptr @f3() nounwind {
 ; LA64LARGENOPIC-LABEL: f3:
 ; LA64LARGENOPIC:       # %bb.0: # %entry
 ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
-; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ie)
-; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ie)
-; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
-; LA64LARGENOPIC-NEXT:    ldx.d $a0, $a1, $a0
+; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
+; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
+; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
+; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
 ; LA64LARGENOPIC-NEXT:    add.d $a0, $a0, $tp
 ; LA64LARGENOPIC-NEXT:    ret
 entry:
-- 
Gitee


From 34e8c30579faf4a8ef69fa686bd9b2d9e832d299 Mon Sep 17 00:00:00 2001
From: Jie Fu <jiefu@tencent.com>
Date: Fri, 5 Jan 2024 12:05:23 +0800
Subject: [PATCH 10/23] [LoongArch] Fix -Wunused-variable in
 LoongArchExpandPseudoInsts.cpp (NFC)

llvm-project/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp:480:20:
 error: unused variable 'MF' [-Werror,-Wunused-variable]
  MachineFunction *MF = MBB.getParent();
                   ^
1 error generated.

(cherry picked from commit 52d1397e38ee88b170585c9c824d08e6975890ca)
---
 llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
index f977f176066a..ad39658f698e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
@@ -477,12 +477,11 @@ bool LoongArchExpandPseudo::expandLargeAddressLoad(
     break;
   }
 
-  MachineFunction *MF = MBB.getParent();
   MachineInstr &MI = *MBBI;
   DebugLoc DL = MI.getDebugLoc();
   Register ScratchReg = LoongArch::R20; // $t8
 
-  assert(MF->getSubtarget<LoongArchSubtarget>().is64Bit() &&
+  assert(MBB.getParent()->getSubtarget<LoongArchSubtarget>().is64Bit() &&
          "Large code model requires LA64");
 
   auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), DestReg);
-- 
Gitee


From cf0d9db4664d59e163d53c62ae1de663092ab2d2 Mon Sep 17 00:00:00 2001
From: Weining Lu <luweining@loongson.cn>
Date: Fri, 10 Nov 2023 13:37:55 +0800
Subject: [PATCH 11/23] [lld][ELF] Add a corner testcase for
 elf::getLoongArchPageDelta

If `page(dest) - page(pc)` is 0xfffffffffffff000, i.e. page(pc) is next
to page(dest), and lo12(dest) > 0x7ff, correct %pc64_lo20 and %pc64_hi12
should both be -1 (which can be checked with binutils) but they are both
0 on lld. This patch adds such a test showing lld's incorrect behaviour
and a following patch will fix this issue.

(cherry picked from commit e752b58e0d26fc08bca6b2a4e56b05af7f8d8d66)
---
 lld/test/ELF/loongarch-pc-aligned.s | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/lld/test/ELF/loongarch-pc-aligned.s b/lld/test/ELF/loongarch-pc-aligned.s
index 9df3492d1877..f6ac56e5261d 100644
--- a/lld/test/ELF/loongarch-pc-aligned.s
+++ b/lld/test/ELF/loongarch-pc-aligned.s
@@ -260,6 +260,19 @@
 # EXTREME15-NEXT: lu32i.d   $t0, -349526
 # EXTREME15-NEXT: lu52i.d   $t0, $t0, -1093
 
+## FIXME: Correct %pc64_lo20 should be 0xfffff (-1) and %pc64_hi12 should be 0xfff (-1), but current values are:
+## page delta = 0x0000000000000000, page offset = 0x888
+## %pc_lo12   = 0x888 = -1912
+## %pc_hi20   = 0x00000 = 0
+## %pc64_lo20 = 0x00000 = 0
+## %pc64_hi12 = 0x00000 = 0
+# RUN: ld.lld %t/extreme.o --section-start=.rodata=0x0000000012344888 --section-start=.text=0x0000000012345678 -o %t/extreme16
+# RUN: llvm-objdump -d --no-show-raw-insn %t/extreme16 | FileCheck %s --check-prefix=EXTREME16
+# EXTREME16:      addi.d $t0, $zero, -1912
+# EXTREME16-NEXT: pcalau12i $t1, 0
+# EXTREME16-NEXT: lu32i.d   $t0, 0
+# EXTREME16-NEXT: lu52i.d   $t0, $t0, 0
+
 #--- a.s
 .rodata
 x:
-- 
Gitee


From 11d61b028f306d5ace2b09154781575e88b118cb Mon Sep 17 00:00:00 2001
From: Weining Lu <luweining@loongson.cn>
Date: Sat, 25 Nov 2023 15:44:05 +0800
Subject: [PATCH 12/23] [lld][LoongArch] Add another corner testcase for
 elf::getLoongArchPageDelta

Similar to e752b58e0d26.

(cherry picked from commit 84a20989c6f72d0f7d04c9981d51c7838e95855c)
---
 lld/ELF/Arch/LoongArch.cpp          |  1 -
 lld/test/ELF/loongarch-pc-aligned.s | 13 +++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index 72d9c6838e31..516d02bb9e3f 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -168,7 +168,6 @@ uint64_t elf::getLoongArchPageDelta(uint64_t dest, uint64_t pc) {
     result -= 0x10000'0000;
   else if (!negativeA && negativeB)
     result += 0x10000'0000;
-
   return result;
 }
 
diff --git a/lld/test/ELF/loongarch-pc-aligned.s b/lld/test/ELF/loongarch-pc-aligned.s
index f6ac56e5261d..e7950400a5c8 100644
--- a/lld/test/ELF/loongarch-pc-aligned.s
+++ b/lld/test/ELF/loongarch-pc-aligned.s
@@ -273,6 +273,19 @@
 # EXTREME16-NEXT: lu32i.d   $t0, 0
 # EXTREME16-NEXT: lu52i.d   $t0, $t0, 0
 
+## FIXME: Correct %pc64_lo20 should be 0x00000 (0) and %pc64_hi12 should be 0x000 (0), but current values are:
+## page delta = 0xffffffff80000000, page offset = 0x888
+## %pc_lo12   = 0x888 = -1912
+## %pc_hi20   = 0x80000 = -524288
+## %pc64_lo20 = 0xfffff = -1
+## %pc64_hi12 = 0xfff = -1
+# RUN: ld.lld %t/extreme.o --section-start=.rodata=0x000071238ffff888 --section-start=.text=0x0000712310000678 -o %t/extreme17
+# RUN: llvm-objdump -d --no-show-raw-insn %t/extreme17 | FileCheck %s --check-prefix=EXTREME17
+# EXTREME17:      addi.d $t0, $zero, -1912
+# EXTREME17-NEXT: pcalau12i $t1, -524288
+# EXTREME17-NEXT: lu32i.d   $t0, -1
+# EXTREME17-NEXT: lu52i.d   $t0, $t0, -1
+
 #--- a.s
 .rodata
 x:
-- 
Gitee


From 1ea127e041629ae2df9b9cf6a85e25b6276d4a99 Mon Sep 17 00:00:00 2001
From: Lu Weining <luweining@loongson.cn>
Date: Wed, 10 Jan 2024 18:03:52 +0800
Subject: [PATCH 13/23] [lld][LoongArch] Handle extreme code model relocs
 according to psABI v2.30 (#73387)

psABI v2.30 requires the extreme code model instructions sequence
(pcalau12i+addi.d+lu32i.d+lu52i.d) to be adjacent.

See https://github.com/llvm/llvm-project/pull/71907 and
https://github.com/loongson-community/discussions/issues/17 for details.

(cherry picked from commit 38394a3d0b8b9a1fdc444bdebeba17a19250997d)
---
 lld/ELF/Arch/LoongArch.cpp          | 110 +++++++---------------------
 lld/ELF/InputSection.cpp            |  10 +--
 lld/ELF/Target.h                    |   2 +-
 lld/test/ELF/loongarch-pc-aligned.s | 109 ++++++++++++++-------------
 4 files changed, 93 insertions(+), 138 deletions(-)

diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index 516d02bb9e3f..19147a0f6df6 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -85,89 +85,33 @@ static uint64_t getLoongArchPage(uint64_t p) {
 static uint32_t lo12(uint32_t val) { return val & 0xfff; }
 
 // Calculate the adjusted page delta between dest and PC.
-uint64_t elf::getLoongArchPageDelta(uint64_t dest, uint64_t pc) {
-  // Consider the large code model access pattern, of which the smaller code
-  // models' access patterns are a subset:
-  //
-  //     pcalau12i       U, %foo_hi20(sym)        ; b in [-0x80000, 0x7ffff]
-  //     addi.d          T, zero, %foo_lo12(sym)  ; a in [-0x800, 0x7ff]
-  //     lu32i.d         T, %foo64_lo20(sym)      ; c in [-0x80000, 0x7ffff]
-  //     lu52i.d         T, T, %foo64_hi12(sym)   ; d in [-0x800, 0x7ff]
-  //     {ldx,stx,add}.* dest, U, T
-  //
-  // Let page(pc) = 0xRRR'QQQQQ'PPPPP'000 and dest = 0xZZZ'YYYYY'XXXXX'AAA,
-  // with RQ, P, ZY, X and A representing the respective bitfields as unsigned
-  // integers. We have:
-  //
-  //     page(dest) = 0xZZZ'YYYYY'XXXXX'000
-  //     - page(pc) = 0xRRR'QQQQQ'PPPPP'000
-  //     ----------------------------------
-  //                  0xddd'ccccc'bbbbb'000
-  //
-  // Now consider the above pattern's actual effects:
-  //
-  //     page(pc)                     0xRRR'QQQQQ'PPPPP'000
-  //     pcalau12i                  + 0xiii'iiiii'bbbbb'000
-  //     addi                       + 0xjjj'jjjjj'kkkkk'AAA
-  //     lu32i.d & lu52i.d          + 0xddd'ccccc'00000'000
-  //     --------------------------------------------------
-  //     dest = U + T
-  //          = ((RQ<<32) + (P<<12) + i + (b<<12)) + (j + k + A + (cd<<32))
-  //          = (((RQ+cd)<<32) + i + j) + (((P+b)<<12) + k) + A
-  //          = (ZY<<32)                + (X<<12)           + A
-  //
-  //     ZY<<32 = (RQ<<32)+(cd<<32)+i+j, X<<12 = (P<<12)+(b<<12)+k
-  //     cd<<32 = (ZY<<32)-(RQ<<32)-i-j, b<<12 = (X<<12)-(P<<12)-k
-  //
-  // where i and k are terms representing the effect of b's and A's sign
-  // extension respectively.
-  //
-  //     i = signed b < 0 ? -0x10000'0000 : 0
-  //     k = signed A < 0 ? -0x1000 : 0
-  //
-  // The j term is a bit complex: it represents the higher half of
-  // sign-extended bits from A that are effectively lost if i == 0 but k != 0,
-  // due to overwriting by lu32i.d & lu52i.d.
-  //
-  //     j = signed A < 0 && signed b >= 0 ? 0x10000'0000 : 0
-  //
-  // The actual effect of the instruction sequence before the final addition,
-  // i.e. our desired result value, is thus:
-  //
-  //     result = (cd<<32) + (b<<12)
-  //            = (ZY<<32)-(RQ<<32)-i-j + (X<<12)-(P<<12)-k
-  //            = ((ZY<<32)+(X<<12)) - ((RQ<<32)+(P<<12)) - i - j - k
-  //            = page(dest) - page(pc) - i - j - k
-  //
-  // when signed A >= 0 && signed b >= 0:
-  //
-  //     i = j = k = 0
-  //     result = page(dest) - page(pc)
-  //
-  // when signed A >= 0 && signed b < 0:
-  //
-  //     i = -0x10000'0000, j = k = 0
-  //     result = page(dest) - page(pc) + 0x10000'0000
-  //
-  // when signed A < 0 && signed b >= 0:
-  //
-  //     i = 0, j = 0x10000'0000, k = -0x1000
-  //     result = page(dest) - page(pc) - 0x10000'0000 + 0x1000
-  //
-  // when signed A < 0 && signed b < 0:
-  //
-  //     i = -0x10000'0000, j = 0, k = -0x1000
-  //     result = page(dest) - page(pc) + 0x1000
-  uint64_t result = getLoongArchPage(dest) - getLoongArchPage(pc);
-  bool negativeA = lo12(dest) > 0x7ff;
-  bool negativeB = (result & 0x8000'0000) != 0;
-
-  if (negativeA)
-    result += 0x1000;
-  if (negativeA && !negativeB)
-    result -= 0x10000'0000;
-  else if (!negativeA && negativeB)
-    result += 0x10000'0000;
+uint64_t elf::getLoongArchPageDelta(uint64_t dest, uint64_t pc, RelType type) {
+  // Note that if the sequence being relocated is `pcalau12i + addi.d + lu32i.d
+  // + lu52i.d`, they must be adjacent so that we can infer the PC of
+  // `pcalau12i` when calculating the page delta for the other two instructions
+  // (lu32i.d and lu52i.d). Compensating for all the sign-extensions is a bit
+  // complicated. Just use the psABI-recommended algorithm.
+  uint64_t pcalau12i_pc;
+  switch (type) {
+  case R_LARCH_PCALA64_LO20:
+  case R_LARCH_GOT64_PC_LO20:
+  case R_LARCH_TLS_IE64_PC_LO20:
+    pcalau12i_pc = pc - 8;
+    break;
+  case R_LARCH_PCALA64_HI12:
+  case R_LARCH_GOT64_PC_HI12:
+  case R_LARCH_TLS_IE64_PC_HI12:
+    pcalau12i_pc = pc - 12;
+    break;
+  default:
+    pcalau12i_pc = pc;
+    break;
+  }
+  uint64_t result = getLoongArchPage(dest) - getLoongArchPage(pcalau12i_pc);
+  if (dest & 0x800)
+    result += 0x1000 - 0x1'0000'0000;
+  if (result & 0x8000'0000)
+    result += 0x1'0000'0000;
   return result;
 }
 
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index b178d82407e3..44444b62251d 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -712,8 +712,8 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
     return sym.getGotVA() + a - p;
   case R_LOONGARCH_GOT_PAGE_PC:
     if (sym.hasFlag(NEEDS_TLSGD))
-      return getLoongArchPageDelta(in.got->getGlobalDynAddr(sym) + a, p);
-    return getLoongArchPageDelta(sym.getGotVA() + a, p);
+      return getLoongArchPageDelta(in.got->getGlobalDynAddr(sym) + a, p, type);
+    return getLoongArchPageDelta(sym.getGotVA() + a, p, type);
   case R_MIPS_GOTREL:
     return sym.getVA(a) - in.mipsGot->getGp(file);
   case R_MIPS_GOT_GP:
@@ -763,7 +763,7 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
     return 0;
   }
   case R_LOONGARCH_PAGE_PC:
-    return getLoongArchPageDelta(sym.getVA(a), p);
+    return getLoongArchPageDelta(sym.getVA(a), p, type);
   case R_PC:
   case R_ARM_PCA: {
     uint64_t dest;
@@ -798,7 +798,7 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
   case R_PPC64_CALL_PLT:
     return sym.getPltVA() + a - p;
   case R_LOONGARCH_PLT_PAGE_PC:
-    return getLoongArchPageDelta(sym.getPltVA() + a, p);
+    return getLoongArchPageDelta(sym.getPltVA() + a, p, type);
   case R_PLT_GOTPLT:
     return sym.getPltVA() + a - in.gotPlt->getVA();
   case R_PPC32_PLTREL:
@@ -860,7 +860,7 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
   case R_TLSGD_PC:
     return in.got->getGlobalDynAddr(sym) + a - p;
   case R_LOONGARCH_TLSGD_PAGE_PC:
-    return getLoongArchPageDelta(in.got->getGlobalDynAddr(sym) + a, p);
+    return getLoongArchPageDelta(in.got->getGlobalDynAddr(sym) + a, p, type);
   case R_TLSLD_GOTPLT:
     return in.got->getVA() + in.got->getTlsIndexOff() + a - in.gotPlt->getVA();
   case R_TLSLD_GOT:
diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h
index bf831afa1793..aeabe47f92a1 100644
--- a/lld/ELF/Target.h
+++ b/lld/ELF/Target.h
@@ -229,7 +229,7 @@ void addPPC64SaveRestore();
 uint64_t getPPC64TocBase();
 uint64_t getAArch64Page(uint64_t expr);
 template <typename ELFT> void writeARMCmseImportLib();
-uint64_t getLoongArchPageDelta(uint64_t dest, uint64_t pc);
+uint64_t getLoongArchPageDelta(uint64_t dest, uint64_t pc, RelType type);
 void riscvFinalizeRelax(int passes);
 void mergeRISCVAttributesSections();
 void addArmInputSectionMappingSymbols();
diff --git a/lld/test/ELF/loongarch-pc-aligned.s b/lld/test/ELF/loongarch-pc-aligned.s
index e7950400a5c8..0405961e5f74 100644
--- a/lld/test/ELF/loongarch-pc-aligned.s
+++ b/lld/test/ELF/loongarch-pc-aligned.s
@@ -75,8 +75,8 @@
 ## %pc64_hi12 = 0x444 = 1092
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x4443333334567111 --section-start=.text=0x0000000012345678 -o %t/extreme0
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme0 | FileCheck %s --check-prefix=EXTREME0
-# EXTREME0:      addi.d $t0, $zero, 273
-# EXTREME0-NEXT: pcalau12i $t1, 139810
+# EXTREME0:      pcalau12i $t1, 139810
+# EXTREME0-NEXT: addi.d $t0, $zero, 273
 # EXTREME0-NEXT: lu32i.d   $t0, 209715
 # EXTREME0-NEXT: lu52i.d   $t0, $t0, 1092
 
@@ -87,8 +87,8 @@
 ## %pc64_hi12 = 0x444 = 1092
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x4443333334567888 --section-start=.text=0x0000000012345678 -o %t/extreme1
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme1 | FileCheck %s --check-prefix=EXTREME1
-# EXTREME1:      addi.d $t0, $zero, -1912
-# EXTREME1-NEXT: pcalau12i $t1, 139811
+# EXTREME1:      pcalau12i $t1, 139811
+# EXTREME1-NEXT: addi.d $t0, $zero, -1912
 # EXTREME1-NEXT: lu32i.d   $t0, 209714
 # EXTREME1-NEXT: lu52i.d   $t0, $t0, 1092
 
@@ -99,8 +99,8 @@
 ## %pc64_hi12 = 0x444 = 1092
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x44433333abcde111 --section-start=.text=0x0000000012345678 -o %t/extreme2
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme2 | FileCheck %s --check-prefix=EXTREME2
-# EXTREME2:      addi.d $t0, $zero, 273
-# EXTREME2-NEXT: pcalau12i $t1, -419431
+# EXTREME2:      pcalau12i $t1, -419431
+# EXTREME2-NEXT: addi.d $t0, $zero, 273
 # EXTREME2-NEXT: lu32i.d   $t0, 209716
 # EXTREME2-NEXT: lu52i.d   $t0, $t0, 1092
 
@@ -111,8 +111,8 @@
 ## %pc64_hi12 = 0x444 = 1092
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x44433333abcde888 --section-start=.text=0x0000000012345678 -o %t/extreme3
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme3 | FileCheck %s --check-prefix=EXTREME3
-# EXTREME3:      addi.d $t0, $zero, -1912
-# EXTREME3-NEXT: pcalau12i $t1, -419430
+# EXTREME3:      pcalau12i $t1, -419430
+# EXTREME3-NEXT: addi.d $t0, $zero, -1912
 # EXTREME3-NEXT: lu32i.d   $t0, 209715
 # EXTREME3-NEXT: lu52i.d   $t0, $t0, 1092
 
@@ -123,8 +123,8 @@
 ## %pc64_hi12 = 0x444 = 1092
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x444aaaaa34567111 --section-start=.text=0x0000000012345678 -o %t/extreme4
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme4 | FileCheck %s --check-prefix=EXTREME4
-# EXTREME4:      addi.d $t0, $zero, 273
-# EXTREME4-NEXT: pcalau12i $t1, 139810
+# EXTREME4:      pcalau12i $t1, 139810
+# EXTREME4-NEXT: addi.d $t0, $zero, 273
 # EXTREME4-NEXT: lu32i.d   $t0, -349526
 # EXTREME4-NEXT: lu52i.d   $t0, $t0, 1092
 
@@ -135,8 +135,8 @@
 ## %pc64_hi12 = 0x444 = 1092
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x444aaaaa34567888 --section-start=.text=0x0000000012345678 -o %t/extreme5
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme5 | FileCheck %s --check-prefix=EXTREME5
-# EXTREME5:      addi.d $t0, $zero, -1912
-# EXTREME5-NEXT: pcalau12i $t1, 139811
+# EXTREME5:      pcalau12i $t1, 139811
+# EXTREME5-NEXT: addi.d $t0, $zero, -1912
 # EXTREME5-NEXT: lu32i.d   $t0, -349527
 # EXTREME5-NEXT: lu52i.d   $t0, $t0, 1092
 
@@ -147,8 +147,8 @@
 ## %pc64_hi12 = 0x444 = 1092
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x444aaaaaabcde111 --section-start=.text=0x0000000012345678 -o %t/extreme6
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme6 | FileCheck %s --check-prefix=EXTREME6
-# EXTREME6:      addi.d $t0, $zero, 273
-# EXTREME6-NEXT: pcalau12i $t1, -419431
+# EXTREME6:      pcalau12i $t1, -419431
+# EXTREME6-NEXT: addi.d $t0, $zero, 273
 # EXTREME6-NEXT: lu32i.d   $t0, -349525
 # EXTREME6-NEXT: lu52i.d   $t0, $t0, 1092
 
@@ -159,8 +159,8 @@
 ## %pc64_hi12 = 0x444 = 1092
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x444aaaaaabcde888 --section-start=.text=0x0000000012345678 -o %t/extreme7
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme7 | FileCheck %s --check-prefix=EXTREME7
-# EXTREME7:      addi.d $t0, $zero, -1912
-# EXTREME7-NEXT: pcalau12i $t1, -419430
+# EXTREME7:      pcalau12i $t1, -419430
+# EXTREME7-NEXT: addi.d $t0, $zero, -1912
 # EXTREME7-NEXT: lu32i.d   $t0, -349526
 # EXTREME7-NEXT: lu52i.d   $t0, $t0, 1092
 
@@ -171,8 +171,8 @@
 ## %pc64_hi12 = 0xbbb = -1093
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0xbbb3333334567111 --section-start=.text=0x0000000012345678 -o %t/extreme8
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme8 | FileCheck %s --check-prefix=EXTREME8
-# EXTREME8:      addi.d $t0, $zero, 273
-# EXTREME8-NEXT: pcalau12i $t1, 139810
+# EXTREME8:      pcalau12i $t1, 139810
+# EXTREME8-NEXT: addi.d $t0, $zero, 273
 # EXTREME8-NEXT: lu32i.d   $t0, 209715
 # EXTREME8-NEXT: lu52i.d   $t0, $t0, -1093
 
@@ -183,8 +183,8 @@
 ## %pc64_hi12 = 0xbbb = -1093
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0xbbb3333334567888 --section-start=.text=0x0000000012345678 -o %t/extreme9
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme9 | FileCheck %s --check-prefix=EXTREME9
-# EXTREME9:      addi.d $t0, $zero, -1912
-# EXTREME9-NEXT: pcalau12i $t1, 139811
+# EXTREME9:      pcalau12i $t1, 139811
+# EXTREME9-NEXT: addi.d $t0, $zero, -1912
 # EXTREME9-NEXT: lu32i.d   $t0, 209714
 # EXTREME9-NEXT: lu52i.d   $t0, $t0, -1093
 
@@ -195,8 +195,8 @@
 ## %pc64_hi12 = 0xbbb = -1093
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0xbbb33333abcde111 --section-start=.text=0x0000000012345678 -o %t/extreme10
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme10 | FileCheck %s --check-prefix=EXTREME10
-# EXTREME10:      addi.d $t0, $zero, 273
-# EXTREME10-NEXT: pcalau12i $t1, -419431
+# EXTREME10:      pcalau12i $t1, -419431
+# EXTREME10-NEXT: addi.d $t0, $zero, 273
 # EXTREME10-NEXT: lu32i.d   $t0, 209716
 # EXTREME10-NEXT: lu52i.d   $t0, $t0, -1093
 
@@ -207,8 +207,8 @@
 ## %pc64_hi12 = 0xbbb = -1093
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0xbbb33333abcde888 --section-start=.text=0x0000000012345678 -o %t/extreme11
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme11 | FileCheck %s --check-prefix=EXTREME11
-# EXTREME11:      addi.d $t0, $zero, -1912
-# EXTREME11-NEXT: pcalau12i $t1, -419430
+# EXTREME11:      pcalau12i $t1, -419430
+# EXTREME11-NEXT: addi.d $t0, $zero, -1912
 # EXTREME11-NEXT: lu32i.d   $t0, 209715
 # EXTREME11-NEXT: lu52i.d   $t0, $t0, -1093
 
@@ -219,8 +219,8 @@
 ## %pc64_hi12 = 0xbbb = -1093
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0xbbbaaaaa34567111 --section-start=.text=0x0000000012345678 -o %t/extreme12
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme12 | FileCheck %s --check-prefix=EXTREME12
-# EXTREME12:      addi.d $t0, $zero, 273
-# EXTREME12-NEXT: pcalau12i $t1, 139810
+# EXTREME12:      pcalau12i $t1, 139810
+# EXTREME12-NEXT: addi.d $t0, $zero, 273
 # EXTREME12-NEXT: lu32i.d   $t0, -349526
 # EXTREME12-NEXT: lu52i.d   $t0, $t0, -1093
 
@@ -231,8 +231,8 @@
 ## %pc64_hi12 = 0xbbb = -1093
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0xbbbaaaaa34567888 --section-start=.text=0x0000000012345678 -o %t/extreme13
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme13 | FileCheck %s --check-prefix=EXTREME13
-# EXTREME13:      addi.d $t0, $zero, -1912
-# EXTREME13-NEXT: pcalau12i $t1, 139811
+# EXTREME13:      pcalau12i $t1, 139811
+# EXTREME13-NEXT: addi.d $t0, $zero, -1912
 # EXTREME13-NEXT: lu32i.d   $t0, -349527
 # EXTREME13-NEXT: lu52i.d   $t0, $t0, -1093
 
@@ -243,8 +243,8 @@
 ## %pc64_hi12 = 0xbbb = -1093
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0xbbbaaaaaabcde111 --section-start=.text=0x0000000012345678 -o %t/extreme14
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme14 | FileCheck %s --check-prefix=EXTREME14
-# EXTREME14:      addi.d $t0, $zero, 273
-# EXTREME14-NEXT: pcalau12i $t1, -419431
+# EXTREME14:      pcalau12i $t1, -419431
+# EXTREME14-NEXT: addi.d $t0, $zero, 273
 # EXTREME14-NEXT: lu32i.d   $t0, -349525
 # EXTREME14-NEXT: lu52i.d   $t0, $t0, -1093
 
@@ -255,36 +255,47 @@
 ## %pc64_hi12 = 0xbbb = -1093
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0xbbbaaaaaabcde888 --section-start=.text=0x0000000012345678 -o %t/extreme15
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme15 | FileCheck %s --check-prefix=EXTREME15
-# EXTREME15:      addi.d $t0, $zero, -1912
-# EXTREME15-NEXT: pcalau12i $t1, -419430
+# EXTREME15:      pcalau12i $t1, -419430
+# EXTREME15-NEXT: addi.d $t0, $zero, -1912
 # EXTREME15-NEXT: lu32i.d   $t0, -349526
 # EXTREME15-NEXT: lu52i.d   $t0, $t0, -1093
 
-## FIXME: Correct %pc64_lo20 should be 0xfffff (-1) and %pc64_hi12 should be 0xfff (-1), but current values are:
-## page delta = 0x0000000000000000, page offset = 0x888
+## page delta = 0xffffffff00000000, page offset = 0x888
 ## %pc_lo12   = 0x888 = -1912
 ## %pc_hi20   = 0x00000 = 0
-## %pc64_lo20 = 0x00000 = 0
-## %pc64_hi12 = 0x00000 = 0
+## %pc64_lo20 = 0xfffff = -1
+## %pc64_hi12 = 0xfff = -1
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x0000000012344888 --section-start=.text=0x0000000012345678 -o %t/extreme16
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme16 | FileCheck %s --check-prefix=EXTREME16
-# EXTREME16:      addi.d $t0, $zero, -1912
-# EXTREME16-NEXT: pcalau12i $t1, 0
-# EXTREME16-NEXT: lu32i.d   $t0, 0
-# EXTREME16-NEXT: lu52i.d   $t0, $t0, 0
+# EXTREME16:      pcalau12i $t1, 0
+# EXTREME16-NEXT: addi.d $t0, $zero, -1912
+# EXTREME16-NEXT: lu32i.d   $t0, -1
+# EXTREME16-NEXT: lu52i.d   $t0, $t0, -1
 
-## FIXME: Correct %pc64_lo20 should be 0x00000 (0) and %pc64_hi12 should be 0x000 (0), but current values are:
-## page delta = 0xffffffff80000000, page offset = 0x888
+## page delta = 0x0000000080000000, page offset = 0x888
 ## %pc_lo12   = 0x888 = -1912
 ## %pc_hi20   = 0x80000 = -524288
-## %pc64_lo20 = 0xfffff = -1
-## %pc64_hi12 = 0xfff = -1
+## %pc64_lo20 = 0x00000 = 0
+## %pc64_hi12 = 0x000 = 0
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x000071238ffff888 --section-start=.text=0x0000712310000678 -o %t/extreme17
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme17 | FileCheck %s --check-prefix=EXTREME17
-# EXTREME17:      addi.d $t0, $zero, -1912
-# EXTREME17-NEXT: pcalau12i $t1, -524288
-# EXTREME17-NEXT: lu32i.d   $t0, -1
-# EXTREME17-NEXT: lu52i.d   $t0, $t0, -1
+# EXTREME17:      pcalau12i $t1, -524288
+# EXTREME17-NEXT: addi.d $t0, $zero, -1912
+# EXTREME17-NEXT: lu32i.d   $t0, 0
+# EXTREME17-NEXT: lu52i.d   $t0, $t0, 0
+
+## A case where pcalau12i, lu32i.d and lu52i.d are in different pages.
+## page delta = 0x0000000080000000, page offset = 0x111
+## %pc_lo12   = 0x111 = 273
+## %pc_hi20   = 0x80000 = -524288
+## %pc64_lo20 = 0x00001 = 1
+## %pc64_hi12 = 0x000 = 0
+# RUN: ld.lld %t/extreme.o --section-start=.rodata=0x80000111 --section-start=.text=0xff8 -o %t/extreme18
+# RUN: llvm-objdump -d --no-show-raw-insn %t/extreme18 | FileCheck %s --check-prefix=EXTREME18
+# EXTREME18:      pcalau12i $t1, -524288
+# EXTREME18-NEXT: addi.d $t0, $zero, 273
+# EXTREME18-NEXT: lu32i.d   $t0, 1
+# EXTREME18-NEXT: lu52i.d   $t0, $t0, 0
 
 #--- a.s
 .rodata
@@ -303,7 +314,7 @@ x:
 .text
 .global _start
 _start:
-    addi.d    $t0, $zero, %pc_lo12(x)
     pcalau12i $t1, %pc_hi20(x)
+    addi.d    $t0, $zero, %pc_lo12(x)
     lu32i.d   $t0, %pc64_lo20(x)
     lu52i.d   $t0, $t0, %pc64_hi12(x)
-- 
Gitee


From b0e5225dea19a71b0c2f2168c117ac5032c2d18a Mon Sep 17 00:00:00 2001
From: Zhaoxin Yang <yangzhaoxin@loongson.cn>
Date: Tue, 9 Jul 2024 14:13:19 +0800
Subject: [PATCH 14/23] [LoongArch][clang] Add support for option `-msimd=` and
 macro `__loongarch_simd_width`. (#97984)

(cherry picked from commit 626c7ce33f850831949e4e724016ddbff3a34990)
---
 .../clang/Basic/DiagnosticDriverKinds.td      |   2 +
 clang/include/clang/Driver/Options.td         |   3 +
 clang/lib/Basic/Targets/LoongArch.cpp         |   8 +-
 .../lib/Driver/ToolChains/Arch/LoongArch.cpp  |  29 ++++
 clang/test/Driver/loongarch-msimd.c           | 129 ++++++++++++++++++
 clang/test/Preprocessor/init-loongarch.c      |   3 +
 6 files changed, 172 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/Driver/loongarch-msimd.c

diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 6b68bc458b93..060f96118364 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -757,6 +757,8 @@ def err_drv_loongarch_wrong_fpu_width_for_lasx : Error<
   "wrong fpu width; LASX depends on 64-bit FPU.">;
 def err_drv_loongarch_invalid_simd_option_combination : Error<
   "invalid option combination; LASX depends on LSX.">;
+def err_drv_loongarch_invalid_msimd_EQ : Error<
+  "invalid argument '%0' to -msimd=; must be one of: none, lsx, lasx">;
 
 def err_drv_expand_response_file : Error<
   "failed to expand response file: %0">;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 344c8bd49da7..530bb53ea9b5 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4236,6 +4236,9 @@ def mlasx : Flag<["-"], "mlasx">, Group<m_loongarch_Features_Group>,
   HelpText<"Enable Loongson Advanced SIMD Extension (LASX).">;
 def mno_lasx : Flag<["-"], "mno-lasx">, Group<m_loongarch_Features_Group>,
   HelpText<"Disable Loongson Advanced SIMD Extension (LASX).">;
+def msimd_EQ : Joined<["-"], "msimd=">, Group<m_loongarch_Features_Group>,
+  Flags<[TargetSpecific]>,
+  HelpText<"Select the SIMD extension(s) to be enabled in LoongArch either 'none', 'lsx', 'lasx'.">;
 def mnop_mcount : Flag<["-"], "mnop-mcount">, HelpText<"Generate mcount/__fentry__ calls as nops. To activate they need to be patched in.">,
   Flags<[CC1Option]>, Group<m_Group>,
   MarshallingInfoFlag<CodeGenOpts<"MNopMCount">>;
diff --git a/clang/lib/Basic/Targets/LoongArch.cpp b/clang/lib/Basic/Targets/LoongArch.cpp
index 88537989a051..913404240916 100644
--- a/clang/lib/Basic/Targets/LoongArch.cpp
+++ b/clang/lib/Basic/Targets/LoongArch.cpp
@@ -208,10 +208,14 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts,
     TuneCPU = ArchName;
   Builder.defineMacro("__loongarch_tune", Twine('"') + TuneCPU + Twine('"'));
 
-  if (HasFeatureLSX)
+  if (HasFeatureLASX) {
+    Builder.defineMacro("__loongarch_simd_width", "256");
     Builder.defineMacro("__loongarch_sx", Twine(1));
-  if (HasFeatureLASX)
     Builder.defineMacro("__loongarch_asx", Twine(1));
+  } else if (HasFeatureLSX) {
+    Builder.defineMacro("__loongarch_simd_width", "128");
+    Builder.defineMacro("__loongarch_sx", Twine(1));
+  }
 
   StringRef ABI = getABI();
   if (ABI == "lp64d" || ABI == "lp64f" || ABI == "lp64s")
diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index 31153a67ad28..2d9c3f810a06 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -207,6 +207,35 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     } else /*-mno-lasx*/
       Features.push_back("-lasx");
   }
+
+  // Select lsx/lasx feature determined by -msimd=.
+  // Option -msimd= has lower priority than -m[no-]lsx and -m[no-]lasx.
+  if (const Arg *A = Args.getLastArg(options::OPT_msimd_EQ)) {
+    StringRef MSIMD = A->getValue();
+    if (MSIMD == "lsx") {
+      // Option -msimd=lsx depends on 64-bit FPU.
+      // -m*-float and -mfpu=none/0/32 conflict with -mlsx.
+      if (llvm::find(Features, "-d") != Features.end())
+        D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LSX*/ 0;
+      // The previous option does not contain feature -lsx.
+      else if (llvm::find(Features, "-lsx") == Features.end())
+        Features.push_back("+lsx");
+    } else if (MSIMD == "lasx") {
+      // Option -msimd=lasx depends on 64-bit FPU and LSX.
+      // -m*-float and -mfpu=none/0/32 conflict with -mlsx.
+      if (llvm::find(Features, "-d") != Features.end())
+        D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LASX*/ 1;
+      else if (llvm::find(Features, "-lsx") != Features.end())
+        D.Diag(diag::err_drv_loongarch_invalid_simd_option_combination);
+      // The previous option does not contain feature -lasx.
+      else if (llvm::find(Features, "-lasx") == Features.end()) {
+        Features.push_back("+lsx");
+        Features.push_back("+lasx");
+      }
+    } else if (MSIMD != "none") {
+      D.Diag(diag::err_drv_loongarch_invalid_msimd_EQ) << MSIMD;
+    }
+  }
 }
 
 std::string loongarch::postProcessTargetCPUString(const std::string &CPU,
diff --git a/clang/test/Driver/loongarch-msimd.c b/clang/test/Driver/loongarch-msimd.c
new file mode 100644
index 000000000000..984f3e8bf2bf
--- /dev/null
+++ b/clang/test/Driver/loongarch-msimd.c
@@ -0,0 +1,129 @@
+/// Test -msimd options.
+
+/// COM: -msimd=none
+// RUN: %clang --target=loongarch64 -mlasx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,LASX
+// RUN: %clang --target=loongarch64 -mlasx -mlsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,LASX
+
+// RUN: %clang --target=loongarch64 -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlsx -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -mlsx -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -mlsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+
+
+/// COM: -msimd=lsx
+// RUN: %clang --target=loongarch64 -mlasx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,LASX
+// RUN: %clang --target=loongarch64 -mlasx -mlsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,LASX
+
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlsx -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -mlsx -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+
+// RUN: %clang --target=loongarch64 -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -mlsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+
+
+/// COM: -msimd=lasx
+// RUN: %clang --target=loongarch64 -msimd=lasx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,LASX
+// RUN: %clang --target=loongarch64 -mlasx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,LASX
+// RUN: %clang --target=loongarch64 -mlasx -mlsx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,LASX
+// RUN: %clang --target=loongarch64 -mlsx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,LASX
+
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -mlsx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+
+
+// LSX: "-target-feature" "+lsx"
+// LASX: "-target-feature" "+lasx"
+// NOLSX-NOT: "-target-feature" "+lsx"
+// NOLASX-NOT: "-target-feature" "+lasx"
diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c
index e235a7283021..154ad82e0f8c 100644
--- a/clang/test/Preprocessor/init-loongarch.c
+++ b/clang/test/Preprocessor/init-loongarch.c
@@ -817,6 +817,7 @@
 // RUN: %clang --target=loongarch64 -mno-lasx -mlsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
 // MLSX-NOT: #define __loongarch_asx
+// MLSX: #define __loongarch_simd_width 128
 // MLSX: #define __loongarch_sx 1
 
 // RUN: %clang --target=loongarch64 -mlasx -x c -E -dM %s -o - \
@@ -828,6 +829,7 @@
 // RUN: %clang --target=loongarch64 -mlasx -mlsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
 // MLASX: #define __loongarch_asx 1
+// MLASX: #define __loongarch_simd_width 256
 // MLASX: #define __loongarch_sx 1
 
 // RUN: %clang --target=loongarch64 -mno-lsx -x c -E -dM %s -o - \
@@ -841,4 +843,5 @@
 // RUN: %clang --target=loongarch64 -mno-lasx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
 // MNO-LSX-NOT: #define __loongarch_asx
+// MNO-LSX-NOT: #define __loongarch_simd_width
 // MNO-LSX-NOT: #define __loongarch_sx
-- 
Gitee


From b2f8e92e88bf63e54ace9b2f9b2aa77dcf0c50c4 Mon Sep 17 00:00:00 2001
From: Zhaoxin Yang <yangzhaoxin@loongson.cn>
Date: Thu, 11 Jul 2024 17:43:38 +0800
Subject: [PATCH 15/23] [LoongArch][clang] Modify `loongarch-msimd.c` to avoid
 `grep -o`. NFC (#98442)

Address buildbot failure:
https://lab.llvm.org/buildbot/#/builders/64/builds/250/steps/6/logs/FAIL__Clang__loongarch-msimd_c

(cherry picked from commit 74b933c28e777fdc04e50f5f96e4f7a4ad1e79a6)
---
 clang/test/Driver/loongarch-msimd.c | 42 +++--------------------------
 1 file changed, 4 insertions(+), 38 deletions(-)

diff --git a/clang/test/Driver/loongarch-msimd.c b/clang/test/Driver/loongarch-msimd.c
index 984f3e8bf2bf..cd463300c874 100644
--- a/clang/test/Driver/loongarch-msimd.c
+++ b/clang/test/Driver/loongarch-msimd.c
@@ -2,128 +2,94 @@
 
 /// COM: -msimd=none
 // RUN: %clang --target=loongarch64 -mlasx -msimd=none -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,LASX
 // RUN: %clang --target=loongarch64 -mlasx -mlsx -msimd=none -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,LASX
 
 // RUN: %clang --target=loongarch64 -msimd=none -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -msimd=none -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 // RUN: %clang --target=loongarch64 -mlsx -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 // RUN: %clang --target=loongarch64 -mno-lasx -msimd=none -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 // RUN: %clang --target=loongarch64 -mno-lasx -mlsx -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 // RUN: %clang --target=loongarch64 -mno-lasx -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 // RUN: %clang --target=loongarch64 -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -msimd=none -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
 // RUN: %clang --target=loongarch64 -mno-lasx -mlsx -msimd=none -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
 // RUN: %clang --target=loongarch64 -mlsx -msimd=none -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
 
 
 /// COM: -msimd=lsx
 // RUN: %clang --target=loongarch64 -mlasx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,LASX
 // RUN: %clang --target=loongarch64 -mlasx -mlsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,LASX
 
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 // RUN: %clang --target=loongarch64 -mlsx -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 // RUN: %clang --target=loongarch64 -mno-lasx -mlsx -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 // RUN: %clang --target=loongarch64 -mno-lasx -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 // RUN: %clang --target=loongarch64 -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 
 // RUN: %clang --target=loongarch64 -msimd=lsx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
 // RUN: %clang --target=loongarch64 -mlsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
 // RUN: %clang --target=loongarch64 -mno-lasx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
 // RUN: %clang --target=loongarch64 -mno-lasx -mlsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
 
 
 /// COM: -msimd=lasx
 // RUN: %clang --target=loongarch64 -msimd=lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,LASX
 // RUN: %clang --target=loongarch64 -mlasx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,LASX
 // RUN: %clang --target=loongarch64 -mlasx -mlsx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,LASX
 // RUN: %clang --target=loongarch64 -mlsx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,LASX
 
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 // RUN: %clang --target=loongarch64 -mno-lasx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
 
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
 // RUN: %clang --target=loongarch64 -mno-lasx -mlsx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
-// RUN:   grep -o '"-target-feature" "+[[:alnum:]]\+"' | sort -r | \
 // RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
 
 
-// LSX: "-target-feature" "+lsx"
-// LASX: "-target-feature" "+lasx"
+// NOLSX-NOT: "-target-feature" "+lsx"
+// NOLASX-NOT: "-target-feature" "+lasx"
+// LSX-DAG: "-target-feature" "+lsx"
+// LASX-DAG: "-target-feature" "+lasx"
 // NOLSX-NOT: "-target-feature" "+lsx"
 // NOLASX-NOT: "-target-feature" "+lasx"
-- 
Gitee


From b5d3aa3ac0dcf98fbb5f8d2d9de295be991c9e8f Mon Sep 17 00:00:00 2001
From: Zhaoxin Yang <yangzhaoxin@loongson.cn>
Date: Tue, 23 Jul 2024 12:06:59 +0800
Subject: [PATCH 16/23] [LoongArch][CodeGen] Implement 128-bit and 256-bit
 vector shuffle. (#100054)

[LoongArch][CodeGen] Implement 128-bit and 256-bit vector shuffle
operations.

In LoongArch, shuffle operations can be divided into two types:
- Single-vector shuffle: Shuffle using only one vector, with the other
vector being `undef` or not selected by mask. This can be expanded to
instructions such as `vreplvei` and `vshuf4i`.
- Two-vector shuffle: Shuffle using two vectors. This can be expanded to
instructions like `vilv[l/h]`, `vpack[ev/od]`, `vpick[ev/od]` and the
basic `vshuf`.

In the future, more optimizations may be added, such as handling 1-bit
vectors and processing single element patterns, etc.

(cherry picked from commit 464ea880cf7710cc8675c83001d7ae020406cf42)
---
 .../lib/Driver/ToolChains/Arch/LoongArch.cpp  |   4 +-
 .../LoongArch/LoongArchISelLowering.cpp       | 933 +++++++++++++++++-
 .../Target/LoongArch/LoongArchISelLowering.h  |  10 +
 .../LoongArch/LoongArchLASXInstrInfo.td       | 130 +++
 .../Target/LoongArch/LoongArchLSXInstrInfo.td | 148 +++
 .../lasx/ir-instruction/shuffle-as-xvilv.ll   |  74 ++
 .../lasx/ir-instruction/shuffle-as-xvpack.ll  | 124 +++
 .../lasx/ir-instruction/shuffle-as-xvpick.ll  |  84 ++
 .../ir-instruction/shuffle-as-xvrepl128vei.ll |  65 ++
 .../lasx/ir-instruction/shuffle-as-xvshuf.ll  |  76 ++
 .../ir-instruction/shuffle-as-xvshuf4i.ll     |  43 +
 .../lsx/ir-instruction/shuffle-as-vilv.ll     |  82 ++
 .../lsx/ir-instruction/shuffle-as-vpack.ll    | 122 +++
 .../lsx/ir-instruction/shuffle-as-vpick.ll    |  82 ++
 .../lsx/ir-instruction/shuffle-as-vreplvei.ll |  62 ++
 .../lsx/ir-instruction/shuffle-as-vshuf.ll    |  84 ++
 .../lsx/ir-instruction/shuffle-as-vshuf4i.ll  |  42 +
 17 files changed, 2160 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll

diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index 2d9c3f810a06..8b3d2837a4e5 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -216,7 +216,7 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
       // Option -msimd=lsx depends on 64-bit FPU.
       // -m*-float and -mfpu=none/0/32 conflict with -mlsx.
       if (llvm::find(Features, "-d") != Features.end())
-        D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LSX*/ 0;
+        D.Diag(diag::err_drv_loongarch_wrong_fpu_width_for_lsx);
       // The previous option does not contain feature -lsx.
       else if (llvm::find(Features, "-lsx") == Features.end())
         Features.push_back("+lsx");
@@ -224,7 +224,7 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
       // Option -msimd=lasx depends on 64-bit FPU and LSX.
       // -m*-float and -mfpu=none/0/32 conflict with -mlsx.
       if (llvm::find(Features, "-d") != Features.end())
-        D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LASX*/ 1;
+        D.Diag(diag::err_drv_loongarch_wrong_fpu_width_for_lasx);
       else if (llvm::find(Features, "-lsx") != Features.end())
         D.Diag(diag::err_drv_loongarch_invalid_simd_option_combination);
       // The previous option does not contain feature -lasx.
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index df1b17649b7d..618ae7056425 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -247,9 +247,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
 
       setOperationAction(ISD::SETCC, VT, Legal);
       setOperationAction(ISD::VSELECT, VT, Legal);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
     }
     for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
-      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
       setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal);
       setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, VT,
                          Legal);
@@ -293,9 +293,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
 
       setOperationAction(ISD::SETCC, VT, Legal);
       setOperationAction(ISD::VSELECT, VT, Legal);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
     }
     for (MVT VT : {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}) {
-      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
       setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal);
       setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, VT,
                          Legal);
@@ -422,9 +422,926 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
   return SDValue();
 }
 
+/// Check that every CheckStride-th element of [Begin, End) is either undef
+/// (-1) or equal to ExpectedIndex, ExpectedIndex + ExpectedIndexStride, ...
+template <typename ValType>
+static bool
+fitsRegularPattern(typename SmallVectorImpl<ValType>::const_iterator Begin,
+                   unsigned CheckStride,
+                   typename SmallVectorImpl<ValType>::const_iterator End,
+                   ValType ExpectedIndex, unsigned ExpectedIndexStride) {
+  for (auto Cur = Begin; Cur != End;) {
+    // -1 marks an undef mask entry, which is compatible with any pattern.
+    if (!(*Cur == -1 || *Cur == ExpectedIndex))
+      return false;
+    ExpectedIndex += ExpectedIndexStride;
+
+    // Advance CheckStride positions, but never beyond End: stepping an
+    // iterator past End is undefined behaviour, so move one element at a
+    // time and stop as soon as End is reached.
+    for (unsigned Step = 0; Step < CheckStride && Cur != End; ++Step)
+      ++Cur;
+  }
+  return true;
+}
+
+/// Lower VECTOR_SHUFFLE into VREPLVEI (if possible).
+///
+/// VREPLVEI broadcasts a single element, selected by an integer immediate, to
+/// every element of the result. The matching shuffle mask looks like:
+///   <x, x, x, ...>
+/// where x is any valid index.
+///
+/// Undef mask entries are treated as whatever value makes the mask fit this
+/// form. Returns an empty SDValue when the mask is not a splat.
+static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+  // The first defined mask entry decides which element would be broadcast.
+  int SplatIndex = -1;
+  for (int Elt : Mask) {
+    if (Elt == -1)
+      continue;
+    SplatIndex = Elt;
+    break;
+  }
+
+  // Every mask entry is undef, so the whole result is undef.
+  if (SplatIndex == -1)
+    return DAG.getUNDEF(VT);
+
+  assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index");
+  if (!fitsRegularPattern<int>(Mask.begin(), 1, Mask.end(), SplatIndex, 0))
+    return SDValue();
+  APInt Imm(64, SplatIndex);
+  return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
+                     DAG.getConstant(Imm, DL, MVT::i64));
+}
+
+/// Lower VECTOR_SHUFFLE into VSHUF4I (if possible).
+///
+/// VSHUF4I splits the vector into blocks of four elements, then shuffles these
+/// elements according to a <4 x i2> constant (encoded as an integer immediate).
+///
+/// It is therefore possible to lower into VSHUF4I when the mask takes the form:
+///   <a, b, c, d, a+4, b+4, c+4, d+4, a+8, b+8, c+8, d+8, ...>
+/// When undefs appear they are treated as if they were whatever value is
+/// necessary in order to fit the above form.
+///
+/// For example:
+///   %2 = shufflevector <8 x i16> %0, <8 x i16> undef,
+///                      <8 x i32> <i32 3, i32 2, i32 1, i32 0,
+///                                 i32 7, i32 6, i32 5, i32 4>
+/// is lowered to:
+///   (VSHUF4I_H $v0, $v1, 27)
+/// where the 27 comes from:
+///   3 + (2 << 2) + (1 << 4) + (0 << 6)
+static SDValue lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
+                                           MVT VT, SDValue V1, SDValue V2,
+                                           SelectionDAG &DAG) {
+
+  // When the size is less than 4, lower cost instructions may be used.
+  if (Mask.size() < 4)
+    return SDValue();
+  // SubMask[i] collects the in-block index used at positions i, i+4, i+8, ...
+  int SubMask[4] = {-1, -1, -1, -1};
+  for (unsigned i = 0; i < 4; ++i) {
+    for (unsigned j = i; j < Mask.size(); j += 4) {
+      int Idx = Mask[j];
+
+      // Convert from vector index to 4-element subvector index
+      // If an index refers to an element outside of the subvector then give up
+      if (Idx != -1) {
+        Idx -= 4 * (j / 4);
+        if (Idx < 0 || Idx >= 4)
+          return SDValue();
+      }
+
+      // If the mask has an undef, replace it with the current index.
+      // Note that it might still be undef if the current index is also undef
+      if (SubMask[i] == -1)
+        SubMask[i] = Idx;
+      // Check that non-undef values are the same as in the mask. If they
+      // aren't then give up
+      else if (Idx != -1 && Idx != SubMask[i])
+        return SDValue();
+    }
+  }
+
+  // Calculate the immediate. Replace any remaining undefs with zero
+  APInt Imm(64, 0);
+  for (int i = 3; i >= 0; --i) {
+    int Idx = SubMask[i];
+
+    if (Idx == -1)
+      Idx = 0;
+    // Two bits per entry; iterating from 3 down leaves SubMask[0] lowest.
+    Imm <<= 2;
+    Imm |= Idx & 0x3;
+  }
+  // Only V1 and the immediate are operands of the node; V2 is not used.
+  return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, V1,
+                     DAG.getConstant(Imm, DL, MVT::i64));
+}
+
+/// Lower VECTOR_SHUFFLE into VPACKEV (if possible).
+///
+/// VPACKEV interleaves the even elements from each vector.
+///
+/// It is possible to lower into VPACKEV when the mask consists of two of the
+/// following forms interleaved:
+///   <0, 2, 4, ...>
+///   <n, n+2, n+4, ...>
+/// where n is the number of elements in the vector.
+/// For example:
+///   <0, 0, 2, 2, 4, 4, ...>
+///   <0, n, 2, n+2, 4, n+4, ...>
+///
+/// When undefs appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VPACKEV(const SDLoc &DL, ArrayRef<int> Mask,
+                                           MVT VT, SDValue V1, SDValue V2,
+                                           SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  SDValue OriV1 = V1, OriV2 = V2;
+  // Mask positions 0, 2, 4, ... must read 0, 2, 4, ... of V1 or of V2 (+n).
+  if (fitsRegularPattern<int>(Begin, 2, End, 0, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size(), 2))
+    V1 = OriV2;
+  else
+    return SDValue();
+  // Mask positions 1, 3, 5, ... are matched against the same two sequences.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 0, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size(), 2))
+    V2 = OriV2;
+  else
+    return SDValue();
+  // The source chosen for the odd positions is passed as the first operand.
+  return DAG.getNode(LoongArchISD::VPACKEV, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VPACKOD (if possible).
+///
+/// VPACKOD interleaves the odd elements from each vector.
+///
+/// It is possible to lower into VPACKOD when the mask consists of two of the
+/// following forms interleaved:
+///   <1, 3, 5, ...>
+///   <n+1, n+3, n+5, ...>
+/// where n is the number of elements in the vector.
+/// For example:
+///   <1, 1, 3, 3, 5, 5, ...>
+///   <1, n+1, 3, n+3, 5, n+5, ...>
+///
+/// When undefs appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VPACKOD(const SDLoc &DL, ArrayRef<int> Mask,
+                                           MVT VT, SDValue V1, SDValue V2,
+                                           SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  SDValue OriV1 = V1, OriV2 = V2;
+  // Mask positions 0, 2, 4, ... must read 1, 3, 5, ... of V1 or of V2 (+n).
+  if (fitsRegularPattern<int>(Begin, 2, End, 1, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size() + 1, 2))
+    V1 = OriV2;
+  else
+    return SDValue();
+  // Mask positions 1, 3, 5, ... are matched against the same two sequences.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 1, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size() + 1, 2))
+    V2 = OriV2;
+  else
+    return SDValue();
+  // The source chosen for the odd positions is passed as the first operand.
+  return DAG.getNode(LoongArchISD::VPACKOD, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VILVH (if possible).
+///
+/// VILVH interleaves consecutive elements from the left (highest-indexed) half
+/// of each vector.
+///
+/// It is possible to lower into VILVH when the mask consists of two of the
+/// following forms interleaved:
+///   <x, x+1, x+2, ...>
+///   <n+x, n+x+1, n+x+2, ...>
+/// where n is the number of elements in the vector and x is half n.
+/// For example:
+///   <x, x, x+1, x+1, x+2, x+2, ...>
+///   <x, n+x, x+1, n+x+1, x+2, n+x+2, ...>
+///
+/// When undefs appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VILVH(const SDLoc &DL, ArrayRef<int> Mask,
+                                         MVT VT, SDValue V1, SDValue V2,
+                                         SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+  SDValue OriV1 = V1, OriV2 = V2;
+  // Mask positions 0, 2, 4, ... must read x, x+1, ... of V1 or of V2 (+n).
+  if (fitsRegularPattern<int>(Begin, 2, End, HalfSize, 1))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size() + HalfSize, 1))
+    V1 = OriV2;
+  else
+    return SDValue();
+  // Mask positions 1, 3, 5, ... are matched against the same two sequences.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, HalfSize, 1))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size() + HalfSize,
+                                   1))
+    V2 = OriV2;
+  else
+    return SDValue();
+  // The source chosen for the odd positions is passed as the first operand.
+  return DAG.getNode(LoongArchISD::VILVH, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VILVL (if possible).
+///
+/// VILVL interleaves consecutive elements from the right (lowest-indexed) half
+/// of each vector.
+///
+/// It is possible to lower into VILVL when the mask consists of two of the
+/// following forms interleaved:
+///   <0, 1, 2, ...>
+///   <n, n+1, n+2, ...>
+/// where n is the number of elements in the vector.
+/// For example:
+///   <0, 0, 1, 1, 2, 2, ...>
+///   <0, n, 1, n+1, 2, n+2, ...>
+///
+/// When undefs appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VILVL(const SDLoc &DL, ArrayRef<int> Mask,
+                                         MVT VT, SDValue V1, SDValue V2,
+                                         SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  SDValue OriV1 = V1, OriV2 = V2;
+  // Mask positions 0, 2, 4, ... must read 0, 1, 2, ... of V1 or of V2 (+n).
+  if (fitsRegularPattern<int>(Begin, 2, End, 0, 1))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size(), 1))
+    V1 = OriV2;
+  else
+    return SDValue();
+  // Mask positions 1, 3, 5, ... are matched against the same two sequences.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 0, 1))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size(), 1))
+    V2 = OriV2;
+  else
+    return SDValue();
+  // The source chosen for the odd positions is passed as the first operand.
+  return DAG.getNode(LoongArchISD::VILVL, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VPICKEV (if possible).
+///
+/// VPICKEV copies the even elements of each vector into the result vector.
+///
+/// It is possible to lower into VPICKEV when the mask consists of two of the
+/// following forms concatenated:
+///   <0, 2, 4, ...>
+///   <n, n+2, n+4, ...>
+/// where n is the number of elements in the vector.
+/// For example:
+///   <0, 2, 4, ..., 0, 2, 4, ...>
+///   <0, 2, 4, ..., n, n+2, n+4, ...>
+///
+/// When undefs appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VPICKEV(const SDLoc &DL, ArrayRef<int> Mask,
+                                           MVT VT, SDValue V1, SDValue V2,
+                                           SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &Mid = Mask.begin() + Mask.size() / 2;
+  const auto &End = Mask.end();
+  SDValue OriV1 = V1, OriV2 = V2;
+  // The first half of the mask must read 0, 2, 4, ... of V1 or of V2 (+n).
+  if (fitsRegularPattern<int>(Begin, 1, Mid, 0, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 1, Mid, Mask.size(), 2))
+    V1 = OriV2;
+  else
+    return SDValue();
+  // The second half of the mask is matched against the same two sequences.
+  if (fitsRegularPattern<int>(Mid, 1, End, 0, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Mid, 1, End, Mask.size(), 2))
+    V2 = OriV2;
+
+  else
+    return SDValue();
+  // The source chosen for the second half is passed as the first operand.
+  return DAG.getNode(LoongArchISD::VPICKEV, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VPICKOD (if possible).
+///
+/// VPICKOD copies the odd elements of each vector into the result vector.
+///
+/// It is possible to lower into VPICKOD when the mask consists of two of the
+/// following forms concatenated:
+///   <1, 3, 5, ...>
+///   <n+1, n+3, n+5, ...>
+/// where n is the number of elements in the vector.
+/// For example:
+///   <1, 3, 5, ..., 1, 3, 5, ...>
+///   <1, 3, 5, ..., n+1, n+3, n+5, ...>
+///
+/// When undefs appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VPICKOD(const SDLoc &DL, ArrayRef<int> Mask,
+                                           MVT VT, SDValue V1, SDValue V2,
+                                           SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &Mid = Mask.begin() + Mask.size() / 2;
+  const auto &End = Mask.end();
+  SDValue OriV1 = V1, OriV2 = V2;
+  // The first half of the mask must read 1, 3, 5, ... of V1 or of V2 (+n).
+  if (fitsRegularPattern<int>(Begin, 1, Mid, 1, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 1, Mid, Mask.size() + 1, 2))
+    V1 = OriV2;
+  else
+    return SDValue();
+  // The second half of the mask is matched against the same two sequences.
+  if (fitsRegularPattern<int>(Mid, 1, End, 1, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Mid, 1, End, Mask.size() + 1, 2))
+    V2 = OriV2;
+  else
+    return SDValue();
+  // The source chosen for the second half is passed as the first operand.
+  return DAG.getNode(LoongArchISD::VPICKOD, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VSHUF.
+///
+/// This mostly consists of converting the shuffle mask into a BUILD_VECTOR and
+/// adding it as an operand to the resulting VSHUF.
+static SDValue lowerVECTOR_SHUFFLE_VSHUF(const SDLoc &DL, ArrayRef<int> Mask,
+                                         MVT VT, SDValue V1, SDValue V2,
+                                         SelectionDAG &DAG) {
+  // Turn every mask entry (undef entries included, as -1) into an i64 constant.
+  SmallVector<SDValue, 16> Ops;
+  for (auto M : Mask)
+    Ops.push_back(DAG.getConstant(M, DL, MVT::i64));
+
+  EVT MaskVecTy = VT.changeVectorElementTypeToInteger();
+  SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, Ops);
+
+  // VECTOR_SHUFFLE concatenates the vectors in a vectorwise fashion.
+  // <0b00, 0b01> + <0b10, 0b11> -> <0b00, 0b01, 0b10, 0b11>
+  // VSHUF concatenates the vectors in a bitwise fashion:
+  // <0b00, 0b01> + <0b10, 0b11> ->
+  // 0b0100       + 0b1110       -> 0b01001110
+  //                                <0b10, 0b11, 0b00, 0b01>
+  // We must therefore swap the operands to get the correct result.
+  return DAG.getNode(LoongArchISD::VSHUF, DL, VT, MaskVec, V2, V1);
+}
+
+/// Dispatching routine to lower various 128-bit LoongArch vector shuffles.
+///
+/// This routine breaks down the specific type of 128-bit shuffle and
+/// dispatches to the lowering routines accordingly.
+static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                                  SDValue V1, SDValue V2, SelectionDAG &DAG) {
+  assert((VT.SimpleTy == MVT::v16i8 || VT.SimpleTy == MVT::v8i16 ||
+          VT.SimpleTy == MVT::v4i32 || VT.SimpleTy == MVT::v2i64 ||
+          VT.SimpleTy == MVT::v4f32 || VT.SimpleTy == MVT::v2f64) &&
+         "Vector type is unsupported for lsx!");
+  assert(V1.getSimpleValueType() == V2.getSimpleValueType() &&
+         "Two operands have different types!");
+  assert(VT.getVectorNumElements() == Mask.size() &&
+         "Unexpected mask size for shuffle!");
+  assert(Mask.size() % 2 == 0 && "Expected even mask size.");
+  // The single-input matchers are only tried when V2 is undef.
+  SDValue Result;
+  // TODO: Add more comparison patterns.
+  if (V2.isUndef()) {
+    if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG)))
+      return Result;
+    if ((Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG)))
+      return Result;
+
+    // TODO: This comment may be enabled in the future to better match the
+    // pattern for instruction selection.
+    /* V2 = V1; */
+  }
+
+  // It is recommended not to change the pattern comparison order for better
+  // performance.
+  if ((Result = lowerVECTOR_SHUFFLE_VPACKEV(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VPACKOD(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VILVH(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VILVL(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VPICKEV(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  // Defensive fallback: the VSHUF lowering above has no failing path.
+  return SDValue();
+}
+
+/// Lower VECTOR_SHUFFLE into XVREPLVEI (if possible).
+///
+/// It is a XVREPLVEI when the mask is:
+///   <x, x, x, ..., x+n, x+n, x+n, ...>
+/// where the number of x is equal to n and n is half the length of vector.
+///
+/// When undefs appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL,
+                                             ArrayRef<int> Mask, MVT VT,
+                                             SDValue V1, SDValue V2,
+                                             SelectionDAG &DAG) {
+  int SplatIndex = -1;
+  for (const auto &M : Mask) {
+    if (M != -1) {
+      SplatIndex = M;
+      break;
+    }
+  }
+  // No defined mask entry at all: the result is undef.
+  if (SplatIndex == -1)
+    return DAG.getUNDEF(VT);
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+  // First half must splat SplatIndex, second half SplatIndex + HalfSize.
+  assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index");
+  if (fitsRegularPattern<int>(Begin, 1, End - HalfSize, SplatIndex, 0) &&
+      fitsRegularPattern<int>(Begin + HalfSize, 1, End, SplatIndex + HalfSize,
+                              0)) {
+    APInt Imm(64, SplatIndex);
+    return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
+                       DAG.getConstant(Imm, DL, MVT::i64));
+  }
+
+  return SDValue();
+}
+
+/// Lower VECTOR_SHUFFLE into XVSHUF4I (if possible) via the VSHUF4I matcher.
+static SDValue lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+  // Masks of at most four elements are declined here because lower cost
+  // instructions may be used for them; larger masks share the VSHUF4I logic.
+  const bool TooSmall = Mask.size() <= 4;
+  return TooSmall ? SDValue()
+                  : lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG);
+}
+
+/// Lower VECTOR_SHUFFLE into XVPACKEV (if possible); same matcher as VPACKEV.
+static SDValue lowerVECTOR_SHUFFLE_XVPACKEV(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+  return lowerVECTOR_SHUFFLE_VPACKEV(DL, Mask, VT, V1, V2, DAG);
+}
+
+/// Lower VECTOR_SHUFFLE into XVPACKOD (if possible); same matcher as VPACKOD.
+static SDValue lowerVECTOR_SHUFFLE_XVPACKOD(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+  return lowerVECTOR_SHUFFLE_VPACKOD(DL, Mask, VT, V1, V2, DAG);
+}
+
+/// Lower VECTOR_SHUFFLE into XVILVH (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVILVH(const SDLoc &DL, ArrayRef<int> Mask,
+                                          MVT VT, SDValue V1, SDValue V2,
+                                          SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+  unsigned LeftSize = HalfSize / 2;
+  SDValue OriV1 = V1, OriV2 = V2;
+  // Even positions of each mask half, matched per half against V1 or V2.
+  if (fitsRegularPattern<int>(Begin, 2, End - HalfSize, HalfSize - LeftSize,
+                              1) &&
+      fitsRegularPattern<int>(Begin + HalfSize, 2, End, HalfSize + LeftSize, 1))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End - HalfSize,
+                                   Mask.size() + HalfSize - LeftSize, 1) &&
+           fitsRegularPattern<int>(Begin + HalfSize, 2, End,
+                                   Mask.size() + HalfSize + LeftSize, 1))
+    V1 = OriV2;
+  else
+    return SDValue();
+  // Odd positions of each mask half, matched the same way.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize, HalfSize - LeftSize,
+                              1) &&
+      fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End, HalfSize + LeftSize,
+                              1))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize,
+                                   Mask.size() + HalfSize - LeftSize, 1) &&
+           fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End,
+                                   Mask.size() + HalfSize + LeftSize, 1))
+    V2 = OriV2;
+  else
+    return SDValue();
+  // Emitted as the same VILVH node as the 128-bit case, operands (V2, V1).
+  return DAG.getNode(LoongArchISD::VILVH, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into XVILVL (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVILVL(const SDLoc &DL, ArrayRef<int> Mask,
+                                          MVT VT, SDValue V1, SDValue V2,
+                                          SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+  SDValue OriV1 = V1, OriV2 = V2;
+  // Even positions of each mask half, matched per half against V1 or V2.
+  if (fitsRegularPattern<int>(Begin, 2, End - HalfSize, 0, 1) &&
+      fitsRegularPattern<int>(Begin + HalfSize, 2, End, HalfSize, 1))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End - HalfSize, Mask.size(), 1) &&
+           fitsRegularPattern<int>(Begin + HalfSize, 2, End,
+                                   Mask.size() + HalfSize, 1))
+    V1 = OriV2;
+  else
+    return SDValue();
+  // Odd positions of each mask half, matched the same way.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize, 0, 1) &&
+      fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End, HalfSize, 1))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize, Mask.size(),
+                                   1) &&
+           fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End,
+                                   Mask.size() + HalfSize, 1))
+    V2 = OriV2;
+  else
+    return SDValue();
+  // Emitted as the same VILVL node as the 128-bit case, operands (V2, V1).
+  return DAG.getNode(LoongArchISD::VILVL, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into XVPICKEV (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVPICKEV(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &LeftMid = Mask.begin() + Mask.size() / 4;
+  const auto &Mid = Mask.begin() + Mask.size() / 2;
+  const auto &RightMid = Mask.end() - Mask.size() / 4;
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+  SDValue OriV1 = V1, OriV2 = V2;
+  // The first and third quarters of the mask select V1.
+  if (fitsRegularPattern<int>(Begin, 1, LeftMid, 0, 2) &&
+      fitsRegularPattern<int>(Mid, 1, RightMid, HalfSize, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 1, LeftMid, Mask.size(), 2) &&
+           fitsRegularPattern<int>(Mid, 1, RightMid, Mask.size() + HalfSize, 2))
+    V1 = OriV2;
+  else
+    return SDValue();
+  // The second and fourth quarters of the mask select V2.
+  if (fitsRegularPattern<int>(LeftMid, 1, Mid, 0, 2) &&
+      fitsRegularPattern<int>(RightMid, 1, End, HalfSize, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(LeftMid, 1, Mid, Mask.size(), 2) &&
+           fitsRegularPattern<int>(RightMid, 1, End, Mask.size() + HalfSize, 2))
+    V2 = OriV2;
+
+  else
+    return SDValue();
+  // Emitted as the same VPICKEV node as the 128-bit case, operands (V2, V1).
+  return DAG.getNode(LoongArchISD::VPICKEV, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into XVPICKOD (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVPICKOD(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &LeftMid = Mask.begin() + Mask.size() / 4;
+  const auto &Mid = Mask.begin() + Mask.size() / 2;
+  const auto &RightMid = Mask.end() - Mask.size() / 4;
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+  SDValue OriV1 = V1, OriV2 = V2;
+  // The first and third quarters of the mask select V1.
+  if (fitsRegularPattern<int>(Begin, 1, LeftMid, 1, 2) &&
+      fitsRegularPattern<int>(Mid, 1, RightMid, HalfSize + 1, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 1, LeftMid, Mask.size() + 1, 2) &&
+           fitsRegularPattern<int>(Mid, 1, RightMid, Mask.size() + HalfSize + 1,
+                                   2))
+    V1 = OriV2;
+  else
+    return SDValue();
+  // The second and fourth quarters of the mask select V2.
+  if (fitsRegularPattern<int>(LeftMid, 1, Mid, 1, 2) &&
+      fitsRegularPattern<int>(RightMid, 1, End, HalfSize + 1, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(LeftMid, 1, Mid, Mask.size() + 1, 2) &&
+           fitsRegularPattern<int>(RightMid, 1, End, Mask.size() + HalfSize + 1,
+                                   2))
+    V2 = OriV2;
+  else
+    return SDValue();
+  // Emitted as the same VPICKOD node as the 128-bit case, operands (V2, V1).
+  return DAG.getNode(LoongArchISD::VPICKOD, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into XVSHUF (if possible); empty SDValue otherwise.
+static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef<int> Mask,
+                                          MVT VT, SDValue V1, SDValue V2,
+                                          SelectionDAG &DAG) {
+  // Each mask half may read only its own half of V1/V2 (bounds exclusive).
+  int MaskSize = Mask.size();
+  int HalfSize = Mask.size() / 2;
+  const auto &Begin = Mask.begin();
+  const auto &Mid = Mask.begin() + HalfSize;
+  const auto &End = Mask.end();
+
+  // VECTOR_SHUFFLE concatenates the vectors:
+  //  <0, 1, 2, 3, 4, 5, 6, 7> + <8, 9, 10, 11, 12, 13, 14, 15>
+  //  shuffling ->
+  //  <0, 1, 2, 3, 8, 9, 10, 11> <4, 5, 6, 7, 12, 13, 14, 15>
+  //
+  // XVSHUF concatenates the vectors:
+  //  <a0, a1, a2, a3, b0, b1, b2, b3> + <a4, a5, a6, a7, b4, b5, b6, b7>
+  //  shuffling ->
+  //  <a0, a1, a2, a3, a4, a5, a6, a7> + <b0, b1, b2, b3, b4, b5, b6, b7>
+  SmallVector<SDValue, 8> MaskAlloc;
+  for (auto it = Begin; it < Mid; it++) {
+    if (*it < 0) // UNDEF
+      MaskAlloc.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
+    else if ((*it >= 0 && *it < HalfSize) ||
+             (*it >= MaskSize && *it < MaskSize + HalfSize)) {
+      int M = *it < HalfSize ? *it : *it - HalfSize;
+      MaskAlloc.push_back(DAG.getTargetConstant(M, DL, MVT::i64));
+    } else
+      return SDValue();
+  }
+  assert((int)MaskAlloc.size() == HalfSize && "xvshuf convert failed!");
+  // Second half: only the upper halves of V1 and V2 may be referenced.
+  for (auto it = Mid; it < End; it++) {
+    if (*it < 0) // UNDEF
+      MaskAlloc.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
+    else if ((*it >= HalfSize && *it < MaskSize) ||
+             (*it >= MaskSize + HalfSize && *it < MaskSize * 2)) {
+      int M = *it < MaskSize ? *it - HalfSize : *it - MaskSize;
+      MaskAlloc.push_back(DAG.getTargetConstant(M, DL, MVT::i64));
+    } else
+      return SDValue();
+  }
+  assert((int)MaskAlloc.size() == MaskSize && "xvshuf convert failed!");
+  // Materialise the rewritten mask as a vector operand of the VSHUF node.
+  EVT MaskVecTy = VT.changeVectorElementTypeToInteger();
+  SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, MaskAlloc);
+  return DAG.getNode(LoongArchISD::VSHUF, DL, VT, MaskVec, V2, V1);
+}
+
+/// Shuffle vectors by lane to generate more optimized instructions.
+/// 256-bit shuffles are always considered as 2-lane 128-bit shuffles.
+///
+/// Therefore, except for the following four cases, other cases are regarded
+/// as cross-lane shuffles, where optimization is relatively limited.
+///
+/// - Shuffle high, low lanes of two input vectors
+///   <0, 1, 2, 3> + <4, 5, 6, 7> --- <0, 5, 3, 6>
+/// - Shuffle low, high lanes of two input vectors
+///   <0, 1, 2, 3> + <4, 5, 6, 7> --- <3, 6, 0, 5>
+/// - Shuffle low, low lanes of two input vectors
+///   <0, 1, 2, 3> + <4, 5, 6, 7> --- <3, 6, 3, 6>
+/// - Shuffle high, high lanes of two input vectors
+///   <0, 1, 2, 3> + <4, 5, 6, 7> --- <0, 5, 0, 5>
+///
+/// The first case is the closest to LoongArch instructions and the other
+/// cases need to be converted to it for processing.
+///
+/// This function may modify V1, V2 and Mask
+static void canonicalizeShuffleVectorByLane(const SDLoc &DL,
+                                            MutableArrayRef<int> Mask, MVT VT,
+                                            SDValue &V1, SDValue &V2,
+                                            SelectionDAG &DAG) {
+
+  enum HalfMaskType { HighLaneTy, LowLaneTy, None };
+
+  int MaskSize = Mask.size();
+  int HalfSize = Mask.size() / 2;
+
+  HalfMaskType preMask = None, postMask = None;
+  // Classify the first half of the mask; undef entries fit either type.
+  if (std::all_of(Mask.begin(), Mask.begin() + HalfSize, [&](int M) {
+        return M < 0 || (M >= 0 && M < HalfSize) ||
+               (M >= MaskSize && M < MaskSize + HalfSize);
+      }))
+    preMask = HighLaneTy;
+  else if (std::all_of(Mask.begin(), Mask.begin() + HalfSize, [&](int M) {
+             return M < 0 || (M >= HalfSize && M < MaskSize) ||
+                    (M >= MaskSize + HalfSize && M < MaskSize * 2);
+           }))
+    preMask = LowLaneTy;
+  // Classify the second half of the mask the same way.
+  if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) {
+        return M < 0 || (M >= 0 && M < HalfSize) ||
+               (M >= MaskSize && M < MaskSize + HalfSize);
+      }))
+    postMask = HighLaneTy;
+  else if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) {
+             return M < 0 || (M >= HalfSize && M < MaskSize) ||
+                    (M >= MaskSize + HalfSize && M < MaskSize * 2);
+           }))
+    postMask = LowLaneTy;
+
+  // The pre-half of mask is high lane type, and the post-half of mask
+  // is low lane type, which is closest to the LoongArch instructions.
+  //
+  // Note: In the LoongArch architecture, the high lane of mask corresponds
+  // to the lower 128-bit of vector register, and the low lane of mask
+  // corresponds to the higher 128-bit of vector register.
+  if (preMask == HighLaneTy && postMask == LowLaneTy) {
+    return;
+  }
+  if (preMask == LowLaneTy && postMask == HighLaneTy) {
+    V1 = DAG.getBitcast(MVT::v4i64, V1);
+    V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
+                     DAG.getConstant(0b01001110, DL, MVT::i64));
+    V1 = DAG.getBitcast(VT, V1);
+
+    if (!V2.isUndef()) {
+      V2 = DAG.getBitcast(MVT::v4i64, V2);
+      V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
+                       DAG.getConstant(0b01001110, DL, MVT::i64));
+      V2 = DAG.getBitcast(VT, V2);
+    }
+    // Rebase the mask: first half moves down, second half up, by HalfSize.
+    for (auto it = Mask.begin(); it < Mask.begin() + HalfSize; it++) {
+      *it = *it < 0 ? *it : *it - HalfSize;
+    }
+    for (auto it = Mask.begin() + HalfSize; it < Mask.end(); it++) {
+      *it = *it < 0 ? *it : *it + HalfSize;
+    }
+  } else if (preMask == LowLaneTy && postMask == LowLaneTy) {
+    V1 = DAG.getBitcast(MVT::v4i64, V1);
+    V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
+                     DAG.getConstant(0b11101110, DL, MVT::i64));
+    V1 = DAG.getBitcast(VT, V1);
+
+    if (!V2.isUndef()) {
+      V2 = DAG.getBitcast(MVT::v4i64, V2);
+      V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
+                       DAG.getConstant(0b11101110, DL, MVT::i64));
+      V2 = DAG.getBitcast(VT, V2);
+    }
+    // Only the first half of the mask is rebased in this case.
+    for (auto it = Mask.begin(); it < Mask.begin() + HalfSize; it++) {
+      *it = *it < 0 ? *it : *it - HalfSize;
+    }
+  } else if (preMask == HighLaneTy && postMask == HighLaneTy) {
+    V1 = DAG.getBitcast(MVT::v4i64, V1);
+    V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
+                     DAG.getConstant(0b01000100, DL, MVT::i64));
+    V1 = DAG.getBitcast(VT, V1);
+
+    if (!V2.isUndef()) {
+      V2 = DAG.getBitcast(MVT::v4i64, V2);
+      V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
+                       DAG.getConstant(0b01000100, DL, MVT::i64));
+      V2 = DAG.getBitcast(VT, V2);
+    }
+    // Only the second half of the mask is rebased in this case.
+    for (auto it = Mask.begin() + HalfSize; it < Mask.end(); it++) {
+      *it = *it < 0 ? *it : *it + HalfSize;
+    }
+  } else { // cross-lane
+    return;
+  }
+}
+
+/// Lower a 256-bit vector shuffle by trying each known shuffle pattern.
+///
+/// The mask is canonicalized by lane first; the first pattern-specific
+/// routine that yields a node wins, otherwise an empty SDValue is returned.
+static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                                  SDValue V1, SDValue V2, SelectionDAG &DAG) {
+  assert((VT == MVT::v32i8 || VT == MVT::v16i16 || VT == MVT::v8i32 ||
+          VT == MVT::v4i64 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
+         "Vector type is unsupported for lasx!");
+  assert(V1.getSimpleValueType() == V2.getSimpleValueType() &&
+         "Two operands have different types!");
+  assert(VT.getVectorNumElements() == Mask.size() &&
+         "Unexpected mask size for shuffle!");
+  assert(Mask.size() % 2 == 0 && "Expected even mask size.");
+  assert(Mask.size() >= 4 && "Mask size is less than 4.");
+
+  // The incoming ArrayRef is read-only, so canonicalize a private copy of the
+  // mask. Cross-lane shuffles are left untouched by this step.
+  SmallVector<int> Canon(Mask);
+  canonicalizeShuffleVectorByLane(DL, Canon, VT, V1, V2, DAG);
+
+  // TODO: Add more comparison patterns.
+  // These two patterns are only attempted when V2 is undef.
+  if (V2.isUndef()) {
+    if (SDValue R = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, Canon, VT, V1, V2, DAG))
+      return R;
+    if (SDValue R = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, Canon, VT, V1, V2, DAG))
+      return R;
+
+    // TODO: This assignment may be enabled in the future to better match the
+    // pattern for instruction selection.
+    /* V2 = V1; */
+  }
+
+  // It is recommended not to change the pattern comparison order for better
+  // performance.
+  if (SDValue R = lowerVECTOR_SHUFFLE_XVPACKEV(DL, Canon, VT, V1, V2, DAG))
+    return R;
+  if (SDValue R = lowerVECTOR_SHUFFLE_XVPACKOD(DL, Canon, VT, V1, V2, DAG))
+    return R;
+  if (SDValue R = lowerVECTOR_SHUFFLE_XVILVH(DL, Canon, VT, V1, V2, DAG))
+    return R;
+  if (SDValue R = lowerVECTOR_SHUFFLE_XVILVL(DL, Canon, VT, V1, V2, DAG))
+    return R;
+  if (SDValue R = lowerVECTOR_SHUFFLE_XVPICKEV(DL, Canon, VT, V1, V2, DAG))
+    return R;
+  if (SDValue R = lowerVECTOR_SHUFFLE_XVPICKOD(DL, Canon, VT, V1, V2, DAG))
+    return R;
+  if (SDValue R = lowerVECTOR_SHUFFLE_XVSHUF(DL, Canon, VT, V1, V2, DAG))
+    return R;
+
+  return SDValue();
+}
+
 SDValue LoongArchTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                                      SelectionDAG &DAG) const {
-  // TODO: custom shuffle.
+  // Custom-lower VECTOR_SHUFFLE: first fold away undef operands and mask
+  // entries that point into an undef operand, then hand the node to the
+  // 128-bit or 256-bit lowering routine.
+  auto *Shuffle = cast<ShuffleVectorSDNode>(Op);
+  const SDLoc DL(Op);
+  const MVT VT = Op.getSimpleValueType();
+  const int NumElts = VT.getVectorNumElements();
+  ArrayRef<int> Mask = Shuffle->getMask();
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+
+  // A shuffle of two undef vectors is undef. When only the first operand is
+  // undef (shuffles are created with UNDEF as the second operand, but the
+  // first one may later be transformed to UNDEF), commute the node instead.
+  if (V1.isUndef())
+    return V2.isUndef() ? DAG.getUNDEF(VT)
+                        : DAG.getCommutedVectorShuffle(*Shuffle);
+
+  // With an undef second operand, rewrite every mask entry that selects from
+  // it to undef (-1) as well. This makes it easier to match the shuffle based
+  // solely on the mask.
+  auto SelectsFromV2 = [NumElts](int M) { return M >= NumElts; };
+  const bool SingleInput = V2.isUndef();
+  if (SingleInput && any_of(Mask, SelectsFromV2)) {
+    SmallVector<int, 8> Cleaned(Mask);
+    for (int &M : Cleaned)
+      if (SelectsFromV2(M))
+        M = -1;
+    return DAG.getVectorShuffle(VT, DL, V1, V2, Cleaned);
+  }
+
+  // Sanity check: every remaining mask entry is either -1 (undef) or an index
+  // into the operands that are still live.
+  [[maybe_unused]] const int IndexLimit = Mask.size() * (SingleInput ? 1 : 2);
+  assert(llvm::all_of(Mask,
+                      [&](int M) { return M >= -1 && M < IndexLimit; }) &&
+         "Out of bounds shuffle index");
+
+  // For each vector width, delegate to a specialized lowering routine.
+  if (VT.is128BitVector())
+    return lower128BitShuffle(DL, Mask, VT, V1, V2, DAG);
+
+  if (VT.is256BitVector())
+    return lower256BitShuffle(DL, Mask, VT, V1, V2, DAG);
+
   return SDValue();
 }
 
@@ -3439,6 +4356,16 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(MOVFCSR2GR)
     NODE_NAME_CASE(CACOP_D)
     NODE_NAME_CASE(CACOP_W)
+    NODE_NAME_CASE(VSHUF)
+    NODE_NAME_CASE(VPICKEV)
+    NODE_NAME_CASE(VPICKOD)
+    NODE_NAME_CASE(VPACKEV)
+    NODE_NAME_CASE(VPACKOD)
+    NODE_NAME_CASE(VILVL)
+    NODE_NAME_CASE(VILVH)
+    NODE_NAME_CASE(VSHUF4I)
+    NODE_NAME_CASE(VREPLVEI)
+    NODE_NAME_CASE(XVPERMI)
     NODE_NAME_CASE(VPICK_SEXT_ELT)
     NODE_NAME_CASE(VPICK_ZEXT_ELT)
     NODE_NAME_CASE(VREPLVE)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index a2ed149f4bb7..a5ee740c1261 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -117,6 +117,16 @@ enum NodeType : unsigned {
 
   // Vector Shuffle
   VREPLVE,
+  VSHUF,
+  VPICKEV,
+  VPICKOD,
+  VPACKEV,
+  VPACKOD,
+  VILVL,
+  VILVH,
+  VSHUF4I,
+  VREPLVEI,
+  XVPERMI,
 
   // Extended vector element extraction
   VPICK_SEXT_ELT,
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 492b62da6ce7..5b6721cdf1b4 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -10,6 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+def loongarch_xvpermi : SDNode<"LoongArchISD::XVPERMI", SDT_loongArchV1RUimm>;
+
 def lasxsplati8
   : PatFrag<(ops node:$e0),
             (v32i8 (build_vector node:$e0, node:$e0, node:$e0, node:$e0,
@@ -1571,6 +1573,134 @@ def : Pat<(loongarch_vreplve v8i32:$xj, GRLenVT:$rk),
 def : Pat<(loongarch_vreplve v4i64:$xj, GRLenVT:$rk),
           (XVREPLVE_D v4i64:$xj, GRLenVT:$rk)>;
 
+// XVSHUF_{B/H/W/D}. The first input is the selector; XVSHUF_B takes it last.
+def : Pat<(loongarch_vshuf v32i8:$xa, v32i8:$xj, v32i8:$xk),
+          (XVSHUF_B v32i8:$xj, v32i8:$xk, v32i8:$xa)>;
+def : Pat<(loongarch_vshuf v16i16:$xd, v16i16:$xj, v16i16:$xk),
+          (XVSHUF_H v16i16:$xd, v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vshuf v8i32:$xd, v8i32:$xj, v8i32:$xk),
+          (XVSHUF_W v8i32:$xd, v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vshuf v4i64:$xd, v4i64:$xj, v4i64:$xk),
+          (XVSHUF_D v4i64:$xd, v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vshuf v8i32:$xd, v8f32:$xj, v8f32:$xk),
+          (XVSHUF_W v8i32:$xd, v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vshuf v4i64:$xd, v4f64:$xj, v4f64:$xk),
+          (XVSHUF_D v4i64:$xd, v4f64:$xj, v4f64:$xk)>;
+
+// XVPICKEV_{B/H/W/D}. v8f32/v4f64 are selected to the integer W/D forms.
+def : Pat<(loongarch_vpickev v32i8:$xj, v32i8:$xk),
+          (XVPICKEV_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vpickev v16i16:$xj, v16i16:$xk),
+          (XVPICKEV_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vpickev v8i32:$xj, v8i32:$xk),
+          (XVPICKEV_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vpickev v4i64:$xj, v4i64:$xk),
+          (XVPICKEV_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vpickev v8f32:$xj, v8f32:$xk),
+          (XVPICKEV_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vpickev v4f64:$xj, v4f64:$xk),
+          (XVPICKEV_D v4f64:$xj, v4f64:$xk)>;
+
+// XVPICKOD_{B/H/W/D}. v8f32/v4f64 are selected to the integer W/D forms.
+def : Pat<(loongarch_vpickod v32i8:$xj, v32i8:$xk),
+          (XVPICKOD_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vpickod v16i16:$xj, v16i16:$xk),
+          (XVPICKOD_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vpickod v8i32:$xj, v8i32:$xk),
+          (XVPICKOD_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vpickod v4i64:$xj, v4i64:$xk),
+          (XVPICKOD_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vpickod v8f32:$xj, v8f32:$xk),
+          (XVPICKOD_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vpickod v4f64:$xj, v4f64:$xk),
+          (XVPICKOD_D v4f64:$xj, v4f64:$xk)>;
+
+// XVPACKEV_{B/H/W/D}. v8f32/v4f64 are selected to the integer W/D forms.
+def : Pat<(loongarch_vpackev v32i8:$xj, v32i8:$xk),
+          (XVPACKEV_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vpackev v16i16:$xj, v16i16:$xk),
+          (XVPACKEV_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vpackev v8i32:$xj, v8i32:$xk),
+          (XVPACKEV_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vpackev v4i64:$xj, v4i64:$xk),
+          (XVPACKEV_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vpackev v8f32:$xj, v8f32:$xk),
+          (XVPACKEV_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vpackev v4f64:$xj, v4f64:$xk),
+          (XVPACKEV_D v4f64:$xj, v4f64:$xk)>;
+
+// XVPACKOD_{B/H/W/D}. v8f32/v4f64 are selected to the integer W/D forms.
+def : Pat<(loongarch_vpackod v32i8:$xj, v32i8:$xk),
+          (XVPACKOD_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vpackod v16i16:$xj, v16i16:$xk),
+          (XVPACKOD_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vpackod v8i32:$xj, v8i32:$xk),
+          (XVPACKOD_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vpackod v4i64:$xj, v4i64:$xk),
+          (XVPACKOD_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vpackod v8f32:$xj, v8f32:$xk),
+          (XVPACKOD_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vpackod v4f64:$xj, v4f64:$xk),
+          (XVPACKOD_D v4f64:$xj, v4f64:$xk)>;
+
+// XVILVL_{B/H/W/D}. v8f32/v4f64 are selected to the integer W/D forms.
+def : Pat<(loongarch_vilvl v32i8:$xj, v32i8:$xk),
+          (XVILVL_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vilvl v16i16:$xj, v16i16:$xk),
+          (XVILVL_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vilvl v8i32:$xj, v8i32:$xk),
+          (XVILVL_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vilvl v4i64:$xj, v4i64:$xk),
+          (XVILVL_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vilvl v8f32:$xj, v8f32:$xk),
+          (XVILVL_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vilvl v4f64:$xj, v4f64:$xk),
+          (XVILVL_D v4f64:$xj, v4f64:$xk)>;
+
+// XVILVH_{B/H/W/D}. v8f32/v4f64 are selected to the integer W/D forms.
+def : Pat<(loongarch_vilvh v32i8:$xj, v32i8:$xk),
+          (XVILVH_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vilvh v16i16:$xj, v16i16:$xk),
+          (XVILVH_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vilvh v8i32:$xj, v8i32:$xk),
+          (XVILVH_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vilvh v4i64:$xj, v4i64:$xk),
+          (XVILVH_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vilvh v8f32:$xj, v8f32:$xk),
+          (XVILVH_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vilvh v4f64:$xj, v4f64:$xk),
+          (XVILVH_D v4f64:$xj, v4f64:$xk)>;
+
+// XVSHUF4I_{B/H/W}. v8f32 is selected to XVSHUF4I_W.
+def : Pat<(loongarch_vshuf4i v32i8:$xj, immZExt8:$ui8),
+          (XVSHUF4I_B v32i8:$xj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v16i16:$xj, immZExt8:$ui8),
+          (XVSHUF4I_H v16i16:$xj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v8i32:$xj, immZExt8:$ui8),
+          (XVSHUF4I_W v8i32:$xj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v8f32:$xj, immZExt8:$ui8),
+          (XVSHUF4I_W v8f32:$xj, immZExt8:$ui8)>;
+
+// XVREPL128VEI_{B/H/W/D}. The index immediate shrinks from 4 bits (B) to 1 (D).
+def : Pat<(loongarch_vreplvei v32i8:$xj, immZExt4:$ui4),
+          (XVREPL128VEI_B v32i8:$xj, immZExt4:$ui4)>;
+def : Pat<(loongarch_vreplvei v16i16:$xj, immZExt3:$ui3),
+          (XVREPL128VEI_H v16i16:$xj, immZExt3:$ui3)>;
+def : Pat<(loongarch_vreplvei v8i32:$xj, immZExt2:$ui2),
+          (XVREPL128VEI_W v8i32:$xj, immZExt2:$ui2)>;
+def : Pat<(loongarch_vreplvei v4i64:$xj, immZExt1:$ui1),
+          (XVREPL128VEI_D v4i64:$xj, immZExt1:$ui1)>;
+def : Pat<(loongarch_vreplvei v8f32:$xj, immZExt2:$ui2),
+          (XVREPL128VEI_W v8f32:$xj, immZExt2:$ui2)>;
+def : Pat<(loongarch_vreplvei v4f64:$xj, immZExt1:$ui1),
+          (XVREPL128VEI_D v4f64:$xj, immZExt1:$ui1)>;
+
+// XVPERMI_D. v4f64 is selected to the same instruction as v4i64.
+def : Pat<(loongarch_xvpermi v4i64:$xj, immZExt8:$ui8),
+          (XVPERMI_D v4i64:$xj, immZExt8:$ui8)>;
+def : Pat<(loongarch_xvpermi v4f64:$xj, immZExt8:$ui8),
+          (XVPERMI_D v4f64:$xj, immZExt8:$ui8)>;
+
 // XVREPLVE0_{W/D}
 def : Pat<(lasxsplatf32 FPR32:$fj),
           (XVREPLVE0_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32))>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 99ac2f3c162f..3519fa3142c3 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -15,6 +15,15 @@ def SDT_LoongArchVreplve : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
                                          SDTCisSameAs<0, 1>, SDTCisInt<2>]>;
 def SDT_LoongArchVecCond : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>]>;
 
+def SDT_LoongArchVShuf : SDTypeProfile<1, 3, [SDTCisVec<0>,
+                                         SDTCisInt<1>, SDTCisVec<1>, // op 1: integer vector
+                                         SDTCisSameAs<0, 2>,         // op 2 has the result type
+                                         SDTCisSameAs<2, 3>]>;       // op 3 matches op 2
+def SDT_LoongArchV2R : SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                         SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>;
+def SDT_loongArchV1RUimm : SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                         SDTCisSameAs<0, 1>, SDTCisVT<2, i64>]>; // op 2 is i64
+
 // Target nodes.
 def loongarch_vreplve : SDNode<"LoongArchISD::VREPLVE", SDT_LoongArchVreplve>;
 def loongarch_vall_nonzero : SDNode<"LoongArchISD::VALL_NONZERO",
@@ -31,6 +40,23 @@ def loongarch_vpick_sext_elt : SDNode<"LoongArchISD::VPICK_SEXT_ELT",
 def loongarch_vpick_zext_elt : SDNode<"LoongArchISD::VPICK_ZEXT_ELT",
                                       SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>>;
 
+def loongarch_vshuf : SDNode<"LoongArchISD::VSHUF", SDT_LoongArchVShuf>;
+def loongarch_vpickev : SDNode<"LoongArchISD::VPICKEV", SDT_LoongArchV2R>;
+def loongarch_vpickod : SDNode<"LoongArchISD::VPICKOD", SDT_LoongArchV2R>;
+def loongarch_vpackev : SDNode<"LoongArchISD::VPACKEV", SDT_LoongArchV2R>;
+def loongarch_vpackod : SDNode<"LoongArchISD::VPACKOD", SDT_LoongArchV2R>;
+def loongarch_vilvl : SDNode<"LoongArchISD::VILVL", SDT_LoongArchV2R>;
+def loongarch_vilvh : SDNode<"LoongArchISD::VILVH", SDT_LoongArchV2R>;
+
+def loongarch_vshuf4i : SDNode<"LoongArchISD::VSHUF4I", SDT_loongArchV1RUimm>;
+def loongarch_vreplvei : SDNode<"LoongArchISD::VREPLVEI", SDT_loongArchV1RUimm>;
+
+def immZExt1 : ImmLeaf<i64, [{return isUInt<1>(Imm);}]>;
+def immZExt2 : ImmLeaf<i64, [{return isUInt<2>(Imm);}]>;
+def immZExt3 : ImmLeaf<i64, [{return isUInt<3>(Imm);}]>;
+def immZExt4 : ImmLeaf<i64, [{return isUInt<4>(Imm);}]>;
+def immZExt8 : ImmLeaf<i64, [{return isUInt<8>(Imm);}]>;
+
 class VecCond<SDPatternOperator OpNode, ValueType TyNode,
               RegisterClass RC = LSX128>
     : Pseudo<(outs GPR:$rd), (ins RC:$vj),
@@ -1678,6 +1704,128 @@ def : Pat<(loongarch_vreplve v4i32:$vj, GRLenVT:$rk),
 def : Pat<(loongarch_vreplve v2i64:$vj, GRLenVT:$rk),
           (VREPLVE_D v2i64:$vj, GRLenVT:$rk)>;
 
+// VSHUF_{B/H/W/D}. The first input is the selector; VSHUF_B takes it last.
+def : Pat<(loongarch_vshuf v16i8:$va, v16i8:$vj, v16i8:$vk),
+          (VSHUF_B v16i8:$vj, v16i8:$vk, v16i8:$va)>;
+def : Pat<(loongarch_vshuf v8i16:$vd, v8i16:$vj, v8i16:$vk),
+          (VSHUF_H v8i16:$vd, v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vshuf v4i32:$vd, v4i32:$vj, v4i32:$vk),
+          (VSHUF_W v4i32:$vd, v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vshuf v2i64:$vd, v2i64:$vj, v2i64:$vk),
+          (VSHUF_D v2i64:$vd, v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vshuf v4i32:$vd, v4f32:$vj, v4f32:$vk),
+          (VSHUF_W v4i32:$vd, v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vshuf v2i64:$vd, v2f64:$vj, v2f64:$vk),
+          (VSHUF_D v2i64:$vd, v2f64:$vj, v2f64:$vk)>;
+
+// VPICKEV_{B/H/W/D}. v4f32/v2f64 are selected to the integer W/D forms.
+def : Pat<(loongarch_vpickev v16i8:$vj, v16i8:$vk),
+          (VPICKEV_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vpickev v8i16:$vj, v8i16:$vk),
+          (VPICKEV_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vpickev v4i32:$vj, v4i32:$vk),
+          (VPICKEV_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vpickev v2i64:$vj, v2i64:$vk),
+          (VPICKEV_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vpickev v4f32:$vj, v4f32:$vk),
+          (VPICKEV_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vpickev v2f64:$vj, v2f64:$vk),
+          (VPICKEV_D v2f64:$vj, v2f64:$vk)>;
+
+// VPICKOD_{B/H/W/D}. v4f32/v2f64 are selected to the integer W/D forms.
+def : Pat<(loongarch_vpickod v16i8:$vj, v16i8:$vk),
+          (VPICKOD_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vpickod v8i16:$vj, v8i16:$vk),
+          (VPICKOD_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vpickod v4i32:$vj, v4i32:$vk),
+          (VPICKOD_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vpickod v2i64:$vj, v2i64:$vk),
+          (VPICKOD_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vpickod v4f32:$vj, v4f32:$vk),
+          (VPICKOD_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vpickod v2f64:$vj, v2f64:$vk),
+          (VPICKOD_D v2f64:$vj, v2f64:$vk)>;
+
+// VPACKEV_{B/H/W/D}. v4f32/v2f64 are selected to the integer W/D forms.
+def : Pat<(loongarch_vpackev v16i8:$vj, v16i8:$vk),
+          (VPACKEV_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vpackev v8i16:$vj, v8i16:$vk),
+          (VPACKEV_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vpackev v4i32:$vj, v4i32:$vk),
+          (VPACKEV_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vpackev v2i64:$vj, v2i64:$vk),
+          (VPACKEV_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vpackev v4f32:$vj, v4f32:$vk),
+          (VPACKEV_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vpackev v2f64:$vj, v2f64:$vk),
+          (VPACKEV_D v2f64:$vj, v2f64:$vk)>;
+
+// VPACKOD_{B/H/W/D}. v4f32/v2f64 are selected to the integer W/D forms.
+def : Pat<(loongarch_vpackod v16i8:$vj, v16i8:$vk),
+          (VPACKOD_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vpackod v8i16:$vj, v8i16:$vk),
+          (VPACKOD_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vpackod v4i32:$vj, v4i32:$vk),
+          (VPACKOD_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vpackod v2i64:$vj, v2i64:$vk),
+          (VPACKOD_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vpackod v4f32:$vj, v4f32:$vk),
+          (VPACKOD_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vpackod v2f64:$vj, v2f64:$vk),
+          (VPACKOD_D v2f64:$vj, v2f64:$vk)>;
+
+// VILVL_{B/H/W/D}. v4f32/v2f64 are selected to the integer W/D forms.
+def : Pat<(loongarch_vilvl v16i8:$vj, v16i8:$vk),
+          (VILVL_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vilvl v8i16:$vj, v8i16:$vk),
+          (VILVL_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vilvl v4i32:$vj, v4i32:$vk),
+          (VILVL_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vilvl v2i64:$vj, v2i64:$vk),
+          (VILVL_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vilvl v4f32:$vj, v4f32:$vk),
+          (VILVL_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vilvl v2f64:$vj, v2f64:$vk),
+          (VILVL_D v2f64:$vj, v2f64:$vk)>;
+
+// VILVH_{B/H/W/D}. v4f32/v2f64 are selected to the integer W/D forms.
+def : Pat<(loongarch_vilvh v16i8:$vj, v16i8:$vk),
+          (VILVH_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vilvh v8i16:$vj, v8i16:$vk),
+          (VILVH_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vilvh v4i32:$vj, v4i32:$vk),
+          (VILVH_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vilvh v2i64:$vj, v2i64:$vk),
+          (VILVH_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vilvh v4f32:$vj, v4f32:$vk),
+          (VILVH_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vilvh v2f64:$vj, v2f64:$vk),
+          (VILVH_D v2f64:$vj, v2f64:$vk)>;
+
+// VSHUF4I_{B/H/W}. v4f32 is selected to VSHUF4I_W.
+def : Pat<(loongarch_vshuf4i v16i8:$vj, immZExt8:$ui8),
+          (VSHUF4I_B v16i8:$vj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v8i16:$vj, immZExt8:$ui8),
+          (VSHUF4I_H v8i16:$vj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v4i32:$vj, immZExt8:$ui8),
+          (VSHUF4I_W v4i32:$vj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v4f32:$vj, immZExt8:$ui8),
+          (VSHUF4I_W v4f32:$vj, immZExt8:$ui8)>;
+
+// VREPLVEI_{B/H/W/D}. The index immediate shrinks from 4 bits (B) to 1 (D).
+def : Pat<(loongarch_vreplvei v16i8:$vj, immZExt4:$ui4),
+          (VREPLVEI_B v16i8:$vj, immZExt4:$ui4)>;
+def : Pat<(loongarch_vreplvei v8i16:$vj, immZExt3:$ui3),
+          (VREPLVEI_H v8i16:$vj, immZExt3:$ui3)>;
+def : Pat<(loongarch_vreplvei v4i32:$vj, immZExt2:$ui2),
+          (VREPLVEI_W v4i32:$vj, immZExt2:$ui2)>;
+def : Pat<(loongarch_vreplvei v2i64:$vj, immZExt1:$ui1),
+          (VREPLVEI_D v2i64:$vj, immZExt1:$ui1)>;
+def : Pat<(loongarch_vreplvei v4f32:$vj, immZExt2:$ui2),
+          (VREPLVEI_W v4f32:$vj, immZExt2:$ui2)>;
+def : Pat<(loongarch_vreplvei v2f64:$vj, immZExt1:$ui1),
+          (VREPLVEI_D v2f64:$vj, immZExt1:$ui1)>;
+
 // VREPLVEI_{W/D}
 def : Pat<(lsxsplatf32 FPR32:$fj),
           (VREPLVEI_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), 0)>;
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll
new file mode 100644
index 000000000000..22ab19b9fa44
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xvilvl.b: interleave the low 8 bytes of each 128-bit half of %a and %b.
+define <32 x i8> @shufflevector_xvilvl_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_xvilvl_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvl.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39,
+                                                               i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+    ret <32 x i8> %c
+}
+
+;; xvilvl.h: interleave the low 4 halfwords of each 128-bit half of %a and %b.
+define <16 x i16> @shufflevector_xvilvl_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_xvilvl_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvl.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+    ret <16 x i16> %c
+}
+
+;; xvilvl.w: interleave the low 2 words of each 128-bit half of %a and %b.
+define <8 x i32> @shufflevector_xvilvl_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_xvilvl_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvl.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+    ret <8 x i32> %c
+}
+
+;; xvilvh.b: interleave the high 8 bytes of each 128-bit half of %a and %b.
+define <32 x i8> @shufflevector_xvilvh_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_xvilvh_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvh.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47,
+                                                               i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+    ret <32 x i8> %c
+}
+
+;; xvilvh.h: interleave the high 4 halfwords of each 128-bit half of %a and %b.
+define <16 x i16> @shufflevector_xvilvh_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_xvilvh_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvh.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+    ret <16 x i16> %c
+}
+
+;; xvilvh.w: interleave the high 2 words of each 128-bit half of %a and %b.
+define <8 x i32> @shufflevector_xvilvh_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_xvilvh_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvh.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+    ret <8 x i32> %c
+}
+
+;; xvilvh.w (float): interleave the high 2 elements of each 128-bit half of %a and %b.
+define <8 x float> @shufflevector_xvilvh_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_xvilvh_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvh.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+    ret <8 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll
new file mode 100644
index 000000000000..2ff9af4069b9
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xvpackev.b: interleave the even-indexed bytes of %a and %b.
+define <32 x i8> @shufflevector_pack_ev_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46,
+                                                               i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+    ret <32 x i8> %c
+}
+
+;; xvpackev.h: interleave the even-indexed halfwords of %a and %b.
+define <16 x i16> @shufflevector_pack_ev_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+    ret <16 x i16> %c
+}
+
+;; xvpackev.w: interleave the even-indexed words of %a and %b.
+define <8 x i32> @shufflevector_pack_ev_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+    ret <8 x i32> %c
+}
+
+;; xvpickev.d/xvpackev.d/xvilvl.d (presumably all fit this mask); xvpackev.d is checked.
+define <4 x i64> @shufflevector_pack_ev_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+    ret <4 x i64> %c
+}
+
+;; xvpackev.w (float): interleave the even-indexed elements of %a and %b.
+define <8 x float> @shufflevector_pack_ev_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+    ret <8 x float> %c
+}
+
+;; xvpickev.d/xvpackev.d/xvilvl.d (presumably all fit this mask); xvpackev.d is checked.
+define <4 x double> @shufflevector_pack_ev_v4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+    ret <4 x double> %c
+}
+
+;; xvpackod.b: interleave the odd-indexed bytes of %a and %b.
+define <32 x i8> @shufflevector_pack_od_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 33, i32 3, i32 35, i32 5, i32 37, i32 7, i32 39, i32 9, i32 41, i32 11, i32 43, i32 13, i32 45, i32 15, i32 47,
+                                                              i32 17, i32 49, i32 19, i32 51, i32 21, i32 53, i32 23, i32 55, i32 25, i32 57, i32 27, i32 59, i32 29, i32 61, i32 31, i32 63>
+    ret <32 x i8> %c
+}
+
+;; xvpackod.h: interleave the odd-indexed halfwords of %a and %b.
+define <16 x i16> @shufflevector_pack_od_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+    ret <16 x i16> %c
+}
+
+;; xvpackod.w: interleave the odd-indexed words of %a and %b.
+define <8 x i32> @shufflevector_pack_od_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+    ret <8 x i32> %c
+}
+
+;; xvpickod.d/xvpackod.d/xvilvh.d; NOTE(review): "shufflodector" below looks like a typo of "shufflevector".
+define <4 x i64> @shufflodector_pack_od_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+    ret <4 x i64> %c
+}
+
+;; xvpackod.w (float); NOTE(review): "shufflodector" below looks like a typo of "shufflevector".
+define <8 x float> @shufflodector_pack_od_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+    ret <8 x float> %c
+}
+
+;; xvpickod.d/xvpackod.d/xvilvh.d; NOTE(review): "shufflodector" below looks like a typo of "shufflevector".
+define <4 x double> @shufflodector_pack_od_v4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+    ret <4 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll
new file mode 100644
index 000000000000..294d292d1764
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xvpickev.b
+define <32 x i8> @shufflevector_pick_ev_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickev.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46,
+                                                               i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+    ret <32 x i8> %c
+}
+
+;; xvpickev.h
+define <16 x i16> @shufflevector_pick_ev_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickev.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+    ret <16 x i16> %c
+}
+
+;; xvpickev.w
+define <8 x i32> @shufflevector_pick_ev_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickev.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+    ret <8 x i32> %c
+}
+
+;; xvpickev.w
+define <8 x float> @shufflevector_pick_ev_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickev.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+    ret <8 x float> %c
+}
+
+;; xvpickod.b
+define <32 x i8> @shufflevector_pick_od_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickod.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47,
+                                                               i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
+    ret <32 x i8> %c
+}
+
+;; xvpickod.h
+define <16 x i16> @shufflevector_pick_od_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickod.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+    ret <16 x i16> %c
+}
+
+;; xvpickod.w
+define <8 x i32> @shufflevector_pick_od_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickod.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+    ret <8 x i32> %c
+}
+
+;; xvpickod.w
+define <8 x float> @shufflodector_pick_od_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflodector_pick_od_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickod.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+    ret <8 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll
new file mode 100644
index 000000000000..dce1e4b777e2
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xvrepl128vei.b
+define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvrepl128vei.b $xr0, $xr0, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
+                                                               i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
+    ret <32 x i8> %c
+}
+
+;; xvrepl128vei.h
+define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvrepl128vei.h $xr0, $xr0, 3
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+                                                                 i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+    ret <16 x i16> %c
+}
+
+;; xvrepl128vei.w
+define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 78
+; CHECK-NEXT:    xvrepl128vei.w $xr0, $xr0, 3
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 3, i32 3, i32 3, i32 3>
+    ret <8 x i32> %c
+}
+
+;; xvrepl128vei.d
+define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: shufflevector_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvrepl128vei.d $xr0, $xr0, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+    ret <4 x i64> %c
+}
+
+;; xvrepl128vei.w
+define <8 x float> @shufflevector_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvrepl128vei.w $xr0, $xr0, 3
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
+    ret <8 x float> %c
+}
+
+;; xvrepl128vei.d
+define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: shufflevector_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvrepl128vei.d $xr0, $xr1, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 5, i32 7, i32 7>
+    ret <4 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll
new file mode 100644
index 000000000000..fce32647da3d
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xvshuf.b
+define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI0_0)
+; CHECK-NEXT:    xvld $xr2, $a0, 0
+; CHECK-NEXT:    xvshuf.b $xr0, $xr1, $xr0, $xr2
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39,
+                                                               i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+    ret <32 x i8> %c
+}
+
+;; xvshuf.h
+define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 78
+; CHECK-NEXT:    xvpermi.d $xr1, $xr1, 78
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI1_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI1_0)
+; CHECK-NEXT:    xvld $xr0, $a0, 0
+; CHECK-NEXT:    xvshuf.h $xr0, $xr1, $xr2
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 27, i32 26, i32 25, i32 24,
+                                                                 i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 2, i32 3>
+    ret <16 x i16> %c
+}
+
+;; xvshuf.w
+define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 68
+; CHECK-NEXT:    xvpermi.d $xr1, $xr1, 68
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI2_0)
+; CHECK-NEXT:    xvld $xr0, $a0, 0
+; CHECK-NEXT:    xvshuf.w $xr0, $xr1, $xr2
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 9, i32 3, i32 2, i32 8, i32 9, i32 3, i32 2>
+    ret <8 x i32> %c
+}
+
+;; xvshuf.d
+define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: shufflevector_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 238
+; CHECK-NEXT:    xvpermi.d $xr1, $xr1, 238
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI3_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI3_0)
+; CHECK-NEXT:    xvld $xr0, $a0, 0
+; CHECK-NEXT:    xvshuf.d $xr0, $xr1, $xr2
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+    ret <4 x i64> %c
+}
+
+;; xvshuf.w
+define <8 x float> @shufflevector_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI4_0)
+; CHECK-NEXT:    xvld $xr2, $a0, 0
+; CHECK-NEXT:    xvshuf.w $xr2, $xr1, $xr0
+; CHECK-NEXT:    xvori.b $xr0, $xr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 0, i32 10, i32 9, i32 4, i32 5, i32 12, i32 13>
+    ret <8 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll
new file mode 100644
index 000000000000..dc4532a7292a
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xvshuf4i.b
+define <32 x i8> @shufflevector_xvshuf4i_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_xvshuf4i_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvshuf4i.b $xr0, $xr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12,
+                                                               i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 26, i32 25, i32 24, i32 31, i32 30, i32 29, i32 28>
+    ret <32 x i8> %c
+}
+
+;; xvshuf4i.h
+define <16 x i16> @shufflevector_xvshuf4i_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_xvshuf4i_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvshuf4i.h $xr0, $xr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+    ret <16 x i16> %c
+}
+
+;; xvshuf4i.w
+define <8 x i32> @shufflevector_xvshuf4i_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_xvshuf4i_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvshuf4i.w $xr0, $xr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+    ret <8 x i32> %c
+}
+
+;; xvshuf4i.w
+define <8 x float> @shufflevector_xvshuf4i_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_xvshuf4i_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvshuf4i.w $xr0, $xr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+    ret <8 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll
new file mode 100644
index 000000000000..31398c6081c0
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+;; vilvl.b
+define <16 x i8> @shufflevector_vilvl_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_vilvl_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvl.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+    ret <16 x i8> %c
+}
+
+;; vilvl.h
+define <8 x i16> @shufflevector_vilvl_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_vilvl_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvl.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+    ret <8 x i16> %c
+}
+
+;; vilvl.w
+define <4 x i32> @shufflevector_vilvl_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_vilvl_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvl.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+    ret <4 x i32> %c
+}
+
+;; vilvl.w
+define <4 x float> @shufflevector_vilvl_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_vilvl_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvl.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+    ret <4 x float> %c
+}
+
+;; vilvh.b
+define <16 x i8> @shufflevector_vilvh_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_vilvh_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvh.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+    ret <16 x i8> %c
+}
+
+;; vilvh.h
+define <8 x i16> @shufflevector_vilvh_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_vilvh_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+    ret <8 x i16> %c
+}
+
+;; vilvh.w
+define <4 x i32> @shufflevector_vilvh_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_vilvh_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+    ret <4 x i32> %c
+}
+
+;; vilvh.w
+define <4 x float> @shufflevector_vilvh_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_vilvh_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+    ret <4 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll
new file mode 100644
index 000000000000..171e68306cd1
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+;; vpackev.b
+define <16 x i8> @shufflevector_pack_ev_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+    ret <16 x i8> %c
+}
+
+;; vpackev.h
+define <8 x i16> @shufflevector_pack_ev_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+    ret <8 x i16> %c
+}
+
+;; vpackev.w
+define <4 x i32> @shufflevector_pack_ev_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+    ret <4 x i32> %c
+}
+
+;; vpickev.d/vpackev.d/vilvl.d
+define <2 x i64> @shufflevector_pack_ev_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.d $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+    ret <2 x i64> %c
+}
+
+;; vpackev.w
+define <4 x float> @shufflevector_pack_ev_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+    ret <4 x float> %c
+}
+
+;; vpickev.d/vpackev.d/vilvl.d
+define <2 x double> @shufflevector_pack_ev_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.d $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+    ret <2 x double> %c
+}
+
+;; vpackod.b
+define <16 x i8> @shufflevector_pack_od_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+    ret <16 x i8> %c
+}
+
+;; vpackod.h
+define <8 x i16> @shufflevector_pack_od_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+    ret <8 x i16> %c
+}
+
+;; vpackod.w
+define <4 x i32> @shufflevector_pack_od_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+    ret <4 x i32> %c
+}
+
+;; vpickod.d/vpackod.d/vilvh.d
+define <2 x i64> @shufflodector_pack_od_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.d $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+    ret <2 x i64> %c
+}
+
+;; vpackod.w
+define <4 x float> @shufflodector_pack_od_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+    ret <4 x float> %c
+}
+
+;; vpickod.d/vpackod.d/vilvh.d
+define <2 x double> @shufflodector_pack_od_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.d $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+    ret <2 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll
new file mode 100644
index 000000000000..ca636d942b58
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+;; vpickev.b
+define <16 x i8> @shufflevector_pick_ev_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickev.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+    ret <16 x i8> %c
+}
+
+;; vpickev.h
+define <8 x i16> @shufflevector_pick_ev_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickev.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+    ret <8 x i16> %c
+}
+
+;; vpickev.w
+define <4 x i32> @shufflevector_pick_ev_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickev.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+    ret <4 x i32> %c
+}
+
+;; vpickev.w
+define <4 x float> @shufflevector_pick_ev_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickev.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+    ret <4 x float> %c
+}
+
+;; vpickod.b
+define <16 x i8> @shufflevector_pick_od_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickod.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+    ret <16 x i8> %c
+}
+
+;; vpickod.h
+define <8 x i16> @shufflevector_pick_od_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickod.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+    ret <8 x i16> %c
+}
+
+;; vpickod.w
+define <4 x i32> @shufflevector_pick_od_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickod.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+    ret <4 x i32> %c
+}
+
+;; vpickod.w
+define <4 x float> @shufflodector_pick_od_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflodector_pick_od_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickod.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+    ret <4 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll
new file mode 100644
index 000000000000..10510786f321
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+;; vreplvei.b
+define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.b $vr0, $vr0, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+    ret <16 x i8> %c
+}
+
+;; vreplvei.h
+define <8 x i16> @shufflevector_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.h $vr0, $vr1, 2
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+    ret <8 x i16> %c
+}
+
+;; vreplvei.w
+define <4 x i32> @shufflevector_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+    ret <4 x i32> %c
+}
+
+;; vreplvei.d
+define <2 x i64> @shufflevector_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: shufflevector_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 1>
+    ret <2 x i64> %c
+}
+
+;; vreplvei.w
+define <4 x float> @shufflevector_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+    ret <4 x float> %c
+}
+
+;; vreplvei.d
+define <2 x double> @shufflevector_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: shufflevector_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
+    ret <2 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll
new file mode 100644
index 000000000000..55800b31446b
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI0_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.b $vr0, $vr1, $vr0, $vr2
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15, i32 2, i32 4, i32 6, i32 8, i32 25, i32 30, i32 31, i32 31>
+    ret <16 x i8> %c
+}
+
+;; vshuf.h
+define <8 x i16> @shufflevector_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI1_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI1_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.h $vr2, $vr1, $vr0
+; CHECK-NEXT:    vori.b $vr0, $vr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15>
+    ret <8 x i16> %c
+}
+
+;; vshuf.w
+define <4 x i32> @shufflevector_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI2_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.w $vr2, $vr1, $vr0
+; CHECK-NEXT:    vori.b $vr0, $vr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 5, i32 7>
+    ret <4 x i32> %c
+}
+
+;; vshuf.d
+define <2 x i64> @shufflevector_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: shufflevector_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI3_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI3_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.d $vr2, $vr1, $vr0
+; CHECK-NEXT:    vori.b $vr0, $vr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
+    ret <2 x i64> %c
+}
+
+;; vshuf.w
+define <4 x float> @shufflevector_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI4_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.w $vr2, $vr1, $vr0
+; CHECK-NEXT:    vori.b $vr0, $vr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 5, i32 7>
+    ret <4 x float> %c
+}
+
+;; vshuf.d
+define <2 x double> @shufflevector_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: shufflevector_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI5_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI5_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.d $vr2, $vr1, $vr0
+; CHECK-NEXT:    vori.b $vr0, $vr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+    ret <2 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll
new file mode 100644
index 000000000000..660b9581c3d1
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+;; vshuf4i.b
+define <16 x i8> @shufflevector_vshuf4i_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_vshuf4i_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+    ret <16 x i8> %c
+}
+
+;; vshuf4i.h
+define <8 x i16> @shufflevector_vshuf4i_v8i4(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_vshuf4i_v8i4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+    ret <8 x i16> %c
+}
+
+;; vshuf4i.w
+define <4 x i32> @shufflevector_vshuf4i_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_vshuf4i_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+    ret <4 x i32> %c
+}
+
+;; vshuf4i.w
+define <4 x float> @shufflevector_vshuf4i_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_vshuf4i_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+    ret <4 x float> %c
+}
-- 
Gitee


From 17f537eeaef8db451c70fc56a921e5ff542f713b Mon Sep 17 00:00:00 2001
From: Ami-zhang <zhanglimin@loongson.cn>
Date: Tue, 23 Jul 2024 14:02:04 +0800
Subject: [PATCH 17/23] [LoongArch] Enable 128-bits vector by default (#100056)

This commit enables the 128-bit vector feature (LSX) by default, in order
to be consistent with GCC.

(cherry picked from commit b4ef0ba244899a64a1b1e6448eca942cfa5eda18)
---
 .../lib/Driver/ToolChains/Arch/LoongArch.cpp  | 76 +++++++++++--------
 .../test/Driver/loongarch-default-features.c  |  2 +-
 clang/test/Driver/loongarch-mlasx.c           |  6 +-
 clang/test/Driver/loongarch-msimd.c           |  4 +-
 clang/test/Driver/loongarch-msingle-float.c   |  4 +-
 clang/test/Driver/loongarch-msoft-float.c     |  4 +-
 clang/test/Preprocessor/init-loongarch.c      |  8 +-
 7 files changed, 60 insertions(+), 44 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index 8b3d2837a4e5..87d7b30ef5d3 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -127,6 +127,11 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
                                            const llvm::Triple &Triple,
                                            const ArgList &Args,
                                            std::vector<StringRef> &Features) {
+  // Enable the `lsx` feature on 64-bit LoongArch by default.
+  if (Triple.isLoongArch64() &&
+      (!Args.hasArgNoClaim(clang::driver::options::OPT_march_EQ)))
+    Features.push_back("+lsx");
+
   std::string ArchName;
   if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
     ArchName = A->getValue();
@@ -145,9 +150,11 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     } else if (A->getOption().matches(options::OPT_msingle_float)) {
       Features.push_back("+f");
       Features.push_back("-d");
+      Features.push_back("-lsx");
     } else /*Soft-float*/ {
       Features.push_back("-f");
       Features.push_back("-d");
+      Features.push_back("-lsx");
     }
   } else if (const Arg *A = Args.getLastArg(options::OPT_mfpu_EQ)) {
     StringRef FPU = A->getValue();
@@ -157,9 +164,11 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     } else if (FPU == "32") {
       Features.push_back("+f");
       Features.push_back("-d");
+      Features.push_back("-lsx");
     } else if (FPU == "0" || FPU == "none") {
       Features.push_back("-f");
       Features.push_back("-d");
+      Features.push_back("-lsx");
     } else {
       D.Diag(diag::err_drv_loongarch_invalid_mfpu_EQ) << FPU;
     }
@@ -175,6 +184,42 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     A->ignoreTargetSpecific();
   if (Arg *A = Args.getLastArgNoClaim(options::OPT_mfpu_EQ))
     A->ignoreTargetSpecific();
+  if (Arg *A = Args.getLastArgNoClaim(options::OPT_msimd_EQ))
+    A->ignoreTargetSpecific();
+
+  // Select lsx/lasx feature determined by -msimd=.
+  // Option -msimd= precedes -m[no-]lsx and -m[no-]lasx.
+  if (const Arg *A = Args.getLastArg(options::OPT_msimd_EQ)) {
+    StringRef MSIMD = A->getValue();
+    if (MSIMD == "lsx") {
+      // Option -msimd=lsx depends on 64-bit FPU.
+      // -m*-float and -mfpu=none/0/32 conflict with -msimd=lsx.
+      if (llvm::find(Features, "-d") != Features.end())
+        D.Diag(diag::err_drv_loongarch_wrong_fpu_width_for_lsx);
+      else
+        Features.push_back("+lsx");
+    } else if (MSIMD == "lasx") {
+      // Option -msimd=lasx depends on 64-bit FPU and LSX.
+      // -m*-float, -mfpu=none/0/32 and -mno-lsx conflict with -msimd=lasx.
+      if (llvm::find(Features, "-d") != Features.end())
+        D.Diag(diag::err_drv_loongarch_wrong_fpu_width_for_lasx);
+      else if (llvm::find(Features, "-lsx") != Features.end())
+        D.Diag(diag::err_drv_loongarch_invalid_simd_option_combination);
+
+      // The command options do not contain -mno-lasx.
+      if (!Args.getLastArg(options::OPT_mno_lasx)) {
+        Features.push_back("+lsx");
+        Features.push_back("+lasx");
+      }
+    } else if (MSIMD == "none") {
+      if (llvm::find(Features, "+lsx") != Features.end())
+        Features.push_back("-lsx");
+      if (llvm::find(Features, "+lasx") != Features.end())
+        Features.push_back("-lasx");
+    } else {
+      D.Diag(diag::err_drv_loongarch_invalid_msimd_EQ) << MSIMD;
+    }
+  }
 
   // Select lsx feature determined by -m[no-]lsx.
   if (const Arg *A = Args.getLastArg(options::OPT_mlsx, options::OPT_mno_lsx)) {
@@ -198,8 +243,6 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     if (A->getOption().matches(options::OPT_mlasx)) {
       if (llvm::find(Features, "-d") != Features.end())
         D.Diag(diag::err_drv_loongarch_wrong_fpu_width_for_lasx);
-      else if (llvm::find(Features, "-lsx") != Features.end())
-        D.Diag(diag::err_drv_loongarch_invalid_simd_option_combination);
       else { /*-mlasx*/
         Features.push_back("+lsx");
         Features.push_back("+lasx");
@@ -207,35 +250,6 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     } else /*-mno-lasx*/
       Features.push_back("-lasx");
   }
-
-  // Select lsx/lasx feature determined by -msimd=.
-  // Option -msimd= has lower priority than -m[no-]lsx and -m[no-]lasx.
-  if (const Arg *A = Args.getLastArg(options::OPT_msimd_EQ)) {
-    StringRef MSIMD = A->getValue();
-    if (MSIMD == "lsx") {
-      // Option -msimd=lsx depends on 64-bit FPU.
-      // -m*-float and -mfpu=none/0/32 conflict with -mlsx.
-      if (llvm::find(Features, "-d") != Features.end())
-        D.Diag(diag::err_drv_loongarch_wrong_fpu_width_for_lsx);
-      // The previous option does not contain feature -lsx.
-      else if (llvm::find(Features, "-lsx") == Features.end())
-        Features.push_back("+lsx");
-    } else if (MSIMD == "lasx") {
-      // Option -msimd=lasx depends on 64-bit FPU and LSX.
-      // -m*-float and -mfpu=none/0/32 conflict with -mlsx.
-      if (llvm::find(Features, "-d") != Features.end())
-        D.Diag(diag::err_drv_loongarch_wrong_fpu_width_for_lasx);
-      else if (llvm::find(Features, "-lsx") != Features.end())
-        D.Diag(diag::err_drv_loongarch_invalid_simd_option_combination);
-      // The previous option does not contain feature -lasx.
-      else if (llvm::find(Features, "-lasx") == Features.end()) {
-        Features.push_back("+lsx");
-        Features.push_back("+lasx");
-      }
-    } else if (MSIMD != "none") {
-      D.Diag(diag::err_drv_loongarch_invalid_msimd_EQ) << MSIMD;
-    }
-  }
 }
 
 std::string loongarch::postProcessTargetCPUString(const std::string &CPU,
diff --git a/clang/test/Driver/loongarch-default-features.c b/clang/test/Driver/loongarch-default-features.c
index 3cdf3ba3d23e..90634bbcf003 100644
--- a/clang/test/Driver/loongarch-default-features.c
+++ b/clang/test/Driver/loongarch-default-features.c
@@ -2,7 +2,7 @@
 // RUN: %clang --target=loongarch64 -S -emit-llvm %s -o - | FileCheck %s --check-prefix=LA64
 
 // LA32: "target-features"="+32bit"
-// LA64: "target-features"="+64bit,+d,+f,+ual"
+// LA64: "target-features"="+64bit,+d,+f,+lsx,+ual"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Driver/loongarch-mlasx.c b/clang/test/Driver/loongarch-mlasx.c
index 0b934f125c9e..87634ff5a9a4 100644
--- a/clang/test/Driver/loongarch-mlasx.c
+++ b/clang/test/Driver/loongarch-mlasx.c
@@ -5,7 +5,7 @@
 // RUN: %clang --target=loongarch64 -mno-lasx -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-NOLASX
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-NOLASX
+// RUN:   FileCheck %s --check-prefix=CC1-LSX
 // RUN: %clang --target=loongarch64 -mno-lasx -mlasx -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-LASX
 // RUN: %clang --target=loongarch64 -mlsx -mlasx -fsyntax-only %s -### 2>&1 | \
@@ -18,7 +18,7 @@
 // RUN: %clang --target=loongarch64 -mno-lasx -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-NOLASX
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-NOLASX
+// RUN:   FileCheck %s --check-prefix=IR-LSX
 // RUN: %clang --target=loongarch64 -mno-lasx -mlasx -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LASX
 // RUN: %clang --target=loongarch64 -mlsx -mlasx -S -emit-llvm %s -o - | \
@@ -26,9 +26,11 @@
 // RUN: %clang --target=loongarch64 -mlasx -mlsx -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LASX
 
+// CC1-LSX: "-target-feature" "+lsx"
 // CC1-LASX: "-target-feature" "+lsx" "-target-feature" "+lasx"
 // CC1-NOLASX: "-target-feature" "-lasx"
 
+// IR-LSX: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+lsx{{(,.*)?}}"
 // IR-LASX: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+lasx{{(,.*)?}}"
 // IR-NOLASX: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-lasx{{(,.*)?}}"
 
diff --git a/clang/test/Driver/loongarch-msimd.c b/clang/test/Driver/loongarch-msimd.c
index cd463300c874..49d298e1b2e3 100644
--- a/clang/test/Driver/loongarch-msimd.c
+++ b/clang/test/Driver/loongarch-msimd.c
@@ -75,9 +75,9 @@
 // RUN:   FileCheck %s --check-prefixes=LSX,LASX
 
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
 // RUN: %clang --target=loongarch64 -mno-lasx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
 
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
diff --git a/clang/test/Driver/loongarch-msingle-float.c b/clang/test/Driver/loongarch-msingle-float.c
index bd9b3e8a8c01..4eb0865b53a5 100644
--- a/clang/test/Driver/loongarch-msingle-float.c
+++ b/clang/test/Driver/loongarch-msingle-float.c
@@ -11,10 +11,10 @@
 // WARN: warning: ignoring '-mabi=lp64s' as it conflicts with that implied by '-msingle-float' (lp64f)
 // WARN: warning: ignoring '-mfpu=64' as it conflicts with that implied by '-msingle-float' (32)
 
-// CC1: "-target-feature" "+f"{{.*}} "-target-feature" "-d"
+// CC1: "-target-feature" "+f"{{.*}} "-target-feature" "-d" "-target-feature" "-lsx"
 // CC1: "-target-abi" "lp64f"
 
-// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+f,{{(.*,)?}}-d"
+// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+f,{{(.*,)?}}-d,-lsx"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Driver/loongarch-msoft-float.c b/clang/test/Driver/loongarch-msoft-float.c
index 0e5121ac84b4..ebf27fb00e30 100644
--- a/clang/test/Driver/loongarch-msoft-float.c
+++ b/clang/test/Driver/loongarch-msoft-float.c
@@ -11,10 +11,10 @@
 // WARN: warning: ignoring '-mabi=lp64d' as it conflicts with that implied by '-msoft-float' (lp64s)
 // WARN: warning: ignoring '-mfpu=64' as it conflicts with that implied by '-msoft-float' (0)
 
-// CC1: "-target-feature" "-f"{{.*}} "-target-feature" "-d"
+// CC1: "-target-feature" "-f"{{.*}} "-target-feature" "-d" "-target-feature" "-lsx"
 // CC1: "-target-abi" "lp64s"
 
-// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-d,{{(.*,)?}}-f{{(,.*)?}}"
+// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-d,{{(.*,)?}}-f,-lsx"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c
index 154ad82e0f8c..635d029ce9d3 100644
--- a/clang/test/Preprocessor/init-loongarch.c
+++ b/clang/test/Preprocessor/init-loongarch.c
@@ -814,6 +814,8 @@
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
 // RUN: %clang --target=loongarch64 -mlsx -mno-lasx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
+// RUN: %clang --target=loongarch64 -mno-lasx -x c -E -dM %s -o - \
+// RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
 // RUN: %clang --target=loongarch64 -mno-lasx -mlsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
 // MLSX-NOT: #define __loongarch_asx
@@ -822,12 +824,12 @@
 
 // RUN: %clang --target=loongarch64 -mlasx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
-// RUN: %clang --target=loongarch64 -mno-lasx -mlasx -x c -E -dM %s -o - \
-// RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
 // RUN: %clang --target=loongarch64 -mlsx -mlasx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
 // RUN: %clang --target=loongarch64 -mlasx -mlsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
+// RUN: %clang --target=loongarch64 -mno-lasx -mlasx -x c -E -dM %s -o - \
+// RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
 // MLASX: #define __loongarch_asx 1
 // MLASX: #define __loongarch_simd_width 256
 // MLASX: #define __loongarch_sx 1
@@ -840,8 +842,6 @@
 // RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
 // RUN: %clang --target=loongarch64 -mno-lasx -mno-lsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
-// RUN: %clang --target=loongarch64 -mno-lasx -x c -E -dM %s -o - \
-// RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
 // MNO-LSX-NOT: #define __loongarch_asx
 // MNO-LSX-NOT: #define __loongarch_simd_width
 // MNO-LSX-NOT: #define __loongarch_sx
-- 
Gitee


From cac0cc4649362e0b80f61e45aec54341f40f7f77 Mon Sep 17 00:00:00 2001
From: Ami-zhang <zhanglimin@loongson.cn>
Date: Wed, 17 Jan 2024 11:15:05 +0800
Subject: [PATCH 18/23] [LoongArch] Add LoongArch V1.1 instructions definitions
 and MC tests (#78238)

LoongArch V1.1 instructions include floating-point approximate reciprocal
instructions and atomic instructions. Test cases for these instructions are
added as well.

(cherry picked from commit 84bdee2875da364be7eb2144b1ae530f6a05f0e2)
---
 .../LoongArch/LoongArchFloat32InstrInfo.td    |  2 +
 .../LoongArch/LoongArchFloat64InstrInfo.td    |  2 +
 .../Target/LoongArch/LoongArchInstrInfo.td    | 34 ++++++-
 .../LoongArch/LoongArchLASXInstrInfo.td       |  4 +
 .../Target/LoongArch/LoongArchLSXInstrInfo.td |  4 +
 llvm/test/MC/LoongArch/Basic/Float/d-arith.s  |  8 ++
 llvm/test/MC/LoongArch/Basic/Float/f-arith.s  |  8 ++
 llvm/test/MC/LoongArch/Basic/Integer/atomic.s | 92 +++++++++++++++++++
 llvm/test/MC/LoongArch/lasx/frecip.s          |  8 ++
 llvm/test/MC/LoongArch/lasx/frsqrt.s          |  8 ++
 llvm/test/MC/LoongArch/lsx/frecip.s           |  8 ++
 llvm/test/MC/LoongArch/lsx/frsqrt.s           |  8 ++
 12 files changed, 184 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
index 65120c083f49..f30837912e75 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
@@ -50,6 +50,8 @@ def FNEG_S   : FP_ALU_2R<0x01141400>;
 def FSQRT_S  : FP_ALU_2R<0x01144400>;
 def FRECIP_S : FP_ALU_2R<0x01145400>;
 def FRSQRT_S : FP_ALU_2R<0x01146400>;
+def FRECIPE_S : FP_ALU_2R<0x01147400>;
+def FRSQRTE_S : FP_ALU_2R<0x01148400>;
 def FSCALEB_S : FP_ALU_3R<0x01108000>;
 def FLOGB_S   : FP_ALU_2R<0x01142400>;
 def FCOPYSIGN_S : FP_ALU_3R<0x01128000>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
index 437c1e4d7be2..0ea4c564b045 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
@@ -34,6 +34,8 @@ def FNEG_D   : FP_ALU_2R<0x01141800, FPR64>;
 def FSQRT_D  : FP_ALU_2R<0x01144800, FPR64>;
 def FRECIP_D : FP_ALU_2R<0x01145800, FPR64>;
 def FRSQRT_D : FP_ALU_2R<0x01146800, FPR64>;
+def FRECIPE_D : FP_ALU_2R<0x01147800, FPR64>;
+def FRSQRTE_D : FP_ALU_2R<0x01148800, FPR64>;
 def FSCALEB_D : FP_ALU_3R<0x01110000, FPR64>;
 def FLOGB_D   : FP_ALU_2R<0x01142800, FPR64>;
 def FCOPYSIGN_D : FP_ALU_3R<0x01130000, FPR64>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index ecd0c2b71b85..756c460f916b 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -634,15 +634,24 @@ class AM_3R<bits<32> op>
     : Fmt3R<op, (outs GPR:$rd), (ins GPR:$rk, GPRMemAtomic:$rj),
             "$rd, $rk, $rj">;
 
-let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
 class LLBase<bits<32> op>
     : Fmt2RI14<op, (outs GPR:$rd), (ins GPR:$rj, simm14_lsl2:$imm14),
                "$rd, $rj, $imm14">;
+class LLBase_ACQ<bits<32> op>
+    : Fmt2R<op, (outs GPR:$rd), (ins GPR:$rj), "$rd, $rj">;
+}
 
-let hasSideEffects = 0, mayLoad = 0, mayStore = 1, Constraints = "$rd = $dst" in
+let hasSideEffects = 0, mayLoad = 0, mayStore = 1, Constraints = "$rd = $dst" in {
 class SCBase<bits<32> op>
     : Fmt2RI14<op, (outs GPR:$dst), (ins GPR:$rd, GPR:$rj, simm14_lsl2:$imm14),
                "$rd, $rj, $imm14">;
+class SCBase_128<bits<32> op>
+    : Fmt3R<op, (outs GPR:$dst), (ins GPR:$rd, GPR:$rk, GPR:$rj),
+               "$rd, $rk, $rj">;
+class SCBase_REL<bits<32> op>
+    : Fmt2R<op, (outs GPR:$dst), (ins GPR:$rd, GPR:$rj), "$rd, $rj">;
+}
 
 let hasSideEffects = 1 in
 class IOCSRRD<bits<32> op>
@@ -754,6 +763,8 @@ def PRELD : FmtPRELD<(outs), (ins uimm5:$imm5, GPR:$rj, simm12:$imm12),
 // Atomic Memory Access Instructions
 def LL_W : LLBase<0x20000000>;
 def SC_W : SCBase<0x21000000>;
+def LLACQ_W : LLBase_ACQ<0x38578000>;
+def SCREL_W : SCBase_REL<0x38578400>;
 
 // Barrier Instructions
 def DBAR : MISC_I15<0x38720000>;
@@ -875,8 +886,12 @@ def STLE_W : STORE_3R<0x387f0000>;
 def STLE_D : STORE_3R<0x387f8000>;
 
 // Atomic Memory Access Instructions for 64-bits
+def AMSWAP_B     : AM_3R<0x385c0000>;
+def AMSWAP_H     : AM_3R<0x385c8000>;
 def AMSWAP_W     : AM_3R<0x38600000>;
 def AMSWAP_D     : AM_3R<0x38608000>;
+def AMADD_B      : AM_3R<0x385d0000>;
+def AMADD_H      : AM_3R<0x385d8000>;
 def AMADD_W      : AM_3R<0x38610000>;
 def AMADD_D      : AM_3R<0x38618000>;
 def AMAND_W      : AM_3R<0x38620000>;
@@ -893,8 +908,12 @@ def AMMAX_WU     : AM_3R<0x38670000>;
 def AMMAX_DU     : AM_3R<0x38678000>;
 def AMMIN_WU     : AM_3R<0x38680000>;
 def AMMIN_DU     : AM_3R<0x38688000>;
+def AMSWAP__DB_B : AM_3R<0x385e0000>;
+def AMSWAP__DB_H : AM_3R<0x385e8000>;
 def AMSWAP__DB_W : AM_3R<0x38690000>;
 def AMSWAP__DB_D : AM_3R<0x38698000>;
+def AMADD__DB_B  : AM_3R<0x385f0000>;
+def AMADD__DB_H  : AM_3R<0x385f8000>;
 def AMADD__DB_W  : AM_3R<0x386a0000>;
 def AMADD__DB_D  : AM_3R<0x386a8000>;
 def AMAND__DB_W  : AM_3R<0x386b0000>;
@@ -911,8 +930,19 @@ def AMMAX__DB_WU : AM_3R<0x38700000>;
 def AMMAX__DB_DU : AM_3R<0x38708000>;
 def AMMIN__DB_WU : AM_3R<0x38710000>;
 def AMMIN__DB_DU : AM_3R<0x38718000>;
+def AMCAS_B     : AM_3R<0x38580000>;
+def AMCAS_H     : AM_3R<0x38588000>;
+def AMCAS_W     : AM_3R<0x38590000>;
+def AMCAS_D     : AM_3R<0x38598000>;
+def AMCAS__DB_B     : AM_3R<0x385a0000>;
+def AMCAS__DB_H     : AM_3R<0x385a8000>;
+def AMCAS__DB_W     : AM_3R<0x385b0000>;
+def AMCAS__DB_D     : AM_3R<0x385b8000>;
 def LL_D : LLBase<0x22000000>;
 def SC_D : SCBase<0x23000000>;
+def SC_Q : SCBase_128<0x38570000>;
+def LLACQ_D : LLBase_ACQ<0x38578800>;
+def SCREL_D : SCBase_REL<0x38578C00>;
 
 // CRC Check Instructions
 def CRC_W_B_W  : ALU_3R<0x00240000>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 5b6721cdf1b4..454915ac8c0a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -773,6 +773,10 @@ def XVFRECIP_S : LASX2R_XX<0x769cf400>;
 def XVFRECIP_D : LASX2R_XX<0x769cf800>;
 def XVFRSQRT_S : LASX2R_XX<0x769d0400>;
 def XVFRSQRT_D : LASX2R_XX<0x769d0800>;
+def XVFRECIPE_S : LASX2R_XX<0x769d1400>;
+def XVFRECIPE_D : LASX2R_XX<0x769d1800>;
+def XVFRSQRTE_S : LASX2R_XX<0x769d2400>;
+def XVFRSQRTE_D : LASX2R_XX<0x769d2800>;
 
 def XVFCVTL_S_H : LASX2R_XX<0x769de800>;
 def XVFCVTH_S_H : LASX2R_XX<0x769dec00>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 3519fa3142c3..6d60d7074ec3 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -918,6 +918,10 @@ def VFRECIP_S : LSX2R_VV<0x729cf400>;
 def VFRECIP_D : LSX2R_VV<0x729cf800>;
 def VFRSQRT_S : LSX2R_VV<0x729d0400>;
 def VFRSQRT_D : LSX2R_VV<0x729d0800>;
+def VFRECIPE_S : LSX2R_VV<0x729d1400>;
+def VFRECIPE_D : LSX2R_VV<0x729d1800>;
+def VFRSQRTE_S : LSX2R_VV<0x729d2400>;
+def VFRSQRTE_D : LSX2R_VV<0x729d2800>;
 
 def VFCVTL_S_H : LSX2R_VV<0x729de800>;
 def VFCVTH_S_H : LSX2R_VV<0x729dec00>;
diff --git a/llvm/test/MC/LoongArch/Basic/Float/d-arith.s b/llvm/test/MC/LoongArch/Basic/Float/d-arith.s
index 6b2c67e9a2cc..8e19d2e34f3c 100644
--- a/llvm/test/MC/LoongArch/Basic/Float/d-arith.s
+++ b/llvm/test/MC/LoongArch/Basic/Float/d-arith.s
@@ -78,10 +78,18 @@ fsqrt.d $fa2, $ft3
 # ASM: encoding: [0x7b,0x5b,0x14,0x01]
 frecip.d $fs3, $fs3
 
+# ASM-AND-OBJ: frecipe.d $fa0, $fa0
+# ASM: encoding: [0x00,0x78,0x14,0x01]
+frecipe.d $fa0, $fa0
+
 # ASM-AND-OBJ: frsqrt.d $ft14, $fa3
 # ASM: encoding: [0x76,0x68,0x14,0x01]
 frsqrt.d $ft14, $fa3
 
+# ASM-AND-OBJ: frsqrte.d $fa1, $fa1
+# ASM: encoding: [0x21,0x88,0x14,0x01]
+frsqrte.d $fa1, $fa1
+
 # ASM-AND-OBJ: fscaleb.d $ft4, $ft6, $fs2
 # ASM: encoding: [0xcc,0x69,0x11,0x01]
 fscaleb.d $ft4, $ft6, $fs2
diff --git a/llvm/test/MC/LoongArch/Basic/Float/f-arith.s b/llvm/test/MC/LoongArch/Basic/Float/f-arith.s
index 155e783cf435..c32151adbf3b 100644
--- a/llvm/test/MC/LoongArch/Basic/Float/f-arith.s
+++ b/llvm/test/MC/LoongArch/Basic/Float/f-arith.s
@@ -73,10 +73,18 @@ fsqrt.s $fs3, $ft10
 # ASM: encoding: [0x71,0x57,0x14,0x01]
 frecip.s $ft9, $fs3
 
+# ASM-AND-OBJ: frecipe.s $fa0, $fa0
+# ASM: encoding: [0x00,0x74,0x14,0x01]
+frecipe.s $fa0, $fa0
+
 # ASM-AND-OBJ: frsqrt.s $fs1, $ft4
 # ASM: encoding: [0x99,0x65,0x14,0x01]
 frsqrt.s $fs1, $ft4
 
+# ASM-AND-OBJ: frsqrte.s $fa1, $fa1
+# ASM: encoding: [0x21,0x84,0x14,0x01]
+frsqrte.s $fa1, $fa1
+
 # ASM-AND-OBJ: fscaleb.s $ft13, $ft15, $fa6
 # ASM: encoding: [0xf5,0x9a,0x10,0x01]
 fscaleb.s $ft13, $ft15, $fa6
diff --git a/llvm/test/MC/LoongArch/Basic/Integer/atomic.s b/llvm/test/MC/LoongArch/Basic/Integer/atomic.s
index a35211db8851..69acdeef935c 100644
--- a/llvm/test/MC/LoongArch/Basic/Integer/atomic.s
+++ b/llvm/test/MC/LoongArch/Basic/Integer/atomic.s
@@ -21,6 +21,14 @@ ll.w $tp, $s4, 220
 # CHECK-ASM: encoding: [0xd3,0x39,0x00,0x21]
 sc.w $t7, $t2, 56
 
+# CHECK-ASM-AND-OBJ: llacq.w $t1, $t2
+# CHECK-ASM: encoding: [0xcd,0x81,0x57,0x38]
+llacq.w $t1, $t2
+
+# CHECK-ASM-AND-OBJ: screl.w $t1, $t2
+# CHECK-ASM: encoding: [0xcd,0x85,0x57,0x38]
+screl.w $t1, $t2
+
 
 
 #############################################################
@@ -29,6 +37,14 @@ sc.w $t7, $t2, 56
 
 .ifdef LA64
 
+# CHECK64-ASM-AND-OBJ: amswap.b $a2, $t0, $s1
+# CHECK64-ASM: encoding: [0x06,0x33,0x5c,0x38]
+amswap.b $a2, $t0, $s1, 0
+
+# CHECK64-ASM-AND-OBJ: amswap.h $a2, $t0, $s1
+# CHECK64-ASM: encoding: [0x06,0xb3,0x5c,0x38]
+amswap.h $a2, $t0, $s1, 0
+
 # CHECK64-ASM-AND-OBJ: amswap.w $a2, $t0, $s1
 # CHECK64-ASM: encoding: [0x06,0x33,0x60,0x38]
 amswap.w $a2, $t0, $s1, 0
@@ -41,6 +57,14 @@ amswap.w $zero, $t0, $zero
 # CHECK64-ASM: encoding: [0xa0,0x00,0x6a,0x38]
 amadd_db.w $zero, $zero, $a1
 
+# CHECK64-ASM-AND-OBJ: amswap.b $a2, $t0, $s1
+# CHECK64-ASM: encoding: [0x06,0x33,0x5c,0x38]
+amswap.b $a2, $t0, $s1
+
+# CHECK64-ASM-AND-OBJ: amswap.h $a2, $t0, $s1
+# CHECK64-ASM: encoding: [0x06,0xb3,0x5c,0x38]
+amswap.h $a2, $t0, $s1
+
 # CHECK64-ASM-AND-OBJ: amswap.w $a2, $t0, $s1
 # CHECK64-ASM: encoding: [0x06,0x33,0x60,0x38]
 amswap.w $a2, $t0, $s1
@@ -49,6 +73,14 @@ amswap.w $a2, $t0, $s1
 # CHECK64-ASM: encoding: [0xc2,0xba,0x60,0x38]
 amswap.d $tp, $t2, $fp
 
+# CHECK64-ASM-AND-OBJ: amadd.b $a4, $t0, $r21
+# CHECK64-ASM: encoding: [0xa8,0x32,0x5d,0x38]
+amadd.b $a4, $t0, $r21
+
+# CHECK64-ASM-AND-OBJ: amadd.h $a1, $t5, $s6
+# CHECK64-ASM: encoding: [0xa5,0xc7,0x5d,0x38]
+amadd.h $a1, $t5, $s6
+
 # CHECK64-ASM-AND-OBJ: amadd.w $a4, $t0, $r21
 # CHECK64-ASM: encoding: [0xa8,0x32,0x61,0x38]
 amadd.w $a4, $t0, $r21
@@ -113,6 +145,14 @@ ammin.wu $a4, $t6, $s7
 # CHECK64-ASM: encoding: [0x27,0xc3,0x68,0x38]
 ammin.du $a3, $t4, $s2
 
+# CHECK64-ASM-AND-OBJ: amswap_db.b $a2, $t0, $s1
+# CHECK64-ASM: encoding: [0x06,0x33,0x5e,0x38]
+amswap_db.b $a2, $t0, $s1
+
+# CHECK64-ASM-AND-OBJ: amswap_db.h $tp, $t2, $fp
+# CHECK64-ASM: encoding: [0xc2,0xba,0x5e,0x38]
+amswap_db.h $tp, $t2, $fp
+
 # CHECK64-ASM-AND-OBJ: amswap_db.w $a2, $t0, $s1
 # CHECK64-ASM: encoding: [0x06,0x33,0x69,0x38]
 amswap_db.w $a2, $t0, $s1
@@ -121,6 +161,14 @@ amswap_db.w $a2, $t0, $s1
 # CHECK64-ASM: encoding: [0xc2,0xba,0x69,0x38]
 amswap_db.d $tp, $t2, $fp
 
+# CHECK64-ASM-AND-OBJ: amadd_db.b $zero, $zero, $a1
+# CHECK64-ASM: encoding: [0xa0,0x00,0x5f,0x38]
+amadd_db.b $zero, $zero, $a1
+
+# CHECK64-ASM-AND-OBJ: amadd_db.h $a4, $t0, $r21
+# CHECK64-ASM: encoding: [0xa8,0xb2,0x5f,0x38]
+amadd_db.h $a4, $t0, $r21
+
 # CHECK64-ASM-AND-OBJ: amadd_db.w $a4, $t0, $r21
 # CHECK64-ASM: encoding: [0xa8,0x32,0x6a,0x38]
 amadd_db.w $a4, $t0, $r21
@@ -185,6 +233,38 @@ ammin_db.wu $a4, $t6, $s7
 # CHECK64-ASM: encoding: [0x27,0xc3,0x71,0x38]
 ammin_db.du $a3, $t4, $s2
 
+# CHECK64-ASM-AND-OBJ: amcas.b $t1, $t2, $t3
+# CHECK64-ASM: encoding: [0xed,0x39,0x58,0x38]
+amcas.b $t1, $t2, $t3
+
+# CHECK64-ASM-AND-OBJ: amcas.h $t1, $t2, $t3
+# CHECK64-ASM: encoding: [0xed,0xb9,0x58,0x38]
+amcas.h $t1, $t2, $t3
+
+# CHECK64-ASM-AND-OBJ: amcas.w $t1, $t2, $t3
+# CHECK64-ASM: encoding: [0xed,0x39,0x59,0x38]
+amcas.w $t1, $t2, $t3
+
+# CHECK64-ASM-AND-OBJ: amcas.d $t1, $t2, $t3
+# CHECK64-ASM: encoding: [0xed,0xb9,0x59,0x38]
+amcas.d $t1, $t2, $t3
+
+# CHECK64-ASM-AND-OBJ: amcas_db.b $t1, $t2, $t3
+# CHECK64-ASM: encoding: [0xed,0x39,0x5a,0x38]
+amcas_db.b $t1, $t2, $t3
+
+# CHECK64-ASM-AND-OBJ: amcas_db.h $t1, $t2, $t3
+# CHECK64-ASM: encoding: [0xed,0xb9,0x5a,0x38]
+amcas_db.h $t1, $t2, $t3
+
+# CHECK64-ASM-AND-OBJ: amcas_db.w $t1, $t2, $t3
+# CHECK64-ASM: encoding: [0xed,0x39,0x5b,0x38]
+amcas_db.w $t1, $t2, $t3
+
+# CHECK64-ASM-AND-OBJ: amcas_db.d $t1, $t2, $t3
+# CHECK64-ASM: encoding: [0xed,0xb9,0x5b,0x38]
+amcas_db.d $t1, $t2, $t3
+
 # CHECK64-ASM-AND-OBJ: ll.d $s2, $s4, 16
 # CHECK64-ASM: encoding: [0x79,0x13,0x00,0x22]
 ll.d $s2, $s4, 16
@@ -193,5 +273,17 @@ ll.d $s2, $s4, 16
 # CHECK64-ASM: encoding: [0x31,0xf6,0x00,0x23]
 sc.d $t5, $t5, 244
 
+# CHECK64-ASM-AND-OBJ: sc.q $t7, $t2, $t5
+# CHECK64-ASM: encoding: [0x33,0x3a,0x57,0x38]
+sc.q $t7, $t2, $t5
+
+# CHECK64-ASM-AND-OBJ: llacq.d $t1, $t2
+# CHECK64-ASM: encoding: [0xcd,0x89,0x57,0x38]
+llacq.d $t1, $t2
+
+# CHECK64-ASM-AND-OBJ: screl.d $t1, $t2
+# CHECK64-ASM: encoding: [0xcd,0x8d,0x57,0x38]
+screl.d $t1, $t2
+
 .endif
 
diff --git a/llvm/test/MC/LoongArch/lasx/frecip.s b/llvm/test/MC/LoongArch/lasx/frecip.s
index 1bb3ce02fb9c..e95b03a96eba 100644
--- a/llvm/test/MC/LoongArch/lasx/frecip.s
+++ b/llvm/test/MC/LoongArch/lasx/frecip.s
@@ -10,3 +10,11 @@ xvfrecip.s $xr3, $xr16
 xvfrecip.d $xr17, $xr24
 # CHECK-INST: xvfrecip.d $xr17, $xr24
 # CHECK-ENCODING: encoding: [0x11,0xfb,0x9c,0x76]
+
+xvfrecipe.s $xr3, $xr16
+# CHECK-INST: xvfrecipe.s $xr3, $xr16
+# CHECK-ENCODING: encoding: [0x03,0x16,0x9d,0x76]
+
+xvfrecipe.d $xr17, $xr24
+# CHECK-INST: xvfrecipe.d $xr17, $xr24
+# CHECK-ENCODING: encoding: [0x11,0x1b,0x9d,0x76]
diff --git a/llvm/test/MC/LoongArch/lasx/frsqrt.s b/llvm/test/MC/LoongArch/lasx/frsqrt.s
index af96e10832df..d1048f9ff8f0 100644
--- a/llvm/test/MC/LoongArch/lasx/frsqrt.s
+++ b/llvm/test/MC/LoongArch/lasx/frsqrt.s
@@ -10,3 +10,11 @@ xvfrsqrt.s $xr31, $xr25
 xvfrsqrt.d $xr14, $xr22
 # CHECK-INST: xvfrsqrt.d $xr14, $xr22
 # CHECK-ENCODING: encoding: [0xce,0x0a,0x9d,0x76]
+
+xvfrsqrte.s $xr31, $xr25
+# CHECK-INST: xvfrsqrte.s $xr31, $xr25
+# CHECK-ENCODING: encoding: [0x3f,0x27,0x9d,0x76]
+
+xvfrsqrte.d $xr14, $xr22
+# CHECK-INST: xvfrsqrte.d $xr14, $xr22
+# CHECK-ENCODING: encoding: [0xce,0x2a,0x9d,0x76]
diff --git a/llvm/test/MC/LoongArch/lsx/frecip.s b/llvm/test/MC/LoongArch/lsx/frecip.s
index d8c8278d1667..cd6d925e1470 100644
--- a/llvm/test/MC/LoongArch/lsx/frecip.s
+++ b/llvm/test/MC/LoongArch/lsx/frecip.s
@@ -10,3 +10,11 @@ vfrecip.s $vr29, $vr14
 vfrecip.d $vr24, $vr9
 # CHECK-INST: vfrecip.d $vr24, $vr9
 # CHECK-ENCODING: encoding: [0x38,0xf9,0x9c,0x72]
+
+vfrecipe.s $vr29, $vr14
+# CHECK-INST: vfrecipe.s $vr29, $vr14
+# CHECK-ENCODING: encoding: [0xdd,0x15,0x9d,0x72]
+
+vfrecipe.d $vr24, $vr9
+# CHECK-INST: vfrecipe.d $vr24, $vr9
+# CHECK-ENCODING: encoding: [0x38,0x19,0x9d,0x72]
diff --git a/llvm/test/MC/LoongArch/lsx/frsqrt.s b/llvm/test/MC/LoongArch/lsx/frsqrt.s
index 68b0cc091b8a..d8b9fc3d0684 100644
--- a/llvm/test/MC/LoongArch/lsx/frsqrt.s
+++ b/llvm/test/MC/LoongArch/lsx/frsqrt.s
@@ -10,3 +10,11 @@ vfrsqrt.s $vr19, $vr30
 vfrsqrt.d $vr1, $vr0
 # CHECK-INST: vfrsqrt.d $vr1, $vr0
 # CHECK-ENCODING: encoding: [0x01,0x08,0x9d,0x72]
+
+vfrsqrte.s $vr19, $vr30
+# CHECK-INST: vfrsqrte.s $vr19, $vr30
+# CHECK-ENCODING: encoding: [0xd3,0x27,0x9d,0x72]
+
+vfrsqrte.d $vr1, $vr0
+# CHECK-INST: vfrsqrte.d $vr1, $vr0
+# CHECK-ENCODING: encoding: [0x01,0x28,0x9d,0x72]
-- 
Gitee


From 57eaecf7bdb7a7502580076b365b4f70dde1185d Mon Sep 17 00:00:00 2001
From: Ami-zhang <zhanglimin@loongson.cn>
Date: Tue, 23 Jan 2024 14:24:58 +0800
Subject: [PATCH 19/23] [LoongArch] Add definitions and feature 'frecipe' for
 FP approximation intrinsics/builtins (#78962)

This PR adds definitions and the 'frecipe' feature for FP approximation
intrinsics/builtins. In addition, it adds and complements the related
test cases.

(cherry picked from commit fcb8342a219ada8ec641790a4c8a9f969d7d64ee)
---
 .../clang/Basic/BuiltinsLoongArchBase.def     |  5 +++
 .../clang/Basic/BuiltinsLoongArchLASX.def     |  6 +++
 .../clang/Basic/BuiltinsLoongArchLSX.def      |  6 +++
 clang/lib/Headers/larchintrin.h               | 12 +++++
 clang/lib/Headers/lasxintrin.h                | 24 ++++++++++
 clang/lib/Headers/lsxintrin.h                 | 24 ++++++++++
 .../LoongArch/builtin-dbl-approximate.c       | 45 +++++++++++++++++++
 .../LoongArch/builtin-flt-approximate.c       | 45 +++++++++++++++++++
 .../CodeGen/LoongArch/intrinsic-la64-error.c  | 21 +++++++++
 .../lasx/builtin-approximate-alias.c          | 37 +++++++++++++++
 .../LoongArch/lasx/builtin-approximate.c      | 38 ++++++++++++++++
 .../LoongArch/lsx/builtin-approximate-alias.c | 37 +++++++++++++++
 .../LoongArch/lsx/builtin-approximate.c       | 38 ++++++++++++++++
 llvm/include/llvm/IR/IntrinsicsLoongArch.td   | 13 ++++++
 llvm/lib/Target/LoongArch/LoongArch.td        |  7 +++
 .../LoongArch/LoongArchFloat32InstrInfo.td    |  6 +++
 .../LoongArch/LoongArchFloat64InstrInfo.td    |  6 +++
 .../LoongArch/LoongArchLASXInstrInfo.td       | 10 +++++
 .../Target/LoongArch/LoongArchLSXInstrInfo.td | 10 +++++
 .../lib/Target/LoongArch/LoongArchSubtarget.h |  2 +
 .../LoongArch/intrinsic-frecipe-dbl.ll        | 26 +++++++++++
 .../LoongArch/intrinsic-frecipe-flt.ll        | 26 +++++++++++
 .../LoongArch/lasx/intrinsic-frecipe.ll       | 26 +++++++++++
 .../LoongArch/lasx/intrinsic-frsqrte.ll       | 26 +++++++++++
 .../LoongArch/lsx/intrinsic-frecipe.ll        | 26 +++++++++++
 .../LoongArch/lsx/intrinsic-frsqrte.ll        | 26 +++++++++++
 26 files changed, 548 insertions(+)
 create mode 100644 clang/test/CodeGen/LoongArch/builtin-dbl-approximate.c
 create mode 100644 clang/test/CodeGen/LoongArch/builtin-flt-approximate.c
 create mode 100644 clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c
 create mode 100644 clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c
 create mode 100644 clang/test/CodeGen/LoongArch/lsx/builtin-approximate-alias.c
 create mode 100644 clang/test/CodeGen/LoongArch/lsx/builtin-approximate.c
 create mode 100644 llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll

diff --git a/clang/include/clang/Basic/BuiltinsLoongArchBase.def b/clang/include/clang/Basic/BuiltinsLoongArchBase.def
index cbb239223aae..a5a07c167908 100644
--- a/clang/include/clang/Basic/BuiltinsLoongArchBase.def
+++ b/clang/include/clang/Basic/BuiltinsLoongArchBase.def
@@ -51,3 +51,8 @@ TARGET_BUILTIN(__builtin_loongarch_iocsrwr_d, "vUWiUi", "nc", "64bit")
 
 TARGET_BUILTIN(__builtin_loongarch_lddir_d, "WiWiIUWi", "nc", "64bit")
 TARGET_BUILTIN(__builtin_loongarch_ldpte_d, "vWiIUWi", "nc", "64bit")
+
+TARGET_BUILTIN(__builtin_loongarch_frecipe_s, "ff", "nc", "f,frecipe")
+TARGET_BUILTIN(__builtin_loongarch_frecipe_d, "dd", "nc", "d,frecipe")
+TARGET_BUILTIN(__builtin_loongarch_frsqrte_s, "ff", "nc", "f,frecipe")
+TARGET_BUILTIN(__builtin_loongarch_frsqrte_d, "dd", "nc", "d,frecipe")
diff --git a/clang/include/clang/Basic/BuiltinsLoongArchLASX.def b/clang/include/clang/Basic/BuiltinsLoongArchLASX.def
index 3de200f665b6..4cf51cc000f6 100644
--- a/clang/include/clang/Basic/BuiltinsLoongArchLASX.def
+++ b/clang/include/clang/Basic/BuiltinsLoongArchLASX.def
@@ -657,9 +657,15 @@ TARGET_BUILTIN(__builtin_lasx_xvfsqrt_d, "V4dV4d", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvfrecip_s, "V8fV8f", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvfrecip_d, "V4dV4d", "nc", "lasx")
 
+TARGET_BUILTIN(__builtin_lasx_xvfrecipe_s, "V8fV8f", "nc", "lasx,frecipe")
+TARGET_BUILTIN(__builtin_lasx_xvfrecipe_d, "V4dV4d", "nc", "lasx,frecipe")
+
 TARGET_BUILTIN(__builtin_lasx_xvfrsqrt_s, "V8fV8f", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvfrsqrt_d, "V4dV4d", "nc", "lasx")
 
+TARGET_BUILTIN(__builtin_lasx_xvfrsqrte_s, "V8fV8f", "nc", "lasx,frecipe")
+TARGET_BUILTIN(__builtin_lasx_xvfrsqrte_d, "V4dV4d", "nc", "lasx,frecipe")
+
 TARGET_BUILTIN(__builtin_lasx_xvfcvtl_s_h, "V8fV16s", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvfcvth_s_h, "V8fV16s", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvfcvtl_d_s, "V4dV8f", "nc", "lasx")
diff --git a/clang/include/clang/Basic/BuiltinsLoongArchLSX.def b/clang/include/clang/Basic/BuiltinsLoongArchLSX.def
index 8e6aec886c50..c90f4dc5458f 100644
--- a/clang/include/clang/Basic/BuiltinsLoongArchLSX.def
+++ b/clang/include/clang/Basic/BuiltinsLoongArchLSX.def
@@ -641,9 +641,15 @@ TARGET_BUILTIN(__builtin_lsx_vfsqrt_d, "V2dV2d", "nc", "lsx")
 TARGET_BUILTIN(__builtin_lsx_vfrecip_s, "V4fV4f", "nc", "lsx")
 TARGET_BUILTIN(__builtin_lsx_vfrecip_d, "V2dV2d", "nc", "lsx")
 
+TARGET_BUILTIN(__builtin_lsx_vfrecipe_s, "V4fV4f", "nc", "lsx,frecipe")
+TARGET_BUILTIN(__builtin_lsx_vfrecipe_d, "V2dV2d", "nc", "lsx,frecipe")
+
 TARGET_BUILTIN(__builtin_lsx_vfrsqrt_s, "V4fV4f", "nc", "lsx")
 TARGET_BUILTIN(__builtin_lsx_vfrsqrt_d, "V2dV2d", "nc", "lsx")
 
+TARGET_BUILTIN(__builtin_lsx_vfrsqrte_s, "V4fV4f", "nc", "lsx,frecipe")
+TARGET_BUILTIN(__builtin_lsx_vfrsqrte_d, "V2dV2d", "nc", "lsx,frecipe")
+
 TARGET_BUILTIN(__builtin_lsx_vfcvtl_s_h, "V4fV8s", "nc", "lsx")
 TARGET_BUILTIN(__builtin_lsx_vfcvtl_d_s, "V2dV4f", "nc", "lsx")
 
diff --git a/clang/lib/Headers/larchintrin.h b/clang/lib/Headers/larchintrin.h
index 24dd29ce91ff..f4218295919a 100644
--- a/clang/lib/Headers/larchintrin.h
+++ b/clang/lib/Headers/larchintrin.h
@@ -228,6 +228,18 @@ extern __inline void
   ((void)__builtin_loongarch_ldpte_d((long int)(_1), (_2)))
 #endif
 
+#define __frecipe_s(/*float*/ _1)                                              \
+  ((float)__builtin_loongarch_frecipe_s((float)(_1)))
+
+#define __frecipe_d(/*double*/ _1)                                             \
+  ((double)__builtin_loongarch_frecipe_d((double)(_1)))
+
+#define __frsqrte_s(/*float*/ _1)                                              \
+  ((float)__builtin_loongarch_frsqrte_s((float)(_1)))
+
+#define __frsqrte_d(/*double*/ _1)                                             \
+  ((double)__builtin_loongarch_frsqrte_d((double)(_1)))
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/clang/lib/Headers/lasxintrin.h b/clang/lib/Headers/lasxintrin.h
index 6b4d5012a24b..dafc2a2f3e6a 100644
--- a/clang/lib/Headers/lasxintrin.h
+++ b/clang/lib/Headers/lasxintrin.h
@@ -1726,6 +1726,18 @@ extern __inline
   return (__m256d)__builtin_lasx_xvfrecip_d((v4f64)_1);
 }
 
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
+    __lasx_xvfrecipe_s(__m256 _1) {
+  return (__m256)__builtin_lasx_xvfrecipe_s((v8f32)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
+    __lasx_xvfrecipe_d(__m256d _1) {
+  return (__m256d)__builtin_lasx_xvfrecipe_d((v4f64)_1);
+}
+
 extern __inline
     __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
     __lasx_xvfrint_s(__m256 _1) {
@@ -1750,6 +1762,18 @@ extern __inline
   return (__m256d)__builtin_lasx_xvfrsqrt_d((v4f64)_1);
 }
 
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
+    __lasx_xvfrsqrte_s(__m256 _1) {
+  return (__m256)__builtin_lasx_xvfrsqrte_s((v8f32)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
+    __lasx_xvfrsqrte_d(__m256d _1) {
+  return (__m256d)__builtin_lasx_xvfrsqrte_d((v4f64)_1);
+}
+
 extern __inline
     __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
     __lasx_xvflogb_s(__m256 _1) {
diff --git a/clang/lib/Headers/lsxintrin.h b/clang/lib/Headers/lsxintrin.h
index a29bc7757ab5..f347955ce6fb 100644
--- a/clang/lib/Headers/lsxintrin.h
+++ b/clang/lib/Headers/lsxintrin.h
@@ -1776,6 +1776,18 @@ extern __inline
   return (__m128d)__builtin_lsx_vfrecip_d((v2f64)_1);
 }
 
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
+    __lsx_vfrecipe_s(__m128 _1) {
+  return (__m128)__builtin_lsx_vfrecipe_s((v4f32)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
+    __lsx_vfrecipe_d(__m128d _1) {
+  return (__m128d)__builtin_lsx_vfrecipe_d((v2f64)_1);
+}
+
 extern __inline
     __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
     __lsx_vfrint_s(__m128 _1) {
@@ -1800,6 +1812,18 @@ extern __inline
   return (__m128d)__builtin_lsx_vfrsqrt_d((v2f64)_1);
 }
 
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
+    __lsx_vfrsqrte_s(__m128 _1) {
+  return (__m128)__builtin_lsx_vfrsqrte_s((v4f32)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
+    __lsx_vfrsqrte_d(__m128d _1) {
+  return (__m128d)__builtin_lsx_vfrsqrte_d((v2f64)_1);
+}
+
 extern __inline
     __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
     __lsx_vflogb_s(__m128 _1) {
diff --git a/clang/test/CodeGen/LoongArch/builtin-dbl-approximate.c b/clang/test/CodeGen/LoongArch/builtin-dbl-approximate.c
new file mode 100644
index 000000000000..e5fe684346c0
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/builtin-dbl-approximate.c
@@ -0,0 +1,45 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple loongarch32 -target-feature +d -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +d -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+#include <larchintrin.h>
+
+// CHECK-LABEL: @frecipe_d
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call double @llvm.loongarch.frecipe.d(double [[A:%.*]])
+// CHECK-NEXT:    ret double [[TMP0]]
+//
+double frecipe_d (double _1)
+{
+  return __builtin_loongarch_frecipe_d (_1);
+}
+
+// CHECK-LABEL: @frsqrte_d
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call double @llvm.loongarch.frsqrte.d(double [[A:%.*]])
+// CHECK-NEXT:    ret double [[TMP0]]
+//
+double frsqrte_d (double _1)
+{
+  return __builtin_loongarch_frsqrte_d (_1);
+}
+
+// CHECK-LABEL: @frecipe_d_alia
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call double @llvm.loongarch.frecipe.d(double [[A:%.*]])
+// CHECK-NEXT:    ret double [[TMP0]]
+//
+double frecipe_d_alia (double _1)
+{
+  return __frecipe_d (_1);
+}
+
+// CHECK-LABEL: @frsqrte_d_alia
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call double @llvm.loongarch.frsqrte.d(double [[A:%.*]])
+// CHECK-NEXT:    ret double [[TMP0]]
+//
+double frsqrte_d_alia (double _1)
+{
+  return __frsqrte_d (_1);
+}
diff --git a/clang/test/CodeGen/LoongArch/builtin-flt-approximate.c b/clang/test/CodeGen/LoongArch/builtin-flt-approximate.c
new file mode 100644
index 000000000000..47bb47084364
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/builtin-flt-approximate.c
@@ -0,0 +1,45 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple loongarch32 -target-feature +f -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +f -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+#include <larchintrin.h>
+
+// CHECK-LABEL: @frecipe_s
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call float @llvm.loongarch.frecipe.s(float [[A:%.*]])
+// CHECK-NEXT:    ret float [[TMP0]]
+//
+float frecipe_s (float _1)
+{
+  return __builtin_loongarch_frecipe_s (_1);
+}
+
+// CHECK-LABEL: @frsqrte_s
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call float @llvm.loongarch.frsqrte.s(float [[A:%.*]])
+// CHECK-NEXT:    ret float [[TMP0]]
+//
+float frsqrte_s (float _1)
+{
+  return __builtin_loongarch_frsqrte_s (_1);
+}
+
+// CHECK-LABEL: @frecipe_s_alia
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call float @llvm.loongarch.frecipe.s(float [[A:%.*]])
+// CHECK-NEXT:    ret float [[TMP0]]
+//
+float frecipe_s_alia (float _1)
+{
+  return __frecipe_s (_1);
+}
+
+// CHECK-LABEL: @frsqrte_s_alia
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call float @llvm.loongarch.frsqrte.s(float [[A:%.*]])
+// CHECK-NEXT:    ret float [[TMP0]]
+//
+float frsqrte_s_alia (float _1)
+{
+  return __frsqrte_s (_1);
+}
diff --git a/clang/test/CodeGen/LoongArch/intrinsic-la64-error.c b/clang/test/CodeGen/LoongArch/intrinsic-la64-error.c
index efb3b94175cf..a3242dfd41e9 100644
--- a/clang/test/CodeGen/LoongArch/intrinsic-la64-error.c
+++ b/clang/test/CodeGen/LoongArch/intrinsic-la64-error.c
@@ -1,7 +1,28 @@
 // RUN: %clang_cc1 -triple loongarch64 -emit-llvm -S -verify %s -o /dev/null
+// RUN: not %clang_cc1 -triple loongarch64 -DFEATURE_CHECK -emit-llvm %s -o /dev/null 2>&1 \
+// RUN:   | FileCheck %s
 
 #include <larchintrin.h>
 
+#ifdef FEATURE_CHECK
+void test_feature(unsigned long *v_ul, int *v_i, float a, double b) {
+// CHECK: error: '__builtin_loongarch_cacop_w' needs target feature 32bit
+  __builtin_loongarch_cacop_w(1, v_ul[0], 1024);
+// CHECK: error: '__builtin_loongarch_movfcsr2gr' needs target feature f
+  v_i[0] = __builtin_loongarch_movfcsr2gr(1);
+// CHECK: error: '__builtin_loongarch_movgr2fcsr' needs target feature f
+  __builtin_loongarch_movgr2fcsr(1, v_i[1]);
+// CHECK: error: '__builtin_loongarch_frecipe_s' needs target feature f,frecipe
+  float f1 = __builtin_loongarch_frecipe_s(a);
+// CHECK: error: '__builtin_loongarch_frsqrte_s' needs target feature f,frecipe
+  float f2 = __builtin_loongarch_frsqrte_s(a);
+// CHECK: error: '__builtin_loongarch_frecipe_d' needs target feature d,frecipe
+  double d1 = __builtin_loongarch_frecipe_d(b);
+// CHECK: error: '__builtin_loongarch_frsqrte_d' needs target feature d,frecipe
+  double d2 = __builtin_loongarch_frsqrte_d(b);
+}
+#endif
+
 void csrrd_d(int a) {
   __builtin_loongarch_csrrd_d(16384); // expected-error {{argument value 16384 is outside the valid range [0, 16383]}}
   __builtin_loongarch_csrrd_d(-1); // expected-error {{argument value 4294967295 is outside the valid range [0, 16383]}}
diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c
new file mode 100644
index 000000000000..b79f93940399
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c
@@ -0,0 +1,37 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +lasx -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+#include <lasxintrin.h>
+
+// CHECK-LABEL: @xvfrecipe_s(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float> [[_1]])
+// CHECK-NEXT:    store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+v8f32 xvfrecipe_s(v8f32 _1) { return __lasx_xvfrecipe_s(_1); }
+// CHECK-LABEL: @xvfrecipe_d(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double> [[_1]])
+// CHECK-NEXT:    store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+v4f64 xvfrecipe_d(v4f64 _1) { return __lasx_xvfrecipe_d(_1); }
+// CHECK-LABEL: @xvfrsqrte_s(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float> [[_1]])
+// CHECK-NEXT:    store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+v8f32 xvfrsqrte_s(v8f32 _1) { return __lasx_xvfrsqrte_s(_1); }
+// CHECK-LABEL: @xvfrsqrte_d(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double> [[_1]])
+// CHECK-NEXT:    store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+v4f64 xvfrsqrte_d(v4f64 _1) { return __lasx_xvfrsqrte_d(_1); }
diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c
new file mode 100644
index 000000000000..63e9ba639ea2
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c
@@ -0,0 +1,38 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +lasx -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+typedef float v8f32 __attribute__((vector_size(32), aligned(32)));
+typedef double v4f64 __attribute__((vector_size(32), aligned(32)));
+
+// CHECK-LABEL: @xvfrecipe_s
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float> [[_1]])
+// CHECK-NEXT:    store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+v8f32 xvfrecipe_s(v8f32 _1) { return __builtin_lasx_xvfrecipe_s(_1); }
+// CHECK-LABEL: @xvfrecipe_d
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double> [[_1]])
+// CHECK-NEXT:    store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+v4f64 xvfrecipe_d(v4f64 _1) { return __builtin_lasx_xvfrecipe_d(_1); }
+// CHECK-LABEL: @xvfrsqrte_s
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float> [[_1]])
+// CHECK-NEXT:    store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+v8f32 xvfrsqrte_s(v8f32 _1) { return __builtin_lasx_xvfrsqrte_s(_1); }
+// CHECK-LABEL: @xvfrsqrte_d
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double> [[_1]])
+// CHECK-NEXT:    store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+v4f64 xvfrsqrte_d(v4f64 _1) { return __builtin_lasx_xvfrsqrte_d(_1); }
diff --git a/clang/test/CodeGen/LoongArch/lsx/builtin-approximate-alias.c b/clang/test/CodeGen/LoongArch/lsx/builtin-approximate-alias.c
new file mode 100644
index 000000000000..f26f032c878e
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/lsx/builtin-approximate-alias.c
@@ -0,0 +1,37 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +lsx -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+#include <lsxintrin.h>
+
+// CHECK-LABEL: @vfrecipe_s(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v4f32 vfrecipe_s(v4f32 _1) { return __lsx_vfrecipe_s(_1); }
+// CHECK-LABEL: @vfrecipe_d(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrecipe.d(<2 x double> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v2f64 vfrecipe_d(v2f64 _1) { return __lsx_vfrecipe_d(_1); }
+// CHECK-LABEL: @vfrsqrte_s(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v4f32 vfrsqrte_s(v4f32 _1) { return __lsx_vfrsqrte_s(_1); }
+// CHECK-LABEL: @vfrsqrte_d(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrsqrte.d(<2 x double> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v2f64 vfrsqrte_d(v2f64 _1) { return __lsx_vfrsqrte_d(_1); }
diff --git a/clang/test/CodeGen/LoongArch/lsx/builtin-approximate.c b/clang/test/CodeGen/LoongArch/lsx/builtin-approximate.c
new file mode 100644
index 000000000000..39fa1663db34
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/lsx/builtin-approximate.c
@@ -0,0 +1,38 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +lsx -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+typedef float v4f32 __attribute__ ((vector_size(16), aligned(16)));
+typedef double v2f64 __attribute__ ((vector_size(16), aligned(16)));
+
+// CHECK-LABEL: @vfrecipe_s
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v4f32 vfrecipe_s (v4f32 _1) { return __builtin_lsx_vfrecipe_s (_1); }
+// CHECK-LABEL: @vfrecipe_d
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrecipe.d(<2 x double> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v2f64 vfrecipe_d (v2f64 _1) { return __builtin_lsx_vfrecipe_d (_1); }
+// CHECK-LABEL: @vfrsqrte_s
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v4f32 vfrsqrte_s (v4f32 _1) { return __builtin_lsx_vfrsqrte_s (_1); }
+// CHECK-LABEL: @vfrsqrte_d
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrsqrte.d(<2 x double> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v2f64 vfrsqrte_d (v2f64 _1) { return __builtin_lsx_vfrsqrte_d (_1); }
diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td
index 685deaec7709..9002076e7aec 100644
--- a/llvm/include/llvm/IR/IntrinsicsLoongArch.td
+++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td
@@ -122,6 +122,15 @@ def int_loongarch_lddir_d : BaseInt<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
                                     [ImmArg<ArgIndex<1>>]>;
 def int_loongarch_ldpte_d : BaseInt<[], [llvm_i64_ty, llvm_i64_ty],
                                     [ImmArg<ArgIndex<1>>]>;
+
+def int_loongarch_frecipe_s : BaseInt<[llvm_float_ty], [llvm_float_ty],
+                                      [IntrNoMem]>;
+def int_loongarch_frecipe_d : BaseInt<[llvm_double_ty], [llvm_double_ty],
+                                      [IntrNoMem]>;
+def int_loongarch_frsqrte_s : BaseInt<[llvm_float_ty], [llvm_float_ty],
+                                      [IntrNoMem]>;
+def int_loongarch_frsqrte_d : BaseInt<[llvm_double_ty], [llvm_double_ty],
+                                      [IntrNoMem]>;
 } // TargetPrefix = "loongarch"
 
 /// Vector intrinsic
@@ -527,10 +536,12 @@ foreach inst = ["vfmadd_d", "vfmsub_d", "vfnmadd_d", "vfnmsub_d"] in
              [IntrNoMem]>;
 
 foreach inst = ["vflogb_s", "vfsqrt_s", "vfrecip_s", "vfrsqrt_s", "vfrint_s",
+                "vfrecipe_s", "vfrsqrte_s",
                 "vfrintrne_s", "vfrintrz_s", "vfrintrp_s", "vfrintrm_s"] in
   def int_loongarch_lsx_#inst : VecInt<[llvm_v4f32_ty], [llvm_v4f32_ty],
                                        [IntrNoMem]>;
 foreach inst = ["vflogb_d", "vfsqrt_d", "vfrecip_d", "vfrsqrt_d", "vfrint_d",
+                "vfrecipe_d", "vfrsqrte_d",
                 "vfrintrne_d", "vfrintrz_d", "vfrintrp_d", "vfrintrm_d"] in
   def int_loongarch_lsx_#inst : VecInt<[llvm_v2f64_ty], [llvm_v2f64_ty],
                                        [IntrNoMem]>;
@@ -1044,10 +1055,12 @@ foreach inst = ["xvfmadd_d", "xvfmsub_d", "xvfnmadd_d", "xvfnmsub_d"] in
              [IntrNoMem]>;
 
 foreach inst = ["xvflogb_s", "xvfsqrt_s", "xvfrecip_s", "xvfrsqrt_s", "xvfrint_s",
+                "xvfrecipe_s", "xvfrsqrte_s",
                 "xvfrintrne_s", "xvfrintrz_s", "xvfrintrp_s", "xvfrintrm_s"] in
   def int_loongarch_lasx_#inst : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty],
                                         [IntrNoMem]>;
 foreach inst = ["xvflogb_d", "xvfsqrt_d", "xvfrecip_d", "xvfrsqrt_d", "xvfrint_d",
+                "xvfrecipe_d", "xvfrsqrte_d",
                 "xvfrintrne_d", "xvfrintrz_d", "xvfrintrp_d", "xvfrintrm_d"] in
   def int_loongarch_lasx_#inst : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty],
                                         [IntrNoMem]>;
diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td
index 2a4c991a43b0..5573e5415d26 100644
--- a/llvm/lib/Target/LoongArch/LoongArch.td
+++ b/llvm/lib/Target/LoongArch/LoongArch.td
@@ -110,6 +110,13 @@ def FeatureAutoVec
     : SubtargetFeature<"auto-vec", "HasExpAutoVec", "true",
                        "Experimental auto vectorization">;
 
+// Floating point approximation operation
+def FeatureFrecipe
+    : SubtargetFeature<"frecipe", "HasFrecipe", "true",
+                       "Support frecipe.{s/d} and frsqrte.{s/d} instructions.">;
+def HasFrecipe : Predicate<"Subtarget->hasFrecipe()">;
+
+
 //===----------------------------------------------------------------------===//
 // Registers, instruction descriptions ...
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
index f30837912e75..e27896768818 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
@@ -281,6 +281,12 @@ def : Pat<(loongarch_ftint FPR32:$src), (FTINTRZ_W_S FPR32:$src)>;
 // FP reciprocal operation
 def : Pat<(fdiv fpimm1, FPR32:$src), (FRECIP_S $src)>;
 
+let Predicates = [HasFrecipe] in {
+// FP approximate reciprocal operation
+def : Pat<(int_loongarch_frecipe_s FPR32:$src), (FRECIPE_S FPR32:$src)>;
+def : Pat<(int_loongarch_frsqrte_s FPR32:$src), (FRSQRTE_S FPR32:$src)>;
+}
+
 // fmadd.s: fj * fk + fa
 def : Pat<(fma FPR32:$fj, FPR32:$fk, FPR32:$fa), (FMADD_S $fj, $fk, $fa)>;
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
index 0ea4c564b045..26bed67ac222 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
@@ -242,6 +242,12 @@ def : Pat<(f64 (fpextend FPR32:$src)), (FCVT_D_S FPR32:$src)>;
 // FP reciprocal operation
 def : Pat<(fdiv fpimm1, FPR64:$src), (FRECIP_D $src)>;
 
+let Predicates = [HasFrecipe] in {
+// FP approximate reciprocal operation
+def : Pat<(int_loongarch_frecipe_d FPR64:$src), (FRECIPE_D FPR64:$src)>;
+def : Pat<(int_loongarch_frsqrte_d FPR64:$src), (FRSQRTE_D FPR64:$src)>;
+}
+
 // fmadd.d: fj * fk + fa
 def : Pat<(fma FPR64:$fj, FPR64:$fk, FPR64:$fa), (FMADD_D $fj, $fk, $fa)>;
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 454915ac8c0a..6f1969bf8cae 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -2080,6 +2080,16 @@ foreach Inst = ["XVFLOGB_D", "XVFCLASS_D", "XVFSQRT_D", "XVFRECIP_D", "XVFRSQRT_
   def : Pat<(deriveLASXIntrinsic<Inst>.ret (v4f64 LASX256:$xj)),
             (!cast<LAInst>(Inst) LASX256:$xj)>;
 
+// 256-Bit vector FP approximate reciprocal operation
+let Predicates = [HasFrecipe] in {
+foreach Inst = ["XVFRECIPE_S", "XVFRSQRTE_S"] in
+  def : Pat<(deriveLASXIntrinsic<Inst>.ret (v8f32 LASX256:$xj)),
+            (!cast<LAInst>(Inst) LASX256:$xj)>;
+foreach Inst = ["XVFRECIPE_D", "XVFRSQRTE_D"] in
+  def : Pat<(deriveLASXIntrinsic<Inst>.ret (v4f64 LASX256:$xj)),
+            (!cast<LAInst>(Inst) LASX256:$xj)>;
+}
+
 def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm),
           (XVPICKVE_W v8f32:$xj, (to_valid_timm timm:$imm))>;
 def : Pat<(int_loongarch_lasx_xvpickve_d_f v4f64:$xj, timm:$imm),
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 6d60d7074ec3..0580683c3ce3 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -2195,6 +2195,16 @@ foreach Inst = ["VFLOGB_D", "VFCLASS_D", "VFSQRT_D", "VFRECIP_D", "VFRSQRT_D",
   def : Pat<(deriveLSXIntrinsic<Inst>.ret (v2f64 LSX128:$vj)),
             (!cast<LAInst>(Inst) LSX128:$vj)>;
 
+// 128-Bit vector FP approximate reciprocal operation
+let Predicates = [HasFrecipe] in {
+foreach Inst = ["VFRECIPE_S", "VFRSQRTE_S"] in
+  def : Pat<(deriveLSXIntrinsic<Inst>.ret (v4f32 LSX128:$vj)),
+            (!cast<LAInst>(Inst) LSX128:$vj)>;
+foreach Inst = ["VFRECIPE_D", "VFRSQRTE_D"] in
+  def : Pat<(deriveLSXIntrinsic<Inst>.ret (v2f64 LSX128:$vj)),
+            (!cast<LAInst>(Inst) LSX128:$vj)>;
+}
+
 // load
 def : Pat<(int_loongarch_lsx_vld GPR:$rj, timm:$imm),
           (VLD GPR:$rj, (to_valid_timm timm:$imm))>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
index 174e4cba8326..11c0b39e176e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
+++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
@@ -45,6 +45,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo {
   bool HasUAL = false;
   bool HasLinkerRelax = false;
   bool HasExpAutoVec = false;
+  bool HasFrecipe = false;
   unsigned GRLen = 32;
   MVT GRLenVT = MVT::i32;
   LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown;
@@ -104,6 +105,7 @@ public:
   bool hasUAL() const { return HasUAL; }
   bool hasLinkerRelax() const { return HasLinkerRelax; }
   bool hasExpAutoVec() const { return HasExpAutoVec; }
+  bool hasFrecipe() const { return HasFrecipe; }
   MVT getGRLenVT() const { return GRLenVT; }
   unsigned getGRLen() const { return GRLen; }
   LoongArchABI::ABI getTargetABI() const { return TargetABI; }
diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll
new file mode 100644
index 000000000000..9f572500caa0
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll
@@ -0,0 +1,26 @@
+; RUN: llc --mtriple=loongarch32 --mattr=+d,+frecipe < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s
+
+declare double @llvm.loongarch.frecipe.d(double)
+
+define double @frecipe_d(double %a) {
+; CHECK-LABEL: frecipe_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    frecipe.d $fa0, $fa0
+; CHECK-NEXT:    ret
+entry:
+  %res = call double @llvm.loongarch.frecipe.d(double %a)
+  ret double %res
+}
+
+declare double @llvm.loongarch.frsqrte.d(double)
+
+define double @frsqrte_d(double %a) {
+; CHECK-LABEL: frsqrte_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    frsqrte.d $fa0, $fa0
+; CHECK-NEXT:    ret
+entry:
+  %res = call double @llvm.loongarch.frsqrte.d(double %a)
+  ret double %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll
new file mode 100644
index 000000000000..0b2029f2e44a
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll
@@ -0,0 +1,26 @@
+; RUN: llc --mtriple=loongarch32 --mattr=+f,+frecipe < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+f,+frecipe < %s | FileCheck %s
+
+declare float @llvm.loongarch.frecipe.s(float)
+
+define float @frecipe_s(float %a) {
+; CHECK-LABEL: frecipe_s:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    frecipe.s $fa0, $fa0
+; CHECK-NEXT:    ret
+entry:
+  %res = call float @llvm.loongarch.frecipe.s(float %a)
+  ret float %res
+}
+
+declare float @llvm.loongarch.frsqrte.s(float)
+
+define float @frsqrte_s(float %a) {
+; CHECK-LABEL: frsqrte_s:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    frsqrte.s $fa0, $fa0
+; CHECK-NEXT:    ret
+entry:
+  %res = call float @llvm.loongarch.frsqrte.s(float %a)
+  ret float %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll
new file mode 100644
index 000000000000..215436823af8
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s
+
+declare <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float>)
+
+define <8 x float> @lasx_xvfrecipe_s(<8 x float> %va) nounwind {
+; CHECK-LABEL: lasx_xvfrecipe_s:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvfrecipe.s $xr0, $xr0
+; CHECK-NEXT:    ret
+entry:
+  %res = call <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float> %va)
+  ret <8 x float> %res
+}
+
+declare <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double>)
+
+define <4 x double> @lasx_xvfrecipe_d(<4 x double> %va) nounwind {
+; CHECK-LABEL: lasx_xvfrecipe_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvfrecipe.d $xr0, $xr0
+; CHECK-NEXT:    ret
+entry:
+  %res = call <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double> %va)
+  ret <4 x double> %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll
new file mode 100644
index 000000000000..ad36c3aa5c29
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s
+
+declare <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float>)
+
+define <8 x float> @lasx_xvfrsqrte_s(<8 x float> %va) nounwind {
+; CHECK-LABEL: lasx_xvfrsqrte_s:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvfrsqrte.s $xr0, $xr0
+; CHECK-NEXT:    ret
+entry:
+  %res = call <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float> %va)
+  ret <8 x float> %res
+}
+
+declare <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double>)
+
+define <4 x double> @lasx_xvfrsqrte_d(<4 x double> %va) nounwind {
+; CHECK-LABEL: lasx_xvfrsqrte_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvfrsqrte.d $xr0, $xr0
+; CHECK-NEXT:    ret
+entry:
+  %res = call <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double> %va)
+  ret <4 x double> %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll
new file mode 100644
index 000000000000..1b7a97d9f972
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s
+
+declare <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float>)
+
+define <4 x float> @lsx_vfrecipe_s(<4 x float> %va) nounwind {
+; CHECK-LABEL: lsx_vfrecipe_s:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vfrecipe.s $vr0, $vr0
+; CHECK-NEXT:    ret
+entry:
+  %res = call <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float> %va)
+  ret <4 x float> %res
+}
+
+declare <2 x double> @llvm.loongarch.lsx.vfrecipe.d(<2 x double>)
+
+define <2 x double> @lsx_vfrecipe_d(<2 x double> %va) nounwind {
+; CHECK-LABEL: lsx_vfrecipe_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vfrecipe.d $vr0, $vr0
+; CHECK-NEXT:    ret
+entry:
+  %res = call <2 x double> @llvm.loongarch.lsx.vfrecipe.d(<2 x double> %va)
+  ret <2 x double> %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll
new file mode 100644
index 000000000000..3cd6c78e87d7
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s
+
+declare <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float>)
+
+define <4 x float> @lsx_vfrsqrte_s(<4 x float> %va) nounwind {
+; CHECK-LABEL: lsx_vfrsqrte_s:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vfrsqrte.s $vr0, $vr0
+; CHECK-NEXT:    ret
+entry:
+  %res = call <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float> %va)
+  ret <4 x float> %res
+}
+
+declare <2 x double> @llvm.loongarch.lsx.vfrsqrte.d(<2 x double>)
+
+define <2 x double> @lsx_vfrsqrte_d(<2 x double> %va) nounwind {
+; CHECK-LABEL: lsx_vfrsqrte_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vfrsqrte.d $vr0, $vr0
+; CHECK-NEXT:    ret
+entry:
+  %res = call <2 x double> @llvm.loongarch.lsx.vfrsqrte.d(<2 x double> %va)
+  ret <2 x double> %res
+}
-- 
Gitee


From 6f9531b069971dc0f5c6b28bd6a6754c1b5fde72 Mon Sep 17 00:00:00 2001
From: Ami-zhang <zhanglimin@loongson.cn>
Date: Tue, 23 Jul 2024 14:03:28 +0800
Subject: [PATCH 20/23] [LoongArch] Support -march=la64v1.0 and -march=la64v1.1
 (#100057)

The newly added `-march` values `la64v1.0` and `la64v1.1` are as
described in the LoongArch toolchain conventions (see [1]).

The target-cpu/feature attributes are forwarded to the compiler when a
particular `-march` parameter is specified. The default CPU `loongarch64`
is returned when the arch name is `la64v1.0` or `la64v1.1`.

In addition, this commit adds `la64v1.0`/`la64v1.1` to
"__loongarch_arch" and adds a definition for the macro "__loongarch_frecipe".

[1]: https://github.com/loongson/la-toolchain-conventions

(cherry picked from commit 5a1b9896ad5a7dcd25a1cc7a4d3fd44155e4b22d)
---
 clang/lib/Basic/Targets/LoongArch.cpp         | 23 +++++++++++++++-
 clang/lib/Basic/Targets/LoongArch.h           |  2 ++
 .../lib/Driver/ToolChains/Arch/LoongArch.cpp  | 10 +++++--
 clang/test/Driver/loongarch-march.c           | 22 +++++++++++++++
 clang/test/Preprocessor/init-loongarch.c      | 27 ++++++++++++++++---
 .../TargetParser/LoongArchTargetParser.cpp    | 11 ++++++++
 6 files changed, 88 insertions(+), 7 deletions(-)

diff --git a/clang/lib/Basic/Targets/LoongArch.cpp b/clang/lib/Basic/Targets/LoongArch.cpp
index 913404240916..5fede3d7cdc4 100644
--- a/clang/lib/Basic/Targets/LoongArch.cpp
+++ b/clang/lib/Basic/Targets/LoongArch.cpp
@@ -200,7 +200,24 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts,
 
   // Define __loongarch_arch.
   StringRef ArchName = getCPU();
-  Builder.defineMacro("__loongarch_arch", Twine('"') + ArchName + Twine('"'));
+  if (ArchName == "loongarch64") {
+    if (HasFeatureLSX) {
+      // TODO: As more features of the V1.1 ISA are supported, a unified "v1.1"
+      // arch feature set will be used to include all sub-features belonging to
+      // the V1.1 ISA version.
+      if (HasFeatureFrecipe)
+        Builder.defineMacro("__loongarch_arch",
+                            Twine('"') + "la64v1.1" + Twine('"'));
+      else
+        Builder.defineMacro("__loongarch_arch",
+                            Twine('"') + "la64v1.0" + Twine('"'));
+    } else {
+      Builder.defineMacro("__loongarch_arch",
+                          Twine('"') + ArchName + Twine('"'));
+    }
+  } else {
+    Builder.defineMacro("__loongarch_arch", Twine('"') + ArchName + Twine('"'));
+  }
 
   // Define __loongarch_tune.
   StringRef TuneCPU = getTargetOpts().TuneCPU;
@@ -216,6 +233,8 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__loongarch_simd_width", "128");
     Builder.defineMacro("__loongarch_sx", Twine(1));
   }
+  if (HasFeatureFrecipe)
+    Builder.defineMacro("__loongarch_frecipe", Twine(1));
 
   StringRef ABI = getABI();
   if (ABI == "lp64d" || ABI == "lp64f" || ABI == "lp64s")
@@ -289,6 +308,8 @@ bool LoongArchTargetInfo::handleTargetFeatures(
       HasFeatureLSX = true;
     else if (Feature == "+lasx")
       HasFeatureLASX = true;
+    else if (Feature == "+frecipe")
+      HasFeatureFrecipe = true;
   }
   return true;
 }
diff --git a/clang/lib/Basic/Targets/LoongArch.h b/clang/lib/Basic/Targets/LoongArch.h
index 3313102492cb..4d2965f5b3a3 100644
--- a/clang/lib/Basic/Targets/LoongArch.h
+++ b/clang/lib/Basic/Targets/LoongArch.h
@@ -29,6 +29,7 @@ protected:
   bool HasFeatureF;
   bool HasFeatureLSX;
   bool HasFeatureLASX;
+  bool HasFeatureFrecipe;
 
 public:
   LoongArchTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
@@ -37,6 +38,7 @@ public:
     HasFeatureF = false;
     HasFeatureLSX = false;
     HasFeatureLASX = false;
+    HasFeatureFrecipe = false;
     LongDoubleWidth = 128;
     LongDoubleAlign = 128;
     LongDoubleFormat = &llvm::APFloat::IEEEquad();
diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index 87d7b30ef5d3..21106c425206 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -268,8 +268,14 @@ std::string loongarch::postProcessTargetCPUString(const std::string &CPU,
 std::string loongarch::getLoongArchTargetCPU(const llvm::opt::ArgList &Args,
                                              const llvm::Triple &Triple) {
   std::string CPU;
+  std::string Arch;
   // If we have -march, use that.
-  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
-    CPU = A->getValue();
+  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
+    Arch = A->getValue();
+    if (Arch == "la64v1.0" || Arch == "la64v1.1")
+      CPU = llvm::LoongArch::getDefaultArch(Triple.isLoongArch64());
+    else
+      CPU = Arch;
+  }
   return postProcessTargetCPUString(CPU, Triple);
 }
diff --git a/clang/test/Driver/loongarch-march.c b/clang/test/Driver/loongarch-march.c
index 9214130cd034..d06da72a755c 100644
--- a/clang/test/Driver/loongarch-march.c
+++ b/clang/test/Driver/loongarch-march.c
@@ -2,10 +2,18 @@
 // RUN:   FileCheck %s --check-prefix=CC1-LOONGARCH64
 // RUN: %clang --target=loongarch64 -march=la464 -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-LA464
+// RUN: %clang --target=loongarch64 -march=la64v1.0 -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CC1-LA64V1P0
+// RUN: %clang --target=loongarch64 -march=la64v1.1 -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CC1-LA64V1P1
 // RUN: %clang --target=loongarch64 -march=loongarch64 -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LOONGARCH64
 // RUN: %clang --target=loongarch64 -march=la464 -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LA464
+// RUN: %clang --target=loongarch64 -march=la64v1.0 -S -emit-llvm %s -o - | \
+// RUN:   FileCheck %s --check-prefix=IR-LA64V1P0
+// RUN: %clang --target=loongarch64 -march=la64v1.1 -S -emit-llvm %s -o - | \
+// RUN:   FileCheck %s --check-prefix=IR-LA64V1P1
 
 // CC1-LOONGARCH64: "-target-cpu" "loongarch64"
 // CC1-LOONGARCH64-NOT: "-target-feature"
@@ -19,8 +27,22 @@
 // CC1-LA464-NOT: "-target-feature"
 // CC1-LA464: "-target-abi" "lp64d"
 
+// CC1-LA64V1P0: "-target-cpu" "loongarch64"
+// CC1-LA64V1P0-NOT: "-target-feature"
+// CC1-LA64V1P0: "-target-feature" "+64bit" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+ual"
+// CC1-LA64V1P0-NOT: "-target-feature"
+// CC1-LA64V1P0: "-target-abi" "lp64d"
+
+// CC1-LA64V1P1: "-target-cpu" "loongarch64"
+// CC1-LA64V1P1-NOT: "-target-feature"
+// CC1-LA64V1P1: "-target-feature" "+64bit" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+ual" "-target-feature" "+frecipe"
+// CC1-LA64V1P1-NOT: "-target-feature"
+// CC1-LA64V1P1: "-target-abi" "lp64d"
+
 // IR-LOONGARCH64: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+f,+ual"
 // IR-LA464: attributes #[[#]] ={{.*}}"target-cpu"="la464" {{.*}}"target-features"="+64bit,+d,+f,+lasx,+lsx,+ual"
+// IR-LA64V1P0: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+lsx,+ual"
+// IR-LA64V1P1: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+frecipe,+lsx,+ual"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c
index 635d029ce9d3..cfa3ddb20f10 100644
--- a/clang/test/Preprocessor/init-loongarch.c
+++ b/clang/test/Preprocessor/init-loongarch.c
@@ -788,24 +788,43 @@
 // LA64-FPU0-LP64S-NOT: #define __loongarch_single_float
 // LA64-FPU0-LP64S: #define __loongarch_soft_float 1
 
-/// Check __loongarch_arch and __loongarch_tune.
+/// Check __loongarch_arch, __loongarch_tune and __loongarch_frecipe.
 
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la464 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la464 -DTUNE=la464 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -mtune=loongarch64 | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -mtune=la464 | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la464 %s
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=la464 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -mtune=la464 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la464 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la464 -mtune=loongarch64 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la464 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang -lsx | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang +frecipe | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la64v1.1 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 | \
+// RUN:   FileCheck --match-full-lines  --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la64v1.1 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -frecipe | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -lsx | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +frecipe | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx -Xclang -target-feature -Xclang +frecipe | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la64v1.1 -DTUNE=loongarch64 %s
 
 // ARCH-TUNE: #define __loongarch_arch "[[ARCH]]"
+// FRECIPE: #define __loongarch_frecipe 1
 // ARCH-TUNE: #define __loongarch_tune "[[TUNE]]"
 
 // RUN: %clang --target=loongarch64 -mlsx -x c -E -dM %s -o - \
diff --git a/llvm/lib/TargetParser/LoongArchTargetParser.cpp b/llvm/lib/TargetParser/LoongArchTargetParser.cpp
index 772d24c5ce3d..8e86d18de2ad 100644
--- a/llvm/lib/TargetParser/LoongArchTargetParser.cpp
+++ b/llvm/lib/TargetParser/LoongArchTargetParser.cpp
@@ -44,6 +44,17 @@ bool LoongArch::getArchFeatures(StringRef Arch,
       return true;
     }
   }
+
+  if (Arch == "la64v1.0" || Arch == "la64v1.1") {
+    Features.push_back("+64bit");
+    Features.push_back("+d");
+    Features.push_back("+lsx");
+    Features.push_back("+ual");
+    if (Arch == "la64v1.1")
+      Features.push_back("+frecipe");
+    return true;
+  }
+
   return false;
 }
 
-- 
Gitee


From 6094875aa6aab1e28a096294783cada0243e95d5 Mon Sep 17 00:00:00 2001
From: Ami-zhang <zhanglimin@loongson.cn>
Date: Tue, 23 Jul 2024 15:14:20 +0800
Subject: [PATCH 21/23] [LoongArch] Support la664 (#100068)

A new ProcessorModel called `la664` is defined in LoongArch.td to
support `-march/-mtune=la664`.

(cherry picked from commit fcec298087dba0c83f6d0bbafd6cd934c42cbf82)
---
 clang/test/Driver/loongarch-march.c                   | 11 +++++++++++
 clang/test/Driver/loongarch-mtune.c                   |  5 +++++
 clang/test/Preprocessor/init-loongarch.c              |  8 ++++++++
 .../llvm/TargetParser/LoongArchTargetParser.def       |  2 ++
 .../include/llvm/TargetParser/LoongArchTargetParser.h |  3 +++
 llvm/lib/Target/LoongArch/LoongArch.td                |  7 +++++++
 llvm/lib/TargetParser/Host.cpp                        |  2 ++
 llvm/test/CodeGen/LoongArch/cpus.ll                   |  5 +++++
 8 files changed, 43 insertions(+)

diff --git a/clang/test/Driver/loongarch-march.c b/clang/test/Driver/loongarch-march.c
index d06da72a755c..2d5b315d962a 100644
--- a/clang/test/Driver/loongarch-march.c
+++ b/clang/test/Driver/loongarch-march.c
@@ -6,6 +6,8 @@
 // RUN:   FileCheck %s --check-prefix=CC1-LA64V1P0
 // RUN: %clang --target=loongarch64 -march=la64v1.1 -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-LA64V1P1
+// RUN: %clang --target=loongarch64 -march=la664 -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CC1-LA664
 // RUN: %clang --target=loongarch64 -march=loongarch64 -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LOONGARCH64
 // RUN: %clang --target=loongarch64 -march=la464 -S -emit-llvm %s -o - | \
@@ -14,6 +16,8 @@
 // RUN:   FileCheck %s --check-prefix=IR-LA64V1P0
 // RUN: %clang --target=loongarch64 -march=la64v1.1 -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LA64V1P1
+// RUN: %clang --target=loongarch64 -march=la664 -S -emit-llvm %s -o - | \
+// RUN:   FileCheck %s --check-prefix=IR-LA664
 
 // CC1-LOONGARCH64: "-target-cpu" "loongarch64"
 // CC1-LOONGARCH64-NOT: "-target-feature"
@@ -39,10 +43,17 @@
 // CC1-LA64V1P1-NOT: "-target-feature"
 // CC1-LA64V1P1: "-target-abi" "lp64d"
 
+// CC1-LA664: "-target-cpu" "la664"
+// CC1-LA664-NOT: "-target-feature"
+// CC1-LA664: "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+lasx" "-target-feature" "+ual" "-target-feature" "+frecipe"
+// CC1-LA664-NOT: "-target-feature"
+// CC1-LA664: "-target-abi" "lp64d"
+
 // IR-LOONGARCH64: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+f,+ual"
 // IR-LA464: attributes #[[#]] ={{.*}}"target-cpu"="la464" {{.*}}"target-features"="+64bit,+d,+f,+lasx,+lsx,+ual"
 // IR-LA64V1P0: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+lsx,+ual"
 // IR-LA64V1P1: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+frecipe,+lsx,+ual"
+// IR-LA664: attributes #[[#]] ={{.*}}"target-cpu"="la664" {{.*}}"target-features"="+64bit,+d,+f,+frecipe,+lasx,+lsx,+ual"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Driver/loongarch-mtune.c b/clang/test/Driver/loongarch-mtune.c
index 6f3f39e9bbd8..face12e1a1a8 100644
--- a/clang/test/Driver/loongarch-mtune.c
+++ b/clang/test/Driver/loongarch-mtune.c
@@ -8,6 +8,11 @@
 // RUN: %clang --target=loongarch64 -mtune=la464 -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IRATTR -DCPU=la464
 
+// RUN: %clang --target=loongarch64 -mtune=la664 -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CC1ARG -DCPU=la664
+// RUN: %clang --target=loongarch64 -mtune=la664 -S -emit-llvm %s -o - | \
+// RUN:   FileCheck %s --check-prefix=IRATTR -DCPU=la664
+
 // RUN: %clang --target=loongarch64 -mtune=invalidcpu -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1ARG -DCPU=invalidcpu
 // RUN: not %clang --target=loongarch64 -mtune=invalidcpu -S -emit-llvm %s -o /dev/null 2>&1 | \
diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c
index cfa3ddb20f10..7ce3d2de8c78 100644
--- a/clang/test/Preprocessor/init-loongarch.c
+++ b/clang/test/Preprocessor/init-loongarch.c
@@ -822,6 +822,14 @@
 // RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=loongarch64 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx -Xclang -target-feature -Xclang +frecipe | \
 // RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la64v1.1 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la664 | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la664 -DTUNE=la664 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -mtune=la664 | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la664 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -mtune=la664 | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la664 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la664 -mtune=loongarch64 | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la664 -DTUNE=loongarch64 %s
 
 // ARCH-TUNE: #define __loongarch_arch "[[ARCH]]"
 // FRECIPE: #define __loongarch_frecipe 1
diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.def b/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
index b20d124953f8..101a48cbd539 100644
--- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
+++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
@@ -10,6 +10,7 @@ LOONGARCH_FEATURE("+lasx", FK_LASX)
 LOONGARCH_FEATURE("+lbt", FK_LBT)
 LOONGARCH_FEATURE("+lvz", FK_LVZ)
 LOONGARCH_FEATURE("+ual", FK_UAL)
+LOONGARCH_FEATURE("+frecipe", FK_FRECIPE)
 
 #undef LOONGARCH_FEATURE
 
@@ -19,5 +20,6 @@ LOONGARCH_FEATURE("+ual", FK_UAL)
 
 LOONGARCH_ARCH("loongarch64", AK_LOONGARCH64, FK_64BIT | FK_FP32 | FK_FP64 | FK_UAL)
 LOONGARCH_ARCH("la464", AK_LA464, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL)
+LOONGARCH_ARCH("la664", AK_LA664, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL | FK_FRECIPE)
 
 #undef LOONGARCH_ARCH
diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
index 028844187584..c0bb15a5163b 100644
--- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
+++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
@@ -46,6 +46,9 @@ enum FeatureKind : uint32_t {
 
   // Allow memory accesses to be unaligned.
   FK_UAL = 1 << 8,
+
+  // Floating-point approximate reciprocal instructions are available.
+  FK_FRECIPE = 1 << 9,
 };
 
 struct FeatureInfo {
diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td
index 5573e5415d26..b5cd5bb0f8a4 100644
--- a/llvm/lib/Target/LoongArch/LoongArch.td
+++ b/llvm/lib/Target/LoongArch/LoongArch.td
@@ -147,6 +147,13 @@ def : ProcessorModel<"la464", NoSchedModel, [Feature64Bit,
                                              FeatureExtLVZ,
                                              FeatureExtLBT]>;
 
+def : ProcessorModel<"la664", NoSchedModel, [Feature64Bit,
+                                             FeatureUAL,
+                                             FeatureExtLASX,
+                                             FeatureExtLVZ,
+                                             FeatureExtLBT,
+                                             FeatureFrecipe]>;
+
 //===----------------------------------------------------------------------===//
 // Define the LoongArch target.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 8b23be02edc0..87e3e0b434d5 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1469,6 +1469,8 @@ StringRef sys::getHostCPUName() {
   switch (processor_id & 0xf000) {
   case 0xc000: // Loongson 64bit, 4-issue
     return "la464";
+  case 0xd000: // Loongson 64bit, 6-issue
+    return "la664";
   // TODO: Others.
   default:
     break;
diff --git a/llvm/test/CodeGen/LoongArch/cpus.ll b/llvm/test/CodeGen/LoongArch/cpus.ll
index 35945ae4de71..087cf887b813 100644
--- a/llvm/test/CodeGen/LoongArch/cpus.ll
+++ b/llvm/test/CodeGen/LoongArch/cpus.ll
@@ -3,6 +3,7 @@
 
 ; RUN: llc < %s --mtriple=loongarch64 --mcpu=loongarch64 2>&1 | FileCheck %s
 ; RUN: llc < %s --mtriple=loongarch64 --mcpu=la464 2>&1 | FileCheck %s
+; RUN: llc < %s --mtriple=loongarch64 --mcpu=la664 2>&1 | FileCheck %s
 ; RUN: llc < %s --mtriple=loongarch64 2>&1 | FileCheck %s
 
 ; CHECK-NOT: {{.*}} is not a recognized processor for this target
@@ -18,3 +19,7 @@ define void @tune_cpu_loongarch64() "tune-cpu"="loongarch64" {
 define void @tune_cpu_la464() "tune-cpu"="la464" {
   ret void
 }
+
+define void @tune_cpu_la664() "tune-cpu"="la664" {
+  ret void
+}
-- 
Gitee


From 18f453f2dba969c1fdcbda562079113de7bbcfca Mon Sep 17 00:00:00 2001
From: Ami-zhang <zhanglimin@loongson.cn>
Date: Tue, 23 Jul 2024 15:20:30 +0800
Subject: [PATCH 22/23] [LoongArch] Fix test issue of init-loongarch.c

(cherry picked from commit d59925c39856f255f4dd4427ccc650f2c2692a24)
---
 clang/test/Preprocessor/init-loongarch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c
index 7ce3d2de8c78..887b6d6af7e1 100644
--- a/clang/test/Preprocessor/init-loongarch.c
+++ b/clang/test/Preprocessor/init-loongarch.c
@@ -825,7 +825,7 @@
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la664 | \
 // RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la664 -DTUNE=la664 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -mtune=la664 | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la664 %s
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=la664 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -mtune=la664 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la664 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la664 -mtune=loongarch64 | \
-- 
Gitee


From f06fec7597485a8d90aa81e3c65abea1bdeeb90b Mon Sep 17 00:00:00 2001
From: Zhaoxin Yang <yangzhaoxin@loongson.cn>
Date: Tue, 23 Jul 2024 15:19:00 +0800
Subject: [PATCH 23/23] [LoongArch] Remove experimental `auto-vec` feature.
 (#100070)

Automatic vectorization is now enabled whenever `-mlsx` or `-mlasx` is
enabled.

(cherry picked from commit 89d1eb67342d75d1de8d210157fdeaeb6a4724b6)
---
 llvm/lib/Target/LoongArch/LoongArch.td                     | 4 ----
 llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp | 2 --
 llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll   | 2 +-
 3 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td
index b5cd5bb0f8a4..5f85cace71af 100644
--- a/llvm/lib/Target/LoongArch/LoongArch.td
+++ b/llvm/lib/Target/LoongArch/LoongArch.td
@@ -105,10 +105,6 @@ def FeatureUAL
 def FeatureRelax
     : SubtargetFeature<"relax", "HasLinkerRelax", "true",
                        "Enable Linker relaxation">;
-// Experimental auto vectorization
-def FeatureAutoVec
-    : SubtargetFeature<"auto-vec", "HasExpAutoVec", "true",
-                       "Experimental auto vectorization">;
 
 // Floating point approximation operation
 def FeatureFrecipe
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
index d47dded9ea6e..7961bb141e64 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
@@ -26,8 +26,6 @@ TypeSize LoongArchTTIImpl::getRegisterBitWidth(
   case TargetTransformInfo::RGK_Scalar:
     return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
   case TargetTransformInfo::RGK_FixedWidthVector:
-    if (!ST->hasExpAutoVec())
-      return DefSize;
     if (ST->hasExtLASX())
       return TypeSize::getFixed(256);
     if (ST->hasExtLSX())
diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
index a8ac2411dd82..6ab300859f9d 100644
--- a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
+++ b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt < %s -passes=loop-vectorize -mtriple loongarch64-linux-gnu -mattr=+lasx,+auto-vec -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -mtriple loongarch64-linux-gnu -mattr=+lasx -S | FileCheck %s
 
 ;; This is a collection of tests whose only purpose is to show changes in the
 ;; default configuration.  Please keep these tests minimal - if you're testing
-- 
Gitee