diff --git a/clang/include/clang/Basic/BuiltinsLoongArchBase.def b/clang/include/clang/Basic/BuiltinsLoongArchBase.def
index cbb239223aae3b22e8ef15ee4627c60825aeea39..a5a07c167908ce4b4fd2f49536507225e5b31e0b 100644
--- a/clang/include/clang/Basic/BuiltinsLoongArchBase.def
+++ b/clang/include/clang/Basic/BuiltinsLoongArchBase.def
@@ -51,3 +51,8 @@ TARGET_BUILTIN(__builtin_loongarch_iocsrwr_d, "vUWiUi", "nc", "64bit")
 
 TARGET_BUILTIN(__builtin_loongarch_lddir_d, "WiWiIUWi", "nc", "64bit")
 TARGET_BUILTIN(__builtin_loongarch_ldpte_d, "vWiIUWi", "nc", "64bit")
+
+TARGET_BUILTIN(__builtin_loongarch_frecipe_s, "ff", "nc", "f,frecipe")
+TARGET_BUILTIN(__builtin_loongarch_frecipe_d, "dd", "nc", "d,frecipe")
+TARGET_BUILTIN(__builtin_loongarch_frsqrte_s, "ff", "nc", "f,frecipe")
+TARGET_BUILTIN(__builtin_loongarch_frsqrte_d, "dd", "nc", "d,frecipe")
diff --git a/clang/include/clang/Basic/BuiltinsLoongArchLASX.def b/clang/include/clang/Basic/BuiltinsLoongArchLASX.def
index 3de200f665b680afdebc08a57c79a844a0783998..4cf51cc000f6fcbf5f20f11627bceb1110b1c8bb 100644
--- a/clang/include/clang/Basic/BuiltinsLoongArchLASX.def
+++ b/clang/include/clang/Basic/BuiltinsLoongArchLASX.def
@@ -657,9 +657,15 @@ TARGET_BUILTIN(__builtin_lasx_xvfsqrt_d, "V4dV4d", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvfrecip_s, "V8fV8f", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvfrecip_d, "V4dV4d", "nc", "lasx")
 
+TARGET_BUILTIN(__builtin_lasx_xvfrecipe_s, "V8fV8f", "nc", "lasx,frecipe")
+TARGET_BUILTIN(__builtin_lasx_xvfrecipe_d, "V4dV4d", "nc", "lasx,frecipe")
+
 TARGET_BUILTIN(__builtin_lasx_xvfrsqrt_s, "V8fV8f", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvfrsqrt_d, "V4dV4d", "nc", "lasx")
 
+TARGET_BUILTIN(__builtin_lasx_xvfrsqrte_s, "V8fV8f", "nc", "lasx,frecipe")
+TARGET_BUILTIN(__builtin_lasx_xvfrsqrte_d, "V4dV4d", "nc", "lasx,frecipe")
+
 TARGET_BUILTIN(__builtin_lasx_xvfcvtl_s_h, "V8fV16s", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvfcvth_s_h, "V8fV16s", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvfcvtl_d_s, "V4dV8f", "nc", "lasx")
diff --git a/clang/include/clang/Basic/BuiltinsLoongArchLSX.def b/clang/include/clang/Basic/BuiltinsLoongArchLSX.def
index 8e6aec886c50cd912d5993809cd9bf26b1f92da6..c90f4dc5458fa6ed73ac1de2e4eb43c768cf5f4e 100644
--- a/clang/include/clang/Basic/BuiltinsLoongArchLSX.def
+++ b/clang/include/clang/Basic/BuiltinsLoongArchLSX.def
@@ -641,9 +641,15 @@ TARGET_BUILTIN(__builtin_lsx_vfsqrt_d, "V2dV2d", "nc", "lsx")
 TARGET_BUILTIN(__builtin_lsx_vfrecip_s, "V4fV4f", "nc", "lsx")
 TARGET_BUILTIN(__builtin_lsx_vfrecip_d, "V2dV2d", "nc", "lsx")
 
+TARGET_BUILTIN(__builtin_lsx_vfrecipe_s, "V4fV4f", "nc", "lsx,frecipe")
+TARGET_BUILTIN(__builtin_lsx_vfrecipe_d, "V2dV2d", "nc", "lsx,frecipe")
+
 TARGET_BUILTIN(__builtin_lsx_vfrsqrt_s, "V4fV4f", "nc", "lsx")
 TARGET_BUILTIN(__builtin_lsx_vfrsqrt_d, "V2dV2d", "nc", "lsx")
 
+TARGET_BUILTIN(__builtin_lsx_vfrsqrte_s, "V4fV4f", "nc", "lsx,frecipe")
+TARGET_BUILTIN(__builtin_lsx_vfrsqrte_d, "V2dV2d", "nc", "lsx,frecipe")
+
 TARGET_BUILTIN(__builtin_lsx_vfcvtl_s_h, "V4fV8s", "nc", "lsx")
 TARGET_BUILTIN(__builtin_lsx_vfcvtl_d_s, "V2dV4f", "nc", "lsx")
 
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 6b68bc458b939a1806f918f07ccb4df7b9c6fab3..060f96118364d5cee84c08bbf9f98e53fca149bc 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -757,6 +757,8 @@ def err_drv_loongarch_wrong_fpu_width_for_lasx : Error<
   "wrong fpu width; LASX depends on 64-bit FPU.">;
 def err_drv_loongarch_invalid_simd_option_combination : Error<
   "invalid option combination; LASX depends on LSX.">;
+def err_drv_loongarch_invalid_msimd_EQ : Error<
+  "invalid argument '%0' to -msimd=; must be one of: none, lsx, lasx">;
 
 def err_drv_expand_response_file : Error<
   "failed to expand response file: %0">;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 344c8bd49da77736b86d9d14a380d585412dfdfd..530bb53ea9b5e6f49768721af5e743e21b152a14 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4236,6 +4236,9 @@ def mlasx : Flag<["-"], "mlasx">, Group<m_loongarch_Features_Group>,
   HelpText<"Enable Loongson Advanced SIMD Extension (LASX).">;
 def mno_lasx : Flag<["-"], "mno-lasx">, Group<m_loongarch_Features_Group>,
   HelpText<"Disable Loongson Advanced SIMD Extension (LASX).">;
+def msimd_EQ : Joined<["-"], "msimd=">, Group<m_loongarch_Features_Group>,
+  Flags<[TargetSpecific]>,
+  HelpText<"Select the SIMD extension(s) to be enabled in LoongArch either 'none', 'lsx', 'lasx'.">;
 def mnop_mcount : Flag<["-"], "mnop-mcount">, HelpText<"Generate mcount/__fentry__ calls as nops. To activate they need to be patched in.">,
   Flags<[CC1Option]>, Group<m_Group>,
   MarshallingInfoFlag<CodeGenOpts<"MNopMCount">>;
diff --git a/clang/lib/Basic/Targets/LoongArch.cpp b/clang/lib/Basic/Targets/LoongArch.cpp
index 88537989a05129f553b89ac2e74a444e9472bea0..5fede3d7cdc4ff424818a33504916df0024d544e 100644
--- a/clang/lib/Basic/Targets/LoongArch.cpp
+++ b/clang/lib/Basic/Targets/LoongArch.cpp
@@ -200,7 +200,24 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts,
 
   // Define __loongarch_arch.
   StringRef ArchName = getCPU();
-  Builder.defineMacro("__loongarch_arch", Twine('"') + ArchName + Twine('"'));
+  if (ArchName == "loongarch64") {
+    if (HasFeatureLSX) {
+      // TODO: As more features of the V1.1 ISA are supported, a unified "v1.1"
+      // arch feature set will be used to include all sub-features belonging to
+      // the V1.1 ISA version.
+      if (HasFeatureFrecipe)
+        Builder.defineMacro("__loongarch_arch",
+                            Twine('"') + "la64v1.1" + Twine('"'));
+      else
+        Builder.defineMacro("__loongarch_arch",
+                            Twine('"') + "la64v1.0" + Twine('"'));
+    } else {
+      Builder.defineMacro("__loongarch_arch",
+                          Twine('"') + ArchName + Twine('"'));
+    }
+  } else {
+    Builder.defineMacro("__loongarch_arch", Twine('"') + ArchName + Twine('"'));
+  }
 
   // Define __loongarch_tune.
   StringRef TuneCPU = getTargetOpts().TuneCPU;
@@ -208,10 +225,16 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts,
     TuneCPU = ArchName;
   Builder.defineMacro("__loongarch_tune", Twine('"') + TuneCPU + Twine('"'));
 
-  if (HasFeatureLSX)
+  if (HasFeatureLASX) {
+    Builder.defineMacro("__loongarch_simd_width", "256");
     Builder.defineMacro("__loongarch_sx", Twine(1));
-  if (HasFeatureLASX)
     Builder.defineMacro("__loongarch_asx", Twine(1));
+  } else if (HasFeatureLSX) {
+    Builder.defineMacro("__loongarch_simd_width", "128");
+    Builder.defineMacro("__loongarch_sx", Twine(1));
+  }
+  if (HasFeatureFrecipe)
+    Builder.defineMacro("__loongarch_frecipe", Twine(1));
 
   StringRef ABI = getABI();
   if (ABI == "lp64d" || ABI == "lp64f" || ABI == "lp64s")
@@ -285,6 +308,8 @@ bool LoongArchTargetInfo::handleTargetFeatures(
       HasFeatureLSX = true;
     else if (Feature == "+lasx")
       HasFeatureLASX = true;
+    else if (Feature == "+frecipe")
+      HasFeatureFrecipe = true;
   }
   return true;
 }
diff --git a/clang/lib/Basic/Targets/LoongArch.h b/clang/lib/Basic/Targets/LoongArch.h
index 3313102492cb8dc41b484c650630d5f451992c8c..4d2965f5b3a3b32c90b4771e5585cbda1181c5d2 100644
--- a/clang/lib/Basic/Targets/LoongArch.h
+++ b/clang/lib/Basic/Targets/LoongArch.h
@@ -29,6 +29,7 @@ protected:
   bool HasFeatureF;
   bool HasFeatureLSX;
   bool HasFeatureLASX;
+  bool HasFeatureFrecipe;
 
 public:
   LoongArchTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
@@ -37,6 +38,7 @@ public:
     HasFeatureF = false;
     HasFeatureLSX = false;
     HasFeatureLASX = false;
+    HasFeatureFrecipe = false;
     LongDoubleWidth = 128;
     LongDoubleAlign = 128;
     LongDoubleFormat = &llvm::APFloat::IEEEquad();
diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index 31153a67ad284033d7a63282d093a849d8ec176a..21106c425206d8776bc232880374a30724ccab90 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -127,6 +127,11 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
                                            const llvm::Triple &Triple,
                                            const ArgList &Args,
                                            std::vector<StringRef> &Features) {
+  // Enable the `lsx` feature on 64-bit LoongArch by default.
+  if (Triple.isLoongArch64() &&
+      (!Args.hasArgNoClaim(clang::driver::options::OPT_march_EQ)))
+    Features.push_back("+lsx");
+
   std::string ArchName;
   if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
     ArchName = A->getValue();
@@ -145,9 +150,11 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     } else if (A->getOption().matches(options::OPT_msingle_float)) {
       Features.push_back("+f");
       Features.push_back("-d");
+      Features.push_back("-lsx");
     } else /*Soft-float*/ {
       Features.push_back("-f");
       Features.push_back("-d");
+      Features.push_back("-lsx");
     }
   } else if (const Arg *A = Args.getLastArg(options::OPT_mfpu_EQ)) {
     StringRef FPU = A->getValue();
@@ -157,9 +164,11 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     } else if (FPU == "32") {
       Features.push_back("+f");
       Features.push_back("-d");
+      Features.push_back("-lsx");
     } else if (FPU == "0" || FPU == "none") {
       Features.push_back("-f");
       Features.push_back("-d");
+      Features.push_back("-lsx");
     } else {
       D.Diag(diag::err_drv_loongarch_invalid_mfpu_EQ) << FPU;
     }
@@ -175,6 +184,42 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     A->ignoreTargetSpecific();
   if (Arg *A = Args.getLastArgNoClaim(options::OPT_mfpu_EQ))
     A->ignoreTargetSpecific();
+  if (Arg *A = Args.getLastArgNoClaim(options::OPT_msimd_EQ))
+    A->ignoreTargetSpecific();
+
+  // Select lsx/lasx feature determined by -msimd=.
+  // Option -msimd= precedes -m[no-]lsx and -m[no-]lasx.
+  if (const Arg *A = Args.getLastArg(options::OPT_msimd_EQ)) {
+    StringRef MSIMD = A->getValue();
+    if (MSIMD == "lsx") {
+      // Option -msimd=lsx depends on 64-bit FPU.
+      // -m*-float and -mfpu=none/0/32 conflict with -msimd=lsx.
+      if (llvm::find(Features, "-d") != Features.end())
+        D.Diag(diag::err_drv_loongarch_wrong_fpu_width_for_lsx);
+      else
+        Features.push_back("+lsx");
+    } else if (MSIMD == "lasx") {
+      // Option -msimd=lasx depends on 64-bit FPU and LSX.
+      // -m*-float, -mfpu=none/0/32 and -mno-lsx conflict with -msimd=lasx.
+      if (llvm::find(Features, "-d") != Features.end())
+        D.Diag(diag::err_drv_loongarch_wrong_fpu_width_for_lasx);
+      else if (llvm::find(Features, "-lsx") != Features.end())
+        D.Diag(diag::err_drv_loongarch_invalid_simd_option_combination);
+
+      // The command options do not contain -mno-lasx.
+      if (!Args.getLastArg(options::OPT_mno_lasx)) {
+        Features.push_back("+lsx");
+        Features.push_back("+lasx");
+      }
+    } else if (MSIMD == "none") {
+      if (llvm::find(Features, "+lsx") != Features.end())
+        Features.push_back("-lsx");
+      if (llvm::find(Features, "+lasx") != Features.end())
+        Features.push_back("-lasx");
+    } else {
+      D.Diag(diag::err_drv_loongarch_invalid_msimd_EQ) << MSIMD;
+    }
+  }
 
   // Select lsx feature determined by -m[no-]lsx.
   if (const Arg *A = Args.getLastArg(options::OPT_mlsx, options::OPT_mno_lsx)) {
@@ -198,8 +243,6 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     if (A->getOption().matches(options::OPT_mlasx)) {
       if (llvm::find(Features, "-d") != Features.end())
         D.Diag(diag::err_drv_loongarch_wrong_fpu_width_for_lasx);
-      else if (llvm::find(Features, "-lsx") != Features.end())
-        D.Diag(diag::err_drv_loongarch_invalid_simd_option_combination);
       else { /*-mlasx*/
         Features.push_back("+lsx");
         Features.push_back("+lasx");
@@ -225,8 +268,14 @@ std::string loongarch::postProcessTargetCPUString(const std::string &CPU,
 std::string loongarch::getLoongArchTargetCPU(const llvm::opt::ArgList &Args,
                                              const llvm::Triple &Triple) {
   std::string CPU;
+  std::string Arch;
   // If we have -march, use that.
-  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
-    CPU = A->getValue();
+  if (const Arg *A = Args.getLastArg(options::OPT_march_EQ)) {
+    Arch = A->getValue();
+    if (Arch == "la64v1.0" || Arch == "la64v1.1")
+      CPU = llvm::LoongArch::getDefaultArch(Triple.isLoongArch64());
+    else
+      CPU = Arch;
+  }
   return postProcessTargetCPUString(CPU, Triple);
 }
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index fac4f03d6193245aaa75891d78677b4a42e2f934..4e5f689498d69b35aba7e843b73405bf4fab71ee 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5773,18 +5773,38 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
 
   if (Arg *A = Args.getLastArg(options::OPT_mcmodel_EQ)) {
     StringRef CM = A->getValue();
-    if (CM == "small" || CM == "kernel" || CM == "medium" || CM == "large" ||
-        CM == "tiny") {
-      if (Triple.isOSAIX() && CM == "medium")
-        CmdArgs.push_back("-mcmodel=large");
-      else if (Triple.isAArch64() && (CM == "kernel" || CM == "medium"))
+    if (Triple.isLoongArch()) {
+      bool Ok = false;
+      if (CM == "extreme" &&
+          Args.hasFlagNoClaim(options::OPT_fplt, options::OPT_fno_plt, false))
+        D.Diag(diag::err_drv_argument_not_allowed_with)
+            << A->getAsString(Args) << "-fplt";
+      Ok = CM == "normal" || CM == "medium" || CM == "extreme";
+      // Convert to LLVM recognizable names.
+      if (Ok) {
+        CM = llvm::StringSwitch<StringRef>(CM)
+                 .Case("normal", "small")
+                 .Case("extreme", "large")
+                 .Default(CM);
+        CmdArgs.push_back(Args.MakeArgString("-mcmodel=" + CM));
+      } else {
         D.Diag(diag::err_drv_invalid_argument_to_option)
             << CM << A->getOption().getName();
-      else
-        A->render(Args, CmdArgs);
+      }
     } else {
-      D.Diag(diag::err_drv_invalid_argument_to_option)
-          << CM << A->getOption().getName();
+      if (CM == "small" || CM == "kernel" || CM == "medium" || CM == "large" ||
+          CM == "tiny") {
+        if (Triple.isOSAIX() && CM == "medium")
+          CmdArgs.push_back("-mcmodel=large");
+        else if (Triple.isAArch64() && (CM == "kernel" || CM == "medium"))
+          D.Diag(diag::err_drv_invalid_argument_to_option)
+              << CM << A->getOption().getName();
+        else
+          A->render(Args, CmdArgs);
+      } else {
+        D.Diag(diag::err_drv_invalid_argument_to_option)
+            << CM << A->getOption().getName();
+      }
     }
   }
 
diff --git a/clang/lib/Headers/larchintrin.h b/clang/lib/Headers/larchintrin.h
index 24dd29ce91ffb9a7f7982ad4b3310fc82d1568d2..f4218295919a0d0d633e6a7b2f1bb81f83c96eda 100644
--- a/clang/lib/Headers/larchintrin.h
+++ b/clang/lib/Headers/larchintrin.h
@@ -228,6 +228,18 @@ extern __inline void
   ((void)__builtin_loongarch_ldpte_d((long int)(_1), (_2)))
 #endif
 
+#define __frecipe_s(/*float*/ _1)                                              \
+  (float)__builtin_loongarch_frecipe_s((float)_1)
+
+#define __frecipe_d(/*double*/ _1)                                             \
+  (double)__builtin_loongarch_frecipe_d((double)_1)
+
+#define __frsqrte_s(/*float*/ _1)                                              \
+  (float)__builtin_loongarch_frsqrte_s((float)_1)
+
+#define __frsqrte_d(/*double*/ _1)                                             \
+  (double)__builtin_loongarch_frsqrte_d((double)_1)
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/clang/lib/Headers/lasxintrin.h b/clang/lib/Headers/lasxintrin.h
index 6b4d5012a24b5893024424c6613265fbbe81c830..dafc2a2f3e6a70bd1aa46b2f7998ee3f223fc55c 100644
--- a/clang/lib/Headers/lasxintrin.h
+++ b/clang/lib/Headers/lasxintrin.h
@@ -1726,6 +1726,18 @@ extern __inline
   return (__m256d)__builtin_lasx_xvfrecip_d((v4f64)_1);
 }
 
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
+    __lasx_xvfrecipe_s(__m256 _1) {
+  return (__m256)__builtin_lasx_xvfrecipe_s((v8f32)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
+    __lasx_xvfrecipe_d(__m256d _1) {
+  return (__m256d)__builtin_lasx_xvfrecipe_d((v4f64)_1);
+}
+
 extern __inline
     __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
     __lasx_xvfrint_s(__m256 _1) {
@@ -1750,6 +1762,18 @@ extern __inline
   return (__m256d)__builtin_lasx_xvfrsqrt_d((v4f64)_1);
 }
 
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
+    __lasx_xvfrsqrte_s(__m256 _1) {
+  return (__m256)__builtin_lasx_xvfrsqrte_s((v8f32)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
+    __lasx_xvfrsqrte_d(__m256d _1) {
+  return (__m256d)__builtin_lasx_xvfrsqrte_d((v4f64)_1);
+}
+
 extern __inline
     __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
     __lasx_xvflogb_s(__m256 _1) {
diff --git a/clang/lib/Headers/lsxintrin.h b/clang/lib/Headers/lsxintrin.h
index a29bc7757ab5680e733561da9700716512885f71..f347955ce6fb51303a8a70f28e229c9a8b00aec7 100644
--- a/clang/lib/Headers/lsxintrin.h
+++ b/clang/lib/Headers/lsxintrin.h
@@ -1776,6 +1776,18 @@ extern __inline
   return (__m128d)__builtin_lsx_vfrecip_d((v2f64)_1);
 }
 
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
+    __lsx_vfrecipe_s(__m128 _1) {
+  return (__m128)__builtin_lsx_vfrecipe_s((v4f32)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
+    __lsx_vfrecipe_d(__m128d _1) {
+  return (__m128d)__builtin_lsx_vfrecipe_d((v2f64)_1);
+}
+
 extern __inline
     __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
     __lsx_vfrint_s(__m128 _1) {
@@ -1800,6 +1812,18 @@ extern __inline
   return (__m128d)__builtin_lsx_vfrsqrt_d((v2f64)_1);
 }
 
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
+    __lsx_vfrsqrte_s(__m128 _1) {
+  return (__m128)__builtin_lsx_vfrsqrte_s((v4f32)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
+    __lsx_vfrsqrte_d(__m128d _1) {
+  return (__m128d)__builtin_lsx_vfrsqrte_d((v2f64)_1);
+}
+
 extern __inline
     __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
     __lsx_vflogb_s(__m128 _1) {
diff --git a/clang/test/CodeGen/LoongArch/builtin-dbl-approximate.c b/clang/test/CodeGen/LoongArch/builtin-dbl-approximate.c
new file mode 100644
index 0000000000000000000000000000000000000000..e5fe684346c00de3ace41b75f58a5732344bc577
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/builtin-dbl-approximate.c
@@ -0,0 +1,45 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple loongarch32 -target-feature +d -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +d -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+#include <larchintrin.h>
+
+// CHECK-LABEL: @frecipe_d
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call double @llvm.loongarch.frecipe.d(double [[A:%.*]])
+// CHECK-NEXT:    ret double [[TMP0]]
+//
+double frecipe_d (double _1)
+{
+  return __builtin_loongarch_frecipe_d (_1);
+}
+
+// CHECK-LABEL: @frsqrte_d
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call double @llvm.loongarch.frsqrte.d(double [[A:%.*]])
+// CHECK-NEXT:    ret double [[TMP0]]
+//
+double frsqrte_d (double _1)
+{
+  return __builtin_loongarch_frsqrte_d (_1);
+}
+
+// CHECK-LABEL: @frecipe_d_alia
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call double @llvm.loongarch.frecipe.d(double [[A:%.*]])
+// CHECK-NEXT:    ret double [[TMP0]]
+//
+double frecipe_d_alia (double _1)
+{
+  return __frecipe_d (_1);
+}
+
+// CHECK-LABEL: @frsqrte_d_alia
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call double @llvm.loongarch.frsqrte.d(double [[A:%.*]])
+// CHECK-NEXT:    ret double [[TMP0]]
+//
+double frsqrte_d_alia (double _1)
+{
+  return __frsqrte_d (_1);
+}
diff --git a/clang/test/CodeGen/LoongArch/builtin-flt-approximate.c b/clang/test/CodeGen/LoongArch/builtin-flt-approximate.c
new file mode 100644
index 0000000000000000000000000000000000000000..47bb47084364b8baf89d274813820a723b7d298c
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/builtin-flt-approximate.c
@@ -0,0 +1,45 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple loongarch32 -target-feature +f -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +f -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+#include <larchintrin.h>
+
+// CHECK-LABEL: @frecipe_s
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call float @llvm.loongarch.frecipe.s(float [[A:%.*]])
+// CHECK-NEXT:    ret float [[TMP0]]
+//
+float frecipe_s (float _1)
+{
+  return __builtin_loongarch_frecipe_s (_1);
+}
+
+// CHECK-LABEL: @frsqrte_s
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call float @llvm.loongarch.frsqrte.s(float [[A:%.*]])
+// CHECK-NEXT:    ret float [[TMP0]]
+//
+float frsqrte_s (float _1)
+{
+  return __builtin_loongarch_frsqrte_s (_1);
+}
+
+// CHECK-LABEL: @frecipe_s_alia
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call float @llvm.loongarch.frecipe.s(float [[A:%.*]])
+// CHECK-NEXT:    ret float [[TMP0]]
+//
+float frecipe_s_alia (float _1)
+{
+  return __frecipe_s (_1);
+}
+
+// CHECK-LABEL: @frsqrte_s_alia
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call float @llvm.loongarch.frsqrte.s(float [[A:%.*]])
+// CHECK-NEXT:    ret float [[TMP0]]
+//
+float frsqrte_s_alia (float _1)
+{
+  return __frsqrte_s (_1);
+}
diff --git a/clang/test/CodeGen/LoongArch/intrinsic-la64-error.c b/clang/test/CodeGen/LoongArch/intrinsic-la64-error.c
index efb3b94175cfad4e3bab9b5934058daac22a65e9..a3242dfd41e9b852da326e5488b311cdae4ec2fb 100644
--- a/clang/test/CodeGen/LoongArch/intrinsic-la64-error.c
+++ b/clang/test/CodeGen/LoongArch/intrinsic-la64-error.c
@@ -1,7 +1,28 @@
 // RUN: %clang_cc1 -triple loongarch64 -emit-llvm -S -verify %s -o /dev/null
+// RUN: not %clang_cc1 -triple loongarch64 -DFEATURE_CHECK -emit-llvm %s -o /dev/null 2>&1 \
+// RUN:   | FileCheck %s
 
 #include <larchintrin.h>
 
+#ifdef FEATURE_CHECK
+void test_feature(unsigned long *v_ul, int *v_i, float a, double b) {
+// CHECK: error: '__builtin_loongarch_cacop_w' needs target feature 32bit
+  __builtin_loongarch_cacop_w(1, v_ul[0], 1024);
+// CHECK: error: '__builtin_loongarch_movfcsr2gr' needs target feature f
+  v_i[0] = __builtin_loongarch_movfcsr2gr(1);
+// CHECK: error: '__builtin_loongarch_movgr2fcsr' needs target feature f
+  __builtin_loongarch_movgr2fcsr(1, v_i[1]);
+// CHECK: error: '__builtin_loongarch_frecipe_s' needs target feature f,frecipe
+  float f1 = __builtin_loongarch_frecipe_s(a);
+// CHECK: error: '__builtin_loongarch_frsqrte_s' needs target feature f,frecipe
+  float f2 = __builtin_loongarch_frsqrte_s(a);
+// CHECK: error: '__builtin_loongarch_frecipe_d' needs target feature d,frecipe
+  double d1 = __builtin_loongarch_frecipe_d(b);
+// CHECK: error: '__builtin_loongarch_frsqrte_d' needs target feature d,frecipe
+  double d2 = __builtin_loongarch_frsqrte_d(b);
+}
+#endif
+
 void csrrd_d(int a) {
   __builtin_loongarch_csrrd_d(16384); // expected-error {{argument value 16384 is outside the valid range [0, 16383]}}
   __builtin_loongarch_csrrd_d(-1); // expected-error {{argument value 4294967295 is outside the valid range [0, 16383]}}
diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c
new file mode 100644
index 0000000000000000000000000000000000000000..b79f939403993c87813aaa26d352356b9e9cfcf7
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c
@@ -0,0 +1,37 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +lasx -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+#include <lasxintrin.h>
+
+// CHECK-LABEL: @xvfrecipe_s(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float> [[_1]])
+// CHECK-NEXT:    store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+v8f32 xvfrecipe_s(v8f32 _1) { return __lasx_xvfrecipe_s(_1); }
+// CHECK-LABEL: @xvfrecipe_d(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double> [[_1]])
+// CHECK-NEXT:    store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+v4f64 xvfrecipe_d(v4f64 _1) { return __lasx_xvfrecipe_d(_1); }
+// CHECK-LABEL: @xvfrsqrte_s(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float> [[_1]])
+// CHECK-NEXT:    store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+v8f32 xvfrsqrte_s(v8f32 _1) { return __lasx_xvfrsqrte_s(_1); }
+// CHECK-LABEL: @xvfrsqrte_d(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double> [[_1]])
+// CHECK-NEXT:    store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+v4f64 xvfrsqrte_d(v4f64 _1) { return __lasx_xvfrsqrte_d(_1); }
diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c
new file mode 100644
index 0000000000000000000000000000000000000000..63e9ba639ea2c94bc591d0baa61fd939ce76c26d
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c
@@ -0,0 +1,38 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +lasx -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+typedef float v8f32 __attribute__((vector_size(32), aligned(32)));
+typedef double v4f64 __attribute__((vector_size(32), aligned(32)));
+
+// CHECK-LABEL: @xvfrecipe_s
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float> [[_1]])
+// CHECK-NEXT:    store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+v8f32 xvfrecipe_s(v8f32 _1) { return __builtin_lasx_xvfrecipe_s(_1); }
+// CHECK-LABEL: @xvfrecipe_d
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double> [[_1]])
+// CHECK-NEXT:    store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+v4f64 xvfrecipe_d(v4f64 _1) { return __builtin_lasx_xvfrecipe_d(_1); }
+// CHECK-LABEL: @xvfrsqrte_s
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float> [[_1]])
+// CHECK-NEXT:    store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+v8f32 xvfrsqrte_s(v8f32 _1) { return __builtin_lasx_xvfrsqrte_s(_1); }
+// CHECK-LABEL: @xvfrsqrte_d
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double> [[_1]])
+// CHECK-NEXT:    store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+v4f64 xvfrsqrte_d(v4f64 _1) { return __builtin_lasx_xvfrsqrte_d(_1); }
diff --git a/clang/test/CodeGen/LoongArch/lsx/builtin-approximate-alias.c b/clang/test/CodeGen/LoongArch/lsx/builtin-approximate-alias.c
new file mode 100644
index 0000000000000000000000000000000000000000..f26f032c878e6dfbf2779aa50d666e1e5bb482f6
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/lsx/builtin-approximate-alias.c
@@ -0,0 +1,37 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +lsx -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+#include <lsxintrin.h>
+
+// CHECK-LABEL: @vfrecipe_s(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v4f32 vfrecipe_s(v4f32 _1) { return __lsx_vfrecipe_s(_1); }
+// CHECK-LABEL: @vfrecipe_d(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrecipe.d(<2 x double> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v2f64 vfrecipe_d(v2f64 _1) { return __lsx_vfrecipe_d(_1); }
+// CHECK-LABEL: @vfrsqrte_s(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v4f32 vfrsqrte_s(v4f32 _1) { return __lsx_vfrsqrte_s(_1); }
+// CHECK-LABEL: @vfrsqrte_d(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrsqrte.d(<2 x double> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v2f64 vfrsqrte_d(v2f64 _1) { return __lsx_vfrsqrte_d(_1); }
diff --git a/clang/test/CodeGen/LoongArch/lsx/builtin-approximate.c b/clang/test/CodeGen/LoongArch/lsx/builtin-approximate.c
new file mode 100644
index 0000000000000000000000000000000000000000..39fa1663db349b0cba038cc3b6d31ebe76608573
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/lsx/builtin-approximate.c
@@ -0,0 +1,38 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +lsx -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+typedef float v4f32 __attribute__ ((vector_size(16), aligned(16)));
+typedef double v2f64 __attribute__ ((vector_size(16), aligned(16)));
+
+// CHECK-LABEL: @vfrecipe_s
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v4f32 vfrecipe_s (v4f32 _1) { return __builtin_lsx_vfrecipe_s (_1); }
+// CHECK-LABEL: @vfrecipe_d
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrecipe.d(<2 x double> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v2f64 vfrecipe_d (v2f64 _1) { return __builtin_lsx_vfrecipe_d (_1); }
+// CHECK-LABEL: @vfrsqrte_s
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v4f32 vfrsqrte_s (v4f32 _1) { return __builtin_lsx_vfrsqrte_s (_1); }
+// CHECK-LABEL: @vfrsqrte_d
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrsqrte.d(<2 x double> [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
+v2f64 vfrsqrte_d (v2f64 _1) { return __builtin_lsx_vfrsqrte_d (_1); }
diff --git a/clang/test/Driver/loongarch-default-features.c b/clang/test/Driver/loongarch-default-features.c
index 3cdf3ba3d23e14407edae2dc21f35bd1599a4fb7..90634bbcf00358e86e5a93b23235ab0bf36724cd 100644
--- a/clang/test/Driver/loongarch-default-features.c
+++ b/clang/test/Driver/loongarch-default-features.c
@@ -2,7 +2,7 @@
 // RUN: %clang --target=loongarch64 -S -emit-llvm %s -o - | FileCheck %s --check-prefix=LA64
 
 // LA32: "target-features"="+32bit"
-// LA64: "target-features"="+64bit,+d,+f,+ual"
+// LA64: "target-features"="+64bit,+d,+f,+lsx,+ual"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Driver/loongarch-march.c b/clang/test/Driver/loongarch-march.c
index 9214130cd034fd58164b4860c5199d198f21f577..2d5b315d962a1e9ca669bab92a6a0ef2c6f9726e 100644
--- a/clang/test/Driver/loongarch-march.c
+++ b/clang/test/Driver/loongarch-march.c
@@ -2,10 +2,22 @@
 // RUN:   FileCheck %s --check-prefix=CC1-LOONGARCH64
 // RUN: %clang --target=loongarch64 -march=la464 -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-LA464
+// RUN: %clang --target=loongarch64 -march=la64v1.0 -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CC1-LA64V1P0
+// RUN: %clang --target=loongarch64 -march=la64v1.1 -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CC1-LA64V1P1
+// RUN: %clang --target=loongarch64 -march=la664 -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CC1-LA664
 // RUN: %clang --target=loongarch64 -march=loongarch64 -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LOONGARCH64
 // RUN: %clang --target=loongarch64 -march=la464 -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LA464
+// RUN: %clang --target=loongarch64 -march=la64v1.0 -S -emit-llvm %s -o - | \
+// RUN:   FileCheck %s --check-prefix=IR-LA64V1P0
+// RUN: %clang --target=loongarch64 -march=la64v1.1 -S -emit-llvm %s -o - | \
+// RUN:   FileCheck %s --check-prefix=IR-LA64V1P1
+// RUN: %clang --target=loongarch64 -march=la664 -S -emit-llvm %s -o - | \
+// RUN:   FileCheck %s --check-prefix=IR-LA664
 
 // CC1-LOONGARCH64: "-target-cpu" "loongarch64"
 // CC1-LOONGARCH64-NOT: "-target-feature"
@@ -19,8 +31,29 @@
 // CC1-LA464-NOT: "-target-feature"
 // CC1-LA464: "-target-abi" "lp64d"
 
+// CC1-LA64V1P0: "-target-cpu" "loongarch64"
+// CC1-LA64V1P0-NOT: "-target-feature"
+// CC1-LA64V1P0: "-target-feature" "+64bit" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+ual"
+// CC1-LA64V1P0-NOT: "-target-feature"
+// CC1-LA64V1P0: "-target-abi" "lp64d"
+
+// CC1-LA64V1P1: "-target-cpu" "loongarch64"
+// CC1-LA64V1P1-NOT: "-target-feature"
+// CC1-LA64V1P1: "-target-feature" "+64bit" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+ual" "-target-feature" "+frecipe"
+// CC1-LA64V1P1-NOT: "-target-feature"
+// CC1-LA64V1P1: "-target-abi" "lp64d"
+
+// CC1-LA664: "-target-cpu" "la664"
+// CC1-LA664-NOT: "-target-feature"
+// CC1-LA664: "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+lasx" "-target-feature" "+ual" "-target-feature" "+frecipe"
+// CC1-LA664-NOT: "-target-feature"
+// CC1-LA664: "-target-abi" "lp64d"
+
 // IR-LOONGARCH64: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+f,+ual"
 // IR-LA464: attributes #[[#]] ={{.*}}"target-cpu"="la464" {{.*}}"target-features"="+64bit,+d,+f,+lasx,+lsx,+ual"
+// IR-LA64V1P0: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+lsx,+ual"
+// IR-LA64V1P1: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+frecipe,+lsx,+ual"
+// IR-LA664: attributes #[[#]] ={{.*}}"target-cpu"="la664" {{.*}}"target-features"="+64bit,+d,+f,+frecipe,+lasx,+lsx,+ual"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Driver/loongarch-mlasx.c b/clang/test/Driver/loongarch-mlasx.c
index 0b934f125c9e462b5d7176367deb7ee90acd8b87..87634ff5a9a40b3063cf75473f2caa97f1f6bddb 100644
--- a/clang/test/Driver/loongarch-mlasx.c
+++ b/clang/test/Driver/loongarch-mlasx.c
@@ -5,7 +5,7 @@
 // RUN: %clang --target=loongarch64 -mno-lasx -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-NOLASX
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -fsyntax-only %s -### 2>&1 | \
-// RUN:   FileCheck %s --check-prefix=CC1-NOLASX
+// RUN:   FileCheck %s --check-prefix=CC1-LSX
 // RUN: %clang --target=loongarch64 -mno-lasx -mlasx -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1-LASX
 // RUN: %clang --target=loongarch64 -mlsx -mlasx -fsyntax-only %s -### 2>&1 | \
@@ -18,7 +18,7 @@
 // RUN: %clang --target=loongarch64 -mno-lasx -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-NOLASX
 // RUN: %clang --target=loongarch64 -mlasx -mno-lasx -S -emit-llvm %s -o - | \
-// RUN:   FileCheck %s --check-prefix=IR-NOLASX
+// RUN:   FileCheck %s --check-prefix=IR-LSX
 // RUN: %clang --target=loongarch64 -mno-lasx -mlasx -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LASX
 // RUN: %clang --target=loongarch64 -mlsx -mlasx -S -emit-llvm %s -o - | \
@@ -26,9 +26,11 @@
 // RUN: %clang --target=loongarch64 -mlasx -mlsx -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-LASX
 
+// CC1-LSX: "-target-feature" "+lsx"
 // CC1-LASX: "-target-feature" "+lsx" "-target-feature" "+lasx"
 // CC1-NOLASX: "-target-feature" "-lasx"
 
+// IR-LSX: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+lsx{{(,.*)?}}"
 // IR-LASX: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+lasx{{(,.*)?}}"
 // IR-NOLASX: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-lasx{{(,.*)?}}"
 
diff --git a/clang/test/Driver/loongarch-msimd.c b/clang/test/Driver/loongarch-msimd.c
new file mode 100644
index 0000000000000000000000000000000000000000..49d298e1b2e3f08990825f6e5f92556103293e5f
--- /dev/null
+++ b/clang/test/Driver/loongarch-msimd.c
@@ -0,0 +1,95 @@
+/// Test -msimd options.
+
+/// COM: -msimd=none
+// RUN: %clang --target=loongarch64 -mlasx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,LASX
+// RUN: %clang --target=loongarch64 -mlasx -mlsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,LASX
+
+// RUN: %clang --target=loongarch64 -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlsx -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -mlsx -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -mlsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlsx -msimd=none -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+
+
+/// COM: -msimd=lsx
+// RUN: %clang --target=loongarch64 -mlasx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,LASX
+// RUN: %clang --target=loongarch64 -mlasx -mlsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,LASX
+
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlsx -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -mlsx -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -mno-lsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=NOLSX,NOLASX
+
+// RUN: %clang --target=loongarch64 -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -mlsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+
+
+/// COM: -msimd=lasx
+// RUN: %clang --target=loongarch64 -msimd=lasx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,LASX
+// RUN: %clang --target=loongarch64 -mlasx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,LASX
+// RUN: %clang --target=loongarch64 -mlasx -mlsx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,LASX
+// RUN: %clang --target=loongarch64 -mlsx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,LASX
+
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mno-lasx -mlsx -msimd=lasx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+// RUN: %clang --target=loongarch64 -mlasx -mno-lasx -mlsx -msimd=lsx -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=LSX,NOLASX
+
+
+// NOLSX-NOT: "-target-feature" "+lsx"
+// NOLASX-NOT: "-target-feature" "+lasx"
+// LSX-DAG: "-target-feature" "+lsx"
+// LASX-DAG: "-target-feature" "+lasx"
+// NOLSX-NOT: "-target-feature" "+lsx"
+// NOLASX-NOT: "-target-feature" "+lasx"
diff --git a/clang/test/Driver/loongarch-msingle-float.c b/clang/test/Driver/loongarch-msingle-float.c
index bd9b3e8a8c019d02ae154f5791192061909bd389..4eb0865b53a59e04f21b53dccb71c96fb69a5840 100644
--- a/clang/test/Driver/loongarch-msingle-float.c
+++ b/clang/test/Driver/loongarch-msingle-float.c
@@ -11,10 +11,10 @@
 // WARN: warning: ignoring '-mabi=lp64s' as it conflicts with that implied by '-msingle-float' (lp64f)
 // WARN: warning: ignoring '-mfpu=64' as it conflicts with that implied by '-msingle-float' (32)
 
-// CC1: "-target-feature" "+f"{{.*}} "-target-feature" "-d"
+// CC1: "-target-feature" "+f"{{.*}} "-target-feature" "-d" "-target-feature" "-lsx"
 // CC1: "-target-abi" "lp64f"
 
-// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+f,{{(.*,)?}}-d"
+// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+f,{{(.*,)?}}-d,-lsx"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Driver/loongarch-msoft-float.c b/clang/test/Driver/loongarch-msoft-float.c
index 0e5121ac84b4c1ca51eee930b8fea47a28f4ccdb..ebf27fb00e309ed136227eb9d2ffda31ddcbf487 100644
--- a/clang/test/Driver/loongarch-msoft-float.c
+++ b/clang/test/Driver/loongarch-msoft-float.c
@@ -11,10 +11,10 @@
 // WARN: warning: ignoring '-mabi=lp64d' as it conflicts with that implied by '-msoft-float' (lp64s)
 // WARN: warning: ignoring '-mfpu=64' as it conflicts with that implied by '-msoft-float' (0)
 
-// CC1: "-target-feature" "-f"{{.*}} "-target-feature" "-d"
+// CC1: "-target-feature" "-f"{{.*}} "-target-feature" "-d" "-target-feature" "-lsx"
 // CC1: "-target-abi" "lp64s"
 
-// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-d,{{(.*,)?}}-f{{(,.*)?}}"
+// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-d,{{(.*,)?}}-f,-lsx"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Driver/loongarch-mtune.c b/clang/test/Driver/loongarch-mtune.c
index 6f3f39e9bbd86a351d161bc2bc213783a612829f..face12e1a1a82a70dedc7540c720f79524121fdb 100644
--- a/clang/test/Driver/loongarch-mtune.c
+++ b/clang/test/Driver/loongarch-mtune.c
@@ -8,6 +8,11 @@
 // RUN: %clang --target=loongarch64 -mtune=la464 -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IRATTR -DCPU=la464
 
+// RUN: %clang --target=loongarch64 -mtune=la664 -fsyntax-only %s -### 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CC1ARG -DCPU=la664
+// RUN: %clang --target=loongarch64 -mtune=la664 -S -emit-llvm %s -o - | \
+// RUN:   FileCheck %s --check-prefix=IRATTR -DCPU=la664
+
 // RUN: %clang --target=loongarch64 -mtune=invalidcpu -fsyntax-only %s -### 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CC1ARG -DCPU=invalidcpu
 // RUN: not %clang --target=loongarch64 -mtune=invalidcpu -S -emit-llvm %s -o /dev/null 2>&1 | \
diff --git a/clang/test/Driver/mcmodel.c b/clang/test/Driver/mcmodel.c
index 63b4320361592ab7967aaf87b5582953ac60c7e1..4aada126cf06949a56918f83f17d2806e2596047 100644
--- a/clang/test/Driver/mcmodel.c
+++ b/clang/test/Driver/mcmodel.c
@@ -8,6 +8,14 @@
 // RUN: not %clang -c -mcmodel=lager %s 2>&1 | FileCheck --check-prefix=INVALID %s
 // RUN: not %clang -c --target=aarch64 -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=AARCH64-MEDIUM %s
 // RUN: not %clang -c --target=aarch64 -mcmodel=kernel %s 2>&1 | FileCheck --check-prefix=AARCH64-KERNEL %s
+// RUN: %clang --target=loongarch64 -### -S -mcmodel=normal %s 2>&1 | FileCheck --check-prefix=SMALL %s
+// RUN: %clang --target=loongarch64 -### -S -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=MEDIUM %s
+// RUN: %clang --target=loongarch64 -### -S -mcmodel=extreme %s 2>&1 | FileCheck --check-prefix=LARGE %s
+// RUN: not %clang -c --target=loongarch64 -mcmodel=tiny %s 2>&1 | FileCheck --check-prefix=ERR-LOONGARCH64-TINY %s
+// RUN: not %clang -c --target=loongarch64 -mcmodel=small %s 2>&1 | FileCheck --check-prefix=ERR-LOONGARCH64-SMALL %s
+// RUN: not %clang -c --target=loongarch64 -mcmodel=kernel %s 2>&1 | FileCheck --check-prefix=ERR-LOONGARCH64-KERNEL %s
+// RUN: not %clang -c --target=loongarch64 -mcmodel=large %s 2>&1 | FileCheck --check-prefix=ERR-LOONGARCH64-LARGE %s
+// RUN: not %clang -c --target=loongarch64 -mcmodel=extreme -fplt %s 2>&1 | FileCheck --check-prefix=ERR-LOONGARCH64-PLT-EXTREME %s
 
 // TINY: "-mcmodel=tiny"
 // SMALL: "-mcmodel=small"
@@ -20,3 +28,10 @@
 
 // AARCH64-MEDIUM: error: invalid argument 'medium' to -mcmodel=
 // AARCH64-KERNEL: error: invalid argument 'kernel' to -mcmodel=
+
+// ERR-LOONGARCH64-TINY:   error: invalid argument 'tiny' to -mcmodel=
+// ERR-LOONGARCH64-SMALL:  error: invalid argument 'small' to -mcmodel=
+// ERR-LOONGARCH64-KERNEL: error: invalid argument 'kernel' to -mcmodel=
+// ERR-LOONGARCH64-LARGE:  error: invalid argument 'large' to -mcmodel=
+
+// ERR-LOONGARCH64-PLT-EXTREME: error: invalid argument '-mcmodel=extreme' not allowed with '-fplt'
diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c
index e235a728302153a62617eae9c1eb17e87347fe34..887b6d6af7e1bf50713be1f169499b837b88ad92 100644
--- a/clang/test/Preprocessor/init-loongarch.c
+++ b/clang/test/Preprocessor/init-loongarch.c
@@ -788,24 +788,51 @@
 // LA64-FPU0-LP64S-NOT: #define __loongarch_single_float
 // LA64-FPU0-LP64S: #define __loongarch_soft_float 1
 
-/// Check __loongarch_arch and __loongarch_tune.
+/// Check __loongarch_arch{_tune/_frecipe}.
 
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la464 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la464 -DTUNE=la464 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -mtune=loongarch64 | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -mtune=la464 | \
-// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la464 %s
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=la464 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -mtune=la464 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la464 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la464 -mtune=loongarch64 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la464 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang -lsx | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang +frecipe | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la64v1.1 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 | \
+// RUN:   FileCheck --match-full-lines  --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la64v1.1 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -frecipe | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -lsx | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +frecipe | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx -Xclang -target-feature -Xclang +frecipe | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la64v1.1 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la664 | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la664 -DTUNE=la664 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -mtune=la664 | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=la664 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -mtune=la664 | \
+// RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la664 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la664 -mtune=loongarch64 | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=la664 -DTUNE=loongarch64 %s
 
 // ARCH-TUNE: #define __loongarch_arch "[[ARCH]]"
+// FRECIPE: #define __loongarch_frecipe 1
 // ARCH-TUNE: #define __loongarch_tune "[[TUNE]]"
 
 // RUN: %clang --target=loongarch64 -mlsx -x c -E -dM %s -o - \
@@ -814,20 +841,24 @@
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
 // RUN: %clang --target=loongarch64 -mlsx -mno-lasx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
+// RUN: %clang --target=loongarch64 -mno-lasx -x c -E -dM %s -o - \
+// RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
 // RUN: %clang --target=loongarch64 -mno-lasx -mlsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLSX %s
 // MLSX-NOT: #define __loongarch_asx
+// MLSX: #define __loongarch_simd_width 128
 // MLSX: #define __loongarch_sx 1
 
 // RUN: %clang --target=loongarch64 -mlasx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
-// RUN: %clang --target=loongarch64 -mno-lasx -mlasx -x c -E -dM %s -o - \
-// RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
 // RUN: %clang --target=loongarch64 -mlsx -mlasx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
 // RUN: %clang --target=loongarch64 -mlasx -mlsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
+// RUN: %clang --target=loongarch64 -mno-lasx -mlasx -x c -E -dM %s -o - \
+// RUN:   | FileCheck --match-full-lines --check-prefix=MLASX %s
 // MLASX: #define __loongarch_asx 1
+// MLASX: #define __loongarch_simd_width 256
 // MLASX: #define __loongarch_sx 1
 
 // RUN: %clang --target=loongarch64 -mno-lsx -x c -E -dM %s -o - \
@@ -838,7 +869,6 @@
 // RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
 // RUN: %clang --target=loongarch64 -mno-lasx -mno-lsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
-// RUN: %clang --target=loongarch64 -mno-lasx -x c -E -dM %s -o - \
-// RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
 // MNO-LSX-NOT: #define __loongarch_asx
+// MNO-LSX-NOT: #define __loongarch_simd_width
 // MNO-LSX-NOT: #define __loongarch_sx
diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index 160fab4aeba9a8da8ed95add75e401d49b701d2d..19147a0f6df69e35e61951d99855f08ba4186e6c 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -85,90 +85,33 @@ static uint64_t getLoongArchPage(uint64_t p) {
 static uint32_t lo12(uint32_t val) { return val & 0xfff; }
 
 // Calculate the adjusted page delta between dest and PC.
-uint64_t elf::getLoongArchPageDelta(uint64_t dest, uint64_t pc) {
-  // Consider the large code model access pattern, of which the smaller code
-  // models' access patterns are a subset:
-  //
-  //     pcalau12i       U, %foo_hi20(sym)        ; b in [-0x80000, 0x7ffff]
-  //     addi.d          T, zero, %foo_lo12(sym)  ; a in [-0x800, 0x7ff]
-  //     lu32i.d         T, %foo64_lo20(sym)      ; c in [-0x80000, 0x7ffff]
-  //     lu52i.d         T, T, %foo64_hi12(sym)   ; d in [-0x800, 0x7ff]
-  //     {ldx,stx,add}.* dest, U, T
-  //
-  // Let page(pc) = 0xRRR'QQQQQ'PPPPP'000 and dest = 0xZZZ'YYYYY'XXXXX'AAA,
-  // with RQ, P, ZY, X and A representing the respective bitfields as unsigned
-  // integers. We have:
-  //
-  //     page(dest) = 0xZZZ'YYYYY'XXXXX'000
-  //     - page(pc) = 0xRRR'QQQQQ'PPPPP'000
-  //     ----------------------------------
-  //                  0xddd'ccccc'bbbbb'000
-  //
-  // Now consider the above pattern's actual effects:
-  //
-  //     page(pc)                     0xRRR'QQQQQ'PPPPP'000
-  //     pcalau12i                  + 0xiii'iiiii'bbbbb'000
-  //     addi                       + 0xjjj'jjjjj'kkkkk'AAA
-  //     lu32i.d & lu52i.d          + 0xddd'ccccc'00000'000
-  //     --------------------------------------------------
-  //     dest = U + T
-  //          = ((RQ<<32) + (P<<12) + i + (b<<12)) + (j + k + A + (cd<<32))
-  //          = (((RQ+cd)<<32) + i + j) + (((P+b)<<12) + k) + A
-  //          = (ZY<<32)                + (X<<12)           + A
-  //
-  //     ZY<<32 = (RQ<<32)+(cd<<32)+i+j, X<<12 = (P<<12)+(b<<12)+k
-  //     cd<<32 = (ZY<<32)-(RQ<<32)-i-j, b<<12 = (X<<12)-(P<<12)-k
-  //
-  // where i and k are terms representing the effect of b's and A's sign
-  // extension respectively.
-  //
-  //     i = signed b < 0 ? -0x10000'0000 : 0
-  //     k = signed A < 0 ? -0x1000 : 0
-  //
-  // The j term is a bit complex: it represents the higher half of
-  // sign-extended bits from A that are effectively lost if i == 0 but k != 0,
-  // due to overwriting by lu32i.d & lu52i.d.
-  //
-  //     j = signed A < 0 && signed b >= 0 ? 0x10000'0000 : 0
-  //
-  // The actual effect of the instruction sequence before the final addition,
-  // i.e. our desired result value, is thus:
-  //
-  //     result = (cd<<32) + (b<<12)
-  //            = (ZY<<32)-(RQ<<32)-i-j + (X<<12)-(P<<12)-k
-  //            = ((ZY<<32)+(X<<12)) - ((RQ<<32)+(P<<12)) - i - j - k
-  //            = page(dest) - page(pc) - i - j - k
-  //
-  // when signed A >= 0 && signed b >= 0:
-  //
-  //     i = j = k = 0
-  //     result = page(dest) - page(pc)
-  //
-  // when signed A >= 0 && signed b < 0:
-  //
-  //     i = -0x10000'0000, j = k = 0
-  //     result = page(dest) - page(pc) + 0x10000'0000
-  //
-  // when signed A < 0 && signed b >= 0:
-  //
-  //     i = 0, j = 0x10000'0000, k = -0x1000
-  //     result = page(dest) - page(pc) - 0x10000'0000 + 0x1000
-  //
-  // when signed A < 0 && signed b < 0:
-  //
-  //     i = -0x10000'0000, j = 0, k = -0x1000
-  //     result = page(dest) - page(pc) + 0x1000
-  uint64_t result = getLoongArchPage(dest) - getLoongArchPage(pc);
-  bool negativeA = lo12(dest) > 0x7ff;
-  bool negativeB = (result & 0x8000'0000) != 0;
-
-  if (negativeA)
-    result += 0x1000;
-  if (negativeA && !negativeB)
-    result -= 0x10000'0000;
-  else if (!negativeA && negativeB)
-    result += 0x10000'0000;
-
+uint64_t elf::getLoongArchPageDelta(uint64_t dest, uint64_t pc, RelType type) {
+  // Note that if the sequence being relocated is `pcalau12i + addi.d + lu32i.d
+  // + lu52i.d`, they must be adjancent so that we can infer the PC of
+  // `pcalau12i` when calculating the page delta for the other two instructions
+  // (lu32i.d and lu52i.d). Compensate all the sign-extensions is a bit
+  // complicated. Just use psABI recommended algorithm.
+  uint64_t pcalau12i_pc;
+  switch (type) {
+  case R_LARCH_PCALA64_LO20:
+  case R_LARCH_GOT64_PC_LO20:
+  case R_LARCH_TLS_IE64_PC_LO20:
+    pcalau12i_pc = pc - 8;
+    break;
+  case R_LARCH_PCALA64_HI12:
+  case R_LARCH_GOT64_PC_HI12:
+  case R_LARCH_TLS_IE64_PC_HI12:
+    pcalau12i_pc = pc - 12;
+    break;
+  default:
+    pcalau12i_pc = pc;
+    break;
+  }
+  uint64_t result = getLoongArchPage(dest) - getLoongArchPage(pcalau12i_pc);
+  if (dest & 0x800)
+    result += 0x1000 - 0x1'0000'0000;
+  if (result & 0x8000'0000)
+    result += 0x1'0000'0000;
   return result;
 }
 
@@ -479,6 +422,7 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s,
   case R_LARCH_B16:
   case R_LARCH_B21:
   case R_LARCH_B26:
+  case R_LARCH_CALL36:
     return R_PLT_PC;
   case R_LARCH_GOT_PC_HI20:
   case R_LARCH_GOT64_PC_LO20:
@@ -607,6 +551,25 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel,
     write32le(loc, setD10k16(read32le(loc), val >> 2));
     return;
 
+  case R_LARCH_CALL36: {
+    // This relocation is designed for adjancent pcaddu18i+jirl pairs that
+    // are patched in one time. Because of sign extension of these insns'
+    // immediate fields, the relocation range is [-128G - 0x20000, +128G -
+    // 0x20000) (of course must be 4-byte aligned).
+    if (((int64_t)val + 0x20000) != llvm::SignExtend64(val + 0x20000, 38))
+      reportRangeError(loc, rel, Twine(val), llvm::minIntN(38) - 0x20000,
+                       llvm::maxIntN(38) - 0x20000);
+    checkAlignment(loc, val, 4, rel);
+    // Since jirl performs sign extension on the offset immediate, adds (1<<17)
+    // to original val to get the correct hi20.
+    uint32_t hi20 = extractBits(val + (1 << 17), 37, 18);
+    // Despite the name, the lower part is actually 18 bits with 4-byte aligned.
+    uint32_t lo16 = extractBits(val, 17, 2);
+    write32le(loc, setJ20(read32le(loc), hi20));
+    write32le(loc + 4, setK16(read32le(loc + 4), lo16));
+    return;
+  }
+
   // Relocs intended for `addi`, `ld` or `st`.
   case R_LARCH_PCALA_LO12:
     // We have to again inspect the insn word to handle the R_LARCH_PCALA_LO12
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index b178d82407e30703427804f9d6d14016540284f6..44444b62251da5426ec72ff26080628c16a42f00 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -712,8 +712,8 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
     return sym.getGotVA() + a - p;
   case R_LOONGARCH_GOT_PAGE_PC:
     if (sym.hasFlag(NEEDS_TLSGD))
-      return getLoongArchPageDelta(in.got->getGlobalDynAddr(sym) + a, p);
-    return getLoongArchPageDelta(sym.getGotVA() + a, p);
+      return getLoongArchPageDelta(in.got->getGlobalDynAddr(sym) + a, p, type);
+    return getLoongArchPageDelta(sym.getGotVA() + a, p, type);
   case R_MIPS_GOTREL:
     return sym.getVA(a) - in.mipsGot->getGp(file);
   case R_MIPS_GOT_GP:
@@ -763,7 +763,7 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
     return 0;
   }
   case R_LOONGARCH_PAGE_PC:
-    return getLoongArchPageDelta(sym.getVA(a), p);
+    return getLoongArchPageDelta(sym.getVA(a), p, type);
   case R_PC:
   case R_ARM_PCA: {
     uint64_t dest;
@@ -798,7 +798,7 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
   case R_PPC64_CALL_PLT:
     return sym.getPltVA() + a - p;
   case R_LOONGARCH_PLT_PAGE_PC:
-    return getLoongArchPageDelta(sym.getPltVA() + a, p);
+    return getLoongArchPageDelta(sym.getPltVA() + a, p, type);
   case R_PLT_GOTPLT:
     return sym.getPltVA() + a - in.gotPlt->getVA();
   case R_PPC32_PLTREL:
@@ -860,7 +860,7 @@ uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type,
   case R_TLSGD_PC:
     return in.got->getGlobalDynAddr(sym) + a - p;
   case R_LOONGARCH_TLSGD_PAGE_PC:
-    return getLoongArchPageDelta(in.got->getGlobalDynAddr(sym) + a, p);
+    return getLoongArchPageDelta(in.got->getGlobalDynAddr(sym) + a, p, type);
   case R_TLSLD_GOTPLT:
     return in.got->getVA() + in.got->getTlsIndexOff() + a - in.gotPlt->getVA();
   case R_TLSLD_GOT:
diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h
index bf831afa179305e4f2b8a5d84b333871b80857d6..aeabe47f92a10b40538f06eeb9e66ff2ed3deff5 100644
--- a/lld/ELF/Target.h
+++ b/lld/ELF/Target.h
@@ -229,7 +229,7 @@ void addPPC64SaveRestore();
 uint64_t getPPC64TocBase();
 uint64_t getAArch64Page(uint64_t expr);
 template <typename ELFT> void writeARMCmseImportLib();
-uint64_t getLoongArchPageDelta(uint64_t dest, uint64_t pc);
+uint64_t getLoongArchPageDelta(uint64_t dest, uint64_t pc, RelType type);
 void riscvFinalizeRelax(int passes);
 void mergeRISCVAttributesSections();
 void addArmInputSectionMappingSymbols();
diff --git a/lld/test/ELF/loongarch-call36.s b/lld/test/ELF/loongarch-call36.s
new file mode 100644
index 0000000000000000000000000000000000000000..b593fdf1f6045e21cfa7d61f668dea2e06efbca9
--- /dev/null
+++ b/lld/test/ELF/loongarch-call36.s
@@ -0,0 +1,69 @@
+# REQUIRES: loongarch
+
+# RUN: rm -rf %t && split-file %s %t
+# RUN: llvm-mc --filetype=obj --triple=loongarch64-unknown-elf %t/a.s -o %t/a.o
+
+# RUN: ld.lld %t/a.o --section-start=.text=0x20010 --section-start=.sec.foo=0x60020 -o %t/exe1
+# RUN: llvm-objdump --no-show-raw-insn -d %t/exe1 | FileCheck --match-full-lines %s --check-prefix=EXE1
+## hi20 = target - pc + (1 << 17) >> 18 = 0x60020 - 0x20010 + 0x20000 >> 18 = 1
+## lo18 = target - pc & (1 << 18) - 1 = 0x60020 - 0x20010 & 0x3ffff = 16
+# EXE1:      20010: pcaddu18i $t0, 1
+# EXE1-NEXT: 20014: jirl $zero, $t0, 16
+
+# RUN: ld.lld %t/a.o --section-start=.text=0x20010 --section-start=.sec.foo=0x40020 -o %t/exe2
+# RUN: llvm-objdump --no-show-raw-insn -d %t/exe2 | FileCheck --match-full-lines %s --check-prefix=EXE2
+## hi20 = target - pc + (1 << 17) >> 18 = 0x40020 - 0x20010 + 0x20000 >> 18 = 1
+## lo18 = target - pc & (1 << 18) - 1 = 0x40020 - 0x20010 & 0x3ffff = -131056
+# EXE2:      20010: pcaddu18i $t0, 1
+# EXE2-NEXT: 20014: jirl $zero, $t0, -131056
+
+# RUN: ld.lld %t/a.o -shared -T %t/a.t -o %t/a.so
+# RUN: llvm-readelf -x .got.plt %t/a.so | FileCheck --check-prefix=GOTPLT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/a.so | FileCheck --check-prefix=SO %s
+## PLT should be present in this case.
+# SO:    Disassembly of section .plt:
+# SO:    <.plt>:
+##       foo@plt:
+# SO:    1234520:  pcaddu12i $t3, 64{{$}}
+# SO-NEXT:         ld.d $t3, $t3, 544{{$}}
+# SO-NEXT:         jirl $t1, $t3, 0
+# SO-NEXT:         nop
+
+# SO:   Disassembly of section .text:
+# SO:   <_start>:
+## hi20 = foo@plt - pc + (1 << 17) >> 18 = 0x1234520 - 0x1274670 + 0x20000 >> 18 = -1
+## lo18 = foo@plt - pc & (1 << 18) - 1 = 0x1234520 - 0x1274670 & 0x3ffff = -336
+# SO-NEXT: pcaddu18i $t0, -1{{$}}
+# SO-NEXT: jirl $zero, $t0, -336{{$}}
+
+# GOTPLT:      section '.got.plt':
+# GOTPLT-NEXT: 0x01274730 00000000 00000000 00000000 00000000
+# GOTPLT-NEXT: 0x01274740 00452301 00000000
+
+# RUN: not ld.lld %t/a.o --section-start=.text=0x20000 --section-start=.sec.foo=0x2000020000 -o /dev/null 2>&1 | \
+# RUN:   FileCheck -DFILE=%t/a.o --check-prefix=ERROR-RANGE %s
+# ERROR-RANGE: error: [[FILE]]:(.text+0x0): relocation R_LARCH_CALL36 out of range: 137438953472 is not in [-137439084544, 137438822399]; references 'foo'
+
+## Impossible case in reality becasue all LoongArch instructions are fixed 4-bytes long.
+# RUN: not ld.lld %t/a.o --section-start=.text=0x20000 --section-start=.sec.foo=0x40001 -o /dev/null 2>&1 | \
+# RUN:   FileCheck -DFILE=%t/a.o --check-prefix=ERROR-ALIGN %s
+# ERROR-ALIGN: error: [[FILE]]:(.text+0x0): improper alignment for relocation R_LARCH_CALL36: 0x20001 is not aligned to 4 bytes
+
+#--- a.t
+SECTIONS {
+ .plt   0x1234500: { *(.plt) }
+ .text  0x1274670: { *(.text) }
+}
+
+#--- a.s
+.text
+.global _start
+_start:
+  .reloc ., R_LARCH_CALL36, foo
+  pcaddu18i $t0, 0
+  jirl      $zero, $t0, 0
+
+.section .sec.foo,"awx"
+.global foo
+foo:
+  ret
diff --git a/lld/test/ELF/loongarch-pc-aligned.s b/lld/test/ELF/loongarch-pc-aligned.s
index 9df3492d18772ba59901e1351b9c669090a8581b..0405961e5f74ec236cf58b6acb2b5c66218173c7 100644
--- a/lld/test/ELF/loongarch-pc-aligned.s
+++ b/lld/test/ELF/loongarch-pc-aligned.s
@@ -75,8 +75,8 @@
 ## %pc64_hi12 = 0x444 = 1092
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x4443333334567111 --section-start=.text=0x0000000012345678 -o %t/extreme0
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme0 | FileCheck %s --check-prefix=EXTREME0
-# EXTREME0:      addi.d $t0, $zero, 273
-# EXTREME0-NEXT: pcalau12i $t1, 139810
+# EXTREME0:      pcalau12i $t1, 139810
+# EXTREME0-NEXT: addi.d $t0, $zero, 273
 # EXTREME0-NEXT: lu32i.d   $t0, 209715
 # EXTREME0-NEXT: lu52i.d   $t0, $t0, 1092
 
@@ -87,8 +87,8 @@
 ## %pc64_hi12 = 0x444 = 1092
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x4443333334567888 --section-start=.text=0x0000000012345678 -o %t/extreme1
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme1 | FileCheck %s --check-prefix=EXTREME1
-# EXTREME1:      addi.d $t0, $zero, -1912
-# EXTREME1-NEXT: pcalau12i $t1, 139811
+# EXTREME1:      pcalau12i $t1, 139811
+# EXTREME1-NEXT: addi.d $t0, $zero, -1912
 # EXTREME1-NEXT: lu32i.d   $t0, 209714
 # EXTREME1-NEXT: lu52i.d   $t0, $t0, 1092
 
@@ -99,8 +99,8 @@
 ## %pc64_hi12 = 0x444 = 1092
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x44433333abcde111 --section-start=.text=0x0000000012345678 -o %t/extreme2
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme2 | FileCheck %s --check-prefix=EXTREME2
-# EXTREME2:      addi.d $t0, $zero, 273
-# EXTREME2-NEXT: pcalau12i $t1, -419431
+# EXTREME2:      pcalau12i $t1, -419431
+# EXTREME2-NEXT: addi.d $t0, $zero, 273
 # EXTREME2-NEXT: lu32i.d   $t0, 209716
 # EXTREME2-NEXT: lu52i.d   $t0, $t0, 1092
 
@@ -111,8 +111,8 @@
 ## %pc64_hi12 = 0x444 = 1092
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x44433333abcde888 --section-start=.text=0x0000000012345678 -o %t/extreme3
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme3 | FileCheck %s --check-prefix=EXTREME3
-# EXTREME3:      addi.d $t0, $zero, -1912
-# EXTREME3-NEXT: pcalau12i $t1, -419430
+# EXTREME3:      pcalau12i $t1, -419430
+# EXTREME3-NEXT: addi.d $t0, $zero, -1912
 # EXTREME3-NEXT: lu32i.d   $t0, 209715
 # EXTREME3-NEXT: lu52i.d   $t0, $t0, 1092
 
@@ -123,8 +123,8 @@
 ## %pc64_hi12 = 0x444 = 1092
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x444aaaaa34567111 --section-start=.text=0x0000000012345678 -o %t/extreme4
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme4 | FileCheck %s --check-prefix=EXTREME4
-# EXTREME4:      addi.d $t0, $zero, 273
-# EXTREME4-NEXT: pcalau12i $t1, 139810
+# EXTREME4:      pcalau12i $t1, 139810
+# EXTREME4-NEXT: addi.d $t0, $zero, 273
 # EXTREME4-NEXT: lu32i.d   $t0, -349526
 # EXTREME4-NEXT: lu52i.d   $t0, $t0, 1092
 
@@ -135,8 +135,8 @@
 ## %pc64_hi12 = 0x444 = 1092
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x444aaaaa34567888 --section-start=.text=0x0000000012345678 -o %t/extreme5
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme5 | FileCheck %s --check-prefix=EXTREME5
-# EXTREME5:      addi.d $t0, $zero, -1912
-# EXTREME5-NEXT: pcalau12i $t1, 139811
+# EXTREME5:      pcalau12i $t1, 139811
+# EXTREME5-NEXT: addi.d $t0, $zero, -1912
 # EXTREME5-NEXT: lu32i.d   $t0, -349527
 # EXTREME5-NEXT: lu52i.d   $t0, $t0, 1092
 
@@ -147,8 +147,8 @@
 ## %pc64_hi12 = 0x444 = 1092
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x444aaaaaabcde111 --section-start=.text=0x0000000012345678 -o %t/extreme6
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme6 | FileCheck %s --check-prefix=EXTREME6
-# EXTREME6:      addi.d $t0, $zero, 273
-# EXTREME6-NEXT: pcalau12i $t1, -419431
+# EXTREME6:      pcalau12i $t1, -419431
+# EXTREME6-NEXT: addi.d $t0, $zero, 273
 # EXTREME6-NEXT: lu32i.d   $t0, -349525
 # EXTREME6-NEXT: lu52i.d   $t0, $t0, 1092
 
@@ -159,8 +159,8 @@
 ## %pc64_hi12 = 0x444 = 1092
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0x444aaaaaabcde888 --section-start=.text=0x0000000012345678 -o %t/extreme7
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme7 | FileCheck %s --check-prefix=EXTREME7
-# EXTREME7:      addi.d $t0, $zero, -1912
-# EXTREME7-NEXT: pcalau12i $t1, -419430
+# EXTREME7:      pcalau12i $t1, -419430
+# EXTREME7-NEXT: addi.d $t0, $zero, -1912
 # EXTREME7-NEXT: lu32i.d   $t0, -349526
 # EXTREME7-NEXT: lu52i.d   $t0, $t0, 1092
 
@@ -171,8 +171,8 @@
 ## %pc64_hi12 = 0xbbb = -1093
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0xbbb3333334567111 --section-start=.text=0x0000000012345678 -o %t/extreme8
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme8 | FileCheck %s --check-prefix=EXTREME8
-# EXTREME8:      addi.d $t0, $zero, 273
-# EXTREME8-NEXT: pcalau12i $t1, 139810
+# EXTREME8:      pcalau12i $t1, 139810
+# EXTREME8-NEXT: addi.d $t0, $zero, 273
 # EXTREME8-NEXT: lu32i.d   $t0, 209715
 # EXTREME8-NEXT: lu52i.d   $t0, $t0, -1093
 
@@ -183,8 +183,8 @@
 ## %pc64_hi12 = 0xbbb = -1093
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0xbbb3333334567888 --section-start=.text=0x0000000012345678 -o %t/extreme9
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme9 | FileCheck %s --check-prefix=EXTREME9
-# EXTREME9:      addi.d $t0, $zero, -1912
-# EXTREME9-NEXT: pcalau12i $t1, 139811
+# EXTREME9:      pcalau12i $t1, 139811
+# EXTREME9-NEXT: addi.d $t0, $zero, -1912
 # EXTREME9-NEXT: lu32i.d   $t0, 209714
 # EXTREME9-NEXT: lu52i.d   $t0, $t0, -1093
 
@@ -195,8 +195,8 @@
 ## %pc64_hi12 = 0xbbb = -1093
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0xbbb33333abcde111 --section-start=.text=0x0000000012345678 -o %t/extreme10
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme10 | FileCheck %s --check-prefix=EXTREME10
-# EXTREME10:      addi.d $t0, $zero, 273
-# EXTREME10-NEXT: pcalau12i $t1, -419431
+# EXTREME10:      pcalau12i $t1, -419431
+# EXTREME10-NEXT: addi.d $t0, $zero, 273
 # EXTREME10-NEXT: lu32i.d   $t0, 209716
 # EXTREME10-NEXT: lu52i.d   $t0, $t0, -1093
 
@@ -207,8 +207,8 @@
 ## %pc64_hi12 = 0xbbb = -1093
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0xbbb33333abcde888 --section-start=.text=0x0000000012345678 -o %t/extreme11
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme11 | FileCheck %s --check-prefix=EXTREME11
-# EXTREME11:      addi.d $t0, $zero, -1912
-# EXTREME11-NEXT: pcalau12i $t1, -419430
+# EXTREME11:      pcalau12i $t1, -419430
+# EXTREME11-NEXT: addi.d $t0, $zero, -1912
 # EXTREME11-NEXT: lu32i.d   $t0, 209715
 # EXTREME11-NEXT: lu52i.d   $t0, $t0, -1093
 
@@ -219,8 +219,8 @@
 ## %pc64_hi12 = 0xbbb = -1093
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0xbbbaaaaa34567111 --section-start=.text=0x0000000012345678 -o %t/extreme12
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme12 | FileCheck %s --check-prefix=EXTREME12
-# EXTREME12:      addi.d $t0, $zero, 273
-# EXTREME12-NEXT: pcalau12i $t1, 139810
+# EXTREME12:      pcalau12i $t1, 139810
+# EXTREME12-NEXT: addi.d $t0, $zero, 273
 # EXTREME12-NEXT: lu32i.d   $t0, -349526
 # EXTREME12-NEXT: lu52i.d   $t0, $t0, -1093
 
@@ -231,8 +231,8 @@
 ## %pc64_hi12 = 0xbbb = -1093
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0xbbbaaaaa34567888 --section-start=.text=0x0000000012345678 -o %t/extreme13
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme13 | FileCheck %s --check-prefix=EXTREME13
-# EXTREME13:      addi.d $t0, $zero, -1912
-# EXTREME13-NEXT: pcalau12i $t1, 139811
+# EXTREME13:      pcalau12i $t1, 139811
+# EXTREME13-NEXT: addi.d $t0, $zero, -1912
 # EXTREME13-NEXT: lu32i.d   $t0, -349527
 # EXTREME13-NEXT: lu52i.d   $t0, $t0, -1093
 
@@ -243,8 +243,8 @@
 ## %pc64_hi12 = 0xbbb = -1093
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0xbbbaaaaaabcde111 --section-start=.text=0x0000000012345678 -o %t/extreme14
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme14 | FileCheck %s --check-prefix=EXTREME14
-# EXTREME14:      addi.d $t0, $zero, 273
-# EXTREME14-NEXT: pcalau12i $t1, -419431
+# EXTREME14:      pcalau12i $t1, -419431
+# EXTREME14-NEXT: addi.d $t0, $zero, 273
 # EXTREME14-NEXT: lu32i.d   $t0, -349525
 # EXTREME14-NEXT: lu52i.d   $t0, $t0, -1093
 
@@ -255,11 +255,48 @@
 ## %pc64_hi12 = 0xbbb = -1093
 # RUN: ld.lld %t/extreme.o --section-start=.rodata=0xbbbaaaaaabcde888 --section-start=.text=0x0000000012345678 -o %t/extreme15
 # RUN: llvm-objdump -d --no-show-raw-insn %t/extreme15 | FileCheck %s --check-prefix=EXTREME15
-# EXTREME15:      addi.d $t0, $zero, -1912
-# EXTREME15-NEXT: pcalau12i $t1, -419430
+# EXTREME15:      pcalau12i $t1, -419430
+# EXTREME15-NEXT: addi.d $t0, $zero, -1912
 # EXTREME15-NEXT: lu32i.d   $t0, -349526
 # EXTREME15-NEXT: lu52i.d   $t0, $t0, -1093
 
+## page delta = 0xffffffff00000000, page offset = 0x888
+## %pc_lo12   = 0x888 = -1912
+## %pc_hi20   = 0x00000 = 0
+## %pc64_lo20 = 0xfffff = -1
+## %pc64_hi12 = 0xfff = -1
+# RUN: ld.lld %t/extreme.o --section-start=.rodata=0x0000000012344888 --section-start=.text=0x0000000012345678 -o %t/extreme16
+# RUN: llvm-objdump -d --no-show-raw-insn %t/extreme16 | FileCheck %s --check-prefix=EXTREME16
+# EXTREME16:      pcalau12i $t1, 0
+# EXTREME16-NEXT: addi.d $t0, $zero, -1912
+# EXTREME16-NEXT: lu32i.d   $t0, -1
+# EXTREME16-NEXT: lu52i.d   $t0, $t0, -1
+
+## page delta = 0x0000000080000000, page offset = 0x888
+## %pc_lo12   = 0x888 = -1912
+## %pc_hi20   = 0x80000 = -524288
+## %pc64_lo20 = 0xfffff = 0
+## %pc64_hi12 = 0xfff = 0
+# RUN: ld.lld %t/extreme.o --section-start=.rodata=0x000071238ffff888 --section-start=.text=0x0000712310000678 -o %t/extreme17
+# RUN: llvm-objdump -d --no-show-raw-insn %t/extreme17 | FileCheck %s --check-prefix=EXTREME17
+# EXTREME17:      pcalau12i $t1, -524288
+# EXTREME17-NEXT: addi.d $t0, $zero, -1912
+# EXTREME17-NEXT: lu32i.d   $t0, 0
+# EXTREME17-NEXT: lu52i.d   $t0, $t0, 0
+
+## A case that pcalau12i, lu32i.d and lu52i.d are in different pages.
+## page delta = 0x0000000080000000, page offset = 0x123
+## %pc_lo12   = 0x111 = 273
+## %pc_hi20   = 0x80000 = -524288
+## %pc64_lo20 = 0x00001 = 1
+## %pc64_hi12 = 0x000 = 0
+# RUN: ld.lld %t/extreme.o --section-start=.rodata=0x80000111 --section-start=.text=0xff8 -o %t/extreme18
+# RUN: llvm-objdump -d --no-show-raw-insn %t/extreme18 | FileCheck %s --check-prefix=EXTREME18
+# EXTREME18:      pcalau12i $t1, -524288
+# EXTREME18-NEXT: addi.d $t0, $zero, 273
+# EXTREME18-NEXT: lu32i.d   $t0, 1
+# EXTREME18-NEXT: lu52i.d   $t0, $t0, 0
+
 #--- a.s
 .rodata
 x:
@@ -277,7 +314,7 @@ x:
 .text
 .global _start
 _start:
-    addi.d    $t0, $zero, %pc_lo12(x)
     pcalau12i $t1, %pc_hi20(x)
+    addi.d    $t0, $zero, %pc_lo12(x)
     lu32i.d   $t0, %pc64_lo20(x)
     lu52i.d   $t0, $t0, %pc64_hi12(x)
diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def
index 02bce3c71712743cb2951b0df9a6c304dcd54c32..c4393432677b8e3e818460a4e2f42ec4abc55b4d 100644
--- a/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def
+++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def
@@ -118,3 +118,9 @@ ELF_RELOC(R_LARCH_SUB6,        106)
 ELF_RELOC(R_LARCH_ADD_ULEB128, 107)
 ELF_RELOC(R_LARCH_SUB_ULEB128, 108)
 ELF_RELOC(R_LARCH_64_PCREL,    109)
+
+// Relocs added in ELF for the LoongArch™ Architecture v20231102, part of the
+// v2.20 LoongArch ABI specs.
+//
+// Spec addition: https://github.com/loongson/la-abi-specs/pull/4
+ELF_RELOC(R_LARCH_CALL36, 110)
diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td
index 685deaec7709bd7896a65fedcd121b6a3d410a45..9002076e7aecea6260eb9c5161cbfb0cb2870563 100644
--- a/llvm/include/llvm/IR/IntrinsicsLoongArch.td
+++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td
@@ -122,6 +122,15 @@ def int_loongarch_lddir_d : BaseInt<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
                                     [ImmArg<ArgIndex<1>>]>;
 def int_loongarch_ldpte_d : BaseInt<[], [llvm_i64_ty, llvm_i64_ty],
                                     [ImmArg<ArgIndex<1>>]>;
+
+def int_loongarch_frecipe_s : BaseInt<[llvm_float_ty], [llvm_float_ty],
+                                      [IntrNoMem]>;
+def int_loongarch_frecipe_d : BaseInt<[llvm_double_ty], [llvm_double_ty],
+                                      [IntrNoMem]>;
+def int_loongarch_frsqrte_s : BaseInt<[llvm_float_ty], [llvm_float_ty],
+                                      [IntrNoMem]>;
+def int_loongarch_frsqrte_d : BaseInt<[llvm_double_ty], [llvm_double_ty],
+                                      [IntrNoMem]>;
 } // TargetPrefix = "loongarch"
 
 /// Vector intrinsic
@@ -527,10 +536,12 @@ foreach inst = ["vfmadd_d", "vfmsub_d", "vfnmadd_d", "vfnmsub_d"] in
              [IntrNoMem]>;
 
 foreach inst = ["vflogb_s", "vfsqrt_s", "vfrecip_s", "vfrsqrt_s", "vfrint_s",
+                "vfrecipe_s", "vfrsqrte_s",
                 "vfrintrne_s", "vfrintrz_s", "vfrintrp_s", "vfrintrm_s"] in
   def int_loongarch_lsx_#inst : VecInt<[llvm_v4f32_ty], [llvm_v4f32_ty],
                                        [IntrNoMem]>;
 foreach inst = ["vflogb_d", "vfsqrt_d", "vfrecip_d", "vfrsqrt_d", "vfrint_d",
+                "vfrecipe_d", "vfrsqrte_d",
                 "vfrintrne_d", "vfrintrz_d", "vfrintrp_d", "vfrintrm_d"] in
   def int_loongarch_lsx_#inst : VecInt<[llvm_v2f64_ty], [llvm_v2f64_ty],
                                        [IntrNoMem]>;
@@ -1044,10 +1055,12 @@ foreach inst = ["xvfmadd_d", "xvfmsub_d", "xvfnmadd_d", "xvfnmsub_d"] in
              [IntrNoMem]>;
 
 foreach inst = ["xvflogb_s", "xvfsqrt_s", "xvfrecip_s", "xvfrsqrt_s", "xvfrint_s",
+                "xvfrecipe_s", "xvfrsqrte_s",
                 "xvfrintrne_s", "xvfrintrz_s", "xvfrintrp_s", "xvfrintrm_s"] in
   def int_loongarch_lasx_#inst : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty],
                                         [IntrNoMem]>;
 foreach inst = ["xvflogb_d", "xvfsqrt_d", "xvfrecip_d", "xvfrsqrt_d", "xvfrint_d",
+                "xvfrecipe_d", "xvfrsqrte_d",
                 "xvfrintrne_d", "xvfrintrz_d", "xvfrintrp_d", "xvfrintrm_d"] in
   def int_loongarch_lasx_#inst : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty],
                                         [IntrNoMem]>;
diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.def b/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
index b20d124953f882e49eeb3082f629160028a29397..101a48cbd5399474b1c231cc5f20cf5187ea1083 100644
--- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
+++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
@@ -10,6 +10,7 @@ LOONGARCH_FEATURE("+lasx", FK_LASX)
 LOONGARCH_FEATURE("+lbt", FK_LBT)
 LOONGARCH_FEATURE("+lvz", FK_LVZ)
 LOONGARCH_FEATURE("+ual", FK_UAL)
+LOONGARCH_FEATURE("+frecipe", FK_FRECIPE)
 
 #undef LOONGARCH_FEATURE
 
@@ -19,5 +20,6 @@ LOONGARCH_FEATURE("+ual", FK_UAL)
 
 LOONGARCH_ARCH("loongarch64", AK_LOONGARCH64, FK_64BIT | FK_FP32 | FK_FP64 | FK_UAL)
 LOONGARCH_ARCH("la464", AK_LA464, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL)
+LOONGARCH_ARCH("la664", AK_LA664, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL | FK_FRECIPE)
 
 #undef LOONGARCH_ARCH
diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
index 028844187584b236644a3283fa05a5b2602e828d..c0bb15a5163b128b259d46b96ce569a7b48d8aa2 100644
--- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
+++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
@@ -46,6 +46,9 @@ enum FeatureKind : uint32_t {
 
   // Allow memory accesses to be unaligned.
   FK_UAL = 1 << 8,
+
+  // Floating-point approximate reciprocal instructions are available.
+  FK_FRECIPE = 1 << 9,
 };
 
 struct FeatureInfo {
diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index a132e645c8644f6a4cec2b8caff5fa4e801134df..f908e5bc63d3107ed775e8a30b69bcb08594b841 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -122,6 +122,10 @@ class LoongArchAsmParser : public MCTargetAsmParser {
   // Helper to emit pseudo instruction "li.w/d $rd, $imm".
   void emitLoadImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out);
 
+  // Helper to emit pseudo instruction "call36 sym" or "tail36 $rj, sym".
+  void emitFuncCall36(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                      bool IsTailCall);
+
 public:
   enum LoongArchMatchResultTy {
     Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY,
@@ -401,6 +405,22 @@ public:
                      IsValidKind;
   }
 
+  bool isSImm20pcaddu18i() const {
+    if (!isImm())
+      return false;
+
+    int64_t Imm;
+    LoongArchMCExpr::VariantKind VK = LoongArchMCExpr::VK_LoongArch_None;
+    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+    bool IsValidKind = VK == LoongArchMCExpr::VK_LoongArch_None ||
+                       VK == LoongArchMCExpr::VK_LoongArch_CALL36;
+
+    return IsConstantImm
+               ? isInt<20>(Imm) && IsValidKind
+               : LoongArchAsmParser::classifySymbolRef(getImm(), VK) &&
+                     IsValidKind;
+  }
+
   bool isSImm21lsl2() const {
     if (!isImm())
       return false;
@@ -1111,6 +1131,35 @@ void LoongArchAsmParser::emitLoadImm(MCInst &Inst, SMLoc IDLoc,
   }
 }
 
+void LoongArchAsmParser::emitFuncCall36(MCInst &Inst, SMLoc IDLoc,
+                                        MCStreamer &Out, bool IsTailCall) {
+  // call36 sym
+  // expands to:
+  //   pcaddu18i $ra, %call36(sym)
+  //   jirl      $ra, $ra, 0
+  //
+  // tail36 $rj, sym
+  // expands to:
+  //   pcaddu18i $rj, %call36(sym)
+  //   jirl      $r0, $rj, 0
+  unsigned ScratchReg =
+      IsTailCall ? Inst.getOperand(0).getReg() : (unsigned)LoongArch::R1;
+  const MCExpr *Sym =
+      IsTailCall ? Inst.getOperand(1).getExpr() : Inst.getOperand(0).getExpr();
+  const LoongArchMCExpr *LE = LoongArchMCExpr::create(
+      Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, getContext());
+
+  Out.emitInstruction(
+      MCInstBuilder(LoongArch::PCADDU18I).addReg(ScratchReg).addExpr(LE),
+      getSTI());
+  Out.emitInstruction(
+      MCInstBuilder(LoongArch::JIRL)
+          .addReg(IsTailCall ? (unsigned)LoongArch::R0 : ScratchReg)
+          .addReg(ScratchReg)
+          .addImm(0),
+      getSTI());
+}
+
 bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
                                             OperandVector &Operands,
                                             MCStreamer &Out) {
@@ -1159,6 +1208,12 @@ bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
   case LoongArch::PseudoLI_D:
     emitLoadImm(Inst, IDLoc, Out);
     return false;
+  case LoongArch::PseudoCALL36:
+    emitFuncCall36(Inst, IDLoc, Out, /*IsTailCall=*/false);
+    return false;
+  case LoongArch::PseudoTAIL36:
+    emitFuncCall36(Inst, IDLoc, Out, /*IsTailCall=*/true);
+    return false;
   }
   Out.emitInstruction(Inst, getSTI());
   return false;
@@ -1440,6 +1495,12 @@ bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
         /*Upper=*/(1 << 19) - 1,
         "operand must be a symbol with modifier (e.g. %pc_hi20) or an integer "
         "in the range");
+  case Match_InvalidSImm20pcaddu18i:
+    return generateImmOutOfRangeError(
+        Operands, ErrorInfo, /*Lower=*/-(1 << 19),
+        /*Upper=*/(1 << 19) - 1,
+        "operand must be a symbol with modifier (e.g. %call36) or an integer "
+        "in the range");
   case Match_InvalidSImm21lsl2:
     return generateImmOutOfRangeError(
         Operands, ErrorInfo, /*Lower=*/-(1 << 22), /*Upper=*/(1 << 22) - 4,
diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td
index 2a4c991a43b09cfb52844f58e6dcb760e2b2abae..5f85cace71af7f4bd1a1dafb146d7148642f67ee 100644
--- a/llvm/lib/Target/LoongArch/LoongArch.td
+++ b/llvm/lib/Target/LoongArch/LoongArch.td
@@ -105,10 +105,13 @@ def FeatureUAL
 def FeatureRelax
     : SubtargetFeature<"relax", "HasLinkerRelax", "true",
                        "Enable Linker relaxation">;
-// Experimental auto vectorization
-def FeatureAutoVec
-    : SubtargetFeature<"auto-vec", "HasExpAutoVec", "true",
-                       "Experimental auto vectorization">;
+
+// Floating point approximation operation
+def FeatureFrecipe
+    : SubtargetFeature<"frecipe", "HasFrecipe", "true",
+                       "Support frecipe.{s/d} and frsqrte.{s/d} instructions.">;
+def HasFrecipe : Predicate<"Subtarget->hasFrecipe()">;
+
 
 //===----------------------------------------------------------------------===//
 // Registers, instruction descriptions ...
@@ -140,6 +143,13 @@ def : ProcessorModel<"la464", NoSchedModel, [Feature64Bit,
                                              FeatureExtLVZ,
                                              FeatureExtLBT]>;
 
+def : ProcessorModel<"la664", NoSchedModel, [Feature64Bit,
+                                             FeatureUAL,
+                                             FeatureExtLASX,
+                                             FeatureExtLVZ,
+                                             FeatureExtLBT,
+                                             FeatureFrecipe]>;
+
 //===----------------------------------------------------------------------===//
 // Define the LoongArch target.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
index 72c1f1cec198347f4cc0ec710fbb380cf042bf40..ad39658f698e7b8ede8ed3297eec204c8c9f9b09 100644
--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
@@ -62,43 +62,24 @@ private:
                                MachineBasicBlock::iterator &NextMBBI,
                                unsigned FlagsHi, unsigned SecondOpcode,
                                unsigned FlagsLo);
-  bool expandLargeAddressLoad(MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI,
-                              unsigned LastOpcode, unsigned IdentifyingMO);
-  bool expandLargeAddressLoad(MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI,
-                              unsigned LastOpcode, unsigned IdentifyingMO,
-                              const MachineOperand &Symbol, Register DestReg,
-                              bool EraseFromParent);
   bool expandLoadAddressPcrel(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI,
-                              bool Large = false);
+                              MachineBasicBlock::iterator &NextMBBI);
   bool expandLoadAddressGot(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MBBI,
-                            MachineBasicBlock::iterator &NextMBBI,
-                            bool Large = false);
+                            MachineBasicBlock::iterator &NextMBBI);
   bool expandLoadAddressTLSLE(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
                               MachineBasicBlock::iterator &NextMBBI);
   bool expandLoadAddressTLSIE(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI,
-                              bool Large = false);
+                              MachineBasicBlock::iterator &NextMBBI);
   bool expandLoadAddressTLSLD(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI,
-                              bool Large = false);
+                              MachineBasicBlock::iterator &NextMBBI);
   bool expandLoadAddressTLSGD(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              MachineBasicBlock::iterator &NextMBBI,
-                              bool Large = false);
-  bool expandFunctionCALL(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MBBI,
-                          MachineBasicBlock::iterator &NextMBBI,
-                          bool IsTailCall);
+                              MachineBasicBlock::iterator &NextMBBI);
 };
 
 char LoongArchPreRAExpandPseudo::ID = 0;
@@ -131,30 +112,16 @@ bool LoongArchPreRAExpandPseudo::expandMI(
   switch (MBBI->getOpcode()) {
   case LoongArch::PseudoLA_PCREL:
     return expandLoadAddressPcrel(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoLA_PCREL_LARGE:
-    return expandLoadAddressPcrel(MBB, MBBI, NextMBBI, /*Large=*/true);
   case LoongArch::PseudoLA_GOT:
     return expandLoadAddressGot(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoLA_GOT_LARGE:
-    return expandLoadAddressGot(MBB, MBBI, NextMBBI, /*Large=*/true);
   case LoongArch::PseudoLA_TLS_LE:
     return expandLoadAddressTLSLE(MBB, MBBI, NextMBBI);
   case LoongArch::PseudoLA_TLS_IE:
     return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoLA_TLS_IE_LARGE:
-    return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI, /*Large=*/true);
   case LoongArch::PseudoLA_TLS_LD:
     return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoLA_TLS_LD_LARGE:
-    return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI, /*Large=*/true);
   case LoongArch::PseudoLA_TLS_GD:
     return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI);
-  case LoongArch::PseudoLA_TLS_GD_LARGE:
-    return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI, /*Large=*/true);
-  case LoongArch::PseudoCALL:
-    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false);
-  case LoongArch::PseudoTAIL:
-    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true);
   }
   return false;
 }
@@ -187,118 +154,9 @@ bool LoongArchPreRAExpandPseudo::expandPcalau12iInstPair(
   return true;
 }
 
-bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
-    unsigned IdentifyingMO) {
-  MachineInstr &MI = *MBBI;
-  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO,
-                                MI.getOperand(2), MI.getOperand(0).getReg(),
-                                true);
-}
-
-bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
-    unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg,
-    bool EraseFromParent) {
-  // Code Sequence:
-  //
-  // Part1: pcalau12i  $scratch, %MO1(sym)
-  // Part0: addi.d     $dest, $zero, %MO0(sym)
-  // Part2: lu32i.d    $dest, %MO2(sym)
-  // Part3: lu52i.d    $dest, $dest, %MO3(sym)
-  // Fin:   LastOpcode $dest, $dest, $scratch
-
-  unsigned MO0, MO1, MO2, MO3;
-  switch (IdentifyingMO) {
-  default:
-    llvm_unreachable("unsupported identifying MO");
-  case LoongArchII::MO_PCREL_LO:
-    MO0 = IdentifyingMO;
-    MO1 = LoongArchII::MO_PCREL_HI;
-    MO2 = LoongArchII::MO_PCREL64_LO;
-    MO3 = LoongArchII::MO_PCREL64_HI;
-    break;
-  case LoongArchII::MO_GOT_PC_HI:
-  case LoongArchII::MO_LD_PC_HI:
-  case LoongArchII::MO_GD_PC_HI:
-    // These cases relocate just like the GOT case, except for Part1.
-    MO0 = LoongArchII::MO_GOT_PC_LO;
-    MO1 = IdentifyingMO;
-    MO2 = LoongArchII::MO_GOT_PC64_LO;
-    MO3 = LoongArchII::MO_GOT_PC64_HI;
-    break;
-  case LoongArchII::MO_IE_PC_LO:
-    MO0 = IdentifyingMO;
-    MO1 = LoongArchII::MO_IE_PC_HI;
-    MO2 = LoongArchII::MO_IE_PC64_LO;
-    MO3 = LoongArchII::MO_IE_PC64_HI;
-    break;
-  }
-
-  MachineFunction *MF = MBB.getParent();
-  MachineInstr &MI = *MBBI;
-  DebugLoc DL = MI.getDebugLoc();
-
-  assert(MF->getSubtarget<LoongArchSubtarget>().is64Bit() &&
-         "Large code model requires LA64");
-
-  Register TmpPart1 =
-      MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
-  Register TmpPart0 =
-      DestReg.isVirtual()
-          ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
-          : DestReg;
-  Register TmpParts02 =
-      DestReg.isVirtual()
-          ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
-          : DestReg;
-  Register TmpParts023 =
-      DestReg.isVirtual()
-          ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
-          : DestReg;
-
-  auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), TmpPart1);
-  auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), TmpPart0)
-                   .addReg(LoongArch::R0);
-  auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), TmpParts02)
-                   // "rj" is needed due to InstrInfo pattern requirement.
-                   .addReg(TmpPart0, RegState::Kill);
-  auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), TmpParts023)
-                   .addReg(TmpParts02, RegState::Kill);
-  BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg)
-      .addReg(TmpParts023)
-      .addReg(TmpPart1, RegState::Kill);
-
-  if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) {
-    const char *SymName = Symbol.getSymbolName();
-    Part0.addExternalSymbol(SymName, MO0);
-    Part1.addExternalSymbol(SymName, MO1);
-    Part2.addExternalSymbol(SymName, MO2);
-    Part3.addExternalSymbol(SymName, MO3);
-  } else {
-    Part0.addDisp(Symbol, 0, MO0);
-    Part1.addDisp(Symbol, 0, MO1);
-    Part2.addDisp(Symbol, 0, MO2);
-    Part3.addDisp(Symbol, 0, MO3);
-  }
-
-  if (EraseFromParent)
-    MI.eraseFromParent();
-
-  return true;
-}
-
 bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, bool Large) {
-  if (Large)
-    // Emit the 5-insn large address load sequence with the `%pc` family of
-    // relocs.
-    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
-                                  LoongArchII::MO_PCREL_LO);
-
+    MachineBasicBlock::iterator &NextMBBI) {
   // Code Sequence:
   // pcalau12i $rd, %pc_hi20(sym)
   // addi.w/d $rd, $rd, %pc_lo12(sym)
@@ -311,13 +169,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel(
 
 bool LoongArchPreRAExpandPseudo::expandLoadAddressGot(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, bool Large) {
-  if (Large)
-    // Emit the 5-insn large address load sequence with the `%got_pc` family
-    // of relocs, loading the result from GOT with `ldx.d` in the end.
-    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
-                                  LoongArchII::MO_GOT_PC_HI);
-
+    MachineBasicBlock::iterator &NextMBBI) {
   // Code Sequence:
   // pcalau12i $rd, %got_pc_hi20(sym)
   // ld.w/d $rd, $rd, %got_pc_lo12(sym)
@@ -378,13 +230,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLE(
 
 bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, bool Large) {
-  if (Large)
-    // Emit the 5-insn large address load sequence with the `%ie_pc` family
-    // of relocs, loading the result with `ldx.d` in the end.
-    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
-                                  LoongArchII::MO_IE_PC_LO);
-
+    MachineBasicBlock::iterator &NextMBBI) {
   // Code Sequence:
   // pcalau12i $rd, %ie_pc_hi20(sym)
   // ld.w/d $rd, $rd, %ie_pc_lo12(sym)
@@ -397,13 +243,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE(
 
 bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, bool Large) {
-  if (Large)
-    // Emit the 5-insn large address load sequence with the `%got_pc` family
-    // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`.
-    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
-                                  LoongArchII::MO_LD_PC_HI);
-
+    MachineBasicBlock::iterator &NextMBBI) {
   // Code Sequence:
   // pcalau12i $rd, %ld_pc_hi20(sym)
   // addi.w/d $rd, $rd, %got_pc_lo12(sym)
@@ -416,13 +256,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD(
 
 bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, bool Large) {
-  if (Large)
-    // Emit the 5-insn large address load sequence with the `%got_pc` family
-    // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`.
-    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
-                                  LoongArchII::MO_GD_PC_HI);
-
+    MachineBasicBlock::iterator &NextMBBI) {
   // Code Sequence:
   // pcalau12i $rd, %gd_pc_hi20(sym)
   // addi.w/d $rd, $rd, %got_pc_lo12(sym)
@@ -433,88 +267,6 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD(
                                  SecondOpcode, LoongArchII::MO_GOT_PC_LO);
 }
 
-bool LoongArchPreRAExpandPseudo::expandFunctionCALL(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) {
-  MachineFunction *MF = MBB.getParent();
-  MachineInstr &MI = *MBBI;
-  DebugLoc DL = MI.getDebugLoc();
-  const MachineOperand &Func = MI.getOperand(0);
-  MachineInstrBuilder CALL;
-  unsigned Opcode;
-
-  switch (MF->getTarget().getCodeModel()) {
-  default:
-    report_fatal_error("Unsupported code model");
-    break;
-  case CodeModel::Small: {
-    // CALL:
-    // bl func
-    // TAIL:
-    // b func
-    Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL;
-    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func);
-    break;
-  }
-  case CodeModel::Medium: {
-    // CALL:
-    // pcalau12i  $ra, %pc_hi20(func)
-    // jirl       $ra, $ra, %pc_lo12(func)
-    // TAIL:
-    // pcalau12i  $scratch, %pc_hi20(func)
-    // jirl       $r0, $scratch, %pc_lo12(func)
-    Opcode =
-        IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
-    Register ScratchReg =
-        IsTailCall
-            ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
-            : LoongArch::R1;
-    MachineInstrBuilder MIB =
-        BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), ScratchReg);
-    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg);
-    if (Func.isSymbol()) {
-      const char *FnName = Func.getSymbolName();
-      MIB.addExternalSymbol(FnName, LoongArchII::MO_PCREL_HI);
-      CALL.addExternalSymbol(FnName, LoongArchII::MO_PCREL_LO);
-      break;
-    }
-    assert(Func.isGlobal() && "Expected a GlobalValue at this time");
-    const GlobalValue *GV = Func.getGlobal();
-    MIB.addGlobalAddress(GV, 0, LoongArchII::MO_PCREL_HI);
-    CALL.addGlobalAddress(GV, 0, LoongArchII::MO_PCREL_LO);
-    break;
-  }
-  case CodeModel::Large: {
-    // Emit the 5-insn large address load sequence, either directly or
-    // indirectly in case of going through the GOT, then JIRL_TAIL or
-    // JIRL_CALL to $addr.
-    Opcode =
-        IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
-    Register AddrReg =
-        IsTailCall
-            ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass)
-            : LoongArch::R1;
-
-    bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal();
-    unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO;
-    unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D;
-    expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg,
-                           false);
-    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0);
-    break;
-  }
-  }
-
-  // Transfer implicit operands.
-  CALL.copyImplicitOps(MI);
-
-  // Transfer MI flags.
-  CALL.setMIFlags(MI.getFlags());
-
-  MI.eraseFromParent();
-  return true;
-}
-
 class LoongArchExpandPseudo : public MachineFunctionPass {
 public:
   const LoongArchInstrInfo *TII;
@@ -536,6 +288,35 @@ private:
                 MachineBasicBlock::iterator &NextMBBI);
   bool expandCopyCFR(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                      MachineBasicBlock::iterator &NextMBBI);
+  bool expandLargeAddressLoad(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MBBI,
+                              MachineBasicBlock::iterator &NextMBBI,
+                              unsigned LastOpcode, unsigned IdentifyingMO);
+  bool expandLargeAddressLoad(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MBBI,
+                              MachineBasicBlock::iterator &NextMBBI,
+                              unsigned LastOpcode, unsigned IdentifyingMO,
+                              const MachineOperand &Symbol, Register DestReg,
+                              bool EraseFromParent);
+  bool expandLoadAddressPcrelLarge(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   MachineBasicBlock::iterator &NextMBBI);
+  bool expandLoadAddressGotLarge(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 MachineBasicBlock::iterator &NextMBBI);
+  bool expandLoadAddressTLSIELarge(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   MachineBasicBlock::iterator &NextMBBI);
+  bool expandLoadAddressTLSLDLarge(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   MachineBasicBlock::iterator &NextMBBI);
+  bool expandLoadAddressTLSGDLarge(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   MachineBasicBlock::iterator &NextMBBI);
+  bool expandFunctionCALL(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI,
+                          MachineBasicBlock::iterator &NextMBBI,
+                          bool IsTailCall);
 };
 
 char LoongArchExpandPseudo::ID = 0;
@@ -570,6 +351,24 @@ bool LoongArchExpandPseudo::expandMI(MachineBasicBlock &MBB,
   switch (MBBI->getOpcode()) {
   case LoongArch::PseudoCopyCFR:
     return expandCopyCFR(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_PCREL_LARGE:
+    return expandLoadAddressPcrelLarge(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_GOT_LARGE:
+    return expandLoadAddressGotLarge(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_TLS_IE_LARGE:
+    return expandLoadAddressTLSIELarge(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_TLS_LD_LARGE:
+    return expandLoadAddressTLSLDLarge(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoLA_TLS_GD_LARGE:
+    return expandLoadAddressTLSGDLarge(MBB, MBBI, NextMBBI);
+  case LoongArch::PseudoCALL:
+  case LoongArch::PseudoCALL_MEDIUM:
+  case LoongArch::PseudoCALL_LARGE:
+    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false);
+  case LoongArch::PseudoTAIL:
+  case LoongArch::PseudoTAIL_MEDIUM:
+  case LoongArch::PseudoTAIL_LARGE:
+    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true);
   }
 
   return false;
@@ -628,6 +427,212 @@ bool LoongArchExpandPseudo::expandCopyCFR(
   return true;
 }
 
+bool LoongArchExpandPseudo::expandLargeAddressLoad(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
+    unsigned IdentifyingMO) {
+  MachineInstr &MI = *MBBI;
+  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO,
+                                MI.getOperand(2), MI.getOperand(0).getReg(),
+                                true);
+}
+
+bool LoongArchExpandPseudo::expandLargeAddressLoad(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode,
+    unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg,
+    bool EraseFromParent) {
+  // Code Sequence:
+  //
+  // Part1: pcalau12i  $dst, %MO1(sym)
+  // Part0: addi.d     $t8, $zero, %MO0(sym)
+  // Part2: lu32i.d    $t8, %MO2(sym)
+  // Part3: lu52i.d    $t8, $t8, %MO3(sym)
+  // Fin:   LastOpcode $dst, $t8, $dst
+
+  unsigned MO0, MO1, MO2, MO3;
+  switch (IdentifyingMO) {
+  default:
+    llvm_unreachable("unsupported identifying MO");
+  case LoongArchII::MO_PCREL_LO:
+    MO0 = IdentifyingMO;
+    MO1 = LoongArchII::MO_PCREL_HI;
+    MO2 = LoongArchII::MO_PCREL64_LO;
+    MO3 = LoongArchII::MO_PCREL64_HI;
+    break;
+  case LoongArchII::MO_GOT_PC_HI:
+  case LoongArchII::MO_LD_PC_HI:
+  case LoongArchII::MO_GD_PC_HI:
+    // These cases relocate just like the GOT case, except for Part1.
+    MO0 = LoongArchII::MO_GOT_PC_LO;
+    MO1 = IdentifyingMO;
+    MO2 = LoongArchII::MO_GOT_PC64_LO;
+    MO3 = LoongArchII::MO_GOT_PC64_HI;
+    break;
+  case LoongArchII::MO_IE_PC_LO:
+    MO0 = IdentifyingMO;
+    MO1 = LoongArchII::MO_IE_PC_HI;
+    MO2 = LoongArchII::MO_IE_PC64_LO;
+    MO3 = LoongArchII::MO_IE_PC64_HI;
+    break;
+  }
+
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL = MI.getDebugLoc();
+  Register ScratchReg = LoongArch::R20; // $t8
+
+  assert(MBB.getParent()->getSubtarget<LoongArchSubtarget>().is64Bit() &&
+         "Large code model requires LA64");
+
+  auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), DestReg);
+  auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), ScratchReg)
+                   .addReg(LoongArch::R0);
+  auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), ScratchReg)
+                   // "rj" is needed due to InstrInfo pattern requirement.
+                   .addReg(ScratchReg);
+  auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), ScratchReg)
+                   .addReg(ScratchReg);
+  BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg)
+      .addReg(ScratchReg)
+      .addReg(DestReg);
+
+  if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) {
+    const char *SymName = Symbol.getSymbolName();
+    Part0.addExternalSymbol(SymName, MO0);
+    Part1.addExternalSymbol(SymName, MO1);
+    Part2.addExternalSymbol(SymName, MO2);
+    Part3.addExternalSymbol(SymName, MO3);
+  } else {
+    Part0.addDisp(Symbol, 0, MO0);
+    Part1.addDisp(Symbol, 0, MO1);
+    Part2.addDisp(Symbol, 0, MO2);
+    Part3.addDisp(Symbol, 0, MO3);
+  }
+
+  if (EraseFromParent)
+    MI.eraseFromParent();
+
+  return true;
+}
+
+bool LoongArchExpandPseudo::expandLoadAddressPcrelLarge(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI) {
+  // Emit the 5-insn large address load sequence with the `%pc` family of
+  // relocs.
+  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
+                                LoongArchII::MO_PCREL_LO);
+}
+
+bool LoongArchExpandPseudo::expandLoadAddressGotLarge(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI) {
+  // Emit the 5-insn large address load sequence with the `%got_pc` family
+  // of relocs, loading the result from GOT with `ldx.d` in the end.
+  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
+                                LoongArchII::MO_GOT_PC_HI);
+}
+
+bool LoongArchExpandPseudo::expandLoadAddressTLSIELarge(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI) {
+  // Emit the 5-insn large address load sequence with the `%ie_pc` family
+  // of relocs, loading the result with `ldx.d` in the end.
+  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D,
+                                LoongArchII::MO_IE_PC_LO);
+}
+
+bool LoongArchExpandPseudo::expandLoadAddressTLSLDLarge(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI) {
+  // Emit the 5-insn large address load sequence with the `%got_pc` family
+  // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`.
+  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
+                                LoongArchII::MO_LD_PC_HI);
+}
+
+bool LoongArchExpandPseudo::expandLoadAddressTLSGDLarge(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI) {
+  // Emit the 5-insn large address load sequence with the `%got_pc` family
+  // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`.
+  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D,
+                                LoongArchII::MO_GD_PC_HI);
+}
+
+bool LoongArchExpandPseudo::expandFunctionCALL(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) {
+  MachineFunction *MF = MBB.getParent();
+  MachineInstr &MI = *MBBI;
+  DebugLoc DL = MI.getDebugLoc();
+  const MachineOperand &Func = MI.getOperand(0);
+  MachineInstrBuilder CALL;
+  unsigned Opcode;
+
+  switch (MF->getTarget().getCodeModel()) {
+  default:
+    report_fatal_error("Unsupported code model");
+    break;
+  case CodeModel::Small: {
+    // CALL:
+    // bl func
+    // TAIL:
+    // b func
+    Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL;
+    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func);
+    break;
+  }
+  case CodeModel::Medium: {
+    // CALL:
+    // pcaddu18i  $ra, %call36(func)
+    // jirl       $ra, $ra, 0
+    // TAIL:
+    // pcaddu18i  $t8, %call36(func)
+    // jr         $t8
+    Opcode =
+        IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
+    Register ScratchReg = IsTailCall ? LoongArch::R20 : LoongArch::R1;
+    MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg);
+
+    CALL =
+        BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0);
+
+    if (Func.isSymbol())
+      MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36);
+    else
+      MIB.addDisp(Func, 0, LoongArchII::MO_CALL36);
+    break;
+  }
+  case CodeModel::Large: {
+    // Emit the 5-insn large address load sequence, either directly or
+    // indirectly in case of going through the GOT, then JIRL_TAIL or
+    // JIRL_CALL to $addr.
+    Opcode =
+        IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL;
+    Register AddrReg = IsTailCall ? LoongArch::R19 : LoongArch::R1;
+
+    bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal();
+    unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO;
+    unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D;
+    expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg,
+                           false);
+    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0);
+    break;
+  }
+  }
+
+  // Transfer implicit operands.
+  CALL.copyImplicitOps(MI);
+
+  // Transfer MI flags.
+  CALL.setMIFlags(MI.getFlags());
+
+  MI.eraseFromParent();
+  return true;
+}
+
 } // end namespace
 
 INITIALIZE_PASS(LoongArchPreRAExpandPseudo, "loongarch-prera-expand-pseudo",
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
index 65120c083f498dc85ada7fef08e66a199200838a..e27896768818cb6d7a6eedc978db92c46c5e4bf6 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
@@ -50,6 +50,8 @@ def FNEG_S   : FP_ALU_2R<0x01141400>;
 def FSQRT_S  : FP_ALU_2R<0x01144400>;
 def FRECIP_S : FP_ALU_2R<0x01145400>;
 def FRSQRT_S : FP_ALU_2R<0x01146400>;
+def FRECIPE_S : FP_ALU_2R<0x01147400>;
+def FRSQRTE_S : FP_ALU_2R<0x01148400>;
 def FSCALEB_S : FP_ALU_3R<0x01108000>;
 def FLOGB_S   : FP_ALU_2R<0x01142400>;
 def FCOPYSIGN_S : FP_ALU_3R<0x01128000>;
@@ -279,6 +281,12 @@ def : Pat<(loongarch_ftint FPR32:$src), (FTINTRZ_W_S FPR32:$src)>;
 // FP reciprocal operation
 def : Pat<(fdiv fpimm1, FPR32:$src), (FRECIP_S $src)>;
 
+let Predicates = [HasFrecipe] in {
+// FP approximate reciprocal operation
+def : Pat<(int_loongarch_frecipe_s FPR32:$src), (FRECIPE_S FPR32:$src)>;
+def : Pat<(int_loongarch_frsqrte_s FPR32:$src), (FRSQRTE_S FPR32:$src)>;
+}
+
 // fmadd.s: fj * fk + fa
 def : Pat<(fma FPR32:$fj, FPR32:$fk, FPR32:$fa), (FMADD_S $fj, $fk, $fa)>;
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
index 437c1e4d7be272525a47db4dcb9b3a560f5fb0cc..26bed67ac22215d2fce4cfde2115bf9018eb5f39 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
@@ -34,6 +34,8 @@ def FNEG_D   : FP_ALU_2R<0x01141800, FPR64>;
 def FSQRT_D  : FP_ALU_2R<0x01144800, FPR64>;
 def FRECIP_D : FP_ALU_2R<0x01145800, FPR64>;
 def FRSQRT_D : FP_ALU_2R<0x01146800, FPR64>;
+def FRECIPE_D : FP_ALU_2R<0x01147800, FPR64>;
+def FRSQRTE_D : FP_ALU_2R<0x01148800, FPR64>;
 def FSCALEB_D : FP_ALU_3R<0x01110000, FPR64>;
 def FLOGB_D   : FP_ALU_2R<0x01142800, FPR64>;
 def FCOPYSIGN_D : FP_ALU_3R<0x01130000, FPR64>;
@@ -240,6 +242,12 @@ def : Pat<(f64 (fpextend FPR32:$src)), (FCVT_D_S FPR32:$src)>;
 // FP reciprocal operation
 def : Pat<(fdiv fpimm1, FPR64:$src), (FRECIP_D $src)>;
 
+let Predicates = [HasFrecipe] in {
+// FP approximate reciprocal operation
+def : Pat<(int_loongarch_frecipe_d FPR64:$src), (FRECIPE_D FPR64:$src)>;
+def : Pat<(int_loongarch_frsqrte_d FPR64:$src), (FRSQRTE_D FPR64:$src)>;
+}
+
 // fmadd.d: fj * fk + fa
 def : Pat<(fma FPR64:$fj, FPR64:$fk, FPR64:$fa), (FMADD_D $fj, $fk, $fa)>;
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 4fc2b4709840bfb3ab9cfa1015d80597e6c0d7c5..618ae7056425451e5a3927b70ea2cbc200a988c1 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -247,9 +247,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
 
       setOperationAction(ISD::SETCC, VT, Legal);
       setOperationAction(ISD::VSELECT, VT, Legal);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
     }
     for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
-      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
       setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal);
       setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, VT,
                          Legal);
@@ -293,9 +293,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
 
       setOperationAction(ISD::SETCC, VT, Legal);
       setOperationAction(ISD::VSELECT, VT, Legal);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
     }
     for (MVT VT : {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}) {
-      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
       setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal);
       setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, VT,
                          Legal);
@@ -422,9 +422,926 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
   return SDValue();
 }
 
+/// Determine whether a range fits a regular pattern of values.
+/// This function accounts for the possibility of jumping over the End iterator.
+template <typename ValType>
+static bool
+fitsRegularPattern(typename SmallVectorImpl<ValType>::const_iterator Begin,
+                   unsigned CheckStride,
+                   typename SmallVectorImpl<ValType>::const_iterator End,
+                   ValType ExpectedIndex, unsigned ExpectedIndexStride) {
+  auto &I = Begin;
+
+  while (I != End) {
+    if (*I != -1 && *I != ExpectedIndex)
+      return false;
+    ExpectedIndex += ExpectedIndexStride;
+
+    // Incrementing past End is undefined behaviour so we must increment one
+    // step at a time and check for End at each step.
+    for (unsigned n = 0; n < CheckStride && I != End; ++n, ++I)
+      ; // Empty loop body.
+  }
+  return true;
+}
+
+/// Lower VECTOR_SHUFFLE into VREPLVEI (if possible).
+///
+/// VREPLVEI performs vector broadcast based on an element specified by an
+/// integer immediate, with its mask being similar to:
+///   <x, x, x, ...>
+/// where x is any valid index.
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+  int SplatIndex = -1;
+  for (const auto &M : Mask) {
+    if (M != -1) {
+      SplatIndex = M;
+      break;
+    }
+  }
+
+  if (SplatIndex == -1)
+    return DAG.getUNDEF(VT);
+
+  assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index");
+  if (fitsRegularPattern<int>(Mask.begin(), 1, Mask.end(), SplatIndex, 0)) {
+    APInt Imm(64, SplatIndex);
+    return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
+                       DAG.getConstant(Imm, DL, MVT::i64));
+  }
+
+  return SDValue();
+}
+
+/// Lower VECTOR_SHUFFLE into VSHUF4I (if possible).
+///
+/// VSHUF4I splits the vector into blocks of four elements, then shuffles these
+/// elements according to a <4 x i2> constant (encoded as an integer immediate).
+///
+/// It is therefore possible to lower into VSHUF4I when the mask takes the form:
+///   <a, b, c, d, a+4, b+4, c+4, d+4, a+8, b+8, c+8, d+8, ...>
+/// When undef's appear they are treated as if they were whatever value is
+/// necessary in order to fit the above forms.
+///
+/// For example:
+///   %2 = shufflevector <8 x i16> %0, <8 x i16> undef,
+///                      <8 x i32> <i32 3, i32 2, i32 1, i32 0,
+///                                 i32 7, i32 6, i32 5, i32 4>
+/// is lowered to:
+///   (VSHUF4I_H $v0, $v1, 27)
+/// where the 27 comes from:
+///   3 + (2 << 2) + (1 << 4) + (0 << 6)
+static SDValue lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
+                                           MVT VT, SDValue V1, SDValue V2,
+                                           SelectionDAG &DAG) {
+
+  // When the size is less than 4, lower cost instructions may be used.
+  if (Mask.size() < 4)
+    return SDValue();
+
+  int SubMask[4] = {-1, -1, -1, -1};
+  for (unsigned i = 0; i < 4; ++i) {
+    for (unsigned j = i; j < Mask.size(); j += 4) {
+      int Idx = Mask[j];
+
+      // Convert from vector index to 4-element subvector index
+      // If an index refers to an element outside of the subvector then give up
+      if (Idx != -1) {
+        Idx -= 4 * (j / 4);
+        if (Idx < 0 || Idx >= 4)
+          return SDValue();
+      }
+
+      // If the mask has an undef, replace it with the current index.
+      // Note that it might still be undef if the current index is also undef
+      if (SubMask[i] == -1)
+        SubMask[i] = Idx;
+      // Check that non-undef values are the same as in the mask. If they
+      // aren't then give up
+      else if (Idx != -1 && Idx != SubMask[i])
+        return SDValue();
+    }
+  }
+
+  // Calculate the immediate. Replace any remaining undefs with zero
+  APInt Imm(64, 0);
+  for (int i = 3; i >= 0; --i) {
+    int Idx = SubMask[i];
+
+    if (Idx == -1)
+      Idx = 0;
+
+    Imm <<= 2;
+    Imm |= Idx & 0x3;
+  }
+
+  return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, V1,
+                     DAG.getConstant(Imm, DL, MVT::i64));
+}
+
+/// Lower VECTOR_SHUFFLE into VPACKEV (if possible).
+///
+/// VPACKEV interleaves the even elements from each vector.
+///
+/// It is possible to lower into VPACKEV when the mask consists of two of the
+/// following forms interleaved:
+///   <0, 2, 4, ...>
+///   <n, n+2, n+4, ...>
+/// where n is the number of elements in the vector.
+/// For example:
+///   <0, 0, 2, 2, 4, 4, ...>
+///   <0, n, 2, n+2, 4, n+4, ...>
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VPACKEV(const SDLoc &DL, ArrayRef<int> Mask,
+                                           MVT VT, SDValue V1, SDValue V2,
+                                           SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 2, End, 0, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size(), 2))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 0, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size(), 2))
+    V2 = OriV2;
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VPACKEV, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VPACKOD (if possible).
+///
+/// VPACKOD interleaves the odd elements from each vector.
+///
+/// It is possible to lower into VPACKOD when the mask consists of two of the
+/// following forms interleaved:
+///   <1, 3, 5, ...>
+///   <n+1, n+3, n+5, ...>
+/// where n is the number of elements in the vector.
+/// For example:
+///   <1, 1, 3, 3, 5, 5, ...>
+///   <1, n+1, 3, n+3, 5, n+5, ...>
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VPACKOD(const SDLoc &DL, ArrayRef<int> Mask,
+                                           MVT VT, SDValue V1, SDValue V2,
+                                           SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 2, End, 1, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size() + 1, 2))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 1, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size() + 1, 2))
+    V2 = OriV2;
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VPACKOD, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VILVH (if possible).
+///
+/// VILVH interleaves consecutive elements from the left (highest-indexed) half
+/// of each vector.
+///
+/// It is possible to lower into VILVH when the mask consists of two of the
+/// following forms interleaved:
+///   <x, x+1, x+2, ...>
+///   <n+x, n+x+1, n+x+2, ...>
+/// where n is the number of elements in the vector and x is half n.
+/// For example:
+///   <x, x, x+1, x+1, x+2, x+2, ...>
+///   <x, n+x, x+1, n+x+1, x+2, n+x+2, ...>
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VILVH(const SDLoc &DL, ArrayRef<int> Mask,
+                                         MVT VT, SDValue V1, SDValue V2,
+                                         SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 2, End, HalfSize, 1))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size() + HalfSize, 1))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, HalfSize, 1))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size() + HalfSize,
+                                   1))
+    V2 = OriV2;
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VILVH, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VILVL (if possible).
+///
+/// VILVL interleaves consecutive elements from the right (lowest-indexed) half
+/// of each vector.
+///
+/// It is possible to lower into VILVL when the mask consists of two of the
+/// following forms interleaved:
+///   <0, 1, 2, ...>
+///   <n, n+1, n+2, ...>
+/// where n is the number of elements in the vector.
+/// For example:
+///   <0, 0, 1, 1, 2, 2, ...>
+///   <0, n, 1, n+1, 2, n+2, ...>
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VILVL(const SDLoc &DL, ArrayRef<int> Mask,
+                                         MVT VT, SDValue V1, SDValue V2,
+                                         SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 2, End, 0, 1))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End, Mask.size(), 1))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 0, 1))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Mask.size(), 1))
+    V2 = OriV2;
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VILVL, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VPICKEV (if possible).
+///
+/// VPICKEV copies the even elements of each vector into the result vector.
+///
+/// It is possible to lower into VPICKEV when the mask consists of two of the
+/// following forms concatenated:
+///   <0, 2, 4, ...>
+///   <n, n+2, n+4, ...>
+/// where n is the number of elements in the vector.
+/// For example:
+///   <0, 2, 4, ..., 0, 2, 4, ...>
+///   <0, 2, 4, ..., n, n+2, n+4, ...>
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VPICKEV(const SDLoc &DL, ArrayRef<int> Mask,
+                                           MVT VT, SDValue V1, SDValue V2,
+                                           SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &Mid = Mask.begin() + Mask.size() / 2;
+  const auto &End = Mask.end();
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 1, Mid, 0, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 1, Mid, Mask.size(), 2))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Mid, 1, End, 0, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Mid, 1, End, Mask.size(), 2))
+    V2 = OriV2;
+
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VPICKEV, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VPICKOD (if possible).
+///
+/// VPICKOD copies the odd elements of each vector into the result vector.
+///
+/// It is possible to lower into VPICKOD when the mask consists of two of the
+/// following forms concatenated:
+///   <1, 3, 5, ...>
+///   <n+1, n+3, n+5, ...>
+/// where n is the number of elements in the vector.
+/// For example:
+///   <1, 3, 5, ..., 1, 3, 5, ...>
+///   <1, 3, 5, ..., n+1, n+3, n+5, ...>
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_VPICKOD(const SDLoc &DL, ArrayRef<int> Mask,
+                                           MVT VT, SDValue V1, SDValue V2,
+                                           SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &Mid = Mask.begin() + Mask.size() / 2;
+  const auto &End = Mask.end();
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 1, Mid, 1, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 1, Mid, Mask.size() + 1, 2))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Mid, 1, End, 1, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Mid, 1, End, Mask.size() + 1, 2))
+    V2 = OriV2;
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VPICKOD, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into VSHUF.
+///
+/// This mostly consists of converting the shuffle mask into a BUILD_VECTOR and
+/// adding it as an operand to the resulting VSHUF.
+static SDValue lowerVECTOR_SHUFFLE_VSHUF(const SDLoc &DL, ArrayRef<int> Mask,
+                                         MVT VT, SDValue V1, SDValue V2,
+                                         SelectionDAG &DAG) {
+
+  SmallVector<SDValue, 16> Ops;
+  for (auto M : Mask)
+    Ops.push_back(DAG.getConstant(M, DL, MVT::i64));
+
+  EVT MaskVecTy = VT.changeVectorElementTypeToInteger();
+  SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, Ops);
+
+  // VECTOR_SHUFFLE concatenates the vectors in an vectorwise fashion.
+  // <0b00, 0b01> + <0b10, 0b11> -> <0b00, 0b01, 0b10, 0b11>
+  // VSHF concatenates the vectors in a bitwise fashion:
+  // <0b00, 0b01> + <0b10, 0b11> ->
+  // 0b0100       + 0b1110       -> 0b01001110
+  //                                <0b10, 0b11, 0b00, 0b01>
+  // We must therefore swap the operands to get the correct result.
+  return DAG.getNode(LoongArchISD::VSHUF, DL, VT, MaskVec, V2, V1);
+}
+
+/// Dispatching routine to lower various 128-bit LoongArch vector shuffles.
+///
+/// This routine breaks down the specific type of 128-bit shuffle and
+/// dispatches to the lowering routines accordingly.
+static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                                  SDValue V1, SDValue V2, SelectionDAG &DAG) {
+  assert((VT.SimpleTy == MVT::v16i8 || VT.SimpleTy == MVT::v8i16 ||
+          VT.SimpleTy == MVT::v4i32 || VT.SimpleTy == MVT::v2i64 ||
+          VT.SimpleTy == MVT::v4f32 || VT.SimpleTy == MVT::v2f64) &&
+         "Vector type is unsupported for lsx!");
+  assert(V1.getSimpleValueType() == V2.getSimpleValueType() &&
+         "Two operands have different types!");
+  assert(VT.getVectorNumElements() == Mask.size() &&
+         "Unexpected mask size for shuffle!");
+  assert(Mask.size() % 2 == 0 && "Expected even mask size.");
+
+  SDValue Result;
+  // TODO: Add more comparison patterns.
+  if (V2.isUndef()) {
+    if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG)))
+      return Result;
+    if ((Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG)))
+      return Result;
+
+    // TODO: This comment may be enabled in the future to better match the
+    // pattern for instruction selection.
+    /* V2 = V1; */
+  }
+
+  // It is recommended not to change the pattern comparison order for better
+  // performance.
+  if ((Result = lowerVECTOR_SHUFFLE_VPACKEV(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VPACKOD(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VILVH(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VILVL(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VPICKEV(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG)))
+    return Result;
+
+  return SDValue();
+}
+
+/// Lower VECTOR_SHUFFLE into XVREPLVEI (if possible).
+///
+/// It is a XVREPLVEI when the mask is:
+///   <x, x, x, ..., x+n, x+n, x+n, ...>
+/// where the number of x is equal to n and n is half the length of vector.
+///
+/// When undef's appear in the mask they are treated as if they were whatever
+/// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL,
+                                             ArrayRef<int> Mask, MVT VT,
+                                             SDValue V1, SDValue V2,
+                                             SelectionDAG &DAG) {
+  int SplatIndex = -1;
+  for (const auto &M : Mask) {
+    if (M != -1) {
+      SplatIndex = M;
+      break;
+    }
+  }
+
+  if (SplatIndex == -1)
+    return DAG.getUNDEF(VT);
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+
+  assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index");
+  if (fitsRegularPattern<int>(Begin, 1, End - HalfSize, SplatIndex, 0) &&
+      fitsRegularPattern<int>(Begin + HalfSize, 1, End, SplatIndex + HalfSize,
+                              0)) {
+    APInt Imm(64, SplatIndex);
+    return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
+                       DAG.getConstant(Imm, DL, MVT::i64));
+  }
+
+  return SDValue();
+}
+
+/// Lower VECTOR_SHUFFLE into XVSHUF4I (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+  // When the size is less than or equal to 4, lower cost instructions may be
+  // used.
+  if (Mask.size() <= 4)
+    return SDValue();
+  return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG);
+}
+
+/// Lower VECTOR_SHUFFLE into XVPACKEV (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVPACKEV(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+  return lowerVECTOR_SHUFFLE_VPACKEV(DL, Mask, VT, V1, V2, DAG);
+}
+
+/// Lower VECTOR_SHUFFLE into XVPACKOD (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVPACKOD(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+  return lowerVECTOR_SHUFFLE_VPACKOD(DL, Mask, VT, V1, V2, DAG);
+}
+
+/// Lower VECTOR_SHUFFLE into XVILVH (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVILVH(const SDLoc &DL, ArrayRef<int> Mask,
+                                          MVT VT, SDValue V1, SDValue V2,
+                                          SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+  unsigned LeftSize = HalfSize / 2;
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 2, End - HalfSize, HalfSize - LeftSize,
+                              1) &&
+      fitsRegularPattern<int>(Begin + HalfSize, 2, End, HalfSize + LeftSize, 1))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End - HalfSize,
+                                   Mask.size() + HalfSize - LeftSize, 1) &&
+           fitsRegularPattern<int>(Begin + HalfSize, 2, End,
+                                   Mask.size() + HalfSize + LeftSize, 1))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize, HalfSize - LeftSize,
+                              1) &&
+      fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End, HalfSize + LeftSize,
+                              1))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize,
+                                   Mask.size() + HalfSize - LeftSize, 1) &&
+           fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End,
+                                   Mask.size() + HalfSize + LeftSize, 1))
+    V2 = OriV2;
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VILVH, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into XVILVL (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVILVL(const SDLoc &DL, ArrayRef<int> Mask,
+                                          MVT VT, SDValue V1, SDValue V2,
+                                          SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 2, End - HalfSize, 0, 1) &&
+      fitsRegularPattern<int>(Begin + HalfSize, 2, End, HalfSize, 1))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 2, End - HalfSize, Mask.size(), 1) &&
+           fitsRegularPattern<int>(Begin + HalfSize, 2, End,
+                                   Mask.size() + HalfSize, 1))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize, 0, 1) &&
+      fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End, HalfSize, 1))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End - HalfSize, Mask.size(),
+                                   1) &&
+           fitsRegularPattern<int>(Begin + 1 + HalfSize, 2, End,
+                                   Mask.size() + HalfSize, 1))
+    V2 = OriV2;
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VILVL, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into XVPICKEV (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVPICKEV(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &LeftMid = Mask.begin() + Mask.size() / 4;
+  const auto &Mid = Mask.begin() + Mask.size() / 2;
+  const auto &RightMid = Mask.end() - Mask.size() / 4;
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 1, LeftMid, 0, 2) &&
+      fitsRegularPattern<int>(Mid, 1, RightMid, HalfSize, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 1, LeftMid, Mask.size(), 2) &&
+           fitsRegularPattern<int>(Mid, 1, RightMid, Mask.size() + HalfSize, 2))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(LeftMid, 1, Mid, 0, 2) &&
+      fitsRegularPattern<int>(RightMid, 1, End, HalfSize, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(LeftMid, 1, Mid, Mask.size(), 2) &&
+           fitsRegularPattern<int>(RightMid, 1, End, Mask.size() + HalfSize, 2))
+    V2 = OriV2;
+
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VPICKEV, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into XVPICKOD (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVPICKOD(const SDLoc &DL, ArrayRef<int> Mask,
+                                            MVT VT, SDValue V1, SDValue V2,
+                                            SelectionDAG &DAG) {
+
+  const auto &Begin = Mask.begin();
+  const auto &LeftMid = Mask.begin() + Mask.size() / 4;
+  const auto &Mid = Mask.begin() + Mask.size() / 2;
+  const auto &RightMid = Mask.end() - Mask.size() / 4;
+  const auto &End = Mask.end();
+  unsigned HalfSize = Mask.size() / 2;
+  SDValue OriV1 = V1, OriV2 = V2;
+
+  if (fitsRegularPattern<int>(Begin, 1, LeftMid, 1, 2) &&
+      fitsRegularPattern<int>(Mid, 1, RightMid, HalfSize + 1, 2))
+    V1 = OriV1;
+  else if (fitsRegularPattern<int>(Begin, 1, LeftMid, Mask.size() + 1, 2) &&
+           fitsRegularPattern<int>(Mid, 1, RightMid, Mask.size() + HalfSize + 1,
+                                   2))
+    V1 = OriV2;
+  else
+    return SDValue();
+
+  if (fitsRegularPattern<int>(LeftMid, 1, Mid, 1, 2) &&
+      fitsRegularPattern<int>(RightMid, 1, End, HalfSize + 1, 2))
+    V2 = OriV1;
+  else if (fitsRegularPattern<int>(LeftMid, 1, Mid, Mask.size() + 1, 2) &&
+           fitsRegularPattern<int>(RightMid, 1, End, Mask.size() + HalfSize + 1,
+                                   2))
+    V2 = OriV2;
+  else
+    return SDValue();
+
+  return DAG.getNode(LoongArchISD::VPICKOD, DL, VT, V2, V1);
+}
+
+/// Lower VECTOR_SHUFFLE into XVSHUF (if possible).
+static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef<int> Mask,
+                                          MVT VT, SDValue V1, SDValue V2,
+                                          SelectionDAG &DAG) {
+
+  int MaskSize = Mask.size();
+  int HalfSize = Mask.size() / 2;
+  const auto &Begin = Mask.begin();
+  const auto &Mid = Mask.begin() + HalfSize;
+  const auto &End = Mask.end();
+
+  // VECTOR_SHUFFLE concatenates the vectors:
+  //  <0, 1, 2, 3, 4, 5, 6, 7> + <8, 9, 10, 11, 12, 13, 14, 15>
+  //  shuffling ->
+  //  <0, 1, 2, 3, 8, 9, 10, 11> <4, 5, 6, 7, 12, 13, 14, 15>
+  //
+  // XVSHUF concatenates the vectors:
+  //  <a0, a1, a2, a3, b0, b1, b2, b3> + <a4, a5, a6, a7, b4, b5, b6, b7>
+  //  shuffling ->
+  //  <a0, a1, a2, a3, a4, a5, a6, a7> + <b0, b1, b2, b3, b4, b5, b6, b7>
+  SmallVector<SDValue, 8> MaskAlloc;
+  for (auto it = Begin; it < Mid; it++) {
+    if (*it < 0) // UNDEF
+      MaskAlloc.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
+    else if ((*it >= 0 && *it < HalfSize) ||
+             (*it >= MaskSize && *it <= MaskSize + HalfSize)) {
+      int M = *it < HalfSize ? *it : *it - HalfSize;
+      MaskAlloc.push_back(DAG.getTargetConstant(M, DL, MVT::i64));
+    } else
+      return SDValue();
+  }
+  assert((int)MaskAlloc.size() == HalfSize && "xvshuf convert failed!");
+
+  for (auto it = Mid; it < End; it++) {
+    if (*it < 0) // UNDEF
+      MaskAlloc.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
+    else if ((*it >= HalfSize && *it < MaskSize) ||
+             (*it >= MaskSize + HalfSize && *it < MaskSize * 2)) {
+      int M = *it < MaskSize ? *it - HalfSize : *it - MaskSize;
+      MaskAlloc.push_back(DAG.getTargetConstant(M, DL, MVT::i64));
+    } else
+      return SDValue();
+  }
+  assert((int)MaskAlloc.size() == MaskSize && "xvshuf convert failed!");
+
+  EVT MaskVecTy = VT.changeVectorElementTypeToInteger();
+  SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, MaskAlloc);
+  return DAG.getNode(LoongArchISD::VSHUF, DL, VT, MaskVec, V2, V1);
+}
+
+/// Shuffle vectors by lane to generate more optimized instructions.
+/// 256-bit shuffles are always considered as 2-lane 128-bit shuffles.
+///
+/// Therefore, except for the following four cases, other cases are regarded
+/// as cross-lane shuffles, where optimization is relatively limited.
+///
+/// - Shuffle high, low lanes of two inputs vector
+///   <0, 1, 2, 3> + <4, 5, 6, 7> --- <0, 5, 3, 6>
+/// - Shuffle low, high lanes of two inputs vector
+///   <0, 1, 2, 3> + <4, 5, 6, 7> --- <3, 6, 0, 5>
+/// - Shuffle low, low lanes of two inputs vector
+///   <0, 1, 2, 3> + <4, 5, 6, 7> --- <3, 6, 3, 6>
+/// - Shuffle high, high lanes of two inputs vector
+///   <0, 1, 2, 3> + <4, 5, 6, 7> --- <0, 5, 0, 5>
+///
+/// The first case is the closest to LoongArch instructions and the other
+/// cases need to be converted to it for processing.
+///
+/// This function may modify V1, V2 and Mask
+static void canonicalizeShuffleVectorByLane(const SDLoc &DL,
+                                            MutableArrayRef<int> Mask, MVT VT,
+                                            SDValue &V1, SDValue &V2,
+                                            SelectionDAG &DAG) {
+
+  enum HalfMaskType { HighLaneTy, LowLaneTy, None };
+
+  int MaskSize = Mask.size();
+  int HalfSize = Mask.size() / 2;
+
+  HalfMaskType preMask = None, postMask = None;
+
+  if (std::all_of(Mask.begin(), Mask.begin() + HalfSize, [&](int M) {
+        return M < 0 || (M >= 0 && M < HalfSize) ||
+               (M >= MaskSize && M < MaskSize + HalfSize);
+      }))
+    preMask = HighLaneTy;
+  else if (std::all_of(Mask.begin(), Mask.begin() + HalfSize, [&](int M) {
+             return M < 0 || (M >= HalfSize && M < MaskSize) ||
+                    (M >= MaskSize + HalfSize && M < MaskSize * 2);
+           }))
+    preMask = LowLaneTy;
+
+  if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) {
+        return M < 0 || (M >= 0 && M < HalfSize) ||
+               (M >= MaskSize && M < MaskSize + HalfSize);
+      }))
+    postMask = HighLaneTy;
+  else if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) {
+             return M < 0 || (M >= HalfSize && M < MaskSize) ||
+                    (M >= MaskSize + HalfSize && M < MaskSize * 2);
+           }))
+    postMask = LowLaneTy;
+
+  // The pre-half of mask is high lane type, and the post-half of mask
+  // is low lane type, which is closest to the LoongArch instructions.
+  //
+  // Note: In the LoongArch architecture, the high lane of mask corresponds
+  // to the lower 128-bit of vector register, and the low lane of mask
+  // corresponds the higher 128-bit of vector register.
+  if (preMask == HighLaneTy && postMask == LowLaneTy) {
+    return;
+  }
+  if (preMask == LowLaneTy && postMask == HighLaneTy) {
+    V1 = DAG.getBitcast(MVT::v4i64, V1);
+    V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
+                     DAG.getConstant(0b01001110, DL, MVT::i64));
+    V1 = DAG.getBitcast(VT, V1);
+
+    if (!V2.isUndef()) {
+      V2 = DAG.getBitcast(MVT::v4i64, V2);
+      V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
+                       DAG.getConstant(0b01001110, DL, MVT::i64));
+      V2 = DAG.getBitcast(VT, V2);
+    }
+
+    for (auto it = Mask.begin(); it < Mask.begin() + HalfSize; it++) {
+      *it = *it < 0 ? *it : *it - HalfSize;
+    }
+    for (auto it = Mask.begin() + HalfSize; it < Mask.end(); it++) {
+      *it = *it < 0 ? *it : *it + HalfSize;
+    }
+  } else if (preMask == LowLaneTy && postMask == LowLaneTy) {
+    V1 = DAG.getBitcast(MVT::v4i64, V1);
+    V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
+                     DAG.getConstant(0b11101110, DL, MVT::i64));
+    V1 = DAG.getBitcast(VT, V1);
+
+    if (!V2.isUndef()) {
+      V2 = DAG.getBitcast(MVT::v4i64, V2);
+      V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
+                       DAG.getConstant(0b11101110, DL, MVT::i64));
+      V2 = DAG.getBitcast(VT, V2);
+    }
+
+    for (auto it = Mask.begin(); it < Mask.begin() + HalfSize; it++) {
+      *it = *it < 0 ? *it : *it - HalfSize;
+    }
+  } else if (preMask == HighLaneTy && postMask == HighLaneTy) {
+    V1 = DAG.getBitcast(MVT::v4i64, V1);
+    V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1,
+                     DAG.getConstant(0b01000100, DL, MVT::i64));
+    V1 = DAG.getBitcast(VT, V1);
+
+    if (!V2.isUndef()) {
+      V2 = DAG.getBitcast(MVT::v4i64, V2);
+      V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2,
+                       DAG.getConstant(0b01000100, DL, MVT::i64));
+      V2 = DAG.getBitcast(VT, V2);
+    }
+
+    for (auto it = Mask.begin() + HalfSize; it < Mask.end(); it++) {
+      *it = *it < 0 ? *it : *it + HalfSize;
+    }
+  } else { // cross-lane
+    return;
+  }
+}
+
+/// Dispatching routine to lower various 256-bit LoongArch vector shuffles.
+///
+/// This routine breaks down the specific type of 256-bit shuffle and
+/// dispatches to the lowering routines accordingly.
+static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                                  SDValue V1, SDValue V2, SelectionDAG &DAG) {
+  assert((VT.SimpleTy == MVT::v32i8 || VT.SimpleTy == MVT::v16i16 ||
+          VT.SimpleTy == MVT::v8i32 || VT.SimpleTy == MVT::v4i64 ||
+          VT.SimpleTy == MVT::v8f32 || VT.SimpleTy == MVT::v4f64) &&
+         "Vector type is unsupported for lasx!");
+  assert(V1.getSimpleValueType() == V2.getSimpleValueType() &&
+         "Two operands have different types!");
+  assert(VT.getVectorNumElements() == Mask.size() &&
+         "Unexpected mask size for shuffle!");
+  assert(Mask.size() % 2 == 0 && "Expected even mask size.");
+  assert(Mask.size() >= 4 && "Mask size is less than 4.");
+
+  // canonicalize non cross-lane shuffle vector
+  SmallVector<int> NewMask(Mask);
+  canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG);
+
+  SDValue Result;
+  // TODO: Add more comparison patterns.
+  if (V2.isUndef()) {
+    if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG)))
+      return Result;
+    if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG)))
+      return Result;
+
+    // TODO: This comment may be enabled in the future to better match the
+    // pattern for instruction selection.
+    /* V2 = V1; */
+  }
+
+  // It is recommended not to change the pattern comparison order for better
+  // performance.
+  if ((Result = lowerVECTOR_SHUFFLE_XVPACKEV(DL, NewMask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_XVPACKOD(DL, NewMask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_XVILVH(DL, NewMask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_XVILVL(DL, NewMask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_XVPICKEV(DL, NewMask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, NewMask, VT, V1, V2, DAG)))
+    return Result;
+  if ((Result = lowerVECTOR_SHUFFLE_XVSHUF(DL, NewMask, VT, V1, V2, DAG)))
+    return Result;
+
+  return SDValue();
+}
+
 SDValue LoongArchTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                                      SelectionDAG &DAG) const {
-  // TODO: custom shuffle.
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> OrigMask = SVOp->getMask();
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  MVT VT = Op.getSimpleValueType();
+  int NumElements = VT.getVectorNumElements();
+  SDLoc DL(Op);
+
+  bool V1IsUndef = V1.isUndef();
+  bool V2IsUndef = V2.isUndef();
+  if (V1IsUndef && V2IsUndef)
+    return DAG.getUNDEF(VT);
+
+  // When we create a shuffle node we put the UNDEF node to second operand,
+  // but in some cases the first operand may be transformed to UNDEF.
+  // In this case we should just commute the node.
+  if (V1IsUndef)
+    return DAG.getCommutedVectorShuffle(*SVOp);
+
+  // Check for non-undef masks pointing at an undef vector and make the masks
+  // undef as well. This makes it easier to match the shuffle based solely on
+  // the mask.
+  if (V2IsUndef &&
+      any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
+    SmallVector<int, 8> NewMask(OrigMask);
+    for (int &M : NewMask)
+      if (M >= NumElements)
+        M = -1;
+    return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
+  }
+
+  // Check for illegal shuffle mask element index values.
+  int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
+  (void)MaskUpperLimit;
+  assert(llvm::all_of(OrigMask,
+                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
+         "Out of bounds shuffle index");
+
+  // For each vector width, delegate to a specialized lowering routine.
+  if (VT.is128BitVector())
+    return lower128BitShuffle(DL, OrigMask, VT, V1, V2, DAG);
+
+  if (VT.is256BitVector())
+    return lower256BitShuffle(DL, OrigMask, VT, V1, V2, DAG);
+
   return SDValue();
 }
 
@@ -3389,8 +4306,12 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
 
     // TODO: Add more target-dependent nodes later.
     NODE_NAME_CASE(CALL)
+    NODE_NAME_CASE(CALL_MEDIUM)
+    NODE_NAME_CASE(CALL_LARGE)
     NODE_NAME_CASE(RET)
     NODE_NAME_CASE(TAIL)
+    NODE_NAME_CASE(TAIL_MEDIUM)
+    NODE_NAME_CASE(TAIL_LARGE)
     NODE_NAME_CASE(SLL_W)
     NODE_NAME_CASE(SRA_W)
     NODE_NAME_CASE(SRL_W)
@@ -3435,6 +4356,16 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(MOVFCSR2GR)
     NODE_NAME_CASE(CACOP_D)
     NODE_NAME_CASE(CACOP_W)
+    NODE_NAME_CASE(VSHUF)
+    NODE_NAME_CASE(VPICKEV)
+    NODE_NAME_CASE(VPICKOD)
+    NODE_NAME_CASE(VPACKEV)
+    NODE_NAME_CASE(VPACKOD)
+    NODE_NAME_CASE(VILVL)
+    NODE_NAME_CASE(VILVH)
+    NODE_NAME_CASE(VSHUF4I)
+    NODE_NAME_CASE(VREPLVEI)
+    NODE_NAME_CASE(XVPERMI)
     NODE_NAME_CASE(VPICK_SEXT_ELT)
     NODE_NAME_CASE(VPICK_ZEXT_ELT)
     NODE_NAME_CASE(VREPLVE)
@@ -4248,15 +5179,31 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // Emit the call.
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  unsigned Op;
+  switch (DAG.getTarget().getCodeModel()) {
+  default:
+    report_fatal_error("Unsupported code model");
+  case CodeModel::Small:
+    Op = IsTailCall ? LoongArchISD::TAIL : LoongArchISD::CALL;
+    break;
+  case CodeModel::Medium:
+    assert(Subtarget.is64Bit() && "Medium code model requires LA64");
+    Op = IsTailCall ? LoongArchISD::TAIL_MEDIUM : LoongArchISD::CALL_MEDIUM;
+    break;
+  case CodeModel::Large:
+    assert(Subtarget.is64Bit() && "Large code model requires LA64");
+    Op = IsTailCall ? LoongArchISD::TAIL_LARGE : LoongArchISD::CALL_LARGE;
+    break;
+  }
 
   if (IsTailCall) {
     MF.getFrameInfo().setHasTailCall();
-    SDValue Ret = DAG.getNode(LoongArchISD::TAIL, DL, NodeTys, Ops);
+    SDValue Ret = DAG.getNode(Op, DL, NodeTys, Ops);
     DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
     return Ret;
   }
 
-  Chain = DAG.getNode(LoongArchISD::CALL, DL, NodeTys, Ops);
+  Chain = DAG.getNode(Op, DL, NodeTys, Ops);
   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
   Glue = Chain.getValue(1);
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 2c9826a13237b4bb5992c78d8cbd5ed68f933632..a5ee740c1261eefd7246b2bf822b9b338691ff35 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -28,8 +28,12 @@ enum NodeType : unsigned {
 
   // TODO: add more LoongArchISDs
   CALL,
+  CALL_MEDIUM,
+  CALL_LARGE,
   RET,
   TAIL,
+  TAIL_MEDIUM,
+  TAIL_LARGE,
 
   // 32-bit shifts, directly matching the semantics of the named LoongArch
   // instructions.
@@ -113,6 +117,16 @@ enum NodeType : unsigned {
 
   // Vector Shuffle
   VREPLVE,
+  VSHUF,
+  VPICKEV,
+  VPICKOD,
+  VPACKEV,
+  VPACKOD,
+  VILVL,
+  VILVH,
+  VSHUF4I,
+  VREPLVEI,
+  XVPERMI,
 
   // Extended vector element extraction
   VPICK_SEXT_ELT,
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index ab189055681498eedfbfc91879d1efd7606e332c..756c460f916b71e800890c05ef32e2f951d3109d 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -69,6 +69,18 @@ def loongarch_ret : SDNode<"LoongArchISD::RET", SDTNone,
 def loongarch_tail : SDNode<"LoongArchISD::TAIL", SDT_LoongArchCall,
                             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                              SDNPVariadic]>;
+def loongarch_call_medium : SDNode<"LoongArchISD::CALL_MEDIUM", SDT_LoongArchCall,
+                                   [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                                    SDNPVariadic]>;
+def loongarch_tail_medium : SDNode<"LoongArchISD::TAIL_MEDIUM", SDT_LoongArchCall,
+                                   [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                                    SDNPVariadic]>;
+def loongarch_call_large : SDNode<"LoongArchISD::CALL_LARGE", SDT_LoongArchCall,
+                                  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                                   SDNPVariadic]>;
+def loongarch_tail_large : SDNode<"LoongArchISD::TAIL_LARGE", SDT_LoongArchCall,
+                                  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                                   SDNPVariadic]>;
 def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>;
 def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>;
 def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>;
@@ -351,6 +363,10 @@ def simm20_lu32id : SImm20Operand {
   let ParserMatchClass = SImmAsmOperand<20, "lu32id">;
 }
 
+def simm20_pcaddu18i : SImm20Operand {
+  let ParserMatchClass = SImmAsmOperand<20, "pcaddu18i">;
+}
+
 def simm21_lsl2 : Operand<OtherVT> {
   let ParserMatchClass = SImmAsmOperand<21, "lsl2">;
   let EncoderMethod = "getImmOpValueAsr<2>";
@@ -618,15 +634,24 @@ class AM_3R<bits<32> op>
     : Fmt3R<op, (outs GPR:$rd), (ins GPR:$rk, GPRMemAtomic:$rj),
             "$rd, $rk, $rj">;
 
-let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
 class LLBase<bits<32> op>
     : Fmt2RI14<op, (outs GPR:$rd), (ins GPR:$rj, simm14_lsl2:$imm14),
                "$rd, $rj, $imm14">;
+class LLBase_ACQ<bits<32> op>
+    : Fmt2R<op, (outs GPR:$rd), (ins GPR:$rj), "$rd, $rj">;
+}
 
-let hasSideEffects = 0, mayLoad = 0, mayStore = 1, Constraints = "$rd = $dst" in
+let hasSideEffects = 0, mayLoad = 0, mayStore = 1, Constraints = "$rd = $dst" in {
 class SCBase<bits<32> op>
     : Fmt2RI14<op, (outs GPR:$dst), (ins GPR:$rd, GPR:$rj, simm14_lsl2:$imm14),
                "$rd, $rj, $imm14">;
+class SCBase_128<bits<32> op>
+    : Fmt3R<op, (outs GPR:$dst), (ins GPR:$rd, GPR:$rk, GPR:$rj),
+               "$rd, $rk, $rj">;
+class SCBase_REL<bits<32> op>
+    : Fmt2R<op, (outs GPR:$dst), (ins GPR:$rd, GPR:$rj), "$rd, $rj">;
+}
 
 let hasSideEffects = 1 in
 class IOCSRRD<bits<32> op>
@@ -738,6 +763,8 @@ def PRELD : FmtPRELD<(outs), (ins uimm5:$imm5, GPR:$rj, simm12:$imm12),
 // Atomic Memory Access Instructions
 def LL_W : LLBase<0x20000000>;
 def SC_W : SCBase<0x21000000>;
+def LLACQ_W : LLBase_ACQ<0x38578000>;
+def SCREL_W : SCBase_REL<0x38578400>;
 
 // Barrier Instructions
 def DBAR : MISC_I15<0x38720000>;
@@ -772,7 +799,7 @@ def LU32I_D : Fmt1RI20<0x16000000, (outs GPR:$dst),
                        "$rd, $imm20">;
 }
 def LU52I_D : ALU_2RI12<0x03000000, simm12_lu52id>;
-def PCADDU18I : ALU_1RI20<0x1e000000, simm20>;
+def PCADDU18I : ALU_1RI20<0x1e000000, simm20_pcaddu18i>;
 def MUL_D     : ALU_3R<0x001d8000>;
 def MULH_D    : ALU_3R<0x001e0000>;
 def MULH_DU   : ALU_3R<0x001e8000>;
@@ -859,8 +886,12 @@ def STLE_W : STORE_3R<0x387f0000>;
 def STLE_D : STORE_3R<0x387f8000>;
 
 // Atomic Memory Access Instructions for 64-bits
+def AMSWAP_B     : AM_3R<0x385c0000>;
+def AMSWAP_H     : AM_3R<0x385c8000>;
 def AMSWAP_W     : AM_3R<0x38600000>;
 def AMSWAP_D     : AM_3R<0x38608000>;
+def AMADD_B      : AM_3R<0x385d0000>;
+def AMADD_H      : AM_3R<0x385d8000>;
 def AMADD_W      : AM_3R<0x38610000>;
 def AMADD_D      : AM_3R<0x38618000>;
 def AMAND_W      : AM_3R<0x38620000>;
@@ -877,8 +908,12 @@ def AMMAX_WU     : AM_3R<0x38670000>;
 def AMMAX_DU     : AM_3R<0x38678000>;
 def AMMIN_WU     : AM_3R<0x38680000>;
 def AMMIN_DU     : AM_3R<0x38688000>;
+def AMSWAP__DB_B : AM_3R<0x385e0000>;
+def AMSWAP__DB_H : AM_3R<0x385e8000>;
 def AMSWAP__DB_W : AM_3R<0x38690000>;
 def AMSWAP__DB_D : AM_3R<0x38698000>;
+def AMADD__DB_B  : AM_3R<0x385f0000>;
+def AMADD__DB_H  : AM_3R<0x385f8000>;
 def AMADD__DB_W  : AM_3R<0x386a0000>;
 def AMADD__DB_D  : AM_3R<0x386a8000>;
 def AMAND__DB_W  : AM_3R<0x386b0000>;
@@ -895,8 +930,19 @@ def AMMAX__DB_WU : AM_3R<0x38700000>;
 def AMMAX__DB_DU : AM_3R<0x38708000>;
 def AMMIN__DB_WU : AM_3R<0x38710000>;
 def AMMIN__DB_DU : AM_3R<0x38718000>;
+def AMCAS_B     : AM_3R<0x38580000>;
+def AMCAS_H     : AM_3R<0x38588000>;
+def AMCAS_W     : AM_3R<0x38590000>;
+def AMCAS_D     : AM_3R<0x38598000>;
+def AMCAS__DB_B     : AM_3R<0x385a0000>;
+def AMCAS__DB_H     : AM_3R<0x385a8000>;
+def AMCAS__DB_W     : AM_3R<0x385b0000>;
+def AMCAS__DB_D     : AM_3R<0x385b8000>;
 def LL_D : LLBase<0x22000000>;
 def SC_D : SCBase<0x23000000>;
+def SC_Q : SCBase_128<0x38570000>;
+def LLACQ_D : LLBase_ACQ<0x38578800>;
+def SCREL_D : SCBase_REL<0x38578C00>;
 
 // CRC Check Instructions
 def CRC_W_B_W  : ALU_3R<0x00240000>;
@@ -1323,16 +1369,43 @@ def : Pat<(brind GPR:$rj), (PseudoBRIND GPR:$rj, 0)>;
 def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)),
           (PseudoBRIND GPR:$rj, simm16_lsl2:$imm16)>;
 
+// Function call with 'Small' code model.
 let isCall = 1, Defs = [R1] in
-def PseudoCALL : Pseudo<(outs), (ins simm26_symbol:$func)>;
+def PseudoCALL : Pseudo<(outs), (ins bare_symbol:$func)>;
 
 def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>;
 def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>;
 
+// Function call with 'Medium' code model.
+let isCall = 1, Defs = [R1, R20], Size = 8 in
+def PseudoCALL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$func)>;
+
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_call_medium tglobaladdr:$func),
+          (PseudoCALL_MEDIUM tglobaladdr:$func)>;
+def : Pat<(loongarch_call_medium texternalsym:$func),
+          (PseudoCALL_MEDIUM texternalsym:$func)>;
+} // Predicates = [IsLA64]
+
+// Function call with 'Large' code model.
+let isCall = 1, Defs = [R1, R20], Size = 24 in
+def PseudoCALL_LARGE: Pseudo<(outs), (ins bare_symbol:$func)>;
+
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_call_large tglobaladdr:$func),
+          (PseudoCALL_LARGE tglobaladdr:$func)>;
+def : Pat<(loongarch_call_large texternalsym:$func),
+          (PseudoCALL_LARGE texternalsym:$func)>;
+} // Predicates = [IsLA64]
+
 let isCall = 1, Defs = [R1] in
 def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rj),
                                 [(loongarch_call GPR:$rj)]>,
                          PseudoInstExpansion<(JIRL R1, GPR:$rj, 0)>;
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_call_medium GPR:$rj), (PseudoCALLIndirect GPR:$rj)>;
+def : Pat<(loongarch_call_large GPR:$rj), (PseudoCALLIndirect GPR:$rj)>;
+}
 
 let isCall = 1, hasSideEffects = 0, mayStore = 0, mayLoad = 0, Defs = [R1] in
 def PseudoJIRL_CALL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>,
@@ -1343,18 +1416,47 @@ let isBarrier = 1, isReturn = 1, isTerminator = 1 in
 def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>,
                 PseudoInstExpansion<(JIRL R0, R1, 0)>;
 
+// Tail call with 'Small' code model.
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
-def PseudoTAIL : Pseudo<(outs), (ins simm26_symbol:$dst)>;
+def PseudoTAIL : Pseudo<(outs), (ins bare_symbol:$dst)>;
 
 def : Pat<(loongarch_tail (iPTR tglobaladdr:$dst)),
           (PseudoTAIL tglobaladdr:$dst)>;
 def : Pat<(loongarch_tail (iPTR texternalsym:$dst)),
           (PseudoTAIL texternalsym:$dst)>;
 
+// Tail call with 'Medium' code model.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+    Uses = [R3], Defs = [R20], Size = 8 in
+def PseudoTAIL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$dst)>;
+
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_tail_medium (iPTR tglobaladdr:$dst)),
+          (PseudoTAIL_MEDIUM tglobaladdr:$dst)>;
+def : Pat<(loongarch_tail_medium (iPTR texternalsym:$dst)),
+          (PseudoTAIL_MEDIUM texternalsym:$dst)>;
+} // Predicates = [IsLA64]
+
+// Tail call with 'Large' code model.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+    Uses = [R3], Defs = [R19, R20], Size = 24 in
+def PseudoTAIL_LARGE : Pseudo<(outs), (ins bare_symbol:$dst)>;
+
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_tail_large (iPTR tglobaladdr:$dst)),
+          (PseudoTAIL_LARGE tglobaladdr:$dst)>;
+def : Pat<(loongarch_tail_large (iPTR texternalsym:$dst)),
+          (PseudoTAIL_LARGE texternalsym:$dst)>;
+} // Predicates = [IsLA64]
+
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in
 def PseudoTAILIndirect : Pseudo<(outs), (ins GPRT:$rj),
                                 [(loongarch_tail GPRT:$rj)]>,
                          PseudoInstExpansion<(JIRL R0, GPR:$rj, 0)>;
+let Predicates = [IsLA64] in {
+def : Pat<(loongarch_tail_medium GPR:$rj), (PseudoTAILIndirect GPR:$rj)>;
+def : Pat<(loongarch_tail_large GPR:$rj), (PseudoTAILIndirect GPR:$rj)>;
+}
 
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
     hasSideEffects = 0, mayStore = 0, mayLoad = 0, Uses = [R3] in
@@ -1367,6 +1469,19 @@ def PseudoJIRL_TAIL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>,
                       PseudoInstExpansion<(JIRL R0, GPR:$rj,
                                            simm16_lsl2:$imm16)>;
 
+/// call36/taill36 macro instructions
+let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, isAsmParserOnly = 1,
+    Defs = [R1], Size = 8, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in
+def PseudoCALL36 : Pseudo<(outs), (ins bare_symbol:$dst), [],
+                          "call36", "$dst">,
+                   Requires<[IsLA64]>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3],
+    isCodeGenOnly = 0, isAsmParserOnly = 1, Size = 8, hasSideEffects = 0,
+    mayStore = 0, mayLoad = 0 in
+def PseudoTAIL36 : Pseudo<(outs), (ins GPR:$tmp, bare_symbol:$dst), [],
+                          "tail36", "$tmp, $dst">,
+                   Requires<[IsLA64]>;
+
 /// Load address (la*) macro instructions.
 
 // Define isCodeGenOnly = 0 to expose them to tablegened assembly parser.
@@ -1379,6 +1494,7 @@ def PseudoLA_ABS_LARGE : Pseudo<(outs GPR:$dst),
                                 "la.abs", "$dst, $src">;
 def PseudoLA_PCREL : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
                             "la.pcrel", "$dst, $src">;
+let Defs = [R20], Size = 20 in
 def PseudoLA_PCREL_LARGE : Pseudo<(outs GPR:$dst),
                                   (ins GPR:$tmp, bare_symbol:$src), [],
                                   "la.pcrel", "$dst, $tmp, $src">,
@@ -1390,28 +1506,30 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,
     isAsmParserOnly = 1 in {
 def PseudoLA_GOT : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
                           "la.got", "$dst, $src">;
+def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+                             "la.tls.ie", "$dst, $src">;
+def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+                             "la.tls.ld", "$dst, $src">;
+def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
+                             "la.tls.gd", "$dst, $src">;
+let Defs = [R20], Size = 20 in {
 def PseudoLA_GOT_LARGE : Pseudo<(outs GPR:$dst),
                                 (ins GPR:$tmp, bare_symbol:$src), [],
                                 "la.got", "$dst, $tmp, $src">,
                          Requires<[IsLA64]>;
-def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
-                             "la.tls.ie", "$dst, $src">;
 def PseudoLA_TLS_IE_LARGE : Pseudo<(outs GPR:$dst),
                                    (ins GPR:$tmp, bare_symbol:$src), [],
                                    "la.tls.ie", "$dst, $tmp, $src">,
                             Requires<[IsLA64]>;
-def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
-                             "la.tls.ld", "$dst, $src">;
 def PseudoLA_TLS_LD_LARGE : Pseudo<(outs GPR:$dst),
                                    (ins GPR:$tmp, bare_symbol:$src), [],
                                    "la.tls.ld", "$dst, $tmp, $src">,
                             Requires<[IsLA64]>;
-def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
-                             "la.tls.gd", "$dst, $src">;
 def PseudoLA_TLS_GD_LARGE : Pseudo<(outs GPR:$dst),
                                    (ins GPR:$tmp, bare_symbol:$src), [],
                                    "la.tls.gd", "$dst, $tmp, $src">,
                             Requires<[IsLA64]>;
+} // Defs = [R20], Size = 20
 }
 
 // Load address inst alias: "la", "la.global" and "la.local".
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 492b62da6ce7806dff589e3adc944a7397fddd83..6f1969bf8cae051e9d792e9609d8513c9233bbc3 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -10,6 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_loongArchV1RUimm>;
+
 def lasxsplati8
   : PatFrag<(ops node:$e0),
             (v32i8 (build_vector node:$e0, node:$e0, node:$e0, node:$e0,
@@ -771,6 +773,10 @@ def XVFRECIP_S : LASX2R_XX<0x769cf400>;
 def XVFRECIP_D : LASX2R_XX<0x769cf800>;
 def XVFRSQRT_S : LASX2R_XX<0x769d0400>;
 def XVFRSQRT_D : LASX2R_XX<0x769d0800>;
+def XVFRECIPE_S : LASX2R_XX<0x769d1400>;
+def XVFRECIPE_D : LASX2R_XX<0x769d1800>;
+def XVFRSQRTE_S : LASX2R_XX<0x769d2400>;
+def XVFRSQRTE_D : LASX2R_XX<0x769d2800>;
 
 def XVFCVTL_S_H : LASX2R_XX<0x769de800>;
 def XVFCVTH_S_H : LASX2R_XX<0x769dec00>;
@@ -1571,6 +1577,134 @@ def : Pat<(loongarch_vreplve v8i32:$xj, GRLenVT:$rk),
 def : Pat<(loongarch_vreplve v4i64:$xj, GRLenVT:$rk),
           (XVREPLVE_D v4i64:$xj, GRLenVT:$rk)>;
 
+// XVSHUF_{B/H/W/D}
+def : Pat<(loongarch_vshuf v32i8:$xa, v32i8:$xj, v32i8:$xk),
+          (XVSHUF_B v32i8:$xj, v32i8:$xk, v32i8:$xa)>;
+def : Pat<(loongarch_vshuf v16i16:$xd, v16i16:$xj, v16i16:$xk),
+          (XVSHUF_H v16i16:$xd, v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vshuf v8i32:$xd, v8i32:$xj, v8i32:$xk),
+          (XVSHUF_W v8i32:$xd, v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vshuf v4i64:$xd, v4i64:$xj, v4i64:$xk),
+          (XVSHUF_D v4i64:$xd, v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vshuf v8i32:$xd, v8f32:$xj, v8f32:$xk),
+          (XVSHUF_W v8i32:$xd, v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vshuf v4i64:$xd, v4f64:$xj, v4f64:$xk),
+          (XVSHUF_D v4i64:$xd, v4f64:$xj, v4f64:$xk)>;
+
+// XVPICKEV_{B/H/W/D}
+def : Pat<(loongarch_vpickev v32i8:$xj, v32i8:$xk),
+          (XVPICKEV_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vpickev v16i16:$xj, v16i16:$xk),
+          (XVPICKEV_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vpickev v8i32:$xj, v8i32:$xk),
+          (XVPICKEV_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vpickev v4i64:$xj, v4i64:$xk),
+          (XVPICKEV_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vpickev v8f32:$xj, v8f32:$xk),
+          (XVPICKEV_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vpickev v4f64:$xj, v4f64:$xk),
+          (XVPICKEV_D v4f64:$xj, v4f64:$xk)>;
+
+// XVPICKOD_{B/H/W/D}
+def : Pat<(loongarch_vpickod v32i8:$xj, v32i8:$xk),
+          (XVPICKOD_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vpickod v16i16:$xj, v16i16:$xk),
+          (XVPICKOD_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vpickod v8i32:$xj, v8i32:$xk),
+          (XVPICKOD_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vpickod v4i64:$xj, v4i64:$xk),
+          (XVPICKOD_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vpickod v8f32:$xj, v8f32:$xk),
+          (XVPICKOD_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vpickod v4f64:$xj, v4f64:$xk),
+          (XVPICKOD_D v4f64:$xj, v4f64:$xk)>;
+
+// XVPACKEV_{B/H/W/D}
+def : Pat<(loongarch_vpackev v32i8:$xj, v32i8:$xk),
+          (XVPACKEV_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vpackev v16i16:$xj, v16i16:$xk),
+          (XVPACKEV_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vpackev v8i32:$xj, v8i32:$xk),
+          (XVPACKEV_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vpackev v4i64:$xj, v4i64:$xk),
+          (XVPACKEV_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vpackev v8f32:$xj, v8f32:$xk),
+          (XVPACKEV_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vpackev v4f64:$xj, v4f64:$xk),
+          (XVPACKEV_D v4f64:$xj, v4f64:$xk)>;
+
+// XVPACKOD_{B/H/W/D}
+def : Pat<(loongarch_vpackod v32i8:$xj, v32i8:$xk),
+          (XVPACKOD_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vpackod v16i16:$xj, v16i16:$xk),
+          (XVPACKOD_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vpackod v8i32:$xj, v8i32:$xk),
+          (XVPACKOD_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vpackod v4i64:$xj, v4i64:$xk),
+          (XVPACKOD_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vpackod v8f32:$xj, v8f32:$xk),
+          (XVPACKOD_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vpackod v4f64:$xj, v4f64:$xk),
+          (XVPACKOD_D v4f64:$xj, v4f64:$xk)>;
+
+// XVILVL_{B/H/W/D}
+def : Pat<(loongarch_vilvl v32i8:$xj, v32i8:$xk),
+          (XVILVL_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vilvl v16i16:$xj, v16i16:$xk),
+          (XVILVL_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vilvl v8i32:$xj, v8i32:$xk),
+          (XVILVL_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vilvl v4i64:$xj, v4i64:$xk),
+          (XVILVL_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vilvl v8f32:$xj, v8f32:$xk),
+          (XVILVL_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vilvl v4f64:$xj, v4f64:$xk),
+          (XVILVL_D v4f64:$xj, v4f64:$xk)>;
+
+// XVILVH_{B/H/W/D}
+def : Pat<(loongarch_vilvh v32i8:$xj, v32i8:$xk),
+          (XVILVH_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(loongarch_vilvh v16i16:$xj, v16i16:$xk),
+          (XVILVH_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(loongarch_vilvh v8i32:$xj, v8i32:$xk),
+          (XVILVH_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(loongarch_vilvh v4i64:$xj, v4i64:$xk),
+          (XVILVH_D v4i64:$xj, v4i64:$xk)>;
+def : Pat<(loongarch_vilvh v8f32:$xj, v8f32:$xk),
+          (XVILVH_W v8f32:$xj, v8f32:$xk)>;
+def : Pat<(loongarch_vilvh v4f64:$xj, v4f64:$xk),
+          (XVILVH_D v4f64:$xj, v4f64:$xk)>;
+
+// XVSHUF4I_{B/H/W}
+def : Pat<(loongarch_vshuf4i v32i8:$xj, immZExt8:$ui8),
+          (XVSHUF4I_B v32i8:$xj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v16i16:$xj, immZExt8:$ui8),
+        (XVSHUF4I_H v16i16:$xj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v8i32:$xj, immZExt8:$ui8),
+        (XVSHUF4I_W v8i32:$xj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v8f32:$xj, immZExt8:$ui8),
+        (XVSHUF4I_W v8f32:$xj, immZExt8:$ui8)>;
+
+// XVREPL128VEI_{B/H/W/D}
+def : Pat<(loongarch_vreplvei v32i8:$xj, immZExt4:$ui4),
+          (XVREPL128VEI_B v32i8:$xj, immZExt4:$ui4)>;
+def : Pat<(loongarch_vreplvei v16i16:$xj, immZExt3:$ui3),
+        (XVREPL128VEI_H v16i16:$xj, immZExt3:$ui3)>;
+def : Pat<(loongarch_vreplvei v8i32:$xj, immZExt2:$ui2),
+        (XVREPL128VEI_W v8i32:$xj, immZExt2:$ui2)>;
+def : Pat<(loongarch_vreplvei v4i64:$xj, immZExt1:$ui1),
+        (XVREPL128VEI_D v4i64:$xj, immZExt1:$ui1)>;
+def : Pat<(loongarch_vreplvei v8f32:$xj, immZExt2:$ui2),
+        (XVREPL128VEI_W v8f32:$xj, immZExt2:$ui2)>;
+def : Pat<(loongarch_vreplvei v4f64:$xj, immZExt1:$ui1),
+        (XVREPL128VEI_D v4f64:$xj, immZExt1:$ui1)>;
+
+// XVPERMI_D
+def : Pat<(loongarch_xvpermi v4i64:$xj, immZExt8: $ui8),
+          (XVPERMI_D v4i64:$xj, immZExt8: $ui8)>;
+def : Pat<(loongarch_xvpermi v4f64:$xj, immZExt8: $ui8),
+          (XVPERMI_D v4f64:$xj, immZExt8: $ui8)>;
+
 // XVREPLVE0_{W/D}
 def : Pat<(lasxsplatf32 FPR32:$fj),
           (XVREPLVE0_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32))>;
@@ -1946,6 +2080,16 @@ foreach Inst = ["XVFLOGB_D", "XVFCLASS_D", "XVFSQRT_D", "XVFRECIP_D", "XVFRSQRT_
   def : Pat<(deriveLASXIntrinsic<Inst>.ret (v4f64 LASX256:$xj)),
             (!cast<LAInst>(Inst) LASX256:$xj)>;
 
+// 256-Bit vector FP approximate reciprocal operation
+let Predicates = [HasFrecipe] in {
+foreach Inst = ["XVFRECIPE_S", "XVFRSQRTE_S"] in
+  def : Pat<(deriveLASXIntrinsic<Inst>.ret (v8f32 LASX256:$xj)),
+            (!cast<LAInst>(Inst) LASX256:$xj)>;
+foreach Inst = ["XVFRECIPE_D", "XVFRSQRTE_D"] in
+  def : Pat<(deriveLASXIntrinsic<Inst>.ret (v4f64 LASX256:$xj)),
+            (!cast<LAInst>(Inst) LASX256:$xj)>;
+}
+
 def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm),
           (XVPICKVE_W v8f32:$xj, (to_valid_timm timm:$imm))>;
 def : Pat<(int_loongarch_lasx_xvpickve_d_f v4f64:$xj, timm:$imm),
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 99ac2f3c162fea31e076b5c267363e9fef7d0807..0580683c3ce303366cd5f18e93186b6cd907cf5a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -15,6 +15,15 @@ def SDT_LoongArchVreplve : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
                                          SDTCisSameAs<0, 1>, SDTCisInt<2>]>;
 def SDT_LoongArchVecCond : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>]>;
 
+def SDT_LoongArchVShuf : SDTypeProfile<1, 3, [SDTCisVec<0>,
+                                         SDTCisInt<1>, SDTCisVec<1>,
+                                         SDTCisSameAs<0, 2>,
+                                         SDTCisSameAs<2, 3>]>;
+def SDT_LoongArchV2R : SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                         SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>;
+def SDT_loongArchV1RUimm: SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                         SDTCisSameAs<0,1>, SDTCisVT<2, i64>]>;
+
 // Target nodes.
 def loongarch_vreplve : SDNode<"LoongArchISD::VREPLVE", SDT_LoongArchVreplve>;
 def loongarch_vall_nonzero : SDNode<"LoongArchISD::VALL_NONZERO",
@@ -31,6 +40,23 @@ def loongarch_vpick_sext_elt : SDNode<"LoongArchISD::VPICK_SEXT_ELT",
 def loongarch_vpick_zext_elt : SDNode<"LoongArchISD::VPICK_ZEXT_ELT",
                                       SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>>;
 
+def loongarch_vshuf: SDNode<"LoongArchISD::VSHUF", SDT_LoongArchVShuf>;
+def loongarch_vpickev: SDNode<"LoongArchISD::VPICKEV", SDT_LoongArchV2R>;
+def loongarch_vpickod: SDNode<"LoongArchISD::VPICKOD", SDT_LoongArchV2R>;
+def loongarch_vpackev: SDNode<"LoongArchISD::VPACKEV", SDT_LoongArchV2R>;
+def loongarch_vpackod: SDNode<"LoongArchISD::VPACKOD", SDT_LoongArchV2R>;
+def loongarch_vilvl: SDNode<"LoongArchISD::VILVL", SDT_LoongArchV2R>;
+def loongarch_vilvh: SDNode<"LoongArchISD::VILVH", SDT_LoongArchV2R>;
+
+def loongarch_vshuf4i: SDNode<"LoongArchISD::VSHUF4I", SDT_loongArchV1RUimm>;
+def loongarch_vreplvei: SDNode<"LoongArchISD::VREPLVEI", SDT_loongArchV1RUimm>;
+
+def immZExt1 : ImmLeaf<i64, [{return isUInt<1>(Imm);}]>;
+def immZExt2 : ImmLeaf<i64, [{return isUInt<2>(Imm);}]>;
+def immZExt3 : ImmLeaf<i64, [{return isUInt<3>(Imm);}]>;
+def immZExt4 : ImmLeaf<i64, [{return isUInt<4>(Imm);}]>;
+def immZExt8 : ImmLeaf<i64, [{return isUInt<8>(Imm);}]>;
+
 class VecCond<SDPatternOperator OpNode, ValueType TyNode,
               RegisterClass RC = LSX128>
     : Pseudo<(outs GPR:$rd), (ins RC:$vj),
@@ -892,6 +918,10 @@ def VFRECIP_S : LSX2R_VV<0x729cf400>;
 def VFRECIP_D : LSX2R_VV<0x729cf800>;
 def VFRSQRT_S : LSX2R_VV<0x729d0400>;
 def VFRSQRT_D : LSX2R_VV<0x729d0800>;
+def VFRECIPE_S : LSX2R_VV<0x729d1400>;
+def VFRECIPE_D : LSX2R_VV<0x729d1800>;
+def VFRSQRTE_S : LSX2R_VV<0x729d2400>;
+def VFRSQRTE_D : LSX2R_VV<0x729d2800>;
 
 def VFCVTL_S_H : LSX2R_VV<0x729de800>;
 def VFCVTH_S_H : LSX2R_VV<0x729dec00>;
@@ -1678,6 +1708,128 @@ def : Pat<(loongarch_vreplve v4i32:$vj, GRLenVT:$rk),
 def : Pat<(loongarch_vreplve v2i64:$vj, GRLenVT:$rk),
           (VREPLVE_D v2i64:$vj, GRLenVT:$rk)>;
 
+// VSHUF_{B/H/W/D}
+def : Pat<(loongarch_vshuf v16i8:$va, v16i8:$vj, v16i8:$vk),
+          (VSHUF_B v16i8:$vj, v16i8:$vk, v16i8:$va)>;
+def : Pat<(loongarch_vshuf v8i16:$vd, v8i16:$vj, v8i16:$vk),
+          (VSHUF_H v8i16:$vd, v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vshuf v4i32:$vd, v4i32:$vj, v4i32:$vk),
+          (VSHUF_W v4i32:$vd, v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vshuf v2i64:$vd, v2i64:$vj, v2i64:$vk),
+          (VSHUF_D v2i64:$vd, v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vshuf v4i32:$vd, v4f32:$vj, v4f32:$vk),
+          (VSHUF_W v4i32:$vd, v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vshuf v2i64:$vd, v2f64:$vj, v2f64:$vk),
+          (VSHUF_D v2i64:$vd, v2f64:$vj, v2f64:$vk)>;
+
+// VPICKEV_{B/H/W/D}
+def : Pat<(loongarch_vpickev v16i8:$vj, v16i8:$vk),
+          (VPICKEV_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vpickev v8i16:$vj, v8i16:$vk),
+          (VPICKEV_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vpickev v4i32:$vj, v4i32:$vk),
+          (VPICKEV_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vpickev v2i64:$vj, v2i64:$vk),
+          (VPICKEV_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vpickev v4f32:$vj, v4f32:$vk),
+          (VPICKEV_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vpickev v2f64:$vj, v2f64:$vk),
+          (VPICKEV_D v2f64:$vj, v2f64:$vk)>;
+
+// VPICKOD_{B/H/W/D}
+def : Pat<(loongarch_vpickod v16i8:$vj, v16i8:$vk),
+          (VPICKOD_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vpickod v8i16:$vj, v8i16:$vk),
+          (VPICKOD_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vpickod v4i32:$vj, v4i32:$vk),
+          (VPICKOD_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vpickod v2i64:$vj, v2i64:$vk),
+          (VPICKOD_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vpickod v4f32:$vj, v4f32:$vk),
+          (VPICKOD_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vpickod v2f64:$vj, v2f64:$vk),
+          (VPICKOD_D v2f64:$vj, v2f64:$vk)>;
+
+// VPACKEV_{B/H/W/D}
+def : Pat<(loongarch_vpackev v16i8:$vj, v16i8:$vk),
+          (VPACKEV_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vpackev v8i16:$vj, v8i16:$vk),
+          (VPACKEV_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vpackev v4i32:$vj, v4i32:$vk),
+          (VPACKEV_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vpackev v2i64:$vj, v2i64:$vk),
+          (VPACKEV_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vpackev v4f32:$vj, v4f32:$vk),
+          (VPACKEV_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vpackev v2f64:$vj, v2f64:$vk),
+          (VPACKEV_D v2f64:$vj, v2f64:$vk)>;
+
+// VPACKOD_{B/H/W/D}
+def : Pat<(loongarch_vpackod v16i8:$vj, v16i8:$vk),
+          (VPACKOD_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vpackod v8i16:$vj, v8i16:$vk),
+          (VPACKOD_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vpackod v4i32:$vj, v4i32:$vk),
+          (VPACKOD_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vpackod v2i64:$vj, v2i64:$vk),
+          (VPACKOD_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vpackod v4f32:$vj, v4f32:$vk),
+          (VPACKOD_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vpackod v2f64:$vj, v2f64:$vk),
+          (VPACKOD_D v2f64:$vj, v2f64:$vk)>;
+
+// VILVL_{B/H/W/D}
+def : Pat<(loongarch_vilvl v16i8:$vj, v16i8:$vk),
+          (VILVL_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vilvl v8i16:$vj, v8i16:$vk),
+          (VILVL_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vilvl v4i32:$vj, v4i32:$vk),
+          (VILVL_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vilvl v2i64:$vj, v2i64:$vk),
+          (VILVL_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vilvl v4f32:$vj, v4f32:$vk),
+          (VILVL_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vilvl v2f64:$vj, v2f64:$vk),
+          (VILVL_D v2f64:$vj, v2f64:$vk)>;
+
+// VILVH_{B/H/W/D}
+def : Pat<(loongarch_vilvh v16i8:$vj, v16i8:$vk),
+          (VILVH_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(loongarch_vilvh v8i16:$vj, v8i16:$vk),
+          (VILVH_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(loongarch_vilvh v4i32:$vj, v4i32:$vk),
+          (VILVH_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(loongarch_vilvh v2i64:$vj, v2i64:$vk),
+          (VILVH_D v2i64:$vj, v2i64:$vk)>;
+def : Pat<(loongarch_vilvh v4f32:$vj, v4f32:$vk),
+          (VILVH_W v4f32:$vj, v4f32:$vk)>;
+def : Pat<(loongarch_vilvh v2f64:$vj, v2f64:$vk),
+          (VILVH_D v2f64:$vj, v2f64:$vk)>;
+
+// VSHUF4I_{B/H/W}
+def : Pat<(loongarch_vshuf4i v16i8:$vj, immZExt8:$ui8),
+          (VSHUF4I_B v16i8:$vj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v8i16:$vj, immZExt8:$ui8),
+        (VSHUF4I_H v8i16:$vj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v4i32:$vj, immZExt8:$ui8),
+        (VSHUF4I_W v4i32:$vj, immZExt8:$ui8)>;
+def : Pat<(loongarch_vshuf4i v4f32:$vj, immZExt8:$ui8),
+        (VSHUF4I_W v4f32:$vj, immZExt8:$ui8)>;
+
+// VREPLVEI_{B/H/W/D}
+def : Pat<(loongarch_vreplvei v16i8:$vj, immZExt4:$ui4),
+          (VREPLVEI_B v16i8:$vj, immZExt4:$ui4)>;
+def : Pat<(loongarch_vreplvei v8i16:$vj, immZExt3:$ui3),
+        (VREPLVEI_H v8i16:$vj, immZExt3:$ui3)>;
+def : Pat<(loongarch_vreplvei v4i32:$vj, immZExt2:$ui2),
+        (VREPLVEI_W v4i32:$vj, immZExt2:$ui2)>;
+def : Pat<(loongarch_vreplvei v2i64:$vj, immZExt1:$ui1),
+        (VREPLVEI_D v2i64:$vj, immZExt1:$ui1)>;
+def : Pat<(loongarch_vreplvei v4f32:$vj, immZExt2:$ui2),
+        (VREPLVEI_W v4f32:$vj, immZExt2:$ui2)>;
+def : Pat<(loongarch_vreplvei v2f64:$vj, immZExt1:$ui1),
+        (VREPLVEI_D v2f64:$vj, immZExt1:$ui1)>;
+
 // VREPLVEI_{W/D}
 def : Pat<(lsxsplatf32 FPR32:$fj),
           (VREPLVEI_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), 0)>;
@@ -2043,6 +2195,16 @@ foreach Inst = ["VFLOGB_D", "VFCLASS_D", "VFSQRT_D", "VFRECIP_D", "VFRSQRT_D",
   def : Pat<(deriveLSXIntrinsic<Inst>.ret (v2f64 LSX128:$vj)),
             (!cast<LAInst>(Inst) LSX128:$vj)>;
 
+// 128-Bit vector FP approximate reciprocal operation
+let Predicates = [HasFrecipe] in {
+foreach Inst = ["VFRECIPE_S", "VFRSQRTE_S"] in
+  def : Pat<(deriveLSXIntrinsic<Inst>.ret (v4f32 LSX128:$vj)),
+            (!cast<LAInst>(Inst) LSX128:$vj)>;
+foreach Inst = ["VFRECIPE_D", "VFRSQRTE_D"] in
+  def : Pat<(deriveLSXIntrinsic<Inst>.ret (v2f64 LSX128:$vj)),
+            (!cast<LAInst>(Inst) LSX128:$vj)>;
+}
+
 // load
 def : Pat<(int_loongarch_lsx_vld GPR:$rj, timm:$imm),
           (VLD GPR:$rj, (to_valid_timm timm:$imm))>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
index 5daa9481c9072e3507c9c92cd95fcdc3ea7ff17c..98ad49f25e3f2fddab1684ede35a93311cee023f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp
@@ -95,6 +95,9 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
   case LoongArchII::MO_GD_PC_HI:
     Kind = LoongArchMCExpr::VK_LoongArch_TLS_GD_PC_HI20;
     break;
+  case LoongArchII::MO_CALL36:
+    Kind = LoongArchMCExpr::VK_LoongArch_CALL36;
+    break;
     // TODO: Handle more target-flags.
   }
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
index 174e4cba83263348ee0564a3e81349790e708078..11c0b39e176e61ff32ce51aab0f77fe3c4460d4e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
+++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
@@ -45,6 +45,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo {
   bool HasUAL = false;
   bool HasLinkerRelax = false;
   bool HasExpAutoVec = false;
+  bool HasFrecipe = false;
   unsigned GRLen = 32;
   MVT GRLenVT = MVT::i32;
   LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown;
@@ -104,6 +105,7 @@ public:
   bool hasUAL() const { return HasUAL; }
   bool hasLinkerRelax() const { return HasLinkerRelax; }
   bool hasExpAutoVec() const { return HasExpAutoVec; }
+  bool hasFrecipe() const { return HasFrecipe; }
   MVT getGRLenVT() const { return GRLenVT; }
   unsigned getGRLen() const { return GRLen; }
   LoongArchABI::ABI getTargetABI() const { return TargetABI; }
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index d0a4e937504800ca2606706dc19054f25468f17d..0efc5e6ebb99d6109a121850b97df538ea66c3b1 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -63,11 +63,11 @@ getEffectiveLoongArchCodeModel(const Triple &TT,
 
   switch (*CM) {
   case CodeModel::Small:
-  case CodeModel::Medium:
     return *CM;
+  case CodeModel::Medium:
   case CodeModel::Large:
     if (!TT.isArch64Bit())
-      report_fatal_error("Large code model requires LA64");
+      report_fatal_error("Medium/Large code model requires LA64");
     return *CM;
   default:
     report_fatal_error(
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
index d47dded9ea6ecf260c773cd6ac9684ae8a191938..7961bb141e64997c59fd11c67fe957ec35e7fc2e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
@@ -26,8 +26,6 @@ TypeSize LoongArchTTIImpl::getRegisterBitWidth(
   case TargetTransformInfo::RGK_Scalar:
     return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
   case TargetTransformInfo::RGK_FixedWidthVector:
-    if (!ST->hasExpAutoVec())
-      return DefSize;
     if (ST->hasExtLASX())
       return TypeSize::getFixed(256);
     if (ST->hasExtLSX())
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
index cee6dad1f095e12feb6f12ebace67b3ea540e6a0..0692cb92b694404e16f299903d7797b10c2f79c1 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h
@@ -47,6 +47,7 @@ enum {
   MO_IE_PC64_HI,
   MO_LD_PC_HI,
   MO_GD_PC_HI,
+  MO_CALL36
   // TODO: Add more flags.
 };
 } // end namespace LoongArchII
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
index e60b9c2cfd97cc02dfbf71a7db7e7bdc11d5c250..0a52380dd2cdd1c4a097df7094d735c86e275124 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp
@@ -90,6 +90,8 @@ unsigned LoongArchELFObjectWriter::getRelocType(MCContext &Ctx,
     return ELF::R_LARCH_TLS_LE64_LO20;
   case LoongArch::fixup_loongarch_tls_le64_hi12:
     return ELF::R_LARCH_TLS_LE64_HI12;
+  case LoongArch::fixup_loongarch_call36:
+    return ELF::R_LARCH_CALL36;
     // TODO: Handle more fixup-kinds.
   }
 }
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
index 78414408f21f070a1205ad2645f4a20e32f1132d..0d19d2b0fb1fe8cf39744ac981a1fbd3aaadc16d 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h
@@ -111,6 +111,9 @@ enum Fixups {
   fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX,
   // Generate an R_LARCH_ALIGN which indicates the linker may fixup align here.
   fixup_loongarch_align = FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN,
+  // 36-bit fixup corresponding to %call36(foo) for a pair instructions:
+  // pcaddu18i+jirl.
+  fixup_loongarch_call36 = FirstLiteralRelocationKind + ELF::R_LARCH_CALL36,
 };
 } // end namespace LoongArch
 } // end namespace llvm
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
index 09d92ac9aa3a868da94f7c0a0e6027adef33c3d3..7c4fe9674d4e1f0d608ea470143034d81c0603cd 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp
@@ -241,6 +241,9 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO,
     case LoongArchMCExpr::VK_LoongArch_TLS_GD_HI20:
       FixupKind = LoongArch::fixup_loongarch_tls_gd_hi20;
       break;
+    case LoongArchMCExpr::VK_LoongArch_CALL36:
+      FixupKind = LoongArch::fixup_loongarch_call36;
+      break;
     }
   } else if (Kind == MCExpr::SymbolRef &&
              cast<MCSymbolRefExpr>(Expr)->getKind() ==
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
index 82c992b1cc8c4e899d4c390eb700b63ad591eb26..8ca8876a19b936f60a4a381d1b47325d73fd8802 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
@@ -138,6 +138,8 @@ StringRef LoongArchMCExpr::getVariantKindName(VariantKind Kind) {
     return "gd_pc_hi20";
   case VK_LoongArch_TLS_GD_HI20:
     return "gd_hi20";
+  case VK_LoongArch_CALL36:
+    return "call36";
   }
 }
 
@@ -180,6 +182,7 @@ LoongArchMCExpr::getVariantKindForName(StringRef name) {
       .Case("ld_hi20", VK_LoongArch_TLS_LD_HI20)
       .Case("gd_pc_hi20", VK_LoongArch_TLS_GD_PC_HI20)
       .Case("gd_hi20", VK_LoongArch_TLS_GD_HI20)
+      .Case("call36", VK_LoongArch_CALL36)
       .Default(VK_LoongArch_Invalid);
 }
 
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
index 93251f8241033b60e6fcdd42dd0dc27a2f757772..bd828116d7fa460f856d9760d832c806a9a69d0c 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
@@ -61,6 +61,7 @@ public:
     VK_LoongArch_TLS_LD_HI20,
     VK_LoongArch_TLS_GD_PC_HI20,
     VK_LoongArch_TLS_GD_HI20,
+    VK_LoongArch_CALL36,
     VK_LoongArch_Invalid // Must be the last item.
   };
 
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 8b23be02edc0145ee854c041985ab7db5928bedd..87e3e0b434d5bc0dd83ed06cbb545deea1b494d1 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1469,6 +1469,8 @@ StringRef sys::getHostCPUName() {
   switch (processor_id & 0xf000) {
   case 0xc000: // Loongson 64bit, 4-issue
     return "la464";
+  case 0xd000: // Loongson 64bit, 6-issue
+    return "la664";
   // TODO: Others.
   default:
     break;
diff --git a/llvm/lib/TargetParser/LoongArchTargetParser.cpp b/llvm/lib/TargetParser/LoongArchTargetParser.cpp
index 772d24c5ce3deb95f539e0231f561c7a2322f119..8e86d18de2ad9a38548fdf9879a01a46958a362d 100644
--- a/llvm/lib/TargetParser/LoongArchTargetParser.cpp
+++ b/llvm/lib/TargetParser/LoongArchTargetParser.cpp
@@ -44,6 +44,17 @@ bool LoongArch::getArchFeatures(StringRef Arch,
       return true;
     }
   }
+
+  if (Arch == "la64v1.0" || Arch == "la64v1.1") {
+    Features.push_back("+64bit");
+    Features.push_back("+d");
+    Features.push_back("+lsx");
+    Features.push_back("+ual");
+    if (Arch == "la64v1.1")
+      Features.push_back("+frecipe");
+    return true;
+  }
+
   return false;
 }
 
diff --git a/llvm/test/CodeGen/LoongArch/code-models.ll b/llvm/test/CodeGen/LoongArch/code-models.ll
index c610f645a06aebe40c7e58ba00ba57600db6f0ee..f93c316709284bd1c2db408b46a3c87c5a946f5c 100644
--- a/llvm/test/CodeGen/LoongArch/code-models.ll
+++ b/llvm/test/CodeGen/LoongArch/code-models.ll
@@ -23,8 +23,8 @@ define i32 @call_globaladdress(i32 %a) nounwind {
 ; MEDIUM:       # %bb.0:
 ; MEDIUM-NEXT:    addi.d $sp, $sp, -16
 ; MEDIUM-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
-; MEDIUM-NEXT:    pcalau12i $ra, %pc_hi20(callee)
-; MEDIUM-NEXT:    jirl $ra, $ra, %pc_lo12(callee)
+; MEDIUM-NEXT:    pcaddu18i $ra, %call36(callee)
+; MEDIUM-NEXT:    jirl $ra, $ra, 0
 ; MEDIUM-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; MEDIUM-NEXT:    addi.d $sp, $sp, 16
 ; MEDIUM-NEXT:    ret
@@ -33,11 +33,11 @@ define i32 @call_globaladdress(i32 %a) nounwind {
 ; LARGE:       # %bb.0:
 ; LARGE-NEXT:    addi.d $sp, $sp, -16
 ; LARGE-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
-; LARGE-NEXT:    pcalau12i $a1, %got_pc_hi20(callee)
-; LARGE-NEXT:    addi.d $ra, $zero, %got_pc_lo12(callee)
-; LARGE-NEXT:    lu32i.d $ra, %got64_pc_lo20(callee)
-; LARGE-NEXT:    lu52i.d $ra, $ra, %got64_pc_hi12(callee)
-; LARGE-NEXT:    ldx.d $ra, $ra, $a1
+; LARGE-NEXT:    pcalau12i $ra, %got_pc_hi20(callee)
+; LARGE-NEXT:    addi.d $t8, $zero, %got_pc_lo12(callee)
+; LARGE-NEXT:    lu32i.d $t8, %got64_pc_lo20(callee)
+; LARGE-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(callee)
+; LARGE-NEXT:    ldx.d $ra, $t8, $ra
 ; LARGE-NEXT:    jirl $ra, $ra, 0
 ; LARGE-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LARGE-NEXT:    addi.d $sp, $sp, 16
@@ -68,8 +68,8 @@ define void @call_external_sym(ptr %dst) {
 ; MEDIUM-NEXT:    .cfi_offset 1, -8
 ; MEDIUM-NEXT:    ori $a2, $zero, 1000
 ; MEDIUM-NEXT:    move $a1, $zero
-; MEDIUM-NEXT:    pcalau12i $ra, %pc_hi20(memset)
-; MEDIUM-NEXT:    jirl $ra, $ra, %pc_lo12(memset)
+; MEDIUM-NEXT:    pcaddu18i $ra, %call36(memset)
+; MEDIUM-NEXT:    jirl $ra, $ra, 0
 ; MEDIUM-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; MEDIUM-NEXT:    addi.d $sp, $sp, 16
 ; MEDIUM-NEXT:    ret
@@ -82,11 +82,11 @@ define void @call_external_sym(ptr %dst) {
 ; LARGE-NEXT:    .cfi_offset 1, -8
 ; LARGE-NEXT:    ori $a2, $zero, 1000
 ; LARGE-NEXT:    move $a1, $zero
-; LARGE-NEXT:    pcalau12i $a3, %pc_hi20(memset)
-; LARGE-NEXT:    addi.d $ra, $zero, %pc_lo12(memset)
-; LARGE-NEXT:    lu32i.d $ra, %pc64_lo20(memset)
-; LARGE-NEXT:    lu52i.d $ra, $ra, %pc64_hi12(memset)
-; LARGE-NEXT:    add.d $ra, $ra, $a3
+; LARGE-NEXT:    pcalau12i $ra, %pc_hi20(memset)
+; LARGE-NEXT:    addi.d $t8, $zero, %pc_lo12(memset)
+; LARGE-NEXT:    lu32i.d $t8, %pc64_lo20(memset)
+; LARGE-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(memset)
+; LARGE-NEXT:    add.d $ra, $t8, $ra
 ; LARGE-NEXT:    jirl $ra, $ra, 0
 ; LARGE-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LARGE-NEXT:    addi.d $sp, $sp, 16
@@ -105,17 +105,17 @@ define i32 @caller_tail(i32 %i) nounwind {
 ;
 ; MEDIUM-LABEL: caller_tail:
 ; MEDIUM:       # %bb.0: # %entry
-; MEDIUM-NEXT:    pcalau12i $a1, %pc_hi20(callee_tail)
-; MEDIUM-NEXT:    jirl $zero, $a1, %pc_lo12(callee_tail)
+; MEDIUM-NEXT:    pcaddu18i $t8, %call36(callee_tail)
+; MEDIUM-NEXT:    jr $t8
 ;
 ; LARGE-LABEL: caller_tail:
 ; LARGE:       # %bb.0: # %entry
-; LARGE-NEXT:    pcalau12i $a1, %got_pc_hi20(callee_tail)
-; LARGE-NEXT:    addi.d $a2, $zero, %got_pc_lo12(callee_tail)
-; LARGE-NEXT:    lu32i.d $a2, %got64_pc_lo20(callee_tail)
-; LARGE-NEXT:    lu52i.d $a2, $a2, %got64_pc_hi12(callee_tail)
-; LARGE-NEXT:    ldx.d $a1, $a2, $a1
-; LARGE-NEXT:    jr $a1
+; LARGE-NEXT:    pcalau12i $t7, %got_pc_hi20(callee_tail)
+; LARGE-NEXT:    addi.d $t8, $zero, %got_pc_lo12(callee_tail)
+; LARGE-NEXT:    lu32i.d $t8, %got64_pc_lo20(callee_tail)
+; LARGE-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(callee_tail)
+; LARGE-NEXT:    ldx.d $t7, $t8, $t7
+; LARGE-NEXT:    jr $t7
 entry:
   %r = tail call i32 @callee_tail(i32 %i)
   ret i32 %r
diff --git a/llvm/test/CodeGen/LoongArch/cpus.ll b/llvm/test/CodeGen/LoongArch/cpus.ll
index 35945ae4de71fb5decca5c5516aecee09a8757f3..087cf887b81386e3d68d1ac5b052ac032e519e14 100644
--- a/llvm/test/CodeGen/LoongArch/cpus.ll
+++ b/llvm/test/CodeGen/LoongArch/cpus.ll
@@ -3,6 +3,7 @@
 
 ; RUN: llc < %s --mtriple=loongarch64 --mcpu=loongarch64 2>&1 | FileCheck %s
 ; RUN: llc < %s --mtriple=loongarch64 --mcpu=la464 2>&1 | FileCheck %s
+; RUN: llc < %s --mtriple=loongarch64 --mcpu=la664 2>&1 | FileCheck %s
 ; RUN: llc < %s --mtriple=loongarch64 2>&1 | FileCheck %s
 
 ; CHECK-NOT: {{.*}} is not a recognized processor for this target
@@ -18,3 +19,7 @@ define void @tune_cpu_loongarch64() "tune-cpu"="loongarch64" {
 define void @tune_cpu_la464() "tune-cpu"="la464" {
   ret void
 }
+
+define void @tune_cpu_la664() "tune-cpu"="la664" {
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/expand-call.ll b/llvm/test/CodeGen/LoongArch/expand-call.ll
index 86bf4292665b72c88b4ba5ab3cf056b9c327edda..e0d179f92de6824b28b328127b9303c948d17e3c 100644
--- a/llvm/test/CodeGen/LoongArch/expand-call.ll
+++ b/llvm/test/CodeGen/LoongArch/expand-call.ll
@@ -1,6 +1,6 @@
 ; RUN: llc --mtriple=loongarch64 --stop-before loongarch-prera-expand-pseudo \
 ; RUN:     --verify-machineinstrs < %s | FileCheck %s --check-prefix=NOEXPAND
-; RUN: llc --mtriple=loongarch64 --stop-after loongarch-prera-expand-pseudo \
+; RUN: llc --mtriple=loongarch64 --stop-before machine-opt-remark-emitter \
 ; RUN:     --verify-machineinstrs < %s | FileCheck %s --check-prefix=EXPAND
 
 declare void @callee()
diff --git a/llvm/test/CodeGen/LoongArch/global-address.ll b/llvm/test/CodeGen/LoongArch/global-address.ll
index a8f0ef648aa7c4372980e7c47f93df028bda034d..d32a17f488b1422df12553801fb0b170836b6738 100644
--- a/llvm/test/CodeGen/LoongArch/global-address.ll
+++ b/llvm/test/CodeGen/LoongArch/global-address.ll
@@ -53,32 +53,32 @@ define void @foo() nounwind {
 ; LA64LARGENOPIC-LABEL: foo:
 ; LA64LARGENOPIC:       # %bb.0:
 ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
-; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %got_pc_lo12(G)
-; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %got64_pc_lo20(G)
-; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(G)
-; LA64LARGENOPIC-NEXT:    ldx.d $a0, $a1, $a0
+; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %got_pc_lo12(G)
+; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %got64_pc_lo20(G)
+; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(G)
+; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
 ; LA64LARGENOPIC-NEXT:    ld.w $a0, $a0, 0
 ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %pc_hi20(g)
-; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %pc_lo12(g)
-; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %pc64_lo20(g)
-; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(g)
-; LA64LARGENOPIC-NEXT:    add.d $a0, $a1, $a0
+; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %pc_lo12(g)
+; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %pc64_lo20(g)
+; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(g)
+; LA64LARGENOPIC-NEXT:    add.d $a0, $t8, $a0
 ; LA64LARGENOPIC-NEXT:    ld.w $a0, $a0, 0
 ; LA64LARGENOPIC-NEXT:    ret
 ;
 ; LA64LARGEPIC-LABEL: foo:
 ; LA64LARGEPIC:       # %bb.0:
 ; LA64LARGEPIC-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
-; LA64LARGEPIC-NEXT:    addi.d $a1, $zero, %got_pc_lo12(G)
-; LA64LARGEPIC-NEXT:    lu32i.d $a1, %got64_pc_lo20(G)
-; LA64LARGEPIC-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(G)
-; LA64LARGEPIC-NEXT:    ldx.d $a0, $a1, $a0
+; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %got_pc_lo12(G)
+; LA64LARGEPIC-NEXT:    lu32i.d $t8, %got64_pc_lo20(G)
+; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(G)
+; LA64LARGEPIC-NEXT:    ldx.d $a0, $t8, $a0
 ; LA64LARGEPIC-NEXT:    ld.w $a0, $a0, 0
 ; LA64LARGEPIC-NEXT:    pcalau12i $a0, %pc_hi20(.Lg$local)
-; LA64LARGEPIC-NEXT:    addi.d $a1, $zero, %pc_lo12(.Lg$local)
-; LA64LARGEPIC-NEXT:    lu32i.d $a1, %pc64_lo20(.Lg$local)
-; LA64LARGEPIC-NEXT:    lu52i.d $a1, $a1, %pc64_hi12(.Lg$local)
-; LA64LARGEPIC-NEXT:    add.d $a0, $a1, $a0
+; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %pc_lo12(.Lg$local)
+; LA64LARGEPIC-NEXT:    lu32i.d $t8, %pc64_lo20(.Lg$local)
+; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(.Lg$local)
+; LA64LARGEPIC-NEXT:    add.d $a0, $t8, $a0
 ; LA64LARGEPIC-NEXT:    ld.w $a0, $a0, 0
 ; LA64LARGEPIC-NEXT:    ret
   %V = load volatile i32, ptr @G
diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9f572500caa0ea20e39fc28c37c62b107e66cbc8
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll
@@ -0,0 +1,26 @@
+; RUN: llc --mtriple=loongarch32 --mattr=+d,+frecipe < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s
+
+declare double @llvm.loongarch.frecipe.d(double)
+
+define double @frecipe_d(double %a) {
+; CHECK-LABEL: frecipe_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    frecipe.d $fa0, $fa0
+; CHECK-NEXT:    ret
+entry:
+  %res = call double @llvm.loongarch.frecipe.d(double %a)
+  ret double %res
+}
+
+declare double @llvm.loongarch.frsqrte.d(double)
+
+define double @frsqrte_d(double %a) {
+; CHECK-LABEL: frsqrte_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    frsqrte.d $fa0, $fa0
+; CHECK-NEXT:    ret
+entry:
+  %res = call double @llvm.loongarch.frsqrte.d(double %a)
+  ret double %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0b2029f2e44a01c82b9346e603a954ce613c6dc6
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll
@@ -0,0 +1,26 @@
+; RUN: llc --mtriple=loongarch32 --mattr=+f,+frecipe < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+f,+frecipe < %s | FileCheck %s
+
+declare float @llvm.loongarch.frecipe.s(float)
+
+define float @frecipe_s(float %a) {
+; CHECK-LABEL: frecipe_s:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    frecipe.s $fa0, $fa0
+; CHECK-NEXT:    ret
+entry:
+  %res = call float @llvm.loongarch.frecipe.s(float %a)
+  ret float %res
+}
+
+declare float @llvm.loongarch.frsqrte.s(float)
+
+define float @frsqrte_s(float %a) {
+; CHECK-LABEL: frsqrte_s:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    frsqrte.s $fa0, $fa0
+; CHECK-NEXT:    ret
+entry:
+  %res = call float @llvm.loongarch.frsqrte.s(float %a)
+  ret float %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll
new file mode 100644
index 0000000000000000000000000000000000000000..215436823af8313d530f1c8ed27f734c43a790c5
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s
+
+declare <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float>)
+
+define <8 x float> @lasx_xvfrecipe_s(<8 x float> %va) nounwind {
+; CHECK-LABEL: lasx_xvfrecipe_s:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvfrecipe.s $xr0, $xr0
+; CHECK-NEXT:    ret
+entry:
+  %res = call <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float> %va)
+  ret <8 x float> %res
+}
+
+declare <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double>)
+
+define <4 x double> @lasx_xvfrecipe_d(<4 x double> %va) nounwind {
+; CHECK-LABEL: lasx_xvfrecipe_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvfrecipe.d $xr0, $xr0
+; CHECK-NEXT:    ret
+entry:
+  %res = call <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double> %va)
+  ret <4 x double> %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ad36c3aa5c29d8a568cfafc19968ed10be918851
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s
+
+declare <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float>)
+
+define <8 x float> @lasx_xvfrsqrte_s(<8 x float> %va) nounwind {
+; CHECK-LABEL: lasx_xvfrsqrte_s:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvfrsqrte.s $xr0, $xr0
+; CHECK-NEXT:    ret
+entry:
+  %res = call <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float> %va)
+  ret <8 x float> %res
+}
+
+declare <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double>)
+
+define <4 x double> @lasx_xvfrsqrte_d(<4 x double> %va) nounwind {
+; CHECK-LABEL: lasx_xvfrsqrte_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvfrsqrte.d $xr0, $xr0
+; CHECK-NEXT:    ret
+entry:
+  %res = call <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double> %va)
+  ret <4 x double> %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll
new file mode 100644
index 0000000000000000000000000000000000000000..22ab19b9fa44674d836c4e162f9e7f508985e885
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xvilvl.b
+define <32 x i8> @shufflevector_xvilvl_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_xvilvl_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvl.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39,
+                                                               i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+    ret <32 x i8> %c
+}
+
+;; xvilvl.h
+define <16 x i16> @shufflevector_xvilvl_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_xvilvl_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvl.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+    ret <16 x i16> %c
+}
+
+;; xvilvl.w
+define <8 x i32> @shufflevector_xvilvl_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_xvilvl_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvl.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+    ret <8 x i32> %c
+}
+
+;; xvilvh.b
+define <32 x i8> @shufflevector_xvilvh_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_xvilvh_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvh.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47,
+                                                               i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+    ret <32 x i8> %c
+}
+
+;; xvilvh.h
+define <16 x i16> @shufflevector_xvilvh_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_xvilvh_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvh.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+    ret <16 x i16> %c
+}
+
+;; xvilvh.w
+define <8 x i32> @shufflevector_xvilvh_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_xvilvh_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvh.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+    ret <8 x i32> %c
+}
+
+;; xvilvh.w
+define <8 x float> @shufflevector_xvilvh_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_xvilvh_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvilvh.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+    ret <8 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2ff9af4069b9bd873fe72248fdfe5a4a5a1b80da
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xvpackev.b
+define <32 x i8> @shufflevector_pack_ev_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46,
+                                                               i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+    ret <32 x i8> %c
+}
+
+;; xvpackev.h
+define <16 x i16> @shufflevector_pack_ev_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+    ret <16 x i16> %c
+}
+
+;; xvpackev.w
+define <8 x i32> @shufflevector_pack_ev_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+    ret <8 x i32> %c
+}
+
+;; xvpickev.d/xvpackev.d/xvilvl.d
+define <4 x i64> @shufflevector_pack_ev_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+    ret <4 x i64> %c
+}
+
+;; xvpackev.w
+define <8 x float> @shufflevector_pack_ev_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+    ret <8 x float> %c
+}
+
+;; xvpickev.d/xvpackev.d/xvilvl.d
+define <4 x double> @shufflevector_pack_ev_v4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackev.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+    ret <4 x double> %c
+}
+
+;; xvpackod.b
+define <32 x i8> @shufflevector_pack_od_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 33, i32 3, i32 35, i32 5, i32 37, i32 7, i32 39, i32 9, i32 41, i32 11, i32 43, i32 13, i32 45, i32 15, i32 47,
+                                                              i32 17, i32 49, i32 19, i32 51, i32 21, i32 53, i32 23, i32 55, i32 25, i32 57, i32 27, i32 59, i32 29, i32 61, i32 31, i32 63>
+    ret <32 x i8> %c
+}
+
+;; xvpackod.h
+define <16 x i16> @shufflevector_pack_od_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+    ret <16 x i16> %c
+}
+
+;; xvpackod.w
+define <8 x i32> @shufflevector_pack_od_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+    ret <8 x i32> %c
+}
+
+;; xvpickod.d/xvpackod.d/xvilvh.d
+define <4 x i64> @shufflodector_pack_od_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+    ret <4 x i64> %c
+}
+
+;; xvpackod.w
+define <8 x float> @shufflodector_pack_od_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+    ret <8 x float> %c
+}
+
+;; xvpickod.d/xvpackod.d/xvilvh.d
+define <4 x double> @shufflodector_pack_od_v4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpackod.d $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+    ret <4 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll
new file mode 100644
index 0000000000000000000000000000000000000000..294d292d1764067c80a79af83c6aeb87f6982d53
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xvpickev.b
+define <32 x i8> @shufflevector_pick_ev_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickev.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46,
+                                                               i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+    ret <32 x i8> %c
+}
+
+;; xvpickev.h
+define <16 x i16> @shufflevector_pick_ev_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickev.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+    ret <16 x i16> %c
+}
+
+;; xvpickev.w
+define <8 x i32> @shufflevector_pick_ev_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickev.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+    ret <8 x i32> %c
+}
+
+;; xvpickev.w
+define <8 x float> @shufflevector_pick_ev_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickev.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+    ret <8 x float> %c
+}
+
+;; xvpickod.b
+define <32 x i8> @shufflevector_pick_od_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickod.b $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47,
+                                                               i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
+    ret <32 x i8> %c
+}
+
+;; xvpickod.h
+define <16 x i16> @shufflevector_pick_od_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickod.h $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+    ret <16 x i16> %c
+}
+
+;; xvpickod.w
+define <8 x i32> @shufflevector_pick_od_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickod.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+    ret <8 x i32> %c
+}
+
+;; xvpickod.w
+define <8 x float> @shufflodector_pick_od_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflodector_pick_od_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpickod.w $xr0, $xr1, $xr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+    ret <8 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll
new file mode 100644
index 0000000000000000000000000000000000000000..dce1e4b777e291c95482d441dcc14900f18e86bf
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xvrepl128vei.b
+define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvrepl128vei.b $xr0, $xr0, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
+                                                               i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
+    ret <32 x i8> %c
+}
+
+;; xvrepl128vei.h
+define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvrepl128vei.h $xr0, $xr0, 3
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+                                                                 i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
+    ret <16 x i16> %c
+}
+
+;; xvrepl128vei.w
+define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 78
+; CHECK-NEXT:    xvrepl128vei.w $xr0, $xr0, 3
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 3, i32 3, i32 3, i32 3>
+    ret <8 x i32> %c
+}
+
+;; xvrepl128vei.d
+define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: shufflevector_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvrepl128vei.d $xr0, $xr0, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+    ret <4 x i64> %c
+}
+
+;; xvrepl128vei.w
+define <8 x float> @shufflevector_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvrepl128vei.w $xr0, $xr0, 3
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
+    ret <8 x float> %c
+}
+
+;; xvrepl128vei.d
+define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: shufflevector_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvrepl128vei.d $xr0, $xr1, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 5, i32 7, i32 7>
+    ret <4 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fce32647da3de78a4de2635dba6b78496183e352
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xvshuf.b
+define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI0_0)
+; CHECK-NEXT:    xvld $xr2, $a0, 0
+; CHECK-NEXT:    xvshuf.b $xr0, $xr1, $xr0, $xr2
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39,
+                                                               i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
+    ret <32 x i8> %c
+}
+
+;; xvshuf.h
+define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 78
+; CHECK-NEXT:    xvpermi.d $xr1, $xr1, 78
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI1_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI1_0)
+; CHECK-NEXT:    xvld $xr0, $a0, 0
+; CHECK-NEXT:    xvshuf.h $xr0, $xr1, $xr2
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 27, i32 26, i32 25, i32 24,
+                                                                 i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 2, i32 3>
+    ret <16 x i16> %c
+}
+
+;; xvshuf.w
+define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 68
+; CHECK-NEXT:    xvpermi.d $xr1, $xr1, 68
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI2_0)
+; CHECK-NEXT:    xvld $xr0, $a0, 0
+; CHECK-NEXT:    xvshuf.w $xr0, $xr1, $xr2
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 9, i32 3, i32 2, i32 8, i32 9, i32 3, i32 2>
+    ret <8 x i32> %c
+}
+
+;; xvshuf.d
+define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: shufflevector_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 238
+; CHECK-NEXT:    xvpermi.d $xr1, $xr1, 238
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI3_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI3_0)
+; CHECK-NEXT:    xvld $xr0, $a0, 0
+; CHECK-NEXT:    xvshuf.d $xr0, $xr1, $xr2
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+    ret <4 x i64> %c
+}
+
+;; xvshuf.w
+define <8 x float> @shufflevector_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI4_0)
+; CHECK-NEXT:    xvld $xr2, $a0, 0
+; CHECK-NEXT:    xvshuf.w $xr2, $xr1, $xr0
+; CHECK-NEXT:    xvori.b $xr0, $xr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 0, i32 10, i32 9, i32 4, i32 5, i32 12, i32 13>
+    ret <8 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll
new file mode 100644
index 0000000000000000000000000000000000000000..dc4532a7292abb660a17bc031afa9372c066281a
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
+
+;; xxvshuf4i.b
+define <32 x i8> @shufflevector_xvshuf4i_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: shufflevector_xvshuf4i_v32i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvshuf4i.b $xr0, $xr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12,
+                                                               i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 26, i32 25, i32 24, i32 31, i32 30, i32 29, i32 28>
+    ret <32 x i8> %c
+}
+
+;; xvshuf4i.h
+define <16 x i16> @shufflevector_xvshuf4i_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: shufflevector_xvshuf4i_v16i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvshuf4i.h $xr0, $xr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+    ret <16 x i16> %c
+}
+
+;; xvshuf4i.w
+define <8 x i32> @shufflevector_xvshuf4i_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: shufflevector_xvshuf4i_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvshuf4i.w $xr0, $xr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+    ret <8 x i32> %c
+}
+
+;; xvshuf4i.w
+define <8 x float> @shufflevector_xvshuf4i_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: shufflevector_xvshuf4i_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvshuf4i.w $xr0, $xr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+    ret <8 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1b7a97d9f97209e6be3dfd7559fbfdccd3e10c65
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s
+
+declare <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float>)
+
+define <4 x float> @lsx_vfrecipe_s(<4 x float> %va) nounwind {
+; CHECK-LABEL: lsx_vfrecipe_s:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vfrecipe.s $vr0, $vr0
+; CHECK-NEXT:    ret
+entry:
+  %res = call <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float> %va)
+  ret <4 x float> %res
+}
+
+declare <2 x double> @llvm.loongarch.lsx.vfrecipe.d(<2 x double>)
+
+define <2 x double> @lsx_vfrecipe_d(<2 x double> %va) nounwind {
+; CHECK-LABEL: lsx_vfrecipe_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vfrecipe.d $vr0, $vr0
+; CHECK-NEXT:    ret
+entry:
+  %res = call <2 x double> @llvm.loongarch.lsx.vfrecipe.d(<2 x double> %va)
+  ret <2 x double> %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3cd6c78e87d78ba92d1fcd99b3db91b75c3d90dc
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s
+
+declare <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float>)
+
+define <4 x float> @lsx_vfrsqrte_s(<4 x float> %va) nounwind {
+; CHECK-LABEL: lsx_vfrsqrte_s:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vfrsqrte.s $vr0, $vr0
+; CHECK-NEXT:    ret
+entry:
+  %res = call <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float> %va)
+  ret <4 x float> %res
+}
+
+declare <2 x double> @llvm.loongarch.lsx.vfrsqrte.d(<2 x double>)
+
+define <2 x double> @lsx_vfrsqrte_d(<2 x double> %va) nounwind {
+; CHECK-LABEL: lsx_vfrsqrte_d:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vfrsqrte.d $vr0, $vr0
+; CHECK-NEXT:    ret
+entry:
+  %res = call <2 x double> @llvm.loongarch.lsx.vfrsqrte.d(<2 x double> %va)
+  ret <2 x double> %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll
new file mode 100644
index 0000000000000000000000000000000000000000..31398c6081c0a9357ae3c700dc7ded86c1102355
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+;; vilvl.b
+define <16 x i8> @shufflevector_vilvl_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_vilvl_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvl.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+    ret <16 x i8> %c
+}
+
+;; vilvl.h
+define <8 x i16> @shufflevector_vilvl_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_vilvl_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvl.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+    ret <8 x i16> %c
+}
+
+;; vilvl.w
+define <4 x i32> @shufflevector_vilvl_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_vilvl_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvl.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+    ret <4 x i32> %c
+}
+
+;; vilvl.w
+define <4 x float> @shufflevector_vilvl_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_vilvl_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvl.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+    ret <4 x float> %c
+}
+
+;; vilvh.b
+define <16 x i8> @shufflevector_vilvh_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_vilvh_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvh.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+    ret <16 x i8> %c
+}
+
+;; vilvh.h
+define <8 x i16> @shufflevector_vilvh_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_vilvh_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvh.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+    ret <8 x i16> %c
+}
+
+;; vilvh.w
+define <4 x i32> @shufflevector_vilvh_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_vilvh_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+    ret <4 x i32> %c
+}
+
+;; vilvh.w
+define <4 x float> @shufflevector_vilvh_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_vilvh_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vilvh.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+    ret <4 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll
new file mode 100644
index 0000000000000000000000000000000000000000..171e68306cd11026bf5b422870136fcd7e0b5e81
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+;; vpackev.b
+define <16 x i8> @shufflevector_pack_ev_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+    ret <16 x i8> %c
+}
+
+;; vpackev.h
+define <8 x i16> @shufflevector_pack_ev_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+    ret <8 x i16> %c
+}
+
+;; vpackev.w
+define <4 x i32> @shufflevector_pack_ev_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+    ret <4 x i32> %c
+}
+
+;; vpickev.d/vpackev.d/vilvl.d
+define <2 x i64> @shufflevector_pack_ev_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.d $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+    ret <2 x i64> %c
+}
+
+;; vpackev.w
+define <4 x float> @shufflevector_pack_ev_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+    ret <4 x float> %c
+}
+
+;; vpickev.d/vpackev.d/vilvl.d
+define <2 x double> @shufflevector_pack_ev_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: shufflevector_pack_ev_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackev.d $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+    ret <2 x double> %c
+}
+
+;; vpackod.b
+define <16 x i8> @shufflevector_pack_od_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+    ret <16 x i8> %c
+}
+
+;; vpackod.h
+define <8 x i16> @shufflevector_pack_od_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+    ret <8 x i16> %c
+}
+
+;; vpackod.w
+define <4 x i32> @shufflevector_pack_od_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_pack_od_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+    ret <4 x i32> %c
+}
+
+;; vpickod.d/vpackod.d/vilvh.d
+define <2 x i64> @shufflodector_pack_od_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.d $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+    ret <2 x i64> %c
+}
+
+;; vpackod.w
+define <4 x float> @shufflodector_pack_od_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+    ret <4 x float> %c
+}
+
+;; vpickod.d/vpackod.d/vilvh.d
+define <2 x double> @shufflodector_pack_od_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: shufflodector_pack_od_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackod.d $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+    ret <2 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ca636d942b583814f7bae973dd03500690b9f719
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+;; vpickev.b
+define <16 x i8> @shufflevector_pick_ev_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickev.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+    ret <16 x i8> %c
+}
+
+;; vpickev.h
+define <8 x i16> @shufflevector_pick_ev_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickev.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+    ret <8 x i16> %c
+}
+
+;; vpickev.w
+define <4 x i32> @shufflevector_pick_ev_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickev.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+    ret <4 x i32> %c
+}
+
+;; vpickev.w
+define <4 x float> @shufflevector_pick_ev_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_pick_ev_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickev.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+    ret <4 x float> %c
+}
+
+;; vpickod.b
+define <16 x i8> @shufflevector_pick_od_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickod.b $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+    ret <16 x i8> %c
+}
+
+;; vpickod.h
+define <8 x i16> @shufflevector_pick_od_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickod.h $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+    ret <8 x i16> %c
+}
+
+;; vpickod.w
+define <4 x i32> @shufflevector_pick_od_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_pick_od_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickod.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+    ret <4 x i32> %c
+}
+
+;; vpickod.w
+define <4 x float> @shufflodector_pick_od_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflodector_pick_od_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpickod.w $vr0, $vr1, $vr0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+    ret <4 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll
new file mode 100644
index 0000000000000000000000000000000000000000..10510786f3216287237c598ea8042ca853b6a06a
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+;; vreplvei.b
+define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.b $vr0, $vr0, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+    ret <16 x i8> %c
+}
+
+;; vreplvei.h
+define <8 x i16> @shufflevector_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.h $vr0, $vr1, 2
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+    ret <8 x i16> %c
+}
+
+;; vreplvei.w
+define <4 x i32> @shufflevector_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+    ret <4 x i32> %c
+}
+
+;; vreplvei.d
+define <2 x i64> @shufflevector_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: shufflevector_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 1>
+    ret <2 x i64> %c
+}
+
+;; vreplvei.w
+define <4 x float> @shufflevector_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+    ret <4 x float> %c
+}
+
+;; vreplvei.d
+define <2 x double> @shufflevector_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: shufflevector_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 1
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
+    ret <2 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll
new file mode 100644
index 0000000000000000000000000000000000000000..55800b31446b3d82a5f7efab3483a8f9d2ebb481
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI0_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.b $vr0, $vr1, $vr0, $vr2
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15, i32 2, i32 4, i32 6, i32 8, i32 25, i32 30, i32 31, i32 31>
+    ret <16 x i8> %c
+}
+
+;; vshuf.h
+define <8 x i16> @shufflevector_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI1_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI1_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.h $vr2, $vr1, $vr0
+; CHECK-NEXT:    vori.b $vr0, $vr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 8, i32 10, i32 12, i32 15>
+    ret <8 x i16> %c
+}
+
+;; vshuf.w
+define <4 x i32> @shufflevector_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI2_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI2_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.w $vr2, $vr1, $vr0
+; CHECK-NEXT:    vori.b $vr0, $vr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 5, i32 7>
+    ret <4 x i32> %c
+}
+
+;; vshuf.d
+define <2 x i64> @shufflevector_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: shufflevector_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI3_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI3_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.d $vr2, $vr1, $vr0
+; CHECK-NEXT:    vori.b $vr0, $vr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
+    ret <2 x i64> %c
+}
+
+;; vshuf.w
+define <4 x float> @shufflevector_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI4_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI4_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.w $vr2, $vr1, $vr0
+; CHECK-NEXT:    vori.b $vr0, $vr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 5, i32 7>
+    ret <4 x float> %c
+}
+
+;; vshuf.d
+define <2 x double> @shufflevector_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: shufflevector_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI5_0)
+; CHECK-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI5_0)
+; CHECK-NEXT:    vld $vr2, $a0, 0
+; CHECK-NEXT:    vshuf.d $vr2, $vr1, $vr0
+; CHECK-NEXT:    vori.b $vr0, $vr2, 0
+; CHECK-NEXT:    ret
+    %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+    ret <2 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll
new file mode 100644
index 0000000000000000000000000000000000000000..660b9581c3d1f555b358afd67e2bf664c454355b
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
+
+;; vilvh.b
+define <16 x i8> @shufflevector_vshuf4i_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shufflevector_vshuf4i_v16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshuf4i.b $vr0, $vr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+    ret <16 x i8> %c
+}
+
+;; vilvh.h
+define <8 x i16> @shufflevector_vshuf4i_v8i4(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shufflevector_vshuf4i_v8i4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshuf4i.h $vr0, $vr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+    ret <8 x i16> %c
+}
+
+;; vilvh.w
+define <4 x i32> @shufflevector_vshuf4i_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shufflevector_vshuf4i_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+    ret <4 x i32> %c
+}
+
+;; vilvh.w
+define <4 x float> @shufflevector_vshuf4i_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shufflevector_vshuf4i_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 27
+; CHECK-NEXT:    ret
+    %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+    ret <4 x float> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
new file mode 100644
index 0000000000000000000000000000000000000000..474436a0126b91eb03e71d494a77d518e8fff6cc
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll
@@ -0,0 +1,168 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc --mtriple=loongarch64 --code-model=medium --post-RA-scheduler=0 < %s \
+; RUN:     | FileCheck %s --check-prefix=MEDIUM_NO_SCH
+; RUN: llc --mtriple=loongarch64 --code-model=medium --post-RA-scheduler=1 < %s \
+; RUN:     | FileCheck %s --check-prefix=MEDIUM_SCH
+; RUN: llc --mtriple=loongarch64 --code-model=large --post-RA-scheduler=0 < %s \
+; RUN:     | FileCheck %s --check-prefix=LARGE_NO_SCH
+; RUN: llc --mtriple=loongarch64 --code-model=large --post-RA-scheduler=1 < %s \
+; RUN:     | FileCheck %s --check-prefix=LARGE_SCH
+
+@g = dso_local global i64 zeroinitializer, align 4
+@G = global i64 zeroinitializer, align 4
+@gd = external thread_local global i64
+@ld = external thread_local(localdynamic) global i64
+@ie = external thread_local(initialexec) global i64
+
+declare ptr @bar(i64)
+
+define void @foo() nounwind {
+; MEDIUM_NO_SCH-LABEL: foo:
+; MEDIUM_NO_SCH:       # %bb.0:
+; MEDIUM_NO_SCH-NEXT:    addi.d $sp, $sp, -16
+; MEDIUM_NO_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
+; MEDIUM_NO_SCH-NEXT:    ld.d $a0, $a0, %got_pc_lo12(G)
+; MEDIUM_NO_SCH-NEXT:    ld.d $a0, $a0, 0
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
+; MEDIUM_NO_SCH-NEXT:    addi.d $a0, $a0, %pc_lo12(g)
+; MEDIUM_NO_SCH-NEXT:    ld.d $a0, $a0, 0
+; MEDIUM_NO_SCH-NEXT:    ori $a0, $zero, 1
+; MEDIUM_NO_SCH-NEXT:    pcaddu18i $ra, %call36(bar)
+; MEDIUM_NO_SCH-NEXT:    jirl $ra, $ra, 0
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
+; MEDIUM_NO_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(gd)
+; MEDIUM_NO_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ld)
+; MEDIUM_NO_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(ld)
+; MEDIUM_NO_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; MEDIUM_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
+; MEDIUM_NO_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(ie)
+; MEDIUM_NO_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; MEDIUM_NO_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; MEDIUM_NO_SCH-NEXT:    addi.d $sp, $sp, 16
+; MEDIUM_NO_SCH-NEXT:    ret
+;
+; MEDIUM_SCH-LABEL: foo:
+; MEDIUM_SCH:       # %bb.0:
+; MEDIUM_SCH-NEXT:    addi.d $sp, $sp, -16
+; MEDIUM_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; MEDIUM_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
+; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, %got_pc_lo12(G)
+; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, 0
+; MEDIUM_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
+; MEDIUM_SCH-NEXT:    addi.d $a0, $a0, %pc_lo12(g)
+; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, 0
+; MEDIUM_SCH-NEXT:    ori $a0, $zero, 1
+; MEDIUM_SCH-NEXT:    pcaddu18i $ra, %call36(bar)
+; MEDIUM_SCH-NEXT:    jirl $ra, $ra, 0
+; MEDIUM_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
+; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(gd)
+; MEDIUM_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; MEDIUM_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ld)
+; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(ld)
+; MEDIUM_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; MEDIUM_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
+; MEDIUM_SCH-NEXT:    ld.d $a0, $a0, %ie_pc_lo12(ie)
+; MEDIUM_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; MEDIUM_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; MEDIUM_SCH-NEXT:    addi.d $sp, $sp, 16
+; MEDIUM_SCH-NEXT:    ret
+;
+; LARGE_NO_SCH-LABEL: foo:
+; LARGE_NO_SCH:       # %bb.0:
+; LARGE_NO_SCH-NEXT:    addi.d $sp, $sp, -16
+; LARGE_NO_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
+; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %got_pc_lo12(G)
+; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %got64_pc_lo20(G)
+; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(G)
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $t8, $a0
+; LARGE_NO_SCH-NEXT:    ld.d $a0, $a0, 0
+; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
+; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %pc_lo12(g)
+; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %pc64_lo20(g)
+; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(g)
+; LARGE_NO_SCH-NEXT:    add.d $a0, $t8, $a0
+; LARGE_NO_SCH-NEXT:    ld.d $a0, $a0, 0
+; LARGE_NO_SCH-NEXT:    ori $a0, $zero, 1
+; LARGE_NO_SCH-NEXT:    pcalau12i $ra, %got_pc_hi20(bar)
+; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %got_pc_lo12(bar)
+; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %got64_pc_lo20(bar)
+; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(bar)
+; LARGE_NO_SCH-NEXT:    ldx.d $ra, $t8, $ra
+; LARGE_NO_SCH-NEXT:    jirl $ra, $ra, 0
+; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
+; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(gd)
+; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(gd)
+; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(gd)
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $t8, $a0
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ld)
+; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ld)
+; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ld)
+; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ld)
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $t8, $a0
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; LARGE_NO_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
+; LARGE_NO_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
+; LARGE_NO_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
+; LARGE_NO_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $t8, $a0
+; LARGE_NO_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; LARGE_NO_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LARGE_NO_SCH-NEXT:    addi.d $sp, $sp, 16
+; LARGE_NO_SCH-NEXT:    ret
+;
+; LARGE_SCH-LABEL: foo:
+; LARGE_SCH:       # %bb.0:
+; LARGE_SCH-NEXT:    addi.d $sp, $sp, -16
+; LARGE_SCH-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LARGE_SCH-NEXT:    pcalau12i $a0, %got_pc_hi20(G)
+; LARGE_SCH-NEXT:    addi.d $t8, $zero, %got_pc_lo12(G)
+; LARGE_SCH-NEXT:    lu32i.d $t8, %got64_pc_lo20(G)
+; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(G)
+; LARGE_SCH-NEXT:    ldx.d $a0, $t8, $a0
+; LARGE_SCH-NEXT:    ld.d $a0, $a0, 0
+; LARGE_SCH-NEXT:    pcalau12i $a0, %pc_hi20(g)
+; LARGE_SCH-NEXT:    addi.d $t8, $zero, %pc_lo12(g)
+; LARGE_SCH-NEXT:    lu32i.d $t8, %pc64_lo20(g)
+; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(g)
+; LARGE_SCH-NEXT:    add.d $a0, $t8, $a0
+; LARGE_SCH-NEXT:    ld.d $a0, $a0, 0
+; LARGE_SCH-NEXT:    ori $a0, $zero, 1
+; LARGE_SCH-NEXT:    pcalau12i $ra, %got_pc_hi20(bar)
+; LARGE_SCH-NEXT:    addi.d $t8, $zero, %got_pc_lo12(bar)
+; LARGE_SCH-NEXT:    lu32i.d $t8, %got64_pc_lo20(bar)
+; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(bar)
+; LARGE_SCH-NEXT:    ldx.d $ra, $t8, $ra
+; LARGE_SCH-NEXT:    jirl $ra, $ra, 0
+; LARGE_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(gd)
+; LARGE_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(gd)
+; LARGE_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(gd)
+; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(gd)
+; LARGE_SCH-NEXT:    ldx.d $a0, $t8, $a0
+; LARGE_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; LARGE_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ld)
+; LARGE_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ld)
+; LARGE_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ld)
+; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ld)
+; LARGE_SCH-NEXT:    ldx.d $a0, $t8, $a0
+; LARGE_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; LARGE_SCH-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
+; LARGE_SCH-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
+; LARGE_SCH-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
+; LARGE_SCH-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
+; LARGE_SCH-NEXT:    ldx.d $a0, $t8, $a0
+; LARGE_SCH-NEXT:    ldx.d $a0, $a0, $tp
+; LARGE_SCH-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LARGE_SCH-NEXT:    addi.d $sp, $sp, 16
+; LARGE_SCH-NEXT:    ret
+  %V = load volatile i64, ptr @G
+  %v = load volatile i64, ptr @g
+  call void @bar(i64 1)
+  %v_gd = load volatile i64, ptr @gd
+  %v_ld = load volatile i64, ptr @ld
+  %v_ie = load volatile i64, ptr @ie
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/tls-models.ll b/llvm/test/CodeGen/LoongArch/tls-models.ll
index a2a3792a6a54bed7faf1038131d76a7ab7d8f18f..3994df1da7163f1e2049b52d37f5d201ac18ef0c 100644
--- a/llvm/test/CodeGen/LoongArch/tls-models.ll
+++ b/llvm/test/CodeGen/LoongArch/tls-models.ll
@@ -45,15 +45,15 @@ define ptr @f1() nounwind {
 ; LA64LARGEPIC-NEXT:    addi.d $sp, $sp, -16
 ; LA64LARGEPIC-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
 ; LA64LARGEPIC-NEXT:    pcalau12i $a0, %gd_pc_hi20(unspecified)
-; LA64LARGEPIC-NEXT:    addi.d $a1, $zero, %got_pc_lo12(unspecified)
-; LA64LARGEPIC-NEXT:    lu32i.d $a1, %got64_pc_lo20(unspecified)
-; LA64LARGEPIC-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(unspecified)
-; LA64LARGEPIC-NEXT:    add.d $a0, $a1, $a0
-; LA64LARGEPIC-NEXT:    pcalau12i $a1, %pc_hi20(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    lu32i.d $ra, %pc64_lo20(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    add.d $ra, $ra, $a1
+; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %got_pc_lo12(unspecified)
+; LA64LARGEPIC-NEXT:    lu32i.d $t8, %got64_pc_lo20(unspecified)
+; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(unspecified)
+; LA64LARGEPIC-NEXT:    add.d $a0, $t8, $a0
+; LA64LARGEPIC-NEXT:    pcalau12i $ra, %pc_hi20(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    lu32i.d $t8, %pc64_lo20(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    add.d $ra, $t8, $ra
 ; LA64LARGEPIC-NEXT:    jirl $ra, $ra, 0
 ; LA64LARGEPIC-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LA64LARGEPIC-NEXT:    addi.d $sp, $sp, 16
@@ -76,10 +76,10 @@ define ptr @f1() nounwind {
 ; LA64LARGENOPIC-LABEL: f1:
 ; LA64LARGENOPIC:       # %bb.0: # %entry
 ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %ie_pc_hi20(unspecified)
-; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(unspecified)
-; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %ie64_pc_lo20(unspecified)
-; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(unspecified)
-; LA64LARGENOPIC-NEXT:    ldx.d $a0, $a1, $a0
+; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(unspecified)
+; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %ie64_pc_lo20(unspecified)
+; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(unspecified)
+; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
 ; LA64LARGENOPIC-NEXT:    add.d $a0, $a0, $tp
 ; LA64LARGENOPIC-NEXT:    ret
 entry:
@@ -116,15 +116,15 @@ define ptr @f2() nounwind {
 ; LA64LARGEPIC-NEXT:    addi.d $sp, $sp, -16
 ; LA64LARGEPIC-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
 ; LA64LARGEPIC-NEXT:    pcalau12i $a0, %ld_pc_hi20(ld)
-; LA64LARGEPIC-NEXT:    addi.d $a1, $zero, %got_pc_lo12(ld)
-; LA64LARGEPIC-NEXT:    lu32i.d $a1, %got64_pc_lo20(ld)
-; LA64LARGEPIC-NEXT:    lu52i.d $a1, $a1, %got64_pc_hi12(ld)
-; LA64LARGEPIC-NEXT:    add.d $a0, $a1, $a0
-; LA64LARGEPIC-NEXT:    pcalau12i $a1, %pc_hi20(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    addi.d $ra, $zero, %pc_lo12(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    lu32i.d $ra, %pc64_lo20(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr)
-; LA64LARGEPIC-NEXT:    add.d $ra, $ra, $a1
+; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %got_pc_lo12(ld)
+; LA64LARGEPIC-NEXT:    lu32i.d $t8, %got64_pc_lo20(ld)
+; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %got64_pc_hi12(ld)
+; LA64LARGEPIC-NEXT:    add.d $a0, $t8, $a0
+; LA64LARGEPIC-NEXT:    pcalau12i $ra, %pc_hi20(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %pc_lo12(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    lu32i.d $t8, %pc64_lo20(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr)
+; LA64LARGEPIC-NEXT:    add.d $ra, $t8, $ra
 ; LA64LARGEPIC-NEXT:    jirl $ra, $ra, 0
 ; LA64LARGEPIC-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
 ; LA64LARGEPIC-NEXT:    addi.d $sp, $sp, 16
@@ -147,10 +147,10 @@ define ptr @f2() nounwind {
 ; LA64LARGENOPIC-LABEL: f2:
 ; LA64LARGENOPIC:       # %bb.0: # %entry
 ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %ie_pc_hi20(ld)
-; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ld)
-; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ld)
-; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ld)
-; LA64LARGENOPIC-NEXT:    ldx.d $a0, $a1, $a0
+; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ld)
+; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ld)
+; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ld)
+; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
 ; LA64LARGENOPIC-NEXT:    add.d $a0, $a0, $tp
 ; LA64LARGENOPIC-NEXT:    ret
 entry:
@@ -177,10 +177,10 @@ define ptr @f3() nounwind {
 ; LA64LARGEPIC-LABEL: f3:
 ; LA64LARGEPIC:       # %bb.0: # %entry
 ; LA64LARGEPIC-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
-; LA64LARGEPIC-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ie)
-; LA64LARGEPIC-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ie)
-; LA64LARGEPIC-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
-; LA64LARGEPIC-NEXT:    ldx.d $a0, $a1, $a0
+; LA64LARGEPIC-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
+; LA64LARGEPIC-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
+; LA64LARGEPIC-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
+; LA64LARGEPIC-NEXT:    ldx.d $a0, $t8, $a0
 ; LA64LARGEPIC-NEXT:    add.d $a0, $a0, $tp
 ; LA64LARGEPIC-NEXT:    ret
 ;
@@ -201,10 +201,10 @@ define ptr @f3() nounwind {
 ; LA64LARGENOPIC-LABEL: f3:
 ; LA64LARGENOPIC:       # %bb.0: # %entry
 ; LA64LARGENOPIC-NEXT:    pcalau12i $a0, %ie_pc_hi20(ie)
-; LA64LARGENOPIC-NEXT:    addi.d $a1, $zero, %ie_pc_lo12(ie)
-; LA64LARGENOPIC-NEXT:    lu32i.d $a1, %ie64_pc_lo20(ie)
-; LA64LARGENOPIC-NEXT:    lu52i.d $a1, $a1, %ie64_pc_hi12(ie)
-; LA64LARGENOPIC-NEXT:    ldx.d $a0, $a1, $a0
+; LA64LARGENOPIC-NEXT:    addi.d $t8, $zero, %ie_pc_lo12(ie)
+; LA64LARGENOPIC-NEXT:    lu32i.d $t8, %ie64_pc_lo20(ie)
+; LA64LARGENOPIC-NEXT:    lu52i.d $t8, $t8, %ie64_pc_hi12(ie)
+; LA64LARGENOPIC-NEXT:    ldx.d $a0, $t8, $a0
 ; LA64LARGENOPIC-NEXT:    add.d $a0, $a0, $tp
 ; LA64LARGENOPIC-NEXT:    ret
 entry:
diff --git a/llvm/test/MC/LoongArch/Basic/Float/d-arith.s b/llvm/test/MC/LoongArch/Basic/Float/d-arith.s
index 6b2c67e9a2cc174dd03d5f43503fe60215c6aa6a..8e19d2e34f3c5a240831f69965b6cf1a0237f90c 100644
--- a/llvm/test/MC/LoongArch/Basic/Float/d-arith.s
+++ b/llvm/test/MC/LoongArch/Basic/Float/d-arith.s
@@ -78,10 +78,18 @@ fsqrt.d $fa2, $ft3
 # ASM: encoding: [0x7b,0x5b,0x14,0x01]
 frecip.d $fs3, $fs3
 
+# ASM-AND-OBJ: frecipe.d $fa0, $fa0
+# ASM: encoding: [0x00,0x78,0x14,0x01]
+frecipe.d $fa0, $fa0
+
 # ASM-AND-OBJ: frsqrt.d $ft14, $fa3
 # ASM: encoding: [0x76,0x68,0x14,0x01]
 frsqrt.d $ft14, $fa3
 
+# ASM-AND-OBJ: frsqrte.d $fa1, $fa1
+# ASM: encoding: [0x21,0x88,0x14,0x01]
+frsqrte.d $fa1, $fa1
+
 # ASM-AND-OBJ: fscaleb.d $ft4, $ft6, $fs2
 # ASM: encoding: [0xcc,0x69,0x11,0x01]
 fscaleb.d $ft4, $ft6, $fs2
diff --git a/llvm/test/MC/LoongArch/Basic/Float/f-arith.s b/llvm/test/MC/LoongArch/Basic/Float/f-arith.s
index 155e783cf4350526cc0bf9056bdf1580b19a2b67..c32151adbf3b30ca782e9e99a44de7a1240a7775 100644
--- a/llvm/test/MC/LoongArch/Basic/Float/f-arith.s
+++ b/llvm/test/MC/LoongArch/Basic/Float/f-arith.s
@@ -73,10 +73,18 @@ fsqrt.s $fs3, $ft10
 # ASM: encoding: [0x71,0x57,0x14,0x01]
 frecip.s $ft9, $fs3
 
+# ASM-AND-OBJ: frecipe.s $fa0, $fa0
+# ASM: encoding: [0x00,0x74,0x14,0x01]
+frecipe.s $fa0, $fa0
+
 # ASM-AND-OBJ: frsqrt.s $fs1, $ft4
 # ASM: encoding: [0x99,0x65,0x14,0x01]
 frsqrt.s $fs1, $ft4
 
+# ASM-AND-OBJ: frsqrte.s $fa1, $fa1
+# ASM: encoding: [0x21,0x84,0x14,0x01]
+frsqrte.s $fa1, $fa1
+
 # ASM-AND-OBJ: fscaleb.s $ft13, $ft15, $fa6
 # ASM: encoding: [0xf5,0x9a,0x10,0x01]
 fscaleb.s $ft13, $ft15, $fa6
diff --git a/llvm/test/MC/LoongArch/Basic/Integer/atomic.s b/llvm/test/MC/LoongArch/Basic/Integer/atomic.s
index a35211db885141ac8281cc5506baa99060e8de14..69acdeef935ccc362bd2aabe2e345851f5ec2a21 100644
--- a/llvm/test/MC/LoongArch/Basic/Integer/atomic.s
+++ b/llvm/test/MC/LoongArch/Basic/Integer/atomic.s
@@ -21,6 +21,14 @@ ll.w $tp, $s4, 220
 # CHECK-ASM: encoding: [0xd3,0x39,0x00,0x21]
 sc.w $t7, $t2, 56
 
+# CHECK-ASM-AND-OBJ: llacq.w $t1, $t2
+# CHECK-ASM: encoding: [0xcd,0x81,0x57,0x38]
+llacq.w $t1, $t2
+
+# CHECK-ASM-AND-OBJ: screl.w $t1, $t2
+# CHECK-ASM: encoding: [0xcd,0x85,0x57,0x38]
+screl.w $t1, $t2
+
 
 
 #############################################################
@@ -29,6 +37,14 @@ sc.w $t7, $t2, 56
 
 .ifdef LA64
 
+# CHECK64-ASM-AND-OBJ: amswap.b $a2, $t0, $s1
+# CHECK64-ASM: encoding: [0x06,0x33,0x5c,0x38]
+amswap.b $a2, $t0, $s1, 0
+
+# CHECK64-ASM-AND-OBJ: amswap.h $a2, $t0, $s1
+# CHECK64-ASM: encoding: [0x06,0xb3,0x5c,0x38]
+amswap.h $a2, $t0, $s1, 0
+
 # CHECK64-ASM-AND-OBJ: amswap.w $a2, $t0, $s1
 # CHECK64-ASM: encoding: [0x06,0x33,0x60,0x38]
 amswap.w $a2, $t0, $s1, 0
@@ -41,6 +57,14 @@ amswap.w $zero, $t0, $zero
 # CHECK64-ASM: encoding: [0xa0,0x00,0x6a,0x38]
 amadd_db.w $zero, $zero, $a1
 
+# CHECK64-ASM-AND-OBJ: amswap.b $a2, $t0, $s1
+# CHECK64-ASM: encoding: [0x06,0x33,0x5c,0x38]
+amswap.b $a2, $t0, $s1
+
+# CHECK64-ASM-AND-OBJ: amswap.h $a2, $t0, $s1
+# CHECK64-ASM: encoding: [0x06,0xb3,0x5c,0x38]
+amswap.h $a2, $t0, $s1
+
 # CHECK64-ASM-AND-OBJ: amswap.w $a2, $t0, $s1
 # CHECK64-ASM: encoding: [0x06,0x33,0x60,0x38]
 amswap.w $a2, $t0, $s1
@@ -49,6 +73,14 @@ amswap.w $a2, $t0, $s1
 # CHECK64-ASM: encoding: [0xc2,0xba,0x60,0x38]
 amswap.d $tp, $t2, $fp
 
+# CHECK64-ASM-AND-OBJ: amadd.b $a4, $t0, $r21
+# CHECK64-ASM: encoding: [0xa8,0x32,0x5d,0x38]
+amadd.b $a4, $t0, $r21
+
+# CHECK64-ASM-AND-OBJ: amadd.h $a1, $t5, $s6
+# CHECK64-ASM: encoding: [0xa5,0xc7,0x5d,0x38]
+amadd.h $a1, $t5, $s6
+
 # CHECK64-ASM-AND-OBJ: amadd.w $a4, $t0, $r21
 # CHECK64-ASM: encoding: [0xa8,0x32,0x61,0x38]
 amadd.w $a4, $t0, $r21
@@ -113,6 +145,14 @@ ammin.wu $a4, $t6, $s7
 # CHECK64-ASM: encoding: [0x27,0xc3,0x68,0x38]
 ammin.du $a3, $t4, $s2
 
+# CHECK64-ASM-AND-OBJ: amswap_db.b $a2, $t0, $s1
+# CHECK64-ASM: encoding: [0x06,0x33,0x5e,0x38]
+amswap_db.b $a2, $t0, $s1
+
+# CHECK64-ASM-AND-OBJ: amswap_db.h $tp, $t2, $fp
+# CHECK64-ASM: encoding: [0xc2,0xba,0x5e,0x38]
+amswap_db.h $tp, $t2, $fp
+
 # CHECK64-ASM-AND-OBJ: amswap_db.w $a2, $t0, $s1
 # CHECK64-ASM: encoding: [0x06,0x33,0x69,0x38]
 amswap_db.w $a2, $t0, $s1
@@ -121,6 +161,14 @@ amswap_db.w $a2, $t0, $s1
 # CHECK64-ASM: encoding: [0xc2,0xba,0x69,0x38]
 amswap_db.d $tp, $t2, $fp
 
+# CHECK64-ASM-AND-OBJ: amadd_db.b $zero, $zero, $a1
+# CHECK64-ASM: encoding: [0xa0,0x00,0x5f,0x38]
+amadd_db.b $zero, $zero, $a1
+
+# CHECK64-ASM-AND-OBJ: amadd_db.h $a4, $t0, $r21
+# CHECK64-ASM: encoding: [0xa8,0xb2,0x5f,0x38]
+amadd_db.h $a4, $t0, $r21
+
 # CHECK64-ASM-AND-OBJ: amadd_db.w $a4, $t0, $r21
 # CHECK64-ASM: encoding: [0xa8,0x32,0x6a,0x38]
 amadd_db.w $a4, $t0, $r21
@@ -185,6 +233,38 @@ ammin_db.wu $a4, $t6, $s7
 # CHECK64-ASM: encoding: [0x27,0xc3,0x71,0x38]
 ammin_db.du $a3, $t4, $s2
 
+# CHECK64-ASM-AND-OBJ: amcas.b $t1, $t2, $t3
+# CHECK64-ASM: encoding: [0xed,0x39,0x58,0x38]
+amcas.b $t1, $t2, $t3
+
+# CHECK64-ASM-AND-OBJ: amcas.h $t1, $t2, $t3
+# CHECK64-ASM: encoding: [0xed,0xb9,0x58,0x38]
+amcas.h $t1, $t2, $t3
+
+# CHECK64-ASM-AND-OBJ: amcas.w $t1, $t2, $t3
+# CHECK64-ASM: encoding: [0xed,0x39,0x59,0x38]
+amcas.w $t1, $t2, $t3
+
+# CHECK64-ASM-AND-OBJ: amcas.d $t1, $t2, $t3
+# CHECK64-ASM: encoding: [0xed,0xb9,0x59,0x38]
+amcas.d $t1, $t2, $t3
+
+# CHECK64-ASM-AND-OBJ: amcas_db.b $t1, $t2, $t3
+# CHECK64-ASM: encoding: [0xed,0x39,0x5a,0x38]
+amcas_db.b $t1, $t2, $t3
+
+# CHECK64-ASM-AND-OBJ: amcas_db.h $t1, $t2, $t3
+# CHECK64-ASM: encoding: [0xed,0xb9,0x5a,0x38]
+amcas_db.h $t1, $t2, $t3
+
+# CHECK64-ASM-AND-OBJ: amcas_db.w $t1, $t2, $t3
+# CHECK64-ASM: encoding: [0xed,0x39,0x5b,0x38]
+amcas_db.w $t1, $t2, $t3
+
+# CHECK64-ASM-AND-OBJ: amcas_db.d $t1, $t2, $t3
+# CHECK64-ASM: encoding: [0xed,0xb9,0x5b,0x38]
+amcas_db.d $t1, $t2, $t3
+
 # CHECK64-ASM-AND-OBJ: ll.d $s2, $s4, 16
 # CHECK64-ASM: encoding: [0x79,0x13,0x00,0x22]
 ll.d $s2, $s4, 16
@@ -193,5 +273,17 @@ ll.d $s2, $s4, 16
 # CHECK64-ASM: encoding: [0x31,0xf6,0x00,0x23]
 sc.d $t5, $t5, 244
 
+# CHECK64-ASM-AND-OBJ: sc.q $t7, $t2, $t5
+# CHECK64-ASM: encoding: [0x33,0x3a,0x57,0x38]
+sc.q $t7, $t2, $t5
+
+# CHECK64-ASM-AND-OBJ: llacq.d $t1, $t2
+# CHECK64-ASM: encoding: [0xcd,0x89,0x57,0x38]
+llacq.d $t1, $t2
+
+# CHECK64-ASM-AND-OBJ: screl.d $t1, $t2
+# CHECK64-ASM: encoding: [0xcd,0x8d,0x57,0x38]
+screl.d $t1, $t2
+
 .endif
 
diff --git a/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s b/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s
index acddca9432a698aa30f7bff95dd60b7417edd72f..1c1c658ad440f83141b32e3290a34f86f2f0fc6f 100644
--- a/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s
+++ b/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s
@@ -65,7 +65,7 @@ addu16i.d $a0, $a0, 32768
 
 ## simm20
 pcaddu18i $a0, 0x80000
-# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-524288, 524287]
+# CHECK: :[[#@LINE-1]]:16: error: operand must be a symbol with modifier (e.g. %call36) or an integer in the range [-524288, 524287]
 
 ## simm20_lu32id
 lu32i.d $a0, 0x80000
diff --git a/llvm/test/MC/LoongArch/Macros/macros-call.s b/llvm/test/MC/LoongArch/Macros/macros-call.s
new file mode 100644
index 0000000000000000000000000000000000000000..a648a397803817943485aecda02d5e90ffad9543
--- /dev/null
+++ b/llvm/test/MC/LoongArch/Macros/macros-call.s
@@ -0,0 +1,9 @@
+# RUN: llvm-mc --triple=loongarch64 %s | FileCheck %s
+
+call36 sym_call
+# CHECK:      pcaddu18i $ra, %call36(sym_call)
+# CHECK-NEXT: jirl $ra, $ra, 0
+
+tail36 $t0, sym_tail
+# CHECK:      pcaddu18i $t0, %call36(sym_tail)
+# CHECK-NEXT: jr $t0
diff --git a/llvm/test/MC/LoongArch/Relocations/relocations.s b/llvm/test/MC/LoongArch/Relocations/relocations.s
index 042cc93470a1e5b1b72b55814e0d78529bc0bbe1..bec71e103893331e0c3392133442f25fa986064a 100644
--- a/llvm/test/MC/LoongArch/Relocations/relocations.s
+++ b/llvm/test/MC/LoongArch/Relocations/relocations.s
@@ -218,3 +218,8 @@ lu12i.w $t1, %gd_hi20(foo)
 # RELOC: R_LARCH_TLS_GD_HI20 foo 0x0
 # INSTR: lu12i.w $t1, %gd_hi20(foo)
 # FIXUP: fixup A - offset: 0, value: %gd_hi20(foo), kind: FK_NONE
+
+pcaddu18i $t1, %call36(foo)
+# RELOC: R_LARCH_CALL36 foo 0x0
+# INSTR: pcaddu18i $t1, %call36(foo)
+# FIXUP: fixup A - offset: 0, value: %call36(foo), kind: FK_NONE
diff --git a/llvm/test/MC/LoongArch/lasx/frecip.s b/llvm/test/MC/LoongArch/lasx/frecip.s
index 1bb3ce02fb9c056f3e0621cb7f418edab2403ab1..e95b03a96ebaf4d921fd8f1bc2f25af879930209 100644
--- a/llvm/test/MC/LoongArch/lasx/frecip.s
+++ b/llvm/test/MC/LoongArch/lasx/frecip.s
@@ -10,3 +10,11 @@ xvfrecip.s $xr3, $xr16
 xvfrecip.d $xr17, $xr24
 # CHECK-INST: xvfrecip.d $xr17, $xr24
 # CHECK-ENCODING: encoding: [0x11,0xfb,0x9c,0x76]
+
+xvfrecipe.s $xr3, $xr16
+# CHECK-INST: xvfrecipe.s $xr3, $xr16
+# CHECK-ENCODING: encoding: [0x03,0x16,0x9d,0x76]
+
+xvfrecipe.d $xr17, $xr24
+# CHECK-INST: xvfrecipe.d $xr17, $xr24
+# CHECK-ENCODING: encoding: [0x11,0x1b,0x9d,0x76]
diff --git a/llvm/test/MC/LoongArch/lasx/frsqrt.s b/llvm/test/MC/LoongArch/lasx/frsqrt.s
index af96e10832dfb13206953a78f7deec2f0c3f4db9..d1048f9ff8f0ef4fa0dfe6f4b6489489348ea394 100644
--- a/llvm/test/MC/LoongArch/lasx/frsqrt.s
+++ b/llvm/test/MC/LoongArch/lasx/frsqrt.s
@@ -10,3 +10,11 @@ xvfrsqrt.s $xr31, $xr25
 xvfrsqrt.d $xr14, $xr22
 # CHECK-INST: xvfrsqrt.d $xr14, $xr22
 # CHECK-ENCODING: encoding: [0xce,0x0a,0x9d,0x76]
+
+xvfrsqrte.s $xr31, $xr25
+# CHECK-INST: xvfrsqrte.s $xr31, $xr25
+# CHECK-ENCODING: encoding: [0x3f,0x27,0x9d,0x76]
+
+xvfrsqrte.d $xr14, $xr22
+# CHECK-INST: xvfrsqrte.d $xr14, $xr22
+# CHECK-ENCODING: encoding: [0xce,0x2a,0x9d,0x76]
diff --git a/llvm/test/MC/LoongArch/lsx/frecip.s b/llvm/test/MC/LoongArch/lsx/frecip.s
index d8c8278d16675e3dbb6d3e271cfa695ad2da5d1f..cd6d925e1470c39bda67f52b76f6d0cde0526801 100644
--- a/llvm/test/MC/LoongArch/lsx/frecip.s
+++ b/llvm/test/MC/LoongArch/lsx/frecip.s
@@ -10,3 +10,11 @@ vfrecip.s $vr29, $vr14
 vfrecip.d $vr24, $vr9
 # CHECK-INST: vfrecip.d $vr24, $vr9
 # CHECK-ENCODING: encoding: [0x38,0xf9,0x9c,0x72]
+
+vfrecipe.s $vr29, $vr14
+# CHECK-INST: vfrecipe.s $vr29, $vr14
+# CHECK-ENCODING: encoding: [0xdd,0x15,0x9d,0x72]
+
+vfrecipe.d $vr24, $vr9
+# CHECK-INST: vfrecipe.d $vr24, $vr9
+# CHECK-ENCODING: encoding: [0x38,0x19,0x9d,0x72]
diff --git a/llvm/test/MC/LoongArch/lsx/frsqrt.s b/llvm/test/MC/LoongArch/lsx/frsqrt.s
index 68b0cc091b8ae254fc929b61647a2694bfe445f5..d8b9fc3d0684544ff626c8914e06024db7f09ef8 100644
--- a/llvm/test/MC/LoongArch/lsx/frsqrt.s
+++ b/llvm/test/MC/LoongArch/lsx/frsqrt.s
@@ -10,3 +10,11 @@ vfrsqrt.s $vr19, $vr30
 vfrsqrt.d $vr1, $vr0
 # CHECK-INST: vfrsqrt.d $vr1, $vr0
 # CHECK-ENCODING: encoding: [0x01,0x08,0x9d,0x72]
+
+vfrsqrte.s $vr19, $vr30
+# CHECK-INST: vfrsqrte.s $vr19, $vr30
+# CHECK-ENCODING: encoding: [0xd3,0x27,0x9d,0x72]
+
+vfrsqrte.d $vr1, $vr0
+# CHECK-INST: vfrsqrte.d $vr1, $vr0
+# CHECK-ENCODING: encoding: [0x01,0x28,0x9d,0x72]
diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
index a8ac2411dd82166c805a79f3f03dd318f999777a..6ab300859f9d539f1eb6185cc4f287758105f972 100644
--- a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
+++ b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt < %s -passes=loop-vectorize -mtriple loongarch64-linux-gnu -mattr=+lasx,+auto-vec -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -mtriple loongarch64-linux-gnu -mattr=+lasx -S | FileCheck %s
 
 ;; This is a collection of tests whose only purpose is to show changes in the
 ;; default configuration.  Please keep these tests minimal - if you're testing
diff --git a/llvm/test/tools/llvm-readobj/ELF/reloc-types-loongarch64.test b/llvm/test/tools/llvm-readobj/ELF/reloc-types-loongarch64.test
index e32dc893fa7985d41986aab7c874c25117a87323..88ff7fa405ed95f2aa23507cfd5992f44fce3dc3 100644
--- a/llvm/test/tools/llvm-readobj/ELF/reloc-types-loongarch64.test
+++ b/llvm/test/tools/llvm-readobj/ELF/reloc-types-loongarch64.test
@@ -102,6 +102,7 @@
 # CHECK: Type: R_LARCH_ADD_ULEB128 (107)
 # CHECK: Type: R_LARCH_SUB_ULEB128 (108)
 # CHECK: Type: R_LARCH_64_PCREL (109)
+# CHECK: Type: R_LARCH_CALL36 (110)
 
 --- !ELF
 FileHeader:
@@ -211,3 +212,4 @@ Sections:
       - Type: R_LARCH_ADD_ULEB128
       - Type: R_LARCH_SUB_ULEB128
       - Type: R_LARCH_64_PCREL
+      - Type: R_LARCH_CALL36
diff --git a/llvm/unittests/Object/ELFTest.cpp b/llvm/unittests/Object/ELFTest.cpp
index 50b1df124a4a58c862a0cc5de6b5568bf0a98244..ed851dde4c00add5bf40bbd56e246973ac8c4132 100644
--- a/llvm/unittests/Object/ELFTest.cpp
+++ b/llvm/unittests/Object/ELFTest.cpp
@@ -251,6 +251,8 @@ TEST(ELFTest, getELFRelocationTypeNameForLoongArch) {
             getELFRelocationTypeName(EM_LOONGARCH, R_LARCH_SUB_ULEB128));
   EXPECT_EQ("R_LARCH_64_PCREL",
             getELFRelocationTypeName(EM_LOONGARCH, R_LARCH_64_PCREL));
+  EXPECT_EQ("R_LARCH_CALL36",
+            getELFRelocationTypeName(EM_LOONGARCH, R_LARCH_CALL36));
 }
 
 TEST(ELFTest, getELFRelativeRelocationType) {