From f29e3618e267533beb40d05dc5974488ce5f7847 Mon Sep 17 00:00:00 2001 From: Ami-zhang Date: Mon, 23 Sep 2024 10:16:13 +0800 Subject: [PATCH] [Backport][LoongArch] Fix and add some new support --- ...ongArch-fix-and-add-some-new-support.patch | 5463 +++++++++++++++++ llvm.spec | 6 +- 2 files changed, 5468 insertions(+), 1 deletion(-) create mode 100644 0024-Backport-LoongArch-fix-and-add-some-new-support.patch diff --git a/0024-Backport-LoongArch-fix-and-add-some-new-support.patch b/0024-Backport-LoongArch-fix-and-add-some-new-support.patch new file mode 100644 index 0000000..9681d0c --- /dev/null +++ b/0024-Backport-LoongArch-fix-and-add-some-new-support.patch @@ -0,0 +1,5463 @@ +From 53a624f1fbb2d1f837070b400812e8bddf66fd3d Mon Sep 17 00:00:00 2001 +From: Lu Weining +Date: Tue, 5 Dec 2023 09:20:48 +0800 +Subject: [PATCH 01/12] [BinaryFormat][LoongArch] Define psABI v2.20 relocs for + R_LARCH_CALL36(#73345) + +R_LARCH_CALL36 was designed for function call on medium code model where +the 2 instructions (pcaddu18i + jirl) must be adjacent. + +(cherry picked from commit c3a9c905fbc486add75e16218fe58a04b7b6c282) +--- + llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def | 6 ++++++ + .../tools/llvm-readobj/ELF/reloc-types-loongarch64.test | 2 ++ + llvm/unittests/Object/ELFTest.cpp | 2 ++ + 3 files changed, 10 insertions(+) + +diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def +index 02bce3c71712..c4393432677b 100644 +--- a/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def ++++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/LoongArch.def +@@ -118,3 +118,9 @@ ELF_RELOC(R_LARCH_SUB6, 106) + ELF_RELOC(R_LARCH_ADD_ULEB128, 107) + ELF_RELOC(R_LARCH_SUB_ULEB128, 108) + ELF_RELOC(R_LARCH_64_PCREL, 109) ++ ++// Relocs added in ELF for the LoongArchâ„¢ Architecture v20231102, part of the ++// v2.20 LoongArch ABI specs. 
++// ++// Spec addition: https://github.com/loongson/la-abi-specs/pull/4 ++ELF_RELOC(R_LARCH_CALL36, 110) +diff --git a/llvm/test/tools/llvm-readobj/ELF/reloc-types-loongarch64.test b/llvm/test/tools/llvm-readobj/ELF/reloc-types-loongarch64.test +index e32dc893fa79..88ff7fa405ed 100644 +--- a/llvm/test/tools/llvm-readobj/ELF/reloc-types-loongarch64.test ++++ b/llvm/test/tools/llvm-readobj/ELF/reloc-types-loongarch64.test +@@ -102,6 +102,7 @@ + # CHECK: Type: R_LARCH_ADD_ULEB128 (107) + # CHECK: Type: R_LARCH_SUB_ULEB128 (108) + # CHECK: Type: R_LARCH_64_PCREL (109) ++# CHECK: Type: R_LARCH_CALL36 (110) + + --- !ELF + FileHeader: +@@ -211,3 +212,4 @@ Sections: + - Type: R_LARCH_ADD_ULEB128 + - Type: R_LARCH_SUB_ULEB128 + - Type: R_LARCH_64_PCREL ++ - Type: R_LARCH_CALL36 +diff --git a/llvm/unittests/Object/ELFTest.cpp b/llvm/unittests/Object/ELFTest.cpp +index 50b1df124a4a..ed851dde4c00 100644 +--- a/llvm/unittests/Object/ELFTest.cpp ++++ b/llvm/unittests/Object/ELFTest.cpp +@@ -251,6 +251,8 @@ TEST(ELFTest, getELFRelocationTypeNameForLoongArch) { + getELFRelocationTypeName(EM_LOONGARCH, R_LARCH_SUB_ULEB128)); + EXPECT_EQ("R_LARCH_64_PCREL", + getELFRelocationTypeName(EM_LOONGARCH, R_LARCH_64_PCREL)); ++ EXPECT_EQ("R_LARCH_CALL36", ++ getELFRelocationTypeName(EM_LOONGARCH, R_LARCH_CALL36)); + } + + TEST(ELFTest, getELFRelativeRelocationType) { +-- +2.20.1 + + +From a8ed0f26220bbacb2c485a392f79ac4b271d73af Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Tue, 2 Jan 2024 10:55:02 +0800 +Subject: [PATCH 02/12] [LoongArch] Emit function call code sequence as + `PCADDU18I+JIRL` in medium code model + +According to the description of the psABI v2.20: +https://github.com/loongson/la-abi-specs/releases/tag/v2.20, adjustments +are made to the function call instructions under the medium code model. + +At the same time, AsmParser has already supported parsing the call36 and +tail36 macro instructions. + +(cherry picked from commit 2cf420d5b846a4733ef0ef7c8ed0ae0bfd1c6772) +--- + .../AsmParser/LoongArchAsmParser.cpp | 61 +++++++++++++++++++ + .../LoongArch/LoongArchExpandPseudoInsts.cpp | 29 ++++----- + .../Target/LoongArch/LoongArchInstrInfo.td | 23 ++++++- + .../Target/LoongArch/LoongArchMCInstLower.cpp | 3 + + .../LoongArch/LoongArchTargetMachine.cpp | 4 +- + .../MCTargetDesc/LoongArchBaseInfo.h | 1 + + .../MCTargetDesc/LoongArchELFObjectWriter.cpp | 2 + + .../MCTargetDesc/LoongArchFixupKinds.h | 3 + + .../MCTargetDesc/LoongArchMCCodeEmitter.cpp | 3 + + .../MCTargetDesc/LoongArchMCExpr.cpp | 3 + + .../LoongArch/MCTargetDesc/LoongArchMCExpr.h | 1 + + llvm/test/CodeGen/LoongArch/code-models.ll | 12 ++-- + .../MC/LoongArch/Basic/Integer/invalid64.s | 2 +- + llvm/test/MC/LoongArch/Macros/macros-call.s | 9 +++ + .../MC/LoongArch/Relocations/relocations.s | 5 ++ + 15 files changed, 133 insertions(+), 28 deletions(-) + create mode 100644 llvm/test/MC/LoongArch/Macros/macros-call.s + +diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +index a132e645c864..f908e5bc63d3 100644 +--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp ++++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +@@ -122,6 +122,10 @@ class LoongArchAsmParser : public MCTargetAsmParser { + // Helper to emit pseudo instruction "li.w/d $rd, $imm". + void emitLoadImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out); + ++ // Helper to emit pseudo instruction "call36 sym" or "tail36 $rj, sym". 
++ void emitFuncCall36(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, ++ bool IsTailCall); ++ + public: + enum LoongArchMatchResultTy { + Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY, +@@ -401,6 +405,22 @@ public: + IsValidKind; + } + ++ bool isSImm20pcaddu18i() const { ++ if (!isImm()) ++ return false; ++ ++ int64_t Imm; ++ LoongArchMCExpr::VariantKind VK = LoongArchMCExpr::VK_LoongArch_None; ++ bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); ++ bool IsValidKind = VK == LoongArchMCExpr::VK_LoongArch_None || ++ VK == LoongArchMCExpr::VK_LoongArch_CALL36; ++ ++ return IsConstantImm ++ ? isInt<20>(Imm) && IsValidKind ++ : LoongArchAsmParser::classifySymbolRef(getImm(), VK) && ++ IsValidKind; ++ } ++ + bool isSImm21lsl2() const { + if (!isImm()) + return false; +@@ -1111,6 +1131,35 @@ void LoongArchAsmParser::emitLoadImm(MCInst &Inst, SMLoc IDLoc, + } + } + ++void LoongArchAsmParser::emitFuncCall36(MCInst &Inst, SMLoc IDLoc, ++ MCStreamer &Out, bool IsTailCall) { ++ // call36 sym ++ // expands to: ++ // pcaddu18i $ra, %call36(sym) ++ // jirl $ra, $ra, 0 ++ // ++ // tail36 $rj, sym ++ // expands to: ++ // pcaddu18i $rj, %call36(sym) ++ // jirl $r0, $rj, 0 ++ unsigned ScratchReg = ++ IsTailCall ? Inst.getOperand(0).getReg() : (unsigned)LoongArch::R1; ++ const MCExpr *Sym = ++ IsTailCall ? Inst.getOperand(1).getExpr() : Inst.getOperand(0).getExpr(); ++ const LoongArchMCExpr *LE = LoongArchMCExpr::create( ++ Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, getContext()); ++ ++ Out.emitInstruction( ++ MCInstBuilder(LoongArch::PCADDU18I).addReg(ScratchReg).addExpr(LE), ++ getSTI()); ++ Out.emitInstruction( ++ MCInstBuilder(LoongArch::JIRL) ++ .addReg(IsTailCall ? (unsigned)LoongArch::R0 : ScratchReg) ++ .addReg(ScratchReg) ++ .addImm(0), ++ getSTI()); ++} ++ + bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, + OperandVector &Operands, + MCStreamer &Out) { +@@ -1159,6 +1208,12 @@ bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, + case LoongArch::PseudoLI_D: + emitLoadImm(Inst, IDLoc, Out); + return false; ++ case LoongArch::PseudoCALL36: ++ emitFuncCall36(Inst, IDLoc, Out, /*IsTailCall=*/false); ++ return false; ++ case LoongArch::PseudoTAIL36: ++ emitFuncCall36(Inst, IDLoc, Out, /*IsTailCall=*/true); ++ return false; + } + Out.emitInstruction(Inst, getSTI()); + return false; +@@ -1440,6 +1495,12 @@ bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + /*Upper=*/(1 << 19) - 1, + "operand must be a symbol with modifier (e.g. %pc_hi20) or an integer " + "in the range"); ++ case Match_InvalidSImm20pcaddu18i: ++ return generateImmOutOfRangeError( ++ Operands, ErrorInfo, /*Lower=*/-(1 << 19), ++ /*Upper=*/(1 << 19) - 1, ++ "operand must be a symbol with modifier (e.g. 
%call36) or an integer " ++ "in the range"); + case Match_InvalidSImm21lsl2: + return generateImmOutOfRangeError( + Operands, ErrorInfo, /*Lower=*/-(1 << 22), /*Upper=*/(1 << 22) - 4, +diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +index 72c1f1cec198..8eda2dcc1633 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +@@ -458,11 +458,11 @@ bool LoongArchPreRAExpandPseudo::expandFunctionCALL( + } + case CodeModel::Medium: { + // CALL: +- // pcalau12i $ra, %pc_hi20(func) +- // jirl $ra, $ra, %pc_lo12(func) ++ // pcaddu18i $ra, %call36(func) ++ // jirl $ra, $ra, 0 + // TAIL: +- // pcalau12i $scratch, %pc_hi20(func) +- // jirl $r0, $scratch, %pc_lo12(func) ++ // pcaddu18i $scratch, %call36(func) ++ // jirl $r0, $scratch, 0 + Opcode = + IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; + Register ScratchReg = +@@ -470,18 +470,15 @@ bool LoongArchPreRAExpandPseudo::expandFunctionCALL( + ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) + : LoongArch::R1; + MachineInstrBuilder MIB = +- BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), ScratchReg); +- CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg); +- if (Func.isSymbol()) { +- const char *FnName = Func.getSymbolName(); +- MIB.addExternalSymbol(FnName, LoongArchII::MO_PCREL_HI); +- CALL.addExternalSymbol(FnName, LoongArchII::MO_PCREL_LO); +- break; +- } +- assert(Func.isGlobal() && "Expected a GlobalValue at this time"); +- const GlobalValue *GV = Func.getGlobal(); +- MIB.addGlobalAddress(GV, 0, LoongArchII::MO_PCREL_HI); +- CALL.addGlobalAddress(GV, 0, LoongArchII::MO_PCREL_LO); ++ BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg); ++ ++ CALL = ++ BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0); ++ ++ if (Func.isSymbol()) ++ MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36); ++ else ++ MIB.addDisp(Func, 0, LoongArchII::MO_CALL36); + break; + } + case CodeModel::Large: { +diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +index ab1890556814..67de5f7afd78 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +@@ -351,6 +351,10 @@ def simm20_lu32id : SImm20Operand { + let ParserMatchClass = SImmAsmOperand<20, "lu32id">; + } + ++def simm20_pcaddu18i : SImm20Operand { ++ let ParserMatchClass = SImmAsmOperand<20, "pcaddu18i">; ++} ++ + def simm21_lsl2 : Operand { + let ParserMatchClass = SImmAsmOperand<21, "lsl2">; + let EncoderMethod = "getImmOpValueAsr<2>"; +@@ -772,7 +776,7 @@ def LU32I_D : Fmt1RI20<0x16000000, (outs GPR:$dst), + "$rd, $imm20">; + } + def LU52I_D : ALU_2RI12<0x03000000, simm12_lu52id>; +-def PCADDU18I : ALU_1RI20<0x1e000000, simm20>; ++def PCADDU18I : ALU_1RI20<0x1e000000, simm20_pcaddu18i>; + def MUL_D : ALU_3R<0x001d8000>; + def MULH_D : ALU_3R<0x001e0000>; + def MULH_DU : ALU_3R<0x001e8000>; +@@ -1324,7 +1328,7 @@ def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)), + (PseudoBRIND GPR:$rj, simm16_lsl2:$imm16)>; + + let isCall = 1, Defs = [R1] in +-def PseudoCALL : Pseudo<(outs), (ins simm26_symbol:$func)>; ++def PseudoCALL : Pseudo<(outs), (ins bare_symbol:$func)>; + + def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>; + def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>; +@@ 
-1344,7 +1348,7 @@ def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>, + PseudoInstExpansion<(JIRL R0, R1, 0)>; + + let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in +-def PseudoTAIL : Pseudo<(outs), (ins simm26_symbol:$dst)>; ++def PseudoTAIL : Pseudo<(outs), (ins bare_symbol:$dst)>; + + def : Pat<(loongarch_tail (iPTR tglobaladdr:$dst)), + (PseudoTAIL tglobaladdr:$dst)>; +@@ -1367,6 +1371,19 @@ def PseudoJIRL_TAIL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>, + PseudoInstExpansion<(JIRL R0, GPR:$rj, + simm16_lsl2:$imm16)>; + ++/// call36/taill36 macro instructions ++let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, isAsmParserOnly = 1, ++ Defs = [R1], Size = 8, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in ++def PseudoCALL36 : Pseudo<(outs), (ins bare_symbol:$dst), [], ++ "call36", "$dst">, ++ Requires<[IsLA64]>; ++let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3], ++ isCodeGenOnly = 0, isAsmParserOnly = 1, Size = 8, hasSideEffects = 0, ++ mayStore = 0, mayLoad = 0 in ++def PseudoTAIL36 : Pseudo<(outs), (ins GPR:$tmp, bare_symbol:$dst), [], ++ "tail36", "$tmp, $dst">, ++ Requires<[IsLA64]>; ++ + /// Load address (la*) macro instructions. + + // Define isCodeGenOnly = 0 to expose them to tablegened assembly parser. +diff --git a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp +index 5daa9481c907..98ad49f25e3f 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp +@@ -95,6 +95,9 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym, + case LoongArchII::MO_GD_PC_HI: + Kind = LoongArchMCExpr::VK_LoongArch_TLS_GD_PC_HI20; + break; ++ case LoongArchII::MO_CALL36: ++ Kind = LoongArchMCExpr::VK_LoongArch_CALL36; ++ break; + // TODO: Handle more target-flags. + } + +diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +index d0a4e9375048..0efc5e6ebb99 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +@@ -63,11 +63,11 @@ getEffectiveLoongArchCodeModel(const Triple &TT, + + switch (*CM) { + case CodeModel::Small: +- case CodeModel::Medium: + return *CM; ++ case CodeModel::Medium: + case CodeModel::Large: + if (!TT.isArch64Bit()) +- report_fatal_error("Large code model requires LA64"); ++ report_fatal_error("Medium/Large code model requires LA64"); + return *CM; + default: + report_fatal_error( +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h +index cee6dad1f095..0692cb92b694 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h +@@ -47,6 +47,7 @@ enum { + MO_IE_PC64_HI, + MO_LD_PC_HI, + MO_GD_PC_HI, ++ MO_CALL36 + // TODO: Add more flags. 
+ }; + } // end namespace LoongArchII +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +index e60b9c2cfd97..0a52380dd2cd 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +@@ -90,6 +90,8 @@ unsigned LoongArchELFObjectWriter::getRelocType(MCContext &Ctx, + return ELF::R_LARCH_TLS_LE64_LO20; + case LoongArch::fixup_loongarch_tls_le64_hi12: + return ELF::R_LARCH_TLS_LE64_HI12; ++ case LoongArch::fixup_loongarch_call36: ++ return ELF::R_LARCH_CALL36; + // TODO: Handle more fixup-kinds. + } + } +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h +index 78414408f21f..0d19d2b0fb1f 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h +@@ -111,6 +111,9 @@ enum Fixups { + fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX, + // Generate an R_LARCH_ALIGN which indicates the linker may fixup align here. + fixup_loongarch_align = FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN, ++ // 36-bit fixup corresponding to %call36(foo) for a pair instructions: ++ // pcaddu18i+jirl. ++ fixup_loongarch_call36 = FirstLiteralRelocationKind + ELF::R_LARCH_CALL36, + }; + } // end namespace LoongArch + } // end namespace llvm +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +index 09d92ac9aa3a..7c4fe9674d4e 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +@@ -241,6 +241,9 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, + case LoongArchMCExpr::VK_LoongArch_TLS_GD_HI20: + FixupKind = LoongArch::fixup_loongarch_tls_gd_hi20; + break; ++ case LoongArchMCExpr::VK_LoongArch_CALL36: ++ FixupKind = LoongArch::fixup_loongarch_call36; ++ break; + } + } else if (Kind == MCExpr::SymbolRef && + cast(Expr)->getKind() == +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp +index 82c992b1cc8c..8ca8876a19b9 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp +@@ -138,6 +138,8 @@ StringRef LoongArchMCExpr::getVariantKindName(VariantKind Kind) { + return "gd_pc_hi20"; + case VK_LoongArch_TLS_GD_HI20: + return "gd_hi20"; ++ case VK_LoongArch_CALL36: ++ return "call36"; + } + } + +@@ -180,6 +182,7 @@ LoongArchMCExpr::getVariantKindForName(StringRef name) { + .Case("ld_hi20", VK_LoongArch_TLS_LD_HI20) + .Case("gd_pc_hi20", VK_LoongArch_TLS_GD_PC_HI20) + .Case("gd_hi20", VK_LoongArch_TLS_GD_HI20) ++ .Case("call36", VK_LoongArch_CALL36) + .Default(VK_LoongArch_Invalid); + } + +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h +index 93251f824103..bd828116d7fa 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h +@@ -61,6 +61,7 @@ public: + VK_LoongArch_TLS_LD_HI20, + VK_LoongArch_TLS_GD_PC_HI20, + VK_LoongArch_TLS_GD_HI20, ++ VK_LoongArch_CALL36, + VK_LoongArch_Invalid // Must be the last 
item. + }; + +diff --git a/llvm/test/CodeGen/LoongArch/code-models.ll b/llvm/test/CodeGen/LoongArch/code-models.ll +index c610f645a06a..7c6f46d5e926 100644 +--- a/llvm/test/CodeGen/LoongArch/code-models.ll ++++ b/llvm/test/CodeGen/LoongArch/code-models.ll +@@ -23,8 +23,8 @@ define i32 @call_globaladdress(i32 %a) nounwind { + ; MEDIUM: # %bb.0: + ; MEDIUM-NEXT: addi.d $sp, $sp, -16 + ; MEDIUM-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +-; MEDIUM-NEXT: pcalau12i $ra, %pc_hi20(callee) +-; MEDIUM-NEXT: jirl $ra, $ra, %pc_lo12(callee) ++; MEDIUM-NEXT: pcaddu18i $ra, %call36(callee) ++; MEDIUM-NEXT: jirl $ra, $ra, 0 + ; MEDIUM-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload + ; MEDIUM-NEXT: addi.d $sp, $sp, 16 + ; MEDIUM-NEXT: ret +@@ -68,8 +68,8 @@ define void @call_external_sym(ptr %dst) { + ; MEDIUM-NEXT: .cfi_offset 1, -8 + ; MEDIUM-NEXT: ori $a2, $zero, 1000 + ; MEDIUM-NEXT: move $a1, $zero +-; MEDIUM-NEXT: pcalau12i $ra, %pc_hi20(memset) +-; MEDIUM-NEXT: jirl $ra, $ra, %pc_lo12(memset) ++; MEDIUM-NEXT: pcaddu18i $ra, %call36(memset) ++; MEDIUM-NEXT: jirl $ra, $ra, 0 + ; MEDIUM-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload + ; MEDIUM-NEXT: addi.d $sp, $sp, 16 + ; MEDIUM-NEXT: ret +@@ -105,8 +105,8 @@ define i32 @caller_tail(i32 %i) nounwind { + ; + ; MEDIUM-LABEL: caller_tail: + ; MEDIUM: # %bb.0: # %entry +-; MEDIUM-NEXT: pcalau12i $a1, %pc_hi20(callee_tail) +-; MEDIUM-NEXT: jirl $zero, $a1, %pc_lo12(callee_tail) ++; MEDIUM-NEXT: pcaddu18i $a1, %call36(callee_tail) ++; MEDIUM-NEXT: jr $a1 + ; + ; LARGE-LABEL: caller_tail: + ; LARGE: # %bb.0: # %entry +diff --git a/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s b/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s +index acddca9432a6..1c1c658ad440 100644 +--- a/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s ++++ b/llvm/test/MC/LoongArch/Basic/Integer/invalid64.s +@@ -65,7 +65,7 @@ addu16i.d $a0, $a0, 32768 + + ## simm20 + pcaddu18i $a0, 0x80000 +-# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-524288, 524287] ++# CHECK: :[[#@LINE-1]]:16: error: operand must be a symbol with modifier (e.g. %call36) or an integer in the range [-524288, 524287] + + ## simm20_lu32id + lu32i.d $a0, 0x80000 +diff --git a/llvm/test/MC/LoongArch/Macros/macros-call.s b/llvm/test/MC/LoongArch/Macros/macros-call.s +new file mode 100644 +index 000000000000..a648a3978038 +--- /dev/null ++++ b/llvm/test/MC/LoongArch/Macros/macros-call.s +@@ -0,0 +1,9 @@ ++# RUN: llvm-mc --triple=loongarch64 %s | FileCheck %s ++ ++call36 sym_call ++# CHECK: pcaddu18i $ra, %call36(sym_call) ++# CHECK-NEXT: jirl $ra, $ra, 0 ++ ++tail36 $t0, sym_tail ++# CHECK: pcaddu18i $t0, %call36(sym_tail) ++# CHECK-NEXT: jr $t0 +diff --git a/llvm/test/MC/LoongArch/Relocations/relocations.s b/llvm/test/MC/LoongArch/Relocations/relocations.s +index 042cc93470a1..bec71e103893 100644 +--- a/llvm/test/MC/LoongArch/Relocations/relocations.s ++++ b/llvm/test/MC/LoongArch/Relocations/relocations.s +@@ -218,3 +218,8 @@ lu12i.w $t1, %gd_hi20(foo) + # RELOC: R_LARCH_TLS_GD_HI20 foo 0x0 + # INSTR: lu12i.w $t1, %gd_hi20(foo) + # FIXUP: fixup A - offset: 0, value: %gd_hi20(foo), kind: FK_NONE ++ ++pcaddu18i $t1, %call36(foo) ++# RELOC: R_LARCH_CALL36 foo 0x0 ++# INSTR: pcaddu18i $t1, %call36(foo) ++# FIXUP: fixup A - offset: 0, value: %call36(foo), kind: FK_NONE +-- +2.20.1 + + +From d59688f326d8f915ffc5db80b40c9b99d9f95470 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Tue, 2 Jan 2024 10:57:40 +0800 +Subject: [PATCH 03/12] [LoongArch] Pre-commit test for #76555. 
NFC + +(cherry picked from commit 3d6fc35b9071009c5ef37f879a12982c6a54db60) +--- + .../LoongArch/psabi-restricted-scheduling.ll | 172 ++++++++++++++++++ + 1 file changed, 172 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll + +diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +new file mode 100644 +index 000000000000..150a935d7bf8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +@@ -0,0 +1,172 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --code-model=medium --post-RA-scheduler=0 < %s \ ++; RUN: | FileCheck %s --check-prefix=MEDIUM_NO_SCH ++; RUN: llc --mtriple=loongarch64 --code-model=medium --post-RA-scheduler=1 < %s \ ++; RUN: | FileCheck %s --check-prefix=MEDIUM_SCH ++; RUN: llc --mtriple=loongarch64 --code-model=large --post-RA-scheduler=0 < %s \ ++; RUN: | FileCheck %s --check-prefix=LARGE_NO_SCH ++; RUN: llc --mtriple=loongarch64 --code-model=large --post-RA-scheduler=1 < %s \ ++; RUN: | FileCheck %s --check-prefix=LARGE_SCH ++ ++;; FIXME: According to the description of the psABI v2.30, the code sequences ++;; of `PseudoLA*_LARGE` instruction and Medium code model's function call must ++;; be adjacent. ++ ++@g = dso_local global i64 zeroinitializer, align 4 ++@G = global i64 zeroinitializer, align 4 ++@gd = external thread_local global i64 ++@ld = external thread_local(localdynamic) global i64 ++@ie = external thread_local(initialexec) global i64 ++ ++declare ptr @bar(i64) ++ ++define void @foo() nounwind { ++; MEDIUM_NO_SCH-LABEL: foo: ++; MEDIUM_NO_SCH: # %bb.0: ++; MEDIUM_NO_SCH-NEXT: addi.d $sp, $sp, -16 ++; MEDIUM_NO_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ++; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) ++; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) ++; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, 0 ++; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) ++; MEDIUM_NO_SCH-NEXT: addi.d $a0, $a0, %pc_lo12(g) ++; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, 0 ++; MEDIUM_NO_SCH-NEXT: ori $a0, $zero, 1 ++; MEDIUM_NO_SCH-NEXT: pcaddu18i $ra, %call36(bar) ++; MEDIUM_NO_SCH-NEXT: jirl $ra, $ra, 0 ++; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) ++; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(gd) ++; MEDIUM_NO_SCH-NEXT: ldx.d $a0, $a0, $tp ++; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) ++; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(ld) ++; MEDIUM_NO_SCH-NEXT: ldx.d $a0, $a0, $tp ++; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) ++; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(ie) ++; MEDIUM_NO_SCH-NEXT: ldx.d $a0, $a0, $tp ++; MEDIUM_NO_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ++; MEDIUM_NO_SCH-NEXT: addi.d $sp, $sp, 16 ++; MEDIUM_NO_SCH-NEXT: ret ++; ++; MEDIUM_SCH-LABEL: foo: ++; MEDIUM_SCH: # %bb.0: ++; MEDIUM_SCH-NEXT: addi.d $sp, $sp, -16 ++; MEDIUM_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ++; MEDIUM_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) ++; MEDIUM_SCH-NEXT: pcaddu18i $ra, %call36(bar) ++; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) ++; MEDIUM_SCH-NEXT: ld.d $a0, $a0, 0 ++; MEDIUM_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) ++; MEDIUM_SCH-NEXT: addi.d $a0, $a0, %pc_lo12(g) ++; MEDIUM_SCH-NEXT: ld.d $a0, $a0, 0 ++; MEDIUM_SCH-NEXT: ori $a0, $zero, 1 ++; MEDIUM_SCH-NEXT: jirl $ra, $ra, 0 ++; MEDIUM_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) ++; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(gd) 
++; MEDIUM_SCH-NEXT: ldx.d $a0, $a0, $tp ++; MEDIUM_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) ++; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(ld) ++; MEDIUM_SCH-NEXT: ldx.d $a0, $a0, $tp ++; MEDIUM_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) ++; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(ie) ++; MEDIUM_SCH-NEXT: ldx.d $a0, $a0, $tp ++; MEDIUM_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ++; MEDIUM_SCH-NEXT: addi.d $sp, $sp, 16 ++; MEDIUM_SCH-NEXT: ret ++; ++; LARGE_NO_SCH-LABEL: foo: ++; LARGE_NO_SCH: # %bb.0: ++; LARGE_NO_SCH-NEXT: addi.d $sp, $sp, -16 ++; LARGE_NO_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ++; LARGE_NO_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) ++; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) ++; LARGE_NO_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(G) ++; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: ld.d $a0, $a0, 0 ++; LARGE_NO_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) ++; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(g) ++; LARGE_NO_SCH-NEXT: lu32i.d $a1, %pc64_lo20(g) ++; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) ++; LARGE_NO_SCH-NEXT: add.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: ld.d $a0, $a0, 0 ++; LARGE_NO_SCH-NEXT: ori $a0, $zero, 1 ++; LARGE_NO_SCH-NEXT: pcalau12i $a1, %got_pc_hi20(bar) ++; LARGE_NO_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar) ++; LARGE_NO_SCH-NEXT: lu32i.d $ra, %got64_pc_lo20(bar) ++; LARGE_NO_SCH-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(bar) ++; LARGE_NO_SCH-NEXT: ldx.d $ra, $ra, $a1 ++; LARGE_NO_SCH-NEXT: jirl $ra, $ra, 0 ++; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) ++; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(gd) ++; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(gd) ++; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(gd) ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp ++; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) ++; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ld) ++; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ld) ++; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ld) ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp ++; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) ++; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie) ++; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie) ++; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie) ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp ++; LARGE_NO_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ++; LARGE_NO_SCH-NEXT: addi.d $sp, $sp, 16 ++; LARGE_NO_SCH-NEXT: ret ++; ++; LARGE_SCH-LABEL: foo: ++; LARGE_SCH: # %bb.0: ++; LARGE_SCH-NEXT: addi.d $sp, $sp, -16 ++; LARGE_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ++; LARGE_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) ++; LARGE_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) ++; LARGE_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar) ++; LARGE_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(G) ++; LARGE_SCH-NEXT: lu32i.d $ra, %got64_pc_lo20(bar) ++; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) ++; LARGE_SCH-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(bar) ++; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(g) ++; LARGE_SCH-NEXT: lu32i.d $a1, %pc64_lo20(g) ++; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) ++; LARGE_SCH-NEXT: ld.d $a0, $a0, 0 ++; LARGE_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) ++; LARGE_SCH-NEXT: add.d $a0, $a1, $a0 ++; LARGE_SCH-NEXT: pcalau12i $a1, %got_pc_hi20(bar) ++; 
LARGE_SCH-NEXT: ld.d $a0, $a0, 0 ++; LARGE_SCH-NEXT: ldx.d $ra, $ra, $a1 ++; LARGE_SCH-NEXT: ori $a0, $zero, 1 ++; LARGE_SCH-NEXT: jirl $ra, $ra, 0 ++; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(gd) ++; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) ++; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(gd) ++; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(gd) ++; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ld) ++; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ld) ++; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ld) ++; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp ++; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) ++; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie) ++; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie) ++; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie) ++; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp ++; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) ++; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp ++; LARGE_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ++; LARGE_SCH-NEXT: addi.d $sp, $sp, 16 ++; LARGE_SCH-NEXT: ret ++ %V = load volatile i64, ptr @G ++ %v = load volatile i64, ptr @g ++ call void @bar(i64 1) ++ %v_gd = load volatile i64, ptr @gd ++ %v_ld = load volatile i64, ptr @ld ++ %v_ie = load volatile i64, ptr @ie ++ ret void ++} +-- +2.20.1 + + +From 1248440ab618fcffada7fa29eed71bc04945c3ec Mon Sep 17 00:00:00 2001 +From: Weining Lu +Date: Tue, 25 Jun 2024 09:52:17 +0800 +Subject: [PATCH 04/12] [LoongArch][test] Remove the FIXME in + psabi-restricted-scheduling.ll which has been addressed by #76555 + +(cherry picked from commit 7ea63b9db4198688873036f3b0b81f9124076f7a) +--- + llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll | 4 ---- + 1 file changed, 4 deletions(-) + +diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +index 150a935d7bf8..a515939b9c2b 100644 +--- a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll ++++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +@@ -8,10 +8,6 @@ + ; RUN: llc --mtriple=loongarch64 --code-model=large --post-RA-scheduler=1 < %s \ + ; RUN: | FileCheck %s --check-prefix=LARGE_SCH + +-;; FIXME: According to the description of the psABI v2.30, the code sequences +-;; of `PseudoLA*_LARGE` instruction and Medium code model's function call must +-;; be adjacent. +- + @g = dso_local global i64 zeroinitializer, align 4 + @G = global i64 zeroinitializer, align 4 + @gd = external thread_local global i64 +-- +2.20.1 + + +From 0e86ae628414dac6d7ef2eaccc8655d790595f9f Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Tue, 2 Jan 2024 10:57:15 +0800 +Subject: [PATCH 05/12] [LoongArch] Reimplement the expansion of + PseudoLA*_LARGE instructions (#76555) + +According to the description of the psABI v2.30: +https://github.com/loongson/la-abi-specs/releases/tag/v2.30, moved the +expansion of relevant pseudo-instructions from +`LoongArchPreRAExpandPseudo` pass to `LoongArchExpandPseudo` pass, to +ensure that the code sequences of `PseudoLA*_LARGE` instructions and +Medium code model's function call are not scheduled. 
+ +(cherry picked from commit c56a5e895a96fec4292e9333d998cfa88770432a) +--- + .../LoongArch/LoongArchExpandPseudoInsts.cpp | 519 +++++++++--------- + .../LoongArch/LoongArchISelLowering.cpp | 24 +- + .../Target/LoongArch/LoongArchISelLowering.h | 4 + + .../Target/LoongArch/LoongArchInstrInfo.td | 83 ++- + llvm/test/CodeGen/LoongArch/code-models.ll | 36 +- + llvm/test/CodeGen/LoongArch/expand-call.ll | 2 +- + llvm/test/CodeGen/LoongArch/global-address.ll | 32 +- + .../LoongArch/psabi-restricted-scheduling.ll | 102 ++-- + llvm/test/CodeGen/LoongArch/tls-models.ll | 68 +-- + 9 files changed, 487 insertions(+), 383 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +index 8eda2dcc1633..f977f176066a 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +@@ -62,43 +62,24 @@ private: + MachineBasicBlock::iterator &NextMBBI, + unsigned FlagsHi, unsigned SecondOpcode, + unsigned FlagsLo); +- bool expandLargeAddressLoad(MachineBasicBlock &MBB, +- MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, +- unsigned LastOpcode, unsigned IdentifyingMO); +- bool expandLargeAddressLoad(MachineBasicBlock &MBB, +- MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, +- unsigned LastOpcode, unsigned IdentifyingMO, +- const MachineOperand &Symbol, Register DestReg, +- bool EraseFromParent); + bool expandLoadAddressPcrel(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, +- bool Large = false); ++ MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressGot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, +- bool Large = false); ++ MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressTLSLE(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressTLSIE(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, +- bool Large = false); ++ MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressTLSLD(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, +- bool Large = false); ++ MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressTLSGD(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, +- bool Large = false); +- bool expandFunctionCALL(MachineBasicBlock &MBB, +- MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, +- bool IsTailCall); ++ MachineBasicBlock::iterator &NextMBBI); + }; + + char LoongArchPreRAExpandPseudo::ID = 0; +@@ -131,30 +112,16 @@ bool LoongArchPreRAExpandPseudo::expandMI( + switch (MBBI->getOpcode()) { + case LoongArch::PseudoLA_PCREL: + return expandLoadAddressPcrel(MBB, MBBI, NextMBBI); +- case LoongArch::PseudoLA_PCREL_LARGE: +- return expandLoadAddressPcrel(MBB, MBBI, NextMBBI, /*Large=*/true); + case LoongArch::PseudoLA_GOT: + return expandLoadAddressGot(MBB, MBBI, NextMBBI); +- case LoongArch::PseudoLA_GOT_LARGE: +- return expandLoadAddressGot(MBB, MBBI, NextMBBI, /*Large=*/true); + case LoongArch::PseudoLA_TLS_LE: + return expandLoadAddressTLSLE(MBB, MBBI, NextMBBI); + case LoongArch::PseudoLA_TLS_IE: + return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI); +- case LoongArch::PseudoLA_TLS_IE_LARGE: +- 
return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI, /*Large=*/true); + case LoongArch::PseudoLA_TLS_LD: + return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI); +- case LoongArch::PseudoLA_TLS_LD_LARGE: +- return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI, /*Large=*/true); + case LoongArch::PseudoLA_TLS_GD: + return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI); +- case LoongArch::PseudoLA_TLS_GD_LARGE: +- return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI, /*Large=*/true); +- case LoongArch::PseudoCALL: +- return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false); +- case LoongArch::PseudoTAIL: +- return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true); + } + return false; + } +@@ -187,118 +154,9 @@ bool LoongArchPreRAExpandPseudo::expandPcalau12iInstPair( + return true; + } + +-bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad( +- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, +- unsigned IdentifyingMO) { +- MachineInstr &MI = *MBBI; +- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO, +- MI.getOperand(2), MI.getOperand(0).getReg(), +- true); +-} +- +-bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad( +- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, +- unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg, +- bool EraseFromParent) { +- // Code Sequence: +- // +- // Part1: pcalau12i $scratch, %MO1(sym) +- // Part0: addi.d $dest, $zero, %MO0(sym) +- // Part2: lu32i.d $dest, %MO2(sym) +- // Part3: lu52i.d $dest, $dest, %MO3(sym) +- // Fin: LastOpcode $dest, $dest, $scratch +- +- unsigned MO0, MO1, MO2, MO3; +- switch (IdentifyingMO) { +- default: +- llvm_unreachable("unsupported identifying MO"); +- case LoongArchII::MO_PCREL_LO: +- MO0 = IdentifyingMO; +- MO1 = LoongArchII::MO_PCREL_HI; +- MO2 = LoongArchII::MO_PCREL64_LO; +- MO3 = LoongArchII::MO_PCREL64_HI; +- break; +- case LoongArchII::MO_GOT_PC_HI: +- case LoongArchII::MO_LD_PC_HI: +- case LoongArchII::MO_GD_PC_HI: +- // These cases relocate just like the GOT case, except for Part1. +- MO0 = LoongArchII::MO_GOT_PC_LO; +- MO1 = IdentifyingMO; +- MO2 = LoongArchII::MO_GOT_PC64_LO; +- MO3 = LoongArchII::MO_GOT_PC64_HI; +- break; +- case LoongArchII::MO_IE_PC_LO: +- MO0 = IdentifyingMO; +- MO1 = LoongArchII::MO_IE_PC_HI; +- MO2 = LoongArchII::MO_IE_PC64_LO; +- MO3 = LoongArchII::MO_IE_PC64_HI; +- break; +- } +- +- MachineFunction *MF = MBB.getParent(); +- MachineInstr &MI = *MBBI; +- DebugLoc DL = MI.getDebugLoc(); +- +- assert(MF->getSubtarget().is64Bit() && +- "Large code model requires LA64"); +- +- Register TmpPart1 = +- MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass); +- Register TmpPart0 = +- DestReg.isVirtual() +- ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) +- : DestReg; +- Register TmpParts02 = +- DestReg.isVirtual() +- ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) +- : DestReg; +- Register TmpParts023 = +- DestReg.isVirtual() +- ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) +- : DestReg; +- +- auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), TmpPart1); +- auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), TmpPart0) +- .addReg(LoongArch::R0); +- auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), TmpParts02) +- // "rj" is needed due to InstrInfo pattern requirement. 
+- .addReg(TmpPart0, RegState::Kill); +- auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), TmpParts023) +- .addReg(TmpParts02, RegState::Kill); +- BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg) +- .addReg(TmpParts023) +- .addReg(TmpPart1, RegState::Kill); +- +- if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) { +- const char *SymName = Symbol.getSymbolName(); +- Part0.addExternalSymbol(SymName, MO0); +- Part1.addExternalSymbol(SymName, MO1); +- Part2.addExternalSymbol(SymName, MO2); +- Part3.addExternalSymbol(SymName, MO3); +- } else { +- Part0.addDisp(Symbol, 0, MO0); +- Part1.addDisp(Symbol, 0, MO1); +- Part2.addDisp(Symbol, 0, MO2); +- Part3.addDisp(Symbol, 0, MO3); +- } +- +- if (EraseFromParent) +- MI.eraseFromParent(); +- +- return true; +-} +- + bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, bool Large) { +- if (Large) +- // Emit the 5-insn large address load sequence with the `%pc` family of +- // relocs. +- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, +- LoongArchII::MO_PCREL_LO); +- ++ MachineBasicBlock::iterator &NextMBBI) { + // Code Sequence: + // pcalau12i $rd, %pc_hi20(sym) + // addi.w/d $rd, $rd, %pc_lo12(sym) +@@ -311,13 +169,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel( + + bool LoongArchPreRAExpandPseudo::expandLoadAddressGot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, bool Large) { +- if (Large) +- // Emit the 5-insn large address load sequence with the `%got_pc` family +- // of relocs, loading the result from GOT with `ldx.d` in the end. +- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, +- LoongArchII::MO_GOT_PC_HI); +- ++ MachineBasicBlock::iterator &NextMBBI) { + // Code Sequence: + // pcalau12i $rd, %got_pc_hi20(sym) + // ld.w/d $rd, $rd, %got_pc_lo12(sym) +@@ -378,13 +230,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLE( + + bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, bool Large) { +- if (Large) +- // Emit the 5-insn large address load sequence with the `%ie_pc` family +- // of relocs, loading the result with `ldx.d` in the end. +- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, +- LoongArchII::MO_IE_PC_LO); +- ++ MachineBasicBlock::iterator &NextMBBI) { + // Code Sequence: + // pcalau12i $rd, %ie_pc_hi20(sym) + // ld.w/d $rd, $rd, %ie_pc_lo12(sym) +@@ -397,13 +243,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE( + + bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, bool Large) { +- if (Large) +- // Emit the 5-insn large address load sequence with the `%got_pc` family +- // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`. 
+- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, +- LoongArchII::MO_LD_PC_HI); +- ++ MachineBasicBlock::iterator &NextMBBI) { + // Code Sequence: + // pcalau12i $rd, %ld_pc_hi20(sym) + // addi.w/d $rd, $rd, %got_pc_lo12(sym) +@@ -416,13 +256,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD( + + bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, bool Large) { +- if (Large) +- // Emit the 5-insn large address load sequence with the `%got_pc` family +- // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`. +- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, +- LoongArchII::MO_GD_PC_HI); +- ++ MachineBasicBlock::iterator &NextMBBI) { + // Code Sequence: + // pcalau12i $rd, %gd_pc_hi20(sym) + // addi.w/d $rd, $rd, %got_pc_lo12(sym) +@@ -433,85 +267,6 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD( + SecondOpcode, LoongArchII::MO_GOT_PC_LO); + } + +-bool LoongArchPreRAExpandPseudo::expandFunctionCALL( +- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +- MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) { +- MachineFunction *MF = MBB.getParent(); +- MachineInstr &MI = *MBBI; +- DebugLoc DL = MI.getDebugLoc(); +- const MachineOperand &Func = MI.getOperand(0); +- MachineInstrBuilder CALL; +- unsigned Opcode; +- +- switch (MF->getTarget().getCodeModel()) { +- default: +- report_fatal_error("Unsupported code model"); +- break; +- case CodeModel::Small: { +- // CALL: +- // bl func +- // TAIL: +- // b func +- Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL; +- CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func); +- break; +- } +- case CodeModel::Medium: { +- // CALL: +- // pcaddu18i $ra, %call36(func) +- // jirl $ra, $ra, 0 +- // TAIL: +- // pcaddu18i $scratch, %call36(func) +- // jirl $r0, $scratch, 0 +- Opcode = +- IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; +- Register ScratchReg = +- IsTailCall +- ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) +- : LoongArch::R1; +- MachineInstrBuilder MIB = +- BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg); +- +- CALL = +- BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0); +- +- if (Func.isSymbol()) +- MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36); +- else +- MIB.addDisp(Func, 0, LoongArchII::MO_CALL36); +- break; +- } +- case CodeModel::Large: { +- // Emit the 5-insn large address load sequence, either directly or +- // indirectly in case of going through the GOT, then JIRL_TAIL or +- // JIRL_CALL to $addr. +- Opcode = +- IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; +- Register AddrReg = +- IsTailCall +- ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) +- : LoongArch::R1; +- +- bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal(); +- unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO; +- unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D; +- expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg, +- false); +- CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0); +- break; +- } +- } +- +- // Transfer implicit operands. +- CALL.copyImplicitOps(MI); +- +- // Transfer MI flags. 
+- CALL.setMIFlags(MI.getFlags()); +- +- MI.eraseFromParent(); +- return true; +-} +- + class LoongArchExpandPseudo : public MachineFunctionPass { + public: + const LoongArchInstrInfo *TII; +@@ -533,6 +288,35 @@ private: + MachineBasicBlock::iterator &NextMBBI); + bool expandCopyCFR(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); ++ bool expandLargeAddressLoad(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI, ++ unsigned LastOpcode, unsigned IdentifyingMO); ++ bool expandLargeAddressLoad(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI, ++ unsigned LastOpcode, unsigned IdentifyingMO, ++ const MachineOperand &Symbol, Register DestReg, ++ bool EraseFromParent); ++ bool expandLoadAddressPcrelLarge(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI); ++ bool expandLoadAddressGotLarge(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI); ++ bool expandLoadAddressTLSIELarge(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI); ++ bool expandLoadAddressTLSLDLarge(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI); ++ bool expandLoadAddressTLSGDLarge(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI); ++ bool expandFunctionCALL(MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI, ++ bool IsTailCall); + }; + + char LoongArchExpandPseudo::ID = 0; +@@ -567,6 +351,24 @@ bool LoongArchExpandPseudo::expandMI(MachineBasicBlock &MBB, + switch (MBBI->getOpcode()) { + case LoongArch::PseudoCopyCFR: + return expandCopyCFR(MBB, MBBI, NextMBBI); ++ case LoongArch::PseudoLA_PCREL_LARGE: ++ return expandLoadAddressPcrelLarge(MBB, MBBI, NextMBBI); ++ case LoongArch::PseudoLA_GOT_LARGE: ++ return expandLoadAddressGotLarge(MBB, MBBI, NextMBBI); ++ case LoongArch::PseudoLA_TLS_IE_LARGE: ++ return expandLoadAddressTLSIELarge(MBB, MBBI, NextMBBI); ++ case LoongArch::PseudoLA_TLS_LD_LARGE: ++ return expandLoadAddressTLSLDLarge(MBB, MBBI, NextMBBI); ++ case LoongArch::PseudoLA_TLS_GD_LARGE: ++ return expandLoadAddressTLSGDLarge(MBB, MBBI, NextMBBI); ++ case LoongArch::PseudoCALL: ++ case LoongArch::PseudoCALL_MEDIUM: ++ case LoongArch::PseudoCALL_LARGE: ++ return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false); ++ case LoongArch::PseudoTAIL: ++ case LoongArch::PseudoTAIL_MEDIUM: ++ case LoongArch::PseudoTAIL_LARGE: ++ return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true); + } + + return false; +@@ -625,6 +427,213 @@ bool LoongArchExpandPseudo::expandCopyCFR( + return true; + } + ++bool LoongArchExpandPseudo::expandLargeAddressLoad( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, ++ unsigned IdentifyingMO) { ++ MachineInstr &MI = *MBBI; ++ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO, ++ MI.getOperand(2), MI.getOperand(0).getReg(), ++ true); ++} ++ ++bool LoongArchExpandPseudo::expandLargeAddressLoad( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, ++ unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg, ++ bool EraseFromParent) { ++ // Code 
Sequence: ++ // ++ // Part1: pcalau12i $dst, %MO1(sym) ++ // Part0: addi.d $t8, $zero, %MO0(sym) ++ // Part2: lu32i.d $t8, %MO2(sym) ++ // Part3: lu52i.d $t8, $t8, %MO3(sym) ++ // Fin: LastOpcode $dst, $t8, $dst ++ ++ unsigned MO0, MO1, MO2, MO3; ++ switch (IdentifyingMO) { ++ default: ++ llvm_unreachable("unsupported identifying MO"); ++ case LoongArchII::MO_PCREL_LO: ++ MO0 = IdentifyingMO; ++ MO1 = LoongArchII::MO_PCREL_HI; ++ MO2 = LoongArchII::MO_PCREL64_LO; ++ MO3 = LoongArchII::MO_PCREL64_HI; ++ break; ++ case LoongArchII::MO_GOT_PC_HI: ++ case LoongArchII::MO_LD_PC_HI: ++ case LoongArchII::MO_GD_PC_HI: ++ // These cases relocate just like the GOT case, except for Part1. ++ MO0 = LoongArchII::MO_GOT_PC_LO; ++ MO1 = IdentifyingMO; ++ MO2 = LoongArchII::MO_GOT_PC64_LO; ++ MO3 = LoongArchII::MO_GOT_PC64_HI; ++ break; ++ case LoongArchII::MO_IE_PC_LO: ++ MO0 = IdentifyingMO; ++ MO1 = LoongArchII::MO_IE_PC_HI; ++ MO2 = LoongArchII::MO_IE_PC64_LO; ++ MO3 = LoongArchII::MO_IE_PC64_HI; ++ break; ++ } ++ ++ MachineFunction *MF = MBB.getParent(); ++ MachineInstr &MI = *MBBI; ++ DebugLoc DL = MI.getDebugLoc(); ++ Register ScratchReg = LoongArch::R20; // $t8 ++ ++ assert(MF->getSubtarget().is64Bit() && ++ "Large code model requires LA64"); ++ ++ auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), DestReg); ++ auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), ScratchReg) ++ .addReg(LoongArch::R0); ++ auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), ScratchReg) ++ // "rj" is needed due to InstrInfo pattern requirement. ++ .addReg(ScratchReg); ++ auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), ScratchReg) ++ .addReg(ScratchReg); ++ BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg) ++ .addReg(ScratchReg) ++ .addReg(DestReg); ++ ++ if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) { ++ const char *SymName = Symbol.getSymbolName(); ++ Part0.addExternalSymbol(SymName, MO0); ++ Part1.addExternalSymbol(SymName, MO1); ++ Part2.addExternalSymbol(SymName, MO2); ++ Part3.addExternalSymbol(SymName, MO3); ++ } else { ++ Part0.addDisp(Symbol, 0, MO0); ++ Part1.addDisp(Symbol, 0, MO1); ++ Part2.addDisp(Symbol, 0, MO2); ++ Part3.addDisp(Symbol, 0, MO3); ++ } ++ ++ if (EraseFromParent) ++ MI.eraseFromParent(); ++ ++ return true; ++} ++ ++bool LoongArchExpandPseudo::expandLoadAddressPcrelLarge( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI) { ++ // Emit the 5-insn large address load sequence with the `%pc` family of ++ // relocs. ++ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, ++ LoongArchII::MO_PCREL_LO); ++} ++ ++bool LoongArchExpandPseudo::expandLoadAddressGotLarge( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI) { ++ // Emit the 5-insn large address load sequence with the `%got_pc` family ++ // of relocs, loading the result from GOT with `ldx.d` in the end. ++ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, ++ LoongArchII::MO_GOT_PC_HI); ++} ++ ++bool LoongArchExpandPseudo::expandLoadAddressTLSIELarge( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI) { ++ // Emit the 5-insn large address load sequence with the `%ie_pc` family ++ // of relocs, loading the result with `ldx.d` in the end. 
++ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, ++ LoongArchII::MO_IE_PC_LO); ++} ++ ++bool LoongArchExpandPseudo::expandLoadAddressTLSLDLarge( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI) { ++ // Emit the 5-insn large address load sequence with the `%got_pc` family ++ // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`. ++ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, ++ LoongArchII::MO_LD_PC_HI); ++} ++ ++bool LoongArchExpandPseudo::expandLoadAddressTLSGDLarge( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI) { ++ // Emit the 5-insn large address load sequence with the `%got_pc` family ++ // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`. ++ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, ++ LoongArchII::MO_GD_PC_HI); ++} ++ ++bool LoongArchExpandPseudo::expandFunctionCALL( ++ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, ++ MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) { ++ MachineFunction *MF = MBB.getParent(); ++ MachineInstr &MI = *MBBI; ++ DebugLoc DL = MI.getDebugLoc(); ++ const MachineOperand &Func = MI.getOperand(0); ++ MachineInstrBuilder CALL; ++ unsigned Opcode; ++ ++ switch (MF->getTarget().getCodeModel()) { ++ default: ++ report_fatal_error("Unsupported code model"); ++ break; ++ case CodeModel::Small: { ++ // CALL: ++ // bl func ++ // TAIL: ++ // b func ++ Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL; ++ CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func); ++ break; ++ } ++ case CodeModel::Medium: { ++ // CALL: ++ // pcaddu18i $ra, %call36(func) ++ // jirl $ra, $ra, 0 ++ // TAIL: ++ // pcaddu18i $t8, %call36(func) ++ // jr $t8 ++ Opcode = ++ IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; ++ Register ScratchReg = IsTailCall ? LoongArch::R20 : LoongArch::R1; ++ MachineInstrBuilder MIB = ++ BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg); ++ ++ CALL = ++ BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0); ++ ++ if (Func.isSymbol()) ++ MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36); ++ else ++ MIB.addDisp(Func, 0, LoongArchII::MO_CALL36); ++ break; ++ } ++ case CodeModel::Large: { ++ // Emit the 5-insn large address load sequence, either directly or ++ // indirectly in case of going through the GOT, then JIRL_TAIL or ++ // JIRL_CALL to $addr. ++ Opcode = ++ IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; ++ Register AddrReg = IsTailCall ? LoongArch::R19 : LoongArch::R1; ++ ++ bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal(); ++ unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO; ++ unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D; ++ expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg, ++ false); ++ CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0); ++ break; ++ } ++ } ++ ++ // Transfer implicit operands. ++ CALL.copyImplicitOps(MI); ++ ++ // Transfer MI flags. 
++ CALL.setMIFlags(MI.getFlags()); ++ ++ MI.eraseFromParent(); ++ return true; ++} ++ + } // end namespace + + INITIALIZE_PASS(LoongArchPreRAExpandPseudo, "loongarch-prera-expand-pseudo", +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index 4fc2b4709840..df1b17649b7d 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -3389,8 +3389,12 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { + + // TODO: Add more target-dependent nodes later. + NODE_NAME_CASE(CALL) ++ NODE_NAME_CASE(CALL_MEDIUM) ++ NODE_NAME_CASE(CALL_LARGE) + NODE_NAME_CASE(RET) + NODE_NAME_CASE(TAIL) ++ NODE_NAME_CASE(TAIL_MEDIUM) ++ NODE_NAME_CASE(TAIL_LARGE) + NODE_NAME_CASE(SLL_W) + NODE_NAME_CASE(SRA_W) + NODE_NAME_CASE(SRL_W) +@@ -4248,15 +4252,31 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, + + // Emit the call. + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); ++ unsigned Op; ++ switch (DAG.getTarget().getCodeModel()) { ++ default: ++ report_fatal_error("Unsupported code model"); ++ case CodeModel::Small: ++ Op = IsTailCall ? LoongArchISD::TAIL : LoongArchISD::CALL; ++ break; ++ case CodeModel::Medium: ++ assert(Subtarget.is64Bit() && "Medium code model requires LA64"); ++ Op = IsTailCall ? LoongArchISD::TAIL_MEDIUM : LoongArchISD::CALL_MEDIUM; ++ break; ++ case CodeModel::Large: ++ assert(Subtarget.is64Bit() && "Large code model requires LA64"); ++ Op = IsTailCall ? LoongArchISD::TAIL_LARGE : LoongArchISD::CALL_LARGE; ++ break; ++ } + + if (IsTailCall) { + MF.getFrameInfo().setHasTailCall(); +- SDValue Ret = DAG.getNode(LoongArchISD::TAIL, DL, NodeTys, Ops); ++ SDValue Ret = DAG.getNode(Op, DL, NodeTys, Ops); + DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); + return Ret; + } + +- Chain = DAG.getNode(LoongArchISD::CALL, DL, NodeTys, Ops); ++ Chain = DAG.getNode(Op, DL, NodeTys, Ops); + DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); + Glue = Chain.getValue(1); + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +index 2c9826a13237..a2ed149f4bb7 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +@@ -28,8 +28,12 @@ enum NodeType : unsigned { + + // TODO: add more LoongArchISDs + CALL, ++ CALL_MEDIUM, ++ CALL_LARGE, + RET, + TAIL, ++ TAIL_MEDIUM, ++ TAIL_LARGE, + + // 32-bit shifts, directly matching the semantics of the named LoongArch + // instructions. 
+diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +index 67de5f7afd78..ecd0c2b71b85 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +@@ -69,6 +69,18 @@ def loongarch_ret : SDNode<"LoongArchISD::RET", SDTNone, + def loongarch_tail : SDNode<"LoongArchISD::TAIL", SDT_LoongArchCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; ++def loongarch_call_medium : SDNode<"LoongArchISD::CALL_MEDIUM", SDT_LoongArchCall, ++ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, ++ SDNPVariadic]>; ++def loongarch_tail_medium : SDNode<"LoongArchISD::TAIL_MEDIUM", SDT_LoongArchCall, ++ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, ++ SDNPVariadic]>; ++def loongarch_call_large : SDNode<"LoongArchISD::CALL_LARGE", SDT_LoongArchCall, ++ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, ++ SDNPVariadic]>; ++def loongarch_tail_large : SDNode<"LoongArchISD::TAIL_LARGE", SDT_LoongArchCall, ++ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, ++ SDNPVariadic]>; + def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>; + def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>; + def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>; +@@ -1327,16 +1339,43 @@ def : Pat<(brind GPR:$rj), (PseudoBRIND GPR:$rj, 0)>; + def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)), + (PseudoBRIND GPR:$rj, simm16_lsl2:$imm16)>; + ++// Function call with 'Small' code model. + let isCall = 1, Defs = [R1] in + def PseudoCALL : Pseudo<(outs), (ins bare_symbol:$func)>; + + def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>; + def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>; + ++// Function call with 'Medium' code model. ++let isCall = 1, Defs = [R1, R20], Size = 8 in ++def PseudoCALL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$func)>; ++ ++let Predicates = [IsLA64] in { ++def : Pat<(loongarch_call_medium tglobaladdr:$func), ++ (PseudoCALL_MEDIUM tglobaladdr:$func)>; ++def : Pat<(loongarch_call_medium texternalsym:$func), ++ (PseudoCALL_MEDIUM texternalsym:$func)>; ++} // Predicates = [IsLA64] ++ ++// Function call with 'Large' code model. ++let isCall = 1, Defs = [R1, R20], Size = 24 in ++def PseudoCALL_LARGE: Pseudo<(outs), (ins bare_symbol:$func)>; ++ ++let Predicates = [IsLA64] in { ++def : Pat<(loongarch_call_large tglobaladdr:$func), ++ (PseudoCALL_LARGE tglobaladdr:$func)>; ++def : Pat<(loongarch_call_large texternalsym:$func), ++ (PseudoCALL_LARGE texternalsym:$func)>; ++} // Predicates = [IsLA64] ++ + let isCall = 1, Defs = [R1] in + def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rj), + [(loongarch_call GPR:$rj)]>, + PseudoInstExpansion<(JIRL R1, GPR:$rj, 0)>; ++let Predicates = [IsLA64] in { ++def : Pat<(loongarch_call_medium GPR:$rj), (PseudoCALLIndirect GPR:$rj)>; ++def : Pat<(loongarch_call_large GPR:$rj), (PseudoCALLIndirect GPR:$rj)>; ++} + + let isCall = 1, hasSideEffects = 0, mayStore = 0, mayLoad = 0, Defs = [R1] in + def PseudoJIRL_CALL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>, +@@ -1347,6 +1386,7 @@ let isBarrier = 1, isReturn = 1, isTerminator = 1 in + def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>, + PseudoInstExpansion<(JIRL R0, R1, 0)>; + ++// Tail call with 'Small' code model. 
+ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in + def PseudoTAIL : Pseudo<(outs), (ins bare_symbol:$dst)>; + +@@ -1355,10 +1395,38 @@ def : Pat<(loongarch_tail (iPTR tglobaladdr:$dst)), + def : Pat<(loongarch_tail (iPTR texternalsym:$dst)), + (PseudoTAIL texternalsym:$dst)>; + ++// Tail call with 'Medium' code model. ++let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, ++ Uses = [R3], Defs = [R20], Size = 8 in ++def PseudoTAIL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$dst)>; ++ ++let Predicates = [IsLA64] in { ++def : Pat<(loongarch_tail_medium (iPTR tglobaladdr:$dst)), ++ (PseudoTAIL_MEDIUM tglobaladdr:$dst)>; ++def : Pat<(loongarch_tail_medium (iPTR texternalsym:$dst)), ++ (PseudoTAIL_MEDIUM texternalsym:$dst)>; ++} // Predicates = [IsLA64] ++ ++// Tail call with 'Large' code model. ++let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, ++ Uses = [R3], Defs = [R19, R20], Size = 24 in ++def PseudoTAIL_LARGE : Pseudo<(outs), (ins bare_symbol:$dst)>; ++ ++let Predicates = [IsLA64] in { ++def : Pat<(loongarch_tail_large (iPTR tglobaladdr:$dst)), ++ (PseudoTAIL_LARGE tglobaladdr:$dst)>; ++def : Pat<(loongarch_tail_large (iPTR texternalsym:$dst)), ++ (PseudoTAIL_LARGE texternalsym:$dst)>; ++} // Predicates = [IsLA64] ++ + let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in + def PseudoTAILIndirect : Pseudo<(outs), (ins GPRT:$rj), + [(loongarch_tail GPRT:$rj)]>, + PseudoInstExpansion<(JIRL R0, GPR:$rj, 0)>; ++let Predicates = [IsLA64] in { ++def : Pat<(loongarch_tail_medium GPR:$rj), (PseudoTAILIndirect GPR:$rj)>; ++def : Pat<(loongarch_tail_large GPR:$rj), (PseudoTAILIndirect GPR:$rj)>; ++} + + let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + hasSideEffects = 0, mayStore = 0, mayLoad = 0, Uses = [R3] in +@@ -1396,6 +1464,7 @@ def PseudoLA_ABS_LARGE : Pseudo<(outs GPR:$dst), + "la.abs", "$dst, $src">; + def PseudoLA_PCREL : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], + "la.pcrel", "$dst, $src">; ++let Defs = [R20], Size = 20 in + def PseudoLA_PCREL_LARGE : Pseudo<(outs GPR:$dst), + (ins GPR:$tmp, bare_symbol:$src), [], + "la.pcrel", "$dst, $tmp, $src">, +@@ -1407,28 +1476,30 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0, + isAsmParserOnly = 1 in { + def PseudoLA_GOT : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], + "la.got", "$dst, $src">; ++def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], ++ "la.tls.ie", "$dst, $src">; ++def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], ++ "la.tls.ld", "$dst, $src">; ++def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], ++ "la.tls.gd", "$dst, $src">; ++let Defs = [R20], Size = 20 in { + def PseudoLA_GOT_LARGE : Pseudo<(outs GPR:$dst), + (ins GPR:$tmp, bare_symbol:$src), [], + "la.got", "$dst, $tmp, $src">, + Requires<[IsLA64]>; +-def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], +- "la.tls.ie", "$dst, $src">; + def PseudoLA_TLS_IE_LARGE : Pseudo<(outs GPR:$dst), + (ins GPR:$tmp, bare_symbol:$src), [], + "la.tls.ie", "$dst, $tmp, $src">, + Requires<[IsLA64]>; +-def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], +- "la.tls.ld", "$dst, $src">; + def PseudoLA_TLS_LD_LARGE : Pseudo<(outs GPR:$dst), + (ins GPR:$tmp, bare_symbol:$src), [], + "la.tls.ld", "$dst, $tmp, $src">, + Requires<[IsLA64]>; +-def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], +- "la.tls.gd", "$dst, $src">; + def 
PseudoLA_TLS_GD_LARGE : Pseudo<(outs GPR:$dst), + (ins GPR:$tmp, bare_symbol:$src), [], + "la.tls.gd", "$dst, $tmp, $src">, + Requires<[IsLA64]>; ++} // Defs = [R20], Size = 20 + } + + // Load address inst alias: "la", "la.global" and "la.local". +diff --git a/llvm/test/CodeGen/LoongArch/code-models.ll b/llvm/test/CodeGen/LoongArch/code-models.ll +index 7c6f46d5e926..f93c31670928 100644 +--- a/llvm/test/CodeGen/LoongArch/code-models.ll ++++ b/llvm/test/CodeGen/LoongArch/code-models.ll +@@ -33,11 +33,11 @@ define i32 @call_globaladdress(i32 %a) nounwind { + ; LARGE: # %bb.0: + ; LARGE-NEXT: addi.d $sp, $sp, -16 + ; LARGE-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +-; LARGE-NEXT: pcalau12i $a1, %got_pc_hi20(callee) +-; LARGE-NEXT: addi.d $ra, $zero, %got_pc_lo12(callee) +-; LARGE-NEXT: lu32i.d $ra, %got64_pc_lo20(callee) +-; LARGE-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(callee) +-; LARGE-NEXT: ldx.d $ra, $ra, $a1 ++; LARGE-NEXT: pcalau12i $ra, %got_pc_hi20(callee) ++; LARGE-NEXT: addi.d $t8, $zero, %got_pc_lo12(callee) ++; LARGE-NEXT: lu32i.d $t8, %got64_pc_lo20(callee) ++; LARGE-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(callee) ++; LARGE-NEXT: ldx.d $ra, $t8, $ra + ; LARGE-NEXT: jirl $ra, $ra, 0 + ; LARGE-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload + ; LARGE-NEXT: addi.d $sp, $sp, 16 +@@ -82,11 +82,11 @@ define void @call_external_sym(ptr %dst) { + ; LARGE-NEXT: .cfi_offset 1, -8 + ; LARGE-NEXT: ori $a2, $zero, 1000 + ; LARGE-NEXT: move $a1, $zero +-; LARGE-NEXT: pcalau12i $a3, %pc_hi20(memset) +-; LARGE-NEXT: addi.d $ra, $zero, %pc_lo12(memset) +-; LARGE-NEXT: lu32i.d $ra, %pc64_lo20(memset) +-; LARGE-NEXT: lu52i.d $ra, $ra, %pc64_hi12(memset) +-; LARGE-NEXT: add.d $ra, $ra, $a3 ++; LARGE-NEXT: pcalau12i $ra, %pc_hi20(memset) ++; LARGE-NEXT: addi.d $t8, $zero, %pc_lo12(memset) ++; LARGE-NEXT: lu32i.d $t8, %pc64_lo20(memset) ++; LARGE-NEXT: lu52i.d $t8, $t8, %pc64_hi12(memset) ++; LARGE-NEXT: add.d $ra, $t8, $ra + ; LARGE-NEXT: jirl $ra, $ra, 0 + ; LARGE-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload + ; LARGE-NEXT: addi.d $sp, $sp, 16 +@@ -105,17 +105,17 @@ define i32 @caller_tail(i32 %i) nounwind { + ; + ; MEDIUM-LABEL: caller_tail: + ; MEDIUM: # %bb.0: # %entry +-; MEDIUM-NEXT: pcaddu18i $a1, %call36(callee_tail) +-; MEDIUM-NEXT: jr $a1 ++; MEDIUM-NEXT: pcaddu18i $t8, %call36(callee_tail) ++; MEDIUM-NEXT: jr $t8 + ; + ; LARGE-LABEL: caller_tail: + ; LARGE: # %bb.0: # %entry +-; LARGE-NEXT: pcalau12i $a1, %got_pc_hi20(callee_tail) +-; LARGE-NEXT: addi.d $a2, $zero, %got_pc_lo12(callee_tail) +-; LARGE-NEXT: lu32i.d $a2, %got64_pc_lo20(callee_tail) +-; LARGE-NEXT: lu52i.d $a2, $a2, %got64_pc_hi12(callee_tail) +-; LARGE-NEXT: ldx.d $a1, $a2, $a1 +-; LARGE-NEXT: jr $a1 ++; LARGE-NEXT: pcalau12i $t7, %got_pc_hi20(callee_tail) ++; LARGE-NEXT: addi.d $t8, $zero, %got_pc_lo12(callee_tail) ++; LARGE-NEXT: lu32i.d $t8, %got64_pc_lo20(callee_tail) ++; LARGE-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(callee_tail) ++; LARGE-NEXT: ldx.d $t7, $t8, $t7 ++; LARGE-NEXT: jr $t7 + entry: + %r = tail call i32 @callee_tail(i32 %i) + ret i32 %r +diff --git a/llvm/test/CodeGen/LoongArch/expand-call.ll b/llvm/test/CodeGen/LoongArch/expand-call.ll +index 86bf4292665b..e0d179f92de6 100644 +--- a/llvm/test/CodeGen/LoongArch/expand-call.ll ++++ b/llvm/test/CodeGen/LoongArch/expand-call.ll +@@ -1,6 +1,6 @@ + ; RUN: llc --mtriple=loongarch64 --stop-before loongarch-prera-expand-pseudo \ + ; RUN: --verify-machineinstrs < %s | FileCheck %s --check-prefix=NOEXPAND +-; RUN: llc --mtriple=loongarch64 --stop-after 
loongarch-prera-expand-pseudo \ ++; RUN: llc --mtriple=loongarch64 --stop-before machine-opt-remark-emitter \ + ; RUN: --verify-machineinstrs < %s | FileCheck %s --check-prefix=EXPAND + + declare void @callee() +diff --git a/llvm/test/CodeGen/LoongArch/global-address.ll b/llvm/test/CodeGen/LoongArch/global-address.ll +index a8f0ef648aa7..d32a17f488b1 100644 +--- a/llvm/test/CodeGen/LoongArch/global-address.ll ++++ b/llvm/test/CodeGen/LoongArch/global-address.ll +@@ -53,32 +53,32 @@ define void @foo() nounwind { + ; LA64LARGENOPIC-LABEL: foo: + ; LA64LARGENOPIC: # %bb.0: + ; LA64LARGENOPIC-NEXT: pcalau12i $a0, %got_pc_hi20(G) +-; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) +-; LA64LARGENOPIC-NEXT: lu32i.d $a1, %got64_pc_lo20(G) +-; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) +-; LA64LARGENOPIC-NEXT: ldx.d $a0, $a1, $a0 ++; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %got_pc_lo12(G) ++; LA64LARGENOPIC-NEXT: lu32i.d $t8, %got64_pc_lo20(G) ++; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(G) ++; LA64LARGENOPIC-NEXT: ldx.d $a0, $t8, $a0 + ; LA64LARGENOPIC-NEXT: ld.w $a0, $a0, 0 + ; LA64LARGENOPIC-NEXT: pcalau12i $a0, %pc_hi20(g) +-; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %pc_lo12(g) +-; LA64LARGENOPIC-NEXT: lu32i.d $a1, %pc64_lo20(g) +-; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) +-; LA64LARGENOPIC-NEXT: add.d $a0, $a1, $a0 ++; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %pc_lo12(g) ++; LA64LARGENOPIC-NEXT: lu32i.d $t8, %pc64_lo20(g) ++; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %pc64_hi12(g) ++; LA64LARGENOPIC-NEXT: add.d $a0, $t8, $a0 + ; LA64LARGENOPIC-NEXT: ld.w $a0, $a0, 0 + ; LA64LARGENOPIC-NEXT: ret + ; + ; LA64LARGEPIC-LABEL: foo: + ; LA64LARGEPIC: # %bb.0: + ; LA64LARGEPIC-NEXT: pcalau12i $a0, %got_pc_hi20(G) +-; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) +-; LA64LARGEPIC-NEXT: lu32i.d $a1, %got64_pc_lo20(G) +-; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) +-; LA64LARGEPIC-NEXT: ldx.d $a0, $a1, $a0 ++; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %got_pc_lo12(G) ++; LA64LARGEPIC-NEXT: lu32i.d $t8, %got64_pc_lo20(G) ++; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(G) ++; LA64LARGEPIC-NEXT: ldx.d $a0, $t8, $a0 + ; LA64LARGEPIC-NEXT: ld.w $a0, $a0, 0 + ; LA64LARGEPIC-NEXT: pcalau12i $a0, %pc_hi20(.Lg$local) +-; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %pc_lo12(.Lg$local) +-; LA64LARGEPIC-NEXT: lu32i.d $a1, %pc64_lo20(.Lg$local) +-; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Lg$local) +-; LA64LARGEPIC-NEXT: add.d $a0, $a1, $a0 ++; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %pc_lo12(.Lg$local) ++; LA64LARGEPIC-NEXT: lu32i.d $t8, %pc64_lo20(.Lg$local) ++; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %pc64_hi12(.Lg$local) ++; LA64LARGEPIC-NEXT: add.d $a0, $t8, $a0 + ; LA64LARGEPIC-NEXT: ld.w $a0, $a0, 0 + ; LA64LARGEPIC-NEXT: ret + %V = load volatile i32, ptr @G +diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +index a515939b9c2b..474436a0126b 100644 +--- a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll ++++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +@@ -48,13 +48,13 @@ define void @foo() nounwind { + ; MEDIUM_SCH-NEXT: addi.d $sp, $sp, -16 + ; MEDIUM_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill + ; MEDIUM_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) +-; MEDIUM_SCH-NEXT: pcaddu18i $ra, %call36(bar) + ; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) + ; MEDIUM_SCH-NEXT: ld.d $a0, $a0, 0 + ; MEDIUM_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) + 
; MEDIUM_SCH-NEXT: addi.d $a0, $a0, %pc_lo12(g) + ; MEDIUM_SCH-NEXT: ld.d $a0, $a0, 0 + ; MEDIUM_SCH-NEXT: ori $a0, $zero, 1 ++; MEDIUM_SCH-NEXT: pcaddu18i $ra, %call36(bar) + ; MEDIUM_SCH-NEXT: jirl $ra, $ra, 0 + ; MEDIUM_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) + ; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %ie_pc_lo12(gd) +@@ -74,41 +74,41 @@ define void @foo() nounwind { + ; LARGE_NO_SCH-NEXT: addi.d $sp, $sp, -16 + ; LARGE_NO_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill + ; LARGE_NO_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) +-; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) +-; LARGE_NO_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(G) +-; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) +-; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(G) ++; LARGE_NO_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(G) ++; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(G) ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $t8, $a0 + ; LARGE_NO_SCH-NEXT: ld.d $a0, $a0, 0 + ; LARGE_NO_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) +-; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(g) +-; LARGE_NO_SCH-NEXT: lu32i.d $a1, %pc64_lo20(g) +-; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) +-; LARGE_NO_SCH-NEXT: add.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %pc_lo12(g) ++; LARGE_NO_SCH-NEXT: lu32i.d $t8, %pc64_lo20(g) ++; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %pc64_hi12(g) ++; LARGE_NO_SCH-NEXT: add.d $a0, $t8, $a0 + ; LARGE_NO_SCH-NEXT: ld.d $a0, $a0, 0 + ; LARGE_NO_SCH-NEXT: ori $a0, $zero, 1 +-; LARGE_NO_SCH-NEXT: pcalau12i $a1, %got_pc_hi20(bar) +-; LARGE_NO_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar) +-; LARGE_NO_SCH-NEXT: lu32i.d $ra, %got64_pc_lo20(bar) +-; LARGE_NO_SCH-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(bar) +-; LARGE_NO_SCH-NEXT: ldx.d $ra, $ra, $a1 ++; LARGE_NO_SCH-NEXT: pcalau12i $ra, %got_pc_hi20(bar) ++; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(bar) ++; LARGE_NO_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(bar) ++; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(bar) ++; LARGE_NO_SCH-NEXT: ldx.d $ra, $t8, $ra + ; LARGE_NO_SCH-NEXT: jirl $ra, $ra, 0 + ; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) +-; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(gd) +-; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(gd) +-; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(gd) +-; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(gd) ++; LARGE_NO_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(gd) ++; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(gd) ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $t8, $a0 + ; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp + ; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) +-; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ld) +-; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ld) +-; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ld) +-; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ld) ++; LARGE_NO_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(ld) ++; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ld) ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $t8, $a0 + ; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp + ; LARGE_NO_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) +-; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie) +-; LARGE_NO_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie) +-; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie) +-; LARGE_NO_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_NO_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ie) ++; LARGE_NO_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(ie) 
++; LARGE_NO_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ie) ++; LARGE_NO_SCH-NEXT: ldx.d $a0, $t8, $a0 + ; LARGE_NO_SCH-NEXT: ldx.d $a0, $a0, $tp + ; LARGE_NO_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload + ; LARGE_NO_SCH-NEXT: addi.d $sp, $sp, 16 +@@ -118,42 +118,42 @@ define void @foo() nounwind { + ; LARGE_SCH: # %bb.0: + ; LARGE_SCH-NEXT: addi.d $sp, $sp, -16 + ; LARGE_SCH-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +-; LARGE_SCH-NEXT: addi.d $a1, $zero, %got_pc_lo12(G) + ; LARGE_SCH-NEXT: pcalau12i $a0, %got_pc_hi20(G) +-; LARGE_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar) +-; LARGE_SCH-NEXT: lu32i.d $a1, %got64_pc_lo20(G) +-; LARGE_SCH-NEXT: lu32i.d $ra, %got64_pc_lo20(bar) +-; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(G) +-; LARGE_SCH-NEXT: lu52i.d $ra, $ra, %got64_pc_hi12(bar) +-; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 +-; LARGE_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(g) +-; LARGE_SCH-NEXT: lu32i.d $a1, %pc64_lo20(g) +-; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) ++; LARGE_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(G) ++; LARGE_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(G) ++; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(G) ++; LARGE_SCH-NEXT: ldx.d $a0, $t8, $a0 + ; LARGE_SCH-NEXT: ld.d $a0, $a0, 0 + ; LARGE_SCH-NEXT: pcalau12i $a0, %pc_hi20(g) +-; LARGE_SCH-NEXT: add.d $a0, $a1, $a0 +-; LARGE_SCH-NEXT: pcalau12i $a1, %got_pc_hi20(bar) ++; LARGE_SCH-NEXT: addi.d $t8, $zero, %pc_lo12(g) ++; LARGE_SCH-NEXT: lu32i.d $t8, %pc64_lo20(g) ++; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %pc64_hi12(g) ++; LARGE_SCH-NEXT: add.d $a0, $t8, $a0 + ; LARGE_SCH-NEXT: ld.d $a0, $a0, 0 +-; LARGE_SCH-NEXT: ldx.d $ra, $ra, $a1 + ; LARGE_SCH-NEXT: ori $a0, $zero, 1 ++; LARGE_SCH-NEXT: pcalau12i $ra, %got_pc_hi20(bar) ++; LARGE_SCH-NEXT: addi.d $t8, $zero, %got_pc_lo12(bar) ++; LARGE_SCH-NEXT: lu32i.d $t8, %got64_pc_lo20(bar) ++; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(bar) ++; LARGE_SCH-NEXT: ldx.d $ra, $t8, $ra + ; LARGE_SCH-NEXT: jirl $ra, $ra, 0 +-; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(gd) + ; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(gd) +-; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(gd) +-; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(gd) +-; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 +-; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ld) +-; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ld) +-; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ld) ++; LARGE_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(gd) ++; LARGE_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(gd) ++; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(gd) ++; LARGE_SCH-NEXT: ldx.d $a0, $t8, $a0 + ; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp + ; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) +-; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 +-; LARGE_SCH-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie) +-; LARGE_SCH-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie) +-; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie) ++; LARGE_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ld) ++; LARGE_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(ld) ++; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ld) ++; LARGE_SCH-NEXT: ldx.d $a0, $t8, $a0 + ; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp + ; LARGE_SCH-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) +-; LARGE_SCH-NEXT: ldx.d $a0, $a1, $a0 ++; LARGE_SCH-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ie) ++; LARGE_SCH-NEXT: lu32i.d $t8, %ie64_pc_lo20(ie) ++; LARGE_SCH-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ie) ++; LARGE_SCH-NEXT: ldx.d $a0, $t8, $a0 + ; LARGE_SCH-NEXT: ldx.d $a0, $a0, $tp + ; LARGE_SCH-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload + ; LARGE_SCH-NEXT: addi.d 
$sp, $sp, 16 +diff --git a/llvm/test/CodeGen/LoongArch/tls-models.ll b/llvm/test/CodeGen/LoongArch/tls-models.ll +index a2a3792a6a54..3994df1da716 100644 +--- a/llvm/test/CodeGen/LoongArch/tls-models.ll ++++ b/llvm/test/CodeGen/LoongArch/tls-models.ll +@@ -45,15 +45,15 @@ define ptr @f1() nounwind { + ; LA64LARGEPIC-NEXT: addi.d $sp, $sp, -16 + ; LA64LARGEPIC-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill + ; LA64LARGEPIC-NEXT: pcalau12i $a0, %gd_pc_hi20(unspecified) +-; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %got_pc_lo12(unspecified) +-; LA64LARGEPIC-NEXT: lu32i.d $a1, %got64_pc_lo20(unspecified) +-; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(unspecified) +-; LA64LARGEPIC-NEXT: add.d $a0, $a1, $a0 +-; LA64LARGEPIC-NEXT: pcalau12i $a1, %pc_hi20(__tls_get_addr) +-; LA64LARGEPIC-NEXT: addi.d $ra, $zero, %pc_lo12(__tls_get_addr) +-; LA64LARGEPIC-NEXT: lu32i.d $ra, %pc64_lo20(__tls_get_addr) +-; LA64LARGEPIC-NEXT: lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr) +-; LA64LARGEPIC-NEXT: add.d $ra, $ra, $a1 ++; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %got_pc_lo12(unspecified) ++; LA64LARGEPIC-NEXT: lu32i.d $t8, %got64_pc_lo20(unspecified) ++; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(unspecified) ++; LA64LARGEPIC-NEXT: add.d $a0, $t8, $a0 ++; LA64LARGEPIC-NEXT: pcalau12i $ra, %pc_hi20(__tls_get_addr) ++; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %pc_lo12(__tls_get_addr) ++; LA64LARGEPIC-NEXT: lu32i.d $t8, %pc64_lo20(__tls_get_addr) ++; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr) ++; LA64LARGEPIC-NEXT: add.d $ra, $t8, $ra + ; LA64LARGEPIC-NEXT: jirl $ra, $ra, 0 + ; LA64LARGEPIC-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload + ; LA64LARGEPIC-NEXT: addi.d $sp, $sp, 16 +@@ -76,10 +76,10 @@ define ptr @f1() nounwind { + ; LA64LARGENOPIC-LABEL: f1: + ; LA64LARGENOPIC: # %bb.0: # %entry + ; LA64LARGENOPIC-NEXT: pcalau12i $a0, %ie_pc_hi20(unspecified) +-; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %ie_pc_lo12(unspecified) +-; LA64LARGENOPIC-NEXT: lu32i.d $a1, %ie64_pc_lo20(unspecified) +-; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(unspecified) +-; LA64LARGENOPIC-NEXT: ldx.d $a0, $a1, $a0 ++; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %ie_pc_lo12(unspecified) ++; LA64LARGENOPIC-NEXT: lu32i.d $t8, %ie64_pc_lo20(unspecified) ++; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(unspecified) ++; LA64LARGENOPIC-NEXT: ldx.d $a0, $t8, $a0 + ; LA64LARGENOPIC-NEXT: add.d $a0, $a0, $tp + ; LA64LARGENOPIC-NEXT: ret + entry: +@@ -116,15 +116,15 @@ define ptr @f2() nounwind { + ; LA64LARGEPIC-NEXT: addi.d $sp, $sp, -16 + ; LA64LARGEPIC-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill + ; LA64LARGEPIC-NEXT: pcalau12i $a0, %ld_pc_hi20(ld) +-; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %got_pc_lo12(ld) +-; LA64LARGEPIC-NEXT: lu32i.d $a1, %got64_pc_lo20(ld) +-; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(ld) +-; LA64LARGEPIC-NEXT: add.d $a0, $a1, $a0 +-; LA64LARGEPIC-NEXT: pcalau12i $a1, %pc_hi20(__tls_get_addr) +-; LA64LARGEPIC-NEXT: addi.d $ra, $zero, %pc_lo12(__tls_get_addr) +-; LA64LARGEPIC-NEXT: lu32i.d $ra, %pc64_lo20(__tls_get_addr) +-; LA64LARGEPIC-NEXT: lu52i.d $ra, $ra, %pc64_hi12(__tls_get_addr) +-; LA64LARGEPIC-NEXT: add.d $ra, $ra, $a1 ++; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %got_pc_lo12(ld) ++; LA64LARGEPIC-NEXT: lu32i.d $t8, %got64_pc_lo20(ld) ++; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %got64_pc_hi12(ld) ++; LA64LARGEPIC-NEXT: add.d $a0, $t8, $a0 ++; LA64LARGEPIC-NEXT: pcalau12i $ra, %pc_hi20(__tls_get_addr) ++; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %pc_lo12(__tls_get_addr) 
++; LA64LARGEPIC-NEXT: lu32i.d $t8, %pc64_lo20(__tls_get_addr) ++; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %pc64_hi12(__tls_get_addr) ++; LA64LARGEPIC-NEXT: add.d $ra, $t8, $ra + ; LA64LARGEPIC-NEXT: jirl $ra, $ra, 0 + ; LA64LARGEPIC-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload + ; LA64LARGEPIC-NEXT: addi.d $sp, $sp, 16 +@@ -147,10 +147,10 @@ define ptr @f2() nounwind { + ; LA64LARGENOPIC-LABEL: f2: + ; LA64LARGENOPIC: # %bb.0: # %entry + ; LA64LARGENOPIC-NEXT: pcalau12i $a0, %ie_pc_hi20(ld) +-; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ld) +-; LA64LARGENOPIC-NEXT: lu32i.d $a1, %ie64_pc_lo20(ld) +-; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ld) +-; LA64LARGENOPIC-NEXT: ldx.d $a0, $a1, $a0 ++; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ld) ++; LA64LARGENOPIC-NEXT: lu32i.d $t8, %ie64_pc_lo20(ld) ++; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ld) ++; LA64LARGENOPIC-NEXT: ldx.d $a0, $t8, $a0 + ; LA64LARGENOPIC-NEXT: add.d $a0, $a0, $tp + ; LA64LARGENOPIC-NEXT: ret + entry: +@@ -177,10 +177,10 @@ define ptr @f3() nounwind { + ; LA64LARGEPIC-LABEL: f3: + ; LA64LARGEPIC: # %bb.0: # %entry + ; LA64LARGEPIC-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) +-; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie) +-; LA64LARGEPIC-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie) +-; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie) +-; LA64LARGEPIC-NEXT: ldx.d $a0, $a1, $a0 ++; LA64LARGEPIC-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ie) ++; LA64LARGEPIC-NEXT: lu32i.d $t8, %ie64_pc_lo20(ie) ++; LA64LARGEPIC-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ie) ++; LA64LARGEPIC-NEXT: ldx.d $a0, $t8, $a0 + ; LA64LARGEPIC-NEXT: add.d $a0, $a0, $tp + ; LA64LARGEPIC-NEXT: ret + ; +@@ -201,10 +201,10 @@ define ptr @f3() nounwind { + ; LA64LARGENOPIC-LABEL: f3: + ; LA64LARGENOPIC: # %bb.0: # %entry + ; LA64LARGENOPIC-NEXT: pcalau12i $a0, %ie_pc_hi20(ie) +-; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %ie_pc_lo12(ie) +-; LA64LARGENOPIC-NEXT: lu32i.d $a1, %ie64_pc_lo20(ie) +-; LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(ie) +-; LA64LARGENOPIC-NEXT: ldx.d $a0, $a1, $a0 ++; LA64LARGENOPIC-NEXT: addi.d $t8, $zero, %ie_pc_lo12(ie) ++; LA64LARGENOPIC-NEXT: lu32i.d $t8, %ie64_pc_lo20(ie) ++; LA64LARGENOPIC-NEXT: lu52i.d $t8, $t8, %ie64_pc_hi12(ie) ++; LA64LARGENOPIC-NEXT: ldx.d $a0, $t8, $a0 + ; LA64LARGENOPIC-NEXT: add.d $a0, $a0, $tp + ; LA64LARGENOPIC-NEXT: ret + entry: +-- +2.20.1 + + +From 34e8c30579faf4a8ef69fa686bd9b2d9e832d299 Mon Sep 17 00:00:00 2001 +From: Jie Fu +Date: Fri, 5 Jan 2024 12:05:23 +0800 +Subject: [PATCH 06/12] [LoongArch] Fix -Wunused-variable in + LoongArchExpandPseudoInsts.cpp (NFC) + +llvm-project/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp:480:20: + error: unused variable 'MF' [-Werror,-Wunused-variable] + MachineFunction *MF = MBB.getParent(); + ^ +1 error generated. 
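A minimal standalone C++ sketch (illustrative only, not part of the patch; the file, function and constant are stand-ins) of why the diagnostic quoted above appears only in asserts-disabled builds: once -DNDEBUG compiles the assert away, a local kept solely for that assert is never read, and -Werror turns the -Wunused-variable warning into an error.

// demo.cpp
// clang++ -c -Wall -Werror demo.cpp          -> compiles (MF is read by the assert)
// clang++ -c -Wall -Werror -DNDEBUG demo.cpp -> error: unused variable 'MF'
#include <cassert>

inline int getCodeModel() { return 64; } // stand-in for the subtarget query

void expandLargeAddressLoad() {
  int MF = getCodeModel(); // used only by the assert below
  assert(MF == 64 && "Large code model requires LA64");
  // The patch's fix follows the same idea: drop the local and evaluate the
  // expression directly inside the assert.
}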
+
+(cherry picked from commit 52d1397e38ee88b170585c9c824d08e6975890ca)
+---
+ llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
+index f977f176066a..ad39658f698e 100644
+--- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
++++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp
+@@ -477,12 +477,11 @@ bool LoongArchExpandPseudo::expandLargeAddressLoad(
+     break;
+   }
+ 
+-  MachineFunction *MF = MBB.getParent();
+   MachineInstr &MI = *MBBI;
+   DebugLoc DL = MI.getDebugLoc();
+   Register ScratchReg = LoongArch::R20; // $t8
+ 
+-  assert(MF->getSubtarget<LoongArchSubtarget>().is64Bit() &&
++  assert(MBB.getParent()->getSubtarget<LoongArchSubtarget>().is64Bit() &&
+          "Large code model requires LA64");
+ 
+   auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), DestReg);
+-- 
+2.20.1
+
+
+From b5d3aa3ac0dcf98fbb5f8d2d9de295be991c9e8f Mon Sep 17 00:00:00 2001
+From: Zhaoxin Yang
+Date: Tue, 23 Jul 2024 12:06:59 +0800
+Subject: [PATCH 07/12] [LoongArch][CodeGen] Implement 128-bit and 256-bit
+ vector shuffle. (#100054)
+
+[LoongArch][CodeGen] Implement 128-bit and 256-bit vector shuffle
+operations.
+
+In LoongArch, shuffle operations can be divided into two types:
+- Single-vector shuffle: Shuffle using only one vector, with the other
+vector being `undef` or not selected by mask. This can be expanded to
+instructions such as `vreplvei` and `vshuf4i`.
+- Two-vector shuffle: Shuffle using two vectors. This can be expanded to
+instructions like `vilv[l/h]`, `vpack[ev/od]`, `vpick[ev/od]` and the
+basic `vshuf`.
+
+In the future, more optimizations may be added, such as handling 1-bit
+vectors and processing single element patterns, etc.
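To make the mask classification above concrete, here is a small standalone C++ sketch (illustrative only, not part of the patch; it mirrors, in simplified form, the fitsRegularPattern helper the patch adds to LoongArchISelLowering.cpp). It shows why the two-vector mask <0, 8, 2, 10, 4, 12, 6, 14> on a pair of v8i16 inputs fits the vpackev form <0, n, 2, n+2, ...> with n = 8, while failing the vpackod form.

// shuffle_mask_demo.cpp
#include <cstdio>
#include <vector>

// Positions First, First+CheckStride, ... must hold ExpectedIndex,
// ExpectedIndex+ExpectedIndexStride, ...; -1 (undef) matches anything.
static bool fitsRegularPattern(const std::vector<int> &Mask, unsigned First,
                               unsigned CheckStride, int ExpectedIndex,
                               int ExpectedIndexStride) {
  for (unsigned I = First; I < Mask.size(); I += CheckStride) {
    if (Mask[I] != -1 && Mask[I] != ExpectedIndex)
      return false;
    ExpectedIndex += ExpectedIndexStride;
  }
  return true;
}

int main() {
  // Two v8i16 operands are concatenated, so indices 0..7 pick from the first
  // vector and 8..15 from the second (n = 8).
  std::vector<int> Mask = {0, 8, 2, 10, 4, 12, 6, 14};
  int N = static_cast<int>(Mask.size());
  // vpackev: even result slots take <0, 2, 4, ...> from one input, odd slots
  // take <n, n+2, n+4, ...> from the other.
  bool IsPackEv = fitsRegularPattern(Mask, 0, 2, 0, 2) &&
                  fitsRegularPattern(Mask, 1, 2, N, 2);
  // vpackod: <1, 3, 5, ...> interleaved with <n+1, n+3, n+5, ...>.
  bool IsPackOd = fitsRegularPattern(Mask, 0, 2, 1, 2) &&
                  fitsRegularPattern(Mask, 1, 2, N + 1, 2);
  std::printf("vpackev match: %s\n", IsPackEv ? "yes" : "no"); // yes
  std::printf("vpackod match: %s\n", IsPackOd ? "yes" : "no"); // no
  return 0;
}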
+ +(cherry picked from commit 464ea880cf7710cc8675c83001d7ae020406cf42) +--- + .../LoongArch/LoongArchISelLowering.cpp | 933 +++++++++++++++++- + .../Target/LoongArch/LoongArchISelLowering.h | 10 + + .../LoongArch/LoongArchLASXInstrInfo.td | 130 +++ + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 148 +++ + .../lasx/ir-instruction/shuffle-as-xvilv.ll | 74 ++ + .../lasx/ir-instruction/shuffle-as-xvpack.ll | 124 +++ + .../lasx/ir-instruction/shuffle-as-xvpick.ll | 84 ++ + .../ir-instruction/shuffle-as-xvrepl128vei.ll | 65 ++ + .../lasx/ir-instruction/shuffle-as-xvshuf.ll | 76 ++ + .../ir-instruction/shuffle-as-xvshuf4i.ll | 43 + + .../lsx/ir-instruction/shuffle-as-vilv.ll | 82 ++ + .../lsx/ir-instruction/shuffle-as-vpack.ll | 122 +++ + .../lsx/ir-instruction/shuffle-as-vpick.ll | 82 ++ + .../lsx/ir-instruction/shuffle-as-vreplvei.ll | 62 ++ + .../lsx/ir-instruction/shuffle-as-vshuf.ll | 84 ++ + .../lsx/ir-instruction/shuffle-as-vshuf4i.ll | 42 + + 16 files changed, 2158 insertions(+), 3 deletions(-) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index df1b17649b7d..618ae7056425 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -247,9 +247,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + + setOperationAction(ISD::SETCC, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Legal); ++ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + } + for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { +- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal); + setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, VT, + Legal); +@@ -293,9 +293,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + + setOperationAction(ISD::SETCC, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Legal); ++ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + } + for (MVT VT : {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}) { +- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal); + setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, VT, + Legal); +@@ -422,9 +422,926 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, + return SDValue(); + } + ++/// Determine whether a range fits a regular 
pattern of values.
++/// This function accounts for the possibility of jumping over the End iterator.
++template <typename ValType>
++static bool
++fitsRegularPattern(typename SmallVectorImpl<ValType>::const_iterator Begin,
++                   unsigned CheckStride,
++                   typename SmallVectorImpl<ValType>::const_iterator End,
++                   ValType ExpectedIndex, unsigned ExpectedIndexStride) {
++  auto &I = Begin;
++
++  while (I != End) {
++    if (*I != -1 && *I != ExpectedIndex)
++      return false;
++    ExpectedIndex += ExpectedIndexStride;
++
++    // Incrementing past End is undefined behaviour so we must increment one
++    // step at a time and check for End at each step.
++    for (unsigned n = 0; n < CheckStride && I != End; ++n, ++I)
++      ; // Empty loop body.
++  }
++  return true;
++}
++
++/// Lower VECTOR_SHUFFLE into VREPLVEI (if possible).
++///
++/// VREPLVEI performs vector broadcast based on an element specified by an
++/// integer immediate, with its mask being similar to:
++///   <x, x, x, ...>
++/// where x is any valid index.
++///
++/// When undef's appear in the mask they are treated as if they were whatever
++/// value is necessary in order to fit the above form.
++static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask,
++                                            MVT VT, SDValue V1, SDValue V2,
++                                            SelectionDAG &DAG) {
++  int SplatIndex = -1;
++  for (const auto &M : Mask) {
++    if (M != -1) {
++      SplatIndex = M;
++      break;
++    }
++  }
++
++  if (SplatIndex == -1)
++    return DAG.getUNDEF(VT);
++
++  assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index");
++  if (fitsRegularPattern<int>(Mask.begin(), 1, Mask.end(), SplatIndex, 0)) {
++    APInt Imm(64, SplatIndex);
++    return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
++                       DAG.getConstant(Imm, DL, MVT::i64));
++  }
++
++  return SDValue();
++}
++
++/// Lower VECTOR_SHUFFLE into VSHUF4I (if possible).
++///
++/// VSHUF4I splits the vector into blocks of four elements, then shuffles these
++/// elements according to a <4 x i2> constant (encoded as an integer immediate).
++///
++/// It is therefore possible to lower into VSHUF4I when the mask takes the form:
++///   <a, b, c, d, a+4, b+4, c+4, d+4, a+8, b+8, c+8, d+8, ...>
++/// When undef's appear they are treated as if they were whatever value is
++/// necessary in order to fit the above forms.
++///
++/// For example:
++///   %2 = shufflevector <8 x i16> %0, <8 x i16> undef,
++///                      <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
++///   is lowered to:
++///     (VSHUF4I_H $v0, $v1, 27)
++///   where the 27 comes from:
++///     3 + (2 << 2) + (1 << 4) + (0 << 6)
++static SDValue lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask,
++                                           MVT VT, SDValue V1, SDValue V2,
++                                           SelectionDAG &DAG) {
++
++  // When the size is less than 4, lower cost instructions may be used.
++  if (Mask.size() < 4)
++    return SDValue();
++
++  int SubMask[4] = {-1, -1, -1, -1};
++  for (unsigned i = 0; i < 4; ++i) {
++    for (unsigned j = i; j < Mask.size(); j += 4) {
++      int Idx = Mask[j];
++
++      // Convert from vector index to 4-element subvector index
++      // If an index refers to an element outside of the subvector then give up
++      if (Idx != -1) {
++        Idx -= 4 * (j / 4);
++        if (Idx < 0 || Idx >= 4)
++          return SDValue();
++      }
++
++      // If the mask has an undef, replace it with the current index.
++      // Note that it might still be undef if the current index is also undef
++      if (SubMask[i] == -1)
++        SubMask[i] = Idx;
++      // Check that non-undef values are the same as in the mask. If they
++      // aren't then give up
++      else if (Idx != -1 && Idx != SubMask[i])
++        return SDValue();
++    }
++  }
++
++  // Calculate the immediate.
Replace any remaining undefs with zero ++ APInt Imm(64, 0); ++ for (int i = 3; i >= 0; --i) { ++ int Idx = SubMask[i]; ++ ++ if (Idx == -1) ++ Idx = 0; ++ ++ Imm <<= 2; ++ Imm |= Idx & 0x3; ++ } ++ ++ return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, V1, ++ DAG.getConstant(Imm, DL, MVT::i64)); ++} ++ ++/// Lower VECTOR_SHUFFLE into VPACKEV (if possible). ++/// ++/// VPACKEV interleaves the even elements from each vector. ++/// ++/// It is possible to lower into VPACKEV when the mask consists of two of the ++/// following forms interleaved: ++/// <0, 2, 4, ...> ++/// ++/// where n is the number of elements in the vector. ++/// For example: ++/// <0, 0, 2, 2, 4, 4, ...> ++/// <0, n, 2, n+2, 4, n+4, ...> ++/// ++/// When undef's appear in the mask they are treated as if they were whatever ++/// value is necessary in order to fit the above forms. ++static SDValue lowerVECTOR_SHUFFLE_VPACKEV(const SDLoc &DL, ArrayRef Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &End = Mask.end(); ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern(Begin, 2, End, 0, 2)) ++ V1 = OriV1; ++ else if (fitsRegularPattern(Begin, 2, End, Mask.size(), 2)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern(Begin + 1, 2, End, 0, 2)) ++ V2 = OriV1; ++ else if (fitsRegularPattern(Begin + 1, 2, End, Mask.size(), 2)) ++ V2 = OriV2; ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VPACKEV, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into VPACKOD (if possible). ++/// ++/// VPACKOD interleaves the odd elements from each vector. ++/// ++/// It is possible to lower into VPACKOD when the mask consists of two of the ++/// following forms interleaved: ++/// <1, 3, 5, ...> ++/// ++/// where n is the number of elements in the vector. ++/// For example: ++/// <1, 1, 3, 3, 5, 5, ...> ++/// <1, n+1, 3, n+3, 5, n+5, ...> ++/// ++/// When undef's appear in the mask they are treated as if they were whatever ++/// value is necessary in order to fit the above forms. ++static SDValue lowerVECTOR_SHUFFLE_VPACKOD(const SDLoc &DL, ArrayRef Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &End = Mask.end(); ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern(Begin, 2, End, 1, 2)) ++ V1 = OriV1; ++ else if (fitsRegularPattern(Begin, 2, End, Mask.size() + 1, 2)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern(Begin + 1, 2, End, 1, 2)) ++ V2 = OriV1; ++ else if (fitsRegularPattern(Begin + 1, 2, End, Mask.size() + 1, 2)) ++ V2 = OriV2; ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VPACKOD, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into VILVH (if possible). ++/// ++/// VILVH interleaves consecutive elements from the left (highest-indexed) half ++/// of each vector. ++/// ++/// It is possible to lower into VILVH when the mask consists of two of the ++/// following forms interleaved: ++/// ++/// ++/// where n is the number of elements in the vector and x is half n. ++/// For example: ++/// ++/// ++/// ++/// When undef's appear in the mask they are treated as if they were whatever ++/// value is necessary in order to fit the above forms. 
++static SDValue lowerVECTOR_SHUFFLE_VILVH(const SDLoc &DL, ArrayRef Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &End = Mask.end(); ++ unsigned HalfSize = Mask.size() / 2; ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern(Begin, 2, End, HalfSize, 1)) ++ V1 = OriV1; ++ else if (fitsRegularPattern(Begin, 2, End, Mask.size() + HalfSize, 1)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern(Begin + 1, 2, End, HalfSize, 1)) ++ V2 = OriV1; ++ else if (fitsRegularPattern(Begin + 1, 2, End, Mask.size() + HalfSize, ++ 1)) ++ V2 = OriV2; ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VILVH, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into VILVL (if possible). ++/// ++/// VILVL interleaves consecutive elements from the right (lowest-indexed) half ++/// of each vector. ++/// ++/// It is possible to lower into VILVL when the mask consists of two of the ++/// following forms interleaved: ++/// <0, 1, 2, ...> ++/// ++/// where n is the number of elements in the vector. ++/// For example: ++/// <0, 0, 1, 1, 2, 2, ...> ++/// <0, n, 1, n+1, 2, n+2, ...> ++/// ++/// When undef's appear in the mask they are treated as if they were whatever ++/// value is necessary in order to fit the above forms. ++static SDValue lowerVECTOR_SHUFFLE_VILVL(const SDLoc &DL, ArrayRef Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &End = Mask.end(); ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern(Begin, 2, End, 0, 1)) ++ V1 = OriV1; ++ else if (fitsRegularPattern(Begin, 2, End, Mask.size(), 1)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern(Begin + 1, 2, End, 0, 1)) ++ V2 = OriV1; ++ else if (fitsRegularPattern(Begin + 1, 2, End, Mask.size(), 1)) ++ V2 = OriV2; ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VILVL, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into VPICKEV (if possible). ++/// ++/// VPICKEV copies the even elements of each vector into the result vector. ++/// ++/// It is possible to lower into VPICKEV when the mask consists of two of the ++/// following forms concatenated: ++/// <0, 2, 4, ...> ++/// ++/// where n is the number of elements in the vector. ++/// For example: ++/// <0, 2, 4, ..., 0, 2, 4, ...> ++/// <0, 2, 4, ..., n, n+2, n+4, ...> ++/// ++/// When undef's appear in the mask they are treated as if they were whatever ++/// value is necessary in order to fit the above forms. ++static SDValue lowerVECTOR_SHUFFLE_VPICKEV(const SDLoc &DL, ArrayRef Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &Mid = Mask.begin() + Mask.size() / 2; ++ const auto &End = Mask.end(); ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern(Begin, 1, Mid, 0, 2)) ++ V1 = OriV1; ++ else if (fitsRegularPattern(Begin, 1, Mid, Mask.size(), 2)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern(Mid, 1, End, 0, 2)) ++ V2 = OriV1; ++ else if (fitsRegularPattern(Mid, 1, End, Mask.size(), 2)) ++ V2 = OriV2; ++ ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VPICKEV, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into VPICKOD (if possible). ++/// ++/// VPICKOD copies the odd elements of each vector into the result vector. 
++/// ++/// It is possible to lower into VPICKOD when the mask consists of two of the ++/// following forms concatenated: ++/// <1, 3, 5, ...> ++/// ++/// where n is the number of elements in the vector. ++/// For example: ++/// <1, 3, 5, ..., 1, 3, 5, ...> ++/// <1, 3, 5, ..., n+1, n+3, n+5, ...> ++/// ++/// When undef's appear in the mask they are treated as if they were whatever ++/// value is necessary in order to fit the above forms. ++static SDValue lowerVECTOR_SHUFFLE_VPICKOD(const SDLoc &DL, ArrayRef Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &Mid = Mask.begin() + Mask.size() / 2; ++ const auto &End = Mask.end(); ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern(Begin, 1, Mid, 1, 2)) ++ V1 = OriV1; ++ else if (fitsRegularPattern(Begin, 1, Mid, Mask.size() + 1, 2)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern(Mid, 1, End, 1, 2)) ++ V2 = OriV1; ++ else if (fitsRegularPattern(Mid, 1, End, Mask.size() + 1, 2)) ++ V2 = OriV2; ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VPICKOD, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into VSHUF. ++/// ++/// This mostly consists of converting the shuffle mask into a BUILD_VECTOR and ++/// adding it as an operand to the resulting VSHUF. ++static SDValue lowerVECTOR_SHUFFLE_VSHUF(const SDLoc &DL, ArrayRef Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ SmallVector Ops; ++ for (auto M : Mask) ++ Ops.push_back(DAG.getConstant(M, DL, MVT::i64)); ++ ++ EVT MaskVecTy = VT.changeVectorElementTypeToInteger(); ++ SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, Ops); ++ ++ // VECTOR_SHUFFLE concatenates the vectors in an vectorwise fashion. ++ // <0b00, 0b01> + <0b10, 0b11> -> <0b00, 0b01, 0b10, 0b11> ++ // VSHF concatenates the vectors in a bitwise fashion: ++ // <0b00, 0b01> + <0b10, 0b11> -> ++ // 0b0100 + 0b1110 -> 0b01001110 ++ // <0b10, 0b11, 0b00, 0b01> ++ // We must therefore swap the operands to get the correct result. ++ return DAG.getNode(LoongArchISD::VSHUF, DL, VT, MaskVec, V2, V1); ++} ++ ++/// Dispatching routine to lower various 128-bit LoongArch vector shuffles. ++/// ++/// This routine breaks down the specific type of 128-bit shuffle and ++/// dispatches to the lowering routines accordingly. ++static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, ++ SDValue V1, SDValue V2, SelectionDAG &DAG) { ++ assert((VT.SimpleTy == MVT::v16i8 || VT.SimpleTy == MVT::v8i16 || ++ VT.SimpleTy == MVT::v4i32 || VT.SimpleTy == MVT::v2i64 || ++ VT.SimpleTy == MVT::v4f32 || VT.SimpleTy == MVT::v2f64) && ++ "Vector type is unsupported for lsx!"); ++ assert(V1.getSimpleValueType() == V2.getSimpleValueType() && ++ "Two operands have different types!"); ++ assert(VT.getVectorNumElements() == Mask.size() && ++ "Unexpected mask size for shuffle!"); ++ assert(Mask.size() % 2 == 0 && "Expected even mask size."); ++ ++ SDValue Result; ++ // TODO: Add more comparison patterns. ++ if (V2.isUndef()) { ++ if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ ++ // TODO: This comment may be enabled in the future to better match the ++ // pattern for instruction selection. ++ /* V2 = V1; */ ++ } ++ ++ // It is recommended not to change the pattern comparison order for better ++ // performance. 
++ if ((Result = lowerVECTOR_SHUFFLE_VPACKEV(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_VPACKOD(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_VILVH(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_VILVL(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_VPICKEV(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG))) ++ return Result; ++ ++ return SDValue(); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVREPLVEI (if possible). ++/// ++/// It is a XVREPLVEI when the mask is: ++/// ++/// where the number of x is equal to n and n is half the length of vector. ++/// ++/// When undef's appear in the mask they are treated as if they were whatever ++/// value is necessary in order to fit the above form. ++static SDValue lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, ++ ArrayRef Mask, MVT VT, ++ SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ int SplatIndex = -1; ++ for (const auto &M : Mask) { ++ if (M != -1) { ++ SplatIndex = M; ++ break; ++ } ++ } ++ ++ if (SplatIndex == -1) ++ return DAG.getUNDEF(VT); ++ ++ const auto &Begin = Mask.begin(); ++ const auto &End = Mask.end(); ++ unsigned HalfSize = Mask.size() / 2; ++ ++ assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index"); ++ if (fitsRegularPattern(Begin, 1, End - HalfSize, SplatIndex, 0) && ++ fitsRegularPattern(Begin + HalfSize, 1, End, SplatIndex + HalfSize, ++ 0)) { ++ APInt Imm(64, SplatIndex); ++ return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1, ++ DAG.getConstant(Imm, DL, MVT::i64)); ++ } ++ ++ return SDValue(); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVSHUF4I (if possible). ++static SDValue lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ // When the size is less than or equal to 4, lower cost instructions may be ++ // used. ++ if (Mask.size() <= 4) ++ return SDValue(); ++ return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVPACKEV (if possible). ++static SDValue lowerVECTOR_SHUFFLE_XVPACKEV(const SDLoc &DL, ArrayRef Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ return lowerVECTOR_SHUFFLE_VPACKEV(DL, Mask, VT, V1, V2, DAG); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVPACKOD (if possible). ++static SDValue lowerVECTOR_SHUFFLE_XVPACKOD(const SDLoc &DL, ArrayRef Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ return lowerVECTOR_SHUFFLE_VPACKOD(DL, Mask, VT, V1, V2, DAG); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVILVH (if possible). 
++static SDValue lowerVECTOR_SHUFFLE_XVILVH(const SDLoc &DL, ArrayRef Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &End = Mask.end(); ++ unsigned HalfSize = Mask.size() / 2; ++ unsigned LeftSize = HalfSize / 2; ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern(Begin, 2, End - HalfSize, HalfSize - LeftSize, ++ 1) && ++ fitsRegularPattern(Begin + HalfSize, 2, End, HalfSize + LeftSize, 1)) ++ V1 = OriV1; ++ else if (fitsRegularPattern(Begin, 2, End - HalfSize, ++ Mask.size() + HalfSize - LeftSize, 1) && ++ fitsRegularPattern(Begin + HalfSize, 2, End, ++ Mask.size() + HalfSize + LeftSize, 1)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern(Begin + 1, 2, End - HalfSize, HalfSize - LeftSize, ++ 1) && ++ fitsRegularPattern(Begin + 1 + HalfSize, 2, End, HalfSize + LeftSize, ++ 1)) ++ V2 = OriV1; ++ else if (fitsRegularPattern(Begin + 1, 2, End - HalfSize, ++ Mask.size() + HalfSize - LeftSize, 1) && ++ fitsRegularPattern(Begin + 1 + HalfSize, 2, End, ++ Mask.size() + HalfSize + LeftSize, 1)) ++ V2 = OriV2; ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VILVH, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVILVL (if possible). ++static SDValue lowerVECTOR_SHUFFLE_XVILVL(const SDLoc &DL, ArrayRef Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &End = Mask.end(); ++ unsigned HalfSize = Mask.size() / 2; ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern(Begin, 2, End - HalfSize, 0, 1) && ++ fitsRegularPattern(Begin + HalfSize, 2, End, HalfSize, 1)) ++ V1 = OriV1; ++ else if (fitsRegularPattern(Begin, 2, End - HalfSize, Mask.size(), 1) && ++ fitsRegularPattern(Begin + HalfSize, 2, End, ++ Mask.size() + HalfSize, 1)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern(Begin + 1, 2, End - HalfSize, 0, 1) && ++ fitsRegularPattern(Begin + 1 + HalfSize, 2, End, HalfSize, 1)) ++ V2 = OriV1; ++ else if (fitsRegularPattern(Begin + 1, 2, End - HalfSize, Mask.size(), ++ 1) && ++ fitsRegularPattern(Begin + 1 + HalfSize, 2, End, ++ Mask.size() + HalfSize, 1)) ++ V2 = OriV2; ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VILVL, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVPICKEV (if possible). 
++static SDValue lowerVECTOR_SHUFFLE_XVPICKEV(const SDLoc &DL, ArrayRef Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &LeftMid = Mask.begin() + Mask.size() / 4; ++ const auto &Mid = Mask.begin() + Mask.size() / 2; ++ const auto &RightMid = Mask.end() - Mask.size() / 4; ++ const auto &End = Mask.end(); ++ unsigned HalfSize = Mask.size() / 2; ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern(Begin, 1, LeftMid, 0, 2) && ++ fitsRegularPattern(Mid, 1, RightMid, HalfSize, 2)) ++ V1 = OriV1; ++ else if (fitsRegularPattern(Begin, 1, LeftMid, Mask.size(), 2) && ++ fitsRegularPattern(Mid, 1, RightMid, Mask.size() + HalfSize, 2)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern(LeftMid, 1, Mid, 0, 2) && ++ fitsRegularPattern(RightMid, 1, End, HalfSize, 2)) ++ V2 = OriV1; ++ else if (fitsRegularPattern(LeftMid, 1, Mid, Mask.size(), 2) && ++ fitsRegularPattern(RightMid, 1, End, Mask.size() + HalfSize, 2)) ++ V2 = OriV2; ++ ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VPICKEV, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVPICKOD (if possible). ++static SDValue lowerVECTOR_SHUFFLE_XVPICKOD(const SDLoc &DL, ArrayRef Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ const auto &Begin = Mask.begin(); ++ const auto &LeftMid = Mask.begin() + Mask.size() / 4; ++ const auto &Mid = Mask.begin() + Mask.size() / 2; ++ const auto &RightMid = Mask.end() - Mask.size() / 4; ++ const auto &End = Mask.end(); ++ unsigned HalfSize = Mask.size() / 2; ++ SDValue OriV1 = V1, OriV2 = V2; ++ ++ if (fitsRegularPattern(Begin, 1, LeftMid, 1, 2) && ++ fitsRegularPattern(Mid, 1, RightMid, HalfSize + 1, 2)) ++ V1 = OriV1; ++ else if (fitsRegularPattern(Begin, 1, LeftMid, Mask.size() + 1, 2) && ++ fitsRegularPattern(Mid, 1, RightMid, Mask.size() + HalfSize + 1, ++ 2)) ++ V1 = OriV2; ++ else ++ return SDValue(); ++ ++ if (fitsRegularPattern(LeftMid, 1, Mid, 1, 2) && ++ fitsRegularPattern(RightMid, 1, End, HalfSize + 1, 2)) ++ V2 = OriV1; ++ else if (fitsRegularPattern(LeftMid, 1, Mid, Mask.size() + 1, 2) && ++ fitsRegularPattern(RightMid, 1, End, Mask.size() + HalfSize + 1, ++ 2)) ++ V2 = OriV2; ++ else ++ return SDValue(); ++ ++ return DAG.getNode(LoongArchISD::VPICKOD, DL, VT, V2, V1); ++} ++ ++/// Lower VECTOR_SHUFFLE into XVSHUF (if possible). ++static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef Mask, ++ MVT VT, SDValue V1, SDValue V2, ++ SelectionDAG &DAG) { ++ ++ int MaskSize = Mask.size(); ++ int HalfSize = Mask.size() / 2; ++ const auto &Begin = Mask.begin(); ++ const auto &Mid = Mask.begin() + HalfSize; ++ const auto &End = Mask.end(); ++ ++ // VECTOR_SHUFFLE concatenates the vectors: ++ // <0, 1, 2, 3, 4, 5, 6, 7> + <8, 9, 10, 11, 12, 13, 14, 15> ++ // shuffling -> ++ // <0, 1, 2, 3, 8, 9, 10, 11> <4, 5, 6, 7, 12, 13, 14, 15> ++ // ++ // XVSHUF concatenates the vectors: ++ // + ++ // shuffling -> ++ // + ++ SmallVector MaskAlloc; ++ for (auto it = Begin; it < Mid; it++) { ++ if (*it < 0) // UNDEF ++ MaskAlloc.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); ++ else if ((*it >= 0 && *it < HalfSize) || ++ (*it >= MaskSize && *it <= MaskSize + HalfSize)) { ++ int M = *it < HalfSize ? 
*it : *it - HalfSize; ++ MaskAlloc.push_back(DAG.getTargetConstant(M, DL, MVT::i64)); ++ } else ++ return SDValue(); ++ } ++ assert((int)MaskAlloc.size() == HalfSize && "xvshuf convert failed!"); ++ ++ for (auto it = Mid; it < End; it++) { ++ if (*it < 0) // UNDEF ++ MaskAlloc.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); ++ else if ((*it >= HalfSize && *it < MaskSize) || ++ (*it >= MaskSize + HalfSize && *it < MaskSize * 2)) { ++ int M = *it < MaskSize ? *it - HalfSize : *it - MaskSize; ++ MaskAlloc.push_back(DAG.getTargetConstant(M, DL, MVT::i64)); ++ } else ++ return SDValue(); ++ } ++ assert((int)MaskAlloc.size() == MaskSize && "xvshuf convert failed!"); ++ ++ EVT MaskVecTy = VT.changeVectorElementTypeToInteger(); ++ SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, MaskAlloc); ++ return DAG.getNode(LoongArchISD::VSHUF, DL, VT, MaskVec, V2, V1); ++} ++ ++/// Shuffle vectors by lane to generate more optimized instructions. ++/// 256-bit shuffles are always considered as 2-lane 128-bit shuffles. ++/// ++/// Therefore, except for the following four cases, other cases are regarded ++/// as cross-lane shuffles, where optimization is relatively limited. ++/// ++/// - Shuffle high, low lanes of two inputs vector ++/// <0, 1, 2, 3> + <4, 5, 6, 7> --- <0, 5, 3, 6> ++/// - Shuffle low, high lanes of two inputs vector ++/// <0, 1, 2, 3> + <4, 5, 6, 7> --- <3, 6, 0, 5> ++/// - Shuffle low, low lanes of two inputs vector ++/// <0, 1, 2, 3> + <4, 5, 6, 7> --- <3, 6, 3, 6> ++/// - Shuffle high, high lanes of two inputs vector ++/// <0, 1, 2, 3> + <4, 5, 6, 7> --- <0, 5, 0, 5> ++/// ++/// The first case is the closest to LoongArch instructions and the other ++/// cases need to be converted to it for processing. ++/// ++/// This function may modify V1, V2 and Mask ++static void canonicalizeShuffleVectorByLane(const SDLoc &DL, ++ MutableArrayRef Mask, MVT VT, ++ SDValue &V1, SDValue &V2, ++ SelectionDAG &DAG) { ++ ++ enum HalfMaskType { HighLaneTy, LowLaneTy, None }; ++ ++ int MaskSize = Mask.size(); ++ int HalfSize = Mask.size() / 2; ++ ++ HalfMaskType preMask = None, postMask = None; ++ ++ if (std::all_of(Mask.begin(), Mask.begin() + HalfSize, [&](int M) { ++ return M < 0 || (M >= 0 && M < HalfSize) || ++ (M >= MaskSize && M < MaskSize + HalfSize); ++ })) ++ preMask = HighLaneTy; ++ else if (std::all_of(Mask.begin(), Mask.begin() + HalfSize, [&](int M) { ++ return M < 0 || (M >= HalfSize && M < MaskSize) || ++ (M >= MaskSize + HalfSize && M < MaskSize * 2); ++ })) ++ preMask = LowLaneTy; ++ ++ if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) { ++ return M < 0 || (M >= 0 && M < HalfSize) || ++ (M >= MaskSize && M < MaskSize + HalfSize); ++ })) ++ postMask = HighLaneTy; ++ else if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) { ++ return M < 0 || (M >= HalfSize && M < MaskSize) || ++ (M >= MaskSize + HalfSize && M < MaskSize * 2); ++ })) ++ postMask = LowLaneTy; ++ ++ // The pre-half of mask is high lane type, and the post-half of mask ++ // is low lane type, which is closest to the LoongArch instructions. ++ // ++ // Note: In the LoongArch architecture, the high lane of mask corresponds ++ // to the lower 128-bit of vector register, and the low lane of mask ++ // corresponds the higher 128-bit of vector register. 
++ if (preMask == HighLaneTy && postMask == LowLaneTy) { ++ return; ++ } ++ if (preMask == LowLaneTy && postMask == HighLaneTy) { ++ V1 = DAG.getBitcast(MVT::v4i64, V1); ++ V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1, ++ DAG.getConstant(0b01001110, DL, MVT::i64)); ++ V1 = DAG.getBitcast(VT, V1); ++ ++ if (!V2.isUndef()) { ++ V2 = DAG.getBitcast(MVT::v4i64, V2); ++ V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2, ++ DAG.getConstant(0b01001110, DL, MVT::i64)); ++ V2 = DAG.getBitcast(VT, V2); ++ } ++ ++ for (auto it = Mask.begin(); it < Mask.begin() + HalfSize; it++) { ++ *it = *it < 0 ? *it : *it - HalfSize; ++ } ++ for (auto it = Mask.begin() + HalfSize; it < Mask.end(); it++) { ++ *it = *it < 0 ? *it : *it + HalfSize; ++ } ++ } else if (preMask == LowLaneTy && postMask == LowLaneTy) { ++ V1 = DAG.getBitcast(MVT::v4i64, V1); ++ V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1, ++ DAG.getConstant(0b11101110, DL, MVT::i64)); ++ V1 = DAG.getBitcast(VT, V1); ++ ++ if (!V2.isUndef()) { ++ V2 = DAG.getBitcast(MVT::v4i64, V2); ++ V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2, ++ DAG.getConstant(0b11101110, DL, MVT::i64)); ++ V2 = DAG.getBitcast(VT, V2); ++ } ++ ++ for (auto it = Mask.begin(); it < Mask.begin() + HalfSize; it++) { ++ *it = *it < 0 ? *it : *it - HalfSize; ++ } ++ } else if (preMask == HighLaneTy && postMask == HighLaneTy) { ++ V1 = DAG.getBitcast(MVT::v4i64, V1); ++ V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1, ++ DAG.getConstant(0b01000100, DL, MVT::i64)); ++ V1 = DAG.getBitcast(VT, V1); ++ ++ if (!V2.isUndef()) { ++ V2 = DAG.getBitcast(MVT::v4i64, V2); ++ V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2, ++ DAG.getConstant(0b01000100, DL, MVT::i64)); ++ V2 = DAG.getBitcast(VT, V2); ++ } ++ ++ for (auto it = Mask.begin() + HalfSize; it < Mask.end(); it++) { ++ *it = *it < 0 ? *it : *it + HalfSize; ++ } ++ } else { // cross-lane ++ return; ++ } ++} ++ ++/// Dispatching routine to lower various 256-bit LoongArch vector shuffles. ++/// ++/// This routine breaks down the specific type of 256-bit shuffle and ++/// dispatches to the lowering routines accordingly. ++static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, ++ SDValue V1, SDValue V2, SelectionDAG &DAG) { ++ assert((VT.SimpleTy == MVT::v32i8 || VT.SimpleTy == MVT::v16i16 || ++ VT.SimpleTy == MVT::v8i32 || VT.SimpleTy == MVT::v4i64 || ++ VT.SimpleTy == MVT::v8f32 || VT.SimpleTy == MVT::v4f64) && ++ "Vector type is unsupported for lasx!"); ++ assert(V1.getSimpleValueType() == V2.getSimpleValueType() && ++ "Two operands have different types!"); ++ assert(VT.getVectorNumElements() == Mask.size() && ++ "Unexpected mask size for shuffle!"); ++ assert(Mask.size() % 2 == 0 && "Expected even mask size."); ++ assert(Mask.size() >= 4 && "Mask size is less than 4."); ++ ++ // canonicalize non cross-lane shuffle vector ++ SmallVector NewMask(Mask); ++ canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG); ++ ++ SDValue Result; ++ // TODO: Add more comparison patterns. ++ if (V2.isUndef()) { ++ if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ ++ // TODO: This comment may be enabled in the future to better match the ++ // pattern for instruction selection. ++ /* V2 = V1; */ ++ } ++ ++ // It is recommended not to change the pattern comparison order for better ++ // performance. 
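++ // Added annotation (not from the original patch), assuming the XVPACKEV
++ // matcher uses the same even/odd regular-pattern check as the XVPICKEV and
++ // XVPICKOD routines above: for a v8i32 shuffle of two live inputs with mask
++ // <0, 8, 2, 10, 4, 12, 6, 14>, lane canonicalization leaves the mask
++ // untouched (pre-half is HighLaneTy, post-half is LowLaneTy), the XVPACKEV
++ // routine below accepts it (even result elements come from V1 at stride 2
++ // starting at 0, odd ones from V2 starting at MaskSize), and the whole
++ // shuffle lowers to a single xvpackev.w instead of reaching the generic
++ // XVSHUF fallback.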
++ if ((Result = lowerVECTOR_SHUFFLE_XVPACKEV(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_XVPACKOD(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_XVILVH(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_XVILVL(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_XVPICKEV(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ if ((Result = lowerVECTOR_SHUFFLE_XVSHUF(DL, NewMask, VT, V1, V2, DAG))) ++ return Result; ++ ++ return SDValue(); ++} ++ + SDValue LoongArchTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { +- // TODO: custom shuffle. ++ ShuffleVectorSDNode *SVOp = cast(Op); ++ ArrayRef OrigMask = SVOp->getMask(); ++ SDValue V1 = Op.getOperand(0); ++ SDValue V2 = Op.getOperand(1); ++ MVT VT = Op.getSimpleValueType(); ++ int NumElements = VT.getVectorNumElements(); ++ SDLoc DL(Op); ++ ++ bool V1IsUndef = V1.isUndef(); ++ bool V2IsUndef = V2.isUndef(); ++ if (V1IsUndef && V2IsUndef) ++ return DAG.getUNDEF(VT); ++ ++ // When we create a shuffle node we put the UNDEF node to second operand, ++ // but in some cases the first operand may be transformed to UNDEF. ++ // In this case we should just commute the node. ++ if (V1IsUndef) ++ return DAG.getCommutedVectorShuffle(*SVOp); ++ ++ // Check for non-undef masks pointing at an undef vector and make the masks ++ // undef as well. This makes it easier to match the shuffle based solely on ++ // the mask. ++ if (V2IsUndef && ++ any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) { ++ SmallVector NewMask(OrigMask); ++ for (int &M : NewMask) ++ if (M >= NumElements) ++ M = -1; ++ return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); ++ } ++ ++ // Check for illegal shuffle mask element index values. ++ int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2); ++ (void)MaskUpperLimit; ++ assert(llvm::all_of(OrigMask, ++ [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && ++ "Out of bounds shuffle index"); ++ ++ // For each vector width, delegate to a specialized lowering routine. 
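++ // Added annotation (not from the original patch): for instance, a v4i32
++ // shuffle whose second operand is undef and whose mask is <0, 5, 1, 4> is
++ // rebuilt above as <0, -1, 1, -1>, so by the time the per-width routines
++ // below run, every index is either in range for the first operand or an
++ // explicit undef (-1).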
++ if (VT.is128BitVector()) ++ return lower128BitShuffle(DL, OrigMask, VT, V1, V2, DAG); ++ ++ if (VT.is256BitVector()) ++ return lower256BitShuffle(DL, OrigMask, VT, V1, V2, DAG); ++ + return SDValue(); + } + +@@ -3439,6 +4356,16 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { + NODE_NAME_CASE(MOVFCSR2GR) + NODE_NAME_CASE(CACOP_D) + NODE_NAME_CASE(CACOP_W) ++ NODE_NAME_CASE(VSHUF) ++ NODE_NAME_CASE(VPICKEV) ++ NODE_NAME_CASE(VPICKOD) ++ NODE_NAME_CASE(VPACKEV) ++ NODE_NAME_CASE(VPACKOD) ++ NODE_NAME_CASE(VILVL) ++ NODE_NAME_CASE(VILVH) ++ NODE_NAME_CASE(VSHUF4I) ++ NODE_NAME_CASE(VREPLVEI) ++ NODE_NAME_CASE(XVPERMI) + NODE_NAME_CASE(VPICK_SEXT_ELT) + NODE_NAME_CASE(VPICK_ZEXT_ELT) + NODE_NAME_CASE(VREPLVE) +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +index a2ed149f4bb7..a5ee740c1261 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +@@ -117,6 +117,16 @@ enum NodeType : unsigned { + + // Vector Shuffle + VREPLVE, ++ VSHUF, ++ VPICKEV, ++ VPICKOD, ++ VPACKEV, ++ VPACKOD, ++ VILVL, ++ VILVH, ++ VSHUF4I, ++ VREPLVEI, ++ XVPERMI, + + // Extended vector element extraction + VPICK_SEXT_ELT, +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index 492b62da6ce7..5b6721cdf1b4 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -10,6 +10,8 @@ + // + //===----------------------------------------------------------------------===// + ++def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_loongArchV1RUimm>; ++ + def lasxsplati8 + : PatFrag<(ops node:$e0), + (v32i8 (build_vector node:$e0, node:$e0, node:$e0, node:$e0, +@@ -1571,6 +1573,134 @@ def : Pat<(loongarch_vreplve v8i32:$xj, GRLenVT:$rk), + def : Pat<(loongarch_vreplve v4i64:$xj, GRLenVT:$rk), + (XVREPLVE_D v4i64:$xj, GRLenVT:$rk)>; + ++// XVSHUF_{B/H/W/D} ++def : Pat<(loongarch_vshuf v32i8:$xa, v32i8:$xj, v32i8:$xk), ++ (XVSHUF_B v32i8:$xj, v32i8:$xk, v32i8:$xa)>; ++def : Pat<(loongarch_vshuf v16i16:$xd, v16i16:$xj, v16i16:$xk), ++ (XVSHUF_H v16i16:$xd, v16i16:$xj, v16i16:$xk)>; ++def : Pat<(loongarch_vshuf v8i32:$xd, v8i32:$xj, v8i32:$xk), ++ (XVSHUF_W v8i32:$xd, v8i32:$xj, v8i32:$xk)>; ++def : Pat<(loongarch_vshuf v4i64:$xd, v4i64:$xj, v4i64:$xk), ++ (XVSHUF_D v4i64:$xd, v4i64:$xj, v4i64:$xk)>; ++def : Pat<(loongarch_vshuf v8i32:$xd, v8f32:$xj, v8f32:$xk), ++ (XVSHUF_W v8i32:$xd, v8f32:$xj, v8f32:$xk)>; ++def : Pat<(loongarch_vshuf v4i64:$xd, v4f64:$xj, v4f64:$xk), ++ (XVSHUF_D v4i64:$xd, v4f64:$xj, v4f64:$xk)>; ++ ++// XVPICKEV_{B/H/W/D} ++def : Pat<(loongarch_vpickev v32i8:$xj, v32i8:$xk), ++ (XVPICKEV_B v32i8:$xj, v32i8:$xk)>; ++def : Pat<(loongarch_vpickev v16i16:$xj, v16i16:$xk), ++ (XVPICKEV_H v16i16:$xj, v16i16:$xk)>; ++def : Pat<(loongarch_vpickev v8i32:$xj, v8i32:$xk), ++ (XVPICKEV_W v8i32:$xj, v8i32:$xk)>; ++def : Pat<(loongarch_vpickev v4i64:$xj, v4i64:$xk), ++ (XVPICKEV_D v4i64:$xj, v4i64:$xk)>; ++def : Pat<(loongarch_vpickev v8f32:$xj, v8f32:$xk), ++ (XVPICKEV_W v8f32:$xj, v8f32:$xk)>; ++def : Pat<(loongarch_vpickev v4f64:$xj, v4f64:$xk), ++ (XVPICKEV_D v4f64:$xj, v4f64:$xk)>; ++ ++// XVPICKOD_{B/H/W/D} ++def : Pat<(loongarch_vpickod v32i8:$xj, v32i8:$xk), ++ (XVPICKOD_B v32i8:$xj, v32i8:$xk)>; ++def : Pat<(loongarch_vpickod v16i16:$xj, v16i16:$xk), ++ (XVPICKOD_H v16i16:$xj, v16i16:$xk)>; ++def : 
Pat<(loongarch_vpickod v8i32:$xj, v8i32:$xk), ++ (XVPICKOD_W v8i32:$xj, v8i32:$xk)>; ++def : Pat<(loongarch_vpickod v4i64:$xj, v4i64:$xk), ++ (XVPICKOD_D v4i64:$xj, v4i64:$xk)>; ++def : Pat<(loongarch_vpickod v8f32:$xj, v8f32:$xk), ++ (XVPICKOD_W v8f32:$xj, v8f32:$xk)>; ++def : Pat<(loongarch_vpickod v4f64:$xj, v4f64:$xk), ++ (XVPICKOD_D v4f64:$xj, v4f64:$xk)>; ++ ++// XVPACKEV_{B/H/W/D} ++def : Pat<(loongarch_vpackev v32i8:$xj, v32i8:$xk), ++ (XVPACKEV_B v32i8:$xj, v32i8:$xk)>; ++def : Pat<(loongarch_vpackev v16i16:$xj, v16i16:$xk), ++ (XVPACKEV_H v16i16:$xj, v16i16:$xk)>; ++def : Pat<(loongarch_vpackev v8i32:$xj, v8i32:$xk), ++ (XVPACKEV_W v8i32:$xj, v8i32:$xk)>; ++def : Pat<(loongarch_vpackev v4i64:$xj, v4i64:$xk), ++ (XVPACKEV_D v4i64:$xj, v4i64:$xk)>; ++def : Pat<(loongarch_vpackev v8f32:$xj, v8f32:$xk), ++ (XVPACKEV_W v8f32:$xj, v8f32:$xk)>; ++def : Pat<(loongarch_vpackev v4f64:$xj, v4f64:$xk), ++ (XVPACKEV_D v4f64:$xj, v4f64:$xk)>; ++ ++// XVPACKOD_{B/H/W/D} ++def : Pat<(loongarch_vpackod v32i8:$xj, v32i8:$xk), ++ (XVPACKOD_B v32i8:$xj, v32i8:$xk)>; ++def : Pat<(loongarch_vpackod v16i16:$xj, v16i16:$xk), ++ (XVPACKOD_H v16i16:$xj, v16i16:$xk)>; ++def : Pat<(loongarch_vpackod v8i32:$xj, v8i32:$xk), ++ (XVPACKOD_W v8i32:$xj, v8i32:$xk)>; ++def : Pat<(loongarch_vpackod v4i64:$xj, v4i64:$xk), ++ (XVPACKOD_D v4i64:$xj, v4i64:$xk)>; ++def : Pat<(loongarch_vpackod v8f32:$xj, v8f32:$xk), ++ (XVPACKOD_W v8f32:$xj, v8f32:$xk)>; ++def : Pat<(loongarch_vpackod v4f64:$xj, v4f64:$xk), ++ (XVPACKOD_D v4f64:$xj, v4f64:$xk)>; ++ ++// XVILVL_{B/H/W/D} ++def : Pat<(loongarch_vilvl v32i8:$xj, v32i8:$xk), ++ (XVILVL_B v32i8:$xj, v32i8:$xk)>; ++def : Pat<(loongarch_vilvl v16i16:$xj, v16i16:$xk), ++ (XVILVL_H v16i16:$xj, v16i16:$xk)>; ++def : Pat<(loongarch_vilvl v8i32:$xj, v8i32:$xk), ++ (XVILVL_W v8i32:$xj, v8i32:$xk)>; ++def : Pat<(loongarch_vilvl v4i64:$xj, v4i64:$xk), ++ (XVILVL_D v4i64:$xj, v4i64:$xk)>; ++def : Pat<(loongarch_vilvl v8f32:$xj, v8f32:$xk), ++ (XVILVL_W v8f32:$xj, v8f32:$xk)>; ++def : Pat<(loongarch_vilvl v4f64:$xj, v4f64:$xk), ++ (XVILVL_D v4f64:$xj, v4f64:$xk)>; ++ ++// XVILVH_{B/H/W/D} ++def : Pat<(loongarch_vilvh v32i8:$xj, v32i8:$xk), ++ (XVILVH_B v32i8:$xj, v32i8:$xk)>; ++def : Pat<(loongarch_vilvh v16i16:$xj, v16i16:$xk), ++ (XVILVH_H v16i16:$xj, v16i16:$xk)>; ++def : Pat<(loongarch_vilvh v8i32:$xj, v8i32:$xk), ++ (XVILVH_W v8i32:$xj, v8i32:$xk)>; ++def : Pat<(loongarch_vilvh v4i64:$xj, v4i64:$xk), ++ (XVILVH_D v4i64:$xj, v4i64:$xk)>; ++def : Pat<(loongarch_vilvh v8f32:$xj, v8f32:$xk), ++ (XVILVH_W v8f32:$xj, v8f32:$xk)>; ++def : Pat<(loongarch_vilvh v4f64:$xj, v4f64:$xk), ++ (XVILVH_D v4f64:$xj, v4f64:$xk)>; ++ ++// XVSHUF4I_{B/H/W} ++def : Pat<(loongarch_vshuf4i v32i8:$xj, immZExt8:$ui8), ++ (XVSHUF4I_B v32i8:$xj, immZExt8:$ui8)>; ++def : Pat<(loongarch_vshuf4i v16i16:$xj, immZExt8:$ui8), ++ (XVSHUF4I_H v16i16:$xj, immZExt8:$ui8)>; ++def : Pat<(loongarch_vshuf4i v8i32:$xj, immZExt8:$ui8), ++ (XVSHUF4I_W v8i32:$xj, immZExt8:$ui8)>; ++def : Pat<(loongarch_vshuf4i v8f32:$xj, immZExt8:$ui8), ++ (XVSHUF4I_W v8f32:$xj, immZExt8:$ui8)>; ++ ++// XVREPL128VEI_{B/H/W/D} ++def : Pat<(loongarch_vreplvei v32i8:$xj, immZExt4:$ui4), ++ (XVREPL128VEI_B v32i8:$xj, immZExt4:$ui4)>; ++def : Pat<(loongarch_vreplvei v16i16:$xj, immZExt3:$ui3), ++ (XVREPL128VEI_H v16i16:$xj, immZExt3:$ui3)>; ++def : Pat<(loongarch_vreplvei v8i32:$xj, immZExt2:$ui2), ++ (XVREPL128VEI_W v8i32:$xj, immZExt2:$ui2)>; ++def : Pat<(loongarch_vreplvei v4i64:$xj, immZExt1:$ui1), ++ (XVREPL128VEI_D v4i64:$xj, 
immZExt1:$ui1)>; ++def : Pat<(loongarch_vreplvei v8f32:$xj, immZExt2:$ui2), ++ (XVREPL128VEI_W v8f32:$xj, immZExt2:$ui2)>; ++def : Pat<(loongarch_vreplvei v4f64:$xj, immZExt1:$ui1), ++ (XVREPL128VEI_D v4f64:$xj, immZExt1:$ui1)>; ++ ++// XVPERMI_D ++def : Pat<(loongarch_xvpermi v4i64:$xj, immZExt8: $ui8), ++ (XVPERMI_D v4i64:$xj, immZExt8: $ui8)>; ++def : Pat<(loongarch_xvpermi v4f64:$xj, immZExt8: $ui8), ++ (XVPERMI_D v4f64:$xj, immZExt8: $ui8)>; ++ + // XVREPLVE0_{W/D} + def : Pat<(lasxsplatf32 FPR32:$fj), + (XVREPLVE0_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32))>; +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index 99ac2f3c162f..3519fa3142c3 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -15,6 +15,15 @@ def SDT_LoongArchVreplve : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>, + SDTCisSameAs<0, 1>, SDTCisInt<2>]>; + def SDT_LoongArchVecCond : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>]>; + ++def SDT_LoongArchVShuf : SDTypeProfile<1, 3, [SDTCisVec<0>, ++ SDTCisInt<1>, SDTCisVec<1>, ++ SDTCisSameAs<0, 2>, ++ SDTCisSameAs<2, 3>]>; ++def SDT_LoongArchV2R : SDTypeProfile<1, 2, [SDTCisVec<0>, ++ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>; ++def SDT_loongArchV1RUimm: SDTypeProfile<1, 2, [SDTCisVec<0>, ++ SDTCisSameAs<0,1>, SDTCisVT<2, i64>]>; ++ + // Target nodes. + def loongarch_vreplve : SDNode<"LoongArchISD::VREPLVE", SDT_LoongArchVreplve>; + def loongarch_vall_nonzero : SDNode<"LoongArchISD::VALL_NONZERO", +@@ -31,6 +40,23 @@ def loongarch_vpick_sext_elt : SDNode<"LoongArchISD::VPICK_SEXT_ELT", + def loongarch_vpick_zext_elt : SDNode<"LoongArchISD::VPICK_ZEXT_ELT", + SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>>; + ++def loongarch_vshuf: SDNode<"LoongArchISD::VSHUF", SDT_LoongArchVShuf>; ++def loongarch_vpickev: SDNode<"LoongArchISD::VPICKEV", SDT_LoongArchV2R>; ++def loongarch_vpickod: SDNode<"LoongArchISD::VPICKOD", SDT_LoongArchV2R>; ++def loongarch_vpackev: SDNode<"LoongArchISD::VPACKEV", SDT_LoongArchV2R>; ++def loongarch_vpackod: SDNode<"LoongArchISD::VPACKOD", SDT_LoongArchV2R>; ++def loongarch_vilvl: SDNode<"LoongArchISD::VILVL", SDT_LoongArchV2R>; ++def loongarch_vilvh: SDNode<"LoongArchISD::VILVH", SDT_LoongArchV2R>; ++ ++def loongarch_vshuf4i: SDNode<"LoongArchISD::VSHUF4I", SDT_loongArchV1RUimm>; ++def loongarch_vreplvei: SDNode<"LoongArchISD::VREPLVEI", SDT_loongArchV1RUimm>; ++ ++def immZExt1 : ImmLeaf(Imm);}]>; ++def immZExt2 : ImmLeaf(Imm);}]>; ++def immZExt3 : ImmLeaf(Imm);}]>; ++def immZExt4 : ImmLeaf(Imm);}]>; ++def immZExt8 : ImmLeaf(Imm);}]>; ++ + class VecCond + : Pseudo<(outs GPR:$rd), (ins RC:$vj), +@@ -1678,6 +1704,128 @@ def : Pat<(loongarch_vreplve v4i32:$vj, GRLenVT:$rk), + def : Pat<(loongarch_vreplve v2i64:$vj, GRLenVT:$rk), + (VREPLVE_D v2i64:$vj, GRLenVT:$rk)>; + ++// VSHUF_{B/H/W/D} ++def : Pat<(loongarch_vshuf v16i8:$va, v16i8:$vj, v16i8:$vk), ++ (VSHUF_B v16i8:$vj, v16i8:$vk, v16i8:$va)>; ++def : Pat<(loongarch_vshuf v8i16:$vd, v8i16:$vj, v8i16:$vk), ++ (VSHUF_H v8i16:$vd, v8i16:$vj, v8i16:$vk)>; ++def : Pat<(loongarch_vshuf v4i32:$vd, v4i32:$vj, v4i32:$vk), ++ (VSHUF_W v4i32:$vd, v4i32:$vj, v4i32:$vk)>; ++def : Pat<(loongarch_vshuf v2i64:$vd, v2i64:$vj, v2i64:$vk), ++ (VSHUF_D v2i64:$vd, v2i64:$vj, v2i64:$vk)>; ++def : Pat<(loongarch_vshuf v4i32:$vd, v4f32:$vj, v4f32:$vk), ++ (VSHUF_W v4i32:$vd, v4f32:$vj, v4f32:$vk)>; ++def : Pat<(loongarch_vshuf v2i64:$vd, v2f64:$vj, v2f64:$vk), ++ (VSHUF_D v2i64:$vd, 
v2f64:$vj, v2f64:$vk)>; ++ ++// VPICKEV_{B/H/W/D} ++def : Pat<(loongarch_vpickev v16i8:$vj, v16i8:$vk), ++ (VPICKEV_B v16i8:$vj, v16i8:$vk)>; ++def : Pat<(loongarch_vpickev v8i16:$vj, v8i16:$vk), ++ (VPICKEV_H v8i16:$vj, v8i16:$vk)>; ++def : Pat<(loongarch_vpickev v4i32:$vj, v4i32:$vk), ++ (VPICKEV_W v4i32:$vj, v4i32:$vk)>; ++def : Pat<(loongarch_vpickev v2i64:$vj, v2i64:$vk), ++ (VPICKEV_D v2i64:$vj, v2i64:$vk)>; ++def : Pat<(loongarch_vpickev v4f32:$vj, v4f32:$vk), ++ (VPICKEV_W v4f32:$vj, v4f32:$vk)>; ++def : Pat<(loongarch_vpickev v2f64:$vj, v2f64:$vk), ++ (VPICKEV_D v2f64:$vj, v2f64:$vk)>; ++ ++// VPICKOD_{B/H/W/D} ++def : Pat<(loongarch_vpickod v16i8:$vj, v16i8:$vk), ++ (VPICKOD_B v16i8:$vj, v16i8:$vk)>; ++def : Pat<(loongarch_vpickod v8i16:$vj, v8i16:$vk), ++ (VPICKOD_H v8i16:$vj, v8i16:$vk)>; ++def : Pat<(loongarch_vpickod v4i32:$vj, v4i32:$vk), ++ (VPICKOD_W v4i32:$vj, v4i32:$vk)>; ++def : Pat<(loongarch_vpickod v2i64:$vj, v2i64:$vk), ++ (VPICKOD_D v2i64:$vj, v2i64:$vk)>; ++def : Pat<(loongarch_vpickod v4f32:$vj, v4f32:$vk), ++ (VPICKOD_W v4f32:$vj, v4f32:$vk)>; ++def : Pat<(loongarch_vpickod v2f64:$vj, v2f64:$vk), ++ (VPICKOD_D v2f64:$vj, v2f64:$vk)>; ++ ++// VPACKEV_{B/H/W/D} ++def : Pat<(loongarch_vpackev v16i8:$vj, v16i8:$vk), ++ (VPACKEV_B v16i8:$vj, v16i8:$vk)>; ++def : Pat<(loongarch_vpackev v8i16:$vj, v8i16:$vk), ++ (VPACKEV_H v8i16:$vj, v8i16:$vk)>; ++def : Pat<(loongarch_vpackev v4i32:$vj, v4i32:$vk), ++ (VPACKEV_W v4i32:$vj, v4i32:$vk)>; ++def : Pat<(loongarch_vpackev v2i64:$vj, v2i64:$vk), ++ (VPACKEV_D v2i64:$vj, v2i64:$vk)>; ++def : Pat<(loongarch_vpackev v4f32:$vj, v4f32:$vk), ++ (VPACKEV_W v4f32:$vj, v4f32:$vk)>; ++def : Pat<(loongarch_vpackev v2f64:$vj, v2f64:$vk), ++ (VPACKEV_D v2f64:$vj, v2f64:$vk)>; ++ ++// VPACKOD_{B/H/W/D} ++def : Pat<(loongarch_vpackod v16i8:$vj, v16i8:$vk), ++ (VPACKOD_B v16i8:$vj, v16i8:$vk)>; ++def : Pat<(loongarch_vpackod v8i16:$vj, v8i16:$vk), ++ (VPACKOD_H v8i16:$vj, v8i16:$vk)>; ++def : Pat<(loongarch_vpackod v4i32:$vj, v4i32:$vk), ++ (VPACKOD_W v4i32:$vj, v4i32:$vk)>; ++def : Pat<(loongarch_vpackod v2i64:$vj, v2i64:$vk), ++ (VPACKOD_D v2i64:$vj, v2i64:$vk)>; ++def : Pat<(loongarch_vpackod v4f32:$vj, v4f32:$vk), ++ (VPACKOD_W v4f32:$vj, v4f32:$vk)>; ++def : Pat<(loongarch_vpackod v2f64:$vj, v2f64:$vk), ++ (VPACKOD_D v2f64:$vj, v2f64:$vk)>; ++ ++// VILVL_{B/H/W/D} ++def : Pat<(loongarch_vilvl v16i8:$vj, v16i8:$vk), ++ (VILVL_B v16i8:$vj, v16i8:$vk)>; ++def : Pat<(loongarch_vilvl v8i16:$vj, v8i16:$vk), ++ (VILVL_H v8i16:$vj, v8i16:$vk)>; ++def : Pat<(loongarch_vilvl v4i32:$vj, v4i32:$vk), ++ (VILVL_W v4i32:$vj, v4i32:$vk)>; ++def : Pat<(loongarch_vilvl v2i64:$vj, v2i64:$vk), ++ (VILVL_D v2i64:$vj, v2i64:$vk)>; ++def : Pat<(loongarch_vilvl v4f32:$vj, v4f32:$vk), ++ (VILVL_W v4f32:$vj, v4f32:$vk)>; ++def : Pat<(loongarch_vilvl v2f64:$vj, v2f64:$vk), ++ (VILVL_D v2f64:$vj, v2f64:$vk)>; ++ ++// VILVH_{B/H/W/D} ++def : Pat<(loongarch_vilvh v16i8:$vj, v16i8:$vk), ++ (VILVH_B v16i8:$vj, v16i8:$vk)>; ++def : Pat<(loongarch_vilvh v8i16:$vj, v8i16:$vk), ++ (VILVH_H v8i16:$vj, v8i16:$vk)>; ++def : Pat<(loongarch_vilvh v4i32:$vj, v4i32:$vk), ++ (VILVH_W v4i32:$vj, v4i32:$vk)>; ++def : Pat<(loongarch_vilvh v2i64:$vj, v2i64:$vk), ++ (VILVH_D v2i64:$vj, v2i64:$vk)>; ++def : Pat<(loongarch_vilvh v4f32:$vj, v4f32:$vk), ++ (VILVH_W v4f32:$vj, v4f32:$vk)>; ++def : Pat<(loongarch_vilvh v2f64:$vj, v2f64:$vk), ++ (VILVH_D v2f64:$vj, v2f64:$vk)>; ++ ++// VSHUF4I_{B/H/W} ++def : Pat<(loongarch_vshuf4i v16i8:$vj, immZExt8:$ui8), ++ (VSHUF4I_B v16i8:$vj, 
immZExt8:$ui8)>; ++def : Pat<(loongarch_vshuf4i v8i16:$vj, immZExt8:$ui8), ++ (VSHUF4I_H v8i16:$vj, immZExt8:$ui8)>; ++def : Pat<(loongarch_vshuf4i v4i32:$vj, immZExt8:$ui8), ++ (VSHUF4I_W v4i32:$vj, immZExt8:$ui8)>; ++def : Pat<(loongarch_vshuf4i v4f32:$vj, immZExt8:$ui8), ++ (VSHUF4I_W v4f32:$vj, immZExt8:$ui8)>; ++ ++// VREPLVEI_{B/H/W/D} ++def : Pat<(loongarch_vreplvei v16i8:$vj, immZExt4:$ui4), ++ (VREPLVEI_B v16i8:$vj, immZExt4:$ui4)>; ++def : Pat<(loongarch_vreplvei v8i16:$vj, immZExt3:$ui3), ++ (VREPLVEI_H v8i16:$vj, immZExt3:$ui3)>; ++def : Pat<(loongarch_vreplvei v4i32:$vj, immZExt2:$ui2), ++ (VREPLVEI_W v4i32:$vj, immZExt2:$ui2)>; ++def : Pat<(loongarch_vreplvei v2i64:$vj, immZExt1:$ui1), ++ (VREPLVEI_D v2i64:$vj, immZExt1:$ui1)>; ++def : Pat<(loongarch_vreplvei v4f32:$vj, immZExt2:$ui2), ++ (VREPLVEI_W v4f32:$vj, immZExt2:$ui2)>; ++def : Pat<(loongarch_vreplvei v2f64:$vj, immZExt1:$ui1), ++ (VREPLVEI_D v2f64:$vj, immZExt1:$ui1)>; ++ + // VREPLVEI_{W/D} + def : Pat<(lsxsplatf32 FPR32:$fj), + (VREPLVEI_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), 0)>; +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll +new file mode 100644 +index 000000000000..22ab19b9fa44 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvilv.ll +@@ -0,0 +1,74 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ++ ++;; xvilvl.b ++define <32 x i8> @shufflevector_xvilvl_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_xvilvl_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvilvl.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ++ ret <32 x i8> %c ++} ++ ++;; xvilvl.h ++define <16 x i16> @shufflevector_xvilvl_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_xvilvl_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvilvl.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ++ ret <16 x i16> %c ++} ++ ++;; xvilvl.w ++define <8 x i32> @shufflevector_xvilvl_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: shufflevector_xvilvl_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvilvl.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ++ ret <8 x i32> %c ++} ++ ++;; xvilvh.b ++define <32 x i8> @shufflevector_xvilvh_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_xvilvh_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvilvh.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ++ ret <32 x i8> %c ++} ++ ++;; xvilvh.h ++define <16 x i16> @shufflevector_xvilvh_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_xvilvh_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvilvh.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ++ ret <16 x i16> %c ++} ++ ++;; xvilvh.w ++define <8 x i32> @shufflevector_xvilvh_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: shufflevector_xvilvh_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvilvh.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ++ ret <8 x i32> %c ++} ++ ++;; xvilvh.w ++define <8 x float> @shufflevector_xvilvh_v8f32(<8 x float> %a, <8 x float> %b) { ++; CHECK-LABEL: 
shufflevector_xvilvh_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvilvh.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ++ ret <8 x float> %c ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll +new file mode 100644 +index 000000000000..2ff9af4069b9 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpack.ll +@@ -0,0 +1,124 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ++ ++;; xvpackev.b ++define <32 x i8> @shufflevector_pack_ev_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackev.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ++ ret <32 x i8> %c ++} ++ ++;; xvpackev.h ++define <16 x i16> @shufflevector_pack_ev_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackev.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ++ ret <16 x i16> %c ++} ++ ++;; xvpackev.w ++define <8 x i32> @shufflevector_pack_ev_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackev.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ++ ret <8 x i32> %c ++} ++ ++;; xvpickev.d/xvpackev.d/xvilvl.d ++define <4 x i64> @shufflevector_pack_ev_v4i64(<4 x i64> %a, <4 x i64> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v4i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackev.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ++ ret <4 x i64> %c ++} ++ ++;; xvpackev.w ++define <8 x float> @shufflevector_pack_ev_v8f32(<8 x float> %a, <8 x float> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackev.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ++ ret <8 x float> %c ++} ++ ++;; xvpickev.d/xvpackev.d/xvilvl.d ++define <4 x double> @shufflevector_pack_ev_v4f64(<4 x double> %a, <4 x double> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v4f64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackev.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ++ ret <4 x double> %c ++} ++ ++;; xvpackod.b ++define <32 x i8> @shufflevector_pack_od_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_pack_od_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackod.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ++ ret <32 x i8> %c ++} ++ ++;; xvpackod.h ++define <16 x i16> @shufflevector_pack_od_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_pack_od_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackod.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ++ ret <16 x i16> %c ++} ++ ++;; xvpackod.w ++define <8 x i32> @shufflevector_pack_od_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: shufflevector_pack_od_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackod.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x 
i32> ++ ret <8 x i32> %c ++} ++ ++;; xvpickod.d/xvpackod.d/xvilvh.d ++define <4 x i64> @shufflodector_pack_od_v4i64(<4 x i64> %a, <4 x i64> %b) { ++; CHECK-LABEL: shufflodector_pack_od_v4i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackod.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ++ ret <4 x i64> %c ++} ++ ++;; xvpackod.w ++define <8 x float> @shufflodector_pack_od_v8f32(<8 x float> %a, <8 x float> %b) { ++; CHECK-LABEL: shufflodector_pack_od_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackod.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ++ ret <8 x float> %c ++} ++ ++;; xvpickod.d/xvpackod.d/xvilvh.d ++define <4 x double> @shufflodector_pack_od_v4f64(<4 x double> %a, <4 x double> %b) { ++; CHECK-LABEL: shufflodector_pack_od_v4f64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpackod.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ++ ret <4 x double> %c ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll +new file mode 100644 +index 000000000000..294d292d1764 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvpick.ll +@@ -0,0 +1,84 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ++ ++;; xvpickev.b ++define <32 x i8> @shufflevector_pick_ev_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_pick_ev_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpickev.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ++ ret <32 x i8> %c ++} ++ ++;; xvpickev.h ++define <16 x i16> @shufflevector_pick_ev_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_pick_ev_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpickev.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ++ ret <16 x i16> %c ++} ++ ++;; xvpickev.w ++define <8 x i32> @shufflevector_pick_ev_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: shufflevector_pick_ev_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpickev.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ++ ret <8 x i32> %c ++} ++ ++;; xvpickev.w ++define <8 x float> @shufflevector_pick_ev_v8f32(<8 x float> %a, <8 x float> %b) { ++; CHECK-LABEL: shufflevector_pick_ev_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpickev.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ++ ret <8 x float> %c ++} ++ ++;; xvpickod.b ++define <32 x i8> @shufflevector_pick_od_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_pick_od_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpickod.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ++ ret <32 x i8> %c ++} ++ ++;; xvpickod.h ++define <16 x i16> @shufflevector_pick_od_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_pick_od_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpickod.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ++ ret <16 x i16> %c ++} ++ ++;; xvpickod.w ++define <8 x i32> @shufflevector_pick_od_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: 
shufflevector_pick_od_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpickod.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ++ ret <8 x i32> %c ++} ++ ++;; xvpickod.w ++define <8 x float> @shufflodector_pick_od_v8f32(<8 x float> %a, <8 x float> %b) { ++; CHECK-LABEL: shufflodector_pick_od_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpickod.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ++ ret <8 x float> %c ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll +new file mode 100644 +index 000000000000..dce1e4b777e2 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll +@@ -0,0 +1,65 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ++ ++;; xvrepl128vei.b ++define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvrepl128vei.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ++ ret <32 x i8> %c ++} ++ ++;; xvrepl128vei.h ++define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvrepl128vei.h $xr0, $xr0, 3 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ++ ret <16 x i16> %c ++} ++ ++;; xvrepl128vei.w ++define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: shufflevector_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpermi.d $xr0, $xr0, 78 ++; CHECK-NEXT: xvrepl128vei.w $xr0, $xr0, 3 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ++ ret <8 x i32> %c ++} ++ ++;; xvrepl128vei.d ++define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) { ++; CHECK-LABEL: shufflevector_v4i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvrepl128vei.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ++ ret <4 x i64> %c ++} ++ ++;; xvrepl128vei.w ++define <8 x float> @shufflevector_v8f32(<8 x float> %a, <8 x float> %b) { ++; CHECK-LABEL: shufflevector_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvrepl128vei.w $xr0, $xr0, 3 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ++ ret <8 x float> %c ++} ++ ++;; xvrepl128vei.d ++define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) { ++; CHECK-LABEL: shufflevector_v4f64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvrepl128vei.d $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ++ ret <4 x double> %c ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll +new file mode 100644 +index 000000000000..fce32647da3d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll +@@ -0,0 +1,76 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ++ ++;; xvshuf.b ++define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_v32i8: ++; CHECK: # %bb.0: ++; 
CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI0_0) ++; CHECK-NEXT: xvld $xr2, $a0, 0 ++; CHECK-NEXT: xvshuf.b $xr0, $xr1, $xr0, $xr2 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ++ ret <32 x i8> %c ++} ++ ++;; xvshuf.h ++define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78 ++; CHECK-NEXT: xvpermi.d $xr1, $xr1, 78 ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI1_0) ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvshuf.h $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ++ ret <16 x i16> %c ++} ++ ++;; xvshuf.w ++define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: shufflevector_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpermi.d $xr2, $xr0, 68 ++; CHECK-NEXT: xvpermi.d $xr1, $xr1, 68 ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_0) ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvshuf.w $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ++ ret <8 x i32> %c ++} ++ ++;; xvshuf.d ++define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) { ++; CHECK-LABEL: shufflevector_v4i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvpermi.d $xr2, $xr0, 238 ++; CHECK-NEXT: xvpermi.d $xr1, $xr1, 238 ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_0) ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvshuf.d $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ++ ret <4 x i64> %c ++} ++ ++;; xvshuf.w ++define <8 x float> @shufflevector_v8f32(<8 x float> %a, <8 x float> %b) { ++; CHECK-LABEL: shufflevector_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI4_0) ++; CHECK-NEXT: xvld $xr2, $a0, 0 ++; CHECK-NEXT: xvshuf.w $xr2, $xr1, $xr0 ++; CHECK-NEXT: xvori.b $xr0, $xr2, 0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ++ ret <8 x float> %c ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll +new file mode 100644 +index 000000000000..dc4532a7292a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll +@@ -0,0 +1,43 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ++ ++;; xxvshuf4i.b ++define <32 x i8> @shufflevector_xvshuf4i_v32i8(<32 x i8> %a, <32 x i8> %b) { ++; CHECK-LABEL: shufflevector_xvshuf4i_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvshuf4i.b $xr0, $xr0, 27 ++; CHECK-NEXT: ret ++ %c = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ++ ret <32 x i8> %c ++} ++ ++;; xvshuf4i.h ++define <16 x i16> @shufflevector_xvshuf4i_v16i16(<16 x i16> %a, <16 x i16> %b) { ++; CHECK-LABEL: shufflevector_xvshuf4i_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvshuf4i.h $xr0, $xr0, 27 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ++ ret <16 x i16> %c ++} ++ ++;; xvshuf4i.w ++define <8 x i32> @shufflevector_xvshuf4i_v8i32(<8 x i32> %a, <8 x i32> %b) { ++; CHECK-LABEL: 
shufflevector_xvshuf4i_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvshuf4i.w $xr0, $xr0, 27 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ++ ret <8 x i32> %c ++} ++ ++;; xvshuf4i.w ++define <8 x float> @shufflevector_xvshuf4i_v8f32(<8 x float> %a, <8 x float> %b) { ++; CHECK-LABEL: shufflevector_xvshuf4i_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvshuf4i.w $xr0, $xr0, 27 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ++ ret <8 x float> %c ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll +new file mode 100644 +index 000000000000..31398c6081c0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vilv.ll +@@ -0,0 +1,82 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s ++ ++;; vilvl.b ++define <16 x i8> @shufflevector_vilvl_v16i8(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: shufflevector_vilvl_v16i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vilvl.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ++ ret <16 x i8> %c ++} ++ ++;; vilvl.h ++define <8 x i16> @shufflevector_vilvl_v8i16(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: shufflevector_vilvl_v8i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vilvl.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ++ ret <8 x i16> %c ++} ++ ++;; vilvl.w ++define <4 x i32> @shufflevector_vilvl_v4i32(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: shufflevector_vilvl_v4i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vilvl.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ++ ret <4 x i32> %c ++} ++ ++;; vilvl.w ++define <4 x float> @shufflevector_vilvl_v4f32(<4 x float> %a, <4 x float> %b) { ++; CHECK-LABEL: shufflevector_vilvl_v4f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vilvl.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ++ ret <4 x float> %c ++} ++ ++;; vilvh.b ++define <16 x i8> @shufflevector_vilvh_v16i8(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: shufflevector_vilvh_v16i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vilvh.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ++ ret <16 x i8> %c ++} ++ ++;; vilvh.h ++define <8 x i16> @shufflevector_vilvh_v8i16(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: shufflevector_vilvh_v8i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vilvh.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ++ ret <8 x i16> %c ++} ++ ++;; vilvh.w ++define <4 x i32> @shufflevector_vilvh_v4i32(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: shufflevector_vilvh_v4i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vilvh.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ++ ret <4 x i32> %c ++} ++ ++;; vilvh.w ++define <4 x float> @shufflevector_vilvh_v4f32(<4 x float> %a, <4 x float> %b) { ++; CHECK-LABEL: shufflevector_vilvh_v4f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vilvh.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ++ ret <4 x float> %c ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll 
b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll +new file mode 100644 +index 000000000000..171e68306cd1 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpack.ll +@@ -0,0 +1,122 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s ++ ++;; vpackev.b ++define <16 x i8> @shufflevector_pack_ev_v16i8(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v16i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpackev.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ++ ret <16 x i8> %c ++} ++ ++;; vpackev.h ++define <8 x i16> @shufflevector_pack_ev_v8i16(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v8i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpackev.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ++ ret <8 x i16> %c ++} ++ ++;; vpackev.w ++define <4 x i32> @shufflevector_pack_ev_v4i32(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v4i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpackev.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ++ ret <4 x i32> %c ++} ++ ++;; vpickev.d/vpackev.d/vilvl.d ++define <2 x i64> @shufflevector_pack_ev_v2i64(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v2i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpackev.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ++ ret <2 x i64> %c ++} ++ ++;; vpackev.w ++define <4 x float> @shufflevector_pack_ev_v4f32(<4 x float> %a, <4 x float> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v4f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpackev.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ++ ret <4 x float> %c ++} ++ ++;; vpickev.d/vpackev.d/vilvl.d ++define <2 x double> @shufflevector_pack_ev_v2f64(<2 x double> %a, <2 x double> %b) { ++; CHECK-LABEL: shufflevector_pack_ev_v2f64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpackev.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ++ ret <2 x double> %c ++} ++ ++;; vpackod.b ++define <16 x i8> @shufflevector_pack_od_v16i8(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: shufflevector_pack_od_v16i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpackod.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ++ ret <16 x i8> %c ++} ++ ++;; vpackod.h ++define <8 x i16> @shufflevector_pack_od_v8i16(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: shufflevector_pack_od_v8i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpackod.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ++ ret <8 x i16> %c ++} ++ ++;; vpackod.w ++define <4 x i32> @shufflevector_pack_od_v4i32(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: shufflevector_pack_od_v4i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpackod.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ++ ret <4 x i32> %c ++} ++ ++;; vpickod.d/vpackod.d/vilvh.d ++define <2 x i64> @shufflodector_pack_od_v2i64(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: shufflodector_pack_od_v2i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpackod.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x 
i32> ++ ret <2 x i64> %c ++} ++ ++;; vpackod.w ++define <4 x float> @shufflodector_pack_od_v4f32(<4 x float> %a, <4 x float> %b) { ++; CHECK-LABEL: shufflodector_pack_od_v4f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpackod.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ++ ret <4 x float> %c ++} ++ ++;; vpickod.d/vpackod.d/vilvh.d ++define <2 x double> @shufflodector_pack_od_v2f64(<2 x double> %a, <2 x double> %b) { ++; CHECK-LABEL: shufflodector_pack_od_v2f64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpackod.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ++ ret <2 x double> %c ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll +new file mode 100644 +index 000000000000..ca636d942b58 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vpick.ll +@@ -0,0 +1,82 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s ++ ++;; vpickev.b ++define <16 x i8> @shufflevector_pick_ev_v16i8(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: shufflevector_pick_ev_v16i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpickev.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ++ ret <16 x i8> %c ++} ++ ++;; vpickev.h ++define <8 x i16> @shufflevector_pick_ev_v8i16(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: shufflevector_pick_ev_v8i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpickev.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ++ ret <8 x i16> %c ++} ++ ++;; vpickev.w ++define <4 x i32> @shufflevector_pick_ev_v4i32(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: shufflevector_pick_ev_v4i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpickev.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ++ ret <4 x i32> %c ++} ++ ++;; vpickev.w ++define <4 x float> @shufflevector_pick_ev_v4f32(<4 x float> %a, <4 x float> %b) { ++; CHECK-LABEL: shufflevector_pick_ev_v4f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpickev.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ++ ret <4 x float> %c ++} ++ ++;; vpickod.b ++define <16 x i8> @shufflevector_pick_od_v16i8(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: shufflevector_pick_od_v16i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpickod.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ++ ret <16 x i8> %c ++} ++ ++;; vpickod.h ++define <8 x i16> @shufflevector_pick_od_v8i16(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: shufflevector_pick_od_v8i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpickod.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ++ ret <8 x i16> %c ++} ++ ++;; vpickod.w ++define <4 x i32> @shufflevector_pick_od_v4i32(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: shufflevector_pick_od_v4i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpickod.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ++ ret <4 x i32> %c ++} ++ ++;; vpickod.w ++define <4 x float> @shufflodector_pick_od_v4f32(<4 x float> %a, <4 x float> %b) { ++; CHECK-LABEL: shufflodector_pick_od_v4f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vpickod.w 
$vr0, $vr1, $vr0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ++ ret <4 x float> %c ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll +new file mode 100644 +index 000000000000..10510786f321 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll +@@ -0,0 +1,62 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s ++ ++;; vreplvei.b ++define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: shufflevector_v16i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vreplvei.b $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ++ ret <16 x i8> %c ++} ++ ++;; vreplvei.h ++define <8 x i16> @shufflevector_v8i16(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: shufflevector_v8i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vreplvei.h $vr0, $vr1, 2 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ++ ret <8 x i16> %c ++} ++ ++;; vreplvei.w ++define <4 x i32> @shufflevector_v4i32(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: shufflevector_v4i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ++ ret <4 x i32> %c ++} ++ ++;; vreplvei.d ++define <2 x i64> @shufflevector_v2i64(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: shufflevector_v2i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vreplvei.d $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++ %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ++ ret <2 x i64> %c ++} ++ ++;; vreplvei.w ++define <4 x float> @shufflevector_v4f32(<4 x float> %a, <4 x float> %b) { ++; CHECK-LABEL: shufflevector_v4f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ++ ret <4 x float> %c ++} ++ ++;; vreplvei.d ++define <2 x double> @shufflevector_v2f64(<2 x double> %a, <2 x double> %b) { ++; CHECK-LABEL: shufflevector_v2f64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vreplvei.d $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++ %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ++ ret <2 x double> %c ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll +new file mode 100644 +index 000000000000..55800b31446b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll +@@ -0,0 +1,84 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s ++ ++define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: shufflevector_v16i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI0_0) ++; CHECK-NEXT: vld $vr2, $a0, 0 ++; CHECK-NEXT: vshuf.b $vr0, $vr1, $vr0, $vr2 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ++ ret <16 x i8> %c ++} ++ ++;; vshuf.h ++define <8 x i16> @shufflevector_v8i16(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: shufflevector_v8i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI1_0) 
++; CHECK-NEXT: vld $vr2, $a0, 0 ++; CHECK-NEXT: vshuf.h $vr2, $vr1, $vr0 ++; CHECK-NEXT: vori.b $vr0, $vr2, 0 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ++ ret <8 x i16> %c ++} ++ ++;; vshuf.w ++define <4 x i32> @shufflevector_v4i32(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: shufflevector_v4i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_0) ++; CHECK-NEXT: vld $vr2, $a0, 0 ++; CHECK-NEXT: vshuf.w $vr2, $vr1, $vr0 ++; CHECK-NEXT: vori.b $vr0, $vr2, 0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ++ ret <4 x i32> %c ++} ++ ++;; vshuf.d ++define <2 x i64> @shufflevector_v2i64(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: shufflevector_v2i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_0) ++; CHECK-NEXT: vld $vr2, $a0, 0 ++; CHECK-NEXT: vshuf.d $vr2, $vr1, $vr0 ++; CHECK-NEXT: vori.b $vr0, $vr2, 0 ++; CHECK-NEXT: ret ++ %c = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ++ ret <2 x i64> %c ++} ++ ++;; vshuf.w ++define <4 x float> @shufflevector_v4f32(<4 x float> %a, <4 x float> %b) { ++; CHECK-LABEL: shufflevector_v4f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI4_0) ++; CHECK-NEXT: vld $vr2, $a0, 0 ++; CHECK-NEXT: vshuf.w $vr2, $vr1, $vr0 ++; CHECK-NEXT: vori.b $vr0, $vr2, 0 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ++ ret <4 x float> %c ++} ++ ++;; vshuf.d ++define <2 x double> @shufflevector_v2f64(<2 x double> %a, <2 x double> %b) { ++; CHECK-LABEL: shufflevector_v2f64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI5_0) ++; CHECK-NEXT: vld $vr2, $a0, 0 ++; CHECK-NEXT: vshuf.d $vr2, $vr1, $vr0 ++; CHECK-NEXT: vori.b $vr0, $vr2, 0 ++; CHECK-NEXT: ret ++ %c = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> ++ ret <2 x double> %c ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll +new file mode 100644 +index 000000000000..660b9581c3d1 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll +@@ -0,0 +1,42 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s ++ ++;; vilvh.b ++define <16 x i8> @shufflevector_vshuf4i_v16i8(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: shufflevector_vshuf4i_v16i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 27 ++; CHECK-NEXT: ret ++ %c = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ++ ret <16 x i8> %c ++} ++ ++;; vilvh.h ++define <8 x i16> @shufflevector_vshuf4i_v8i4(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: shufflevector_vshuf4i_v8i4: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vshuf4i.h $vr0, $vr0, 27 ++; CHECK-NEXT: ret ++ %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ++ ret <8 x i16> %c ++} ++ ++;; vilvh.w ++define <4 x i32> @shufflevector_vshuf4i_v4i32(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: shufflevector_vshuf4i_v4i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 27 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ++ ret <4 x i32> %c ++} ++ ++;; vilvh.w ++define <4 x float> 
@shufflevector_vshuf4i_v4f32(<4 x float> %a, <4 x float> %b) { ++; CHECK-LABEL: shufflevector_vshuf4i_v4f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 27 ++; CHECK-NEXT: ret ++ %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ++ ret <4 x float> %c ++} +-- +2.20.1 + + +From cac0cc4649362e0b80f61e45aec54341f40f7f77 Mon Sep 17 00:00:00 2001 +From: Ami-zhang +Date: Wed, 17 Jan 2024 11:15:05 +0800 +Subject: [PATCH 08/12] [LoongArch] Add LoongArch V1.1 instructions definitions + and MC tests (#78238) + +LoongArch V1.1 instrucions include floating-point approximate reciprocal +instructions and atomic instrucions. And add testcases for these +instrucions meanwhile. + +(cherry picked from commit 84bdee2875da364be7eb2144b1ae530f6a05f0e2) +--- + .../LoongArch/LoongArchFloat32InstrInfo.td | 2 + + .../LoongArch/LoongArchFloat64InstrInfo.td | 2 + + .../Target/LoongArch/LoongArchInstrInfo.td | 34 ++++++- + .../LoongArch/LoongArchLASXInstrInfo.td | 4 + + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 4 + + llvm/test/MC/LoongArch/Basic/Float/d-arith.s | 8 ++ + llvm/test/MC/LoongArch/Basic/Float/f-arith.s | 8 ++ + llvm/test/MC/LoongArch/Basic/Integer/atomic.s | 92 +++++++++++++++++++ + llvm/test/MC/LoongArch/lasx/frecip.s | 8 ++ + llvm/test/MC/LoongArch/lasx/frsqrt.s | 8 ++ + llvm/test/MC/LoongArch/lsx/frecip.s | 8 ++ + llvm/test/MC/LoongArch/lsx/frsqrt.s | 8 ++ + 12 files changed, 184 insertions(+), 2 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +index 65120c083f49..f30837912e75 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +@@ -50,6 +50,8 @@ def FNEG_S : FP_ALU_2R<0x01141400>; + def FSQRT_S : FP_ALU_2R<0x01144400>; + def FRECIP_S : FP_ALU_2R<0x01145400>; + def FRSQRT_S : FP_ALU_2R<0x01146400>; ++def FRECIPE_S : FP_ALU_2R<0x01147400>; ++def FRSQRTE_S : FP_ALU_2R<0x01148400>; + def FSCALEB_S : FP_ALU_3R<0x01108000>; + def FLOGB_S : FP_ALU_2R<0x01142400>; + def FCOPYSIGN_S : FP_ALU_3R<0x01128000>; +diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +index 437c1e4d7be2..0ea4c564b045 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +@@ -34,6 +34,8 @@ def FNEG_D : FP_ALU_2R<0x01141800, FPR64>; + def FSQRT_D : FP_ALU_2R<0x01144800, FPR64>; + def FRECIP_D : FP_ALU_2R<0x01145800, FPR64>; + def FRSQRT_D : FP_ALU_2R<0x01146800, FPR64>; ++def FRECIPE_D : FP_ALU_2R<0x01147800, FPR64>; ++def FRSQRTE_D : FP_ALU_2R<0x01148800, FPR64>; + def FSCALEB_D : FP_ALU_3R<0x01110000, FPR64>; + def FLOGB_D : FP_ALU_2R<0x01142800, FPR64>; + def FCOPYSIGN_D : FP_ALU_3R<0x01130000, FPR64>; +diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +index ecd0c2b71b85..756c460f916b 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +@@ -634,15 +634,24 @@ class AM_3R op> + : Fmt3R; + +-let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in ++let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + class LLBase op> + : Fmt2RI14; ++class LLBase_ACQ op> ++ : Fmt2R; ++} + +-let hasSideEffects = 0, mayLoad = 0, mayStore = 1, Constraints = "$rd = $dst" in ++let hasSideEffects = 0, mayLoad = 0, mayStore = 1, Constraints = "$rd = $dst" in { + class SCBase op> + : 
Fmt2RI14; ++class SCBase_128 op> ++ : Fmt3R; ++class SCBase_REL op> ++ : Fmt2R; ++} + + let hasSideEffects = 1 in + class IOCSRRD op> +@@ -754,6 +763,8 @@ def PRELD : FmtPRELD<(outs), (ins uimm5:$imm5, GPR:$rj, simm12:$imm12), + // Atomic Memory Access Instructions + def LL_W : LLBase<0x20000000>; + def SC_W : SCBase<0x21000000>; ++def LLACQ_W : LLBase_ACQ<0x38578000>; ++def SCREL_W : SCBase_REL<0x38578400>; + + // Barrier Instructions + def DBAR : MISC_I15<0x38720000>; +@@ -875,8 +886,12 @@ def STLE_W : STORE_3R<0x387f0000>; + def STLE_D : STORE_3R<0x387f8000>; + + // Atomic Memory Access Instructions for 64-bits ++def AMSWAP_B : AM_3R<0x385c0000>; ++def AMSWAP_H : AM_3R<0x385c8000>; + def AMSWAP_W : AM_3R<0x38600000>; + def AMSWAP_D : AM_3R<0x38608000>; ++def AMADD_B : AM_3R<0x385d0000>; ++def AMADD_H : AM_3R<0x385d8000>; + def AMADD_W : AM_3R<0x38610000>; + def AMADD_D : AM_3R<0x38618000>; + def AMAND_W : AM_3R<0x38620000>; +@@ -893,8 +908,12 @@ def AMMAX_WU : AM_3R<0x38670000>; + def AMMAX_DU : AM_3R<0x38678000>; + def AMMIN_WU : AM_3R<0x38680000>; + def AMMIN_DU : AM_3R<0x38688000>; ++def AMSWAP__DB_B : AM_3R<0x385e0000>; ++def AMSWAP__DB_H : AM_3R<0x385e8000>; + def AMSWAP__DB_W : AM_3R<0x38690000>; + def AMSWAP__DB_D : AM_3R<0x38698000>; ++def AMADD__DB_B : AM_3R<0x385f0000>; ++def AMADD__DB_H : AM_3R<0x385f8000>; + def AMADD__DB_W : AM_3R<0x386a0000>; + def AMADD__DB_D : AM_3R<0x386a8000>; + def AMAND__DB_W : AM_3R<0x386b0000>; +@@ -911,8 +930,19 @@ def AMMAX__DB_WU : AM_3R<0x38700000>; + def AMMAX__DB_DU : AM_3R<0x38708000>; + def AMMIN__DB_WU : AM_3R<0x38710000>; + def AMMIN__DB_DU : AM_3R<0x38718000>; ++def AMCAS_B : AM_3R<0x38580000>; ++def AMCAS_H : AM_3R<0x38588000>; ++def AMCAS_W : AM_3R<0x38590000>; ++def AMCAS_D : AM_3R<0x38598000>; ++def AMCAS__DB_B : AM_3R<0x385a0000>; ++def AMCAS__DB_H : AM_3R<0x385a8000>; ++def AMCAS__DB_W : AM_3R<0x385b0000>; ++def AMCAS__DB_D : AM_3R<0x385b8000>; + def LL_D : LLBase<0x22000000>; + def SC_D : SCBase<0x23000000>; ++def SC_Q : SCBase_128<0x38570000>; ++def LLACQ_D : LLBase_ACQ<0x38578800>; ++def SCREL_D : SCBase_REL<0x38578C00>; + + // CRC Check Instructions + def CRC_W_B_W : ALU_3R<0x00240000>; +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index 5b6721cdf1b4..454915ac8c0a 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -773,6 +773,10 @@ def XVFRECIP_S : LASX2R_XX<0x769cf400>; + def XVFRECIP_D : LASX2R_XX<0x769cf800>; + def XVFRSQRT_S : LASX2R_XX<0x769d0400>; + def XVFRSQRT_D : LASX2R_XX<0x769d0800>; ++def XVFRECIPE_S : LASX2R_XX<0x769d1400>; ++def XVFRECIPE_D : LASX2R_XX<0x769d1800>; ++def XVFRSQRTE_S : LASX2R_XX<0x769d2400>; ++def XVFRSQRTE_D : LASX2R_XX<0x769d2800>; + + def XVFCVTL_S_H : LASX2R_XX<0x769de800>; + def XVFCVTH_S_H : LASX2R_XX<0x769dec00>; +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index 3519fa3142c3..6d60d7074ec3 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -918,6 +918,10 @@ def VFRECIP_S : LSX2R_VV<0x729cf400>; + def VFRECIP_D : LSX2R_VV<0x729cf800>; + def VFRSQRT_S : LSX2R_VV<0x729d0400>; + def VFRSQRT_D : LSX2R_VV<0x729d0800>; ++def VFRECIPE_S : LSX2R_VV<0x729d1400>; ++def VFRECIPE_D : LSX2R_VV<0x729d1800>; ++def VFRSQRTE_S : LSX2R_VV<0x729d2400>; ++def VFRSQRTE_D : LSX2R_VV<0x729d2800>; + + def VFCVTL_S_H : 
LSX2R_VV<0x729de800>; + def VFCVTH_S_H : LSX2R_VV<0x729dec00>; +diff --git a/llvm/test/MC/LoongArch/Basic/Float/d-arith.s b/llvm/test/MC/LoongArch/Basic/Float/d-arith.s +index 6b2c67e9a2cc..8e19d2e34f3c 100644 +--- a/llvm/test/MC/LoongArch/Basic/Float/d-arith.s ++++ b/llvm/test/MC/LoongArch/Basic/Float/d-arith.s +@@ -78,10 +78,18 @@ fsqrt.d $fa2, $ft3 + # ASM: encoding: [0x7b,0x5b,0x14,0x01] + frecip.d $fs3, $fs3 + ++# ASM-AND-OBJ: frecipe.d $fa0, $fa0 ++# ASM: encoding: [0x00,0x78,0x14,0x01] ++frecipe.d $fa0, $fa0 ++ + # ASM-AND-OBJ: frsqrt.d $ft14, $fa3 + # ASM: encoding: [0x76,0x68,0x14,0x01] + frsqrt.d $ft14, $fa3 + ++# ASM-AND-OBJ: frsqrte.d $fa1, $fa1 ++# ASM: encoding: [0x21,0x88,0x14,0x01] ++frsqrte.d $fa1, $fa1 ++ + # ASM-AND-OBJ: fscaleb.d $ft4, $ft6, $fs2 + # ASM: encoding: [0xcc,0x69,0x11,0x01] + fscaleb.d $ft4, $ft6, $fs2 +diff --git a/llvm/test/MC/LoongArch/Basic/Float/f-arith.s b/llvm/test/MC/LoongArch/Basic/Float/f-arith.s +index 155e783cf435..c32151adbf3b 100644 +--- a/llvm/test/MC/LoongArch/Basic/Float/f-arith.s ++++ b/llvm/test/MC/LoongArch/Basic/Float/f-arith.s +@@ -73,10 +73,18 @@ fsqrt.s $fs3, $ft10 + # ASM: encoding: [0x71,0x57,0x14,0x01] + frecip.s $ft9, $fs3 + ++# ASM-AND-OBJ: frecipe.s $fa0, $fa0 ++# ASM: encoding: [0x00,0x74,0x14,0x01] ++frecipe.s $fa0, $fa0 ++ + # ASM-AND-OBJ: frsqrt.s $fs1, $ft4 + # ASM: encoding: [0x99,0x65,0x14,0x01] + frsqrt.s $fs1, $ft4 + ++# ASM-AND-OBJ: frsqrte.s $fa1, $fa1 ++# ASM: encoding: [0x21,0x84,0x14,0x01] ++frsqrte.s $fa1, $fa1 ++ + # ASM-AND-OBJ: fscaleb.s $ft13, $ft15, $fa6 + # ASM: encoding: [0xf5,0x9a,0x10,0x01] + fscaleb.s $ft13, $ft15, $fa6 +diff --git a/llvm/test/MC/LoongArch/Basic/Integer/atomic.s b/llvm/test/MC/LoongArch/Basic/Integer/atomic.s +index a35211db8851..69acdeef935c 100644 +--- a/llvm/test/MC/LoongArch/Basic/Integer/atomic.s ++++ b/llvm/test/MC/LoongArch/Basic/Integer/atomic.s +@@ -21,6 +21,14 @@ ll.w $tp, $s4, 220 + # CHECK-ASM: encoding: [0xd3,0x39,0x00,0x21] + sc.w $t7, $t2, 56 + ++# CHECK-ASM-AND-OBJ: llacq.w $t1, $t2 ++# CHECK-ASM: encoding: [0xcd,0x81,0x57,0x38] ++llacq.w $t1, $t2 ++ ++# CHECK-ASM-AND-OBJ: screl.w $t1, $t2 ++# CHECK-ASM: encoding: [0xcd,0x85,0x57,0x38] ++screl.w $t1, $t2 ++ + + + ############################################################# +@@ -29,6 +37,14 @@ sc.w $t7, $t2, 56 + + .ifdef LA64 + ++# CHECK64-ASM-AND-OBJ: amswap.b $a2, $t0, $s1 ++# CHECK64-ASM: encoding: [0x06,0x33,0x5c,0x38] ++amswap.b $a2, $t0, $s1, 0 ++ ++# CHECK64-ASM-AND-OBJ: amswap.h $a2, $t0, $s1 ++# CHECK64-ASM: encoding: [0x06,0xb3,0x5c,0x38] ++amswap.h $a2, $t0, $s1, 0 ++ + # CHECK64-ASM-AND-OBJ: amswap.w $a2, $t0, $s1 + # CHECK64-ASM: encoding: [0x06,0x33,0x60,0x38] + amswap.w $a2, $t0, $s1, 0 +@@ -41,6 +57,14 @@ amswap.w $zero, $t0, $zero + # CHECK64-ASM: encoding: [0xa0,0x00,0x6a,0x38] + amadd_db.w $zero, $zero, $a1 + ++# CHECK64-ASM-AND-OBJ: amswap.b $a2, $t0, $s1 ++# CHECK64-ASM: encoding: [0x06,0x33,0x5c,0x38] ++amswap.b $a2, $t0, $s1 ++ ++# CHECK64-ASM-AND-OBJ: amswap.h $a2, $t0, $s1 ++# CHECK64-ASM: encoding: [0x06,0xb3,0x5c,0x38] ++amswap.h $a2, $t0, $s1 ++ + # CHECK64-ASM-AND-OBJ: amswap.w $a2, $t0, $s1 + # CHECK64-ASM: encoding: [0x06,0x33,0x60,0x38] + amswap.w $a2, $t0, $s1 +@@ -49,6 +73,14 @@ amswap.w $a2, $t0, $s1 + # CHECK64-ASM: encoding: [0xc2,0xba,0x60,0x38] + amswap.d $tp, $t2, $fp + ++# CHECK64-ASM-AND-OBJ: amadd.b $a4, $t0, $r21 ++# CHECK64-ASM: encoding: [0xa8,0x32,0x5d,0x38] ++amadd.b $a4, $t0, $r21 ++ ++# CHECK64-ASM-AND-OBJ: amadd.h $a1, $t5, $s6 ++# CHECK64-ASM: encoding: 
[0xa5,0xc7,0x5d,0x38] ++amadd.h $a1, $t5, $s6 ++ + # CHECK64-ASM-AND-OBJ: amadd.w $a4, $t0, $r21 + # CHECK64-ASM: encoding: [0xa8,0x32,0x61,0x38] + amadd.w $a4, $t0, $r21 +@@ -113,6 +145,14 @@ ammin.wu $a4, $t6, $s7 + # CHECK64-ASM: encoding: [0x27,0xc3,0x68,0x38] + ammin.du $a3, $t4, $s2 + ++# CHECK64-ASM-AND-OBJ: amswap_db.b $a2, $t0, $s1 ++# CHECK64-ASM: encoding: [0x06,0x33,0x5e,0x38] ++amswap_db.b $a2, $t0, $s1 ++ ++# CHECK64-ASM-AND-OBJ: amswap_db.h $tp, $t2, $fp ++# CHECK64-ASM: encoding: [0xc2,0xba,0x5e,0x38] ++amswap_db.h $tp, $t2, $fp ++ + # CHECK64-ASM-AND-OBJ: amswap_db.w $a2, $t0, $s1 + # CHECK64-ASM: encoding: [0x06,0x33,0x69,0x38] + amswap_db.w $a2, $t0, $s1 +@@ -121,6 +161,14 @@ amswap_db.w $a2, $t0, $s1 + # CHECK64-ASM: encoding: [0xc2,0xba,0x69,0x38] + amswap_db.d $tp, $t2, $fp + ++# CHECK64-ASM-AND-OBJ: amadd_db.b $zero, $zero, $a1 ++# CHECK64-ASM: encoding: [0xa0,0x00,0x5f,0x38] ++amadd_db.b $zero, $zero, $a1 ++ ++# CHECK64-ASM-AND-OBJ: amadd_db.h $a4, $t0, $r21 ++# CHECK64-ASM: encoding: [0xa8,0xb2,0x5f,0x38] ++amadd_db.h $a4, $t0, $r21 ++ + # CHECK64-ASM-AND-OBJ: amadd_db.w $a4, $t0, $r21 + # CHECK64-ASM: encoding: [0xa8,0x32,0x6a,0x38] + amadd_db.w $a4, $t0, $r21 +@@ -185,6 +233,38 @@ ammin_db.wu $a4, $t6, $s7 + # CHECK64-ASM: encoding: [0x27,0xc3,0x71,0x38] + ammin_db.du $a3, $t4, $s2 + ++# CHECK64-ASM-AND-OBJ: amcas.b $t1, $t2, $t3 ++# CHECK64-ASM: encoding: [0xed,0x39,0x58,0x38] ++amcas.b $t1, $t2, $t3 ++ ++# CHECK64-ASM-AND-OBJ: amcas.h $t1, $t2, $t3 ++# CHECK64-ASM: encoding: [0xed,0xb9,0x58,0x38] ++amcas.h $t1, $t2, $t3 ++ ++# CHECK64-ASM-AND-OBJ: amcas.w $t1, $t2, $t3 ++# CHECK64-ASM: encoding: [0xed,0x39,0x59,0x38] ++amcas.w $t1, $t2, $t3 ++ ++# CHECK64-ASM-AND-OBJ: amcas.d $t1, $t2, $t3 ++# CHECK64-ASM: encoding: [0xed,0xb9,0x59,0x38] ++amcas.d $t1, $t2, $t3 ++ ++# CHECK64-ASM-AND-OBJ: amcas_db.b $t1, $t2, $t3 ++# CHECK64-ASM: encoding: [0xed,0x39,0x5a,0x38] ++amcas_db.b $t1, $t2, $t3 ++ ++# CHECK64-ASM-AND-OBJ: amcas_db.h $t1, $t2, $t3 ++# CHECK64-ASM: encoding: [0xed,0xb9,0x5a,0x38] ++amcas_db.h $t1, $t2, $t3 ++ ++# CHECK64-ASM-AND-OBJ: amcas_db.w $t1, $t2, $t3 ++# CHECK64-ASM: encoding: [0xed,0x39,0x5b,0x38] ++amcas_db.w $t1, $t2, $t3 ++ ++# CHECK64-ASM-AND-OBJ: amcas_db.d $t1, $t2, $t3 ++# CHECK64-ASM: encoding: [0xed,0xb9,0x5b,0x38] ++amcas_db.d $t1, $t2, $t3 ++ + # CHECK64-ASM-AND-OBJ: ll.d $s2, $s4, 16 + # CHECK64-ASM: encoding: [0x79,0x13,0x00,0x22] + ll.d $s2, $s4, 16 +@@ -193,5 +273,17 @@ ll.d $s2, $s4, 16 + # CHECK64-ASM: encoding: [0x31,0xf6,0x00,0x23] + sc.d $t5, $t5, 244 + ++# CHECK64-ASM-AND-OBJ: sc.q $t7, $t2, $t5 ++# CHECK64-ASM: encoding: [0x33,0x3a,0x57,0x38] ++sc.q $t7, $t2, $t5 ++ ++# CHECK64-ASM-AND-OBJ: llacq.d $t1, $t2 ++# CHECK64-ASM: encoding: [0xcd,0x89,0x57,0x38] ++llacq.d $t1, $t2 ++ ++# CHECK64-ASM-AND-OBJ: screl.d $t1, $t2 ++# CHECK64-ASM: encoding: [0xcd,0x8d,0x57,0x38] ++screl.d $t1, $t2 ++ + .endif + +diff --git a/llvm/test/MC/LoongArch/lasx/frecip.s b/llvm/test/MC/LoongArch/lasx/frecip.s +index 1bb3ce02fb9c..e95b03a96eba 100644 +--- a/llvm/test/MC/LoongArch/lasx/frecip.s ++++ b/llvm/test/MC/LoongArch/lasx/frecip.s +@@ -10,3 +10,11 @@ xvfrecip.s $xr3, $xr16 + xvfrecip.d $xr17, $xr24 + # CHECK-INST: xvfrecip.d $xr17, $xr24 + # CHECK-ENCODING: encoding: [0x11,0xfb,0x9c,0x76] ++ ++xvfrecipe.s $xr3, $xr16 ++# CHECK-INST: xvfrecipe.s $xr3, $xr16 ++# CHECK-ENCODING: encoding: [0x03,0x16,0x9d,0x76] ++ ++xvfrecipe.d $xr17, $xr24 ++# CHECK-INST: xvfrecipe.d $xr17, $xr24 ++# CHECK-ENCODING: encoding: [0x11,0x1b,0x9d,0x76] +diff --git 
a/llvm/test/MC/LoongArch/lasx/frsqrt.s b/llvm/test/MC/LoongArch/lasx/frsqrt.s +index af96e10832df..d1048f9ff8f0 100644 +--- a/llvm/test/MC/LoongArch/lasx/frsqrt.s ++++ b/llvm/test/MC/LoongArch/lasx/frsqrt.s +@@ -10,3 +10,11 @@ xvfrsqrt.s $xr31, $xr25 + xvfrsqrt.d $xr14, $xr22 + # CHECK-INST: xvfrsqrt.d $xr14, $xr22 + # CHECK-ENCODING: encoding: [0xce,0x0a,0x9d,0x76] ++ ++xvfrsqrte.s $xr31, $xr25 ++# CHECK-INST: xvfrsqrte.s $xr31, $xr25 ++# CHECK-ENCODING: encoding: [0x3f,0x27,0x9d,0x76] ++ ++xvfrsqrte.d $xr14, $xr22 ++# CHECK-INST: xvfrsqrte.d $xr14, $xr22 ++# CHECK-ENCODING: encoding: [0xce,0x2a,0x9d,0x76] +diff --git a/llvm/test/MC/LoongArch/lsx/frecip.s b/llvm/test/MC/LoongArch/lsx/frecip.s +index d8c8278d1667..cd6d925e1470 100644 +--- a/llvm/test/MC/LoongArch/lsx/frecip.s ++++ b/llvm/test/MC/LoongArch/lsx/frecip.s +@@ -10,3 +10,11 @@ vfrecip.s $vr29, $vr14 + vfrecip.d $vr24, $vr9 + # CHECK-INST: vfrecip.d $vr24, $vr9 + # CHECK-ENCODING: encoding: [0x38,0xf9,0x9c,0x72] ++ ++vfrecipe.s $vr29, $vr14 ++# CHECK-INST: vfrecipe.s $vr29, $vr14 ++# CHECK-ENCODING: encoding: [0xdd,0x15,0x9d,0x72] ++ ++vfrecipe.d $vr24, $vr9 ++# CHECK-INST: vfrecipe.d $vr24, $vr9 ++# CHECK-ENCODING: encoding: [0x38,0x19,0x9d,0x72] +diff --git a/llvm/test/MC/LoongArch/lsx/frsqrt.s b/llvm/test/MC/LoongArch/lsx/frsqrt.s +index 68b0cc091b8a..d8b9fc3d0684 100644 +--- a/llvm/test/MC/LoongArch/lsx/frsqrt.s ++++ b/llvm/test/MC/LoongArch/lsx/frsqrt.s +@@ -10,3 +10,11 @@ vfrsqrt.s $vr19, $vr30 + vfrsqrt.d $vr1, $vr0 + # CHECK-INST: vfrsqrt.d $vr1, $vr0 + # CHECK-ENCODING: encoding: [0x01,0x08,0x9d,0x72] ++ ++vfrsqrte.s $vr19, $vr30 ++# CHECK-INST: vfrsqrte.s $vr19, $vr30 ++# CHECK-ENCODING: encoding: [0xd3,0x27,0x9d,0x72] ++ ++vfrsqrte.d $vr1, $vr0 ++# CHECK-INST: vfrsqrte.d $vr1, $vr0 ++# CHECK-ENCODING: encoding: [0x01,0x28,0x9d,0x72] +-- +2.20.1 + + +From 57eaecf7bdb7a7502580076b365b4f70dde1185d Mon Sep 17 00:00:00 2001 +From: Ami-zhang +Date: Tue, 23 Jan 2024 14:24:58 +0800 +Subject: [PATCH 09/12] [LoongArch] Add definitions and feature 'frecipe' for + FP approximation intrinsics/builtins (#78962) + +This PR adds definitions and 'frecipe' feature for FP approximation +intrinsics/builtins. In additions, this adds and complements relative +testcases. 
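+
+For illustration only (not part of this patch), a minimal IR-level use of the new
+scalar intrinsics, mirroring the intrinsic-frecipe-flt.ll test added below, could
+look like this; the function name is arbitrary and the 'frecipe' feature is assumed
+to be enabled via --mattr=+f,+frecipe:
+
+  ; Hypothetical standalone example; compile with:
+  ;   llc --mtriple=loongarch64 --mattr=+f,+frecipe example.ll
+  declare float @llvm.loongarch.frecipe.s(float)
+
+  define float @approx_recip(float %a) {
+  entry:
+    ; Expected to select to: frecipe.s $fa0, $fa0
+    %r = call float @llvm.loongarch.frecipe.s(float %a)
+    ret float %r
+  }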
+ +(cherry picked from commit fcb8342a219ada8ec641790a4c8a9f969d7d64ee) +--- + llvm/include/llvm/IR/IntrinsicsLoongArch.td | 13 ++++++++++ + llvm/lib/Target/LoongArch/LoongArch.td | 7 +++++ + .../LoongArch/LoongArchFloat32InstrInfo.td | 6 +++++ + .../LoongArch/LoongArchFloat64InstrInfo.td | 6 +++++ + .../LoongArch/LoongArchLASXInstrInfo.td | 10 +++++++ + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 10 +++++++ + .../lib/Target/LoongArch/LoongArchSubtarget.h | 2 ++ + .../LoongArch/intrinsic-frecipe-dbl.ll | 26 +++++++++++++++++++ + .../LoongArch/intrinsic-frecipe-flt.ll | 26 +++++++++++++++++++ + .../LoongArch/lasx/intrinsic-frecipe.ll | 26 +++++++++++++++++++ + .../LoongArch/lasx/intrinsic-frsqrte.ll | 26 +++++++++++++++++++ + .../LoongArch/lsx/intrinsic-frecipe.ll | 26 +++++++++++++++++++ + .../LoongArch/lsx/intrinsic-frsqrte.ll | 26 +++++++++++++++++++ + 13 files changed, 210 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll + create mode 100644 llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll + +diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td +index 685deaec7709..9002076e7aec 100644 +--- a/llvm/include/llvm/IR/IntrinsicsLoongArch.td ++++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td +@@ -122,6 +122,15 @@ def int_loongarch_lddir_d : BaseInt<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], + [ImmArg>]>; + def int_loongarch_ldpte_d : BaseInt<[], [llvm_i64_ty, llvm_i64_ty], + [ImmArg>]>; ++ ++def int_loongarch_frecipe_s : BaseInt<[llvm_float_ty], [llvm_float_ty], ++ [IntrNoMem]>; ++def int_loongarch_frecipe_d : BaseInt<[llvm_double_ty], [llvm_double_ty], ++ [IntrNoMem]>; ++def int_loongarch_frsqrte_s : BaseInt<[llvm_float_ty], [llvm_float_ty], ++ [IntrNoMem]>; ++def int_loongarch_frsqrte_d : BaseInt<[llvm_double_ty], [llvm_double_ty], ++ [IntrNoMem]>; + } // TargetPrefix = "loongarch" + + /// Vector intrinsic +@@ -527,10 +536,12 @@ foreach inst = ["vfmadd_d", "vfmsub_d", "vfnmadd_d", "vfnmsub_d"] in + [IntrNoMem]>; + + foreach inst = ["vflogb_s", "vfsqrt_s", "vfrecip_s", "vfrsqrt_s", "vfrint_s", ++ "vfrecipe_s", "vfrsqrte_s", + "vfrintrne_s", "vfrintrz_s", "vfrintrp_s", "vfrintrm_s"] in + def int_loongarch_lsx_#inst : VecInt<[llvm_v4f32_ty], [llvm_v4f32_ty], + [IntrNoMem]>; + foreach inst = ["vflogb_d", "vfsqrt_d", "vfrecip_d", "vfrsqrt_d", "vfrint_d", ++ "vfrecipe_d", "vfrsqrte_d", + "vfrintrne_d", "vfrintrz_d", "vfrintrp_d", "vfrintrm_d"] in + def int_loongarch_lsx_#inst : VecInt<[llvm_v2f64_ty], [llvm_v2f64_ty], + [IntrNoMem]>; +@@ -1044,10 +1055,12 @@ foreach inst = ["xvfmadd_d", "xvfmsub_d", "xvfnmadd_d", "xvfnmsub_d"] in + [IntrNoMem]>; + + foreach inst = ["xvflogb_s", "xvfsqrt_s", "xvfrecip_s", "xvfrsqrt_s", "xvfrint_s", ++ "xvfrecipe_s", "xvfrsqrte_s", + "xvfrintrne_s", "xvfrintrz_s", "xvfrintrp_s", "xvfrintrm_s"] in + def int_loongarch_lasx_#inst : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty], + [IntrNoMem]>; + foreach inst = ["xvflogb_d", "xvfsqrt_d", "xvfrecip_d", "xvfrsqrt_d", "xvfrint_d", ++ "xvfrecipe_d", "xvfrsqrte_d", + "xvfrintrne_d", "xvfrintrz_d", "xvfrintrp_d", "xvfrintrm_d"] in + def int_loongarch_lasx_#inst : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty], + [IntrNoMem]>; +diff --git 
a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td +index 2a4c991a43b0..5573e5415d26 100644 +--- a/llvm/lib/Target/LoongArch/LoongArch.td ++++ b/llvm/lib/Target/LoongArch/LoongArch.td +@@ -110,6 +110,13 @@ def FeatureAutoVec + : SubtargetFeature<"auto-vec", "HasExpAutoVec", "true", + "Experimental auto vectorization">; + ++// Floating point approximation operation ++def FeatureFrecipe ++ : SubtargetFeature<"frecipe", "HasFrecipe", "true", ++ "Support frecipe.{s/d} and frsqrte.{s/d} instructions.">; ++def HasFrecipe : Predicate<"Subtarget->hasFrecipe()">; ++ ++ + //===----------------------------------------------------------------------===// + // Registers, instruction descriptions ... + //===----------------------------------------------------------------------===// +diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +index f30837912e75..e27896768818 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +@@ -281,6 +281,12 @@ def : Pat<(loongarch_ftint FPR32:$src), (FTINTRZ_W_S FPR32:$src)>; + // FP reciprocal operation + def : Pat<(fdiv fpimm1, FPR32:$src), (FRECIP_S $src)>; + ++let Predicates = [HasFrecipe] in { ++// FP approximate reciprocal operation ++def : Pat<(int_loongarch_frecipe_s FPR32:$src), (FRECIPE_S FPR32:$src)>; ++def : Pat<(int_loongarch_frsqrte_s FPR32:$src), (FRSQRTE_S FPR32:$src)>; ++} ++ + // fmadd.s: fj * fk + fa + def : Pat<(fma FPR32:$fj, FPR32:$fk, FPR32:$fa), (FMADD_S $fj, $fk, $fa)>; + +diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +index 0ea4c564b045..26bed67ac222 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +@@ -242,6 +242,12 @@ def : Pat<(f64 (fpextend FPR32:$src)), (FCVT_D_S FPR32:$src)>; + // FP reciprocal operation + def : Pat<(fdiv fpimm1, FPR64:$src), (FRECIP_D $src)>; + ++let Predicates = [HasFrecipe] in { ++// FP approximate reciprocal operation ++def : Pat<(int_loongarch_frecipe_d FPR64:$src), (FRECIPE_D FPR64:$src)>; ++def : Pat<(int_loongarch_frsqrte_d FPR64:$src), (FRSQRTE_D FPR64:$src)>; ++} ++ + // fmadd.d: fj * fk + fa + def : Pat<(fma FPR64:$fj, FPR64:$fk, FPR64:$fa), (FMADD_D $fj, $fk, $fa)>; + +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index 454915ac8c0a..6f1969bf8cae 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -2080,6 +2080,16 @@ foreach Inst = ["XVFLOGB_D", "XVFCLASS_D", "XVFSQRT_D", "XVFRECIP_D", "XVFRSQRT_ + def : Pat<(deriveLASXIntrinsic.ret (v4f64 LASX256:$xj)), + (!cast(Inst) LASX256:$xj)>; + ++// 256-Bit vector FP approximate reciprocal operation ++let Predicates = [HasFrecipe] in { ++foreach Inst = ["XVFRECIPE_S", "XVFRSQRTE_S"] in ++ def : Pat<(deriveLASXIntrinsic.ret (v8f32 LASX256:$xj)), ++ (!cast(Inst) LASX256:$xj)>; ++foreach Inst = ["XVFRECIPE_D", "XVFRSQRTE_D"] in ++ def : Pat<(deriveLASXIntrinsic.ret (v4f64 LASX256:$xj)), ++ (!cast(Inst) LASX256:$xj)>; ++} ++ + def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm), + (XVPICKVE_W v8f32:$xj, (to_valid_timm timm:$imm))>; + def : Pat<(int_loongarch_lasx_xvpickve_d_f v4f64:$xj, timm:$imm), +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td 
b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index 6d60d7074ec3..0580683c3ce3 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -2195,6 +2195,16 @@ foreach Inst = ["VFLOGB_D", "VFCLASS_D", "VFSQRT_D", "VFRECIP_D", "VFRSQRT_D", + def : Pat<(deriveLSXIntrinsic.ret (v2f64 LSX128:$vj)), + (!cast(Inst) LSX128:$vj)>; + ++// 128-Bit vector FP approximate reciprocal operation ++let Predicates = [HasFrecipe] in { ++foreach Inst = ["VFRECIPE_S", "VFRSQRTE_S"] in ++ def : Pat<(deriveLSXIntrinsic.ret (v4f32 LSX128:$vj)), ++ (!cast(Inst) LSX128:$vj)>; ++foreach Inst = ["VFRECIPE_D", "VFRSQRTE_D"] in ++ def : Pat<(deriveLSXIntrinsic.ret (v2f64 LSX128:$vj)), ++ (!cast(Inst) LSX128:$vj)>; ++} ++ + // load + def : Pat<(int_loongarch_lsx_vld GPR:$rj, timm:$imm), + (VLD GPR:$rj, (to_valid_timm timm:$imm))>; +diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h +index 174e4cba8326..11c0b39e176e 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h ++++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h +@@ -45,6 +45,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { + bool HasUAL = false; + bool HasLinkerRelax = false; + bool HasExpAutoVec = false; ++ bool HasFrecipe = false; + unsigned GRLen = 32; + MVT GRLenVT = MVT::i32; + LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown; +@@ -104,6 +105,7 @@ public: + bool hasUAL() const { return HasUAL; } + bool hasLinkerRelax() const { return HasLinkerRelax; } + bool hasExpAutoVec() const { return HasExpAutoVec; } ++ bool hasFrecipe() const { return HasFrecipe; } + MVT getGRLenVT() const { return GRLenVT; } + unsigned getGRLen() const { return GRLen; } + LoongArchABI::ABI getTargetABI() const { return TargetABI; } +diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll +new file mode 100644 +index 000000000000..9f572500caa0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll +@@ -0,0 +1,26 @@ ++; RUN: llc --mtriple=loongarch32 --mattr=+d,+frecipe < %s | FileCheck %s ++; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s ++ ++declare double @llvm.loongarch.frecipe.d(double) ++ ++define double @frecipe_d(double %a) { ++; CHECK-LABEL: frecipe_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: frecipe.d $fa0, $fa0 ++; CHECK-NEXT: ret ++entry: ++ %res = call double @llvm.loongarch.frecipe.d(double %a) ++ ret double %res ++} ++ ++declare double @llvm.loongarch.frsqrte.d(double) ++ ++define double @frsqrte_d(double %a) { ++; CHECK-LABEL: frsqrte_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: frsqrte.d $fa0, $fa0 ++; CHECK-NEXT: ret ++entry: ++ %res = call double @llvm.loongarch.frsqrte.d(double %a) ++ ret double %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll +new file mode 100644 +index 000000000000..0b2029f2e44a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll +@@ -0,0 +1,26 @@ ++; RUN: llc --mtriple=loongarch32 --mattr=+f,+frecipe < %s | FileCheck %s ++; RUN: llc --mtriple=loongarch64 --mattr=+f,+frecipe < %s | FileCheck %s ++ ++declare float @llvm.loongarch.frecipe.s(float) ++ ++define float @frecipe_s(float %a) { ++; CHECK-LABEL: frecipe_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: frecipe.s $fa0, $fa0 ++; CHECK-NEXT: ret ++entry: ++ %res = call float @llvm.loongarch.frecipe.s(float %a) ++ 
ret float %res ++} ++ ++declare float @llvm.loongarch.frsqrte.s(float) ++ ++define float @frsqrte_s(float %a) { ++; CHECK-LABEL: frsqrte_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: frsqrte.s $fa0, $fa0 ++; CHECK-NEXT: ret ++entry: ++ %res = call float @llvm.loongarch.frsqrte.s(float %a) ++ ret float %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll +new file mode 100644 +index 000000000000..215436823af8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float>) ++ ++define <8 x float> @lasx_xvfrecipe_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrecipe_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrecipe.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float> %va) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double>) ++ ++define <4 x double> @lasx_xvfrecipe_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrecipe_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrecipe.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double> %va) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll +new file mode 100644 +index 000000000000..ad36c3aa5c29 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float>) ++ ++define <8 x float> @lasx_xvfrsqrte_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrsqrte_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrsqrte.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float> %va) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double>) ++ ++define <4 x double> @lasx_xvfrsqrte_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrsqrte_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrsqrte.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double> %va) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll +new file mode 100644 +index 000000000000..1b7a97d9f972 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float>) ++ ++define <4 x float> @lsx_vfrecipe_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vfrecipe_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrecipe.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float> %va) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> 
@llvm.loongarch.lsx.vfrecipe.d(<2 x double>) ++ ++define <2 x double> @lsx_vfrecipe_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vfrecipe_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrecipe.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfrecipe.d(<2 x double> %va) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll +new file mode 100644 +index 000000000000..3cd6c78e87d7 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float>) ++ ++define <4 x float> @lsx_vfrsqrte_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vfrsqrte_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrsqrte.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float> %va) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfrsqrte.d(<2 x double>) ++ ++define <2 x double> @lsx_vfrsqrte_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vfrsqrte_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrsqrte.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfrsqrte.d(<2 x double> %va) ++ ret <2 x double> %res ++} +-- +2.20.1 + + +From 6f9531b069971dc0f5c6b28bd6a6754c1b5fde72 Mon Sep 17 00:00:00 2001 +From: Ami-zhang +Date: Tue, 23 Jul 2024 14:03:28 +0800 +Subject: [PATCH 10/12] [LoongArch] Support -march=la64v1.0 and -march=la64v1.1 + (#100057) + +The newly added strings `la64v1.0` and `la64v1.1` in `-march` are as +described in LoongArch toolchains conventions (see [1]). + +The target-cpu/feature attributes are forwarded to compiler when +specifying particular `-march` parameter. The default cpu `loongarch64` +is returned when archname is `la64v1.0` or `la64v1.1`. + +In addition, this commit adds `la64v1.0`/`la64v1.1` to +"__loongarch_arch" and adds definition for macro "__loongarch_frecipe". + +[1]: https://github.com/loongson/la-toolchain-conventions + +(cherry picked from commit 5a1b9896ad5a7dcd25a1cc7a4d3fd44155e4b22d) +--- + llvm/lib/TargetParser/LoongArchTargetParser.cpp | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/llvm/lib/TargetParser/LoongArchTargetParser.cpp b/llvm/lib/TargetParser/LoongArchTargetParser.cpp +index 772d24c5ce3d..8e86d18de2ad 100644 +--- a/llvm/lib/TargetParser/LoongArchTargetParser.cpp ++++ b/llvm/lib/TargetParser/LoongArchTargetParser.cpp +@@ -44,6 +44,17 @@ bool LoongArch::getArchFeatures(StringRef Arch, + return true; + } + } ++ ++ if (Arch == "la64v1.0" || Arch == "la64v1.1") { ++ Features.push_back("+64bit"); ++ Features.push_back("+d"); ++ Features.push_back("+lsx"); ++ Features.push_back("+ual"); ++ if (Arch == "la64v1.1") ++ Features.push_back("+frecipe"); ++ return true; ++ } ++ + return false; + } + +-- +2.20.1 + + +From 6094875aa6aab1e28a096294783cada0243e95d5 Mon Sep 17 00:00:00 2001 +From: Ami-zhang +Date: Tue, 23 Jul 2024 15:14:20 +0800 +Subject: [PATCH 11/12] [LoongArch] Support la664 (#100068) + +A new ProcessorModel called `la664` is defined in LoongArch.td to +support `-march/-mtune=la664`. 
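+
+As a quick usage sketch (illustration only, not part of this patch), the new
+processor model can be requested on the llc command line or through a per-function
+"tune-cpu" attribute, as the cpus.ll test below does; the function name here is
+arbitrary:
+
+  ; Hypothetical example; compile with:
+  ;   llc --mtriple=loongarch64 --mcpu=la664 example.ll
+  define void @tuned_for_la664() "tune-cpu"="la664" {
+    ret void
+  }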
+ +(cherry picked from commit fcec298087dba0c83f6d0bbafd6cd934c42cbf82) +--- + llvm/include/llvm/TargetParser/LoongArchTargetParser.def | 2 ++ + llvm/include/llvm/TargetParser/LoongArchTargetParser.h | 3 +++ + llvm/lib/Target/LoongArch/LoongArch.td | 7 +++++++ + llvm/lib/TargetParser/Host.cpp | 2 ++ + llvm/test/CodeGen/LoongArch/cpus.ll | 5 +++++ + 5 files changed, 19 insertions(+) + +diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.def b/llvm/include/llvm/TargetParser/LoongArchTargetParser.def +index b20d124953f8..101a48cbd539 100644 +--- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.def ++++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.def +@@ -10,6 +10,7 @@ LOONGARCH_FEATURE("+lasx", FK_LASX) + LOONGARCH_FEATURE("+lbt", FK_LBT) + LOONGARCH_FEATURE("+lvz", FK_LVZ) + LOONGARCH_FEATURE("+ual", FK_UAL) ++LOONGARCH_FEATURE("+frecipe", FK_FRECIPE) + + #undef LOONGARCH_FEATURE + +@@ -19,5 +20,6 @@ LOONGARCH_FEATURE("+ual", FK_UAL) + + LOONGARCH_ARCH("loongarch64", AK_LOONGARCH64, FK_64BIT | FK_FP32 | FK_FP64 | FK_UAL) + LOONGARCH_ARCH("la464", AK_LA464, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL) ++LOONGARCH_ARCH("la664", AK_LA664, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL | FK_FRECIPE) + + #undef LOONGARCH_ARCH +diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h +index 028844187584..c0bb15a5163b 100644 +--- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h ++++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h +@@ -46,6 +46,9 @@ enum FeatureKind : uint32_t { + + // Allow memory accesses to be unaligned. + FK_UAL = 1 << 8, ++ ++ // Floating-point approximate reciprocal instructions are available. ++ FK_FRECIPE = 1 << 9, + }; + + struct FeatureInfo { +diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td +index 5573e5415d26..b5cd5bb0f8a4 100644 +--- a/llvm/lib/Target/LoongArch/LoongArch.td ++++ b/llvm/lib/Target/LoongArch/LoongArch.td +@@ -147,6 +147,13 @@ def : ProcessorModel<"la464", NoSchedModel, [Feature64Bit, + FeatureExtLVZ, + FeatureExtLBT]>; + ++def : ProcessorModel<"la664", NoSchedModel, [Feature64Bit, ++ FeatureUAL, ++ FeatureExtLASX, ++ FeatureExtLVZ, ++ FeatureExtLBT, ++ FeatureFrecipe]>; ++ + //===----------------------------------------------------------------------===// + // Define the LoongArch target. + //===----------------------------------------------------------------------===// +diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp +index 8b23be02edc0..87e3e0b434d5 100644 +--- a/llvm/lib/TargetParser/Host.cpp ++++ b/llvm/lib/TargetParser/Host.cpp +@@ -1469,6 +1469,8 @@ StringRef sys::getHostCPUName() { + switch (processor_id & 0xf000) { + case 0xc000: // Loongson 64bit, 4-issue + return "la464"; ++ case 0xd000: // Loongson 64bit, 6-issue ++ return "la664"; + // TODO: Others. 
+ default: + break; +diff --git a/llvm/test/CodeGen/LoongArch/cpus.ll b/llvm/test/CodeGen/LoongArch/cpus.ll +index 35945ae4de71..087cf887b813 100644 +--- a/llvm/test/CodeGen/LoongArch/cpus.ll ++++ b/llvm/test/CodeGen/LoongArch/cpus.ll +@@ -3,6 +3,7 @@ + + ; RUN: llc < %s --mtriple=loongarch64 --mcpu=loongarch64 2>&1 | FileCheck %s + ; RUN: llc < %s --mtriple=loongarch64 --mcpu=la464 2>&1 | FileCheck %s ++; RUN: llc < %s --mtriple=loongarch64 --mcpu=la664 2>&1 | FileCheck %s + ; RUN: llc < %s --mtriple=loongarch64 2>&1 | FileCheck %s + + ; CHECK-NOT: {{.*}} is not a recognized processor for this target +@@ -18,3 +19,7 @@ define void @tune_cpu_loongarch64() "tune-cpu"="loongarch64" { + define void @tune_cpu_la464() "tune-cpu"="la464" { + ret void + } ++ ++define void @tune_cpu_la664() "tune-cpu"="la664" { ++ ret void ++} +-- +2.20.1 + + +From f06fec7597485a8d90aa81e3c65abea1bdeeb90b Mon Sep 17 00:00:00 2001 +From: Zhaoxin Yang +Date: Tue, 23 Jul 2024 15:19:00 +0800 +Subject: [PATCH 12/12] [LoongArch] Remove experimental `auto-vec` feature. + (#100070) + +Currently, automatic vectorization will be enabled with `-mlsx/-mlasx` +enabled. + +(cherry picked from commit 89d1eb67342d75d1de8d210157fdeaeb6a4724b6) +--- + llvm/lib/Target/LoongArch/LoongArch.td | 4 ---- + llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp | 2 -- + llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll | 2 +- + 3 files changed, 1 insertion(+), 7 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td +index b5cd5bb0f8a4..5f85cace71af 100644 +--- a/llvm/lib/Target/LoongArch/LoongArch.td ++++ b/llvm/lib/Target/LoongArch/LoongArch.td +@@ -105,10 +105,6 @@ def FeatureUAL + def FeatureRelax + : SubtargetFeature<"relax", "HasLinkerRelax", "true", + "Enable Linker relaxation">; +-// Experimental auto vectorization +-def FeatureAutoVec +- : SubtargetFeature<"auto-vec", "HasExpAutoVec", "true", +- "Experimental auto vectorization">; + + // Floating point approximation operation + def FeatureFrecipe +diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +index d47dded9ea6e..7961bb141e64 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +@@ -26,8 +26,6 @@ TypeSize LoongArchTTIImpl::getRegisterBitWidth( + case TargetTransformInfo::RGK_Scalar: + return TypeSize::getFixed(ST->is64Bit() ? 64 : 32); + case TargetTransformInfo::RGK_FixedWidthVector: +- if (!ST->hasExpAutoVec()) +- return DefSize; + if (ST->hasExtLASX()) + return TypeSize::getFixed(256); + if (ST->hasExtLSX()) +diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll +index a8ac2411dd82..6ab300859f9d 100644 +--- a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll ++++ b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll +@@ -1,5 +1,5 @@ + ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +-; RUN: opt < %s -passes=loop-vectorize -mtriple loongarch64-linux-gnu -mattr=+lasx,+auto-vec -S | FileCheck %s ++; RUN: opt < %s -passes=loop-vectorize -mtriple loongarch64-linux-gnu -mattr=+lasx -S | FileCheck %s + + ;; This is a collection of tests whose only purpose is to show changes in the + ;; default configuration. 
Please keep these tests minimal - if you're testing +-- +2.20.1 + diff --git a/llvm.spec b/llvm.spec index 79c026e..313c703 100644 --- a/llvm.spec +++ b/llvm.spec @@ -44,7 +44,7 @@ Name: %{pkg_name} Version: %{maj_ver}.%{min_ver}.%{patch_ver} -Release: 18 +Release: 19 Summary: The Low Level Virtual Machine License: NCSA @@ -78,6 +78,7 @@ Patch20: 0020-Update-llvm-lit-config-to-support-build_for_openeule.patch Patch21: 0021-Add-BiSheng-Autotuner-support-for-LLVM-compiler.patch Patch22: 0022-Prevent-environment-variables-from-exceeding-NAME_MA.patch Patch23: 0023-AArch64-Support-HiSilicon-s-HIP09-Processor.patch +Patch24: 0024-Backport-LoongArch-fix-and-add-some-new-support.patch BuildRequires: binutils-devel BuildRequires: cmake @@ -374,6 +375,9 @@ LD_LIBRARY_PATH=%{buildroot}/%{install_libdir} %{__ninja} check-all -C ./_build %{install_includedir}/llvm-gmock %changelog +* Mon Sep 23 2024 zhanglimin - 17.0.6-19 +- [LoongArch] Backport some new support + * Thu Sep 12 2024 xiajingze - 17.0.6-18 - [AArch64] Support HiSilicon's HIP09 Processor -- Gitee