diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 87aa3b98d93826560a09a4d9e6e1355e7a4e173a..66d2b018dfd742f7d019d1a369ee29b86252c794 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -65,7 +65,9 @@ #include "AArch64ExpandImm.h" #include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineLoopInfo.h" @@ -73,6 +75,867 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-mi-peephole-opt" +static cl::opt EnableSVEPeephole( + "aarch64-sve-peephole", cl::init(true), cl::Hidden, + cl::desc("Enable SVE gather/scatter peephole optimizations")); + +//===----------------------------------------------------------------------===// +// SVELoopAddressHoisting +//===----------------------------------------------------------------------===// +// +// This SVELoopAddressHoisting class optimizes SVE gather/scatter addressing +// modes (induction variable strength reduction). It transforms loop-invariant +// index increment patterns (addr = index + Const/LoopInvariant) into +// precomputed base addresses + fixed indices: +// +// Transformation patterns: +// 1) Original: address = base + index + Const +// Optimized: address = base_Const + index +// +// 2) Original: address = base + index + LoopInvariant +// Optimized: address = base_LoopInvariant + index +// +// C code example: +// - Hoists loop-variant address components to loop preheader +// - Replaces dynamic index calculations with static base offsets +// +// // Before optimization +// char *basePtr; +// for (int i = 0; ...; i += VSCALE) { +// svint32_t index = svindex_s32(i, 1); +// index = svadd_z(pg, index, LoopInvariantOffset); +// svld1_gather_index(pgNew, basePtr, index); +// } +// +// // After optimization +// char *hoistedBasePtr = basePtr + LoopInvariantScalar * ElementSizeInBytes; // Precomputed outside +// for (int i = 0; ...; i += VSCALE) { +// svint32_t index = svindex_s32(i, 1); +// svld1_gather_index(pgNew, hoistedBasePtr, index); // Fixed index in loop +// } +// + +namespace { +class SVELoopAddressHoisting { + MachineRegisterInfo *MRI; + const TargetInstrInfo *TII; + +public: + bool runOnMachineFunction(MachineFunction &MF, MachineLoopInfo *MLI); + +private: + using InstAndOffset = std::pair; + using ChainKey = std::tuple; + // Key: {BaseReg, RootIndexReg, InvariantGPROffset} + // Value: Vector of {Instruction, ElementOffset} pairs + using ChainMap = DenseMap>; + + // Define an enum for the SVE offset type. 
+ enum class SVEOffsetType { + NOT_APPLICABLE, // Not a recognized gather/scatter instruction + SXTW, + UXTW, + D64 + }; + + bool isLoopInvariant(Register Reg, MachineLoop *L) const; + bool isConstantVector(Register Reg, int64_t &Value) const; + bool isInvariantBroadcastGPR(Register VecReg, MachineLoop *L, + Register &GPR) const; + unsigned getElementSizeInBytes(const MachineInstr &MI, + SVEOffsetType *OffsetKind) const; + void traceIndexChain(Register IndexReg, Register &RootIndex, + int64_t &AccumulatedOffset, Register &InvariantGPROffset, + MachineLoop *L, + SmallVectorImpl &ChainsInsts) const; + void collectOptimizationCandidates( + MachineLoop *L, ChainMap &Chains, + SetVector &CandidateDeadInsts) const; + bool hoistInvariantsAndRewrite(MachineLoop *L, const ChainMap &Chains); + bool cleanupDeadCode(SetVector &CandidateDeadInsts); + bool sveMulStrengthReduction(MachineLoop *L); + bool processLoop(MachineLoop *L); +}; +} // end anonymous namespace + +// Check if Reg is a loop invariant to Loop L +bool SVELoopAddressHoisting::isLoopInvariant(Register Reg, + MachineLoop *L) const { + if (!Reg.isVirtual()) + return false; + MachineInstr *Def = MRI->getVRegDef(Reg); + if (!Def) + return true; + return !L->contains(Def->getParent()); +} + +// Check if a vector register represents a constant value +// and retrieve that constant value if it exists +bool SVELoopAddressHoisting::isConstantVector(Register Reg, + int64_t &Value) const { + if (!Reg.isVirtual()) + return false; + + MachineInstr *Def = MRI->getVRegDef(Reg); + if (!Def) + return false; + + // Match the DUP instruction pattern: %Def = DUP_ZI_S Imm, 0 + // This instruction broadcasts the immediate value to all vector elements + unsigned DupOp = Def->getOpcode(); + if (DupOp == AArch64::DUP_ZI_S || DupOp == AArch64::DUP_ZI_D) { + Value = Def->getOperand(1).getImm(); + return true; + } + return false; +} + +// Checks if a vector register is broadcasted from a loop-invariant GPR +// Matches instruction pattern: %VecReg = DUP_ZR_S %GPR +// Where %GPR is loop-invariant to loop L +bool SVELoopAddressHoisting::isInvariantBroadcastGPR(Register VecReg, + MachineLoop *L, + Register &GPR) const { + if (!VecReg.isVirtual()) + return false; + + MachineInstr *Def = MRI->getVRegDef(VecReg); + if (!Def) + return false; + + unsigned DupOp = Def->getOpcode(); + if (DupOp == AArch64::DUP_ZR_S || DupOp == AArch64::DUP_ZR_D) { + Register SrcGPR = Def->getOperand(1).getReg(); + if (isLoopInvariant(SrcGPR, L)) { + GPR = SrcGPR; + return true; + } + } + return false; +} + +// Returns element size in bytes for gather/scatter instructions +// Returns 0 for non-gather/scatter instructions +unsigned +SVELoopAddressHoisting::getElementSizeInBytes(const MachineInstr &MI, + SVEOffsetType *OffsetKind) const { + switch (MI.getOpcode()) { + // --- Element Size: 2 Bytes (Half-Word) --- + case AArch64::GLD1H_D_SCALED: + case AArch64::GLD1SH_D_SCALED: + case AArch64::GLDFF1H_D_SCALED: + case AArch64::GLDFF1SH_D_SCALED: + case AArch64::LDNT1H_ZZR_D_REAL: + case AArch64::LDNT1SH_ZZR_D_REAL: + case AArch64::SST1H_D_SCALED: + case AArch64::STNT1H_ZZR_D_REAL: + *OffsetKind = SVEOffsetType::D64; + return 2; + case AArch64::GLD1H_S_SXTW_SCALED: + case AArch64::GLD1SH_S_SXTW_SCALED: + case AArch64::GLDFF1H_S_SXTW_SCALED: + case AArch64::GLDFF1SH_S_SXTW_SCALED: + case AArch64::SST1H_S_SXTW_SCALED: + *OffsetKind = SVEOffsetType::SXTW; + return 2; + case AArch64::GLD1H_S_UXTW_SCALED: + case AArch64::GLD1SH_S_UXTW_SCALED: + case AArch64::GLDFF1H_S_UXTW_SCALED: + case 
AArch64::GLDFF1SH_S_UXTW_SCALED: + case AArch64::SST1H_S_UXTW_SCALED: + *OffsetKind = SVEOffsetType::UXTW; + return 2; + + // --- Element Size: 4 Bytes (Word) --- + case AArch64::GLD1SW_D_SCALED: + case AArch64::GLD1W_D_SCALED: + case AArch64::GLDFF1SW_D_SCALED: + case AArch64::GLDFF1W_D_SCALED: + case AArch64::LDNT1SW_ZZR_D_REAL: + case AArch64::LDNT1W_ZZR_D_REAL: + case AArch64::SST1W_D_SCALED: + case AArch64::STNT1W_ZZR_D_REAL: + *OffsetKind = SVEOffsetType::D64; + return 4; + case AArch64::GLD1W_SXTW_SCALED: + case AArch64::GLDFF1W_SXTW_SCALED: + case AArch64::PRFW_S_SXTW_SCALED: + case AArch64::SST1W_SXTW_SCALED: + *OffsetKind = SVEOffsetType::SXTW; + return 4; + case AArch64::GLD1W_UXTW_SCALED: + case AArch64::GLDFF1W_UXTW_SCALED: + case AArch64::PRFW_S_UXTW_SCALED: + case AArch64::SST1W_UXTW_SCALED: + *OffsetKind = SVEOffsetType::UXTW; + return 4; + + // --- Element Size: 8 Bytes (Double-Word) --- + case AArch64::GLD1D_SCALED: + case AArch64::GLDFF1D_SCALED: + case AArch64::LDNT1D_ZZR_D_REAL: + case AArch64::PRFW_D_SCALED: + case AArch64::SST1D_SCALED: + case AArch64::STNT1D_ZZR_D_REAL: + *OffsetKind = SVEOffsetType::D64; + return 8; + case AArch64::GLD1D_SXTW_SCALED: + case AArch64::SST1D_SXTW_SCALED: + *OffsetKind = SVEOffsetType::SXTW; + return 8; + case AArch64::GLD1D_UXTW_SCALED: + case AArch64::SST1D_UXTW_SCALED: + *OffsetKind = SVEOffsetType::UXTW; + return 8; + default: + StringRef InstName = TII->getName(MI.getOpcode()); + if (InstName.startswith("GLD") || InstName.startswith("SST") || + InstName.startswith("LDNT") || InstName.startswith("STNT") || + InstName.startswith("PRFW")) { + LLVM_DEBUG(dbgs() << "SVELoopAddressHoisting: Unhandled SVE gather/scatter-like instruction found: " + << MI); + } + + *OffsetKind = SVEOffsetType::NOT_APPLICABLE; + return 0; + } +} + +// Traces index chain to discover: +// - Root index register +// - Accumulated constant offset +// - Loop-invariant GPR offset component +// - And collects the chain instructions for potential deletion +void SVELoopAddressHoisting::traceIndexChain( + Register IndexReg, Register &RootIndex, int64_t &AccumulatedOffset, + Register &InvariantGPROffset, MachineLoop *L, + SmallVectorImpl &ChainInsts) const { + AccumulatedOffset = 0; + InvariantGPROffset = Register(0); + Register CurrentReg = IndexReg; + + while (true) { + if (!CurrentReg.isVirtual()) + break; + + MachineInstr *Def = MRI->getVRegDef(CurrentReg); + // Index must be defined within loop as induction variable + if (!Def || !L->contains(Def->getParent())) + break; + + // Match svadd index increment pattern: + // %index = ADD_ZI_[S/D] %prev_index, %offset, %pg + // %index = ADD_ZZZ_D %prev_index, %offset + // %index = ADD_ZPZZ_[S/D]_ZERO %pg, %prev_index, %offset + unsigned IndexOp = Def->getOpcode(); + if (IndexOp == AArch64::ADD_ZI_S || IndexOp == AArch64::ADD_ZI_D) { + int64_t ConstValue = Def->getOperand(2).getImm(); + AccumulatedOffset += ConstValue; + CurrentReg = Def->getOperand(1).getReg(); + ChainInsts.push_back(Def); + continue; + } + + Register Op1, Op2; + if (IndexOp == AArch64::ADD_ZZZ_S || IndexOp == AArch64::ADD_ZZZ_D) { + Op1 = Def->getOperand(1).getReg(); + Op2 = Def->getOperand(2).getReg(); + } else if (IndexOp == AArch64::ADD_ZPZZ_S_ZERO || + IndexOp == AArch64::ADD_ZPZZ_D_ZERO || + IndexOp == AArch64::ADD_ZPmZ_S || + IndexOp == AArch64::ADD_ZPmZ_D) { + Op1 = Def->getOperand(2).getReg(); + Op2 = Def->getOperand(3).getReg(); + } else { + break; + } + + int64_t ConstValue; + Register InvariantGPR; + + // Op2 case 1: Constant vector offset + 
if (isConstantVector(Op2, ConstValue)) { + AccumulatedOffset += ConstValue; + CurrentReg = Op1; + ChainInsts.push_back(MRI->getVRegDef(Op2)); + ChainInsts.push_back(Def); + continue; + } + + // Op2 case 2: Loop-invariant GPR broadcast offset + if (InvariantGPROffset == 0 && + isInvariantBroadcastGPR(Op2, L, InvariantGPR)) { + InvariantGPROffset = InvariantGPR; + CurrentReg = Op1; + ChainInsts.push_back(MRI->getVRegDef(Op2)); + ChainInsts.push_back(Def); + continue; + } + + // Op1 case 1: Constant vector offset + if (isConstantVector(Op1, ConstValue)) { + AccumulatedOffset += ConstValue; + CurrentReg = Op2; + ChainInsts.push_back(MRI->getVRegDef(Op1)); + ChainInsts.push_back(Def); + continue; + } + + // Op1 case 2: Loop-invariant GPR broadcast offset + if (InvariantGPROffset == 0 && + isInvariantBroadcastGPR(Op1, L, InvariantGPR)) { + InvariantGPROffset = InvariantGPR; + CurrentReg = Op2; + ChainInsts.push_back(MRI->getVRegDef(Op1)); + ChainInsts.push_back(Def); + continue; + } + break; + } + + RootIndex = CurrentReg; +} + +// Collects all optimizable gather/scatter instructions +// and groups them into chains. +void SVELoopAddressHoisting::collectOptimizationCandidates( + MachineLoop *L, ChainMap &Chains, + SetVector &CandidateDeadInsts) const { + for (MachineBasicBlock *MBB : L->getBlocks()) { + for (MachineInstr &MI : *MBB) { + SVEOffsetType OffsetType; + unsigned ElementSize = getElementSizeInBytes(MI, &OffsetType); + if (ElementSize == 0) + continue; + + // Verify instruction format: + // Gather: DstZPR, PredicatePPR, BaseGPR, IndexZPR + // Scatter: SrcZPR, PredicatePPR, BaseGPR, IndexZPR + if (MI.getNumOperands() < 4) + continue; + + Register BaseReg = MI.getOperand(2).getReg(); + Register IndexReg = MI.getOperand(3).getReg(); + // Only optimize loop-invariant base addresses + if (!isLoopInvariant(BaseReg, L)) + continue; + + Register RootIndex, InvariantGPROffset; + int64_t ElemOffset; + SmallVector TmpChainInsts; // Store chain for this MI + + // Trace index computation chain + traceIndexChain(IndexReg, RootIndex, ElemOffset, InvariantGPROffset, L, + TmpChainInsts); + + // If the chain is empty, there's nothing to optimize or delete. + if (TmpChainInsts.empty() && InvariantGPROffset == 0 && ElemOffset == 0) + continue; + + LLVM_DEBUG(dbgs() << "Found candidate instruction: "; MI.dump(); + dbgs() << " BaseReg: " << printReg(BaseReg) + << ", IndexReg: " << printReg(IndexReg) + << " -> RootIndex: " << printReg(RootIndex) + << ", ElemOffset: " << ElemOffset + << ", InvariantGPROffset: " + << printReg(InvariantGPROffset) << "\n"); + + Chains[{BaseReg, RootIndex, InvariantGPROffset}].push_back( + {&MI, ElemOffset}); + + // Add the identified chain instructions to the master set of candidates. 
+ CandidateDeadInsts.insert(TmpChainInsts.begin(), TmpChainInsts.end()); + } + } +} + +bool SVELoopAddressHoisting::hoistInvariantsAndRewrite(MachineLoop *L, + const ChainMap &Chains) { + bool Changed = false; + MachineBasicBlock *Preheader = L->getLoopPreheader(); + + for (auto &ChainInfo : Chains) { + auto &Addressings = ChainInfo.second; + // Skip chains without optimizable offsets + if (Addressings.size() < 2 && std::get<2>(ChainInfo.first) == 0 && + Addressings[0].second == 0) + continue; + + Register BaseReg = std::get<0>(ChainInfo.first); + Register RootIndex = std::get<1>(ChainInfo.first); + Register InvariantGPROffset = std::get<2>(ChainInfo.first); + + LLVM_DEBUG(dbgs() << "Optimizing chain with BaseReg: " << printReg(BaseReg) + << ", RootIndex: " << printReg(RootIndex) + << ", InvariantGPROffset: " + << printReg(InvariantGPROffset) << "\n"); + + auto InsertPt = Preheader->getFirstTerminator(); + DebugLoc DL = Addressings[0].first->getDebugLoc(); + + // Handle loop-invariant GPR offset first + Register CurrentBaseReg = BaseReg; + if (InvariantGPROffset != 0) { + Register HoistedBaseReg = + MRI->createVirtualRegister(&AArch64::GPR64RegClass); + SVEOffsetType OffsetType; + unsigned ElementSize = + getElementSizeInBytes(*Addressings[0].first, &OffsetType); + unsigned ShiftAmt = Log2_64(ElementSize); + + const TargetRegisterClass *RC = MRI->getRegClass(InvariantGPROffset); + // Skip if offset is not a GPR register, cannot be used as address offset + if (!AArch64::GPR32RegClass.hasSubClassEq(RC) && + !AArch64::GPR64RegClass.hasSubClassEq(RC)) { + LLVM_DEBUG(dbgs() << " Skipping due to non-GPR offset register: " + << printReg(InvariantGPROffset) << "\n"); + continue; + } + + unsigned AddOp, ShiftExtender; + if (AArch64::GPR32RegClass.hasSubClassEq(RC)) { + if (OffsetType == SVEOffsetType::SXTW) + ShiftExtender = + AArch64_AM::getArithExtendImm(AArch64_AM::SXTW, ShiftAmt); + else + ShiftExtender = + AArch64_AM::getArithExtendImm(AArch64_AM::UXTW, ShiftAmt); + AddOp = AArch64::ADDXrx; + } else { + ShiftExtender = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); + AddOp = AArch64::ADDXrs; + } + + // HoistedBase = Base + (InvariantGPROffset << log2(ElementSize)) + BuildMI(*Preheader, InsertPt, DL, TII->get(AddOp), HoistedBaseReg) + .addReg(BaseReg) + .addReg(InvariantGPROffset) + .addImm(ShiftExtender); + CurrentBaseReg = HoistedBaseReg; + LLVM_DEBUG(dbgs() << " Hoisted and scaled Invariant GPR Offset" + << " into new base " << printReg(CurrentBaseReg) + << "\n"); + } + + // Map constant element offsets to newly created base registers to avoid + // redundant ADD instructions in the preheader. 
+ DenseMap OffsetToNewBaseMap; + + for (auto &AddressInfo : Addressings) { + MachineInstr *MI = AddressInfo.first; + int64_t ElemOffset = AddressInfo.second; + + // Case 1: Zero offset - use current base directly + if (ElemOffset == 0) { + MI->getOperand(2).setReg(CurrentBaseReg); + MI->getOperand(3).setReg(RootIndex); + Changed = true; + continue; + } + + // Case 2: Non-zero offset - create or reuse offset base + Register NewBaseReg; + if (OffsetToNewBaseMap.count(ElemOffset)) { + NewBaseReg = OffsetToNewBaseMap[ElemOffset]; + } else { + // Create new base: NewBase = BaseReg + (ElemOffset * ElementSize) + SVEOffsetType OffsetType; + unsigned ElementSize = getElementSizeInBytes(*MI, &OffsetType); + int64_t ByteOffset = ElemOffset * ElementSize; + + NewBaseReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + DebugLoc DL = MI->getDebugLoc(); + BuildMI(*Preheader, InsertPt, DL, TII->get(AArch64::ADDXri), NewBaseReg) + .addReg(CurrentBaseReg) + .addImm(ByteOffset) + .addImm(0); + OffsetToNewBaseMap[ElemOffset] = NewBaseReg; + LLVM_DEBUG(dbgs() << " Hoisted new base for ElemOffset " << ElemOffset + << " (ByteOffset " << ByteOffset << ") into " + << printReg(NewBaseReg) << "\n"); + } + + // Rewrite Gather/Scatter MIR + // Original: GLD*/SST* ..., [BaseReg], [IndexReg] + // Optimized: GLD*/SST* ..., [NewBaseReg], [RootIndex] + MI->getOperand(2).setReg(NewBaseReg); + MI->getOperand(3).setReg(RootIndex); + + LLVM_DEBUG(dbgs() << " Rewrote instruction: "; MI->dump()); + Changed = true; + } + } + return Changed; +} + +bool SVELoopAddressHoisting::cleanupDeadCode( + SetVector &CandidateDeadInsts) { + if (CandidateDeadInsts.empty()) + return false; + + bool Changed = false; + LLVM_DEBUG(dbgs() << "--- Cleaning up dead instructions ---\n"); + for (MachineInstr *MI : llvm::reverse(CandidateDeadInsts)) { + bool IsDead = true; + for (const MachineOperand &MO : MI->operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg().isVirtual()) { + if (!MRI->use_empty(MO.getReg())) { + IsDead = false; + break; + } + } + } + + if (!IsDead) + continue; + + LLVM_DEBUG(dbgs() << "Deleting dead instruction: "; MI->dump()); + MI->eraseFromParent(); + Changed = true; + } + return Changed; +} + +// This optimization performs strength reduction for gather/scatter index (SVE +// MUL instruction) in a loop. It identifies a specific pattern of a multiply +// operation on an induction variable within a loop and replaces it with a more +// efficient addition-based approach. 
The targeted pattern is essentially:
+//   for (int jp = jp_init; jp < loopTime; jp += svcnt[w/d]()) {
+//     jpsv = svindex_[](jp, IndexStep)
+//     result = svmul_z(pgNew, jpsv, Multiplier)
+//   }
+//
+// This is transformed into:
+//   new_offset = svindex_[](0, IndexStep * Multiplier)   // hoisted to preheader
+//   new_step   = IVStep * Multiplier                     // IVStep is usually svcnt[w/d]()
+//   for (int jp = jp_init, base = jp_init * Multiplier; jp < loopTime;
+//        jp += svcnt[w/d](), base += new_step) {
+//     result = svadd_z(pgNew, new_offset, base)
+//   }
+bool SVELoopAddressHoisting::sveMulStrengthReduction(MachineLoop *L) {
+  MachineBasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader)
+    return false;
+
+  MachineBasicBlock *Header = L->getHeader();
+  MachineBasicBlock *Latch = L->getLoopLatch();
+  if (!Header || !Latch)
+    return false;
+
+  for (MachineBasicBlock *MBB : L->getBlocks()) {
+    for (MachineInstr &MI : *MBB) {
+      // --- Start of the SVE MUL_Z Pattern Match ---
+      if (MI.getOpcode() != AArch64::MUL_ZPmZ_S &&
+          MI.getOpcode() != AArch64::MUL_ZPmZ_D) {
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "Found candidate MUL: "; MI.dump());
+
+      // Set up the specific AArch64 opcodes based on whether we have a 32-bit
+      // or 64-bit operation.
+      bool is64Bit = (MI.getOpcode() == AArch64::MUL_ZPmZ_D);
+      unsigned SelOpc = is64Bit ? AArch64::SEL_ZPZZ_D : AArch64::SEL_ZPZZ_S;
+      unsigned IndexRiOpc = is64Bit ? AArch64::INDEX_RI_D : AArch64::INDEX_RI_S;
+      unsigned AddZzzOpc = is64Bit ? AArch64::ADD_ZZZ_D : AArch64::ADD_ZZZ_S;
+      unsigned DupZrOpc = is64Bit ? AArch64::DUP_ZR_D : AArch64::DUP_ZR_S;
+      unsigned IndexIiOpc = is64Bit ? AArch64::INDEX_II_D : AArch64::INDEX_II_S;
+      unsigned DupZiOpc = is64Bit ? AArch64::DUP_ZI_D : AArch64::DUP_ZI_S;
+      unsigned AddGprOpc = is64Bit ? AArch64::ADDXrr : AArch64::ADDWrr;
+      unsigned CntOpc = is64Bit ? AArch64::CNTD_XPiI : AArch64::CNTW_XPiI;
+      unsigned MaddGprOpc = is64Bit ? AArch64::MADDXrrr : AArch64::MADDWrrr;
+      unsigned MovImmOpc = is64Bit ? AArch64::MOVi64imm : AArch64::MOVi32imm;
+      unsigned ZeroReg = is64Bit ? AArch64::XZR : AArch64::WZR;
+
+      const TargetRegisterClass *GprRegClass =
+          is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+      const TargetRegisterClass *GprAllRegClass =
+          is64Bit ? &AArch64::GPR64allRegClass : &AArch64::GPR32allRegClass;
+      const TargetRegisterClass *ZprRegClass = &AArch64::ZPRRegClass;
+
+      // Deconstruct the multiply instruction to see if it matches our target
+      // pattern. The matched pattern is: MUL(SEL(Pred, INDEX(IV, IdxStep),
+      // Zero), DUP(Multiplier))
+      MachineInstr *SelMI = MRI->getVRegDef(MI.getOperand(2).getReg());
+      if (!SelMI || (SelMI->getOpcode() != SelOpc))
+        continue;
+
+      // The second operand of the select should be an index operation.
+ MachineInstr *IndexMI = MRI->getVRegDef(SelMI->getOperand(2).getReg()); + if (!IndexMI) + continue; + + Register IVReg; + int64_t IndexStep; + // Detect the two index generated ways + if (IndexMI->getOpcode() == IndexRiOpc) { + // Case 1: INDEX_RI (reg, imm) + IVReg = IndexMI->getOperand(1).getReg(); + if (!IVReg.isVirtual()) + continue; + IndexStep = IndexMI->getOperand(2).getImm(); + } else if (IndexMI->getOpcode() == AddZzzOpc) { + // Case 2: ADD(INDEX_II(0, imm), DUP(reg)) + MachineInstr *Op1 = MRI->getVRegDef(IndexMI->getOperand(1).getReg()); + MachineInstr *Op2 = MRI->getVRegDef(IndexMI->getOperand(2).getReg()); + if (!Op1 || !Op2) + continue; + + auto matchIndexAddPattern = [&](MachineInstr *A, MachineInstr *B) { + return (A->getOpcode() == IndexIiOpc && B->getOpcode() == DupZrOpc && + A->getOperand(1).getImm() == 0); + }; + + if (matchIndexAddPattern(Op1, Op2)) { + IndexStep = Op1->getOperand(2).getImm(); + IVReg = Op2->getOperand(1).getReg(); + } else if (matchIndexAddPattern(Op2, Op1)) { + IndexStep = Op2->getOperand(2).getImm(); + IVReg = Op1->getOperand(1).getReg(); + } else { + continue; + } + } else { + continue; + } + + // The third operand of the multiply should be a duplicated immediate + // value. + MachineInstr *MultiplierMI = MRI->getVRegDef(MI.getOperand(3).getReg()); + if (!MultiplierMI || !isLoopInvariant(MI.getOperand(3).getReg(), L) || + (MultiplierMI->getOpcode() != DupZiOpc)) + continue; + int64_t MultiplierVal = MultiplierMI->getOperand(1).getImm(); + + // Check if the identified register is a basic loop induction variable. + MachineInstr *IVPhi = MRI->getVRegDef(IVReg); + if (!IVPhi || !IVPhi->isPHI() || IVPhi->getParent() != Header) + continue; + + // Find the instruction that updates the induction variable (usually an + // ADD in the latch). + Register IVInitReg = Register(0), IVNextReg = Register(0); + for (unsigned i = 1; i < IVPhi->getNumOperands(); i += 2) { + if (IVPhi->getOperand(i + 1).getMBB() == Preheader) { + IVInitReg = IVPhi->getOperand(i).getReg(); + break; + } + } + for (unsigned i = 1; i < IVPhi->getNumOperands(); i += 2) { + if (IVPhi->getOperand(i + 1).getMBB() == Latch) { + IVNextReg = IVPhi->getOperand(i).getReg(); + break; + } + } + if (!IVInitReg || !IVNextReg) + continue; + + // Get the definition of the next value of the induction variable. + MachineInstr *IVUpdateMI = MRI->getVRegDef(IVNextReg); + if (!IVUpdateMI) + continue; + if (IVUpdateMI->getOpcode() == AArch64::COPY) + IVUpdateMI = MRI->getVRegDef(IVUpdateMI->getOperand(1).getReg()); + if (IVUpdateMI->getOpcode() != AddGprOpc) + continue; + + // Determine the step of the induction variable. 
+ Register IVStepReg; + if (IVUpdateMI->getOperand(1).getReg() == IVReg) + IVStepReg = IVUpdateMI->getOperand(2).getReg(); + else if (IVUpdateMI->getOperand(2).getReg() == IVReg) + IVStepReg = IVUpdateMI->getOperand(1).getReg(); + else + continue; + + LLVM_DEBUG( + dbgs() << "Sve Mul Strength reduction pattern matched for MUL: "; + MI.dump();); + + // --- Start of the Transformation --- + auto PreheaderInsertPt = Preheader->getFirstTerminator(); + DebugLoc DL = MI.getDebugLoc(); + + // In the preheader, create a new offset = index(0, IndexStep * + // MultiplierVal) + Register OffsetVecReg = MRI->createVirtualRegister(ZprRegClass); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(IndexIiOpc), + OffsetVecReg) + .addImm(0) + .addImm(IndexStep * MultiplierVal) + .addReg(AArch64::VG, RegState::Implicit); + + // In the preheader, calculate the new step value for our new induction + // variable. This is: NewStep = IVStep * MultiplierVal + MachineInstr *IVStepDef = MRI->getVRegDef(IVStepReg); + if (IVStepDef->getOpcode() == AArch64::COPY) + IVStepDef = MRI->getVRegDef(IVStepDef->getOperand(1).getReg()); + + // Check if the original IV step is the vector length (vl). + bool isStepVL = + IVStepDef && IVStepDef->getOpcode() == CntOpc && + IVStepDef->getOperand(1).getImm() == 31 && // Pattern for 'all' + IVStepDef->getOperand(2).getImm() == 1; // Multiplier of 1 + Register NewStepReg = MRI->createVirtualRegister(GprRegClass); + + // If the step is 'vl' and the multiplier is small, we can use a more + // efficient 'cnt' instruction. + if (isStepVL && MultiplierVal <= 15) { + Register NewStep64Reg = + MRI->createVirtualRegister(&AArch64::GPR64RegClass); + LLVM_DEBUG(dbgs() << "IV Step is vl, using CNT[W/D] for new step.\n"); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(CntOpc), + NewStep64Reg) + .addImm(31) // Pattern 'all' for vl + .addImm(MultiplierVal) + .addReg(AArch64::VG, RegState::Implicit); + if (!is64Bit) { + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(AArch64::COPY), + NewStepReg) + .addReg(NewStep64Reg, 0, AArch64::sub_32); + } + } else { + // Otherwise, we use a general multiplication. + LLVM_DEBUG( + dbgs() << "IV Step is not vl, using generic MUL for new step.\n"); + Register MultReg = MRI->createVirtualRegister(GprRegClass); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(MovImmOpc), MultReg) + .addImm(MultiplierVal); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(MaddGprOpc), + NewStepReg) + .addReg(IVStepReg) + .addReg(MultReg) + .addReg(ZeroReg); + } + + // In the preheader, calculate the initial value for the new base IV. + // BaseIVInit = IVInit * MultiplierVal + Register BaseIVInitReg = MRI->createVirtualRegister(GprAllRegClass); + Register MultRegForInit = MRI->createVirtualRegister(GprRegClass); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(MovImmOpc), + MultRegForInit) + .addImm(MultiplierVal); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(MaddGprOpc), + BaseIVInitReg) + .addReg(IVInitReg) + .addReg(MultRegForInit) + .addReg(ZeroReg); + + // Create a new PHI node in the header + // for our new base induction variable. 
+      Register BaseIVReg = MRI->createVirtualRegister(GprAllRegClass);
+      Register NextBaseIVReg = MRI->createVirtualRegister(GprAllRegClass);
+      auto BaseIVPhi = BuildMI(*Header, Header->getFirstNonPHI(), DL,
+                               TII->get(AArch64::PHI), BaseIVReg);
+      BaseIVPhi.addReg(BaseIVInitReg).addMBB(Preheader);
+
+      // In the loop latch, update our new base induction variable
+      // by adding the new step.
+      BuildMI(*Latch, Latch->getFirstTerminator(), DL, TII->get(AddGprOpc),
+              NextBaseIVReg)
+          .addReg(BaseIVReg)
+          .addReg(NewStepReg);
+
+      BaseIVPhi.addReg(NextBaseIVReg).addMBB(Latch);
+
+      // Now, replace the original multiply operation in the loop body
+      // with a new add operation.
+      auto BodyInsertPt = MI.getIterator();
+
+      // Broadcast the new base IV into a vector register.
+      Register BaseVecReg = MRI->createVirtualRegister(ZprRegClass);
+      BuildMI(*MI.getParent(), BodyInsertPt, DL, TII->get(DupZrOpc), BaseVecReg)
+          .addReg(BaseIVReg);
+
+      // Perform the vector addition: NewResult = OffsetVector + BaseVector
+      Register AddTmpReg = MRI->createVirtualRegister(ZprRegClass);
+      BuildMI(*MI.getParent(), BodyInsertPt, DL, TII->get(AddZzzOpc), AddTmpReg)
+          .addReg(OffsetVecReg)
+          .addReg(BaseVecReg);
+
+      // Replace all uses of the original multiplication result
+      // with our new addition result.
+      MRI->replaceRegWith(MI.getOperand(0).getReg(), AddTmpReg);
+
+      // Clean up the now-dead instructions from the old calculation.
+      MI.eraseFromParent();
+      if (MRI->use_empty(SelMI->getOperand(0).getReg()))
+        SelMI->eraseFromParent();
+      if (MRI->use_empty(IndexMI->getOperand(0).getReg())) {
+        if (IndexMI->getOpcode() == AddZzzOpc) {
+          MachineInstr *Op1 = MRI->getVRegDef(IndexMI->getOperand(1).getReg());
+          MachineInstr *Op2 = MRI->getVRegDef(IndexMI->getOperand(2).getReg());
+          if (MRI->use_empty(Op1->getOperand(0).getReg()))
+            Op1->eraseFromParent();
+          if (MRI->use_empty(Op2->getOperand(0).getReg()))
+            Op2->eraseFromParent();
+        }
+        IndexMI->eraseFromParent();
+      }
+      if (MRI->use_empty(MultiplierMI->getOperand(0).getReg()))
+        MultiplierMI->eraseFromParent();
+
+      LLVM_DEBUG(dbgs() << "Successfully applied strength reduction.\n");
+
+      return true;
+    }
+  }
+  return false;
+}
+
+bool SVELoopAddressHoisting::processLoop(MachineLoop *L) {
+  MachineBasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader)
+    return false;
+
+  bool Changed = false;
+  LLVM_DEBUG(dbgs() << "********** Processing Loop in Function: "
+                    << L->getHeader()->getParent()->getName()
+                    << " (Loop Header: " << L->getHeader()->getName()
+                    << ") **********\n");
+
+  // Collect all candidate instructions and their addressing chains.
+  ChainMap Chains;
+  SetVector<MachineInstr *> CandidateDeadInsts;
+  collectOptimizationCandidates(L, Chains, CandidateDeadInsts);
+
+  if (Chains.empty())
+    return false;
+
+  // Hoist invariants and rewrite the instructions in the loop.
+  Changed |= hoistInvariantsAndRewrite(L, Chains);
+
+  // Clean up the original, now-dead, address computation instructions.
+  if (Changed)
+    Changed |= cleanupDeadCode(CandidateDeadInsts);
+  Changed |= sveMulStrengthReduction(L);
+  return Changed;
+}
+
+bool SVELoopAddressHoisting::runOnMachineFunction(MachineFunction &MF,
+                                                  MachineLoopInfo *MLI) {
+  if (!MF.getSubtarget<AArch64Subtarget>().hasSVE())
+    return false;
+
+  MRI = &MF.getRegInfo();
+  TII = MF.getSubtarget().getInstrInfo();
+
+  LLVM_DEBUG(dbgs() << "Running AArch64 SVE Loop Address Hoisting on function: "
+                    << MF.getName() << "\n");
+
+  bool Changed = false;
+  for (MachineLoop *L : *MLI) {
+    for (MachineLoop *SubL : *L) {
+      Changed |= processLoop(SubL);
+    }
+    Changed |=
processLoop(L); + } + + return Changed; +} + +//===----------------------------------------------------------------------===// +// AArch64MIPeepholeOpt +//===----------------------------------------------------------------------===// + namespace { struct AArch64MIPeepholeOpt : public MachineFunctionPass { @@ -86,6 +949,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { const AArch64RegisterInfo *TRI; MachineLoopInfo *MLI; MachineRegisterInfo *MRI; + SVELoopAddressHoisting AddressHoister; using OpcodePair = std::pair; template @@ -751,7 +1615,8 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { } } } - + if (EnableSVEPeephole) + Changed |= AddressHoister.runOnMachineFunction(MF, MLI); return Changed; } diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll new file mode 100644 index 0000000000000000000000000000000000000000..a2f826f9aba965ab4e5951f392a95b8fc948bfec --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll @@ -0,0 +1,540 @@ +; RUN: llc -mtriple=aarch64-unknown -mcpu=hip09 -O1 %s -o - | FileCheck %s + +define dso_local void @test_gather_multi_constOffset(i32 noundef %loopTime, ptr noundef %x, float noundef %ipx, float noundef %ipy, float noundef %ipz, ptr nocapture noundef nonnull align 4 dereferenceable(4) %tempx, ptr nocapture noundef nonnull align 4 dereferenceable(4) %tempy, ptr nocapture noundef nonnull align 4 dereferenceable(4) %tempz) local_unnamed_addr #0 { +; CHECK-LABEL: test_gather_multi_constOffset: +; CHECK: // %bb.1: // %for.body.lr.ph +; CHECK: mov w[[MULTIPLIER:[0-9]+]], #3 +; CHECK: index z[[OFFSET:[0-9]+]].s, #0, #3 +; CHECK: add x[[NEWBASE1:[0-9]+]], x1, #4 +; CHECK: mul w[[IV:[0-9]+]], wzr, w[[MULTIPLIER]] +; CHECK: add x[[NEWBASE2:[0-9]+]], x1, #8 +; CHECK: cntw x[[STRIDE:[0-9]+]], all, mul #3 +; CHECK: .LBB0_2: // %for.body +; CHECK: mov z[[INDEX:[0-9]+]].s, w[[IV]] +; CHECK: add w[[IV]], w[[IV]], w[[STRIDE]] +; CHECK: add z[[INDEX]].s, z[[OFFSET]].s, z[[INDEX]].s +; CHECK: ld1w { {{z[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x1, z[[INDEX:[0-9]+]].s, sxtw #2] +; CHECK: ld1w { {{z[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE1]], z[[INDEX]].s, sxtw #2] +; CHECK: ld1w { {{z[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE2]], z[[INDEX]].s, sxtw #2] +entry: + %cmp18 = icmp sgt i32 %loopTime, 0 + br i1 %cmp18, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %.splatinsert = insertelement poison, float %ipx, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %.splatinsert2 = insertelement poison, float %ipy, i64 0 + %.splat3 = shufflevector %.splatinsert2, poison, zeroinitializer + %.splatinsert5 = insertelement poison, float %ipz, i64 0 + %.splat6 = shufflevector %.splatinsert5, poison, zeroinitializer + %.tr = tail call i32 @llvm.vscale.i32() + %0 = shl nuw nsw i32 %.tr, 2 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %jp.019 = phi i32 [ 0, %for.body.lr.ph ], [ %conv10, %for.body ] + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.019, i32 %loopTime) + %2 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.019, i32 1) + %3 = select %1, %2, zeroinitializer + %4 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %3, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer)) + %5 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %x, %4) + %6 = select %1, %5, 
zeroinitializer + %7 = tail call @llvm.aarch64.sve.fsubr.nxv4f32( %1, %6, %.splat) + %8 = select %1, %7, zeroinitializer + %9 = tail call @llvm.aarch64.sve.fmul.nxv4f32( %1, %8, %7) + %10 = select %1, %4, zeroinitializer + %11 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %10, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %12 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %x, %11) + %13 = select %1, %12, zeroinitializer + %14 = tail call @llvm.aarch64.sve.fsub.nxv4f32( %1, %13, %.splat3) + %15 = select %1, %14, zeroinitializer + %16 = tail call @llvm.aarch64.sve.fmad.nxv4f32( %1, %15, %14, %9) + %17 = select %1, %11, zeroinitializer + %18 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %17, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %19 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %x, %18) + %20 = select %1, %19, zeroinitializer + %21 = tail call @llvm.aarch64.sve.fsub.nxv4f32( %1, %20, %.splat6) + %22 = select %1, %21, zeroinitializer + %23 = tail call @llvm.aarch64.sve.fmad.nxv4f32( %1, %22, %21, %16) + %24 = tail call @llvm.aarch64.sve.fsqrt.nxv4f32( zeroinitializer, %1, %23) + %25 = tail call @llvm.aarch64.sve.fmul.nxv4f32( %1, %8, %24) + %26 = tail call float @llvm.aarch64.sve.faddv.nxv4f32( %1, %25) + %27 = load float, ptr %tempx, align 4, !tbaa !5 + %add = fadd float %26, %27 + store float %add, ptr %tempx, align 4, !tbaa !5 + %28 = tail call @llvm.aarch64.sve.fmul.nxv4f32( %1, %15, %24) + %29 = tail call float @llvm.aarch64.sve.faddv.nxv4f32( %1, %28) + %30 = load float, ptr %tempy, align 4, !tbaa !5 + %add7 = fadd float %29, %30 + store float %add7, ptr %tempy, align 4, !tbaa !5 + %31 = tail call @llvm.aarch64.sve.fmul.nxv4f32( %1, %22, %24) + %32 = tail call float @llvm.aarch64.sve.faddv.nxv4f32( %1, %31) + %33 = load float, ptr %tempz, align 4, !tbaa !5 + %add8 = fadd float %32, %33 + store float %add8, ptr %tempz, align 4, !tbaa !5 + %conv10 = add i32 %0, %jp.019 + %cmp = icmp slt i32 %conv10, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !9 +} + +define dso_local void @test_scatter_constOffset(i32 noundef %loopTime, ptr noalias noundef %dst, ptr noalias nocapture noundef readonly %tempx, ptr noalias nocapture noundef readonly %tempy, ptr noalias nocapture noundef readonly %tempz) local_unnamed_addr #0 { +; CHECK-LABEL: test_scatter_constOffset: +; CHECK: // %bb.1: // %for.body.lr.ph +; CHECK: add x[[NEWBASE1:[0-9]+]], x1, #4 +; CHECK-NEXT: add x[[NEWBASE2:[0-9]+]], x1, #8 +; CHECK: .LBB1_2: // %for.body +; CHECK: st1w { {{z[0-9]+}}.s }, p[[PG:[0-9]+]], [x1, z[[INDEX:[0-9]+]].s, sxtw #2] +; CHECK-NEXT: st1w { {{z[0-9]+}}.s }, p[[PG]], [x[[NEWBASE1]], z[[INDEX]].s, sxtw #2] +; CHECK-NEXT: st1w { {{z[0-9]+}}.s }, p[[PG]], [x[[NEWBASE2]], z[[INDEX]].s, sxtw #2] +entry: + %cmp15 = icmp sgt i32 %loopTime, 0 + br i1 %cmp15, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %.tr = tail call i32 @llvm.vscale.i32() + %0 = shl nuw nsw i32 %.tr, 2 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %jp.016 = phi i32 [ 0, %for.body.lr.ph ], [ %conv5, %for.body ] + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.016, i32 %loopTime) + %2 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.016, i32 1) + %3 = select %1, %2, zeroinitializer + %4 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %3, shufflevector ( 
insertelement ( poison, i32 3, i64 0), poison, zeroinitializer)) + %5 = select %1, %4, zeroinitializer + %6 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %7 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer)) + %idx.ext = sext i32 %jp.016 to i64 + %add.ptr = getelementptr inbounds float, ptr %tempx, i64 %idx.ext + %8 = tail call @llvm.masked.load.nxv4f32.p0(ptr %add.ptr, i32 1, %1, zeroinitializer), !tbaa !5 + %add.ptr2 = getelementptr inbounds float, ptr %tempy, i64 %idx.ext + %9 = tail call @llvm.masked.load.nxv4f32.p0(ptr %add.ptr2, i32 1, %1, zeroinitializer), !tbaa !5 + %add.ptr4 = getelementptr inbounds float, ptr %tempz, i64 %idx.ext + %10 = tail call @llvm.masked.load.nxv4f32.p0(ptr %add.ptr4, i32 1, %1, zeroinitializer), !tbaa !5 + tail call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32( %8, %1, ptr %dst, %4) + tail call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32( %9, %1, ptr %dst, %6) + tail call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32( %10, %1, ptr %dst, %7) + %conv5 = add i32 %0, %jp.016 + %cmp = icmp slt i32 %conv5, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !11 +} + +define dso_local void @test_prefetch_constOffset(i32 noundef %loopTime, ptr nocapture noundef %data) local_unnamed_addr #4 { +; CHECK-LABEL: test_prefetch_constOffset: +; CHECK: // %bb.1: // %for.body.lr.ph +; CHECK: add x[[NEWBASE1:[0-9]+]], x1, #4 +; CHECK-NEXT: add x[[NEWBASE2:[0-9]+]], x1, #8 +; CHECK: // %bb.4: // %if.end +; CHECK: prfw pldl1keep, p[[PG:[0-9]+]], [x1, z[[INDEX:[0-9]+]].s, sxtw #2] +; CHECK-NEXT: prfw pldl1keep, p[[PG]], [x[[NEWBASE1]], z[[INDEX]].s, sxtw #2] +; CHECK-NEXT: prfw pldl1keep, p[[PG]], [x[[NEWBASE2]], z[[INDEX]].s, sxtw #2] +entry: + %.tr = tail call i32 @llvm.vscale.i32() + %conv = shl nuw nsw i32 %.tr, 2 + %cmp13 = icmp sgt i32 %loopTime, 0 + br i1 %cmp13, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %0 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + br label %for.body + +for.cond.cleanup: ; preds = %cleanup, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %cleanup + %jp.014 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %cleanup ] + %add = add i32 %jp.014, %conv + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %add, i32 %loopTime) + %2 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv4i1( %0, %1) + br i1 %2, label %if.end, label %cleanup + +if.end: ; preds = %for.body + %3 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %add, i32 1) + %4 = select %1, %3, zeroinitializer + %5 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %4, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer)) + tail call void @llvm.aarch64.sve.prfw.gather.sxtw.index.nxv4i32( %1, ptr %data, %5, i32 0) + %6 = select %1, %5, zeroinitializer + %7 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %6, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + tail call void @llvm.aarch64.sve.prfw.gather.sxtw.index.nxv4i32( %1, ptr %data, %7, i32 0) + %8 = select %1, %7, zeroinitializer + %9 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %8, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + tail call void @llvm.aarch64.sve.prfw.gather.sxtw.index.nxv4i32( %1, ptr %data, %9, i32 0) + br label %cleanup + +cleanup: ; preds = %for.body, %if.end + %cmp = 
icmp slt i32 %add, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !12 +} + +define dso_local void @test_stride_constOffset(i32 noundef %loopTime, ptr noundef %data, ptr nocapture noundef %result) local_unnamed_addr #0 { +; CHECK-LABEL: test_stride_constOffset: +; CHECK: // %bb.1: // %for.body.lr.ph +; CHECK: add x[[NEWBASE1:[0-9]+]], x1, #8 +; CHECK-NEXT: add x[[NEWBASE2:[0-9]+]], x1, #16 +; CHECK: .LBB3_2: // %for.body +; CHECK: ld1w { {{z[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x1, z[[INDEX:[0-9]+]].s, sxtw #2] +; CHECK-NEXT: ld1w { {{z[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE1]], z[[INDEX]].s, sxtw #2] +; CHECK-NEXT: ld1w { {{z[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE2]], z[[INDEX]].s, sxtw #2] +entry: + %cmp9 = icmp sgt i32 %loopTime, 0 + br i1 %cmp9, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %.tr = tail call i32 @llvm.vscale.i32() + %0 = shl nuw nsw i32 %.tr, 2 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %jp.010 = phi i32 [ 0, %for.body.lr.ph ], [ %conv1, %for.body ] + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.010, i32 %loopTime) + %2 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.010, i32 1) + %3 = select %1, %2, zeroinitializer + %4 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %3, shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer)) + %5 = select %1, %4, zeroinitializer + %6 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, zeroinitializer) + %7 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %data, %6) + %8 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer)) + %9 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %data, %8) + %10 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, shufflevector ( insertelement ( poison, i32 4, i64 0), poison, zeroinitializer)) + %11 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %data, %10) + %12 = select %1, %7, zeroinitializer + %13 = tail call @llvm.aarch64.sve.fadd.nxv4f32( %1, %12, %9) + %14 = select %1, %13, zeroinitializer + %15 = tail call @llvm.aarch64.sve.fadd.nxv4f32( %1, %14, %11) + %idx.ext = sext i32 %jp.010 to i64 + %add.ptr = getelementptr inbounds float, ptr %result, i64 %idx.ext + tail call void @llvm.masked.store.nxv4f32.p0( %15, ptr %add.ptr, i32 1, %1), !tbaa !5 + %conv1 = add i32 %0, %jp.010 + %cmp = icmp slt i32 %conv1, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !13 +} + +define dso_local void @test_invariantOffset32bit(i32 noundef %N, i32 noundef %M, ptr noundef %matrix, ptr nocapture noundef %result) local_unnamed_addr #0 { +; CHECK-LABEL: test_invariantOffset32bit: +; CHECK: // %bb.4: // %for.body4.lr.ph +; CHECK: add x[[NEWBASE1:[0-9]+]], x2, w[[NEWBASE1]], sxtw #2 +; CHECK: add x[[NEWBASE2:[0-9]+]], x2, w[[NEWBASE2]], sxtw #2 +; CHECK: add x[[NEWBASE3:[0-9]+]], x2, w[[NEWBASE3]], sxtw #2 +; CHECK: .LBB4_5: // %for.body4 +; CHECK: ld1w { {{z[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z[[INDEX:[0-9]+]].s, sxtw #2] +; CHECK-NEXT: ld1w { {{z[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE2]], z[[INDEX]].s, sxtw #2] +; CHECK-NEXT: ld1w { {{z[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE3]], z[[INDEX]].s, sxtw #2] +entry: + %cmp41 = icmp sgt i32 %N, 2 + br i1 %cmp41, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup + +for.cond1.preheader.lr.ph: ; preds = %entry + %div51 
= udiv i32 %N, 3 + %cmp239 = icmp sgt i32 %M, 0 + %0 = sext i32 %M to i64 + %wide.trip.count = zext i32 %div51 to i64 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.cond.cleanup3 + %indvars.iv = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next, %for.cond.cleanup3 ] + br i1 %cmp239, label %for.body4.lr.ph, label %for.cond.cleanup3 + +for.body4.lr.ph: ; preds = %for.cond1.preheader + %1 = mul nuw nsw i64 %indvars.iv, 3 + %2 = trunc i64 %1 to i32 + %3 = mul i32 %2, %M + %.splatinsert = insertelement poison, i32 %3, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %4 = trunc i64 %1 to i32 + %5 = add i32 %4, 1 + %6 = mul i32 %5, %M + %.splatinsert9 = insertelement poison, i32 %6, i64 0 + %.splat10 = shufflevector %.splatinsert9, poison, zeroinitializer + %7 = trunc i64 %1 to i32 + %8 = add i32 %7, 2 + %9 = mul i32 %8, %M + %.splatinsert14 = insertelement poison, i32 %9, i64 0 + %.splat15 = shufflevector %.splatinsert14, poison, zeroinitializer + %10 = mul nsw i64 %indvars.iv, %0 + %add.ptr = getelementptr inbounds float, ptr %result, i64 %10 + %.tr = tail call i32 @llvm.vscale.i32() + %11 = shl nuw nsw i32 %.tr, 2 + br label %for.body4 + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.cond.cleanup3: ; preds = %for.body4, %for.cond1.preheader + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !14 + +for.body4: ; preds = %for.body4.lr.ph, %for.body4 + %jp.040 = phi i32 [ 0, %for.body4.lr.ph ], [ %conv20, %for.body4 ] + %12 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.040, i32 %M) + %13 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.040, i32 1) + %14 = select %12, %13, zeroinitializer + %15 = tail call @llvm.aarch64.sve.add.nxv4i32( %12, %14, %.splat) + %16 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %12, ptr %matrix, %15) + %17 = tail call @llvm.aarch64.sve.add.nxv4i32( %12, %14, %.splat10) + %18 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %12, ptr %matrix, %17) + %19 = tail call @llvm.aarch64.sve.add.nxv4i32( %12, %14, %.splat15) + %20 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %12, ptr %matrix, %19) + %21 = select %12, %16, zeroinitializer + %22 = tail call @llvm.aarch64.sve.fadd.nxv4f32( %12, %21, %18) + %23 = select %12, %22, zeroinitializer + %24 = tail call @llvm.aarch64.sve.fadd.nxv4f32( %12, %23, %20) + %25 = select %12, %24, zeroinitializer + %26 = tail call @llvm.aarch64.sve.fdiv.nxv4f32( %12, %25, shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer)) + %idx.ext17 = sext i32 %jp.040 to i64 + %add.ptr18 = getelementptr inbounds float, ptr %add.ptr, i64 %idx.ext17 + tail call void @llvm.masked.store.nxv4f32.p0( %26, ptr %add.ptr18, i32 1, %12), !tbaa !5 + %conv20 = add i32 %11, %jp.040 + %cmp2 = icmp slt i32 %conv20, %M + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3, !llvm.loop !15 +} + +define dso_local void @test_invariantOffset64bit(i64 noundef %N, i64 noundef %M, ptr noundef %matrix, ptr nocapture noundef %result) local_unnamed_addr #0 { +; CHECK-LABEL: test_invariantOffset64bit: +; CHECK: // %bb.5: // %for.body4.lr.ph +; CHECK: add x[[NEWBASE1:[0-9]+]], x2, x[[NEWBASE1]], lsl #3 +; CHECK-NEXT: add x[[NEWBASE3:[0-9]+]], x2, x[[NEWBASE3]], lsl #3 +; CHECK-NEXT: add x[[NEWBASE2:[0-9]+]], x2, x[[NEWBASE2]], 
lsl #3 +; CHECK: .LBB5_6: // %for.body4 +; CHECK: ld1d { {{z[0-9]+}}.d }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z[[INDEX:[0-9]+]].d, lsl #3] +; CHECK-NEXT: ld1d { {{z[0-9]+}}.d }, p[[PG]]/z, [x[[NEWBASE2]], z[[INDEX]].d, lsl #3] +; CHECK-NEXT: ld1d { {{z[0-9]+}}.d }, p[[PG]]/z, [x[[NEWBASE3]], z[[INDEX]].d, lsl #3] +entry: + %cmp39.not = icmp ult i64 %N, 3 + br i1 %cmp39.not, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph + +for.cond1.preheader.lr.ph: ; preds = %entry + %div = udiv i64 %N, 3 + %cmp237.not = icmp eq i64 %M, 0 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.cond.cleanup3 + %i.040 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %inc, %for.cond.cleanup3 ] + br i1 %cmp237.not, label %for.cond.cleanup3, label %for.body4.lr.ph + +for.body4.lr.ph: ; preds = %for.cond1.preheader + %mul = mul nuw i64 %i.040, 3 + %mul5 = mul i64 %mul, %M + %.splatinsert = insertelement poison, i64 %mul5, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %add7 = add nuw i64 %mul, 1 + %mul8 = mul i64 %add7, %M + %.splatinsert9 = insertelement poison, i64 %mul8, i64 0 + %.splat10 = shufflevector %.splatinsert9, poison, zeroinitializer + %add12 = add nuw i64 %mul, 2 + %mul13 = mul i64 %add12, %M + %.splatinsert14 = insertelement poison, i64 %mul13, i64 0 + %.splat15 = shufflevector %.splatinsert14, poison, zeroinitializer + %mul16 = mul i64 %i.040, %M + %add.ptr = getelementptr inbounds double, ptr %result, i64 %mul16 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 1 + br label %for.body4 + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.cond.cleanup3: ; preds = %for.body4, %for.cond1.preheader + %inc = add nuw nsw i64 %i.040, 1 + %exitcond.not = icmp eq i64 %inc, %div + br i1 %exitcond.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !16 + +for.body4: ; preds = %for.body4.lr.ph, %for.body4 + %jp.038 = phi i64 [ 0, %for.body4.lr.ph ], [ %add18, %for.body4 ] + %2 = tail call @llvm.aarch64.sve.whilelo.nxv2i1.i64(i64 %jp.038, i64 %M) + %3 = tail call @llvm.aarch64.sve.index.nxv2i64(i64 %jp.038, i64 1) + %4 = select %2, %3, zeroinitializer + %5 = tail call @llvm.aarch64.sve.add.nxv2i64( %2, %4, %.splat) + %6 = tail call @llvm.aarch64.sve.ld1.gather.index.nxv2f64( %2, ptr %matrix, %5) + %7 = tail call @llvm.aarch64.sve.add.nxv2i64( %2, %4, %.splat10) + %8 = tail call @llvm.aarch64.sve.ld1.gather.index.nxv2f64( %2, ptr %matrix, %7) + %9 = tail call @llvm.aarch64.sve.add.nxv2i64( %2, %4, %.splat15) + %10 = tail call @llvm.aarch64.sve.ld1.gather.index.nxv2f64( %2, ptr %matrix, %9) + %11 = select %2, %6, zeroinitializer + %12 = tail call @llvm.aarch64.sve.fadd.nxv2f64( %2, %11, %8) + %13 = select %2, %12, zeroinitializer + %14 = tail call @llvm.aarch64.sve.fadd.nxv2f64( %2, %13, %10) + %15 = select %2, %14, zeroinitializer + %16 = tail call @llvm.aarch64.sve.fdiv.nxv2f64( %2, %15, shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer)) + %add.ptr17 = getelementptr inbounds double, ptr %add.ptr, i64 %jp.038 + tail call void @llvm.masked.store.nxv2f64.p0( %16, ptr %add.ptr17, i32 1, %2), !tbaa !17 + %add18 = add i64 %1, %jp.038 + %cmp2 = icmp ult i64 %add18, %M + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3, !llvm.loop !19 +} + +define dso_local void @test_svaddx_constOffset(ptr noundef %base, %index) local_unnamed_addr #0 { +; CHECK-LABEL: test_svaddx_constOffset: +; CHECK: // %bb.0: // %entry +; CHECK: add 
x[[NEWBASE1:[0-9]+]], x0, #40 +; CHECK: add x[[NEWBASE2:[0-9]+]], x0, #44 +; CHECK: .LBB6_1: // %for.body +; CHECK: ld1w { {{z[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z[[INDEX:[0-9]+]].s, uxtw #2] +; CHECK: ld1w { {{z[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE2]], z[[INDEX:[0-9]+]].s, uxtw #2] +entry: + %0 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.1, %for.body ] + %index.addr.05 = phi [ %index, %entry ], [ %8, %for.body ] + %1 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %index.addr.05, shufflevector ( insertelement ( poison, i32 10, i64 0), poison, zeroinitializer)) + %2 = tail call @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32( %0, ptr %base, %1) + %3 = shl nuw nsw i64 %indvars.iv, 4 + %add.ptr = getelementptr inbounds i32, ptr %base, i64 %3 + store %2, ptr %add.ptr, align 16, !tbaa !20 + %4 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %index.addr.05, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %5 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %4, shufflevector ( insertelement ( poison, i32 10, i64 0), poison, zeroinitializer)) + %6 = tail call @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32( %0, ptr %base, %5) + %indvars.iv.next = shl i64 %indvars.iv, 4 + %7 = or i64 %indvars.iv.next, 16 + %add.ptr.1 = getelementptr inbounds i32, ptr %base, i64 %7 + store %6, ptr %add.ptr.1, align 16, !tbaa !20 + %8 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %4, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2 + %exitcond.not.1 = icmp eq i64 %indvars.iv.next.1, 100 + br i1 %exitcond.not.1, label %for.cond.cleanup, label %for.body, !llvm.loop !22 +} + +define dso_local void @_Z26test_loop_invariant_offsetPlu11__SVInt64_tl(ptr noundef %base, %index, i64 noundef %invariant_offset) local_unnamed_addr #6 { + +entry: + %0 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %.splatinsert = insertelement poison, i64 %invariant_offset, i64 0 + %1 = shufflevector %.splatinsert, poison, zeroinitializer + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %index.addr.05 = phi [ %index, %entry ], [ %4, %for.body ] + %2 = tail call @llvm.aarch64.sve.add.u.nxv2i64( %0, %index.addr.05, %1) + %.splatinsert3 = insertelement poison, i64 %indvars.iv, i64 0 + %3 = shufflevector %.splatinsert3, poison, zeroinitializer + tail call void @llvm.aarch64.sve.st1.scatter.index.nxv2i64( %3, %0, ptr %base, %2) + %4 = tail call @llvm.aarch64.sve.add.u.nxv2i64( %0, %index.addr.05, shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !23 +} + +define dso_local void @test_combined_const_and_invariant_offset(ptr noundef %base, %index, i32 noundef %invariant_offset) local_unnamed_addr #0 { +; CHECK-LABEL: test_combined_const_and_invariant_offset: +; CHECK: // %bb.0: // %entry +; CHECK: add x[[NEWBASE:[0-9]+]], x0, w1, sxtw #2 +; CHECK: add x[[NEWBASE]], x[[NEWBASE]], #40 +; CHECK: .LBB8_1: // %for.body +; CHECK: ld1w { {{z[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE]], 
z[[INDEX:[0-9]+]].s, sxtw #2] +entry: + %0 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %.splatinsert = insertelement poison, i32 %invariant_offset, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %index.addr.06 = phi [ %index, %entry ], [ %7, %for.body ] + %1 = select %0, %index.addr.06, zeroinitializer + %2 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %1, shufflevector ( insertelement ( poison, i32 10, i64 0), poison, zeroinitializer)) + %3 = select %0, %2, zeroinitializer + %4 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %3, %.splat) + %5 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32( %0, ptr %base, %4) + %6 = shl nuw nsw i64 %indvars.iv, 4 + %add.ptr = getelementptr inbounds i32, ptr %base, i64 %6 + store %5, ptr %add.ptr, align 16, !tbaa !20 + %7 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %index.addr.06, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !24 +} + +declare @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32, i32) #1 +declare @llvm.aarch64.sve.index.nxv4i32(i32, i32) #1 +declare @llvm.aarch64.sve.mul.nxv4i32(, , ) #1 +declare @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32(, ptr, ) #2 +declare @llvm.aarch64.sve.fsubr.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.fmul.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.add.nxv4i32(, , ) #1 +declare @llvm.aarch64.sve.fsub.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.fmad.nxv4f32(, , , ) #1 +declare @llvm.aarch64.sve.fsqrt.nxv4f32(, , ) #1 +declare float @llvm.aarch64.sve.faddv.nxv4f32(, ) #1 +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32(, , ptr, ) #3 +declare @llvm.aarch64.sve.ptrue.nxv4i1(i32 immarg) #1 +declare void @llvm.aarch64.sve.prfw.gather.sxtw.index.nxv4i32(, ptr nocapture, , i32 immarg) #5 +declare @llvm.aarch64.sve.fadd.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.fdiv.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.whilelo.nxv2i1.i64(i64, i64) #1 +declare @llvm.aarch64.sve.index.nxv2i64(i64, i64) #1 +declare @llvm.aarch64.sve.add.nxv2i64(, , ) #1 +declare @llvm.aarch64.sve.ld1.gather.index.nxv2f64(, ptr, ) #2 +declare @llvm.aarch64.sve.fadd.nxv2f64(, , ) #1 +declare @llvm.aarch64.sve.fdiv.nxv2f64(, , ) #1 +declare @llvm.aarch64.sve.add.u.nxv4i32(, , ) #1 +declare @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32(, ptr, ) #2 +declare @llvm.aarch64.sve.ptrue.nxv2i1(i32 immarg) #1 +declare @llvm.aarch64.sve.add.u.nxv2i64(, , ) #1 +declare void @llvm.aarch64.sve.st1.scatter.index.nxv2i64(, , ptr, ) #3 +declare @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32(, ptr, ) #2 +declare i64 @llvm.vscale.i64() #7 +declare i32 @llvm.vscale.i32() #7 +declare @llvm.masked.load.nxv4f32.p0(ptr nocapture, i32 immarg, , ) #8 +declare i1 @llvm.aarch64.sve.ptest.any.nxv4i1(, ) #7 +declare void @llvm.masked.store.nxv4f32.p0(, ptr nocapture, i32 immarg, ) #9 +declare void @llvm.masked.store.nxv2f64.p0(, ptr nocapture, i32 immarg, ) #9 + +attributes #0 = { mustprogress nofree nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hip09" 
"target-features"="+aes,+bf16,+crc,+dotprod,+f32mm,+f64mm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+sve,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,-fmv" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) } +attributes #4 = { mustprogress nofree nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hip09" "target-features"="+aes,+bf16,+crc,+dotprod,+f32mm,+f64mm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+sve,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,-fmv" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) } +attributes #6 = { mustprogress nofree nosync nounwind memory(argmem: write) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hip09" "target-features"="+aes,+bf16,+crc,+dotprod,+f32mm,+f64mm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+sve,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,-fmv" } +attributes #7 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #8 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #9 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2, !3, !4} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 8, !"PIC Level", i32 2} +!2 = !{i32 7, !"PIE Level", i32 2} +!3 = !{i32 7, !"uwtable", i32 2} +!4 = !{i32 7, !"frame-pointer", i32 1} +!5 = !{!6, !6, i64 0} +!6 = !{!"float", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C++ TBAA"} +!9 = distinct !{!9, !10} +!10 = !{!"llvm.loop.mustprogress"} +!11 = distinct !{!11, !10} +!12 = distinct !{!12, !10} +!13 = distinct !{!13, !10} +!14 = distinct !{!14, !10} +!15 = distinct !{!15, !10} +!16 = distinct !{!16, !10} +!17 = !{!18, !18, i64 0} +!18 = !{!"double", !7, i64 0} +!19 = distinct !{!19, !10} +!20 = !{!21, !21, i64 0} +!21 = !{!"int", !7, i64 0} +!22 = distinct !{!22, !10} +!23 = distinct !{!23, !10} +!24 = distinct !{!24, !10}