diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 87aa3b98d93826560a09a4d9e6e1355e7a4e173a..66d2b018dfd742f7d019d1a369ee29b86252c794 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -65,7 +65,9 @@ #include "AArch64ExpandImm.h" #include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineLoopInfo.h" @@ -73,6 +75,867 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-mi-peephole-opt" +static cl::opt EnableSVEPeephole( + "aarch64-sve-peephole", cl::init(true), cl::Hidden, + cl::desc("Enable SVE gather/scatter peephole optimizations")); + +//===----------------------------------------------------------------------===// +// SVELoopAddressHoisting +//===----------------------------------------------------------------------===// +// +// This SVELoopAddressHoisting class optimizes SVE gather/scatter addressing +// modes (induction variable strength reduction). It transforms loop-invariant +// index increment patterns (addr = index + Const/LoopInvariant) into +// precomputed base addresses + fixed indices: +// +// Transformation patterns: +// 1) Original: address = base + index + Const +// Optimized: address = base_Const + index +// +// 2) Original: address = base + index + LoopInvariant +// Optimized: address = base_LoopInvariant + index +// +// C code example: +// - Hoists loop-variant address components to loop preheader +// - Replaces dynamic index calculations with static base offsets +// +// // Before optimization +// char *basePtr; +// for (int i = 0; ...; i += VSCALE) { +// svint32_t index = svindex_s32(i, 1); +// index = svadd_z(pg, index, LoopInvariantOffset); +// svld1_gather_index(pgNew, basePtr, index); +// } +// +// // After optimization +// char *hoistedBasePtr = basePtr + LoopInvariantScalar * ElementSizeInBytes; // Precomputed outside +// for (int i = 0; ...; i += VSCALE) { +// svint32_t index = svindex_s32(i, 1); +// svld1_gather_index(pgNew, hoistedBasePtr, index); // Fixed index in loop +// } +// + +namespace { +class SVELoopAddressHoisting { + MachineRegisterInfo *MRI; + const TargetInstrInfo *TII; + +public: + bool runOnMachineFunction(MachineFunction &MF, MachineLoopInfo *MLI); + +private: + using InstAndOffset = std::pair; + using ChainKey = std::tuple; + // Key: {BaseReg, RootIndexReg, InvariantGPROffset} + // Value: Vector of {Instruction, ElementOffset} pairs + using ChainMap = DenseMap>; + + // Define an enum for the SVE offset type. 
+ enum class SVEOffsetType { + NOT_APPLICABLE, // Not a recognized gather/scatter instruction + SXTW, + UXTW, + D64 + }; + + bool isLoopInvariant(Register Reg, MachineLoop *L) const; + bool isConstantVector(Register Reg, int64_t &Value) const; + bool isInvariantBroadcastGPR(Register VecReg, MachineLoop *L, + Register &GPR) const; + unsigned getElementSizeInBytes(const MachineInstr &MI, + SVEOffsetType *OffsetKind) const; + void traceIndexChain(Register IndexReg, Register &RootIndex, + int64_t &AccumulatedOffset, Register &InvariantGPROffset, + MachineLoop *L, + SmallVectorImpl &ChainsInsts) const; + void collectOptimizationCandidates( + MachineLoop *L, ChainMap &Chains, + SetVector &CandidateDeadInsts) const; + bool hoistInvariantsAndRewrite(MachineLoop *L, const ChainMap &Chains); + bool cleanupDeadCode(SetVector &CandidateDeadInsts); + bool sveMulStrengthReduction(MachineLoop *L); + bool processLoop(MachineLoop *L); +}; +} // end anonymous namespace + +// Check if Reg is a loop invariant to Loop L +bool SVELoopAddressHoisting::isLoopInvariant(Register Reg, + MachineLoop *L) const { + if (!Reg.isVirtual()) + return false; + MachineInstr *Def = MRI->getVRegDef(Reg); + if (!Def) + return true; + return !L->contains(Def->getParent()); +} + +// Check if a vector register represents a constant value +// and retrieve that constant value if it exists +bool SVELoopAddressHoisting::isConstantVector(Register Reg, + int64_t &Value) const { + if (!Reg.isVirtual()) + return false; + + MachineInstr *Def = MRI->getVRegDef(Reg); + if (!Def) + return false; + + // Match the DUP instruction pattern: %Def = DUP_ZI_S Imm, 0 + // This instruction broadcasts the immediate value to all vector elements + unsigned DupOp = Def->getOpcode(); + if (DupOp == AArch64::DUP_ZI_S || DupOp == AArch64::DUP_ZI_D) { + Value = Def->getOperand(1).getImm(); + return true; + } + return false; +} + +// Checks if a vector register is broadcasted from a loop-invariant GPR +// Matches instruction pattern: %VecReg = DUP_ZR_S %GPR +// Where %GPR is loop-invariant to loop L +bool SVELoopAddressHoisting::isInvariantBroadcastGPR(Register VecReg, + MachineLoop *L, + Register &GPR) const { + if (!VecReg.isVirtual()) + return false; + + MachineInstr *Def = MRI->getVRegDef(VecReg); + if (!Def) + return false; + + unsigned DupOp = Def->getOpcode(); + if (DupOp == AArch64::DUP_ZR_S || DupOp == AArch64::DUP_ZR_D) { + Register SrcGPR = Def->getOperand(1).getReg(); + if (isLoopInvariant(SrcGPR, L)) { + GPR = SrcGPR; + return true; + } + } + return false; +} + +// Returns element size in bytes for gather/scatter instructions +// Returns 0 for non-gather/scatter instructions +unsigned +SVELoopAddressHoisting::getElementSizeInBytes(const MachineInstr &MI, + SVEOffsetType *OffsetKind) const { + switch (MI.getOpcode()) { + // --- Element Size: 2 Bytes (Half-Word) --- + case AArch64::GLD1H_D_SCALED: + case AArch64::GLD1SH_D_SCALED: + case AArch64::GLDFF1H_D_SCALED: + case AArch64::GLDFF1SH_D_SCALED: + case AArch64::LDNT1H_ZZR_D_REAL: + case AArch64::LDNT1SH_ZZR_D_REAL: + case AArch64::SST1H_D_SCALED: + case AArch64::STNT1H_ZZR_D_REAL: + *OffsetKind = SVEOffsetType::D64; + return 2; + case AArch64::GLD1H_S_SXTW_SCALED: + case AArch64::GLD1SH_S_SXTW_SCALED: + case AArch64::GLDFF1H_S_SXTW_SCALED: + case AArch64::GLDFF1SH_S_SXTW_SCALED: + case AArch64::SST1H_S_SXTW_SCALED: + *OffsetKind = SVEOffsetType::SXTW; + return 2; + case AArch64::GLD1H_S_UXTW_SCALED: + case AArch64::GLD1SH_S_UXTW_SCALED: + case AArch64::GLDFF1H_S_UXTW_SCALED: + case 
AArch64::GLDFF1SH_S_UXTW_SCALED: + case AArch64::SST1H_S_UXTW_SCALED: + *OffsetKind = SVEOffsetType::UXTW; + return 2; + + // --- Element Size: 4 Bytes (Word) --- + case AArch64::GLD1SW_D_SCALED: + case AArch64::GLD1W_D_SCALED: + case AArch64::GLDFF1SW_D_SCALED: + case AArch64::GLDFF1W_D_SCALED: + case AArch64::LDNT1SW_ZZR_D_REAL: + case AArch64::LDNT1W_ZZR_D_REAL: + case AArch64::SST1W_D_SCALED: + case AArch64::STNT1W_ZZR_D_REAL: + *OffsetKind = SVEOffsetType::D64; + return 4; + case AArch64::GLD1W_SXTW_SCALED: + case AArch64::GLDFF1W_SXTW_SCALED: + case AArch64::PRFW_S_SXTW_SCALED: + case AArch64::SST1W_SXTW_SCALED: + *OffsetKind = SVEOffsetType::SXTW; + return 4; + case AArch64::GLD1W_UXTW_SCALED: + case AArch64::GLDFF1W_UXTW_SCALED: + case AArch64::PRFW_S_UXTW_SCALED: + case AArch64::SST1W_UXTW_SCALED: + *OffsetKind = SVEOffsetType::UXTW; + return 4; + + // --- Element Size: 8 Bytes (Double-Word) --- + case AArch64::GLD1D_SCALED: + case AArch64::GLDFF1D_SCALED: + case AArch64::LDNT1D_ZZR_D_REAL: + case AArch64::PRFW_D_SCALED: + case AArch64::SST1D_SCALED: + case AArch64::STNT1D_ZZR_D_REAL: + *OffsetKind = SVEOffsetType::D64; + return 8; + case AArch64::GLD1D_SXTW_SCALED: + case AArch64::SST1D_SXTW_SCALED: + *OffsetKind = SVEOffsetType::SXTW; + return 8; + case AArch64::GLD1D_UXTW_SCALED: + case AArch64::SST1D_UXTW_SCALED: + *OffsetKind = SVEOffsetType::UXTW; + return 8; + default: + StringRef InstName = TII->getName(MI.getOpcode()); + if (InstName.startswith("GLD") || InstName.startswith("SST") || + InstName.startswith("LDNT") || InstName.startswith("STNT") || + InstName.startswith("PRFW")) { + LLVM_DEBUG(dbgs() << "SVELoopAddressHoisting: Unhandled SVE gather/scatter-like instruction found: " + << MI); + } + + *OffsetKind = SVEOffsetType::NOT_APPLICABLE; + return 0; + } +} + +// Traces index chain to discover: +// - Root index register +// - Accumulated constant offset +// - Loop-invariant GPR offset component +// - And collects the chain instructions for potential deletion +void SVELoopAddressHoisting::traceIndexChain( + Register IndexReg, Register &RootIndex, int64_t &AccumulatedOffset, + Register &InvariantGPROffset, MachineLoop *L, + SmallVectorImpl &ChainInsts) const { + AccumulatedOffset = 0; + InvariantGPROffset = Register(0); + Register CurrentReg = IndexReg; + + while (true) { + if (!CurrentReg.isVirtual()) + break; + + MachineInstr *Def = MRI->getVRegDef(CurrentReg); + // Index must be defined within loop as induction variable + if (!Def || !L->contains(Def->getParent())) + break; + + // Match svadd index increment pattern: + // %index = ADD_ZI_[S/D] %prev_index, %offset, %pg + // %index = ADD_ZZZ_D %prev_index, %offset + // %index = ADD_ZPZZ_[S/D]_ZERO %pg, %prev_index, %offset + unsigned IndexOp = Def->getOpcode(); + if (IndexOp == AArch64::ADD_ZI_S || IndexOp == AArch64::ADD_ZI_D) { + int64_t ConstValue = Def->getOperand(2).getImm(); + AccumulatedOffset += ConstValue; + CurrentReg = Def->getOperand(1).getReg(); + ChainInsts.push_back(Def); + continue; + } + + Register Op1, Op2; + if (IndexOp == AArch64::ADD_ZZZ_S || IndexOp == AArch64::ADD_ZZZ_D) { + Op1 = Def->getOperand(1).getReg(); + Op2 = Def->getOperand(2).getReg(); + } else if (IndexOp == AArch64::ADD_ZPZZ_S_ZERO || + IndexOp == AArch64::ADD_ZPZZ_D_ZERO || + IndexOp == AArch64::ADD_ZPmZ_S || + IndexOp == AArch64::ADD_ZPmZ_D) { + Op1 = Def->getOperand(2).getReg(); + Op2 = Def->getOperand(3).getReg(); + } else { + break; + } + + int64_t ConstValue; + Register InvariantGPR; + + // Op2 case 1: Constant vector offset + 
if (isConstantVector(Op2, ConstValue)) { + AccumulatedOffset += ConstValue; + CurrentReg = Op1; + ChainInsts.push_back(MRI->getVRegDef(Op2)); + ChainInsts.push_back(Def); + continue; + } + + // Op2 case 2: Loop-invariant GPR broadcast offset + if (InvariantGPROffset == 0 && + isInvariantBroadcastGPR(Op2, L, InvariantGPR)) { + InvariantGPROffset = InvariantGPR; + CurrentReg = Op1; + ChainInsts.push_back(MRI->getVRegDef(Op2)); + ChainInsts.push_back(Def); + continue; + } + + // Op1 case 1: Constant vector offset + if (isConstantVector(Op1, ConstValue)) { + AccumulatedOffset += ConstValue; + CurrentReg = Op2; + ChainInsts.push_back(MRI->getVRegDef(Op1)); + ChainInsts.push_back(Def); + continue; + } + + // Op1 case 2: Loop-invariant GPR broadcast offset + if (InvariantGPROffset == 0 && + isInvariantBroadcastGPR(Op1, L, InvariantGPR)) { + InvariantGPROffset = InvariantGPR; + CurrentReg = Op2; + ChainInsts.push_back(MRI->getVRegDef(Op1)); + ChainInsts.push_back(Def); + continue; + } + break; + } + + RootIndex = CurrentReg; +} + +// Collects all optimizable gather/scatter instructions +// and groups them into chains. +void SVELoopAddressHoisting::collectOptimizationCandidates( + MachineLoop *L, ChainMap &Chains, + SetVector &CandidateDeadInsts) const { + for (MachineBasicBlock *MBB : L->getBlocks()) { + for (MachineInstr &MI : *MBB) { + SVEOffsetType OffsetType; + unsigned ElementSize = getElementSizeInBytes(MI, &OffsetType); + if (ElementSize == 0) + continue; + + // Verify instruction format: + // Gather: DstZPR, PredicatePPR, BaseGPR, IndexZPR + // Scatter: SrcZPR, PredicatePPR, BaseGPR, IndexZPR + if (MI.getNumOperands() < 4) + continue; + + Register BaseReg = MI.getOperand(2).getReg(); + Register IndexReg = MI.getOperand(3).getReg(); + // Only optimize loop-invariant base addresses + if (!isLoopInvariant(BaseReg, L)) + continue; + + Register RootIndex, InvariantGPROffset; + int64_t ElemOffset; + SmallVector TmpChainInsts; // Store chain for this MI + + // Trace index computation chain + traceIndexChain(IndexReg, RootIndex, ElemOffset, InvariantGPROffset, L, + TmpChainInsts); + + // If the chain is empty, there's nothing to optimize or delete. + if (TmpChainInsts.empty() && InvariantGPROffset == 0 && ElemOffset == 0) + continue; + + LLVM_DEBUG(dbgs() << "Found candidate instruction: "; MI.dump(); + dbgs() << " BaseReg: " << printReg(BaseReg) + << ", IndexReg: " << printReg(IndexReg) + << " -> RootIndex: " << printReg(RootIndex) + << ", ElemOffset: " << ElemOffset + << ", InvariantGPROffset: " + << printReg(InvariantGPROffset) << "\n"); + + Chains[{BaseReg, RootIndex, InvariantGPROffset}].push_back( + {&MI, ElemOffset}); + + // Add the identified chain instructions to the master set of candidates. 
+ CandidateDeadInsts.insert(TmpChainInsts.begin(), TmpChainInsts.end()); + } + } +} + +bool SVELoopAddressHoisting::hoistInvariantsAndRewrite(MachineLoop *L, + const ChainMap &Chains) { + bool Changed = false; + MachineBasicBlock *Preheader = L->getLoopPreheader(); + + for (auto &ChainInfo : Chains) { + auto &Addressings = ChainInfo.second; + // Skip chains without optimizable offsets + if (Addressings.size() < 2 && std::get<2>(ChainInfo.first) == 0 && + Addressings[0].second == 0) + continue; + + Register BaseReg = std::get<0>(ChainInfo.first); + Register RootIndex = std::get<1>(ChainInfo.first); + Register InvariantGPROffset = std::get<2>(ChainInfo.first); + + LLVM_DEBUG(dbgs() << "Optimizing chain with BaseReg: " << printReg(BaseReg) + << ", RootIndex: " << printReg(RootIndex) + << ", InvariantGPROffset: " + << printReg(InvariantGPROffset) << "\n"); + + auto InsertPt = Preheader->getFirstTerminator(); + DebugLoc DL = Addressings[0].first->getDebugLoc(); + + // Handle loop-invariant GPR offset first + Register CurrentBaseReg = BaseReg; + if (InvariantGPROffset != 0) { + Register HoistedBaseReg = + MRI->createVirtualRegister(&AArch64::GPR64RegClass); + SVEOffsetType OffsetType; + unsigned ElementSize = + getElementSizeInBytes(*Addressings[0].first, &OffsetType); + unsigned ShiftAmt = Log2_64(ElementSize); + + const TargetRegisterClass *RC = MRI->getRegClass(InvariantGPROffset); + // Skip if offset is not a GPR register, cannot be used as address offset + if (!AArch64::GPR32RegClass.hasSubClassEq(RC) && + !AArch64::GPR64RegClass.hasSubClassEq(RC)) { + LLVM_DEBUG(dbgs() << " Skipping due to non-GPR offset register: " + << printReg(InvariantGPROffset) << "\n"); + continue; + } + + unsigned AddOp, ShiftExtender; + if (AArch64::GPR32RegClass.hasSubClassEq(RC)) { + if (OffsetType == SVEOffsetType::SXTW) + ShiftExtender = + AArch64_AM::getArithExtendImm(AArch64_AM::SXTW, ShiftAmt); + else + ShiftExtender = + AArch64_AM::getArithExtendImm(AArch64_AM::UXTW, ShiftAmt); + AddOp = AArch64::ADDXrx; + } else { + ShiftExtender = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); + AddOp = AArch64::ADDXrs; + } + + // HoistedBase = Base + (InvariantGPROffset << log2(ElementSize)) + BuildMI(*Preheader, InsertPt, DL, TII->get(AddOp), HoistedBaseReg) + .addReg(BaseReg) + .addReg(InvariantGPROffset) + .addImm(ShiftExtender); + CurrentBaseReg = HoistedBaseReg; + LLVM_DEBUG(dbgs() << " Hoisted and scaled Invariant GPR Offset" + << " into new base " << printReg(CurrentBaseReg) + << "\n"); + } + + // Map constant element offsets to newly created base registers to avoid + // redundant ADD instructions in the preheader. 
+ DenseMap OffsetToNewBaseMap; + + for (auto &AddressInfo : Addressings) { + MachineInstr *MI = AddressInfo.first; + int64_t ElemOffset = AddressInfo.second; + + // Case 1: Zero offset - use current base directly + if (ElemOffset == 0) { + MI->getOperand(2).setReg(CurrentBaseReg); + MI->getOperand(3).setReg(RootIndex); + Changed = true; + continue; + } + + // Case 2: Non-zero offset - create or reuse offset base + Register NewBaseReg; + if (OffsetToNewBaseMap.count(ElemOffset)) { + NewBaseReg = OffsetToNewBaseMap[ElemOffset]; + } else { + // Create new base: NewBase = BaseReg + (ElemOffset * ElementSize) + SVEOffsetType OffsetType; + unsigned ElementSize = getElementSizeInBytes(*MI, &OffsetType); + int64_t ByteOffset = ElemOffset * ElementSize; + + NewBaseReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + DebugLoc DL = MI->getDebugLoc(); + BuildMI(*Preheader, InsertPt, DL, TII->get(AArch64::ADDXri), NewBaseReg) + .addReg(CurrentBaseReg) + .addImm(ByteOffset) + .addImm(0); + OffsetToNewBaseMap[ElemOffset] = NewBaseReg; + LLVM_DEBUG(dbgs() << " Hoisted new base for ElemOffset " << ElemOffset + << " (ByteOffset " << ByteOffset << ") into " + << printReg(NewBaseReg) << "\n"); + } + + // Rewrite Gather/Scatter MIR + // Original: GLD*/SST* ..., [BaseReg], [IndexReg] + // Optimized: GLD*/SST* ..., [NewBaseReg], [RootIndex] + MI->getOperand(2).setReg(NewBaseReg); + MI->getOperand(3).setReg(RootIndex); + + LLVM_DEBUG(dbgs() << " Rewrote instruction: "; MI->dump()); + Changed = true; + } + } + return Changed; +} + +bool SVELoopAddressHoisting::cleanupDeadCode( + SetVector &CandidateDeadInsts) { + if (CandidateDeadInsts.empty()) + return false; + + bool Changed = false; + LLVM_DEBUG(dbgs() << "--- Cleaning up dead instructions ---\n"); + for (MachineInstr *MI : llvm::reverse(CandidateDeadInsts)) { + bool IsDead = true; + for (const MachineOperand &MO : MI->operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg().isVirtual()) { + if (!MRI->use_empty(MO.getReg())) { + IsDead = false; + break; + } + } + } + + if (!IsDead) + continue; + + LLVM_DEBUG(dbgs() << "Deleting dead instruction: "; MI->dump()); + MI->eraseFromParent(); + Changed = true; + } + return Changed; +} + +// This optimization performs strength reduction for gather/scatter index (SVE +// MUL instruction) in a loop. It identifies a specific pattern of a multiply +// operation on an induction variable within a loop and replaces it with a more +// efficient addition-based approach. 
The targeted pattern is essentially:
+//   for (int jp = jp_init; jp < loopTime; jp += svcnt[w/d]()) {
+//     jpsv = svindex_[](jp, IndexStep)
+//     result = svmul_z(pgNew, jpsv, Multiplier)
+//   }
+//
+// This is transformed into:
+//   new_offset = svindex_[](0, IndexStep * Multiplier)   // hoisted to preheader
+//   new_step   = IVStep * Multiplier                     // IVStep is usually svcnt[w/d]()
+//   for (int jp = jp_init, base = jp_init * Multiplier; jp < loopTime;
+//        jp += svcnt[w/d](), base += new_step) {
+//     result = svadd_z(pgNew, new_offset, base)
+//   }
+bool SVELoopAddressHoisting::sveMulStrengthReduction(MachineLoop *L) {
+  MachineBasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader)
+    return false;
+
+  MachineBasicBlock *Header = L->getHeader();
+  MachineBasicBlock *Latch = L->getLoopLatch();
+  if (!Header || !Latch)
+    return false;
+
+  for (MachineBasicBlock *MBB : L->getBlocks()) {
+    for (MachineInstr &MI : *MBB) {
+      // --- Start of the SVE MUL_Z Pattern Match ---
+      if (MI.getOpcode() != AArch64::MUL_ZPmZ_S &&
+          MI.getOpcode() != AArch64::MUL_ZPmZ_D) {
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "Found candidate MUL: "; MI.dump());
+
+      // Set up the specific AArch64 opcodes based on whether we have a 32-bit
+      // or 64-bit operation.
+      bool is64Bit = (MI.getOpcode() == AArch64::MUL_ZPmZ_D);
+      unsigned SelOpc = is64Bit ? AArch64::SEL_ZPZZ_D : AArch64::SEL_ZPZZ_S;
+      unsigned IndexRiOpc = is64Bit ? AArch64::INDEX_RI_D : AArch64::INDEX_RI_S;
+      unsigned AddZzzOpc = is64Bit ? AArch64::ADD_ZZZ_D : AArch64::ADD_ZZZ_S;
+      unsigned DupZrOpc = is64Bit ? AArch64::DUP_ZR_D : AArch64::DUP_ZR_S;
+      unsigned IndexIiOpc = is64Bit ? AArch64::INDEX_II_D : AArch64::INDEX_II_S;
+      unsigned DupZiOpc = is64Bit ? AArch64::DUP_ZI_D : AArch64::DUP_ZI_S;
+      unsigned AddGprOpc = is64Bit ? AArch64::ADDXrr : AArch64::ADDWrr;
+      unsigned CntOpc = is64Bit ? AArch64::CNTD_XPiI : AArch64::CNTW_XPiI;
+      unsigned MaddGprOpc = is64Bit ? AArch64::MADDXrrr : AArch64::MADDWrrr;
+      unsigned MovImmOpc = is64Bit ? AArch64::MOVi64imm : AArch64::MOVi32imm;
+      unsigned ZeroReg = is64Bit ? AArch64::XZR : AArch64::WZR;
+
+      const TargetRegisterClass *GprRegClass =
+          is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+      const TargetRegisterClass *GprAllRegClass =
+          is64Bit ? &AArch64::GPR64allRegClass : &AArch64::GPR32allRegClass;
+      const TargetRegisterClass *ZprRegClass = &AArch64::ZPRRegClass;
+
+      // Deconstruct the multiply instruction to see if it matches our target
+      // pattern. The matched pattern is: MUL(SEL(Pred, INDEX(IV, IdxStep),
+      // Zero), DUP(Multiplier))
+      MachineInstr *SelMI = MRI->getVRegDef(MI.getOperand(2).getReg());
+      if (!SelMI || (SelMI->getOpcode() != SelOpc))
+        continue;
+
+      // The second operand of the select should be an index operation.
+ MachineInstr *IndexMI = MRI->getVRegDef(SelMI->getOperand(2).getReg()); + if (!IndexMI) + continue; + + Register IVReg; + int64_t IndexStep; + // Detect the two index generated ways + if (IndexMI->getOpcode() == IndexRiOpc) { + // Case 1: INDEX_RI (reg, imm) + IVReg = IndexMI->getOperand(1).getReg(); + if (!IVReg.isVirtual()) + continue; + IndexStep = IndexMI->getOperand(2).getImm(); + } else if (IndexMI->getOpcode() == AddZzzOpc) { + // Case 2: ADD(INDEX_II(0, imm), DUP(reg)) + MachineInstr *Op1 = MRI->getVRegDef(IndexMI->getOperand(1).getReg()); + MachineInstr *Op2 = MRI->getVRegDef(IndexMI->getOperand(2).getReg()); + if (!Op1 || !Op2) + continue; + + auto matchIndexAddPattern = [&](MachineInstr *A, MachineInstr *B) { + return (A->getOpcode() == IndexIiOpc && B->getOpcode() == DupZrOpc && + A->getOperand(1).getImm() == 0); + }; + + if (matchIndexAddPattern(Op1, Op2)) { + IndexStep = Op1->getOperand(2).getImm(); + IVReg = Op2->getOperand(1).getReg(); + } else if (matchIndexAddPattern(Op2, Op1)) { + IndexStep = Op2->getOperand(2).getImm(); + IVReg = Op1->getOperand(1).getReg(); + } else { + continue; + } + } else { + continue; + } + + // The third operand of the multiply should be a duplicated immediate + // value. + MachineInstr *MultiplierMI = MRI->getVRegDef(MI.getOperand(3).getReg()); + if (!MultiplierMI || !isLoopInvariant(MI.getOperand(3).getReg(), L) || + (MultiplierMI->getOpcode() != DupZiOpc)) + continue; + int64_t MultiplierVal = MultiplierMI->getOperand(1).getImm(); + + // Check if the identified register is a basic loop induction variable. + MachineInstr *IVPhi = MRI->getVRegDef(IVReg); + if (!IVPhi || !IVPhi->isPHI() || IVPhi->getParent() != Header) + continue; + + // Find the instruction that updates the induction variable (usually an + // ADD in the latch). + Register IVInitReg = Register(0), IVNextReg = Register(0); + for (unsigned i = 1; i < IVPhi->getNumOperands(); i += 2) { + if (IVPhi->getOperand(i + 1).getMBB() == Preheader) { + IVInitReg = IVPhi->getOperand(i).getReg(); + break; + } + } + for (unsigned i = 1; i < IVPhi->getNumOperands(); i += 2) { + if (IVPhi->getOperand(i + 1).getMBB() == Latch) { + IVNextReg = IVPhi->getOperand(i).getReg(); + break; + } + } + if (!IVInitReg || !IVNextReg) + continue; + + // Get the definition of the next value of the induction variable. + MachineInstr *IVUpdateMI = MRI->getVRegDef(IVNextReg); + if (!IVUpdateMI) + continue; + if (IVUpdateMI->getOpcode() == AArch64::COPY) + IVUpdateMI = MRI->getVRegDef(IVUpdateMI->getOperand(1).getReg()); + if (IVUpdateMI->getOpcode() != AddGprOpc) + continue; + + // Determine the step of the induction variable. 
+ Register IVStepReg; + if (IVUpdateMI->getOperand(1).getReg() == IVReg) + IVStepReg = IVUpdateMI->getOperand(2).getReg(); + else if (IVUpdateMI->getOperand(2).getReg() == IVReg) + IVStepReg = IVUpdateMI->getOperand(1).getReg(); + else + continue; + + LLVM_DEBUG( + dbgs() << "Sve Mul Strength reduction pattern matched for MUL: "; + MI.dump();); + + // --- Start of the Transformation --- + auto PreheaderInsertPt = Preheader->getFirstTerminator(); + DebugLoc DL = MI.getDebugLoc(); + + // In the preheader, create a new offset = index(0, IndexStep * + // MultiplierVal) + Register OffsetVecReg = MRI->createVirtualRegister(ZprRegClass); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(IndexIiOpc), + OffsetVecReg) + .addImm(0) + .addImm(IndexStep * MultiplierVal) + .addReg(AArch64::VG, RegState::Implicit); + + // In the preheader, calculate the new step value for our new induction + // variable. This is: NewStep = IVStep * MultiplierVal + MachineInstr *IVStepDef = MRI->getVRegDef(IVStepReg); + if (IVStepDef->getOpcode() == AArch64::COPY) + IVStepDef = MRI->getVRegDef(IVStepDef->getOperand(1).getReg()); + + // Check if the original IV step is the vector length (vl). + bool isStepVL = + IVStepDef && IVStepDef->getOpcode() == CntOpc && + IVStepDef->getOperand(1).getImm() == 31 && // Pattern for 'all' + IVStepDef->getOperand(2).getImm() == 1; // Multiplier of 1 + Register NewStepReg = MRI->createVirtualRegister(GprRegClass); + + // If the step is 'vl' and the multiplier is small, we can use a more + // efficient 'cnt' instruction. + if (isStepVL && MultiplierVal <= 15) { + Register NewStep64Reg = + MRI->createVirtualRegister(&AArch64::GPR64RegClass); + LLVM_DEBUG(dbgs() << "IV Step is vl, using CNT[W/D] for new step.\n"); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(CntOpc), + NewStep64Reg) + .addImm(31) // Pattern 'all' for vl + .addImm(MultiplierVal) + .addReg(AArch64::VG, RegState::Implicit); + if (!is64Bit) { + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(AArch64::COPY), + NewStepReg) + .addReg(NewStep64Reg, 0, AArch64::sub_32); + } + } else { + // Otherwise, we use a general multiplication. + LLVM_DEBUG( + dbgs() << "IV Step is not vl, using generic MUL for new step.\n"); + Register MultReg = MRI->createVirtualRegister(GprRegClass); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(MovImmOpc), MultReg) + .addImm(MultiplierVal); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(MaddGprOpc), + NewStepReg) + .addReg(IVStepReg) + .addReg(MultReg) + .addReg(ZeroReg); + } + + // In the preheader, calculate the initial value for the new base IV. + // BaseIVInit = IVInit * MultiplierVal + Register BaseIVInitReg = MRI->createVirtualRegister(GprAllRegClass); + Register MultRegForInit = MRI->createVirtualRegister(GprRegClass); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(MovImmOpc), + MultRegForInit) + .addImm(MultiplierVal); + BuildMI(*Preheader, PreheaderInsertPt, DL, TII->get(MaddGprOpc), + BaseIVInitReg) + .addReg(IVInitReg) + .addReg(MultRegForInit) + .addReg(ZeroReg); + + // Create a new PHI node in the header + // for our new base induction variable. 
+      Register BaseIVReg = MRI->createVirtualRegister(GprAllRegClass);
+      Register NextBaseIVReg = MRI->createVirtualRegister(GprAllRegClass);
+      auto BaseIVPhi = BuildMI(*Header, Header->getFirstNonPHI(), DL,
+                               TII->get(AArch64::PHI), BaseIVReg);
+      BaseIVPhi.addReg(BaseIVInitReg).addMBB(Preheader);
+
+      // In the loop latch, update our new base induction variable
+      // by adding the new step.
+      BuildMI(*Latch, Latch->getFirstTerminator(), DL, TII->get(AddGprOpc),
+              NextBaseIVReg)
+          .addReg(BaseIVReg)
+          .addReg(NewStepReg);
+
+      BaseIVPhi.addReg(NextBaseIVReg).addMBB(Latch);
+
+      // Now, replace the original multiply operation in the loop body
+      // with a new add operation.
+      auto BodyInsertPt = MI.getIterator();
+
+      // Broadcast the new base IV into a vector register.
+      Register BaseVecReg = MRI->createVirtualRegister(ZprRegClass);
+      BuildMI(*MI.getParent(), BodyInsertPt, DL, TII->get(DupZrOpc), BaseVecReg)
+          .addReg(BaseIVReg);
+
+      // Perform the vector addition: NewResult = OffsetVector + BaseVector
+      Register AddTmpReg = MRI->createVirtualRegister(ZprRegClass);
+      BuildMI(*MI.getParent(), BodyInsertPt, DL, TII->get(AddZzzOpc), AddTmpReg)
+          .addReg(OffsetVecReg)
+          .addReg(BaseVecReg);
+
+      // Replace all uses of the original multiplication result
+      // with our new addition result.
+      MRI->replaceRegWith(MI.getOperand(0).getReg(), AddTmpReg);
+
+      // Clean up the now-dead instructions from the old calculation.
+      MI.eraseFromParent();
+      if (MRI->use_empty(SelMI->getOperand(0).getReg()))
+        SelMI->eraseFromParent();
+      if (MRI->use_empty(IndexMI->getOperand(0).getReg())) {
+        if (IndexMI->getOpcode() == AddZzzOpc) {
+          MachineInstr *Op1 = MRI->getVRegDef(IndexMI->getOperand(1).getReg());
+          MachineInstr *Op2 = MRI->getVRegDef(IndexMI->getOperand(2).getReg());
+          if (MRI->use_empty(Op1->getOperand(0).getReg()))
+            Op1->eraseFromParent();
+          if (MRI->use_empty(Op2->getOperand(0).getReg()))
+            Op2->eraseFromParent();
+        }
+        IndexMI->eraseFromParent();
+      }
+      if (MRI->use_empty(MultiplierMI->getOperand(0).getReg()))
+        MultiplierMI->eraseFromParent();
+
+      LLVM_DEBUG(dbgs() << "Successfully applied strength reduction.\n");
+
+      return true;
+    }
+  }
+  return false;
+}
+
+bool SVELoopAddressHoisting::processLoop(MachineLoop *L) {
+  MachineBasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader)
+    return false;
+
+  bool Changed = false;
+  LLVM_DEBUG(dbgs() << "********** Processing Loop in Function: "
+                    << L->getHeader()->getParent()->getName()
+                    << " (Loop Header: " << L->getHeader()->getName()
+                    << ") **********\n");
+
+  // Collect all candidate instructions and their addressing chains.
+  ChainMap Chains;
+  SetVector<MachineInstr *> CandidateDeadInsts;
+  collectOptimizationCandidates(L, Chains, CandidateDeadInsts);
+
+  if (Chains.empty())
+    return false;
+
+  // Hoist invariants and rewrite the instructions in the loop.
+  Changed |= hoistInvariantsAndRewrite(L, Chains);
+
+  // Clean up the original, now-dead, address computation instructions.
+  if (Changed)
+    Changed |= cleanupDeadCode(CandidateDeadInsts);
+  Changed |= sveMulStrengthReduction(L);
+  return Changed;
+}
+
+bool SVELoopAddressHoisting::runOnMachineFunction(MachineFunction &MF,
+                                                  MachineLoopInfo *MLI) {
+  if (!MF.getSubtarget<AArch64Subtarget>().hasSVE())
+    return false;
+
+  MRI = &MF.getRegInfo();
+  TII = MF.getSubtarget().getInstrInfo();
+
+  LLVM_DEBUG(dbgs() << "Running AArch64 SVE Loop Address Hoisting on function: "
+                    << MF.getName() << "\n");
+
+  bool Changed = false;
+  for (MachineLoop *L : *MLI) {
+    for (MachineLoop *SubL : *L) {
+      Changed |= processLoop(SubL);
+    }
+    Changed |=
processLoop(L); + } + + return Changed; +} + +//===----------------------------------------------------------------------===// +// AArch64MIPeepholeOpt +//===----------------------------------------------------------------------===// + namespace { struct AArch64MIPeepholeOpt : public MachineFunctionPass { @@ -86,6 +949,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { const AArch64RegisterInfo *TRI; MachineLoopInfo *MLI; MachineRegisterInfo *MRI; + SVELoopAddressHoisting AddressHoister; using OpcodePair = std::pair; template @@ -751,7 +1615,8 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { } } } - + if (EnableSVEPeephole) + Changed |= AddressHoister.runOnMachineFunction(MF, MLI); return Changed; } diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll new file mode 100644 index 0000000000000000000000000000000000000000..a2f826f9aba965ab4e5951f392a95b8fc948bfec --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-addressing-peephole.ll @@ -0,0 +1,540 @@ +; RUN: llc -mtriple=aarch64-unknown -mcpu=hip09 -O1 %s -o - | FileCheck %s + +define dso_local void @test_gather_multi_constOffset(i32 noundef %loopTime, ptr noundef %x, float noundef %ipx, float noundef %ipy, float noundef %ipz, ptr nocapture noundef nonnull align 4 dereferenceable(4) %tempx, ptr nocapture noundef nonnull align 4 dereferenceable(4) %tempy, ptr nocapture noundef nonnull align 4 dereferenceable(4) %tempz) local_unnamed_addr #0 { +; CHECK-LABEL: test_gather_multi_constOffset: +; CHECK: // %bb.1: // %for.body.lr.ph +; CHECK: mov w[[MULTIPLIER:[0-9]+]], #3 +; CHECK: index z[[OFFSET:[0-9]+]].s, #0, #3 +; CHECK: add x[[NEWBASE1:[0-9]+]], x1, #4 +; CHECK: mul w[[IV:[0-9]+]], wzr, w[[MULTIPLIER]] +; CHECK: add x[[NEWBASE2:[0-9]+]], x1, #8 +; CHECK: cntw x[[STRIDE:[0-9]+]], all, mul #3 +; CHECK: .LBB0_2: // %for.body +; CHECK: mov z[[INDEX:[0-9]+]].s, w[[IV]] +; CHECK: add w[[IV]], w[[IV]], w[[STRIDE]] +; CHECK: add z[[INDEX]].s, z[[OFFSET]].s, z[[INDEX]].s +; CHECK: ld1w { {{z[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x1, z[[INDEX:[0-9]+]].s, sxtw #2] +; CHECK: ld1w { {{z[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE1]], z[[INDEX]].s, sxtw #2] +; CHECK: ld1w { {{z[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE2]], z[[INDEX]].s, sxtw #2] +entry: + %cmp18 = icmp sgt i32 %loopTime, 0 + br i1 %cmp18, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %.splatinsert = insertelement poison, float %ipx, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %.splatinsert2 = insertelement poison, float %ipy, i64 0 + %.splat3 = shufflevector %.splatinsert2, poison, zeroinitializer + %.splatinsert5 = insertelement poison, float %ipz, i64 0 + %.splat6 = shufflevector %.splatinsert5, poison, zeroinitializer + %.tr = tail call i32 @llvm.vscale.i32() + %0 = shl nuw nsw i32 %.tr, 2 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %jp.019 = phi i32 [ 0, %for.body.lr.ph ], [ %conv10, %for.body ] + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.019, i32 %loopTime) + %2 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.019, i32 1) + %3 = select %1, %2, zeroinitializer + %4 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %3, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer)) + %5 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %x, %4) + %6 = select %1, %5, 
zeroinitializer + %7 = tail call @llvm.aarch64.sve.fsubr.nxv4f32( %1, %6, %.splat) + %8 = select %1, %7, zeroinitializer + %9 = tail call @llvm.aarch64.sve.fmul.nxv4f32( %1, %8, %7) + %10 = select %1, %4, zeroinitializer + %11 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %10, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %12 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %x, %11) + %13 = select %1, %12, zeroinitializer + %14 = tail call @llvm.aarch64.sve.fsub.nxv4f32( %1, %13, %.splat3) + %15 = select %1, %14, zeroinitializer + %16 = tail call @llvm.aarch64.sve.fmad.nxv4f32( %1, %15, %14, %9) + %17 = select %1, %11, zeroinitializer + %18 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %17, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %19 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %x, %18) + %20 = select %1, %19, zeroinitializer + %21 = tail call @llvm.aarch64.sve.fsub.nxv4f32( %1, %20, %.splat6) + %22 = select %1, %21, zeroinitializer + %23 = tail call @llvm.aarch64.sve.fmad.nxv4f32( %1, %22, %21, %16) + %24 = tail call @llvm.aarch64.sve.fsqrt.nxv4f32( zeroinitializer, %1, %23) + %25 = tail call @llvm.aarch64.sve.fmul.nxv4f32( %1, %8, %24) + %26 = tail call float @llvm.aarch64.sve.faddv.nxv4f32( %1, %25) + %27 = load float, ptr %tempx, align 4, !tbaa !5 + %add = fadd float %26, %27 + store float %add, ptr %tempx, align 4, !tbaa !5 + %28 = tail call @llvm.aarch64.sve.fmul.nxv4f32( %1, %15, %24) + %29 = tail call float @llvm.aarch64.sve.faddv.nxv4f32( %1, %28) + %30 = load float, ptr %tempy, align 4, !tbaa !5 + %add7 = fadd float %29, %30 + store float %add7, ptr %tempy, align 4, !tbaa !5 + %31 = tail call @llvm.aarch64.sve.fmul.nxv4f32( %1, %22, %24) + %32 = tail call float @llvm.aarch64.sve.faddv.nxv4f32( %1, %31) + %33 = load float, ptr %tempz, align 4, !tbaa !5 + %add8 = fadd float %32, %33 + store float %add8, ptr %tempz, align 4, !tbaa !5 + %conv10 = add i32 %0, %jp.019 + %cmp = icmp slt i32 %conv10, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !9 +} + +define dso_local void @test_scatter_constOffset(i32 noundef %loopTime, ptr noalias noundef %dst, ptr noalias nocapture noundef readonly %tempx, ptr noalias nocapture noundef readonly %tempy, ptr noalias nocapture noundef readonly %tempz) local_unnamed_addr #0 { +; CHECK-LABEL: test_scatter_constOffset: +; CHECK: // %bb.1: // %for.body.lr.ph +; CHECK: add x[[NEWBASE1:[0-9]+]], x1, #4 +; CHECK-NEXT: add x[[NEWBASE2:[0-9]+]], x1, #8 +; CHECK: .LBB1_2: // %for.body +; CHECK: st1w { {{z[0-9]+}}.s }, p[[PG:[0-9]+]], [x1, z[[INDEX:[0-9]+]].s, sxtw #2] +; CHECK-NEXT: st1w { {{z[0-9]+}}.s }, p[[PG]], [x[[NEWBASE1]], z[[INDEX]].s, sxtw #2] +; CHECK-NEXT: st1w { {{z[0-9]+}}.s }, p[[PG]], [x[[NEWBASE2]], z[[INDEX]].s, sxtw #2] +entry: + %cmp15 = icmp sgt i32 %loopTime, 0 + br i1 %cmp15, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %.tr = tail call i32 @llvm.vscale.i32() + %0 = shl nuw nsw i32 %.tr, 2 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %jp.016 = phi i32 [ 0, %for.body.lr.ph ], [ %conv5, %for.body ] + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.016, i32 %loopTime) + %2 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.016, i32 1) + %3 = select %1, %2, zeroinitializer + %4 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %3, shufflevector ( 
insertelement ( poison, i32 3, i64 0), poison, zeroinitializer)) + %5 = select %1, %4, zeroinitializer + %6 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %7 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer)) + %idx.ext = sext i32 %jp.016 to i64 + %add.ptr = getelementptr inbounds float, ptr %tempx, i64 %idx.ext + %8 = tail call @llvm.masked.load.nxv4f32.p0(ptr %add.ptr, i32 1, %1, zeroinitializer), !tbaa !5 + %add.ptr2 = getelementptr inbounds float, ptr %tempy, i64 %idx.ext + %9 = tail call @llvm.masked.load.nxv4f32.p0(ptr %add.ptr2, i32 1, %1, zeroinitializer), !tbaa !5 + %add.ptr4 = getelementptr inbounds float, ptr %tempz, i64 %idx.ext + %10 = tail call @llvm.masked.load.nxv4f32.p0(ptr %add.ptr4, i32 1, %1, zeroinitializer), !tbaa !5 + tail call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32( %8, %1, ptr %dst, %4) + tail call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32( %9, %1, ptr %dst, %6) + tail call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32( %10, %1, ptr %dst, %7) + %conv5 = add i32 %0, %jp.016 + %cmp = icmp slt i32 %conv5, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !11 +} + +define dso_local void @test_prefetch_constOffset(i32 noundef %loopTime, ptr nocapture noundef %data) local_unnamed_addr #4 { +; CHECK-LABEL: test_prefetch_constOffset: +; CHECK: // %bb.1: // %for.body.lr.ph +; CHECK: add x[[NEWBASE1:[0-9]+]], x1, #4 +; CHECK-NEXT: add x[[NEWBASE2:[0-9]+]], x1, #8 +; CHECK: // %bb.4: // %if.end +; CHECK: prfw pldl1keep, p[[PG:[0-9]+]], [x1, z[[INDEX:[0-9]+]].s, sxtw #2] +; CHECK-NEXT: prfw pldl1keep, p[[PG]], [x[[NEWBASE1]], z[[INDEX]].s, sxtw #2] +; CHECK-NEXT: prfw pldl1keep, p[[PG]], [x[[NEWBASE2]], z[[INDEX]].s, sxtw #2] +entry: + %.tr = tail call i32 @llvm.vscale.i32() + %conv = shl nuw nsw i32 %.tr, 2 + %cmp13 = icmp sgt i32 %loopTime, 0 + br i1 %cmp13, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %0 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + br label %for.body + +for.cond.cleanup: ; preds = %cleanup, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %cleanup + %jp.014 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %cleanup ] + %add = add i32 %jp.014, %conv + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %add, i32 %loopTime) + %2 = tail call i1 @llvm.aarch64.sve.ptest.any.nxv4i1( %0, %1) + br i1 %2, label %if.end, label %cleanup + +if.end: ; preds = %for.body + %3 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %add, i32 1) + %4 = select %1, %3, zeroinitializer + %5 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %4, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer)) + tail call void @llvm.aarch64.sve.prfw.gather.sxtw.index.nxv4i32( %1, ptr %data, %5, i32 0) + %6 = select %1, %5, zeroinitializer + %7 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %6, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + tail call void @llvm.aarch64.sve.prfw.gather.sxtw.index.nxv4i32( %1, ptr %data, %7, i32 0) + %8 = select %1, %7, zeroinitializer + %9 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %8, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + tail call void @llvm.aarch64.sve.prfw.gather.sxtw.index.nxv4i32( %1, ptr %data, %9, i32 0) + br label %cleanup + +cleanup: ; preds = %for.body, %if.end + %cmp = 
icmp slt i32 %add, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !12 +} + +define dso_local void @test_stride_constOffset(i32 noundef %loopTime, ptr noundef %data, ptr nocapture noundef %result) local_unnamed_addr #0 { +; CHECK-LABEL: test_stride_constOffset: +; CHECK: // %bb.1: // %for.body.lr.ph +; CHECK: add x[[NEWBASE1:[0-9]+]], x1, #8 +; CHECK-NEXT: add x[[NEWBASE2:[0-9]+]], x1, #16 +; CHECK: .LBB3_2: // %for.body +; CHECK: ld1w { {{z[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x1, z[[INDEX:[0-9]+]].s, sxtw #2] +; CHECK-NEXT: ld1w { {{z[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE1]], z[[INDEX]].s, sxtw #2] +; CHECK-NEXT: ld1w { {{z[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE2]], z[[INDEX]].s, sxtw #2] +entry: + %cmp9 = icmp sgt i32 %loopTime, 0 + br i1 %cmp9, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %.tr = tail call i32 @llvm.vscale.i32() + %0 = shl nuw nsw i32 %.tr, 2 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %jp.010 = phi i32 [ 0, %for.body.lr.ph ], [ %conv1, %for.body ] + %1 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.010, i32 %loopTime) + %2 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.010, i32 1) + %3 = select %1, %2, zeroinitializer + %4 = tail call @llvm.aarch64.sve.mul.nxv4i32( %1, %3, shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer)) + %5 = select %1, %4, zeroinitializer + %6 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, zeroinitializer) + %7 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %data, %6) + %8 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer)) + %9 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %data, %8) + %10 = tail call @llvm.aarch64.sve.add.nxv4i32( %1, %5, shufflevector ( insertelement ( poison, i32 4, i64 0), poison, zeroinitializer)) + %11 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %1, ptr %data, %10) + %12 = select %1, %7, zeroinitializer + %13 = tail call @llvm.aarch64.sve.fadd.nxv4f32( %1, %12, %9) + %14 = select %1, %13, zeroinitializer + %15 = tail call @llvm.aarch64.sve.fadd.nxv4f32( %1, %14, %11) + %idx.ext = sext i32 %jp.010 to i64 + %add.ptr = getelementptr inbounds float, ptr %result, i64 %idx.ext + tail call void @llvm.masked.store.nxv4f32.p0( %15, ptr %add.ptr, i32 1, %1), !tbaa !5 + %conv1 = add i32 %0, %jp.010 + %cmp = icmp slt i32 %conv1, %loopTime + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !13 +} + +define dso_local void @test_invariantOffset32bit(i32 noundef %N, i32 noundef %M, ptr noundef %matrix, ptr nocapture noundef %result) local_unnamed_addr #0 { +; CHECK-LABEL: test_invariantOffset32bit: +; CHECK: // %bb.4: // %for.body4.lr.ph +; CHECK: add x[[NEWBASE1:[0-9]+]], x2, w[[NEWBASE1]], sxtw #2 +; CHECK: add x[[NEWBASE2:[0-9]+]], x2, w[[NEWBASE2]], sxtw #2 +; CHECK: add x[[NEWBASE3:[0-9]+]], x2, w[[NEWBASE3]], sxtw #2 +; CHECK: .LBB4_5: // %for.body4 +; CHECK: ld1w { {{z[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z[[INDEX:[0-9]+]].s, sxtw #2] +; CHECK-NEXT: ld1w { {{z[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE2]], z[[INDEX]].s, sxtw #2] +; CHECK-NEXT: ld1w { {{z[0-9]+}}.s }, p[[PG]]/z, [x[[NEWBASE3]], z[[INDEX]].s, sxtw #2] +entry: + %cmp41 = icmp sgt i32 %N, 2 + br i1 %cmp41, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup + +for.cond1.preheader.lr.ph: ; preds = %entry + %div51 
= udiv i32 %N, 3 + %cmp239 = icmp sgt i32 %M, 0 + %0 = sext i32 %M to i64 + %wide.trip.count = zext i32 %div51 to i64 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.cond.cleanup3 + %indvars.iv = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next, %for.cond.cleanup3 ] + br i1 %cmp239, label %for.body4.lr.ph, label %for.cond.cleanup3 + +for.body4.lr.ph: ; preds = %for.cond1.preheader + %1 = mul nuw nsw i64 %indvars.iv, 3 + %2 = trunc i64 %1 to i32 + %3 = mul i32 %2, %M + %.splatinsert = insertelement poison, i32 %3, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %4 = trunc i64 %1 to i32 + %5 = add i32 %4, 1 + %6 = mul i32 %5, %M + %.splatinsert9 = insertelement poison, i32 %6, i64 0 + %.splat10 = shufflevector %.splatinsert9, poison, zeroinitializer + %7 = trunc i64 %1 to i32 + %8 = add i32 %7, 2 + %9 = mul i32 %8, %M + %.splatinsert14 = insertelement poison, i32 %9, i64 0 + %.splat15 = shufflevector %.splatinsert14, poison, zeroinitializer + %10 = mul nsw i64 %indvars.iv, %0 + %add.ptr = getelementptr inbounds float, ptr %result, i64 %10 + %.tr = tail call i32 @llvm.vscale.i32() + %11 = shl nuw nsw i32 %.tr, 2 + br label %for.body4 + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.cond.cleanup3: ; preds = %for.body4, %for.cond1.preheader + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !14 + +for.body4: ; preds = %for.body4.lr.ph, %for.body4 + %jp.040 = phi i32 [ 0, %for.body4.lr.ph ], [ %conv20, %for.body4 ] + %12 = tail call @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %jp.040, i32 %M) + %13 = tail call @llvm.aarch64.sve.index.nxv4i32(i32 %jp.040, i32 1) + %14 = select %12, %13, zeroinitializer + %15 = tail call @llvm.aarch64.sve.add.nxv4i32( %12, %14, %.splat) + %16 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %12, ptr %matrix, %15) + %17 = tail call @llvm.aarch64.sve.add.nxv4i32( %12, %14, %.splat10) + %18 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %12, ptr %matrix, %17) + %19 = tail call @llvm.aarch64.sve.add.nxv4i32( %12, %14, %.splat15) + %20 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32( %12, ptr %matrix, %19) + %21 = select %12, %16, zeroinitializer + %22 = tail call @llvm.aarch64.sve.fadd.nxv4f32( %12, %21, %18) + %23 = select %12, %22, zeroinitializer + %24 = tail call @llvm.aarch64.sve.fadd.nxv4f32( %12, %23, %20) + %25 = select %12, %24, zeroinitializer + %26 = tail call @llvm.aarch64.sve.fdiv.nxv4f32( %12, %25, shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer)) + %idx.ext17 = sext i32 %jp.040 to i64 + %add.ptr18 = getelementptr inbounds float, ptr %add.ptr, i64 %idx.ext17 + tail call void @llvm.masked.store.nxv4f32.p0( %26, ptr %add.ptr18, i32 1, %12), !tbaa !5 + %conv20 = add i32 %11, %jp.040 + %cmp2 = icmp slt i32 %conv20, %M + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3, !llvm.loop !15 +} + +define dso_local void @test_invariantOffset64bit(i64 noundef %N, i64 noundef %M, ptr noundef %matrix, ptr nocapture noundef %result) local_unnamed_addr #0 { +; CHECK-LABEL: test_invariantOffset64bit: +; CHECK: // %bb.5: // %for.body4.lr.ph +; CHECK: add x[[NEWBASE1:[0-9]+]], x2, x[[NEWBASE1]], lsl #3 +; CHECK-NEXT: add x[[NEWBASE3:[0-9]+]], x2, x[[NEWBASE3]], lsl #3 +; CHECK-NEXT: add x[[NEWBASE2:[0-9]+]], x2, x[[NEWBASE2]], 
lsl #3 +; CHECK: .LBB5_6: // %for.body4 +; CHECK: ld1d { {{z[0-9]+}}.d }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z[[INDEX:[0-9]+]].d, lsl #3] +; CHECK-NEXT: ld1d { {{z[0-9]+}}.d }, p[[PG]]/z, [x[[NEWBASE2]], z[[INDEX]].d, lsl #3] +; CHECK-NEXT: ld1d { {{z[0-9]+}}.d }, p[[PG]]/z, [x[[NEWBASE3]], z[[INDEX]].d, lsl #3] +entry: + %cmp39.not = icmp ult i64 %N, 3 + br i1 %cmp39.not, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph + +for.cond1.preheader.lr.ph: ; preds = %entry + %div = udiv i64 %N, 3 + %cmp237.not = icmp eq i64 %M, 0 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.cond.cleanup3 + %i.040 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %inc, %for.cond.cleanup3 ] + br i1 %cmp237.not, label %for.cond.cleanup3, label %for.body4.lr.ph + +for.body4.lr.ph: ; preds = %for.cond1.preheader + %mul = mul nuw i64 %i.040, 3 + %mul5 = mul i64 %mul, %M + %.splatinsert = insertelement poison, i64 %mul5, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %add7 = add nuw i64 %mul, 1 + %mul8 = mul i64 %add7, %M + %.splatinsert9 = insertelement poison, i64 %mul8, i64 0 + %.splat10 = shufflevector %.splatinsert9, poison, zeroinitializer + %add12 = add nuw i64 %mul, 2 + %mul13 = mul i64 %add12, %M + %.splatinsert14 = insertelement poison, i64 %mul13, i64 0 + %.splat15 = shufflevector %.splatinsert14, poison, zeroinitializer + %mul16 = mul i64 %i.040, %M + %add.ptr = getelementptr inbounds double, ptr %result, i64 %mul16 + %0 = tail call i64 @llvm.vscale.i64() + %1 = shl nuw nsw i64 %0, 1 + br label %for.body4 + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void + +for.cond.cleanup3: ; preds = %for.body4, %for.cond1.preheader + %inc = add nuw nsw i64 %i.040, 1 + %exitcond.not = icmp eq i64 %inc, %div + br i1 %exitcond.not, label %for.cond.cleanup, label %for.cond1.preheader, !llvm.loop !16 + +for.body4: ; preds = %for.body4.lr.ph, %for.body4 + %jp.038 = phi i64 [ 0, %for.body4.lr.ph ], [ %add18, %for.body4 ] + %2 = tail call @llvm.aarch64.sve.whilelo.nxv2i1.i64(i64 %jp.038, i64 %M) + %3 = tail call @llvm.aarch64.sve.index.nxv2i64(i64 %jp.038, i64 1) + %4 = select %2, %3, zeroinitializer + %5 = tail call @llvm.aarch64.sve.add.nxv2i64( %2, %4, %.splat) + %6 = tail call @llvm.aarch64.sve.ld1.gather.index.nxv2f64( %2, ptr %matrix, %5) + %7 = tail call @llvm.aarch64.sve.add.nxv2i64( %2, %4, %.splat10) + %8 = tail call @llvm.aarch64.sve.ld1.gather.index.nxv2f64( %2, ptr %matrix, %7) + %9 = tail call @llvm.aarch64.sve.add.nxv2i64( %2, %4, %.splat15) + %10 = tail call @llvm.aarch64.sve.ld1.gather.index.nxv2f64( %2, ptr %matrix, %9) + %11 = select %2, %6, zeroinitializer + %12 = tail call @llvm.aarch64.sve.fadd.nxv2f64( %2, %11, %8) + %13 = select %2, %12, zeroinitializer + %14 = tail call @llvm.aarch64.sve.fadd.nxv2f64( %2, %13, %10) + %15 = select %2, %14, zeroinitializer + %16 = tail call @llvm.aarch64.sve.fdiv.nxv2f64( %2, %15, shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer)) + %add.ptr17 = getelementptr inbounds double, ptr %add.ptr, i64 %jp.038 + tail call void @llvm.masked.store.nxv2f64.p0( %16, ptr %add.ptr17, i32 1, %2), !tbaa !17 + %add18 = add i64 %1, %jp.038 + %cmp2 = icmp ult i64 %add18, %M + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3, !llvm.loop !19 +} + +define dso_local void @test_svaddx_constOffset(ptr noundef %base, %index) local_unnamed_addr #0 { +; CHECK-LABEL: test_svaddx_constOffset: +; CHECK: // %bb.0: // %entry +; CHECK: add 
x[[NEWBASE1:[0-9]+]], x0, #40 +; CHECK: add x[[NEWBASE2:[0-9]+]], x0, #44 +; CHECK: .LBB6_1: // %for.body +; CHECK: ld1w { {{z[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE1]], z[[INDEX:[0-9]+]].s, uxtw #2] +; CHECK: ld1w { {{z[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE2]], z[[INDEX:[0-9]+]].s, uxtw #2] +entry: + %0 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.1, %for.body ] + %index.addr.05 = phi [ %index, %entry ], [ %8, %for.body ] + %1 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %index.addr.05, shufflevector ( insertelement ( poison, i32 10, i64 0), poison, zeroinitializer)) + %2 = tail call @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32( %0, ptr %base, %1) + %3 = shl nuw nsw i64 %indvars.iv, 4 + %add.ptr = getelementptr inbounds i32, ptr %base, i64 %3 + store %2, ptr %add.ptr, align 16, !tbaa !20 + %4 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %index.addr.05, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %5 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %4, shufflevector ( insertelement ( poison, i32 10, i64 0), poison, zeroinitializer)) + %6 = tail call @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32( %0, ptr %base, %5) + %indvars.iv.next = shl i64 %indvars.iv, 4 + %7 = or i64 %indvars.iv.next, 16 + %add.ptr.1 = getelementptr inbounds i32, ptr %base, i64 %7 + store %6, ptr %add.ptr.1, align 16, !tbaa !20 + %8 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %4, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2 + %exitcond.not.1 = icmp eq i64 %indvars.iv.next.1, 100 + br i1 %exitcond.not.1, label %for.cond.cleanup, label %for.body, !llvm.loop !22 +} + +define dso_local void @_Z26test_loop_invariant_offsetPlu11__SVInt64_tl(ptr noundef %base, %index, i64 noundef %invariant_offset) local_unnamed_addr #6 { + +entry: + %0 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %.splatinsert = insertelement poison, i64 %invariant_offset, i64 0 + %1 = shufflevector %.splatinsert, poison, zeroinitializer + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %index.addr.05 = phi [ %index, %entry ], [ %4, %for.body ] + %2 = tail call @llvm.aarch64.sve.add.u.nxv2i64( %0, %index.addr.05, %1) + %.splatinsert3 = insertelement poison, i64 %indvars.iv, i64 0 + %3 = shufflevector %.splatinsert3, poison, zeroinitializer + tail call void @llvm.aarch64.sve.st1.scatter.index.nxv2i64( %3, %0, ptr %base, %2) + %4 = tail call @llvm.aarch64.sve.add.u.nxv2i64( %0, %index.addr.05, shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !23 +} + +define dso_local void @test_combined_const_and_invariant_offset(ptr noundef %base, %index, i32 noundef %invariant_offset) local_unnamed_addr #0 { +; CHECK-LABEL: test_combined_const_and_invariant_offset: +; CHECK: // %bb.0: // %entry +; CHECK: add x[[NEWBASE:[0-9]+]], x0, w1, sxtw #2 +; CHECK: add x[[NEWBASE]], x[[NEWBASE]], #40 +; CHECK: .LBB8_1: // %for.body +; CHECK: ld1w { {{z[0-9]+}}.s }, p[[PG:[0-9]+]]/z, [x[[NEWBASE]], 
z[[INDEX:[0-9]+]].s, sxtw #2] +entry: + %0 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %.splatinsert = insertelement poison, i32 %invariant_offset, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %index.addr.06 = phi [ %index, %entry ], [ %7, %for.body ] + %1 = select %0, %index.addr.06, zeroinitializer + %2 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %1, shufflevector ( insertelement ( poison, i32 10, i64 0), poison, zeroinitializer)) + %3 = select %0, %2, zeroinitializer + %4 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %3, %.splat) + %5 = tail call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32( %0, ptr %base, %4) + %6 = shl nuw nsw i64 %indvars.iv, 4 + %add.ptr = getelementptr inbounds i32, ptr %base, i64 %6 + store %5, ptr %add.ptr, align 16, !tbaa !20 + %7 = tail call @llvm.aarch64.sve.add.u.nxv4i32( %0, %index.addr.06, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !24 +} + +declare @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32, i32) #1 +declare @llvm.aarch64.sve.index.nxv4i32(i32, i32) #1 +declare @llvm.aarch64.sve.mul.nxv4i32(, , ) #1 +declare @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32(, ptr, ) #2 +declare @llvm.aarch64.sve.fsubr.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.fmul.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.add.nxv4i32(, , ) #1 +declare @llvm.aarch64.sve.fsub.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.fmad.nxv4f32(, , , ) #1 +declare @llvm.aarch64.sve.fsqrt.nxv4f32(, , ) #1 +declare float @llvm.aarch64.sve.faddv.nxv4f32(, ) #1 +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32(, , ptr, ) #3 +declare @llvm.aarch64.sve.ptrue.nxv4i1(i32 immarg) #1 +declare void @llvm.aarch64.sve.prfw.gather.sxtw.index.nxv4i32(, ptr nocapture, , i32 immarg) #5 +declare @llvm.aarch64.sve.fadd.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.fdiv.nxv4f32(, , ) #1 +declare @llvm.aarch64.sve.whilelo.nxv2i1.i64(i64, i64) #1 +declare @llvm.aarch64.sve.index.nxv2i64(i64, i64) #1 +declare @llvm.aarch64.sve.add.nxv2i64(, , ) #1 +declare @llvm.aarch64.sve.ld1.gather.index.nxv2f64(, ptr, ) #2 +declare @llvm.aarch64.sve.fadd.nxv2f64(, , ) #1 +declare @llvm.aarch64.sve.fdiv.nxv2f64(, , ) #1 +declare @llvm.aarch64.sve.add.u.nxv4i32(, , ) #1 +declare @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32(, ptr, ) #2 +declare @llvm.aarch64.sve.ptrue.nxv2i1(i32 immarg) #1 +declare @llvm.aarch64.sve.add.u.nxv2i64(, , ) #1 +declare void @llvm.aarch64.sve.st1.scatter.index.nxv2i64(, , ptr, ) #3 +declare @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32(, ptr, ) #2 +declare i64 @llvm.vscale.i64() #7 +declare i32 @llvm.vscale.i32() #7 +declare @llvm.masked.load.nxv4f32.p0(ptr nocapture, i32 immarg, , ) #8 +declare i1 @llvm.aarch64.sve.ptest.any.nxv4i1(, ) #7 +declare void @llvm.masked.store.nxv4f32.p0(, ptr nocapture, i32 immarg, ) #9 +declare void @llvm.masked.store.nxv2f64.p0(, ptr nocapture, i32 immarg, ) #9 + +attributes #0 = { mustprogress nofree nosync nounwind memory(argmem: readwrite) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hip09" 
"target-features"="+aes,+bf16,+crc,+dotprod,+f32mm,+f64mm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+sve,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,-fmv" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) } +attributes #4 = { mustprogress nofree nosync nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hip09" "target-features"="+aes,+bf16,+crc,+dotprod,+f32mm,+f64mm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+sve,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,-fmv" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) } +attributes #6 = { mustprogress nofree nosync nounwind memory(argmem: write) uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hip09" "target-features"="+aes,+bf16,+crc,+dotprod,+f32mm,+f64mm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+sve,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,-fmv" } +attributes #7 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #8 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #9 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2, !3, !4} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 8, !"PIC Level", i32 2} +!2 = !{i32 7, !"PIE Level", i32 2} +!3 = !{i32 7, !"uwtable", i32 2} +!4 = !{i32 7, !"frame-pointer", i32 1} +!5 = !{!6, !6, i64 0} +!6 = !{!"float", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C++ TBAA"} +!9 = distinct !{!9, !10} +!10 = !{!"llvm.loop.mustprogress"} +!11 = distinct !{!11, !10} +!12 = distinct !{!12, !10} +!13 = distinct !{!13, !10} +!14 = distinct !{!14, !10} +!15 = distinct !{!15, !10} +!16 = distinct !{!16, !10} +!17 = !{!18, !18, i64 0} +!18 = !{!"double", !7, i64 0} +!19 = distinct !{!19, !10} +!20 = !{!21, !21, i64 0} +!21 = !{!"int", !7, i64 0} +!22 = distinct !{!22, !10} +!23 = distinct !{!23, !10} +!24 = distinct !{!24, !10}