diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index ebdf96b04ccfdaa5712d99144b2b48fbda4ba6fa..fe2474646cfc33f10f0cdac36a83530f405bb6e8 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -263,14 +263,18 @@ void AArch64Subtarget::initializeProperties() {
     break;
   case TSV110:
     CacheLineSize = 64;
+    PrefetchDistance = 940;
     PrefFunctionLogAlignment = 4;
     PrefLoopLogAlignment = 2;
+    MinPrefetchStride = 4;
     break;
   case HIP09:
     CacheLineSize = 64;
+    PrefetchDistance = 940;
     PrefFunctionLogAlignment = 4;
     PrefLoopLogAlignment = 2;
     VScaleForTuning = 2;
+    MinPrefetchStride = 4;
     break;
   case HIP12:
     CacheLineSize = 64;
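Note (not part of the patch): PrefetchDistance and MinPrefetchStride reach the pass through TargetTransformInfo (getPrefetchDistance() / getMinPrefetchStride()). For orientation, the pass derives its look-ahead iteration count from the distance roughly as sketched below; this paraphrases the existing conversion in LoopDataPrefetch.cpp, with illustrative names:

    // Distance is in bytes; LoopSize is the loop's estimated code size.
    unsigned itersAhead(unsigned PrefetchDistance, unsigned LoopSize) {
      unsigned ItersAhead = PrefetchDistance / LoopSize;
      return ItersAhead ? ItersAhead : 1; // always look at least one ahead
    }

So 940 gives TSV110 and HIP09 roughly 14 iterations of look-ahead for a 64-byte loop body, while MinPrefetchStride = 4 filters out short-stride accesses that the hardware prefetcher already covers.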
diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index 360cd782756afbf4c0de89c805dbe9e94352eafd..59a04360e32cc6d4a253786c12e36e303b42f8b5 100644
--- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
@@ -71,7 +72,8 @@ static cl::opt
 static cl::opt<unsigned> PrefetchIterationsAhead(
     "indirect-prefetch-iters-ahead",
-    cl::desc("Number of iterations for indirect-load prefetch"), cl::Hidden);
+    cl::desc("Number of iterations for indirect-load prefetch"), cl::Hidden,
+    cl::init(0));
 static cl::opt<bool> SkipIntermediate(
     "indirect-prefetch-skip-intermediate", cl::Hidden, cl::init(false),
@@ -91,6 +93,20 @@
 static cl::opt<unsigned>
     CachelineSize("prefetch-cache-line-size",
                   cl::desc("Specify cache line size"), cl::Hidden, cl::init(64));
+static cl::opt<bool>
+    OuterLoopPrefetch("outer-loop-prefetch", cl::Hidden, cl::init(false),
+                      cl::desc("Enable prefetch in outer loops"));
+
+static cl::opt<bool>
+    DisableDirectLoadPrefetch("disable-direct-prefetch", cl::Hidden,
+                              cl::init(false),
+                              cl::desc("Disable direct load prefetch"));
+
+static cl::opt<unsigned>
+    PrefetchLoopDepth("prefetch-loop-depth",
+                      cl::desc("Least loop depth to insert prefetch"),
+                      cl::Hidden, cl::init(1));
+
 STATISTIC(NumPrefetches, "Number of prefetches inserted");
 STATISTIC(NumIndPrefetches, "Number of indirect prefetches inserted");
 STATISTIC(NumOuterLoopPrefetches, "Number of outer loop prefetches inserted");
@@ -137,6 +153,14 @@ private:
   bool getAuxIndVarBound(Loop *L, PHINode *PHI, Value *NumIterations,
                          ValueMap<PHINode *, Value *> &AuxIndBounds);
 
+  bool insertPrefetcherInOuterloopForIndirectLoad(
+      Loop *L, unsigned Idx, Value *NumIterations,
+      SmallVector<Instruction *> &CandidateMemoryLoads,
+      SmallSetVector<Instruction *, 8> &DependentInsts,
+      ValueMap<PHINode *, Value *> &AuxIndBounds,
+      SmallVectorImpl<DenseMap<Value *, Value *>> &Transforms,
+      unsigned ItersAhead);
+
   bool insertPrefetcherForIndirectLoad(
       Loop *L, unsigned Idx, Value *NumIterations,
       SmallVector<Instruction *> &CandidateMemoryLoads,
@@ -150,7 +174,8 @@
       SmallPtrSet<Instruction *, 8> &InstSet,
       SmallVector<Instruction *> &CandidateMemoryLoads,
       std::vector<SmallSetVector<Instruction *, 8>> &DependentInstList,
-      SmallPtrSet<PHINode *, 8> LoopAuxIndPHINodes, Loop *L);
+      SmallPtrSet<PHINode *, 8> LoopAuxIndPHINodes,
+      bool PrefetchInOuterLoop, Loop *L);
 
   /// Helper function to determine whether the given load is in
   /// CandidateMemoryLoads. If yes, add the candidate's depending inst to the
@@ -165,6 +190,30 @@
   /// processed to insert prefetches for indirect loads.
   bool canDoIndirectPrefetch(Loop *L);
 
+  bool isCrcHashDataAccess(Instruction *I, Instruction *PrefetchingLoad);
+
+  bool sortAndVerifyDependentInstructions(
+      Loop *L, SmallPtrSet<Instruction *, 8> &Visited,
+      SmallSetVector<Instruction *, 8> &DependentInsts,
+      SmallPtrSet<Instruction *, 8> &IndirectLoadDependents,
+      SmallVector<std::pair<Instruction *, unsigned>, 8> &SortedDependentInsts);
+
+  bool verifyNewBasicBlocks(Loop *L, BasicBlock *&ClonedEndBB,
+                            SmallVector<BasicBlock *> &ClonedBasicBlocks,
+                            SmallVector<Instruction *> &ClonedInstructions);
+
+  void updateDominatorTree(DomTreeUpdater &DTU, BasicBlock *&LoopPreheader,
+                           BasicBlock *&ClonedRootBB);
+  void
+  addNewBasicBlocksInDT(DomTreeUpdater &DTU, BasicBlock *&ClonedEndBB,
+                        SmallPtrSet<Instruction *, 8> &ClonedBranchInstructions,
+                        DenseMap<BasicBlock *, BasicBlock *> &BBTransformMap,
+                        SmallPtrSet<CallInst *, 8> &PrefetchCalls);
+
+  void moveInvariantOperandsToPreheader(
+      Loop *L, const SmallVector<Instruction *> &DependentInsts);
+
   /// Check if the stride of the accesses is large enough to
   /// warrant a prefetch.
   bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride);
@@ -275,6 +324,7 @@ Value *LoopDataPrefetch::getLoopIterationNumber(
   Value *LoopBoundValue;
   Value *LoopStepValue;
   Value *LoopStartValue;
+  BasicBlock *LoopPreHeader;
   Value *NumIterations;
 
   // Use induction variable to derive number of iterations for the loop which
@@ -291,9 +341,10 @@
   LoopStartValue = &(LoopLB->getInitialIVValue());
   LoopStepValue = LoopLB->getStepValue();
   LoopBoundValue = &(LoopLB->getFinalIVValue());
+  LoopPreHeader = L->getLoopPreheader();
 
   if (LoopStartValue == nullptr || LoopStepValue == nullptr ||
-      LoopBoundValue == nullptr)
+      LoopBoundValue == nullptr || LoopPreHeader == nullptr)
     return nullptr;
 
   // Step should be constant.
@@ -327,22 +378,7 @@
 /// If prefetch instruction is not inserted. Need to clean iteration instruction
 /// in the preheader.
 void LoopDataPrefetch::cleanLoopIterationNumber(Value *NumIterations) {
-  std::deque<Instruction *> IterationInsts;
-  if (NumIterations != nullptr && NumIterations->use_empty()) {
-    IterationInsts.push_back(dyn_cast<Instruction>(NumIterations));
-    while (IterationInsts.size() > 0) {
-      auto *IInst = IterationInsts.front();
-      IterationInsts.pop_front();
-      if (IInst->use_empty()) {
-        for (unsigned i = 0; i < IInst->getNumOperands(); i++) {
-          if (isa<Instruction>(IInst->getOperand(i)))
-            IterationInsts.push_back(
-                dyn_cast<Instruction>(IInst->getOperand(i)));
-        }
-        dyn_cast<Instruction>(IInst)->eraseFromParent();
-      }
-    }
-  }
+  RecursivelyDeleteTriviallyDeadInstructions(NumIterations);
 }
 
 /// Returns whether the auxiliary induction variable can generate bound.
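Note on the cleanLoopIterationNumber rewrite: RecursivelyDeleteTriviallyDeadInstructions (from llvm/Transforms/Utils/Local.h, hence the new include above) performs exactly the walk the deleted deque implemented: if the value is a trivially dead instruction it is erased, and any operands that become trivially dead as a result are erased recursively. A minimal usage sketch, assuming a possibly-null NumIterations as in the callers:

    // Sketch only: the helper returns false and does nothing when the value
    // still has uses or is not an instruction, mirroring the guards the
    // removed code performed by hand.
    if (NumIterations)
      RecursivelyDeleteTriviallyDeadInstructions(NumIterations);

One difference worth noting: the helper also consults isInstructionTriviallyDead, so instructions with side effects are never dropped, which is strictly safer than the manual use_empty() check.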
@@ -370,9 +406,6 @@ bool LoopDataPrefetch::canGetAuxIndVarBound(
         IndDesc.getInductionOpcode() != Instruction::Sub &&
         IndDesc.getKind() != InductionDescriptor::IK_PtrInduction)
       return false;
-
-    LoopAuxIndPHINodes.insert(PHI);
-    return true;
   }
 
   return false;
@@ -531,6 +564,19 @@ bool LoopDataPrefetch::insertPrefetcherForIndirectLoad(
   IRBuilder<> Builder(TargetIndirectLoad);
   Module *M = TargetIndirectLoad->getModule();
   Type *I32Ty = Type::getInt32Ty(TargetIndirectLoad->getParent()->getContext());
+
+  if (RandomAccessPrefetchOnly) {
+    bool isRandomAccess = false;
+    for (auto *I : DependentInsts) {
+      if (isCrcHashDataAccess(I, TargetIndirectLoad)) {
+        isRandomAccess = true;
+        break;
+      }
+    }
+    if (!isRandomAccess)
+      return false;
+  }
+
   LLVM_DEBUG(dbgs() << "Inserting indirect prefetchers for\t"
                     << *TargetIndirectLoad << "\twith "
                     << DependentInsts.size() << " dependent instructions\n");
@@ -735,12 +781,16 @@ bool LoopDataPrefetch::findCandidateMemoryLoads(
     SmallPtrSet<Instruction *, 8> &InstSet,
     SmallVector<Instruction *> &CandidateMemoryLoads,
     std::vector<SmallSetVector<Instruction *, 8>> &DependentInstList,
-    SmallPtrSet<PHINode *, 8> LoopAuxIndPHINodes, Loop *L) {
+    SmallPtrSet<PHINode *, 8> LoopAuxIndPHINodes, bool PrefetchInOuterLoop,
+    Loop *L) {
   bool ret = false;
 
   for (Use &U : I->operands()) {
     // If value is loop invariant, just continue
-    if (LI->getLoopFor(I->getParent())->isLoopInvariant(U.get()))
+    if (PrefetchInOuterLoop) {
+      if (L->getParentLoop()->isLoopInvariant(U.get()))
+        continue;
+    } else if (LI->getLoopFor(I->getParent())->isLoopInvariant(U.get()))
       continue;
 
     Instruction *OperandInst = dyn_cast<Instruction>(U.get());
@@ -777,9 +827,37 @@ bool LoopDataPrefetch::findCandidateMemoryLoads(
       break;
    }
     case Instruction::Call: {
-      // We currently can not handle case where indirect load depends on other
-      // functions yet.
-      return false;
+      if (PrefetchInOuterLoop || RandomAccessPrefetchOnly) {
+        if (OperandInst->mayReadOrWriteMemory())
+          return false;
+        CallInst *Call = dyn_cast<CallInst>(OperandInst);
+        if (!Call->doesNotThrow())
+          return false;
+
+        // Use DFS to search through the operands.
+        InstList.insert(OperandInst);
+        if (findCandidateMemoryLoads(OperandInst, InstList, InstSet,
+                                     CandidateMemoryLoads, DependentInstList,
+                                     LoopAuxIndPHINodes, PrefetchInOuterLoop,
+                                     L)) {
+          // We do not return early in case there are other auxiliary
+          // induction variables to check.
+          ret = true;
+        } else {
+          // If the operand isn't dependent on an auxiliary induction
+          // variable, remove any instructions added to DependentInstList from
+          // this operand.
+          if (InstList.count(OperandInst))
+            InstList.remove(OperandInst);
+          InstList.insert(OperandInst);
+          return false;
+        }
+        break;
+      } else {
+        // We currently cannot handle the case where an indirect load depends
+        // on other functions yet.
+        return false;
+      }
     }
     case Instruction::Invoke: {
       // We currently can not handle case where indirect load depends on other
@@ -793,7 +871,8 @@ bool LoopDataPrefetch::findCandidateMemoryLoads(
       InstList.insert(OperandInst);
       if (findCandidateMemoryLoads(OperandInst, InstList, InstSet,
                                    CandidateMemoryLoads, DependentInstList,
-                                   LoopAuxIndPHINodes, L)) {
+                                   LoopAuxIndPHINodes, PrefetchInOuterLoop,
+                                   L)) {
        // We do not return early in case there are other auxiliary induction
         // variables to check
         ret = true;
@@ -810,6 +889,242 @@ bool LoopDataPrefetch::findCandidateMemoryLoads(
   return ret;
 }
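Note: the loop shape findCandidateMemoryLoads is searching for is an indirect access whose address is computed from the value of another load that itself depends on an (auxiliary) induction variable. A hypothetical source-level example of that shape, for illustration only (not taken from the tests):

    // Idx[I] is the offset load; Data[Idx[I]] is the indirect target that
    // the pass will try to prefetch ahead of time.
    int sumIndirect(const int *Idx, const int *Data, int N) {
      int Sum = 0;
      for (int I = 0; I < N; ++I)
        Sum += Data[Idx[I]];
      return Sum;
    }

The new PrefetchInOuterLoop flag only changes which loop terminates the walk: when it is set, values invariant in the parent loop are skipped, instead of values invariant in the loop containing the instruction.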
+bool LoopDataPrefetch::sortAndVerifyDependentInstructions(
+    Loop *L, SmallPtrSet<Instruction *, 8> &Visited,
+    SmallSetVector<Instruction *, 8> &DependentInsts,
+    SmallPtrSet<Instruction *, 8> &IndirectLoadDependents,
+    SmallVector<std::pair<Instruction *, unsigned>, 8> &SortedDependentInsts) {
+  BasicBlock *LoopPreheader = L->getLoopPreheader();
+  DenseMap<BasicBlock *, int> BBPostNumbers;
+  Instruction *CandidateLoad = DependentInsts[0];
+  if (Visited.insert(LoopPreheader->getTerminator()).second)
+    DependentInsts.insert(LoopPreheader->getTerminator());
+
+  // Start from the target indirect load's block and collect its predecessor
+  // blocks up to the loop preheader, assigning each block a post-order number
+  // with which we can sort.
+  SmallSetVector<BasicBlock *, 8> BBPredecessors;
+  BBPredecessors.insert(CandidateLoad->getParent());
+  BBPostNumbers.insert({CandidateLoad->getParent(), 0});
+  while (BBPredecessors.size()) {
+    BasicBlock *BBPred = BBPredecessors[0];
+    BBPredecessors.remove(BBPred);
+    int Depth = BBPostNumbers[BBPred];
+    // Check all predecessors and add their branch instr into dependent list.
+    for (BasicBlock *Predecessor : predecessors(BBPred)) {
+      if (LoopPreheader != Predecessor && !DT->dominates(BBPred, Predecessor)) {
+        if (BBPostNumbers.end() == BBPostNumbers.find(Predecessor)) {
+          BBPostNumbers.insert({Predecessor, Depth - 1});
+          BBPredecessors.insert(Predecessor);
+          // Check that each terminator is a branch instr.
+          if (Predecessor->getTerminator() == nullptr ||
+              !isa<BranchInst>(Predecessor->getTerminator()))
+            return false;
+          // Add the branch instruction as a dependent instr.
+          if (Visited.insert(Predecessor->getTerminator()).second)
+            DependentInsts.insert(Predecessor->getTerminator());
+        }
+      }
+    }
+  }
+
+  // The loop preheader is the last dependent block.
+  BBPostNumbers.insert({LoopPreheader, -static_cast<int>(BBPostNumbers.size())});
+
+  // Update DependentInsts to include the instructions that each branch
+  // instruction depends on.
+  for (unsigned j = 0; j < DependentInsts.size(); j++) {
+    Instruction *Inst = DependentInsts[j];
+    if (Inst == nullptr)
+      return false;
+
+    if (auto *PN = dyn_cast<PHINode>(Inst)) {
+      if (!IndirectLoadDependents.count(Inst)) {
+        if (0 > PN->getBasicBlockIndex(LoopPreheader))
+          return false;
+      }
+    } else if (auto *BranchInstr = dyn_cast<BranchInst>(Inst)) {
+      // Add the condition of the branch instruction into the dependent insts.
+      if (BranchInstr->isConditional()) {
+        auto *BranchCond = BranchInstr->getCondition();
+        if (BranchCond == nullptr)
+          return false;
+        if (Instruction *BranchCondInst = dyn_cast<Instruction>(BranchCond))
+          if (Visited.insert(BranchCondInst).second)
+            DependentInsts.insert(BranchCondInst);
+      } else if (BranchInstr->getSuccessor(0)->isEHPad())
+        return false;
+    } else if (isa<InvokeInst>(Inst)) {
+      return false;
+    } else {
+      if (CallInst *Call = dyn_cast<CallInst>(Inst))
+        if (Inst->mayReadOrWriteMemory() || !Call->doesNotThrow())
+          return false;
+      // Traverse instruction operands and add dependent instructions until we
+      // reach a function argument, a constant, or a value outside the current
+      // loop.
+      for (unsigned i = 0; i < Inst->getNumOperands(); i++) {
+        Value *Operand = Inst->getOperand(i);
+        if (Operand == nullptr)
+          return false;
+        if (isa<Constant>(Operand) || isa<Argument>(Operand))
+          continue;
+        if (Instruction *I = dyn_cast<Instruction>(Operand))
+          if (L->contains(I) || I->getParent() == LoopPreheader)
+            if (Visited.insert(I).second)
+              DependentInsts.insert(I);
+      }
+    }
+  }
+
+  DT->updateDFSNumbers();
+  SortedDependentInsts.reserve(DependentInsts.size());
+  for (auto I : DependentInsts) {
+    auto *NodeI = DT->getNode(I->getParent());
+    SortedDependentInsts.push_back({I, NodeI->getDFSNumIn()});
+  }
+  llvm::sort(SortedDependentInsts, [&](auto const &LHS, auto const &RHS) {
+    if (get<0>(RHS)->getParent() == get<0>(LHS)->getParent())
+      return get<0>(RHS)->comesBefore(get<0>(LHS));
+    if (BBPostNumbers.end() == BBPostNumbers.find(get<0>(LHS)->getParent()) ||
+        BBPostNumbers.end() == BBPostNumbers.find(get<0>(RHS)->getParent()))
+      return get<1>(RHS) < get<1>(LHS);
+    if (BBPostNumbers[get<0>(LHS)->getParent()] ==
+        BBPostNumbers[get<0>(RHS)->getParent()])
+      return get<1>(RHS) < get<1>(LHS);
+    return BBPostNumbers[get<0>(LHS)->getParent()] >
+           BBPostNumbers[get<0>(RHS)->getParent()];
+  });
+
+  // Check that all the visited basic blocks end with a branch instruction.
+  int BBDepth = 0;
+  for (auto I : SortedDependentInsts) {
+    if (BBDepth && get<1>(I) != BBDepth)
+      if (!isa<BranchInst>(get<0>(I)) &&
+          BBPostNumbers.end() != BBPostNumbers.find(get<0>(I)->getParent()))
+        return false;
+    BBDepth = get<1>(I);
+  }
+
+  if (!isa<LoadInst>(get<0>(SortedDependentInsts[0])))
+    return false;
+
+  if (!L->contains(get<0>(SortedDependentInsts[0])))
+    return false;
+
+  if (!isa<PHINode>(
+          get<0>(SortedDependentInsts[SortedDependentInsts.size() - 1])))
+    return false;
+  else if (auto *PN = dyn_cast<PHINode>(
+               get<0>(SortedDependentInsts[SortedDependentInsts.size() - 1])))
+    if (!L->contains(PN))
+      return false;
+
+  return true;
+}
+
+/// Check that every new BasicBlock has a terminator instruction. If one does
+/// not, the clone is considered incomplete and all new BasicBlocks are
+/// deleted.
+bool LoopDataPrefetch::verifyNewBasicBlocks(
+    Loop *L, BasicBlock *&ClonedEndBB,
+    SmallVector<BasicBlock *> &ClonedBasicBlocks,
+    SmallVector<Instruction *> &ClonedInstructions) {
+
+  for (BasicBlock *BB : ClonedBasicBlocks) {
+    if (BB->getTerminator() == nullptr) {
+      for (unsigned j = 0; j < ClonedInstructions.size(); j++) {
+        auto *I = ClonedInstructions[j];
+        I->replaceAllUsesWith(UndefValue::get(I->getType()));
+        I->eraseFromParent();
+      }
+      for (unsigned j = 0; j < ClonedBasicBlocks.size(); j++) {
+        auto *DelBBlock = ClonedBasicBlocks[j];
+        L->removeBlockFromLoop(DelBBlock);
+        DelBBlock->eraseFromParent();
+      }
+      L->removeBlockFromLoop(ClonedEndBB);
+      ClonedEndBB->eraseFromParent();
+      return false;
+    }
+  }
+  return true;
+}
+
+// Redirect all branches that target the loop preheader to the new cloned root
+// BasicBlock, and record the edge changes in the DominatorTree.
+void LoopDataPrefetch::updateDominatorTree(DomTreeUpdater &DTU,
+                                           BasicBlock *&LoopPreheader,
+                                           BasicBlock *&ClonedRootBB) {
+  for (BasicBlock *PredecessorBB : predecessors(LoopPreheader)) {
+    auto *BrInstr = PredecessorBB->getTerminator();
+    for (unsigned i = 0, NumSuccessor = BrInstr->getNumSuccessors();
+         i < NumSuccessor; i++) {
+      auto *OldSuccessor = BrInstr->getSuccessor(i);
+      if (OldSuccessor == LoopPreheader) {
+        DTU.applyUpdates(
+            {{DominatorTree::Delete, PredecessorBB, LoopPreheader}});
+        BrInstr->setSuccessor(i, ClonedRootBB);
+        DTU.applyUpdates(
+            {{DominatorTree::Insert, PredecessorBB, ClonedRootBB}});
+      }
+    }
+  }
+}
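Note: updateDominatorTree above and addNewBasicBlocksInDT below follow the same DomTreeUpdater discipline: every setSuccessor is paired with the corresponding CFG-edge update. A minimal sketch of the redirection pattern (the free-standing name is assumed for illustration):

    static void redirectEdge(llvm::DomTreeUpdater &DTU, llvm::BranchInst *Br,
                             unsigned SuccIdx, llvm::BasicBlock *NewSucc) {
      llvm::BasicBlock *OldSucc = Br->getSuccessor(SuccIdx);
      // Announce the removal of the old edge before rewriting it ...
      DTU.applyUpdates({{llvm::DominatorTree::Delete, Br->getParent(), OldSucc}});
      Br->setSuccessor(SuccIdx, NewSucc);
      // ... and record the new edge afterwards.
      DTU.applyUpdates({{llvm::DominatorTree::Insert, Br->getParent(), NewSucc}});
    }

Because the updater is constructed with UpdateStrategy::Lazy (see insertPrefetcherInOuterloopForIndirectLoad below), updates are queued and batched, so a transiently inconsistent state between the two calls is fine. The cloned blocks handled by addNewBasicBlocksInDT only ever gain edges, which is why that helper issues Insert updates alone.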
+// Update the DominatorTree for all cloned BasicBlocks.
+void LoopDataPrefetch::addNewBasicBlocksInDT(
+    DomTreeUpdater &DTU, BasicBlock *&ClonedEndBB,
+    SmallPtrSet<Instruction *, 8> &ClonedBranchInstructions,
+    DenseMap<BasicBlock *, BasicBlock *> &BBTransformMap,
+    SmallPtrSet<CallInst *, 8> &PrefetchCalls) {
+
+  for (auto *I : ClonedBranchInstructions) {
+    auto *BrInstr = dyn_cast<BranchInst>(I);
+    for (unsigned i = 0, NumSuccessor = BrInstr->getNumSuccessors();
+         i < NumSuccessor; i++) {
+      auto *OldSuccessor = BrInstr->getSuccessor(i);
+      if (BBTransformMap.end() != BBTransformMap.find(OldSuccessor)) {
+        auto *NewSuccessor = BBTransformMap[OldSuccessor];
+        BrInstr->setSuccessor(i, NewSuccessor);
+        DTU.applyUpdates(
+            {{DominatorTree::Insert, BrInstr->getParent(), NewSuccessor}});
+      } else {
+        BrInstr->setSuccessor(i, ClonedEndBB);
+        DTU.applyUpdates(
+            {{DominatorTree::Insert, BrInstr->getParent(), ClonedEndBB}});
+      }
+    }
+  }
+
+  for (CallInst *PrefetchCall : PrefetchCalls) {
+    if (!PrefetchCall->getParent()->getTerminator()) {
+      DTU.applyUpdates(
+          {{DominatorTree::Insert, PrefetchCall->getParent(), ClonedEndBB}});
+    }
+  }
+}
+
+// Move all loop-invariant instructions in DependentInsts to the preheader of
+// the loop.
+void LoopDataPrefetch::moveInvariantOperandsToPreheader(
+    Loop *L, const SmallVector<Instruction *> &DependentInsts) {
+  Instruction *EndPoint = nullptr;
+  auto *InsertPoint = L->getLoopPreheader();
+  if (!InsertPoint)
+    return;
+  auto *BBTerminator = InsertPoint->getTerminator();
+
+  for (unsigned j = 0; j < DependentInsts.size(); j++) {
+    auto *I = DependentInsts[j];
+    if (I->getOpcode() != Instruction::Br)
+      if (L->hasLoopInvariantOperands(I)) {
+        auto *InvariantInstr = I->clone();
+        InvariantInstr->insertInto(InsertPoint, InsertPoint->end());
+        EndPoint = InvariantInstr;
+        I->replaceAllUsesWith(InvariantInstr);
+        I->eraseFromParent();
+      }
+  }
+  // Keep the preheader's terminator at the end of the block.
+  if (EndPoint)
+    BBTerminator->moveAfter(EndPoint);
+}
+
 /// Helper function to determine whether the given load is in
 /// CandidateMemoryLoads. If Yes, add the candidate's depending instr to the
 /// list.
@@ -851,6 +1166,322 @@ bool LoopDataPrefetch::canDoIndirectPrefetch(Loop *L) {
   return true;
 }
 
+/// Check if the load depends on CRC hash functions.
+bool LoopDataPrefetch::isCrcHashDataAccess(Instruction *I,
+                                           Instruction *PrefetchingLoad) {
+  if (llvm::IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+    // If CRC functions are used for the offset calculation, the offset will
+    // be random. To avoid cache misses, a data prefetch is needed.
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::aarch64_crc32b:
+    case Intrinsic::aarch64_crc32cb:
+    case Intrinsic::aarch64_crc32h:
+    case Intrinsic::aarch64_crc32ch:
+    case Intrinsic::aarch64_crc32w:
+    case Intrinsic::aarch64_crc32cw:
+    case Intrinsic::aarch64_crc32x:
+    case Intrinsic::aarch64_crc32cx: {
+      // Check that the candidate load's index is incremented by 1.
+      if (auto *LI = dyn_cast<LoadInst>(PrefetchingLoad)) {
+        if (auto *GEPI = dyn_cast<GetElementPtrInst>(LI->getPointerOperand())) {
+          // The data access will be consecutive if the GEP has one index.
+          if (GEPI->getNumOperands() > 2)
+            return false;
+          auto *PtrIndices = dyn_cast<Instruction>(GEPI->getOperand(1));
+          if (!PtrIndices || isa<PHINode>(PtrIndices))
+            return true;
+          for (auto &U : PtrIndices->uses())
+            if (auto *PN = dyn_cast<PHINode>(U.getUser()))
+              if (getStep(PN, SE) <= 1)
+                return true;
+        }
+      }
+      break;
+    }
+    }
+  return false;
+}
+
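Note: isCrcHashDataAccess is the heart of the random-access heuristic. When the index of a load is produced by a CRC32 hash, consecutive iterations touch effectively random cache lines and a stride-based hardware prefetcher cannot keep up, so an explicit software prefetch pays off. A hypothetical source pattern of the kind this targets, mirroring the crc32w tests below (illustrative only):

    #include <arm_acle.h>
    unsigned sumHashed(const unsigned *Key, const unsigned *Table, unsigned N) {
      unsigned Sum = 0;
      for (unsigned I = 0; I < N; ++I) {
        // Assumed to lower to @llvm.aarch64.crc32w, as in the tests below.
        unsigned H = __crc32w(Key[I], ~0u) & 255;
        Sum += Table[H]; // random-looking index: the prefetch target
      }
      return Sum;
    }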
+/// Check the indirect loads inside the inner loop: if such a load is derived
+/// from an induction variable of the outer loop, insert the prefetch
+/// instruction in the outer loop. The clone placed in the outer loop keeps
+/// the same CFG structure as the inner loop. The prefetch is inserted for
+/// the last indirect load, not for the intermediate loads.
+bool LoopDataPrefetch::insertPrefetcherInOuterloopForIndirectLoad(
+    Loop *L, unsigned Idx, Value *NumIterations,
+    SmallVector<Instruction *> &CandidateMemoryLoads,
+    SmallSetVector<Instruction *, 8> &DependentInsts,
+    ValueMap<PHINode *, Value *> &AuxIndBounds,
+    SmallVectorImpl<DenseMap<Value *, Value *>> &Transforms,
+    unsigned ItersAhead) {
+  Instruction *TargetIndirectLoad = CandidateMemoryLoads[Idx];
+  IRBuilder<> Builder(TargetIndirectLoad);
+  Module *M = TargetIndirectLoad->getModule();
+  auto *ParentLoop = L->getParentLoop();
+
+  if (!ParentLoop)
+    return false;
+
+  SmallVector<BasicBlock *> ExitBlocks;
+  L->getUniqueExitBlocks(ExitBlocks);
+  bool HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) {
+    return isa<CatchSwitchInst>(Exit->getTerminator());
+  });
+  if (HasCatchSwitch)
+    return false;
+
+  SmallVector<BasicBlock *> NewBBlocks;
+  SmallVector<Instruction *> AllDependentInsts;
+  SmallPtrSet<Instruction *, 8> Visited;
+  SmallPtrSet<Instruction *, 8> IndirectLoadDependents;
+  SmallPtrSet<Instruction *, 8> BranchInsts;
+  SmallPtrSet<CallInst *, 8> InsertedPrefetchCalls;
+  DenseMap<BasicBlock *, BasicBlock *> BBTransforms;
+
+  BasicBlock *NewRootBB = nullptr;
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
+  if (!isa<PHINode>(DependentInsts[DependentInsts.size() - 1])) {
+    return false;
+  } else {
+    if (auto *PN =
+            dyn_cast<PHINode>(DependentInsts[DependentInsts.size() - 1])) {
+      if (!ParentLoop->contains(PN)) {
+        return false;
+      }
+      if (!getStep(PN, SE))
+        return false;
+      if (isa<PointerType>(PN->getType()))
+        return false;
+    }
+  }
+
+  ExitBlocks.clear();
+  ParentLoop->getUniqueExitBlocks(ExitBlocks);
+  HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) {
+    return isa<CatchSwitchInst>(Exit->getTerminator());
+  });
+  if (HasCatchSwitch)
+    return false;
+
+  Instruction *CandidateLoad = DependentInsts[0];
+  BasicBlock *LoopPreheader = L->getLoopPreheader();
+
+  // Only consider CRC-hashed random data accesses.
+  bool isRandomAccess = false;
+  for (auto *I : DependentInsts) {
+    IndirectLoadDependents.insert(I);
+    Visited.insert(I);
+    isRandomAccess |= isCrcHashDataAccess(I, CandidateLoad);
+  }
+  if (!isRandomAccess)
+    return false;
+
+  if (!LoopPreheader || !ParentLoop->getLoopPreheader())
+    return false;
+
+  if (LoopPreheader->getTerminator() == nullptr ||
+      !isa<BranchInst>(LoopPreheader->getTerminator()))
+    return false;
+
+  // Sort the dependent instructions based on their post-order number and on
+  // instruction ordering within the same block.
+  SmallVector<std::pair<Instruction *, unsigned>, 8> SortedDependentInsts;
+
+  if (!sortAndVerifyDependentInstructions(ParentLoop, Visited, DependentInsts,
+                                          IndirectLoadDependents,
+                                          SortedDependentInsts))
+    return false;
+
+  auto cloneInstructionWithBB = [&](llvm::Instruction *Inst,
+                                    llvm::Instruction *NewInstr = nullptr) {
+    Instruction *TransformedInstr = NewInstr;
+    if (TransformedInstr == nullptr)
+      TransformedInstr = Inst->clone();
+
+    BasicBlock *NewBlock;
+    BasicBlock *OldBlock = Inst->getParent();
+    // Check if the block has been created before.
+    if (BBTransforms.count(OldBlock)) {
+      NewBlock = BBTransforms[OldBlock];
+    } else {
+      NewBlock = BasicBlock::Create(OldBlock->getContext(),
+                                    "prefetch." + OldBlock->getName());
+      NewBlock->insertInto(OldBlock->getParent(), LoopPreheader);
+      if (NewRootBB == nullptr)
+        NewRootBB = NewBlock;
+      if (!ParentLoop->contains(NewBlock))
+        ParentLoop->addBasicBlockToLoop(NewBlock, *LI);
+      BBTransforms.insert(
+          std::pair<BasicBlock *, BasicBlock *>(OldBlock, NewBlock));
+      NewBBlocks.push_back(NewBlock);
+    }
+    TransformedInstr->insertInto(NewBlock, NewBlock->end());
+    if (NewInstr == nullptr) {
+      for (unsigned i = 0; i < TransformedInstr->getNumOperands(); i++) {
+        Value *Operand = TransformedInstr->getOperand(i);
+        if (Transforms[0].count(Operand))
+          TransformedInstr->replaceUsesOfWith(Operand, Transforms[0][Operand]);
+      }
+    }
+    Transforms[0].insert(std::pair<Value *, Value *>(Inst, TransformedInstr));
+    AllDependentInsts.push_back(TransformedInstr);
+    return TransformedInstr;
+  };
+
+  // We create blocks and instructions in a top-down manner, i.e. from the PHI
+  // node in the parent loop down to the target indirect load.
+  bool PositiveStep = true;
+  int64_t Step;
+  while (!SortedDependentInsts.empty()) {
+    Instruction *DependentInst = get<0>(SortedDependentInsts.pop_back_val());
+    Instruction *Inst = dyn_cast<Instruction>(DependentInst);
+
+    // For target load related instructions.
+    switch (Inst->getOpcode()) {
+    case Instruction::PHI: {
+      // For a non-root phi node, replace the phi node with its incoming
+      // value.
+      if (!IndirectLoadDependents.count(Inst)) {
+        if (Transforms[0].count(Inst))
+          continue;
+        auto *PN = dyn_cast<PHINode>(Inst);
+        Transforms[0].insert(std::pair<Value *, Value *>(
+            Inst, PN->getIncomingValueForBlock(LoopPreheader)));
+        break;
+      }
+      // Replace the root phi node with the following value:
+      //   select((phi + step) < bound, (phi + step), bound)
+      // Get the constant step for the induction phi so we can use it to
+      // calculate how much we should increase the induction for prefetching.
+      PHINode *PN = dyn_cast<PHINode>(Inst);
+      Step = getStep(PN, SE);
+      PositiveStep = isPositiveStep(PN, SE);
+      Type *InstType = getStepTypeFromPHINode(PN, SE);
+      if (!PositiveStep)
+        Step = -Step;
+
+      // Make sure the phi node is i64 or i32.
+      if (!InstType->isIntegerTy(64) && !InstType->isIntegerTy(32))
+        return false;
+
+      // Create the bound for this PHI if needed:
+      if (!AuxIndBounds.count(PN) &&
+          !getAuxIndVarBound(ParentLoop, PN, NumIterations, AuxIndBounds))
+        return false;
+
+      // Insert the new instruction after all PHI nodes.
+      auto InsertionPoint = Inst;
+      if (auto FirstNonPHI = Inst->getParent()->getFirstNonPHI())
+        InsertionPoint = FirstNonPHI->getPrevNode();
+
+      if (Transforms.size() < 1)
+        Transforms.push_back(DenseMap<Value *, Value *>());
+      else if (Transforms[0].count(Inst))
+        continue;
+
+      // FullStep is the initial offset for the new value, taking into account
+      // both Step and the number of iterations ahead to prefetch. If an
+      // indirect prefetch iteration count is supplied, we directly use the
+      // indirect-prefetch-iters-ahead value.
+      int64_t FullStep = PrefetchIterationsAhead
+                             ? PrefetchIterationsAhead * Step
+                             : ItersAhead * Step;
+
+      Instruction::BinaryOps BiOp =
+          PositiveStep ? Instruction::Add : Instruction::Sub;
+      auto *NewOp = Builder.CreateBinOp(
+          BiOp, Inst, ConstantInt::get(Inst->getType(), FullStep));
+      if (auto NewOpInstr = dyn_cast<Instruction>(NewOp)) {
+        NewOpInstr->moveAfter(InsertionPoint);
+        InsertionPoint = NewOpInstr;
+        AllDependentInsts.push_back(NewOpInstr);
+      }
+
+      Value *NewCmp = Builder.CreateICmp(
+          PositiveStep ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, NewOp,
+          AuxIndBounds[cast<PHINode>(Inst)]);
+      Value *NewSelect = Builder.CreateSelect(NewCmp, NewOp, AuxIndBounds[PN]);
+      Transforms[0].insert(std::pair<Value *, Value *>(Inst, NewSelect));
+
+      if (auto NewCmpInstr = dyn_cast<Instruction>(NewCmp)) {
+        NewCmpInstr->moveAfter(InsertionPoint);
+        InsertionPoint = NewCmpInstr;
+        AllDependentInsts.push_back(NewCmpInstr);
+      }
+      if (auto NewSelectInstr = dyn_cast<Instruction>(NewSelect)) {
+        NewSelectInstr->moveAfter(InsertionPoint);
+        InsertionPoint = NewSelectInstr;
+        AllDependentInsts.push_back(NewSelectInstr);
+      }
+      break;
+    }
+    case Instruction::Load: {
+      LoadInst *LoadI = dyn_cast<LoadInst>(Inst);
+      Value *LoadPtr = LoadI->getPointerOperand();
+      auto GeneratePrefetcher = [&](llvm::Value *PrefetchPtr) {
+        Function *PrefetchFunc = Intrinsic::getDeclaration(
+            M, Intrinsic::prefetch, LoadPtr->getType());
+        Type *I32Ty =
+            Type::getInt32Ty(CandidateLoad->getParent()->getContext());
+        Value *PrefetchArg[] = {PrefetchPtr, ConstantInt::get(I32Ty, 0),
+                                ConstantInt::get(I32Ty, 3),
+                                ConstantInt::get(I32Ty, 1)};
+        CallInst *PrefetchCall = CallInst::Create(PrefetchFunc, PrefetchArg);
+        return PrefetchCall;
+      };
+
+      // We clone the intermediate loads but prefetch the target load.
+      if (!SortedDependentInsts.empty()) {
+        if (Transforms[0].count(LoadI))
+          continue;
+        cloneInstructionWithBB(LoadI);
+      } else {
+        CallInst *PrefetchCall = GeneratePrefetcher(Transforms[0][LoadPtr]);
+        cloneInstructionWithBB(LoadI, PrefetchCall);
+        InsertedPrefetchCalls.insert(PrefetchCall);
+      }
+      break;
+    }
+    case Instruction::Br: {
+      BranchInsts.insert(cloneInstructionWithBB(Inst));
+      break;
+    }
+    default: {
+      // For other types of instructions, we make a clone of the instruction
+      // and replace operands that we already transformed before.
+      if (Transforms[0].count(Inst))
+        continue;
+      cloneInstructionWithBB(Inst);
+      break;
+    }
+    }
+  }
+
+  BasicBlock *EndBlock =
+      BasicBlock::Create(LoopPreheader->getContext(), "prefetch.end");
+  ParentLoop->addBasicBlockToLoop(EndBlock, *LI);
+  EndBlock->insertInto(LoopPreheader->getParent(), LoopPreheader);
+
+  // Create a branch from each prefetch call block to the end block.
+  for (CallInst *PrefetchCall : InsertedPrefetchCalls)
+    if (!PrefetchCall->getParent()->getTerminator()) {
+      AllDependentInsts.push_back(
+          BranchInst::Create(EndBlock, PrefetchCall->getParent()));
+    }
+
+  if (!verifyNewBasicBlocks(ParentLoop, EndBlock, NewBBlocks,
+                            AllDependentInsts))
+    return false;
+
+  updateDominatorTree(DTU, LoopPreheader, NewRootBB);
+
+  AllDependentInsts.push_back(BranchInst::Create(LoopPreheader, EndBlock));
+
+  addNewBasicBlocksInDT(DTU, EndBlock, BranchInsts, BBTransforms,
+                        InsertedPrefetchCalls);
+  moveInvariantOperandsToPreheader(ParentLoop, AllDependentInsts);
+
+  NumOuterLoopPrefetches++;
+  return true;
+}
+
 PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
                                             FunctionAnalysisManager &AM) {
   AliasAnalysis *AA = &AM.getResult<AAManager>(F);
@@ -914,6 +1545,10 @@ bool LoopDataPrefetch::run() {
   }
 
   bool MadeChange = false;
+  if (RandomAccessPrefetchOnly) {
+    OuterLoopPrefetch = true;
+    IndirectLoadPrefetch = true;
+  }
 
   for (Loop *I : *LI)
     for (Loop *L : depth_first(I))
@@ -967,10 +1602,18 @@ struct Prefetch {
 bool LoopDataPrefetch::runOnLoop(Loop *L) {
   bool MadeChange = false;
 
-  // Only prefetch in the inner-most loop
-  if (!L->isInnermost())
+  if (L->getLoopDepth() < PrefetchLoopDepth)
     return MadeChange;
 
+  bool IsInnerMost = true;
+  // Prefetch in an outer loop if requested.
+  if (!L->isInnermost()) {
+    if (OuterLoopPrefetch)
+      IsInnerMost = false;
+    else
+      return MadeChange;
+  }
+
   SmallPtrSet<const Value *, 32> EphValues;
   CodeMetrics::collectEphemeralValues(L, AC, EphValues);
@@ -1016,52 +1659,73 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
   unsigned NumMemAccesses = 0;
   unsigned NumStridedMemAccesses = 0;
   SmallVector<Prefetch, 16> Prefetches;
-  for (const auto BB : L->blocks())
-    for (auto &I : *BB) {
-      Value *PtrValue;
-      Instruction *MemI;
-
-      if (LoadInst *LMemI = dyn_cast<LoadInst>(&I)) {
-        MemI = LMemI;
-        PtrValue = LMemI->getPointerOperand();
-      } else if (StoreInst *SMemI = dyn_cast<StoreInst>(&I)) {
-        if (!doPrefetchWrites()) continue;
-        MemI = SMemI;
-        PtrValue = SMemI->getPointerOperand();
-      } else continue;
-
-      unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
-      if (!TTI->shouldPrefetchAddressSpace(PtrAddrSpace))
-        continue;
-      NumMemAccesses++;
-      if (L->isLoopInvariant(PtrValue))
-        continue;
-
-      const SCEV *LSCEV = SE->getSCEV(PtrValue);
-      const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
-      if (!LSCEVAddRec)
-        continue;
-      NumStridedMemAccesses++;
-
-      // We don't want to double prefetch individual cache lines. If this
-      // access is known to be within one cache line of some other one that
-      // has already been prefetched, then don't prefetch this one as well.
-      bool DupPref = false;
-      for (auto &Pref : Prefetches) {
-        const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec);
-        if (const SCEVConstant *ConstPtrDiff =
-                dyn_cast<SCEVConstant>(PtrDiff)) {
-          int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
-          if (PD < (int64_t) TTI->getCacheLineSize()) {
-            Pref.addInstruction(MemI, DT, PD);
-            DupPref = true;
-            break;
-          }
-        }
-      }
-      if (!DupPref)
-        Prefetches.push_back(Prefetch(LSCEVAddRec, MemI));
-    }
+  if (!DisableDirectLoadPrefetch) {
+    for (const auto BB : L->blocks()) {
+      // If this is not the innermost loop, we avoid prefetching in subloops.
+      for (auto &I : *BB) {
+        Value *PtrValue = nullptr;
+        Instruction *MemI;
+
+        if (LoadInst *LMemI = dyn_cast<LoadInst>(&I)) {
+          MemI = LMemI;
+          PtrValue = LMemI->getPointerOperand();
+        } else if (StoreInst *SMemI = dyn_cast<StoreInst>(&I)) {
+          if (!doPrefetchWrites()) continue;
+          MemI = SMemI;
+          PtrValue = SMemI->getPointerOperand();
+        } else continue;
+
+        if (!PtrValue)
+          continue;
+        if (getPrefetchDistance() == 0)
+          continue;
+
+        unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
+        if (!TTI->shouldPrefetchAddressSpace(PtrAddrSpace))
+          continue;
+        NumMemAccesses++;
+        if (L->isLoopInvariant(PtrValue))
+          continue;
+
+        const SCEV *LSCEV = SE->getSCEV(PtrValue);
+        const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+        if (!LSCEVAddRec)
+          continue;
+        NumStridedMemAccesses++;
+
+        // For outer loops, we only prefetch memory instructions whose stride
+        // depends on the current loop.
+        if (!IsInnerMost && LSCEVAddRec->getLoop() != L)
+          continue;
+
+        // We don't want to double prefetch individual cache lines. If this
+        // access is known to be within one cache line of some other one that
+        // has already been prefetched, then don't prefetch this one as well.
+        bool DupPref = false;
+        for (auto &Pref : Prefetches) {
+          const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec);
+          if (const SCEVConstant *ConstPtrDiff =
+                  dyn_cast<SCEVConstant>(PtrDiff)) {
+            int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
+            // Use the CachelineSize value from the compiler option if given.
+            int64_t CacheLineSize = CachelineSize.getNumOccurrences()
+                                        ? CachelineSize
+                                        : TTI->getCacheLineSize();
+            // If the TTI cache line size is zero, fall back to the default
+            // CachelineSize value.
+            CacheLineSize = CacheLineSize ? CacheLineSize : CachelineSize;
+            if (PD < (int64_t)CacheLineSize) {
+              Pref.addInstruction(MemI, DT, PD);
+              DupPref = true;
+              break;
+            }
+          }
+        }
+        if (!DupPref)
+          Prefetches.push_back(Prefetch(LSCEVAddRec, MemI));
+      }
+    }
+  }
 
   unsigned TargetMinStride =
       getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
@@ -1079,15 +1743,17 @@
   for (auto &P : Prefetches) {
     // Check if the stride of the accesses is large enough to warrant a
-    // prefetch.
+    // prefetch. If MinPrefetchStride <= 1, any stride is large enough and
+    // the check always passes.
     if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride))
       continue;
 
     BasicBlock *BB = P.InsertPt->getParent();
     SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr");
-    const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr(
-        SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead),
-        P.LSCEVAddRec->getStepRecurrence(*SE)));
+    const SCEV *NextLSCEV = SE->getAddExpr(
+        P.LSCEVAddRec,
+        SE->getMulExpr(SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead),
+                       P.LSCEVAddRec->getStepRecurrence(*SE)));
     if (!SCEVE.isSafeToExpand(NextLSCEV))
       continue;
 
@@ -1100,11 +1766,10 @@
     Type *I32 = Type::getInt32Ty(BB->getContext());
     Function *PrefetchFunc = Intrinsic::getDeclaration(
         M, Intrinsic::prefetch, PrefPtrValue->getType());
-    Builder.CreateCall(
-        PrefetchFunc,
-        {PrefPtrValue,
-         ConstantInt::get(I32, P.Writes),
-         ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
+    Builder.CreateCall(PrefetchFunc,
+                       {PrefPtrValue, ConstantInt::get(I32, P.Writes),
+                        ConstantInt::get(I32, IsInnerMost ? 3 : 0),
+                        ConstantInt::get(I32, 1)});
     ++NumPrefetches;
     LLVM_DEBUG(dbgs() << "  Access: "
                       << *P.MemI->getOperand(isa<LoadInst>(P.MemI) ? 0 : 1)
@@ -1120,9 +1785,6 @@
   if (!IndirectLoadPrefetch)
     return MadeChange;
 
-  if (!canDoIndirectPrefetch(L))
-    return MadeChange;
-
   // List of valid phi nodes that indirect loads can depend on.
   SmallPtrSet<PHINode *, 8> LoopAuxIndPHINodes;
   // Map of valid phi node to its bound value in the preheader.
@@ -1134,28 +1796,51 @@
   // List of store instr in the loop.
   SmallVector<Value *> LoopStorePtrs;
 
-  // Get loop induction and auxilary induction phis. (Thye will be candidates
-  // for phi node matching during constrution of the candidate instructions.)
+  // Get loop induction and auxiliary induction phis. (They will be candidates
+  // for phi node matching during construction of the candidate instructions.)
   // And we use the phi nodes to determine the loop upperbound.
   Value *NumIterations =
       getLoopIterationNumber(L, LoopAuxIndPHINodes, AuxIndBounds);
-  if (NumIterations == nullptr)
+  bool PrefetchInOuterLoop = false;
+  if (NumIterations == nullptr) {
+    if (!L->isOutermost()) {
+      NumIterations = getLoopIterationNumber(L->getParentLoop(),
+                                             LoopAuxIndPHINodes, AuxIndBounds);
+      if (NumIterations == nullptr)
+        return MadeChange;
+      PrefetchInOuterLoop = true;
+    } else
+      return MadeChange;
+  }
+
+  if (!RandomAccessPrefetchOnly && !PrefetchInOuterLoop &&
+      !canDoIndirectPrefetch(L)) {
+    cleanLoopIterationNumber(NumIterations);
     return MadeChange;
-  else
-    MadeChange = true;
+  }
 
   // Find candidate auxiliary induction variables which could be a dependent for
   // the indirect load.
-  for (auto &I : *(L->getHeader()))
+  BasicBlock *Header = nullptr;
+  Loop *CurrentLoop = L;
+  if (PrefetchInOuterLoop) {
+    Header = L->getParentLoop()->getHeader();
+    CurrentLoop = L->getParentLoop();
+  } else {
+    Header = L->getHeader();
+  }
+
+  for (auto &I : *Header)
     if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
       InductionDescriptor IndDesc;
-      if (InductionDescriptor::isInductionPHI(PHI, L, SE, IndDesc) &&
-          L->getInductionVariable(*SE) != PHI) {
-        canGetAuxIndVarBound(L, PHI, LoopAuxIndPHINodes);
+      if (InductionDescriptor::isInductionPHI(PHI, CurrentLoop, SE, IndDesc) &&
+          CurrentLoop->getInductionVariable(*SE) != PHI) {
+        if (canGetAuxIndVarBound(CurrentLoop, PHI, LoopAuxIndPHINodes))
+          LoopAuxIndPHINodes.insert(PHI);
       }
     }
 
-  // WIll search for candidates in the parent loop of the current inner most
+  // Will search for candidates in the parent loop of the current inner most
   // loop. This will capture more opportunities in the outer loop.
   SmallVector<BasicBlock *> BBList;
   for (auto &BB : L->blocks())
@@ -1169,7 +1854,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
   }
 
   // Iterate through the loop and keep track of the memory loads and the
-  // instruction list they dependd on.
+  // instruction list they depend on.
   for (const auto BB : BBList) {
     for (auto &I : *BB)
       if (LoadInst *LoadI = dyn_cast<LoadInst>(&I)) {
@@ -1179,7 +1864,8 @@
         InstSet.insert(LoadI);
         if (findCandidateMemoryLoads(LoadI, InstList, InstSet,
                                      CandidateMemoryLoads, DependentInstList,
-                                     LoopAuxIndPHINodes, L)) {
+                                     LoopAuxIndPHINodes, PrefetchInOuterLoop,
+                                     L)) {
           LLVM_DEBUG(dbgs() << "Found load candidate " << *LoadI << "\n");
           CandidateMemoryLoads.push_back(LoadI);
           DependentInstList.push_back(InstList);
@@ -1191,7 +1877,7 @@
   }
 
   // Keep track of previously transformed instrs for offset load and target
-  // loads so we can resuse them.
+  // loads so we can reuse them.
   SmallVector<DenseMap<Value *, Value *>> Transforms;
   for (unsigned i = 0; i < CandidateMemoryLoads.size(); i++) {
     SmallSetVector<Instruction *, 8> DependentInsts = DependentInstList[i];
@@ -1215,11 +1901,18 @@
       }
     }
 
-    // Prefetch all indirect load without conflict to the offset load.
+    // Prefetch all indirect loads without conflict to the offset load.
     if (NumLoads == IndirectionLevel && NoConflict) {
-      MadeChange |= insertPrefetcherForIndirectLoad(
-          L, i, NumIterations, CandidateMemoryLoads, DependentInsts,
-          AuxIndBounds, Transforms, ItersAhead);
+      if (PrefetchInOuterLoop) {
+        MadeChange |= insertPrefetcherInOuterloopForIndirectLoad(
+            L, i, NumIterations, CandidateMemoryLoads, DependentInsts,
+            AuxIndBounds, Transforms, ItersAhead);
+        break;
+      } else {
+        MadeChange |= insertPrefetcherForIndirectLoad(
+            L, i, NumIterations, CandidateMemoryLoads, DependentInsts,
+            AuxIndBounds, Transforms, ItersAhead);
+      }
     }
   }
diff --git a/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-crc-outerloop.ll b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-crc-outerloop.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7c7ddce91539c3c0583e962b17f0f178e8454883
--- /dev/null
+++ b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-crc-outerloop.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=unknown -passes=loop-data-prefetch --prefetch-distance=512 --random-access-prefetch-only=true -disable-direct-prefetch -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64:128-a:0:32-n32-S64"
+target triple = "armv8a-unknown-linux-gnu"
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare i32 @llvm.aarch64.crc32w(i32, i32)
+
+; Function Attrs: mustprogress nofree nosync nounwind willreturn memory(read, inaccessiblemem: none)
+define dso_local arm_aapcscc noundef i32 @_z12matchcolumnsPPiS_ii(ptr nocapture noundef readonly %A, ptr nocapture noundef readnone %key, i32 noundef %index, i32 noundef %count) local_unnamed_addr {
+; CHECK-LABEL: @_z12matchcolumnsPPiS_ii(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds ptr, ptr [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret i32 [[SUM_1_LCSSA:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[SUM_040:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SUM_1_LCSSA]], [[FOR_COND_CLEANUP4:%.*]] ]
+; CHECK-NEXT:    [[I_039:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC17:%.*]], [[FOR_COND_CLEANUP4]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[I_039]], 19
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt i32 [[TMP2]], 99
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 99
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[A]], i32 [[I_039]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[CMP336:%.*]] = icmp sgt i32 [[TMP6]], 0
+; CHECK-NEXT:    br i1 [[CMP336]], label [[PREFETCH_FOR_BODY5_PREHEADER:%.*]], label [[FOR_COND_CLEANUP4]]
+; CHECK:       prefetch.for.body5.preheader:
+; CHECK-NEXT:    br label [[PREFETCH_FOR_BODY5:%.*]]
+; CHECK:       prefetch.for.body5:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP8]], i32 -1)
+; CHECK-NEXT:    [[TMP10:%.*]] = and i32 [[TMP9]], 255
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 [[TMP10]]
+; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[TMP11]], i32 0, i32 3, i32 1)
+; CHECK-NEXT:    br label [[PREFETCH_END:%.*]]
+; CHECK:       prefetch.end:
+; CHECK-NEXT:    br label [[FOR_BODY5_PREHEADER:%.*]]
+; CHECK:       for.body5.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY5:%.*]]
+; CHECK:       for.cond.cleanup4.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP4]]
+; CHECK:       for.cond.cleanup4:
+; CHECK-NEXT:    [[SUM_1_LCSSA]] = phi i32 [ [[SUM_040]], [[FOR_BODY]] ], [ [[ADD:%.*]], [[FOR_COND_CLEANUP4_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    [[INC17]] = add nuw nsw i32 [[I_039]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC17]], 100
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK:       for.body5:
+; CHECK-NEXT:    [[J_038:%.*]] = phi i32 [ [[INC15:%.*]], [[IF_END:%.*]] ], [ 0, [[FOR_BODY5_PREHEADER]] ]
+; CHECK-NEXT:    [[SUM_137:%.*]] = phi i32 [ [[ADD]], [[IF_END]] ], [ [[SUM_040]], [[FOR_BODY5_PREHEADER]] ]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds ptr, ptr [[A]], i32 [[J_038]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load ptr, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 [[I_039]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP13]], i32 -1)
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[TMP14]], 255
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 [[AND]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    [[CMP9_NOT:%.*]] = icmp eq i32 [[TMP15]], [[INDEX:%.*]]
+; CHECK-NEXT:    br i1 [[CMP9_NOT]], label [[IF_END]], label [[DO_BODY_PREHEADER:%.*]]
+; CHECK:       do.body.preheader:
+; CHECK-NEXT:    br label [[DO_BODY:%.*]]
+; CHECK:       do.body:
+; CHECK-NEXT:    [[J_1:%.*]] = phi i32 [ [[INC10:%.*]], [[DO_BODY]] ], [ [[J_038]], [[DO_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[AKEY_0:%.*]] = phi i32 [ [[INC:%.*]], [[DO_BODY]] ], [ [[AND]], [[DO_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[AKEY_0]], 1
+; CHECK-NEXT:    [[INC10]] = add nsw i32 [[J_1]], 1
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[A]], i32 [[INC10]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 4
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[INC]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4
+; CHECK-NEXT:    [[CMP13_NOT:%.*]] = icmp eq i32 [[TMP17]], [[INDEX]]
+; CHECK-NEXT:    br i1 [[CMP13_NOT]], label [[IF_END_LOOPEXIT:%.*]], label [[DO_BODY]]
+; CHECK:       if.end.loopexit:
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[J_2:%.*]] = phi i32 [ [[J_038]], [[FOR_BODY5]] ], [ [[INC10]], [[IF_END_LOOPEXIT]] ]
+; CHECK-NEXT:    [[B_0:%.*]] = phi ptr [ [[TMP12]], [[FOR_BODY5]] ], [ [[TMP16]], [[IF_END_LOOPEXIT]] ]
+; CHECK-NEXT:    [[AKEY_1:%.*]] = phi i32 [ [[AND]], [[FOR_BODY5]] ], [ [[INC]], [[IF_END_LOOPEXIT]] ]
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[B_0]], i32 [[AKEY_1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[TMP18]], [[SUM_137]]
+; CHECK-NEXT:    [[INC15]] = add nsw i32 [[J_2]], 1
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp slt i32 [[INC15]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_BODY5]], label [[FOR_COND_CLEANUP4_LOOPEXIT]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret i32 %sum.1.lcssa
+
+for.body:
+  %sum.040 = phi i32 [ 0, %entry ], [ %sum.1.lcssa, %for.cond.cleanup4 ]
+  %i.039 = phi i32 [ 0, %entry ], [ %inc17, %for.cond.cleanup4 ]
+  %arrayidx = getelementptr inbounds ptr, ptr %A, i32 %i.039
+  %0 = load ptr, ptr %arrayidx, align 4
+  %1 = load i32, ptr %0, align 4
+  %cmp336 = icmp sgt i32 %1, 0
+  br i1 %cmp336, label %for.body5, label %for.cond.cleanup4
+
+for.cond.cleanup4:
+  %sum.1.lcssa = phi i32 [ %sum.040, %for.body ], [ %add, %if.end ]
+  %inc17 = add nuw nsw i32 %i.039, 1
+  %exitcond.not = icmp eq i32 %inc17, 100
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.body5:
+  %j.038 = phi i32 [ %inc15, %if.end ], [ 0, %for.body ]
+  %sum.137 = phi i32 [ %add, %if.end ], [ %sum.040, %for.body ]
+  %arrayidx6 = getelementptr inbounds ptr, ptr %A, i32 %j.038
+  %2 = load ptr, ptr %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds i32, ptr %2, i32 %i.039
+  %3 = load i32, ptr %arrayidx7, align 4
+  %4 = tail call i32 @llvm.aarch64.crc32w(i32 %3, i32 -1)
+  %and = and i32 %4, 255
+  %arrayidx8 = getelementptr inbounds i32, ptr %2, i32 %and
+  %5 = load i32, ptr %arrayidx8, align 4
+  %cmp9.not = icmp eq i32 %5, %index
+  br i1 %cmp9.not, label %if.end, label %do.body
+
+do.body:
+  %j.1 = phi i32 [ %inc10, %do.body ], [ %j.038, %for.body5 ]
+  %AKey.0 = phi i32 [ %inc, %do.body ], [ %and, %for.body5 ]
+  %inc = add nuw nsw i32 %AKey.0, 1
+  %inc10 = add nsw i32 %j.1, 1
+  %arrayidx11 = getelementptr inbounds ptr, ptr %A, i32 %inc10
+  %6 = load ptr, ptr %arrayidx11, align 4
+  %arrayidx12 = getelementptr inbounds i32, ptr %6, i32 %inc
+  %7 = load i32, ptr %arrayidx12, align 4
+  %cmp13.not = icmp eq i32 %7, %index
+  br i1 %cmp13.not, label %if.end, label %do.body
+
+if.end:
+  %j.2 = phi i32 [ %j.038, %for.body5 ], [ %inc10, %do.body ]
+  %B.0 = phi ptr [ %2, %for.body5 ], [ %6, %do.body ]
+  %AKey.1 = phi i32 [ %and, %for.body5 ], [ %inc, %do.body ]
+  %arrayidx14 = getelementptr inbounds i32, ptr %B.0, i32 %AKey.1
+  %8 = load i32, ptr %arrayidx14, align 4
+  %add = add nsw i32 %8, %sum.137
+  %inc15 = add nsw i32 %j.2, 1
+  %cmp3 = icmp slt i32 %inc15, %1
+  br i1 %cmp3, label %for.body5, label %for.cond.cleanup4
+}
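The test above exercises the outer-loop path: the address computation feeding the CRC-based load of the inner loop (for.body5) is cloned into the outer loop (for.body) as the prefetch.for.body5 block. The cloned chain is driven by the clamped induction select (add i32 %i, 19 compared against the bound 99), its loop-invariant prefix (the A[0] pointer load) has been hoisted into the entry block, it ends in a @llvm.prefetch call with rw=0 and locality=3, and it falls through prefetch.end back into the original inner-loop preheader.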
diff --git a/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a731b0b6322961baa719a4e96260f93035d98f66
--- /dev/null
+++ b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=unknown -passes=loop-data-prefetch --prefetch-distance=512 --random-access-prefetch-only=true -disable-direct-prefetch -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+declare i32 @llvm.aarch64.crc32w(i32, i32)
+
+; Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: read) uwtable
+define dso_local noundef i32 @_Z12matchcolumnsPiiS_ii(ptr nocapture noundef readonly %A, i32 noundef %B, ptr nocapture noundef readonly %Key, i32 noundef %index, i32 noundef %count) {
+; CHECK-LABEL: @_Z12matchcolumnsPiiS_ii(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret i32 [[ADD:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV22:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT23:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT:    [[SUM_020:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD]], [[IF_END]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDVARS_IV22]], 22
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i64 [[TMP0]], 99
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 99
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDVARS_IV22]], 44
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV22]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[TMP4]], i32 0, i32 3, i32 1)
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP6]], i32 -1)
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP7]], i32 -1)
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[TMP8]], 255
+; CHECK-NEXT:    [[TMP10:%.*]] = and i32 [[TMP9]], 255
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[AND]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[KEY:%.*]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[KEY]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[TMP12]], i32 0, i32 3, i32 1)
+; CHECK-NEXT:    [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP14]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[CMP3_NOT]], label [[IF_END]], label [[DO_BODY_PREHEADER:%.*]]
+; CHECK:       do.body.preheader:
+; CHECK-NEXT:    br label [[DO_BODY:%.*]]
+; CHECK:       do.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DO_BODY]] ], [ [[IDXPROM1]], [[DO_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[KEY]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[CMP6_NOT:%.*]] = icmp eq i32 [[TMP15]], [[B]]
+; CHECK-NEXT:    br i1 [[CMP6_NOT]], label [[IF_END_LOOPEXIT:%.*]], label [[DO_BODY]]
+; CHECK:       if.end.loopexit:
+; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[AKEY_1:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP16]], [[IF_END_LOOPEXIT]] ]
+; CHECK-NEXT:    [[IDXPROM7:%.*]] = sext i32 [[AKEY_1]] to i64
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM7]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[TMP17]], [[SUM_020]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT23]] = add nuw nsw i64 [[INDVARS_IV22]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT23]], 100
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+
+for.body:
+  %indvars.iv22 = phi i64 [ 0, %entry ], [ %indvars.iv.next23, %if.end ]
+  %sum.020 = phi i32 [ 0, %entry ], [ %add, %if.end ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv22
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = tail call i32 @llvm.aarch64.crc32w(i32 %0, i32 -1)
+  %and = and i32 %1, 255
+  %idxprom1 = zext i32 %and to i64
+  %arrayidx2 = getelementptr inbounds i32, ptr %Key, i64 %idxprom1
+  %2 = load i32, ptr %arrayidx2, align 4
+  %cmp3.not = icmp eq i32 %2, %B
+  br i1 %cmp3.not, label %if.end, label %do.body
+
+do.body:
+  %indvars.iv = phi i64 [ %idxprom1, %for.body ], [ %indvars.iv.next, %do.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx5 = getelementptr inbounds i32, ptr %Key, i64 %indvars.iv.next
+  %3 = load i32, ptr %arrayidx5, align 4
+  %cmp6.not = icmp eq i32 %3, %B
+  br i1 %cmp6.not, label %if.end.loopexit, label %do.body
+
+if.end.loopexit:
+  %4 = trunc i64 %indvars.iv.next to i32
+  br label %if.end
+
+if.end:
+  %AKey.1 = phi i32 [ %and, %for.body ], [ %4, %if.end.loopexit ]
+  %idxprom7 = sext i32 %AKey.1 to i64
+  %arrayidx8 = getelementptr inbounds i32, ptr %A, i64 %idxprom7
+  %5 = load i32, ptr %arrayidx8, align 4
+  %add = add nsw i32 %5, %sum.020
+  %indvars.iv.next23 = add nuw nsw i64 %indvars.iv22, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next23, 100
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopDataPrefetch/AArch64/pr56681.ll b/llvm/test/Transforms/LoopDataPrefetch/AArch64/pr56681.ll
index 0dc8bb5022e1c60255f23cc123830a05215b20f6..823571b37b1bac8c0a578307c8186d8189b381e7 100644
--- a/llvm/test/Transforms/LoopDataPrefetch/AArch64/pr56681.ll
+++ b/llvm/test/Transforms/LoopDataPrefetch/AArch64/pr56681.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=loop-data-prefetch --prefetch-distance=3000 -debug-only=loop-data-prefetch -S < %s 2>&1 | FileCheck %s
+; RUN: opt -passes=loop-data-prefetch -debug-only=loop-data-prefetch -S < %s 2>&1 | FileCheck %s
 
 ; REQUIRES: asserts
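A note on how the new flags compose: --random-access-prefetch-only forces both OuterLoopPrefetch and IndirectLoadPrefetch on (see LoopDataPrefetch::run above), and the new tests also pass -disable-direct-prefetch so that only the CRC-driven indirect prefetches show up in the output. A representative invocation mirroring the RUN lines above, with input.ll as a stand-in file name:

    opt -passes=loop-data-prefetch --prefetch-distance=512 \
        --random-access-prefetch-only=true -disable-direct-prefetch \
        -prefetch-loop-depth=1 -S input.ll

-prefetch-loop-depth (default 1) gives the minimum loop depth at which any prefetch is inserted, and -indirect-prefetch-iters-ahead (now defaulted to 0, i.e. disabled) overrides the computed look-ahead for indirect prefetches when set.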