From 88f8580bf5d845a116d6cb559af01d925d651a1e Mon Sep 17 00:00:00 2001 From: Yangguang Li Date: Thu, 14 Aug 2025 20:05:14 +0800 Subject: [PATCH 1/2] [LoopDataPrefetch] Add support for indirect load prefetch Prefetch indirect loads that depends on loop induction variable in a loop. --- .../Transforms/Scalar/LoopDataPrefetch.cpp | 815 +++++++++++++++++- .../AArch64/indirect-load-prefetch.ll | 80 ++ 2 files changed, 888 insertions(+), 7 deletions(-) create mode 100644 llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch.ll diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 7c2770979a90..360cd782756a 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -14,21 +14,32 @@ #include "llvm/InitializePasses.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ReplaceConstant.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #define 
DEBUG_TYPE "loop-data-prefetch" @@ -54,23 +65,106 @@ static cl::opt MaxPrefetchIterationsAhead( "max-prefetch-iters-ahead", cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden); +static cl::opt + IndirectLoadPrefetch("indirect-load-prefetch", cl::Hidden, cl::init(false), + cl::desc("Enable indirect laod prefetch")); + +static cl::opt PrefetchIterationsAhead( + "indirect-prefetch-iters-ahead", + cl::desc("Number of iterations for indirect-load prefetch"), cl::Hidden); + +static cl::opt SkipIntermediate( + "indirect-prefetch-skip-intermediate", cl::Hidden, cl::init(false), + cl::desc( + "Skip prefetching intermediate loads while doing indirect prefetch")); + +static cl::opt IndirectionLevel( + "indirect-level", + cl::desc("Indirection level considered for indirect load prefetch"), + cl::Hidden, cl::init(2)); + +static cl::opt RandomAccessPrefetchOnly( + "random-access-prefetch-only", cl::Hidden, cl::init(false), + cl::desc("Enable only outer loop indirect load prefetch")); + +static cl::opt CachelineSize("prefetch-cache-line-size", + cl::desc("Specify cache line size"), + cl::Hidden, cl::init(64)); + STATISTIC(NumPrefetches, "Number of prefetches inserted"); +STATISTIC(NumIndPrefetches, "Number of indirect prefetches inserted"); +STATISTIC(NumOuterLoopPrefetches, "Number of outer loop prefetches inserted"); namespace { +// Helper function to return a type with the same size as +// given step size +static Type *getPtrTypefromPHI(PHINode *PHI, int64_t StepSize) { + Type *Int8Ty = Type::getInt8Ty(PHI->getParent()->getContext()); + return ArrayType::get(Int8Ty, StepSize); +} + /// Loop prefetch implementation class. 
class LoopDataPrefetch { public: - LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI, - ScalarEvolution *SE, const TargetTransformInfo *TTI, + LoopDataPrefetch(AliasAnalysis *AA, AssumptionCache *AC, DominatorTree *DT, + LoopInfo *LI, ScalarEvolution *SE, + const TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE) - : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {} + : AA(AA), AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {} bool run(); private: bool runOnLoop(Loop *L); + Value *getCanonicalishSizeVariable(Loop *L, PHINode *PHI) const; + Value * + getLoopIterationNumber(Loop *L, + SmallPtrSet &LoopAuxIndPHINodes, + ValueMap &AuxIndBounds); + /// If prefetch instruction is not inserted, need to clean iteration + /// instructions in the preheader. + void cleanLoopIterationNumber(Value *NumIterations); + /// Returns whether the auxiliary induction variable can generate bound. + /// If it can, add PHI to LoopAuxIndPHINodes + bool canGetAuxIndVarBound(Loop *L, PHINode *PHI, + SmallPtrSet &LoopAuxIndPHINodes); + + /// Generate bound for the auxiliary induction variable at the + /// preheader and add it to AuxIndBounds. + /// Returns whether the bound was successfully generated. + bool getAuxIndVarBound(Loop *L, PHINode *PHI, Value *NumIterations, + ValueMap &AuxIndBounds); + + bool insertPrefetcherForIndirectLoad( + Loop *L, unsigned Idx, Value *NumIterations, + SmallVector &CandidateMemoryLoads, + SmallSetVector &DependentInsts, + ValueMap &AuxIndBounds, + SmallVectorImpl> &Transforms, + unsigned ItersAhead); + + bool findCandidateMemoryLoads( + Instruction *I, SmallSetVector &InstList, + SmallPtrSet &InstSet, + SmallVector &CandidateMemoryLoads, + std::vector> &DependentInstList, + SmallPtrSet LoopAuxIndPHINodes, Loop *L); + + /// Helper function to determine whether the given load is in + /// CandidateMemoryLoads. 
If yes, add the candidate's depending inst to the + /// list + bool isLoadInCandidateMemoryLoads( + LoadInst *LoadI, SmallSetVector &InstList, + SmallPtrSet &InstSet, + SmallVector &CandidateMemoryLoads, + std::vector> &DependentInstList); + + /// Returns whether the given loop can do indirect prefetch and should be + /// processed to insert prefetches for indirect loads. + bool canDoIndirectPrefetch(Loop *L); + /// Check if the stride of the accesses is large enough to /// warrant a prefetch. bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride); @@ -103,6 +197,7 @@ private: return TTI->enableWritePrefetching(); } + AliasAnalysis *AA; AssumptionCache *AC; DominatorTree *DT; LoopInfo *LI; @@ -120,6 +215,8 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); @@ -140,6 +237,7 @@ public: char LoopDataPrefetchLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(LoopDataPrefetchLegacyPass, "loop-data-prefetch", "Loop Data Prefetch", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) @@ -169,8 +267,593 @@ bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR, return TargetMinStride <= AbsStride; } +/// Use the induction variable to generate value represeting the total num of +/// iterations for the loop in the preheader. +Value *LoopDataPrefetch::getLoopIterationNumber( + Loop *L, SmallPtrSet &LoopAuxIndPHINodes, + ValueMap &AuxIndBounds) { + Value *LoopBoundValue; + Value *LoopStepValue; + Value *LoopStartValue; + Value *NumIterations; + + // Use induction variable to derive number of iterations for the loop which + // will be used to calculate the upper bound for other auxiliary induction + // variables. 
+ PHINode *PHI = L->getInductionVariable(*SE); + if (PHI == nullptr) + return nullptr; + + auto LoopLB = L->getBounds(*SE); + if (!LoopLB) + return nullptr; + + LoopStartValue = &(LoopLB->getInitialIVValue()); + LoopStepValue = LoopLB->getStepValue(); + LoopBoundValue = &(LoopLB->getFinalIVValue()); + + if (LoopStartValue == nullptr || LoopStepValue == nullptr || + LoopBoundValue == nullptr) + return nullptr; + + // Step should be constant. + if (!isa(SE->getSCEV(LoopStepValue))) + return nullptr; + + // Make sure each of them is invariant so we can use them in the preheader. + if (!L->isLoopInvariant(LoopBoundValue) || + !L->isLoopInvariant(LoopStepValue) || !L->isLoopInvariant(LoopStartValue)) + return nullptr; + + // Generate instruction that calculated the total number of iterations of the + // loop in the preheader. + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + Value *Range = Builder.CreateSub(LoopBoundValue, LoopStartValue); + NumIterations = Builder.CreateSDiv(Range, LoopStepValue); + + LoopAuxIndPHINodes.insert(PHI); + Value *Bound = nullptr; + // If the step is positive, the upper bound isn't included, i.e. accessing + // [bound] is not legal, so subtract the bound by LoopStepValue to prevent out + // of bounds memory access. + if (SE->isKnownNegative(SE->getSCEV(LoopStepValue))) + Bound = LoopBoundValue; + else + Bound = Builder.CreateSub(LoopBoundValue, LoopStepValue); + AuxIndBounds.insert(std::pair(PHI, Bound)); + return NumIterations; +} + +/// If prefetch instruction is not inserted. Need to clean iteration instruction +/// in the preheader. 
+void LoopDataPrefetch::cleanLoopIterationNumber(Value *NumIterations) { + std::deque IterationInsts; + if (NumIterations != nullptr && NumIterations->use_empty()) { + IterationInsts.push_back(dyn_cast(NumIterations)); + while (IterationInsts.size() > 0) { + auto *IInst = IterationInsts.front(); + IterationInsts.pop_front(); + if (IInst->use_empty()) { + for (unsigned i = 0; i < IInst->getNumOperands(); i++) { + if (isa(IInst->getOperand(i))) + IterationInsts.push_back( + dyn_cast(IInst->getOperand(i))); + } + dyn_cast(IInst)->eraseFromParent(); + } + } + } +} + +/// Returns whether the auxiliary induction variable can generate bound. +/// If it can genearte a bound, add PHI to LoopAuxIndPHINodes +bool LoopDataPrefetch::canGetAuxIndVarBound( + Loop *L, PHINode *PHI, SmallPtrSet &LoopAuxIndPHINodes) { + Value *AuxIndVarStartValue = + PHI->getIncomingValueForBlock(L->getLoopPreheader()); + if (AuxIndVarStartValue == nullptr) + return false; + + const SCEV *LSCEV = SE->getSCEV(PHI); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + + if (LSCEVAddRec == nullptr) + return false; + + // Currently, we only support constant steps. + if (dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { + InductionDescriptor IndDesc; + if (!InductionDescriptor::isInductionPHI(PHI, L, SE, IndDesc)) + return false; + + if (IndDesc.getInductionOpcode() != Instruction::Add && + IndDesc.getInductionOpcode() != Instruction::Sub && + IndDesc.getKind() != InductionDescriptor::IK_PtrInduction) + return false; + + LoopAuxIndPHINodes.insert(PHI); + + return true; + } + return false; +} + +/// Generate bound for the auxiliary induction variable at the preheader and add +/// it to AuxIndBounds. Returns whether the bound was successfully generated. 
+bool LoopDataPrefetch::getAuxIndVarBound( + Loop *L, PHINode *PHI, Value *NumIterations, + ValueMap &AuxIndBounds) { + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + Value *AuxIndVarStartValue = + PHI->getIncomingValueForBlock(L->getLoopPreheader()); + if (AuxIndVarStartValue == nullptr) + return false; + + const SCEV *LSCEV = SE->getSCEV(PHI); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + + // Currently, we only support constant steps. + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { + Value *AuxIndVarBound; + InductionDescriptor IndDesc; + if (!InductionDescriptor::isInductionPHI(PHI, L, SE, IndDesc)) + return false; + + // Calculate the upper bound for the auxiliary induction variable. + Value *CastedNumIterations = + Builder.CreateSExtOrTrunc(NumIterations, ConstPtrDiff->getType()); + + // Subtract one from CastedNumIterations as we want the bound to be in + // bounds. If there are N iterations, the first iteration will access the + // array at offset 0. On the N-th iteration, it will access the array at + // offset N-1, not N. 
+ CastedNumIterations = Builder.CreateSub( + CastedNumIterations, ConstantInt::get(ConstPtrDiff->getType(), 1)); + // Teh induction operator is add / sub + if (IndDesc.getInductionOpcode() == Instruction::Add || + IndDesc.getInductionOpcode() == Instruction::Sub) { + Value *Range = + Builder.CreateMul(ConstPtrDiff->getValue(), CastedNumIterations); + AuxIndVarBound = Builder.CreateAdd(Range, AuxIndVarStartValue); + } else if (IndDesc.getKind() == InductionDescriptor::IK_PtrInduction) { + // The induction variable is a pointer + int64_t StepSize = ConstPtrDiff->getAPInt().getSExtValue(); + if (SE->isKnownNegative(ConstPtrDiff)) { + StepSize = -StepSize; + CastedNumIterations = Builder.CreateMul( + ConstantInt::getSigned(ConstPtrDiff->getType(), -1), + CastedNumIterations); + } + Type *GEPType = getPtrTypefromPHI(PHI, StepSize); + AuxIndVarBound = Builder.CreateInBoundsGEP(GEPType, AuxIndVarStartValue, + CastedNumIterations); + } else + return false; + + LLVM_DEBUG(dbgs() << "Added " + << (isa(SE->getSCEV(AuxIndVarBound)) + ? "Constant " + : "") + << "AuxIndVarBound " << *AuxIndVarBound + << " for AuxIndVar:" << *PHI << "\n"); + AuxIndBounds.insert(std::pair(PHI, AuxIndVarBound)); + + return true; + } + return false; +} + +// Helper function to calculate the step for a given loop +static uint64_t getStep(PHINode *PN, ScalarEvolution *SE) { + // Get the constant step for the induction phi so we can use it to calculate + // how much we should increase the induction for prefetching. 
+ uint64_t Step = 0; + const SCEV *LSCEV = SE->getSCEV(PN); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + + if (LSCEVAddRec == nullptr) + return Step; + + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { + Step = ConstPtrDiff->getAPInt().getZExtValue(); + } + return Step; +} + +// Helper function to determine if the loop step is positive +static bool isPositiveStep(PHINode *PN, ScalarEvolution *SE) { + bool PositiveStep = true; + const SCEV *LSCEV = SE->getSCEV(PN); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { + if (SE->isKnownNegative(ConstPtrDiff)) { + PositiveStep = false; + } + } + return PositiveStep; +} + +// Helper function to calculate the step type of a PHI node. If the PHI node is +// not a pointer type, get the type PHI Node itself. Otherwise, get the integer +// type of the PHI's step/offset value. +static Type *getStepTypeFromPHINode(PHINode *PN, ScalarEvolution *SE) { + // Get the constant step for the induction phi so we can use it to calculate + // how much we should increase the induction for prefetching. + Type *T = PN->getType(); + if (!T->isPointerTy()) + return T; + + const SCEV *LSCEV = SE->getSCEV(PN); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); + if (const SCEVConstant *ConstPtrDiff = + dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) + return ConstPtrDiff->getType(); + + return T; +} + +/// This function will take an instr list that contains indirect loads and +/// transform them into prefetchers. E.g. 
Transform following indirect load +/// A[B[i]]: +/// phi indvar [0] [bound] +/// idxB = gep *B, indvar +/// offsetA = load * idxB +/// idxA = gep *A, offsetA +/// valueA = load *idxA +/// To indirect load with prefetchers N iteration ahead: +/// phi indvar [0] [bound] +/// offsetN = add indvar, N +/// offset2N = add indvar, 2N +/// compare = icmp offsetN, bound +/// offsetN = select compare, offsetN, bound +/// preIdxN = gep *B, offsetN +/// preIdx2N = get *B, offset2N +/// call prefetch(preIdx2N) +/// preOffsetA = load preIdxN +/// preIdxA = gep *A, preOffsetA +/// call prefetch(preIdxA) +/// idxB = gep *B, indvar +/// offsetA = load *idxB +/// idxA = gep *A, offsetA +/// valueA = load *idxA +bool LoopDataPrefetch::insertPrefetcherForIndirectLoad( + Loop *L, unsigned Idx, Value *NumIterations, + SmallVector &CandidateMemoryLoads, + SmallSetVector &DependentInsts, + ValueMap &AuxIndBounds, + SmallVectorImpl> &Transforms, + unsigned ItersAhead) { + bool PositiveStep = true; + Instruction *TargetIndirectLoad = CandidateMemoryLoads[Idx]; + IRBuilder<> Builder(TargetIndirectLoad); + Module *M = TargetIndirectLoad->getModule(); + Type *I32Ty = Type::getInt32Ty(TargetIndirectLoad->getParent()->getContext()); + LLVM_DEBUG(dbgs() << "Inserting indirect prefetchers for\t" + << *TargetIndirectLoad << "\twith " << DependentInsts.size() + << " dependent instructions\n"); + + // Keep track of the number of prefetches left to process among the + // DependentInst List. We assume that for given indirectLevel N, we will have + // N prefetches to do, unless we are skipping intermediate loads, then we are + // only doing 1 prefetch. + size_t NumPrefetchesLeft = SkipIntermediate ? 
1 : IndirectionLevel; + int64_t Step; + while (!DependentInsts.empty()) { + Instruction *DependentInst = DependentInsts.pop_back_val(); + Instruction *Inst = dyn_cast(DependentInst); + + switch (Inst->getOpcode()) { + case Instruction::PHI: { + // Get the constant step for the induction phi so we can use it to + // calculate how much we should increase the induction for prefetching. + PHINode *PN = dyn_cast(Inst); + Step = getStep(PN, SE); + PositiveStep = isPositiveStep(PN, SE); + Type *InstType = getStepTypeFromPHINode(PN, SE); + if (!PositiveStep) + Step = -Step; + + // Make sure phi node is i64 or i32. + if (!InstType->isIntegerTy(64) && !InstType->isIntegerTy(32)) + return false; + + // Create the bound for this PHI if needed: + if (!AuxIndBounds.count(PN)) + getAuxIndVarBound(L, PN, NumIterations, AuxIndBounds); + + // We create values based on the induction variable so we can use it to + // generate prefetcher later on. The first value (indvar + IterationAhead + // * step) will be used for the load of prefetched address and it must + // not exceeding the bound. The second value (indvar + 2 * IterationAhead + // * step) will be used to generate prefether for the load of address. + // The subsequent values are generated in a similar fashion to generate + // prefetchers for offset of all dependent loads. + + // Insert the new instruction after all PHI node. + auto InsertionPoint = Inst; + if (auto FirstNonPHI = Inst->getParent()->getFirstNonPHI()) + InsertionPoint = FirstNonPHI->getPrevNode(); + + for (size_t i = 0; i < NumPrefetchesLeft; i++) { + if (i > 0 && SkipIntermediate) + break; + + if (Transforms.size() < i + 1) { + Transforms.push_back(DenseMap()); + } else if (Transforms[i].count(Inst)) + continue; + + // Create the new operation for the target load + Value *NewOp = nullptr; + if (Inst->getType()->isPointerTy()) { + Type *GEPType = getPtrTypefromPHI(PN, Step); + int64_t Offset = + PrefetchIterationsAhead ? 
PrefetchIterationsAhead : ItersAhead; + if (!PositiveStep) + Offset = -Offset; + // Do not need to calculate Offset * Step as it is calculated + // implicitly within the GEP instruction + NewOp = Builder.CreateInBoundsGEP( + GEPType, Inst, + ConstantInt::getSigned(InstType, (i + 1) * Offset)); + } else { + // FullStep is the initial offset for the new value, taking into + // account, both Step and the number of iterations ahead to prefetch. + // If indirect prefetch iterations ahead is enabled, we directly use + // the supplied indirect-prefetch-iters-ahead value. + int64_t FullStep = PrefetchIterationsAhead + ? PrefetchIterationsAhead * Step + : ItersAhead * Step; + + Instruction::BinaryOps BiOp = + PositiveStep ? Instruction::Add : Instruction::Sub; + NewOp = Builder.CreateBinOp( + BiOp, Inst, + ConstantInt::get(Inst->getType(), (i + 1) * FullStep)); + } + + if (auto NewOpInstr = dyn_cast(NewOp)) { + NewOpInstr->moveAfter(InsertionPoint); + InsertionPoint = NewOpInstr; + } + + // Create the new operations for the offset loads + if (i > 0 && i == NumPrefetchesLeft - 1) { + Transforms[i].insert(std::pair(Inst, NewOp)); + } else { + Value *NewCmp = Builder.CreateICmp( + PositiveStep ? 
CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, NewOp, + AuxIndBounds[cast(Inst)]); + Value *NewSelect = + Builder.CreateSelect(NewCmp, NewOp, AuxIndBounds[PN]); + Transforms[i].insert(std::pair(Inst, NewSelect)); + + if (auto NewCmpInstr = dyn_cast(NewCmp)) { + NewCmpInstr->moveAfter(InsertionPoint); + InsertionPoint = NewCmpInstr; + } + + if (auto NewSelectInstr = dyn_cast(NewSelect)) { + NewSelectInstr->moveAfter(InsertionPoint); + InsertionPoint = NewSelectInstr; + } + } + } + break; + } + case Instruction::Load: { + LoadInst *LoadI = dyn_cast(Inst); + Value *LoadPtr = LoadI->getPointerOperand(); + if (!SkipIntermediate) + NumPrefetchesLeft--; + + auto GeneratePrefetcher = [&](llvm::Value *PrefetchPtr) { + Function *PrefetchFunc = Intrinsic::getDeclaration( + M, Intrinsic::prefetch, LoadPtr->getType()); + Value *PrefetchArg[] = {PrefetchPtr, ConstantInt::get(I32Ty, 0), + ConstantInt::get(I32Ty, 3), + ConstantInt::get(I32Ty, 1)}; + CallInst *PrefetchCall = CallInst::Create(PrefetchFunc, PrefetchArg); + return PrefetchCall; + }; + + if (!DependentInsts.empty()) { + // For any intermediate (not last) load, we generate a load for all the + // offset at min(indvar+N*IterationsAhead*step, bound)] for each N up to + // NumPrefetchesLeft - 1, and generate a prefetcher at + // (indvar+(N+1)*IterationAhead*step) for the offset load. 
+ Instruction *PrefetchOffsetLoad = nullptr; + for (size_t i = 0; i < NumPrefetchesLeft; i++) { + if (Transforms[i].count(LoadI)) + continue; + PrefetchOffsetLoad = LoadI->clone(); + Builder.Insert(PrefetchOffsetLoad); + for (size_t i = 0; i < NumPrefetchesLeft; i++) { + if (Transforms[i].count(LoadI)) + continue; + PrefetchOffsetLoad = LoadI->clone(); + Builder.Insert(PrefetchOffsetLoad); + PrefetchOffsetLoad->moveAfter(LoadI); + PrefetchOffsetLoad->replaceUsesOfWith(LoadPtr, + Transforms[i][LoadPtr]); + + Transforms[i].insert( + std::pair(LoadI, PrefetchOffsetLoad)); + } + } + + if (SkipIntermediate) + break; + + // Create a prefetcher for the offset laod. + if (PrefetchOffsetLoad) { + CallInst *PrefetchCall = + GeneratePrefetcher(Transforms[NumPrefetchesLeft][LoadPtr]); + PrefetchCall->insertAfter(PrefetchOffsetLoad); + NumIndPrefetches++; + } + } else { + CallInst *PrefetchCall = GeneratePrefetcher(Transforms[0][LoadPtr]); + PrefetchCall->insertAfter(LoadI); + NumIndPrefetches++; + } + break; + } + default: { + // For other types of instructions, we make a clone of the instruction and + // repalce operands that we already transformed before. + for (size_t j = 0; j < NumPrefetchesLeft; j++) { + if (j >= Transforms.size() || Transforms[j].count(Inst)) + continue; + Instruction *TransformedInst = Inst->clone(); + Builder.Insert(TransformedInst); + TransformedInst->moveAfter(Inst); + for (unsigned i = 0; i < TransformedInst->getNumOperands(); i++) { + Value *Operand = TransformedInst->getOperand(i); + if (Transforms[j].count(Operand)) + TransformedInst->replaceUsesOfWith(Operand, Transforms[j][Operand]); + } + + Transforms[j].insert( + std::pair(Inst, TransformedInst)); + } + break; + } + } + } + return true; +} + +/// Find the indirect load that depends on the auxiliary induction variable and +/// construct an instr list that contains loop variant instruction from the +/// target load to the candidate phi instr. 
+bool LoopDataPrefetch::findCandidateMemoryLoads( + Instruction *I, SmallSetVector &InstList, + SmallPtrSet &InstSet, + SmallVector &CandidateMemoryLoads, + std::vector> &DependentInstList, + SmallPtrSet LoopAuxIndPHINodes, Loop *L) { + bool ret = false; + + for (Use &U : I->operands()) { + // If value is loop invariant, just continue + if (LI->getLoopFor(I->getParent())->isLoopInvariant(U.get())) + continue; + + Instruction *OperandInst = dyn_cast(U.get()); + if (OperandInst != nullptr) { + switch (OperandInst->getOpcode()) { + case Instruction::Load: { + // Check if the load instruction that it depends on is already in the + // candidate. If yes, add the canddiate's depending instr to the list. + // If not, the load instruction it depends on is invalid. + LoadInst *LoadI = dyn_cast(OperandInst); + if (isLoadInCandidateMemoryLoads(LoadI, InstList, InstSet, + CandidateMemoryLoads, + DependentInstList)) { + // We do not return early in case there are other auxiliary induction + // variables to check. + ret = true; + } + break; + } + case Instruction::PHI: { + // Check if PHI is the loop auxiliary induction PHI. If yes, found a + // valid load dependent on loop auxiliary induction variable. If not, + // invalid candidate. + PHINode *PhiInst = dyn_cast(OperandInst); + if (LoopAuxIndPHINodes.contains(PhiInst)) { + // In order to prevent the size of SmallVector from going out of + // bounds for large cases, only the last access of the element is + // retained. Update the position of OperandInst in the InstList. + if (InstList.count(OperandInst)) + InstList.remove(OperandInst); + InstList.insert(OperandInst); + return true; + } + break; + } + case Instruction::Call: { + // We currently can not handle case where indirect load depends on other + // functions yet. + return false; + } + case Instruction::Invoke: { + // We currently can not handle case where indirect load depends on other + // functions yet. 
+ return false; + } + default: { + // Use DFS to search though the operands. + if (InstList.count(OperandInst)) + InstList.remove(OperandInst); + InstList.insert(OperandInst); + if (findCandidateMemoryLoads(OperandInst, InstList, InstSet, + CandidateMemoryLoads, DependentInstList, + LoopAuxIndPHINodes, L)) { + // We do not return early in case there are other auxiliary induction + // variables to check + ret = true; + } else { + // If the operand isn't dependent on an auxiliary induction variable, + // remove any instructions added to DependentInstList from this + // operand + InstList.remove(OperandInst); + } + } + } + } + } + return ret; +} + +/// Helper function to determine whether the given load is in +/// CandidateMemoryLoads. If Yes, add the candidate's depending instr to the +/// list. +bool LoopDataPrefetch::isLoadInCandidateMemoryLoads( + LoadInst *LoadI, SmallSetVector &InstList, + SmallPtrSet &InstSet, + SmallVector &CandidateMemoryLoads, + std::vector> &DependentInstList) { + size_t CandidateLoadIndex = 0; + for (auto CandidateMemoryLoad : CandidateMemoryLoads) { + if (LoadI == CandidateMemoryLoad) + break; + CandidateLoadIndex++; + } + + if (CandidateLoadIndex >= CandidateMemoryLoads.size() || InstSet.count(LoadI)) + return false; + + for (auto CandidateInst : DependentInstList[CandidateLoadIndex]) { + if (InstList.count(CandidateInst)) + InstList.remove(CandidateInst); + InstList.insert(CandidateInst); + InstSet.insert(CandidateInst); + } + return true; +} + +/// Returns whether the given loop should be processed to insert prefetches for +/// indirect loads. +bool LoopDataPrefetch::canDoIndirectPrefetch(Loop *L) { + // Support inner most loops in a simple form. However, the parent of inner + // loop will be processed as well in the case of nested loops. If + // indirectLevel is low, only allow one block loop, otherwise, allow up to 5 + // under certain conditions. 
+ if (!L->isInnermost() || !L->getLoopPreheader() || + (IndirectionLevel <= 3 && L->getNumBlocks() != 1) || + (IndirectionLevel > 3 && L->getNumBlocks() == 1) || L->getNumBlocks() > 5) + return false; + return true; +} + PreservedAnalyses LoopDataPrefetchPass::run(Function &F, FunctionAnalysisManager &AM) { + AliasAnalysis *AA = &AM.getResult(F); DominatorTree *DT = &AM.getResult(F); LoopInfo *LI = &AM.getResult(F); ScalarEvolution *SE = &AM.getResult(F); @@ -179,8 +862,16 @@ PreservedAnalyses LoopDataPrefetchPass::run(Function &F, &AM.getResult(F); const TargetTransformInfo *TTI = &AM.getResult(F); - LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE); - bool Changed = LDP.run(); + // Ensure loops are in simplified form which is a pre-requisite for loop data + // prefetch pass. Added only for new PM since the legacy PM has already added + // LoopSimplify pass as a dependency. + bool Changed = false; + for (auto &L : *LI) { + Changed |= simplifyLoop(L, DT, LI, SE, AC, nullptr, false); + } + + LoopDataPrefetch LDP(AA, AC, DT, LI, SE, TTI, ORE); + Changed |= LDP.run(); if (Changed) { PreservedAnalyses PA; @@ -196,6 +887,7 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) { if (skipFunction(F)) return false; + AliasAnalysis *AA = &getAnalysis().getAAResults(); DominatorTree *DT = &getAnalysis().getDomTree(); LoopInfo *LI = &getAnalysis().getLoopInfo(); ScalarEvolution *SE = &getAnalysis().getSE(); @@ -206,7 +898,7 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) { const TargetTransformInfo *TTI = &getAnalysis().getTTI(F); - LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE); + LoopDataPrefetch LDP(AA, AC, DT, LI, SE, TTI, ORE); return LDP.run(); } @@ -214,7 +906,8 @@ bool LoopDataPrefetch::run() { // If PrefetchDistance is not set, don't run the pass. This gives an // opportunity for targets to run this pass for selected subtargets only // (whose TTI sets PrefetchDistance and CacheLineSize). 
- if (getPrefetchDistance() == 0 || TTI->getCacheLineSize() == 0) { + if (getPrefetchDistance() == 0 || + (TTI->getCacheLineSize() == 0 && CachelineSize == 0)) { LLVM_DEBUG(dbgs() << "Please set both PrefetchDistance and CacheLineSize " "for loop data prefetch.\n"); return false; @@ -424,5 +1117,113 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { MadeChange = true; } + if (!IndirectLoadPrefetch) + return MadeChange; + + if (!canDoIndirectPrefetch(L)) + return MadeChange; + + // List of valid phi nodes that indirect loads can depend on. + SmallPtrSet LoopAuxIndPHINodes; + // Map of valid phi node to its bound value in the preheader. + ValueMap AuxIndBounds; + // Candidate memory loads in the loop. + SmallVector CandidateMemoryLoads; + // List of instruction from phi to load. + std::vector> DependentInstList; + // List of store instr in the loop. + SmallVector LoopStorePtrs; + + // Get loop induction and auxilary induction phis. (Thye will be candidates + // for phi node matching during constrution of the candidate instructions.) + // And we use the phi nodes to determine the loop upperbound. + Value *NumIterations = + getLoopIterationNumber(L, LoopAuxIndPHINodes, AuxIndBounds); + if (NumIterations == nullptr) + return MadeChange; + else + MadeChange = true; + + // Find candidate auxiliary induction variables which could be a dependent for + // the indirect load. + for (auto &I : *(L->getHeader())) + if (PHINode *PHI = dyn_cast(&I)) { + InductionDescriptor IndDesc; + if (InductionDescriptor::isInductionPHI(PHI, L, SE, IndDesc) && + L->getInductionVariable(*SE) != PHI) { + canGetAuxIndVarBound(L, PHI, LoopAuxIndPHINodes); + } + } + + // WIll search for candidates in the parent loop of the current inner most + // loop. This will capture more opportunities in the outer loop. 
+ SmallVector BBList; + for (auto &BB : L->blocks()) + BBList.push_back(BB); + if (L->getParentLoop()) + for (auto &BB : L->getParentLoop()->blocks()) { + // We don't want to repeat blocks in the case of nested loops. + if (L->contains(BB)) + continue; + BBList.push_back(BB); + } + + // Iterate through the loop and keep track of the memory loads and the + // instruction list they dependd on. + for (const auto BB : BBList) { + for (auto &I : *BB) + if (LoadInst *LoadI = dyn_cast(&I)) { + SmallSetVector InstList; + SmallSet InstSet; + InstList.insert(LoadI); + InstSet.insert(LoadI); + if (findCandidateMemoryLoads(LoadI, InstList, InstSet, + CandidateMemoryLoads, DependentInstList, + LoopAuxIndPHINodes, L)) { + LLVM_DEBUG(dbgs() << "Found load candidate " << *LoadI << "\n"); + CandidateMemoryLoads.push_back(LoadI); + DependentInstList.push_back(InstList); + } + } else if (StoreInst *StoreI = dyn_cast(&I)) { + // Keep track of store insts to avoid conflict. + LoopStorePtrs.push_back(StoreI->getPointerOperand()); + } + } + + // Keep track of previously transformed instrs for offset load and target + // loads so we can resuse them. + SmallVector> Transforms; + for (unsigned i = 0; i < CandidateMemoryLoads.size(); i++) { + SmallSetVector DependentInsts = DependentInstList[i]; + unsigned NumLoads = 0; + bool NoConflict = true; + // Find candidate that contains indirect loads and check load for offset + // doesn't alias with other stores. + for (auto DependentInst : DependentInsts) { + if (LoadInst *LoadI = dyn_cast(DependentInst)) { + NumLoads++; + // For the load of target address offset, we avoid the load being + // conflict with stores in the same loop. + if (NumLoads == IndirectionLevel) { + Value *LoadPtr = LoadI->getPointerOperand(); + for (Value *StorePtr : LoopStorePtrs) + if (AA->isMustAlias(LoadPtr, StorePtr)) { + NoConflict = false; + break; + } + } + } + } + + // Prefetch all indirect load without conflict to the offset load. 
+ if (NumLoads == IndirectionLevel && NoConflict) { + MadeChange |= insertPrefetcherForIndirectLoad( + L, i, NumIterations, CandidateMemoryLoads, DependentInsts, + AuxIndBounds, Transforms, ItersAhead); + } + } + + cleanLoopIterationNumber(NumIterations); + return MadeChange; } diff --git a/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch.ll b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch.ll new file mode 100644 index 000000000000..7d65952e2a2a --- /dev/null +++ b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=unknown -passes=loop-data-prefetch --indirect-load-prefetch=true --prefetch-distance=512 --min-prefetch-stride=4 -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + +; Function Attrs: nofree norecurse nounwind uwtable +define dso_local void @test(i32 %Num, float* nocapture readonly %TargetArray, i32* nocapture readonly %OffsetArray, float* noalias nocapture %TempArray) local_unnamed_addr #0 { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[NUM:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP13]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[NUM]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDVARS_IV]], 42 +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i64 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 
[[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDVARS_IV]], 84 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 168 +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[OFFSETARRAY:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 168 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[TEMPARRAY:%.*]], i64 [[TMP8]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TEMPARRAY]], i64 [[INDVARS_IV]] +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[UGLYGEP]], i32 0, i32 3, i32 1) +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[OFFSETARRAY]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[OFFSETARRAY]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[OFFSETARRAY]], i64 [[TMP3]] +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[UGLYGEP1]], i32 0, i32 3, i32 1) +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4 +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[TMP10]], i32 0, i32 3, i32 1) +; CHECK-NEXT: [[IDXPROM3:%.*]] = sext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TARGETARRAY:%.*]], i64 [[IDXPROM3]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TARGETARRAY]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[TMP15]], i32 0, i32 3, i32 1) +; CHECK-NEXT: [[MUL:%.*]] = fmul contract float [[TMP9]], [[TMP17]] +; CHECK-NEXT: store float [[MUL]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: 
[[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] +; +entry: + %cmp13 = icmp sgt i32 %Num, 0 + br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext i32 %Num to i64 + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %TempArray, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %OffsetArray, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %idxprom3 = sext i32 %1 to i64 + %arrayidx4 = getelementptr inbounds float, float* %TargetArray, i64 %idxprom3 + %2 = load float, float* %arrayidx4, align 4 + %mul = fmul contract float %0, %2 + store float %mul, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} -- Gitee From 7ae9e174365736b51cebac58fc6a7761a3727564 Mon Sep 17 00:00:00 2001 From: Yangguang Li Date: Thu, 14 Aug 2025 20:05:27 +0800 Subject: [PATCH 2/2] [LoopDataPrefetch] Add support for crc hash data access prefetch Prefetch crc hash data access in outer loops or depends on outer loop induction variables. 
--- .../Transforms/Scalar/LoopDataPrefetch.cpp | 753 ++++++++++++++++-- .../AArch64/indirect-load-crc-outerloop.ll | 147 ++++ .../AArch64/indirect-load-prefetch.ll | 8 +- .../AArch64/indirect-load-prefetch_crc.ll | 104 +++ 4 files changed, 951 insertions(+), 61 deletions(-) create mode 100644 llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-crc-outerloop.ll create mode 100644 llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 360cd782756a..dde7de406c58 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -71,7 +71,7 @@ static cl::opt static cl::opt PrefetchIterationsAhead( "indirect-prefetch-iters-ahead", - cl::desc("Number of iterations for indirect-load prefetch"), cl::Hidden); + cl::desc("Number of iterations for indirect-load prefetch"), cl::Hidden, cl::init(0)); static cl::opt SkipIntermediate( "indirect-prefetch-skip-intermediate", cl::Hidden, cl::init(false), @@ -91,6 +91,20 @@ static cl::opt CachelineSize("prefetch-cache-line-size", cl::desc("Specify cache line size"), cl::Hidden, cl::init(64)); +static cl::opt + OuterLoopPrefetch("outer-loop-prefetch", cl::Hidden, cl::init(false), + cl::desc("Enable prefetch in outer loops")); + +static cl::opt + DisableDirectLoadPrefetch("disable-direct-prefetch", cl::Hidden, + cl::init(false), + cl::desc("Disable direct load prefetch")); + +static cl::opt + PrefetchLoopDepth("prefetch-loop-depth", + cl::desc("Least loop depth to insert prefetch"), + cl::Hidden, cl::init(1)); + STATISTIC(NumPrefetches, "Number of prefetches inserted"); STATISTIC(NumIndPrefetches, "Number of indirect prefetches inserted"); STATISTIC(NumOuterLoopPrefetches, "Number of outer loop prefetches inserted"); @@ -137,6 +151,14 @@ private: bool getAuxIndVarBound(Loop *L, PHINode *PHI, Value *NumIterations, ValueMap &AuxIndBounds); + 
bool insertPrefetcherInOuterloopForIndirectLoad( + Loop *L, unsigned Idx, Value *NumIterations, + SmallVector &CandidateMemoryLoads, + SmallSetVector &DependentInsts, + ValueMap &AuxIndBounds, + SmallVectorImpl> &Transforms, + unsigned ItersAhead); + bool insertPrefetcherForIndirectLoad( Loop *L, unsigned Idx, Value *NumIterations, SmallVector &CandidateMemoryLoads, @@ -150,7 +172,8 @@ private: SmallPtrSet &InstSet, SmallVector &CandidateMemoryLoads, std::vector> &DependentInstList, - SmallPtrSet LoopAuxIndPHINodes, Loop *L); + SmallPtrSet LoopAuxIndPHINodes, + bool PrefetchInOuterLoop, Loop *L); /// Helper function to determine whether the given load is in /// CandidateMemoryLoads. If yes, add the candidate's depending inst to the @@ -165,6 +188,8 @@ private: /// processed to insert prefetches for indirect loads. bool canDoIndirectPrefetch(Loop *L); + bool isCrcHashDataAccess(Instruction *I, Instruction *PrefetchingLoad); + /// Check if the stride of the accesses is large enough to /// warrant a prefetch. bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride); @@ -327,20 +352,14 @@ Value *LoopDataPrefetch::getLoopIterationNumber( /// If prefetch instruction is not inserted. Need to clean iteration instruction /// in the preheader. 
void LoopDataPrefetch::cleanLoopIterationNumber(Value *NumIterations) { - std::deque IterationInsts; - if (NumIterations != nullptr && NumIterations->use_empty()) { - IterationInsts.push_back(dyn_cast(NumIterations)); - while (IterationInsts.size() > 0) { - auto *IInst = IterationInsts.front(); - IterationInsts.pop_front(); - if (IInst->use_empty()) { - for (unsigned i = 0; i < IInst->getNumOperands(); i++) { - if (isa(IInst->getOperand(i))) - IterationInsts.push_back( - dyn_cast(IInst->getOperand(i))); - } - dyn_cast(IInst)->eraseFromParent(); - } + auto *IDiv = dyn_cast(NumIterations); + if (IDiv != nullptr && IDiv->getOpcode() == Instruction::SDiv && + IDiv->use_empty()) { + auto *IRange = dyn_cast(IDiv->getOperand(0)); + IDiv->eraseFromParent(); + if (IRange != nullptr && IRange->getOpcode() == Instruction::Sub && + IRange->use_empty()) { + IRange->eraseFromParent(); } } } @@ -531,6 +550,19 @@ bool LoopDataPrefetch::insertPrefetcherForIndirectLoad( IRBuilder<> Builder(TargetIndirectLoad); Module *M = TargetIndirectLoad->getModule(); Type *I32Ty = Type::getInt32Ty(TargetIndirectLoad->getParent()->getContext()); + + if (RandomAccessPrefetchOnly) { + bool isRandomAccess = false; + for (auto *I : DependentInsts) { + if (isCrcHashDataAccess(I, TargetIndirectLoad)) { + isRandomAccess = true; + break; + } + } + if (!isRandomAccess) + return false; + } + LLVM_DEBUG(dbgs() << "Inserting indirect prefetchers for\t" << *TargetIndirectLoad << "\twith " << DependentInsts.size() << " dependent instructions\n"); @@ -735,12 +767,16 @@ bool LoopDataPrefetch::findCandidateMemoryLoads( SmallPtrSet &InstSet, SmallVector &CandidateMemoryLoads, std::vector> &DependentInstList, - SmallPtrSet LoopAuxIndPHINodes, Loop *L) { + SmallPtrSet LoopAuxIndPHINodes, bool PrefetchInOuterLoop, + Loop *L) { bool ret = false; for (Use &U : I->operands()) { // If value is loop invariant, just continue - if (LI->getLoopFor(I->getParent())->isLoopInvariant(U.get())) + if (PrefetchInOuterLoop) { + if 
(L->getParentLoop()->isLoopInvariant(U.get())) + continue; + } else if (LI->getLoopFor(I->getParent())->isLoopInvariant(U.get())) continue; Instruction *OperandInst = dyn_cast(U.get()); @@ -777,9 +813,37 @@ bool LoopDataPrefetch::findCandidateMemoryLoads( break; } case Instruction::Call: { - // We currently can not handle case where indirect load depends on other - // functions yet. - return false; + if (PrefetchInOuterLoop || RandomAccessPrefetchOnly) { + if (OperandInst->mayReadOrWriteMemory()) + return false; + CallInst *Call = dyn_cast(OperandInst); + if (!Call->doesNotThrow()) + return false; + + // Use DFS to search though the operands. + InstList.insert(OperandInst); + if (findCandidateMemoryLoads(OperandInst, InstList, InstSet, + CandidateMemoryLoads, DependentInstList, + LoopAuxIndPHINodes, PrefetchInOuterLoop, + L)) { + // We do not return early in case there are other auxiliary + // induction variable to check + ret = true; + } else { + // If the Operand isn't dependent on an auxiliary induction + // variable, remove any instructions added to DependentInstList from + // this operand + if (InstList.count(OperandInst)) + InstList.remove(OperandInst); + InstList.insert(OperandInst); + return false; + } + break; + } else { + // We currently can not handle case where indirect load depends on + // other functions yet. + return false; + } } case Instruction::Invoke: { // We currently can not handle case where indirect load depends on other @@ -793,7 +857,8 @@ bool LoopDataPrefetch::findCandidateMemoryLoads( InstList.insert(OperandInst); if (findCandidateMemoryLoads(OperandInst, InstList, InstSet, CandidateMemoryLoads, DependentInstList, - LoopAuxIndPHINodes, L)) { + LoopAuxIndPHINodes, PrefetchInOuterLoop, + L)) { // We do not return early in case there are other auxiliary induction // variables to check ret = true; @@ -851,6 +916,506 @@ bool LoopDataPrefetch::canDoIndirectPrefetch(Loop *L) { return true; } +/// Check if the load depends on Crc Hash functions. 
+bool LoopDataPrefetch::isCrcHashDataAccess(Instruction *I,
+                                           Instruction *PrefetchingLoad) {
+  if (llvm::IntrinsicInst *II = dyn_cast(I))
+    // If CRC functions are used for offset calculation then offset will be
+    // random. To avoid cache misses, data prefetch is needed.
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::aarch64_crc32b:
+    case Intrinsic::aarch64_crc32cb:
+    case Intrinsic::aarch64_crc32h:
+    case Intrinsic::aarch64_crc32ch:
+    case Intrinsic::aarch64_crc32w:
+    case Intrinsic::aarch64_crc32cw:
+    case Intrinsic::aarch64_crc32x:
+    case Intrinsic::aarch64_crc32cx: {
+      // Checking Candidate load is incremented by 1.
+      if (auto *LI = dyn_cast(PrefetchingLoad)) {
+        if (auto *GEPI = dyn_cast(LI->getPointerOperand())) {
+          // The data access will be consecutive if the GEP has a single index.
+          if (GEPI->getNumOperands() > 2)
+            return false;
+          auto *PtrIndices = dyn_cast(GEPI->getOperand(1));
+          if (!PtrIndices || isa(PtrIndices))
+            return true;
+          for (auto &U : PtrIndices->uses())
+            if (auto *PN = dyn_cast(U.getUser()))
+              if (getStep(PN, SE) <= 1)
+                return true;
+        }
+      }
+      break;
+    }
+    }
+  return false;
+}
+
+/// Check the indirect loads inside the inner loop; if one is
+/// derived from an induction variable of the outer loop, then
+/// insert the prefetch instruction in the outer loop.
+/// It maintains the same CFG structure as the inner loop and
+/// clones it in the outer loop. Insert the prefetch for
+/// the last indirect load, not for the intermediate loads.
+bool LoopDataPrefetch::insertPrefetcherInOuterloopForIndirectLoad( + Loop *L, unsigned Idx, Value *NumIterations, + SmallVector &CandidateMemoryLoads, + SmallSetVector &DependentInsts, + ValueMap &AuxIndBounds, + SmallVectorImpl> &Transforms, + unsigned ItersAhead) { + Instruction *TargetIndirectLoad = CandidateMemoryLoads[Idx]; + IRBuilder<> Builder(TargetIndirectLoad); + Module *M = TargetIndirectLoad->getModule(); + auto *ParentLoop = L->getParentLoop(); + + if (!ParentLoop) + return false; + + SmallVector ExitBlocks; + L->getUniqueExitBlocks(ExitBlocks); + bool HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) { + return isa(Exit->getTerminator()); + }); + if (HasCatchSwitch) + return false; + + SmallVector NewBBlocks; + SmallVector AllDependentInsts; + SmallPtrSet Visited; + SmallPtrSet IndirectLoadDependents; + SmallPtrSet BranchInsts; + SmallPtrSet InsertedPrefetchCalls; + DenseMap BBTransforms; + DenseMap BBPostNumbers; + BasicBlock *NewRootBB = nullptr; + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + + if (!isa(DependentInsts[DependentInsts.size() - 1])) { + return false; + } else { + if (auto *PN = + dyn_cast(DependentInsts[DependentInsts.size() - 1])) { + if (!ParentLoop->contains(PN)) { + return false; + } + if (!getStep(PN, SE)) + return false; + if (isa(PN->getType())) + return false; + } + } + + ExitBlocks.clear(); + ParentLoop->getUniqueExitBlocks(ExitBlocks); + if (HasCatchSwitch) + return false; + + Instruction *CandidateLoad = DependentInsts[0]; + BasicBlock *LoopPreheader = L->getLoopPreheader(); + + // Only consider crc hashed random data accesses. 
+ bool isRandomAccess = false; + for (auto *I : DependentInsts) { + IndirectLoadDependents.insert(I); + Visited.insert(I); + isRandomAccess |= isCrcHashDataAccess(I, CandidateLoad); + } + if (!isRandomAccess) + return false; + + if (!LoopPreheader || !ParentLoop->getLoopPreheader()) + return false; + + if (LoopPreheader->getTerminator() == nullptr || + !isa(LoopPreheader->getTerminator())) + return false; + if (Visited.insert(LoopPreheader->getTerminator()).second) + DependentInsts.insert(LoopPreheader->getTerminator()); + + // Start from target indirect load block, get the list of predecessor blocks + // till loop preheader. And we assign each block with post order number with + // which we can sort. + SmallSetVector BBPredecessors; + BBPredecessors.insert(CandidateLoad->getParent()); + BBPostNumbers.insert({CandidateLoad->getParent(), 0}); + while (BBPredecessors.size()) { + BasicBlock *BBPred = BBPredecessors[0]; + BBPredecessors.remove(BBPred); + int Depth = BBPostNumbers[BBPred]; + // Check all predecessors and add their branch instr into dependent list + for (BasicBlock *Predecessor : predecessors(BBPred)) { + if (LoopPreheader != Predecessor && !DT->dominates(BBPred, Predecessor)) { + if (BBPostNumbers.end() == BBPostNumbers.find(Predecessor)) { + BBPostNumbers.insert({Predecessor, Depth - 1}); + BBPredecessors.insert(Predecessor); + // Check each terminator is a branch instr. + if (Predecessor->getTerminator() == nullptr || + !isa(Predecessor->getTerminator())) + return false; + // Add branch instruction as dependent instr. + if (Visited.insert(Predecessor->getTerminator()).second) + DependentInsts.insert(Predecessor->getTerminator()); + } + } + } + } + + // Loop preheader is last depend block. + BBPostNumbers.insert({LoopPreheader, -1 * BBPostNumbers.size()}); + + // Update DependentInsts to include instructions that branch instruction + // depends. 
+ for (unsigned j = 0; j < DependentInsts.size(); j++) { + Instruction *Inst = DependentInsts[j]; + if (Inst == nullptr) + return false; + + if (auto *PN = dyn_cast(Inst)) { + if (!IndirectLoadDependents.count(Inst)) { + if (0 > PN->getBasicBlockIndex(LoopPreheader)) + return false; + } + } else if (auto *BranchInstr = dyn_cast(Inst)) { + // Add condition of branch instruction into dependent insts. + if (BranchInstr->isConditional()) { + auto *BranchCond = BranchInstr->getCondition(); + if (BranchCond == nullptr) + return false; + if (Instruction *BranchCondInst = dyn_cast(BranchCond)) + if (Visited.insert(BranchCondInst).second) + DependentInsts.insert(BranchCondInst); + } else if (BranchInstr->getSuccessor(0)->isEHPad()) + return false; + } else if (isa(Inst)) { + return false; + } else { + if (CallInst *Call = dyn_cast(Inst)) + if (Inst->mayReadOrWriteMemory() || !Call->doesNotThrow()) + return false; + // Traverse instruction operands and add dependent instructions till + // function argument, constant or value outside current loop. + for (unsigned i = 0; i < Inst->getNumOperands(); i++) { + Value *Operand = Inst->getOperand(i); + if (Operand == nullptr) + return false; + if (isa(Operand) || isa(Operand)) + continue; + if (Instruction *I = dyn_cast(Operand)) + if (L->contains(I) || I->getParent() == LoopPreheader) + if (Visited.insert(I).second) + DependentInsts.insert(I); + } + } + } + + // Sort dependent instruction based on PostNumber id and instruction ordering + // in the same block. 
+ SmallVector, 8> SortedDependentInsts; + DT->updateDFSNumbers(); + SortedDependentInsts.reserve(DependentInsts.size()); + for (auto I : DependentInsts) { + auto *NodeI = DT->getNode(I->getParent()); + SortedDependentInsts.push_back({I, NodeI->getDFSNumIn()}); + } + llvm::sort(SortedDependentInsts, [&](auto const &LHS, auto const &RHS) { + if (get<0>(RHS)->getParent() == get<0>(LHS)->getParent()) + return get<0>(RHS)->comesBefore(get<0>(LHS)); + if (BBPostNumbers.end() == BBPostNumbers.find(get<0>(LHS)->getParent()) || + BBPostNumbers.end() == BBPostNumbers.find(get<0>(RHS)->getParent())) + return get<1>(RHS) < get<1>(LHS); + if (BBPostNumbers[get<0>(LHS)->getParent()] == + BBPostNumbers[get<0>(RHS)->getParent()]) + return get<1>(RHS) < get<1>(LHS); + return BBPostNumbers[get<0>(LHS)->getParent()] > + BBPostNumbers[get<0>(RHS)->getParent()]; + }); + + // Checking all the BasicBlocks have branch instruction + int BBDepth = 0; + for (auto I : SortedDependentInsts) { + if (BBDepth && get<1>(I) != BBDepth) + if (!isa(get<0>(I)) && + BBPostNumbers.end() != BBPostNumbers.find(get<0>(I)->getParent())) + return false; + BBDepth = get<1>(I); + } + + if (!isa(get<0>(SortedDependentInsts[0]))) + return false; + + if (!L->contains(get<0>(SortedDependentInsts[0]))) + return false; + + if (!isa( + get<0>(SortedDependentInsts[SortedDependentInsts.size() - 1]))) + return false; + else if (auto *PN = dyn_cast( + get<0>(SortedDependentInsts[SortedDependentInsts.size() - 1]))) + if (!ParentLoop->contains(PN)) + return false; + + auto cloneInstructionWithBB = [&](llvm::Instruction *Inst, + llvm::Instruction *NewInstr = nullptr) { + Instruction *TransformedInstr = NewInstr; + if (TransformedInstr == nullptr) + TransformedInstr = Inst->clone(); + + BasicBlock *NewBlock; + BasicBlock *OldBlock = Inst->getParent(); + // Check if block had been created before. 
+ if (BBTransforms.count(OldBlock)) { + NewBlock = BBTransforms[OldBlock]; + } else { + NewBlock = BasicBlock::Create(OldBlock->getContext(), + "prefetch." + OldBlock->getName()); + NewBlock->insertInto(OldBlock->getParent(), LoopPreheader); + if (NewRootBB == nullptr) + NewRootBB = NewBlock; + if (!ParentLoop->contains(NewBlock)) + ParentLoop->addBasicBlockToLoop(NewBlock, *LI); + BBTransforms.insert( + std::pair(OldBlock, NewBlock)); + NewBBlocks.push_back(NewBlock); + } + TransformedInstr->insertInto(NewBlock, NewBlock->end()); + if (NewInstr == nullptr) { + for (unsigned i = 0; i < TransformedInstr->getNumOperands(); i++) { + Value *Operand = TransformedInstr->getOperand(i); + if (Transforms[0].count(Operand)) + TransformedInstr->replaceUsesOfWith(Operand, Transforms[0][Operand]); + } + } + Transforms[0].insert(std::pair(Inst, TransformedInstr)); + AllDependentInsts.push_back(TransformedInstr); + return TransformedInstr; + }; + + // We create block and instructions with topdown manner, e.g. from PHI node in + // the parent loop to target indirect load. + bool PositiveStep = true; + int64_t Step; + while (!SortedDependentInsts.empty()) { + Instruction *DependentInst = get<0>(SortedDependentInsts.pop_back_val()); + Instruction *Inst = dyn_cast(DependentInst); + + // For target load related instruction. + switch (Inst->getOpcode()) { + case Instruction::PHI: { + // For non-root phi node, replace phi node with incoming value. 
+      if (!IndirectLoadDependents.count(Inst)) {
+        if (Transforms[0].count(Inst))
+          continue;
+        auto *PN = dyn_cast(Inst);
+        Transforms[0].insert(std::pair(
+            Inst, PN->getIncomingValueForBlock(LoopPreheader)));
+        break;
+      }
+      // Replace root phi node with following value:
+      //   select((phi + step) < bound, (phi + step), bound)
+      // Get the constant step for the induction phi so we can use it to
+      // calculate how much we should increase the induction for prefetching
+      PHINode *PN = dyn_cast(Inst);
+      Step = getStep(PN, SE);
+      PositiveStep = isPositiveStep(PN, SE);
+      Type *InstType = getStepTypeFromPHINode(PN, SE);
+      if (!PositiveStep)
+        Step = -Step;
+
+      // Make sure phi node is i64 or i32.
+      if (!InstType->isIntegerTy(64) && !InstType->isIntegerTy(32))
+        return false;
+
+      // Create the bound for this PHI if needed:
+      if (!AuxIndBounds.count(PN))
+        getAuxIndVarBound(ParentLoop, PN, NumIterations, AuxIndBounds);
+
+      // Insert the new instruction after all PHI nodes
+      auto InsertionPoint = Inst;
+      if (auto FirstNonPHI = Inst->getParent()->getFirstNonPHI())
+        InsertionPoint = FirstNonPHI->getPrevNode();
+
+      if (Transforms.size() < 1)
+        Transforms.push_back(DenseMap());
+      else if (Transforms[0].count(Inst))
+        continue;
+
+      // FullStep is the initial offset for the new value, taking into account
+      // both Step and the number of iterations ahead to prefetch. If indirect
+      // prefetch iteration ahead is enabled, we directly use the supplied
+      // indirect-prefetch-iters-ahead value.
+      int64_t FullStep = PrefetchIterationsAhead
+                             ? PrefetchIterationsAhead * Step
+                             : ItersAhead * Step;
+
+      Instruction::BinaryOps BiOp =
+          PositiveStep ?
Instruction::Add : Instruction::Sub; + auto *NewOp = Builder.CreateBinOp( + BiOp, Inst, ConstantInt::get(Inst->getType(), FullStep)); + if (auto NewOpInstr = dyn_cast(NewOp)) { + NewOpInstr->moveAfter(InsertionPoint); + InsertionPoint = NewOpInstr; + AllDependentInsts.push_back(NewOpInstr); + } + + Value *NewCmp = Builder.CreateICmp( + PositiveStep ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, NewOp, + AuxIndBounds[cast(Inst)]); + Value *NewSelect = Builder.CreateSelect(NewCmp, NewOp, AuxIndBounds[PN]); + Transforms[0].insert(std::pair(Inst, NewSelect)); + + if (auto NewCmpInstr = dyn_cast(NewCmp)) { + NewCmpInstr->moveAfter(InsertionPoint); + InsertionPoint = NewCmpInstr; + AllDependentInsts.push_back(NewCmpInstr); + } + if (auto NewSelectInstr = dyn_cast(NewSelect)) { + NewSelectInstr->moveAfter(InsertionPoint); + InsertionPoint = NewSelectInstr; + AllDependentInsts.push_back(NewSelectInstr); + } + break; + } + case Instruction::Load: { + LoadInst *LoadI = dyn_cast(Inst); + Value *LoadPtr = LoadI->getPointerOperand(); + auto GeneratePrefetcher = [&](llvm::Value *PrefetchPtr) { + Function *PrefetchFunc = Intrinsic::getDeclaration( + M, Intrinsic::prefetch, LoadPtr->getType()); + Type *I32Ty = + Type::getInt32Ty(CandidateLoad->getParent()->getContext()); + Value *PrefetchArg[] = {PrefetchPtr, ConstantInt::get(I32Ty, 0), + ConstantInt::get(I32Ty, 3), + ConstantInt::get(I32Ty, 1)}; + CallInst *PrefetchCall = CallInst::Create(PrefetchFunc, PrefetchArg); + return PrefetchCall; + }; + + // We clone the intermediate load but prefetch the target load. 
+ if (!SortedDependentInsts.empty()) { + if (Transforms[0].count(LoadI)) + continue; + cloneInstructionWithBB(LoadI); + } else { + CallInst *PrefetchCall = GeneratePrefetcher(Transforms[0][LoadPtr]); + cloneInstructionWithBB(LoadI, PrefetchCall); + InsertedPrefetchCalls.insert(PrefetchCall); + } + break; + } + case Instruction::Br: { + BranchInsts.insert(cloneInstructionWithBB(Inst)); + break; + } + default: { + // For other types of instructions, we make a clone of the instruction and + // replace operands that we already transformed before. + if (Transforms[0].count(Inst)) + continue; + cloneInstructionWithBB(Inst); + break; + } + } + } + + BasicBlock *EndBlock = + BasicBlock::Create(LoopPreheader->getContext(), "prefetch.end"); + ParentLoop->addBasicBlockToLoop(EndBlock, *LI); + EndBlock->insertInto(LoopPreheader->getParent(), LoopPreheader); + + // Create branch from prefetch call block to end block. + for (CallInst *PrefetchCall : InsertedPrefetchCalls) + if (!PrefetchCall->getParent()->getTerminator()) { + AllDependentInsts.push_back( + BranchInst::Create(EndBlock, PrefetchCall->getParent())); + } + + // Checking all the newly created BasicBlock has Terminator instruction. If + // not, considered as incomplete. Delete all new BasicBlocks and return. 
+ for (BasicBlock *BB : NewBBlocks) { + if (BB->getTerminator() == nullptr) { + for (unsigned j = 0; j < AllDependentInsts.size(); j++) { + auto *I = AllDependentInsts[j]; + I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->eraseFromParent(); + } + for (unsigned j = 0; j < NewBBlocks.size(); j++) { + auto *DelBBlock = NewBBlocks[j]; + ParentLoop->removeBlockFromLoop(DelBBlock); + DelBBlock->eraseFromParent(); + } + ParentLoop->removeBlockFromLoop(EndBlock); + EndBlock->eraseFromParent(); + return false; + } + } + + // Updating with branch from Entry to PreHeader to NewRootBB + for (BasicBlock *PredecessorBB : predecessors(LoopPreheader)) { + auto *BrInstr = PredecessorBB->getTerminator(); + for (unsigned i = 0, NumSuccessor = BrInstr->getNumSuccessors(); + i < NumSuccessor; i++) { + auto *OldSuccessor = BrInstr->getSuccessor(i); + if (OldSuccessor == LoopPreheader) { + DTU.applyUpdates( + {{DominatorTree::Delete, PredecessorBB, LoopPreheader}}); + BrInstr->setSuccessor(i, NewRootBB); + DTU.applyUpdates({{DominatorTree::Insert, PredecessorBB, NewRootBB}}); + } + } + } + AllDependentInsts.push_back(BranchInst::Create(LoopPreheader, EndBlock)); + + // Updating with new BasicBlock in all newly created branch instruction. + // Updating DominatorTree for all new BasicBlocks. 
+ for (auto *I : BranchInsts) { + auto *BrInstr = dyn_cast(I); + for (unsigned i = 0, NumSuccessor = BrInstr->getNumSuccessors(); + i < NumSuccessor; i++) { + auto *OldSuccessor = BrInstr->getSuccessor(i); + if (BBTransforms.end() != BBTransforms.find(OldSuccessor)) { + auto *NewSuccessor = BBTransforms[OldSuccessor]; + BrInstr->setSuccessor(i, NewSuccessor); + DTU.applyUpdates( + {{DominatorTree::Insert, BrInstr->getParent(), NewSuccessor}}); + } else { + BrInstr->setSuccessor(i, EndBlock); + DTU.applyUpdates( + {{DominatorTree::Insert, BrInstr->getParent(), EndBlock}}); + } + } + } + + for (CallInst *PrefetchCall : InsertedPrefetchCalls) { + if (!PrefetchCall->getParent()->getTerminator()) { + DTU.applyUpdates( + {{DominatorTree::Insert, PrefetchCall->getParent(), EndBlock}}); + } + } + + auto *InsertPoint = ParentLoop->getLoopPreheader(); + auto *BBTerminator = InsertPoint->getTerminator(); + Instruction *EndPoint = nullptr; + if (InsertPoint) { + for (unsigned j = 0; j < AllDependentInsts.size(); j++) { + auto *I = AllDependentInsts[j]; + if (I->getOpcode() != Instruction::Br) + if (ParentLoop->hasLoopInvariantOperands(I)) { + auto *InvariantInstr = I->clone(); + InvariantInstr->insertInto(InsertPoint, InsertPoint->end()); + EndPoint = InvariantInstr; + I->replaceAllUsesWith(InvariantInstr); + I->eraseFromParent(); + } + } + if (EndPoint) + BBTerminator->moveAfter(EndPoint); + NumOuterLoopPrefetches++; + } + return true; +} + PreservedAnalyses LoopDataPrefetchPass::run(Function &F, FunctionAnalysisManager &AM) { AliasAnalysis *AA = &AM.getResult(F); @@ -967,13 +1532,36 @@ struct Prefetch { bool LoopDataPrefetch::runOnLoop(Loop *L) { bool MadeChange = false; - // Only prefetch in the inner-most loop - if (!L->isInnermost()) + if (L->getLoopDepth() < PrefetchLoopDepth) return MadeChange; + bool IsInnerMost = true; + // Prefetch outer loop if needed. 
+ if (!L->isInnermost()) { + if (OuterLoopPrefetch) + IsInnerMost = false; + else + return MadeChange; + } + SmallPtrSet EphValues; CodeMetrics::collectEphemeralValues(L, AC, EphValues); + CodeMetrics InnerLoopMetrics; + // Calculate the sub loop size when prefetching outer loops. + SmallPtrSet InnerMostBBs; + if (!IsInnerMost) { + for (Loop *LL : L->getSubLoops()) { + // Make sure all sub loops are inner most loop. + if (!LL->isInnermost()) + return MadeChange; + for (const auto BB : LL->blocks()) { + InnerMostBBs.insert(BB); + InnerLoopMetrics.analyzeBasicBlock(BB, *TTI, EphValues); + } + } + } + // Calculate the number of iterations ahead to prefetch CodeMetrics Metrics; bool HasCall = false; @@ -1002,6 +1590,12 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { if (!LoopSize) LoopSize = 1; + // Only prefetch small outer loops with small sub loops. + if (!IsInnerMost) + if (LoopSize - InnerLoopMetrics.NumInsts > 128 || + InnerLoopMetrics.NumInsts > 128) + return MadeChange; + unsigned ItersAhead = getPrefetchDistance() / LoopSize; if (!ItersAhead) ItersAhead = 1; @@ -1016,9 +1610,10 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { unsigned NumMemAccesses = 0; unsigned NumStridedMemAccesses = 0; SmallVector Prefetches; - for (const auto BB : L->blocks()) + for (const auto BB : L->blocks()) { + // If this is not inner most, we avoid prefetching in sub loops. 
for (auto &I : *BB) { - Value *PtrValue; + Value *PtrValue = nullptr; Instruction *MemI; if (LoadInst *LMemI = dyn_cast(&I)) { @@ -1030,6 +1625,11 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { PtrValue = SMemI->getPointerOperand(); } else continue; + if (!PtrValue) + continue; + if (getPrefetchDistance() == 0) + continue; + unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); if (!TTI->shouldPrefetchAddressSpace(PtrAddrSpace)) continue; @@ -1043,6 +1643,11 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { continue; NumStridedMemAccesses++; + // For outer loops, we only prefetch memory instruction with stride + // depending on the current loop. + if (!IsInnerMost && LSCEVAddRec->getLoop() != L) + continue; + // We don't want to double prefetch individual cache lines. If this // access is known to be within one cache line of some other one that // has already been prefetched, then don't prefetch this one as well. @@ -1052,16 +1657,19 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { if (const SCEVConstant *ConstPtrDiff = dyn_cast(PtrDiff)) { int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue()); - if (PD < (int64_t) TTI->getCacheLineSize()) { + int64_t CacheLineSize = + TTI->getCacheLineSize() ? TTI->getCacheLineSize() : CachelineSize; + if (PD < (int64_t)CacheLineSize) { Pref.addInstruction(MemI, DT, PD); DupPref = true; break; } } } - if (!DupPref) + if (!DupPref && !DisableDirectLoadPrefetch) Prefetches.push_back(Prefetch(LSCEVAddRec, MemI)); } + } unsigned TargetMinStride = getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, @@ -1079,15 +1687,18 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { for (auto &P : Prefetches) { // Check if the stride of the accesses is large enough to warrant a - // prefetch. + // prefetch. If MinPrefetchStride <= 1, no need to check if any stride + // goes. 
+ const SCEV *StrideExpr = P.LSCEVAddRec->getStepRecurrence(*SE); if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride)) continue; BasicBlock *BB = P.InsertPt->getParent(); SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr"); - const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr( - SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead), - P.LSCEVAddRec->getStepRecurrence(*SE))); + const SCEV *NextLSCEV = SE->getAddExpr( + P.LSCEVAddRec, + SE->getMulExpr(SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead), + StrideExpr)); if (!SCEVE.isSafeToExpand(NextLSCEV)) continue; @@ -1100,11 +1711,10 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { Type *I32 = Type::getInt32Ty(BB->getContext()); Function *PrefetchFunc = Intrinsic::getDeclaration( M, Intrinsic::prefetch, PrefPtrValue->getType()); - Builder.CreateCall( - PrefetchFunc, - {PrefPtrValue, - ConstantInt::get(I32, P.Writes), - ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)}); + Builder.CreateCall(PrefetchFunc, + {PrefPtrValue, ConstantInt::get(I32, P.Writes), + ConstantInt::get(I32, IsInnerMost ? 3 : 0), + ConstantInt::get(I32, 1)}); ++NumPrefetches; LLVM_DEBUG(dbgs() << " Access: " << *P.MemI->getOperand(isa(P.MemI) ? 0 : 1) @@ -1120,9 +1730,6 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { if (!IndirectLoadPrefetch) return MadeChange; - if (!canDoIndirectPrefetch(L)) - return MadeChange; - // List of valid phi nodes that indirect loads can depend on. SmallPtrSet LoopAuxIndPHINodes; // Map of valid phi node to its bound value in the preheader. @@ -1134,28 +1741,52 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { // List of store instr in the loop. SmallVector LoopStorePtrs; - // Get loop induction and auxilary induction phis. (Thye will be candidates - // for phi node matching during constrution of the candidate instructions.) + // Get loop induction and auxiliary induction phis. 
(They will be candidates
+  // for phi node matching during construction of the candidate instructions.)
   // And we use the phi nodes to determine the loop upperbound.
   Value *NumIterations = getLoopIterationNumber(L, LoopAuxIndPHINodes, AuxIndBounds);
-  if (NumIterations == nullptr)
+  bool PrefetchInOuterLoop = false;
+  if (NumIterations == nullptr) {
+    if (!L->isOutermost()) {
+      NumIterations = getLoopIterationNumber(L->getParentLoop(),
+                                             LoopAuxIndPHINodes, AuxIndBounds);
+      if (NumIterations == nullptr)
+        return MadeChange;
+      PrefetchInOuterLoop = true;
+    } else
+      return MadeChange;
+  }
+
+  if (!RandomAccessPrefetchOnly && !PrefetchInOuterLoop &&
+      !canDoIndirectPrefetch(L)) {
+    cleanLoopIterationNumber(NumIterations);
     return MadeChange;
-  else
-    MadeChange = true;
+  }
   // Find candidate auxiliary induction variables which could be a dependent for
   // the indirect load.
-  for (auto &I : *(L->getHeader()))
+  BasicBlock *Header = nullptr;
+  Loop *CurrentLoop = L;
+  if (PrefetchInOuterLoop) {
+    Header = L->getParentLoop()->getHeader();
+    CurrentLoop = L->getParentLoop();
+  } else {
+    if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
+      return MadeChange;
+    Header = L->getHeader();
+  }
+
+  for (auto &I : *Header)
     if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
       InductionDescriptor IndDesc;
-      if (InductionDescriptor::isInductionPHI(PHI, L, SE, IndDesc) &&
-          L->getInductionVariable(*SE) != PHI) {
-        canGetAuxIndVarBound(L, PHI, LoopAuxIndPHINodes);
+      if (InductionDescriptor::isInductionPHI(PHI, CurrentLoop, SE, IndDesc) &&
+          CurrentLoop->getInductionVariable(*SE) != PHI) {
+        canGetAuxIndVarBound(CurrentLoop, PHI, LoopAuxIndPHINodes);
       }
     }
-  // WIll search for candidates in the parent loop of the current inner most
+  // Will search for candidates in the parent loop of the current inner most
   // loop. This will capture more opportunities in the outer loop. 
SmallVector BBList; for (auto &BB : L->blocks()) @@ -1169,7 +1800,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { } // Iterate through the loop and keep track of the memory loads and the - // instruction list they dependd on. + // instruction list they depend on. for (const auto BB : BBList) { for (auto &I : *BB) if (LoadInst *LoadI = dyn_cast(&I)) { @@ -1179,7 +1810,8 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { InstSet.insert(LoadI); if (findCandidateMemoryLoads(LoadI, InstList, InstSet, CandidateMemoryLoads, DependentInstList, - LoopAuxIndPHINodes, L)) { + LoopAuxIndPHINodes, PrefetchInOuterLoop, + L)) { LLVM_DEBUG(dbgs() << "Found load candidate " << *LoadI << "\n"); CandidateMemoryLoads.push_back(LoadI); DependentInstList.push_back(InstList); @@ -1191,7 +1823,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { } // Keep track of previously transformed instrs for offset load and target - // loads so we can resuse them. + // loads so we can reuse them. SmallVector> Transforms; for (unsigned i = 0; i < CandidateMemoryLoads.size(); i++) { SmallSetVector DependentInsts = DependentInstList[i]; @@ -1215,11 +1847,18 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { } } - // Prefetch all indirect load without conflict to the offset load. + // Prefetch all indirect loads without conflict to the offset load. 
    if (NumLoads == IndirectionLevel && NoConflict) {
-      MadeChange |= insertPrefetcherForIndirectLoad(
-          L, i, NumIterations, CandidateMemoryLoads, DependentInsts,
-          AuxIndBounds, Transforms, ItersAhead);
+      if (PrefetchInOuterLoop) {
+        MadeChange |= insertPrefetcherInOuterloopForIndirectLoad(
+            L, i, NumIterations, CandidateMemoryLoads, DependentInsts,
+            AuxIndBounds, Transforms, ItersAhead);
+        break;
+      } else {
+        MadeChange |= insertPrefetcherForIndirectLoad(
+            L, i, NumIterations, CandidateMemoryLoads, DependentInsts,
+            AuxIndBounds, Transforms, ItersAhead);
+      }
     }
   }
diff --git a/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-crc-outerloop.ll b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-crc-outerloop.ll
new file mode 100644
index 000000000000..afde478f89e7
--- /dev/null
+++ b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-crc-outerloop.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=unknown -passes=loop-data-prefetch --indirect-load-prefetch=true --prefetch-distance=512 --outer-loop-prefetch=true --random-access-prefetch-only=true -disable-direct-prefetch -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64:128-a:0:32-n32-S64"
+target triple = "armv8a-unknown-linux-gnu"
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare i32 @llvm.aarch64.crc32w(i32, i32)
+
+; Function Attrs: mustprogress nofree nosync nounwind willreturn memory(read, inaccessiblemem: none)
+define dso_local arm_aapcscc noundef i32 @_z12matchcolumnsPPiS_ii(ptr nocapture noundef readonly %A, ptr nocapture noundef readnone %key, i32 noundef %index, i32 noundef %count) local_unnamed_addr {
+; CHECK-LABEL: @_z12matchcolumnsPPiS_ii(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds ptr, ptr [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4
+; CHECK-NEXT:    br label 
[[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 [[SUM_1_LCSSA:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[SUM_040:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SUM_1_LCSSA]], [[FOR_COND_CLEANUP4:%.*]] ] +; CHECK-NEXT: [[I_039:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC17:%.*]], [[FOR_COND_CLEANUP4]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[I_039]], 19 +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], 99 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 99 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[A]], i32 [[I_039]] +; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 +; CHECK-NEXT: [[CMP336:%.*]] = icmp sgt i32 [[TMP6]], 0 +; CHECK-NEXT: br i1 [[CMP336]], label [[PREFETCH_FOR_BODY5_PREHEADER:%.*]], label [[FOR_COND_CLEANUP4]] +; CHECK: prefetch.for.body5.preheader: +; CHECK-NEXT: br label [[PREFETCH_FOR_BODY5:%.*]] +; CHECK: prefetch.for.body5: +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP8]], i32 -1) +; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 255 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 [[TMP10]] +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[TMP11]], i32 0, i32 3, i32 1) +; CHECK-NEXT: br label [[PREFETCH_END:%.*]] +; CHECK: prefetch.end: +; CHECK-NEXT: br label [[FOR_BODY5_PREHEADER:%.*]] +; CHECK: for.body5.preheader: +; CHECK-NEXT: br label [[FOR_BODY5:%.*]] +; CHECK: for.cond.cleanup4.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP4]] +; CHECK: for.cond.cleanup4: +; CHECK-NEXT: [[SUM_1_LCSSA]] = phi i32 [ [[SUM_040]], [[FOR_BODY]] ], [ [[ADD:%.*]], [[FOR_COND_CLEANUP4_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[INC17]] = add nuw nsw i32 [[I_039]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC17]], 100 
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK: for.body5: +; CHECK-NEXT: [[J_038:%.*]] = phi i32 [ [[INC15:%.*]], [[IF_END:%.*]] ], [ 0, [[FOR_BODY5_PREHEADER]] ] +; CHECK-NEXT: [[SUM_137:%.*]] = phi i32 [ [[ADD]], [[IF_END]] ], [ [[SUM_040]], [[FOR_BODY5_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds ptr, ptr [[A]], i32 [[J_038]] +; CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 [[I_039]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP13]], i32 -1) +; CHECK-NEXT: [[AND:%.*]] = and i32 [[TMP14]], 255 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 [[AND]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[CMP9_NOT:%.*]] = icmp eq i32 [[TMP15]], [[INDEX:%.*]] +; CHECK-NEXT: br i1 [[CMP9_NOT]], label [[IF_END]], label [[DO_BODY_PREHEADER:%.*]] +; CHECK: do.body.preheader: +; CHECK-NEXT: br label [[DO_BODY:%.*]] +; CHECK: do.body: +; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ [[INC10:%.*]], [[DO_BODY]] ], [ [[J_038]], [[DO_BODY_PREHEADER]] ] +; CHECK-NEXT: [[AKEY_0:%.*]] = phi i32 [ [[INC:%.*]], [[DO_BODY]] ], [ [[AND]], [[DO_BODY_PREHEADER]] ] +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[AKEY_0]], 1 +; CHECK-NEXT: [[INC10]] = add nsw i32 [[J_1]], 1 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[A]], i32 [[INC10]] +; CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[INC]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[CMP13_NOT:%.*]] = icmp eq i32 [[TMP17]], [[INDEX]] +; CHECK-NEXT: br i1 [[CMP13_NOT]], label [[IF_END_LOOPEXIT:%.*]], label [[DO_BODY]] +; CHECK: if.end.loopexit: +; 
CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ [[J_038]], [[FOR_BODY5]] ], [ [[INC10]], [[IF_END_LOOPEXIT]] ] +; CHECK-NEXT: [[B_0:%.*]] = phi ptr [ [[TMP12]], [[FOR_BODY5]] ], [ [[TMP16]], [[IF_END_LOOPEXIT]] ] +; CHECK-NEXT: [[AKEY_1:%.*]] = phi i32 [ [[AND]], [[FOR_BODY5]] ], [ [[INC]], [[IF_END_LOOPEXIT]] ] +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[B_0]], i32 [[AKEY_1]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP18]], [[SUM_137]] +; CHECK-NEXT: [[INC15]] = add nsw i32 [[J_2]], 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp slt i32 [[INC15]], [[TMP6]] +; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_BODY5]], label [[FOR_COND_CLEANUP4_LOOPEXIT]] +; +entry: + br label %for.body + +for.cond.cleanup: + ret i32 %sum.1.lcssa + +for.body: + %sum.040 = phi i32 [ 0, %entry ], [ %sum.1.lcssa, %for.cond.cleanup4 ] + %i.039 = phi i32 [ 0, %entry ], [ %inc17, %for.cond.cleanup4 ] + %arrayidx = getelementptr inbounds ptr, ptr %A, i32 %i.039 + %0 = load ptr, ptr %arrayidx, align 4 + %1 = load i32, ptr %0, align 4 + %cmp336 = icmp sgt i32 %1, 0 + br i1 %cmp336, label %for.body5, label %for.cond.cleanup4 + +for.cond.cleanup4: + %sum.1.lcssa = phi i32 [ %sum.040, %for.body ], [ %add, %if.end ] + %inc17 = add nuw nsw i32 %i.039, 1 + %exitcond.not = icmp eq i32 %inc17, 100 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.body5: + %j.038 = phi i32 [ %inc15, %if.end ], [ 0, %for.body ] + %sum.137 = phi i32 [ %add, %if.end ], [ %sum.040, %for.body ] + %arrayidx6 = getelementptr inbounds ptr, ptr %A, i32 %j.038 + %2 = load ptr, ptr %arrayidx6, align 4 + %arrayidx7 = getelementptr inbounds i32, ptr %2, i32 %i.039 + %3 = load i32, ptr %arrayidx7, align 4 + %4 = tail call i32 @llvm.aarch64.crc32w(i32 %3, i32 -1) + %and = and i32 %4, 255 + %arrayidx8 = getelementptr inbounds i32, ptr %2, i32 %and + %5 = load i32, ptr %arrayidx8, align 4 + %cmp9.not 
= icmp eq i32 %5, %index + br i1 %cmp9.not, label %if.end, label %do.body + +do.body: + %j.1 = phi i32 [ %inc10, %do.body ], [ %j.038, %for.body5 ] + %AKey.0 = phi i32 [ %inc, %do.body ], [ %and, %for.body5 ] + %inc = add nuw nsw i32 %AKey.0, 1 + %inc10 = add nsw i32 %j.1, 1 + %arrayidx11 = getelementptr inbounds ptr, ptr %A, i32 %inc10 + %6 = load ptr, ptr %arrayidx11, align 4 + %arrayidx12 = getelementptr inbounds i32, ptr %6, i32 %inc + %7 = load i32, ptr %arrayidx12, align 4 + %cmp13.not = icmp eq i32 %7, %index + br i1 %cmp13.not, label %if.end, label %do.body + +if.end: + %j.2 = phi i32 [ %j.038, %for.body5 ], [ %inc10, %do.body ] + %B.0 = phi ptr [ %2, %for.body5 ], [ %6, %do.body ] + %AKey.1 = phi i32 [ %and, %for.body5 ], [ %inc, %do.body ] + %arrayidx14 = getelementptr inbounds i32, ptr %B.0, i32 %AKey.1 + %8 = load i32, ptr %arrayidx14, align 4 + %add = add nsw i32 %8, %sum.137 + %inc15 = add nsw i32 %j.2, 1 + %cmp3 = icmp slt i32 %inc15, %1 + br i1 %cmp3, label %for.body5, label %for.cond.cleanup4 +} diff --git a/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch.ll b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch.ll index 7d65952e2a2a..ad7029b6b8c0 100644 --- a/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch.ll +++ b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch.ll @@ -25,17 +25,17 @@ define dso_local void @test(i32 %Num, float* nocapture readonly %TargetArray, i3 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDVARS_IV]], 84 ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 168 -; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[OFFSETARRAY:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[OFFSETARRAY:%.*]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 168 -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, 
ptr [[TEMPARRAY:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TEMPARRAY:%.*]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TEMPARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[UGLYGEP]], i32 0, i32 3, i32 1)
+; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[SCEVGEP]], i32 0, i32 3, i32 1)
 ; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[OFFSETARRAY]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[OFFSETARRAY]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[OFFSETARRAY]], i64 [[TMP3]]
-; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[UGLYGEP1]], i32 0, i32 3, i32 1)
+; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[SCEVGEP1]], i32 0, i32 3, i32 1)
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
 ; CHECK-NEXT:    call void @llvm.prefetch.p0(ptr [[TMP10]], i32 0, i32 3, i32 1)
diff --git a/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll
new file mode 100644
index 000000000000..b4d85c62e18f
--- /dev/null
+++ b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=unknown -passes=loop-data-prefetch --indirect-load-prefetch=true --prefetch-distance=512 --outer-loop-prefetch=true --random-access-prefetch-only=true -disable-direct-prefetch -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+declare i32 @llvm.aarch64.crc32w(i32, i32)
+
+; Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: read) 
uwtable +define dso_local noundef i32 @_Z12matchcolumnsPiiS_ii(ptr nocapture noundef readonly %A, i32 noundef %B, ptr nocapture noundef readonly %Key, i32 noundef %index, i32 noundef %count) { +; CHECK-LABEL: @_Z12matchcolumnsPiiS_ii( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 [[ADD:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV22:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT23:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[SUM_020:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD]], [[IF_END]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDVARS_IV22]], 22 +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i64 [[TMP0]], 99 +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 99 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDVARS_IV22]], 44 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV22]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 4 +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[TMP4]], i32 0, i32 3, i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP6]], i32 -1) +; CHECK-NEXT: [[TMP9:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP7]], i32 -1) +; CHECK-NEXT: [[AND:%.*]] = and i32 [[TMP8]], 255 +; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 255 +; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[AND]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[KEY:%.*]], i64 [[IDXPROM1]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[KEY]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX2]], align 
4 +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[TMP12]], i32 0, i32 3, i32 1) +; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP14]], [[B:%.*]] +; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[IF_END]], label [[DO_BODY_PREHEADER:%.*]] +; CHECK: do.body.preheader: +; CHECK-NEXT: br label [[DO_BODY:%.*]] +; CHECK: do.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DO_BODY]] ], [ [[IDXPROM1]], [[DO_BODY_PREHEADER]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[KEY]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[CMP6_NOT:%.*]] = icmp eq i32 [[TMP15]], [[B]] +; CHECK-NEXT: br i1 [[CMP6_NOT]], label [[IF_END_LOOPEXIT:%.*]], label [[DO_BODY]] +; CHECK: if.end.loopexit: +; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[AKEY_1:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP16]], [[IF_END_LOOPEXIT]] ] +; CHECK-NEXT: [[IDXPROM7:%.*]] = sext i32 [[AKEY_1]] to i64 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM7]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP17]], [[SUM_020]] +; CHECK-NEXT: [[INDVARS_IV_NEXT23]] = add nuw nsw i64 [[INDVARS_IV22]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT23]], 100 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; +entry: + br label %for.body + +for.cond.cleanup: + ret i32 %add + +for.body: + %indvars.iv22 = phi i64 [ 0, %entry ], [ %indvars.iv.next23, %if.end ] + %sum.020 = phi i32 [ 0, %entry ], [ %add, %if.end ] + %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv22 + %0 = load i32, ptr %arrayidx, align 4 + %1 = tail call i32 @llvm.aarch64.crc32w(i32 %0, i32 -1) + %and = and i32 %1, 
255 + %idxprom1 = zext i32 %and to i64 + %arrayidx2 = getelementptr inbounds i32, ptr %Key, i64 %idxprom1 + %2 = load i32, ptr %arrayidx2, align 4 + %cmp3.not = icmp eq i32 %2, %B + br i1 %cmp3.not, label %if.end, label %do.body + +do.body: + %indvars.iv = phi i64 [ %idxprom1, %for.body ], [ %indvars.iv.next, %do.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx5 = getelementptr inbounds i32, ptr %Key, i64 %indvars.iv.next + %3 = load i32, ptr %arrayidx5, align 4 + %cmp6.not = icmp eq i32 %3, %B + br i1 %cmp6.not, label %if.end.loopexit, label %do.body + +if.end.loopexit: + %4 = trunc i64 %indvars.iv.next to i32 + br label %if.end + +if.end: + %AKey.1 = phi i32 [ %and, %for.body ], [ %4, %if.end.loopexit ] + %idxprom7 = sext i32 %AKey.1 to i64 + %arrayidx8 = getelementptr inbounds i32, ptr %A, i64 %idxprom7 + %5 = load i32, ptr %arrayidx8, align 4 + %add = add nsw i32 %5, %sum.020 + %indvars.iv.next23 = add nuw nsw i64 %indvars.iv22, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next23, 100 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} -- Gitee