diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index cc31fc79c2de1818f1aa58759edd7117fcf347dd..7303d3ca94b58a24fda24898be40fdb24aa0303b 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -173,10 +173,10 @@ bool sinkRegionForLoopNest(DomTreeNode *, AAResults *, LoopInfo *, /// \p AllowSpeculation is whether values should be hoisted even if they are not /// guaranteed to execute in the loop, but are safe to speculatively execute. bool hoistRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *, - AssumptionCache *, TargetLibraryInfo *, Loop *, - MemorySSAUpdater &, ScalarEvolution *, ICFLoopSafetyInfo *, - SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *, bool, - bool AllowSpeculation); + AssumptionCache *, TargetLibraryInfo *, TargetTransformInfo *, + Loop *, MemorySSAUpdater &, ScalarEvolution *, + ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &, + OptimizationRemarkEmitter *, bool, bool AllowSpeculation); /// Return true if the induction variable \p IV in a Loop whose latch is /// \p LatchBlock would become dead if the exit test \p Cond were removed. 
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 71b567bc7c966055d6121e61449c8cacac423ebe..785afabf7141465db5e5e85b28d85e23bc141715 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -60,6 +60,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -68,6 +69,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" @@ -128,6 +130,10 @@ static cl::opt<bool> DisableMovStoreInsOutsideOfLoopInSigFun( cl::desc( "Disable move store instruction outside of loop in signal function.")); +static cl::opt<bool> EnableHoistCondLoad( + "licm-hoist-cond-load", cl::Hidden, cl::init(true), + cl::desc("Enable hoisting of conditional loads in LICM")); + static cl::opt<unsigned> MaxNumUsesTraversed( "licm-max-num-uses-traversed", cl::Hidden, cl::init(8), cl::desc("Max num uses visited for identifying load " @@ -202,6 +208,13 @@ using PointersAndHasReadsOutsideSet = std::pair<SmallSetVector<Value *, 8>, bool>; static SmallVector<PointersAndHasReadsOutsideSet, 0> collectPromotionCandidates(MemorySSA *MSSA, AliasAnalysis *AA, Loop *L); +static ScalableVectorType *getSVEContainerType(Type *EltTy); + +static bool findConditionalLoad( + LoopInfo *LI, Loop *CurLoop, TargetTransformInfo *TTI, Instruction *I); + +static Instruction *replaceLoadWithLdnf(Instruction *I); + namespace { struct LoopInvariantCodeMotion { bool runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, @@ -456,8 +469,8 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, MSSAU, &SafetyInfo, Flags, ORE); Flags.setIsSink(false); if (Preheader) - Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC,
TLI, L, - MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode, + Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, TTI, + L, MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode, LicmAllowSpeculation); // Now that all loop invariants have been removed from the loop, promote any @@ -855,6 +868,97 @@ public: }; } // namespace +static Instruction *replaceLoadWithLdnf(Instruction *I) { + auto *LoadI = dyn_cast<LoadInst>(I); + auto *PointerOp = LoadI->getPointerOperand(); + const DataLayout &DL = LoadI->getModule()->getDataLayout(); + auto *LITy = LoadI->getType(); + IRBuilder<> B(LoadI); + auto *EltTy = LITy->isPointerTy() ? B.getIntNTy(DL.getPointerSizeInBits()) + : LITy; + auto *SVTy = getSVEContainerType(EltTy); + assert(SVTy && "Unsupported type of load instruction"); + + auto *PredTy = ScalableVectorType::get(B.getInt1Ty(), SVTy->getMinNumElements()); + Value *Imm = B.getInt32(31); + CallInst *Pred = B.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy}, {Imm}); + + Type *PtrTy = LITy->getPointerTo(PointerOp->getType()->getPointerAddressSpace()); + if (PointerOp->getType() != PtrTy) + PointerOp = B.CreateBitCast(PointerOp, PtrTy); + + CallInst *Ldnf = + B.CreateIntrinsic(Intrinsic::aarch64_sve_ldnf1, {SVTy}, {Pred, PointerOp}); + + propagateMetadata(Ldnf, LoadI); + + Value *Scalar = B.CreateExtractElement(Ldnf, B.getInt64(0), "extract"); + if (LITy->isPointerTy()) { + Value *PtrValue = B.CreateIntToPtr(Scalar, PointerType::getUnqual(Scalar->getContext())); + return dyn_cast<Instruction>(PtrValue); + } + return dyn_cast<Instruction>(Scalar); +} + +static ScalableVectorType *getSVEContainerType(Type *EltTy) { + if (EltTy == Type::getDoubleTy(EltTy->getContext())) + return ScalableVectorType::get(EltTy, 2); + + if(EltTy == Type::getFloatTy(EltTy->getContext())) + return ScalableVectorType::get(EltTy, 4); + + if(EltTy == Type::getBFloatTy(EltTy->getContext())) + return ScalableVectorType::get(EltTy, 8); + + if(EltTy == Type::getHalfTy(EltTy->getContext())) + return
ScalableVectorType::get(EltTy, 8); + + if(EltTy == Type::getInt64Ty(EltTy->getContext())) + return ScalableVectorType::get(EltTy, 2); + + if(EltTy == Type::getInt32Ty(EltTy->getContext())) + return ScalableVectorType::get(EltTy, 4); + + if(EltTy == Type::getInt16Ty(EltTy->getContext())) + return ScalableVectorType::get(EltTy, 8); + + if(EltTy == Type::getInt8Ty(EltTy->getContext())) + return ScalableVectorType::get(EltTy, 16); + + return nullptr; +} + +static bool +findConditionalLoad(LoopInfo *LI, Loop *CurLoop, TargetTransformInfo *TTI, + Instruction *I) { + auto LoadI = dyn_cast<LoadInst>(I); + if (!LoadI) + return false; + + if (LoadI->isAtomic() || LoadI->isVolatile()) + return false; + + Module *M = LoadI->getModule(); + Triple TargetTriple(M->getTargetTriple()); + if (!TargetTriple.isAArch64() || !TTI->supportsScalableVectors()) + return false; + + // TODO: add support for vector type + if (LoadI->getType()->isVectorTy()) + return false; + + auto PointerOp = LoadI->getPointerOperand(); + if (isa<GlobalValue>(PointerOp)) + return true; + + auto *PI = dyn_cast<Instruction>(PointerOp); + if (!PI || CurLoop->contains(PI)) + return false; + + return true; +} + + /// Walk the specified region of the CFG (defined by all blocks dominated by /// the specified block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. 
This allows us to visit definitions before @@ -862,9 +966,9 @@ public: /// bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, - TargetLibraryInfo *TLI, Loop *CurLoop, - MemorySSAUpdater &MSSAU, ScalarEvolution *SE, - ICFLoopSafetyInfo *SafetyInfo, + TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + Loop *CurLoop, MemorySSAUpdater &MSSAU, + ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE, bool LoopNestMode, bool AllowSpeculation) { @@ -893,6 +997,11 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, continue; for (Instruction &I : llvm::make_early_inc_range(*BB)) { + bool SafeHoist = + isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo, ORE, + Preheader->getTerminator(), AC, + AllowSpeculation); + // Try hoisting the instruction out to the preheader. We can only do // this if all of the operands of the instruction are loop invariant and // if it is safe to hoist the instruction. We also check block frequency @@ -902,9 +1011,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, // to that block. 
if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE) && - isSafeToExecuteUnconditionally( - I, DT, TLI, CurLoop, SafetyInfo, ORE, - Preheader->getTerminator(), AC, AllowSpeculation)) { + SafeHoist) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, MSSAU, SE, ORE); HoistedInstructions.push_back(&I); @@ -912,6 +1019,29 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, continue; } + if (!SafeHoist) { + if (findConditionalLoad(LI, CurLoop, TTI, &I)) { + LLVM_DEBUG(dbgs() << "LICM: find the conditional load: " << I << "\n"); + if (CurLoop->hasLoopInvariantOperands(&I) && + canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE) && + (getSVEContainerType(I.getType()) || I.getType()->isPointerTy())) { + hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, + MSSAU, SE, ORE); + // Replace hoisted load with @llvm.aarch64.sve.ldnf1.* + Instruction *ExtractI = replaceLoadWithLdnf(&I); + assert(ExtractI && "Failed to create ldnf1 to replace load"); + if (ExtractI) { + LLVM_DEBUG(dbgs() << "LICM: replaced with ldnf1: " << *ExtractI << "\n"); + I.replaceAllUsesWith(ExtractI); + eraseInstruction(I, *SafetyInfo, MSSAU); + + Changed = true; + continue; + } + } + } + } + // Attempt to remove floating point division out of the loop by // converting it to a reciprocal multiplication. if (I.getOpcode() == Instruction::FDiv && I.hasAllowReciprocal() &&