From 43fda78618cc4774d3123cf1f9732e90246703ed Mon Sep 17 00:00:00 2001 From: eastb233 Date: Tue, 8 Jul 2025 14:49:17 +0800 Subject: [PATCH] [LoopVersionLICM] Move pipeline to benefit from vectorization --- .../llvm/Analysis/TargetTransformInfo.h | 6 +++++ .../llvm/Analysis/TargetTransformInfoImpl.h | 2 ++ llvm/include/llvm/CodeGen/BasicTTIImpl.h | 2 ++ llvm/lib/Analysis/TargetTransformInfo.cpp | 4 ++++ llvm/lib/Passes/PassBuilderPipelines.cpp | 20 +++++++++++++++++ llvm/lib/Target/AArch64/AArch64Subtarget.h | 13 +++++++++++ .../Target/AArch64/AArch64TargetMachine.cpp | 10 --------- .../AArch64/AArch64TargetTransformInfo.cpp | 12 ++++++++++ .../AArch64/AArch64TargetTransformInfo.h | 2 ++ .../Transforms/Scalar/LoopVersioningLICM.cpp | 22 +++++++++++++------ .../LoopVersioningLICM/lvoverlap.ll | 4 ++-- 11 files changed, 78 insertions(+), 19 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 1ae595d21104..7a18a834410d 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -825,6 +825,8 @@ public: /// then/else to before if. bool isProfitableToHoist(Instruction *I) const; + bool isProfitableToLoopVersioning() const; + bool useAA() const; /// Return true if this type is legal. 
@@ -1795,6 +1797,7 @@ public: virtual bool LSRWithInstrQueries() = 0; virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0; virtual bool isProfitableToHoist(Instruction *I) = 0; + virtual bool isProfitableToLoopVersioning() = 0; virtual bool useAA() = 0; virtual bool isTypeLegal(Type *Ty) = 0; virtual unsigned getRegUsageForType(Type *Ty) = 0; @@ -2285,6 +2288,9 @@ public: bool isProfitableToHoist(Instruction *I) override { return Impl.isProfitableToHoist(I); } + bool isProfitableToLoopVersioning() override { + return Impl.isProfitableToLoopVersioning(); + } bool useAA() override { return Impl.useAA(); } bool isTypeLegal(Type *Ty) override { return Impl.isTypeLegal(Ty); } unsigned getRegUsageForType(Type *Ty) override { diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 4ab339956182..14e635127b59 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -325,6 +325,8 @@ public: bool isProfitableToHoist(Instruction *I) const { return true; } + bool isProfitableToLoopVersioning() const { return false; } + bool useAA() const { return false; } bool isTypeLegal(Type *Ty) const { return false; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 383fdd1f4d79..b8b74f6ab278 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -407,6 +407,8 @@ public: return getTLI()->isProfitableToHoist(I); } + bool isProfitableToLoopVersioning() const { return false; } + bool useAA() const { return getST()->useAA(); } bool isTypeLegal(Type *Ty) { diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index c751d174a48a..5b91415bcb37 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -519,6 +519,10 @@ bool 
TargetTransformInfo::isProfitableToHoist(Instruction *I) const { return TTIImpl->isProfitableToHoist(I); } +bool TargetTransformInfo::isProfitableToLoopVersioning() const { + return TTIImpl->isProfitableToLoopVersioning(); +} + bool TargetTransformInfo::useAA() const { return TTIImpl->useAA(); } bool TargetTransformInfo::isTypeLegal(Type *Ty) const { diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 4d92fdd3f1d7..c9f3512da32b 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -105,6 +105,7 @@ #include "llvm/Transforms/Scalar/LoopSink.h" #include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h" #include "llvm/Transforms/Scalar/LoopUnrollPass.h" +#include "llvm/Transforms/Scalar/LoopVersioningLICM.h" #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h" @@ -284,6 +285,10 @@ static cl::opt<AttributorRunOption> AttributorRun( clEnumValN(AttributorRunOption::NONE, "none", "disable attributor runs"))); +static cl::opt<bool> UseLoopVersioningLICM( + "enable-loop-versioning-licm", cl::init(false), cl::Hidden, + cl::desc("Enable the experimental Loop Versioning LICM pass")); + cl::opt<bool> EnableMemProfContextDisambiguation( "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden, cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation")); @@ -1486,6 +1491,21 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, invokeOptimizerEarlyEPCallbacks(MPM, Level); FunctionPassManager OptimizePM; + // Scheduling LoopVersioningLICM when inlining is over, because after that + // we may see more accurate aliasing. Reason to run this late is that too + // early versioning may prevent further inlining due to increase of code + // size. Other optimizations which run later might benefit from the no-alias + // assumption in the cloned loop. 
+ if (UseLoopVersioningLICM) { + OptimizePM.addPass( + createFunctionToLoopPassAdaptor(LoopVersioningLICMPass())); + // LoopVersioningLICM pass might expose new LICM opportunities. + OptimizePM.addPass(createFunctionToLoopPassAdaptor( + LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true), + /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false)); + } + OptimizePM.addPass(Float2IntPass()); OptimizePM.addPass(LowerConstantIntrinsicsPass()); diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 11cee89bfd76..641a94f06ada 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -206,6 +206,19 @@ public: return ARMProcFamily; } + bool isHiSiliconProc() const { + switch (ARMProcFamily) { + case TSV110: + case HIP09: + case HIP10C: + case HIP11: + case HIP12: + return true; + default: + return false; + } + } + bool isXRaySupported() const override { return true; } /// Returns true if the function has a streaming body. diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index f4e07b2dbdac..adbbfee579eb 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -197,10 +197,6 @@ static cl::opt<bool> EnableGISelLoadStoreOptPostLegal( cl::desc("Enable GlobalISel's post-legalizer load/store optimization pass"), cl::init(false), cl::Hidden); -static cl::opt<bool> EnableLoopVersioningLICM( - "aarch64-enable-loop-versioning-licm", cl::init(false), cl::Hidden, - cl::desc("Enable the experimental Loop Versioning LICM pass")); - extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { // Register the target. 
RegisterTargetMachine<AArch64TargetMachine> X(getTheAArch64leTarget()); @@ -571,12 +567,6 @@ void AArch64PassConfig::addIRPasses() { .hoistCommonInsts(true) .sinkCommonInsts(true))); - if (EnableLoopVersioningLICM) { - // Loop needs to be in loop simplify form. - addPass(createLoopSimplifyPass()); - addPass(createLoopVersioningLICMPass()); - } - // Run LoopDataPrefetch // // Run this before LSR to remove the multiplies involved in computing the diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 029b0931eb95..e7927352a5ca 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -46,6 +46,10 @@ static cl::opt<unsigned> NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden); +static cl::opt<bool> + ForceEnableExperimentalOpt("force-enable-experimental-optimization", + cl::init(false), cl::Hidden); + namespace { class TailFoldingOption { // These bitfields will only ever be set to something non-zero in operator=, @@ -454,6 +458,14 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } +bool AArch64TTIImpl::isProfitableToLoopVersioning() const { + // Proven to work well for HiSilicon processors. + // You can experimentally enable the optimization with the option + // -mllvm -force-enable-experimental-optimization if you + // want to test it on other platforms. 
+ return ST->isHiSiliconProc() || ForceEnableExperimentalOpt; +} + InstructionCost AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index d1977a62a76d..be62dabaa8c8 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -96,6 +96,8 @@ public: TTI::TargetCostKind CostKind); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); + bool isProfitableToLoopVersioning() const; + /// @} /// \name Vector TTI Implementations diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 73469e47c4f9..93800d08b789 100644 --- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -124,10 +124,9 @@ struct LoopVersioningLICM { // for versioning. By passing the proxy instead the construction of // LoopAccessInfo will take place only when it's necessary. LoopVersioningLICM(AliasAnalysis *AA, ScalarEvolution *SE, - OptimizationRemarkEmitter *ORE, - LoopAccessInfoManager &LAIs, LoopInfo &LI, - Loop *CurLoop) - : AA(AA), SE(SE), LAIs(LAIs), LI(LI), CurLoop(CurLoop), + TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, + LoopAccessInfoManager &LAIs, LoopInfo &LI, Loop *CurLoop) + : AA(AA), SE(SE), TTI(TTI), LAIs(LAIs), LI(LI), CurLoop(CurLoop), LoopDepthThreshold(LVLoopDepthThreshold), InvariantThreshold(LVInvarThreshold), ORE(ORE) {} @@ -140,6 +139,8 @@ private: // Current ScalarEvolution ScalarEvolution *SE; + TargetTransformInfo *TTI; + // Current Loop's LoopAccessInfo const LoopAccessInfo *LAI = nullptr; @@ -630,7 +631,7 @@ bool LoopVersioningLICM::run(DominatorTree *DT) { // Try loop versioning overlap optimization, if it fails, go through // to the original LoopVersioningLICM. 
- if (LVOverlap) { + if (LVOverlap && TTI->isProfitableToLoopVersioning()) { EnableLVOverlap = true; if (isLegalForVersioning()) { LLVM_DEBUG(dbgs() << " Do Loop Versioning Overlap transformation\n\n"); @@ -722,11 +723,12 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM, AliasAnalysis *AA = &LAR.AA; ScalarEvolution *SE = &LAR.SE; DominatorTree *DT = &LAR.DT; + TargetTransformInfo *TTI = &LAR.TTI; const Function *F = L.getHeader()->getParent(); OptimizationRemarkEmitter ORE(F); LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr); - if (!LoopVersioningLICM(AA, SE, &ORE, LAIs, LAR.LI, &L).run(DT)) + if (!LoopVersioningLICM(AA, SE, TTI, &ORE, LAIs, LAR.LI, &L).run(DT)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); } @@ -744,21 +746,26 @@ public: if (skipLoop(L)) return false; + Function *F = L->getHeader()->getParent(); + AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + TargetTransformInfo *TTI = + &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*F); LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); OptimizationRemarkEmitter *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); LoopAccessInfoManager LAIs(*SE, *AA, *DT, *LI, nullptr); - return LoopVersioningLICM(AA, SE, ORE, LAIs, *LI, L).run(DT); + return LoopVersioningLICM(AA, SE, TTI, ORE, LAIs, *LI, L).run(DT); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); } @@ -774,6 +781,7 @@ INITIALIZE_PASS_BEGIN(LoopVersioningLICMLegacyPass, "loop-versioning-licm", INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 
INITIALIZE_PASS_DEPENDENCY(LoopSimplify) diff --git a/llvm/test/Transforms/LoopVersioningLICM/lvoverlap.ll b/llvm/test/Transforms/LoopVersioningLICM/lvoverlap.ll index 300a230798f7..eddc614116cd 100644 --- a/llvm/test/Transforms/LoopVersioningLICM/lvoverlap.ll +++ b/llvm/test/Transforms/LoopVersioningLICM/lvoverlap.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -S --passes='loop-versioning-licm' -loop-versioning-overlap -debug-only=loop-versioning-licm 2>&1 | FileCheck %s -; REQUIRES: asserts +; RUN: opt < %s -S --passes='loop-versioning-licm' --mcpu=tsv110 -mtriple aarch64-linux-gnu -loop-versioning-overlap -debug-only=loop-versioning-licm 2>&1 | FileCheck %s +; REQUIRES: asserts, aarch64-registered-target ; ; CHECK: Do Loop Versioning Overlap transformation ; -- Gitee