diff --git a/acpo/model-recipe.acpo b/acpo/model-recipe.acpo new file mode 100644 index 0000000000000000000000000000000000000000..5570ff72cdc0f9bcfff9b6cdf582744dc890b99a --- /dev/null +++ b/acpo/model-recipe.acpo @@ -0,0 +1 @@ +annotation2metadata,forceattrs,inferattrs,coro-early,cgscc(dse,function(loop-simplify,lcssa,coro-elide,simplifycfg,instcombine,reassociate),function-attrs,function(require),coro-split,function(invalidate)),deadargelim,coro-cleanup,globalopt,globaldce,elim-avail-extern,rpo-function-attrs,recompute-globalsaa,ipsccp,function(float2int,lower-constant-intrinsics),constmerge,cg-profile,rel-lookup-table-converter,ir-library-injection,globalopt,cgscc(devirt<4>(inline,inline,move-auto-init,argpromotion,openmp-opt-cgscc,function(sroa,speculative-execution,tailcallelim,loop-mssa(licm,simple-loop-unswitch),loop(loop-idiom,indvars,loop-deletion),loop-unroll,early-cse<>,callsite-splitting,sroa,early-cse,speculative-execution,jump-threading,correlated-propagation,lower-expect,simplifycfg,instcombine,aggressive-instcombine,libcalls-shrinkwrap,tailcallelim,simplifycfg,reassociate))),cgscc(dse,function(loop-simplify,lcssa,coro-elide,simplifycfg,instcombine,reassociate),function-attrs,function(require),coro-split,function(invalidate)),deadargelim,coro-cleanup,globalopt,globaldce,elim-avail-extern,rpo-function-attrs,recompute-globalsaa,ipsccp,function(float2int,lower-constant-intrinsics),constmerge,cg-profile,rel-lookup-table-converter,ir-library-injection,function(sroa,gvn-hoist,mldst-motion,gvn,sccp,bdce,instcombine,jump-threading,correlated-propagation,adce,memcpyopt),function(loop-simplify,lcssa,crypto,chr,loop(loop-rotate,loop-deletion),annotation-remarks,constraint-elimination,mem2reg,instcombine,loop-simplify,lcssa,indvars,loop-deletion,loop-simplify,lcssa,loop-instsimplify,loop-simplifycfg,function(loop-mssa(licm)),simple-loop-unswitch,simplifycfg,instcombine),require,function(invalidate),require,function(loop-simplify,lcssa,loop(loop-idiom,loop-deletio
n,loop-unroll-full),loop-data-prefetch,hash-data-prefetch,separate-const-offset-from-gep),verify \ No newline at end of file diff --git a/build.sh b/build.sh index 549998059c79cc69d2f15c1c30b0c597df2ba7c4..4a1e7ba516765a15eeaea932f177e2613afed63c 100755 --- a/build.sh +++ b/build.sh @@ -406,6 +406,7 @@ cmake $CMAKE_OPTIONS \ -DLIBCXX_STATICALLY_LINK_ABI_IN_STATIC_LIBRARY=ON \ -DLIBCXX_ENABLE_ABI_LINKER_SCRIPT=ON \ -DLIBOMP_INSTALL_ALIASES=OFF \ + -DENABLE_ACPO=$enable_acpo \ $llvm_binutils_incdir \ $verbose \ ../llvm diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index d492b8681c5da847f5d6dd7b9268131b7f994162..518e5d476b083fd0269dfbdaefa98c1b37f252aa 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -212,11 +212,15 @@ CODEGENOPT(ObjCAvoidHeapifyLocalBlocks, 1, 0) VALUE_CODEGENOPT(OptimizationLevel, 2, 0) ///< The -O[0-3] option specified. VALUE_CODEGENOPT(OptimizeSize, 2, 0) ///< If -Os (==1) or -Oz (==2) is specified. +CODEGENOPT(AI4CRecipe, 1, 0) // Enable AI4C Phase ordering +CODEGENOPT(AI4CRecipeVerbose, 1, 0) // Enable AI4C Phase ordering verbose mode + CODEGENOPT(AtomicProfileUpdate , 1, 0) ///< Set -fprofile-update=atomic /// Choose profile instrumenation kind or no instrumentation. ENUM_CODEGENOPT(ProfileInstr, ProfileInstrKind, 2, ProfileNone) /// Choose profile kind for PGO use compilation. ENUM_CODEGENOPT(ProfileUse, ProfileInstrKind, 2, ProfileNone) +CODEGENOPT(AI4CAnalysis, 1, 0) ///< Enable AI4C Analysis /// Partition functions into N groups and select only functions in group i to be /// instrumented. Selected group numbers can be 0 to N-1 inclusive. 
VALUE_CODEGENOPT(ProfileTotalFunctionGroups, 32, 1) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index b8971182ae760dd41808df5bd7c52fc33e62bd99..7031dca091b3cbf41c06edc07f04209bc85a7f2c 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1421,6 +1421,12 @@ def fcs_profile_generate : Flag<["-"], "fcs-profile-generate">, def fcs_profile_generate_EQ : Joined<["-"], "fcs-profile-generate=">, Group, Flags<[CoreOption]>, MetaVarName<"">, HelpText<"Generate instrumented code to collect context sensitive execution counts into /default.profraw (overridden by LLVM_PROFILE_FILE env var)">; +def fai4c_recipe: Flag<["-"], "fai4c-recipe">, Group, + Flags<[CC1Option]>, MarshallingInfoFlag>; +def fai4c_recipe_verbose: Flag<["-"], "fai4c-recipe-verbose">, Group, + Flags<[CC1Option]>, MarshallingInfoFlag>; +def fai4c_analysis: Flag<["-"], "fai4c-analysis">, Group, + Flags<[CC1Option]>,MarshallingInfoFlag>; def fprofile_use : Flag<["-"], "fprofile-use">, Group, Flags<[CoreOption]>, Alias; def fprofile_use_EQ : Joined<["-"], "fprofile-use=">, diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index cef5e0d16ba7aa621a966bd6e24be7e918c0f7f2..da287a1935bb89e91a6c6f799634a04007460bbe 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -39,6 +39,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Object/OffloadBinary.h" +#include "llvm/Passes/OptimizationLevel.h" #include "llvm/Passes/PassBuilder.h" #include "llvm/Passes/PassPlugin.h" #include "llvm/Passes/StandardInstrumentations.h" @@ -46,6 +47,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/Process.h" #include "llvm/Support/TimeProfiler.h" #include "llvm/Support/Timer.h" #include "llvm/Support/ToolOutputFile.h" @@ -79,8 +81,10 @@ #include 
"llvm/Transforms/Utils/Debugify.h" #include "llvm/Transforms/Utils/EntryExitInstrumenter.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include #include #include +#include using namespace clang; using namespace llvm; @@ -847,6 +851,10 @@ void EmitAssemblyHelper::RunOptimizationPipeline( PTO.CallGraphProfile = !CodeGenOpts.DisableIntegratedAS; PTO.UnifiedLTO = CodeGenOpts.UnifiedLTO; + PTO.AI4CAnalysis = CodeGenOpts.AI4CAnalysis; + PTO.AI4CRecipe = CodeGenOpts.AI4CRecipe; + PTO.AI4CRecipeVerbose = CodeGenOpts.AI4CRecipeVerbose; + LoopAnalysisManager LAM; FunctionAnalysisManager FAM; CGSCCAnalysisManager CGAM; @@ -1041,10 +1049,78 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (auto Err = PB.parsePassPipeline(MPM, PassPipeline)) errs() << "AutoTuner: cannot add pass:" << toString(std::move(Err)) - << "\n"; + << '\n'; + } + } + + // Use pass order define by ACPO recipe library instead (w/ clang frontend clang -O3 -fai4c-recipe) + // 1. Clang arguments passed before are intact in the compilation pipeline. + // 2. If necessary, Using opt, supplimentary passes should be added with -passes="PASSES". + // 3. 
Optimization levels should be set at non -O0 when compiling with clang, e.g.: + // "clang -O3 -mllvm -use-acpo-bw-model -f" + if (PTO.AI4CRecipe) { + if (PTO.AI4CRecipeVerbose) { + errs() << "AI4C: Module: " << TheModule->getName() << " - ACPO Phase-ordering recipes are activated: 724-DADCB\n"; + errs() << "AI4C: Optimization is set at " << Level.getSpeedupLevel() << '\n'; + } + std::string ACPORecipeModelDir; + std::string PassPipeline = ""; + std::string ACPOPipeline = ""; + + if (std::optional Path = llvm::sys::Process::GetEnv("BISHENG_ACPO_DIR")) { + ACPORecipeModelDir = *Path; + } else { + ACPORecipeModelDir = *Path; + } + + std::string FileName = ACPORecipeModelDir + "/model-recipe.acpo"; + if (PTO.AI4CRecipeVerbose) { + errs() << "AI4C-Recipe : Reading custom recipe from " << FileName << '\n'; + } + std::ifstream file(FileName); + if (file.is_open()) { + // For now, we only support the custom recipe at the 1st line. + // This can be later be expanded into an (APP A, Line X) pair. 
+ std::string line = ""; + if (std::getline(file, line)) { + ACPOPipeline = line; + } + file.close(); + } else { + if (PTO.AI4CRecipeVerbose) { + errs() << "Could not open recipe file, reading hardcoded recipes instead:\n"; + } + ACPOPipeline = "annotation2metadata,forceattrs,inferattrs,coro-early,cgscc(dse,function(loop-simplify,lcssa,coro-elide,simplifycfg,instcombine,reassociate),function-attrs,function(require),coro-split,function(invalidate)),deadargelim,coro-cleanup,globalopt,globaldce,elim-avail-extern,rpo-function-attrs,recompute-globalsaa,ipsccp,function(float2int,lower-constant-intrinsics),constmerge,cg-profile,rel-lookup-table-converter,ir-library-injection,globalopt,cgscc(devirt<4>(inline,inline,move-auto-init,argpromotion,openmp-opt-cgscc,function(sroa,speculative-execution,tailcallelim,loop-mssa(licm,simple-loop-unswitch),loop(loop-idiom,indvars,loop-deletion),loop-unroll,early-cse<>,callsite-splitting,sroa,early-cse,speculative-execution,jump-threading,correlated-propagation,lower-expect,simplifycfg,instcombine,aggressive-instcombine,libcalls-shrinkwrap,tailcallelim,simplifycfg,reassociate))),cgscc(dse,function(loop-simplify,lcssa,coro-elide,simplifycfg,instcombine,reassociate),function-attrs,function(require),coro-split,function(invalidate)),deadargelim,coro-cleanup,globalopt,globaldce,elim-avail-extern,rpo-function-attrs,recompute-globalsaa,ipsccp,function(float2int,lower-constant-intrinsics),constmerge,cg-profile,rel-lookup-table-converter,ir-library-injection,function(sroa,gvn-hoist,mldst-motion,gvn,sccp,bdce,instcombine,jump-threading,correlated-propagation,adce,memcpyopt),function(loop-simplify,lcssa,crypto,chr,loop(loop-rotate,loop-deletion),annotation-remarks,constraint-elimination,mem2reg,instcombine,loop-simplify,lcssa,indvars,loop-deletion,loop-simplify,lcssa,loop-instsimplify,loop-simplifycfg,function(loop-mssa(licm)),simple-loop-unswitch,simplifycfg,instcombine),require,function(invalidate),require,function(loop-simplify,lcssa,loop(loop-i
diom,loop-deletion,loop-unroll-full),loop-data-prefetch,hash-data-prefetch,separate-const-offset-from-gep),verify"; + } + if (IsThinLTO || IsLTO) { + // This pass removes available external global defs from the module + // turning them into declerations, so we remove it from the pipeline in LTO + std::string RemovePass = ",elim-avail-extern"; + size_t Pos = ACPOPipeline.find(RemovePass); + int Counter = 0; + while (Pos != std::string::npos) { + Counter++; + if (PTO.AI4CRecipeVerbose) { + errs() << "AI4C: removed " << Counter << " elim-avail-exterm instances.\n"; + } + ACPOPipeline.erase(Pos, RemovePass.length()); + Pos = ACPOPipeline.find(RemovePass, Pos); + } + } + PassPipeline = ACPOPipeline; + if (PTO.AI4CRecipeVerbose) { + errs() << "AI4C final passpipeline: " << PassPipeline << "\n"; + } + + if (auto Err = PB.parsePassPipeline(MPM, PassPipeline)) + errs() << "ACPO Pass Ordering Recipe: cannot add pass:" << toString(std::move(Err)) << '\n'; + + if (IsThinLTO || IsLTO) { + MPM.addPass(PB.addAutoTunerLTOPreLinkPasses()); } } - if (!Changed) { + + if (!Changed && !PTO.AI4CRecipe) { #endif if (IsThinLTO || (IsLTO && CodeGenOpts.UnifiedLTO)) { MPM = PB.buildThinLTOPreLinkDefaultPipeline(Level); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 55c40bdc793f37247eb176a2fb5f50f1e7eeade6..911daa9c8b05dd1ddae145506dd47e706f8fc615 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6061,6 +6061,12 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_finstrument_functions_after_inlining, options::OPT_finstrument_function_entry_bare); + if (Args.hasArg(options::OPT_fai4c_analysis)) CmdArgs.push_back("-fai4c-analysis"); + + if (Args.hasArg(options::OPT_fai4c_recipe)) { + CmdArgs.push_back("-fai4c-recipe"); + } + // NVPTX/AMDGCN doesn't support PGO or coverage. 
There's no runtime support // for sampling, overhead of call arc collection is way too high and there's // no way to collect the output. diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 04d0f1c9f7a70aea0a7e178036174ac208daa02b..d3c6bf6b5530328144cd7cceb093995a4c8d473d 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1650,6 +1650,9 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, OptimizationLevel = MaxOptLevel; } Opts.OptimizationLevel = OptimizationLevel; + Opts.AI4CRecipe = Args.hasArg(options::OPT_fai4c_recipe); + Opts.AI4CRecipeVerbose = Args.hasArg(options::OPT_fai4c_recipe_verbose); + Opts.AI4CAnalysis = Args.hasArg(options::OPT_fai4c_analysis); // The key paths of codegen options defined in Options.td start with // "CodeGenOpts.". Let's provide the expected variable name and type. diff --git a/llvm/include/llvm/Analysis/ACPOBWModel.h b/llvm/include/llvm/Analysis/ACPOBWModel.h new file mode 100644 index 0000000000000000000000000000000000000000..8070eb47de5bd3fedeaa1a18a245658634d3e282 --- /dev/null +++ b/llvm/include/llvm/Analysis/ACPOBWModel.h @@ -0,0 +1,39 @@ +//===- ACPOBWModel.h - ACPO Branch weight inference -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. 
+// +//==-----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_ACPOBWMODEL_H +#define LLVM_ANALYSIS_ACPOBWMODEL_H + +#include "llvm/Analysis/ACPOModel.h" + +namespace llvm { + +class ACPOBWModel : public ACPOModel { +public: + ACPOBWModel(LLVMContext *Context, OptimizationRemarkEmitter *OER); + ~ACPOBWModel(); + void setMLCustomFeatures( + std::vector> FeatureValues); + +protected: + // Interface to run the MLInference/default advisor and get advice from + // model/default advisor + virtual std::unique_ptr getAdviceML() override; + + virtual std::unique_ptr getAdviceNoML() override; + +private: + std::vector> CustomFeatureValues; + uint64_t BranchWeight; +}; + +} // end namespace llvm + +#endif // LLVM_ANALYSIS_ACPOBWMODEL_H \ No newline at end of file diff --git a/llvm/include/llvm/Analysis/ACPOCollectFeatures.h b/llvm/include/llvm/Analysis/ACPOCollectFeatures.h index 8b266b3bc756d53b15ea503462f953a0ac163f1c..0c44b1cee6e4b185f2e99d67ae0e57a10aeb0bcc 100644 --- a/llvm/include/llvm/Analysis/ACPOCollectFeatures.h +++ b/llvm/include/llvm/Analysis/ACPOCollectFeatures.h @@ -38,9 +38,12 @@ public: enum class Scope { Module, Function, + BasicBlock, + Edge, Loop, Callgraph, CallSite, + MemOpt, NumOfScope, }; @@ -56,6 +59,9 @@ public: HotColdCallSite, InlineCostFeatureGroup, ACPOFIExtendedFeatures, + BasicBlockFeatures, + EdgeFeatures, + MemOptFeatures, NumOfGroupID }; @@ -147,6 +153,45 @@ public: ACPOFIExtendedFeaturesFloatFeatureEnd, // End: ACPOFIExtendedFeatures + // Begin: BasicBlockFeatures + NumSuccessors, + NumInstrs, + NumCriticalEdges, + HighestNumInstrsInSucc, + SuccNumWithHighestNumInstrs, + IsBranchInst, + IsSwitchInst, + IsIndirectBrInst, + IsInvokeInst, + IsCallBrInst, + IsFirstOpPtr, + IsSecondOpNull, + IsSecondOpConstant, + IsEqCmp, + IsNeCmp, + IsGtCmp, + IsLtCmp, + IsGeCmp, + IsLeCmp, + IsIVCmp , + IsBBInLoop , + IsFirstSuccInLoop , + IsSecondSuccInLoop, + // BBName, + // End: BasicBlockFeatures + + // 
Begin: EdgeFeatures + DestNumSuccessors, + DestNumInstrs , + DestNumCriticalEdges , + DestIsBranchInst , + DestIsSwitchInst , + DestIsIndirectBrInst , + DestIsInvokeInst , + DestIsCallBrInst , + DestSuccNumber , + // End: EdgeFeatures + CallerBlockFreq, CallSiteHeight, ConstantParam, @@ -159,6 +204,20 @@ public: IsInInnerLoop, IsMustTailCall, IsTailCall, + + // Begin: MemOptFeatures + NumInst, + NumPhis, + NumCalls, + NumLoads, + NumStores, + NumPreds, + NumSuccs, + EndsWithUnreachable, + EndsWithReturn, + EndsWithCondBranch, + EndsWithBranch, + // End: MemOptFeatures NumOfFeatures }; @@ -175,6 +234,7 @@ public: BasicBlock *BB = nullptr; Module *M = nullptr; Loop *L = nullptr; + BasicBlock *DestBB = nullptr; // Can add Instructions or other types later. }; diff --git a/llvm/include/llvm/Analysis/ModelDataCollector.h b/llvm/include/llvm/Analysis/ModelDataCollector.h index ad3fc476a9b2576959d1133c3615439233725f08..515c767e405a856baf82b36b4fa6f897608e0da8 100644 --- a/llvm/include/llvm/Analysis/ModelDataCollector.h +++ b/llvm/include/llvm/Analysis/ModelDataCollector.h @@ -44,6 +44,8 @@ public: void addFeatures(std::vector> NewFeatures); + void setOutput(std::string Output) { Out << Output; } + // Print out the features void printRow(bool printHeader = false); diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h index fdb407263787f694eaf2f331b06f89755b63517f..fd48af04bab68271bc0ed2de0bc997f3bf33cfe1 100644 --- a/llvm/include/llvm/Passes/PassBuilder.h +++ b/llvm/include/llvm/Passes/PassBuilder.h @@ -92,6 +92,15 @@ public: // analyses after various module->function or cgscc->function adaptors in the // default pipelines. bool EagerlyInvalidateAnalyses; + + // AI4C Option to optimize workloads + // 1. AI4CRecipe: Leveraging ACPO Recipe library to derive phase ordering + // 2. AI4CAnalysis + // 3. AI4CBW: + // 4. 
AI4CAV + bool AI4CRecipe = false; + bool AI4CRecipeVerbose = false; + bool AI4CAnalysis = false; }; /// This class provides access to building LLVM's passes. @@ -292,6 +301,8 @@ public: ModulePassManager buildO0DefaultPipeline(OptimizationLevel Level, bool LTOPreLink = false); + ModulePassManager addAutoTunerLTOPreLinkPasses(); + /// Build the default `AAManager` with the default alias analysis pipeline /// registered. /// @@ -633,6 +644,15 @@ private: ThinOrFullLTOPhase LTOPhase, IntrusiveRefCntPtr FS); + void addAI4CRelatedPassesForO0(ModulePassManager &MPM); + + void addAI4CRelatedPasses(ModulePassManager &MPM, + OptimizationLevel Level, + ThinOrFullLTOPhase LOTPhase); + + void addACPOBWPasses(ModulePassManager &PMP, OptimizationLevel Level, + ThinOrFullLTOPhase LTOPhase, bool skipPreInline); + // Extension Point callbacks SmallVector, 2> PeepholeEPCallbacks; diff --git a/llvm/include/llvm/Transforms/Instrumentation/ACPOAI4CFHModel.h b/llvm/include/llvm/Transforms/Instrumentation/ACPOAI4CFHModel.h new file mode 100644 index 0000000000000000000000000000000000000000..798e7d220fb7c9c663111ca788df77c8e8850858 --- /dev/null +++ b/llvm/include/llvm/Transforms/Instrumentation/ACPOAI4CFHModel.h @@ -0,0 +1,31 @@ +#ifndef LLVM_TRANSFORMS_ACPOAI4CFHMODEL_H +#define LLVM_TRANSFORMS_ACPOAI4CFHMODEL_H + +#include "llvm/Analysis/ACPOBWModel.h" +#include "llvm/Analysis/DumpFeature.h" +#include "llvm/Analysis/FunctionPropertiesAnalysis.h" +#include "llvm/Analysis/InlineSizeEstimatorAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include + +namespace llvm { +class ACPOAI4CFHModel : public ACPOModel { +public: + ACPOAI4CFHModel(LLVMContext *Context, OptimizationRemarkEmitter *ORE); + ~ACPOAI4CFHModel(); + void setMLCustomFeatures(std::vector> FeatureValues); + static void clearCache(); +protected: + // Interface to run the MLInterface/default advisor and get advice from the + // 
odel/default advisor + virtual std::unique_ptr getAdviceML() override; + virtual std::unique_ptr getAdviceNoML() override; +private: + std::vector> CustomFeatureValues; + int64_t Hotness = 0; +}; +} // end namespace llvm + +#endif \ No newline at end of file diff --git a/llvm/include/llvm/Transforms/Instrumentation/ACPOAI4CMEMOPModel.h b/llvm/include/llvm/Transforms/Instrumentation/ACPOAI4CMEMOPModel.h new file mode 100644 index 0000000000000000000000000000000000000000..c563eec5a404ff2f25e67921ef7772eaab9e3a67 --- /dev/null +++ b/llvm/include/llvm/Transforms/Instrumentation/ACPOAI4CMEMOPModel.h @@ -0,0 +1,31 @@ +#ifndef LLVM_TRANSFORMS_ACPOAI4CMEMOPMODEL_H +#define LLVM_TRANSFORMS_ACPOAI4CMEMOPMODEL_H + +#include "llvm/Analysis/ACPOBWModel.h" +#include "llvm/Analysis/DumpFeature.h" +#include "llvm/Analysis/FunctionPropertiesAnalysis.h" +#include "llvm/Analysis/InlineAdvisor.h" +#include "llvm/Analysis/InlineSizeEstimatorAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include + +namespace llvm { +class ACPOAI4CMEMOPModel : public ACPOModel { +public: + ACPOAI4CMEMOPModel(LLVMContext *Context, OptimizationRemarkEmitter *ORE); + ~ACPOAI4CMEMOPModel(); + void setMLCustomFeatures(std::vector> FeatureValues); + static void clearCache(); +protected: + // Interface to run the MLInference/default advisor and get advice from the + // model/default advisor + virtual std::unique_ptr getAdviceML() override; + virtual std::unique_ptr getAdviceNoML() override; +private: + std::vector> CustomFeatureValues; + int64_t ShouldOPT = 0; +}; +} // end namespace llvm +#endif diff --git a/llvm/include/llvm/Transforms/Instrumentation/AI4CAnalysis.h b/llvm/include/llvm/Transforms/Instrumentation/AI4CAnalysis.h new file mode 100644 index 0000000000000000000000000000000000000000..fb0206bbbc6653949dbd0becd5158c7a94b02aae --- /dev/null +++ b/llvm/include/llvm/Transforms/Instrumentation/AI4CAnalysis.h 
#ifdef LLVM_HAVE_TF_AOT_AI4CFHCOMPILEDMODEL

#ifndef LLVM_ANALYSIS_AI4CFHMODELRUNNER_H
#define LLVM_ANALYSIS_AI4CFHMODELRUNNER_H

#include "llvm/Analysis/AI4CFHCompiledModel.h"
#include "llvm/Analysis/AOTModelRunner.h"

namespace llvm {
/// AOT model runner for the AI4C function-hotness (FH) model.
/// Each feature is standardized with pre-computed training-set means and
/// scales before being written into the compiled model's input tensor.
class AI4CFHModelRunner : public AOTModelRunner<AI4CFHCompiledModel> {
  // Per-feature standardization constants (training-set column means).
  std::vector<float> Means = {1.465734698027552, 1.4197828709288298,
                              3.3613992762364293, 0.0,
                              0.1746682750301568, 6.631604342581423,
                              12.014957780458383, 56.88612786489747,
                              0.2858866103739445, 2.3667068757539202,
                              5.74477117323329, 0.436670687575392,
                              11.69384800965018, 0.0,
                              11.710977080820266, 5.822677925211098,
                              1.7367913148371532, 4.858624849215923,
                              0.3968636911942099, 0.0,
                              1.389384800965018, 0.014234016887816647,
                              1.918455971049457, 0.3884197828709288,
                              9008478354872.904, 35479107656.01327,
                              0.9443126047764814, 0.0,
                              0.2858866103739445, 13.500092101470985};
  // Per-feature standardization constants (training-set column std-devs).
  std::vector<float> Scales = {5.4790464827052485, 0.9966866464997833,
                               4.576085734290281, 1.0,
                               0.379683116201057, 17.919229448736246,
                               25.571183422373167, 129.11724994974335,
                               0.45183565196079006, 4.186387428100155,
                               19.723929700921122, 1.0048886603298528,
                               27.891191708844115, 1.0,
                               24.802290598830172, 16.219667195304115,
                               5.212905932089306, 11.682770650628337,
                               1.0872835783415236, 1.0,
                               4.221820876751944, 0.11845425130004923,
                               10.538761107164044, 1.9269308552742552,
                               577601301282448.0, 493848252785.81384,
                               0.6107184002557943, 1.0,
                               0.45183565196079006, 62.50548300088351};

public:
  // NOTE(review): feature-pair element types reconstructed as
  // (name, type-spec) strings to match the sibling MEMOP/BW runners —
  // confirm against AOTModelRunner's declaration.
  AI4CFHModelRunner(LLVMContext &Ctx,
                    std::vector<std::pair<std::string, std::string>> Features,
                    StringRef DecisionName)
      : AOTModelRunner<AI4CFHCompiledModel>(
            Ctx,
            {{"input_1", "float[" + std::to_string(Features.size()) + "]"}},
            DecisionName) {}

  bool setCustomFeature(int FeatureIndex, float FeatureValue) override {
    // Standardize: (x - mean) / scale.
    // Fix: the original wrote Scales(FeatureIndex) — function-call syntax —
    // instead of vector indexing.
    float ScaledValue =
        (FeatureValue - Means[FeatureIndex]) / Scales[FeatureIndex];
    // Buffer at index 0 is the feature input of shape (Features.size()).
    float *Location = getTensor<float>(0) + FeatureIndex;
    *Location = ScaledValue;
    return true;
  }

  int getModelResultI(std::string OutputName) override {
    // Fix: the original used assignment (=) in the condition, declared `Ma`
    // while reading `Max`, and placed the return/assert outside the scopes
    // that declare Classes/MaxClass.
    if (OutputName == "FH") {
      int Classes[] = {0, 1, 2};
      void *ResultUntyped = CompiledModel->result_data(0);
      float *Result = reinterpret_cast<float *>(ResultUntyped);
      // Argmax over the class scores.
      float Max = Result[0];
      int MaxClass = 0;
      for (size_t I = 0; I < sizeof(Classes) / sizeof(int); ++I) {
        if (Result[I] > Max) {
          Max = Result[I];
          MaxClass = I;
        }
      }
      return Classes[MaxClass];
    }
    assert(false && "ModelRunner received invalid result name");
    return -1; // Unreachable in debug builds; keeps release builds well-formed.
  }
};
} // namespace llvm

#endif // LLVM_ANALYSIS_AI4CFHMODELRUNNER_H
#endif // LLVM_HAVE_TF_AOT_AI4CFHCOMPILEDMODEL
"llvm/Analysis/AI4CMEMOPCompiledModel.h" +#include "llvm/Analysis/AOTModelRunner.h" + +namespace llvm { +class AI4CMEMOPModelRunner : public AOTModelRunner { + std::vector Means = {5.902094114865505, 1.9211618257261411, + 11.605809128630705, 0.0, + 0.17012448132780084, 29.585062240663902, + 66.91701244813278, 314.3526970954357, + 0.5311203319502075, 4.153526970954357, + 5.242383737287086, 1.5767634854771784, + 71.06639004149378, 0.0, + 66.86721991701245, 23.286307053941908, + 9.244813278008298, 27.502074688796682, + 1.3941908713692945, 0.0, + 3.211618257261411, 0.024896265560165973, + 12.556016597510373, 1.4688796680497926, + 2.2994485024434755e+17, 1.5308973833539498e+17, + 1.484186989637826, 0.0, + 0.5311203319502075, 61.61799606544843, + 0.5269709543568465, 0.29045643153526973, + 0.04564315352697095, 0.0, + 1.8506224066390042, 1.4190871369294606, + 0.946058091286307, 1.004149377593361, + 4.394190871369295, 0.17842323651452283, + 12.348547717842324, 1.0788381742738589, + 1.9170124481327802, 1.0290456431535269, + 1.7178423236514522, 0.9336099585062241, + 10.946058091286307}; + std::vector Scales = {10.320994966903598, 1.0769213741084371, + 12.785422175115425, 1.0, + 0.37574212191441586, 89.78282941276281, + 103.94904473562026, 596.798404979162, + 0.4990305851742043, 6.4090572264124965, + 2.6100173708575176, 3.088480723013504, + 109.24510752282497, 1.0, + 103.95354354673327, 67.94906698858647, + 20.696839217519198, 49.92194109972631, + 2.1476145863960108, 1.0, + 6.5267427741688, 0.155808990502229, + 49.316216255834725, 4.396347313306036, + 2.0452401653902395e+18, 1.6734645469172273e+18, + 0.2550799453207434, 1.0, + 0.4990305851742043, 116.57277158707657, + 0.49927203769195905, 0.45397300901602894, + 0.20870998074621255, 1.0, + 1.533339596437842, 1.1642535392303865, + 1.4494221987807236, 1.5093015210423568, + 3.53497748777652, 0.5664470355711709, + 9.105814564701344, 0.26948602292331114, + 2.2886516432701987, 1.3917311833516468, + 2.0661400638678655, 
#ifdef LLVM_HAVE_TF_AOT_BWCOMPILEDMODEL

#ifndef LLVM_ANALYSIS_BWMODELRUNNER_H
#define LLVM_ANALYSIS_BWMODELRUNNER_H

#include "llvm/Analysis/BWCompiledModel.h"
#include "llvm/Analysis/AOTModelRunner.h"

namespace llvm {
/// AOT model runner for the ACPO branch-weight (BW) model.
/// Standardizes features with pre-computed means/scales and maps the model's
/// class output to a branch-weight percentage in 10% bins.
// Fix: base class was misspelled `AOTModelRUnner` in the original.
class BWModelRunner : public AOTModelRunner<BWCompiledModel> {
  // Per-feature standardization constants (training-set column means).
  std::vector<float> Means = {
      8.557268, 1.575299, 10.177474,
      0.000000, 0.227816, 24.805887,
      51.475043, 254.383319, 0.605589,
      4.116894, 4.828423, 1.471630,
      52.938567, 0.000000, 49.909983,
      21.196672, 6.254693, 21.723123,
      0.838737, 0.000000, 3.145904,
      0.074872, 13.087884, 1.245734,
      0.000000, 96216648323.836823, 1.473124,
      0.000000, 0.605589, 112.409294,
      3.081271, 5.340444, 18.912116,
      7.253200, 0.898251, 0.824232,
      0.175768, 0.000000, 0.000000,
      0.000000, 0.200939, 0.202645,
      0.261519, 1.000000, 0.000000,
      0.000000, 0.104522, 0.071672,
      0.003413, 0.555887, 0.622440,
      0.150171, 0.177688, 1.397398,
      4.822739, 18.912116, 0.848763,
      0.028157, 0.000000, 0.000000,
      0.000000, 1.040102};
  // Per-feature standardization constants (training-set column std-devs).
  // Fix: the original initializer ended with a stray double semicolon.
  std::vector<float> Scales = {
      27.745577, 1.052121, 8.415261,
      1.000000, 0.419423, 59.940392,
      73.032409, 466.404899, 0.488724,
      6.324453, 2.269509, 2.059701,
      68.311258, 1.000000, 70.720114,
      58.380312, 17.650317, 28.678274,
      1.739062, 1.000000, 6.435559,
      0.263185, 57.460392, 4.073632,
      1.000000, 947094553445.211548, 0.168403,
      1.000000, 0.488724, 434.287281,
      3.942960, 5.554390, 21.879605,
      8.374124, 3.171252, 0.380623,
      0.380623, 1.000000, 1.000000,
      1.000000, 0.400702, 0.401970,
      0.439462, 1.000000, 1.000000,
      1.000000, 0.305937, 0.257945,
      0.058321, 0.496867, 0.484777,
      0.357239, 0.382250, 1.110054,
      5.924685, 21.879605, 0.358280,
      0.165421, 1.000000, 1.000000,
      1.000000, 2.427114};

public:
  BWModelRunner(LLVMContext &Ctx,
                std::vector<std::pair<std::string, std::string>> Features,
                StringRef DecisionName)
      : AOTModelRunner<BWCompiledModel>(
            Ctx,
            {{"input_1", "float32[" + std::to_string(Features.size()) + "]"}},
            DecisionName) {}

  bool setCustomFeature(int FeatureIndex, float FeatureValue) override {
    // Standardize: (x - mean) / scale.
    float ScaledValue =
        (FeatureValue - Means[FeatureIndex]) / Scales[FeatureIndex];
    // Buffer at index 0 is the feature input of shape (Features.size()).
    float *Location = getTensor<float>(0) + FeatureIndex;
    *Location = ScaledValue;
    return true;
  }

  int getModelResultI(std::string OutputName) override {
    if (OutputName == "BW-BranchWeight") {
      // Each class represents one 10% bin, e.g. 0: 0%, 1: 10%, ..., 9: 90%.
      int Classes[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
      void *ResultUntyped = CompiledModel->result_data(0);
      float *Result = reinterpret_cast<float *>(ResultUntyped);
      // Argmax over the class scores.
      // Fix: the original loop condition tested undeclared lowercase `i`.
      float Max = Result[0];
      int MaxClass = 0;
      for (size_t I = 0; I < sizeof(Classes) / sizeof(int); ++I) {
        if (Result[I] > Max) {
          Max = Result[I];
          MaxClass = I;
        }
      }
      return Classes[MaxClass] * 10;
    }
    assert(false && "ModelRunner received invalid result name");
    return -1; // Unreachable in debug builds; keeps release builds well-formed.
  }
};
} // namespace llvm

#endif // LLVM_ANALYSIS_BWMODELRUNNER_H
#endif // LLVM_HAVE_TF_AOT_BWCOMPILEDMODEL
+// +//===------------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORM_UTILS_ACPOBRANCHWEIGHTMODEL_H +#define LLVM_TRANSFORM_UTILS_ACPOBRANCHWEIGHTMODEL_H + +#include "llvm/ADT/StringMap.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class ACPOBranchWeightModelPass + : public PassInfoMixin { +public: + PreservedAnalyses run(Module &m, ModuleAnalysisManager &MAM); + +private: + bool applyBranchWeightUsingACPOModel(Module &M, ModuleAnalysisManager &MAM); +}; +} + +#endif //LLVM_TRANSFORM_UTILS_ACPOBRANCHWEIGHTMODEL_H \ No newline at end of file diff --git a/llvm/lib/Analysis/ACPOBWModel.cpp b/llvm/lib/Analysis/ACPOBWModel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6a95b34df97878d44f3063c983e888544cf37cab --- /dev/null +++ b/llvm/lib/Analysis/ACPOBWModel.cpp @@ -0,0 +1,58 @@ +//===- ACPOBWModel.cpp - ACPO Branch weight inference ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. +// +//==-----------------------------------------------------------------------===// +// +// This file implements the interface between ACPO and ML-guided optimizations. +// It delegates decision making to inference with a pre-trained model. 
+// +//==-----------------------------------------------------------------------===// + +#include "llvm/Analysis/ACPOBWModel.h" + +using namespace llvm; + +#define DEBUG_TYPE "acpo-bw-model" + +ACPOBWModel::ACPOBWModel(LLVMContext *Context, OptimizationRemarkEmitter *ORE) + : ACPOModel(ORE, true) { + setContextPtr(Context); + // Python support is turned off + setMLIF(createPersistentCompiledMLIF()); +} + +ACPOBWModel::~ACPOBWModel() {} + +void ACPOBWModel::setMLCustomFeatures( + std::vector> FeatureValues) { + CustomFeatureValues = FeatureValues; +} + +std::unique_ptr ACPOBWModel::getAdviceML() { + std::shared_ptr MLIF = getMLIF(); + // Generate result. + std::unique_ptr Advice = std::make_unique(); + assert(MLIR != nullptr); + if (!MLIF->loadModel("") || + !MLIF->initializeFeatures("BW", CustomFeatureValues)) { + outs() << "Model not loaded or features not initialized. " + << "Did you export BISHENG_ACPO_DIR to $LLVM_DIR/acpo ?\n" + << "Falling back to default advisor. \n"; + return nullptr; + } + bool ModelRunOK = MLIF->runModel("BW"); + assert(ModelRunOK); + BranchWeight = MLIF->getModelResultI("BW-BranchWeight"); + assert(getContextPtr() != nullptr); + Advice->addField("BW-BranchWeight", ConstantInt::get(Type::getInt64Ty(*(getContextPtr())), + (int64_t)BranchWeight)); + + return Advice; +} + +std::unique_ptr ACPOBWModel::getAdviceNoML() { return nullptr; } \ No newline at end of file diff --git a/llvm/lib/Analysis/ACPOCollectFeatures.cpp b/llvm/lib/Analysis/ACPOCollectFeatures.cpp index daa924f2cb3b1a59c7424cbeb6fb0bc4bdcccb11..b5cdd509888393914829431505430ab359859a27 100644 --- a/llvm/lib/Analysis/ACPOCollectFeatures.cpp +++ b/llvm/lib/Analysis/ACPOCollectFeatures.cpp @@ -19,11 +19,13 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/DumpFeature.h" #include "llvm/Analysis/FunctionPropertiesAnalysis.h" #include 
"llvm/Analysis/InlineAdvisor.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" @@ -68,6 +70,10 @@ calculateInlineCostFeatures(ACPOCollectFeatures &ACF, const ACPOCollectFeatures::FeatureInfo &info); static void calculateACPOFIExtendedFeaturesFeatures( ACPOCollectFeatures &ACF, const ACPOCollectFeatures::FeatureInfo &info); +static void calculateBasicBlockFeatures( + ACPOCollectFeatures &ACF, const ACPOCollectFeatures::FeatureInfo &info); +static void calculateEdgeFeatures( + ACPOCollectFeatures &ACF, const ACPOCollectFeatures::FeatureInfo &info); static void calculateIsIndirectCall(ACPOCollectFeatures &ACF, const ACPOCollectFeatures::FeatureInfo &info); @@ -82,6 +88,10 @@ static void calculateIsTailCall(ACPOCollectFeatures &ACF, static void calculateOptCode(ACPOCollectFeatures &ACF, const ACPOCollectFeatures::FeatureInfo &info); +static void +calculateMemOptFeatures(ACPOCollectFeatures &ACF, + const ACPOCollectFeatures::FeatureInfo &info); + // Register FeatureIdx -> Feature name // FeatureIdx -> Scope, Scope -> FeatureIdx // FeatureIdx -> Group, Group -> FeatureIdx @@ -158,6 +168,39 @@ const std::unordered_map REGISTER_NAME(ACPOFIExtendedFeaturesInstrPerLoop, "InstrPerLoop"), REGISTER_NAME(ACPOFIExtendedFeaturesBlockWithMultipleSuccecorsPerLoop, "BlockWithMultipleSuccecorsPerLoop"), + REGISTER_NAME(NumSuccessors, "num_successors"), + REGISTER_NAME(NumInstrs, "num_instrs"), + REGISTER_NAME(NumCriticalEdges, "num_critical_deges"), + REGISTER_NAME(HighestNumInstrsInSucc, "highest_num_instrs_in_succ"), + REGISTER_NAME(SuccNumWithHighestNumInstrs, + "succ_num_with_highest_num_instrs"), + REGISTER_NAME(IsBranchInst, "is_branch_inst"), + REGISTER_NAME(IsSwitchInst, "is_switch_inst"), + REGISTER_NAME(IsIndirectBrInst, "is_indirect_br_inst"), + REGISTER_NAME(IsInvokeInst, 
"is_invoke_inst"), + REGISTER_NAME(IsCallBrInst, "iscall_br_inst"), + REGISTER_NAME(IsFirstOpPtr, "is_first_op_ptr"), + REGISTER_NAME(IsSecondOpNull, "is_second_op_null"), + REGISTER_NAME(IsSecondOpConstant, "is_second_op_constant"), + REGISTER_NAME(IsEqCmp, "is_eq_cmp"), + REGISTER_NAME(IsNeCmp, "is_ne_cmp"), + REGISTER_NAME(IsGtCmp, "is_gt_cmp"), + REGISTER_NAME(IsLtCmp, "is_lt_cmp"), + REGISTER_NAME(IsGeCmp, "is_ge_cmp"), + REGISTER_NAME(IsLeCmp, "is_le_cmp"), + REGISTER_NAME(IsIVCmp, "is_iv_cmp"), + REGISTER_NAME(IsBBInLoop, "is_bb_in_loop"), + REGISTER_NAME(IsFirstSuccInLoop, "is_first_succ_in_loop"), + REGISTER_NAME(IsSecondSuccInLoop, "is_second_succ_in_loop"), + REGISTER_NAME(DestNumSuccessors, "dest_num_successors"), + REGISTER_NAME(DestNumInstrs, "dest_num_instrs"), + REGISTER_NAME(DestNumCriticalEdges, "dest_num_critical_edges"), + REGISTER_NAME(DestIsBranchInst, "dest_is_branch_inst"), + REGISTER_NAME(DestIsSwitchInst, "dest_is_switch_inst"), + REGISTER_NAME(DestIsIndirectBrInst, "dest_is_indirect_br_inst"), + REGISTER_NAME(DestIsInvokeInst, "dest_is_invoke_inst"), + REGISTER_NAME(DestIsCallBrInst, "dest_is_call_br_inst"), + REGISTER_NAME(DestSuccNumber, "dest_succ_number"), REGISTER_NAME(CallerBlockFreq, "block_freq"), REGISTER_NAME(CallSiteHeight, "callsite_height"), REGISTER_NAME(ConstantParam, "nr_ctant_params"), @@ -170,6 +213,17 @@ const std::unordered_map REGISTER_NAME(IsInInnerLoop, "is_in_inner_loop"), REGISTER_NAME(IsMustTailCall, "is_must_tail"), REGISTER_NAME(IsTailCall, "is_tail"), + REGISTER_NAME(NumInst, "num_inst"), + REGISTER_NAME(NumPhis, "num_phis"), + REGISTER_NAME(NumCalls, "num_calls"), + REGISTER_NAME(NumLoads, "num_loads"), + REGISTER_NAME(NumStores, "num_stores"), + REGISTER_NAME(NumPreds, "num_preds"), + REGISTER_NAME(NumSuccs, "num_succs"), + REGISTER_NAME(EndsWithUnreachable, "ends_with_unreachable"), + REGISTER_NAME(EndsWithReturn, "ends_with_return"), + REGISTER_NAME(EndsWithCondBranch, "ends_with_cond_branch"), + 
REGISTER_NAME(EndsWithBranch, "ends_with_branch"), REGISTER_NAME(NumOfFeatures,"num_features"), }; #undef REGISTER_NAME @@ -241,6 +295,38 @@ const std::unordered_mapgetTerminator(); + + int num_insts = 0; + int num_phis = 0; + int num_calls = 0; + int num_loads = 0; + int num_stores = 0; + bool end_with_cond_branch = 0; + bool end_with_branch = 0; + + for (auto &inst : *BB) { + num_insts++; + if (isa(inst)) + num_phis++; + if (isa(inst)) + num_calls++; + if (isa(inst)) + num_loads++; + if (isa(inst)) + num_stores++; + } + + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::NumInst, Info, + std::to_string(num_insts)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::NumPhis, Info, + std::to_string(num_phis)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::NumCalls, Info, + std::to_string(num_calls)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::NumLoads, Info, + std::to_string(num_loads)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::NumStores, Info, + std::to_string(num_stores)); + ACF.setFeatureValueAndInfo( + ACPOCollectFeatures::FeatureIndex::NumPreds, Info, + std::to_string(std::distance(pred_begin(BB), pred_end(BB)))); + ACF.setFeatureValueAndInfo( + ACPOCollectFeatures::FeatureIndex::NumSuccs, Info, + std::to_string(std::distance(succ_begin(BB), succ_end(BB)))); + ACF.setFeatureValueAndInfo( + ACPOCollectFeatures::FeatureIndex::EndsWithUnreachable, Info, + std::to_string(isa(*T))); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::EndsWithReturn, + Info, std::to_string(isa(*T))); + if (auto *BR = dyn_cast(T)) { + if (BR->isConditional()) + end_with_cond_branch = true; + else if (BR->isUnconditional()) + end_with_branch = true; + } + + ACF.setFeatureValueAndInfo( + ACPOCollectFeatures::FeatureIndex::EndsWithCondBranch, Info, + std::to_string(end_with_cond_branch)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::EndsWithBranch, + Info, 
std::to_string(end_with_branch)); +} + void calculateInlineCostFeatures(ACPOCollectFeatures &ACF, const ACPOCollectFeatures::FeatureInfo &Info) { assert(Info.Idx == ACPOCollectFeatures::FeatureIndex::NumOfFeatures || @@ -1066,6 +1312,190 @@ void calculateACPOFIExtendedFeaturesFeatures( } } +void calculateBasicBlockFeatures( + ACPOCollectFeatures &ACF, const ACPOCollectFeatures::FeatureInfo &Info) { + assert(Info.Idx == ACPOCollectFeatures::FeatureIndex::NumOfFeatures || + ACPOCollectFeatures::getFeatureGroup(Info.Idx) == + ACPOCollectFeatures::GroupID::BasicBlockFeatures); + + // check if we already calculated the values. + if (ACF.containsFeature(ACPOCollectFeatures::GroupID::BasicBlockFeatures)) + return; + + auto *BB = Info.SI.BB; + auto *F = Info.SI.F; + auto *FAM = Info.Managers.FAM; + + assert(BB && F && FAM && "One of BB, F or FAM is nullptr"); + + unsigned NumInstrs = std::distance(BB->instructionsWithoutDebug().begin(), + BB->instructionsWithoutDebug().end()); + + unsigned NumCriticalEdges = 0; + for (auto &BBI : *F) { + const Instruction *TI = BBI.getTerminator(); + for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) { + if(isCriticalEdge(TI, I)) + NumCriticalEdges++; + } + } + + Instruction *TI = BB->getTerminator(); + unsigned HighestNumInstrsInSucc = 0; + unsigned SuccNumWithHighestNumInstrs = 0; + + for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) { + BasicBlock *Succ = TI->getSuccessor(I); + unsigned CurrNumInstrs = std::distance(Succ->instructionsWithoutDebug().begin(), + Succ->instructionsWithoutDebug().end()); + if (CurrNumInstrs > HighestNumInstrsInSucc) { + HighestNumInstrsInSucc = CurrNumInstrs; + SuccNumWithHighestNumInstrs = GetSuccessorNumber(BB, Succ); + } + } + + bool IsFirstOpPtr = false; + bool IsSecondOpNull = false; + bool IsSecondOpConstant = false; + bool IsEqCmp = false; + bool IsNeCmp = false; + bool IsGtCmp = false; + bool IsLtCmp = false; + bool IsGeCmp = false; + bool IsLeCmp = false; + bool IsIndVarCmp = 
false; + bool IsBBInLoop = false; + bool IsFirstSuccInLoop = false; + bool IsSecondSuccInLoop = false; + if (BranchInst *BI = dyn_cast(TI)) { + if(BI->isConditional()) { + Value *Cond = BI->getCondition(); + if (ICmpInst *CI = dyn_cast(Cond)) { + Value *LHS = CI->getOperand(0); + IsFirstOpPtr = LHS->getType()->isPointerTy(); + Value *RHS = CI->getOperand(1); + IsSecondOpNull = isa(RHS); + IsSecondOpConstant = isa(RHS); + CmpInst::Predicate Pred = CI->getPredicate(); + IsEqCmp = Pred == CmpInst::ICMP_EQ; + IsNeCmp = Pred == CmpInst::ICMP_NE; + IsGtCmp = ICmpInst::isGT(Pred); + IsLtCmp = ICmpInst::isLT(Pred); + IsGeCmp = ICmpInst::isGE(Pred); + IsLeCmp = ICmpInst::isLE(Pred); + } + + LoopInfo &LI = FAM->getResult(*F); + ScalarEvolution &SE = FAM->getResult(*F); + for (auto &L : LI) { + IsBBInLoop = (IsBBInLoop || L->contains(BB)); + IsFirstSuccInLoop = (IsFirstSuccInLoop || L->contains(TI->getSuccessor(0))); + IsSecondSuccInLoop = (IsSecondSuccInLoop || L->contains(TI->getSuccessor(1))); + if (PHINode *IndVar = L->getInductionVariable(SE)) + if (IndVar->getParent() == BB) + IsIndVarCmp = true; + } + } + } + + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsSecondSuccInLoop, + Info, std::to_string(IsSecondSuccInLoop)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsFirstSuccInLoop, + Info, std::to_string(IsFirstSuccInLoop)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsBBInLoop, + Info, std::to_string(IsBBInLoop)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsIVCmp, + Info, std::to_string(IsIndVarCmp)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsLeCmp, + Info, std::to_string(IsLeCmp)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsGeCmp, + Info, std::to_string(IsGeCmp)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsLtCmp, + Info, std::to_string(IsLtCmp)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsGtCmp, + 
Info, std::to_string(IsGtCmp)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsEqCmp, + Info, std::to_string(IsEqCmp)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsNeCmp, + Info, std::to_string(IsNeCmp)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsSecondOpConstant, + Info, std::to_string(IsSecondOpConstant)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsSecondOpNull, + Info, std::to_string(IsSecondOpNull)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsFirstOpPtr, + Info, std::to_string(IsFirstOpPtr)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsCallBrInst, + Info, std::to_string(isa(TI))); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsInvokeInst, + Info, std::to_string(isa(TI))); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsIndirectBrInst, + Info, std::to_string(isa(TI))); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsSwitchInst, + Info, std::to_string(isa(TI))); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::IsBranchInst, + Info, std::to_string(isa(TI))); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::SuccNumWithHighestNumInstrs, + Info, std::to_string(SuccNumWithHighestNumInstrs)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::HighestNumInstrsInSucc, + Info, std::to_string(HighestNumInstrsInSucc)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::NumCriticalEdges, + Info, std::to_string(NumCriticalEdges)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::NumInstrs, + Info, std::to_string(NumInstrs)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::NumSuccessors, + Info, std::to_string(TI->getNumSuccessors())); +} + +void calculateEdgeFeatures( + ACPOCollectFeatures &ACF, const ACPOCollectFeatures::FeatureInfo &Info) { + assert(Info.Idx == 
ACPOCollectFeatures::FeatureIndex::NumOfFeatures || + ACPOCollectFeatures::getFeatureGroup(Info.Idx) == + ACPOCollectFeatures::GroupID::EdgeFeatures); + + // Check if we already calculated the values. + if (ACF.containsFeature(ACPOCollectFeatures::GroupID::EdgeFeatures)) + return; + + auto *BB = Info.SI.BB; + auto *DestBB = Info.SI.DestBB; + auto *F = Info.SI.F; + auto *FAM = Info.Managers.FAM; + + assert(BB && DestBB && F && FAM && "One of BB, DestBB, F or FAM is nullptr"); + + unsigned DestNumInstrs = std::distance(DestBB->instructionsWithoutDebug().begin(), + DestBB->instructionsWithoutDebug().end()); + + unsigned DestNumCriticalEdges = 0; + for(auto &BBI : *F) { + const Instruction *TI = BBI.getTerminator(); + for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) { + if (isCriticalEdge(TI, I)) + DestNumCriticalEdges++; + } + } + + const Instruction *TI = DestBB->getTerminator(); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::DestSuccNumber, + Info, std::to_string(GetSuccessorNumber(BB, DestBB))); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::DestIsCallBrInst, + Info, std::to_string(isa(TI))); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::DestIsInvokeInst, + Info, std::to_string(isa(TI))); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::DestIsIndirectBrInst, + Info, std::to_string(isa(TI))); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::DestIsSwitchInst, + Info, std::to_string(isa(TI))); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::DestIsBranchInst, + Info, std::to_string(isa(TI))); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::DestNumCriticalEdges, + Info, std::to_string(DestNumCriticalEdges)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::DestNumInstrs, + Info, std::to_string(DestNumInstrs)); + ACF.setFeatureValueAndInfo(ACPOCollectFeatures::FeatureIndex::DestNumSuccessors, + Info, 
std::to_string(TI->getNumSuccessors())); +} + void calculateIsIndirectCall(ACPOCollectFeatures &ACF, const ACPOCollectFeatures::FeatureInfo &Info) { assert(Info.Idx == ACPOCollectFeatures::FeatureIndex::NumOfFeatures || diff --git a/llvm/lib/Analysis/ACPOMLInterface.cpp b/llvm/lib/Analysis/ACPOMLInterface.cpp index 7d84bd5112d6dbd17872c9db0a03955c99ac36c5..fe7439f6bb9e84f331c0d5786997fd5847ef0fb1 100644 --- a/llvm/lib/Analysis/ACPOMLInterface.cpp +++ b/llvm/lib/Analysis/ACPOMLInterface.cpp @@ -18,6 +18,9 @@ #include "llvm/Support/Process.h" #include "llvm/Support/Program.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Instrumentation/AI4CFHModelRunner.h" +#include "llvm/Transforms/Instrumentation/AI4CMEMOPModelRunner.h" +#include "llvm/Transforms/Instrumentation/BWModelRunner.h" #include #include @@ -250,8 +253,15 @@ bool ACPOMLPythonInterface::loadModel(std::string ModelSpecFile) { return false; } } + LLVM_DEBUG(dbgs() << "Tokens size: " << Tokens.size() << "\n"); + LLVM_DEBUG(dbgs() << "Tokens "); + for (auto Token : Tokens) + LLVM_DEBUG(dbgs() << Token << " "); + LLVM_DEBUG(dbgs() << "\n"); int OutputStart = 3 + NumFeatures; + LLVM_DEBUG(dbgs() << "Token[OutputStart]: " << Tokens[OutputStart] << "\n"); int NumOutputs = std::stoi(Tokens[OutputStart]); + LLVM_DEBUG(dbgs() << "NumOutputs: " << NumOutputs << "\n"); ModelPtr->setNumOutputs(NumOutputs); OutputStart++; std::string OutputName; @@ -1403,6 +1413,37 @@ createFI(std::vector> Inputs, } #endif +#ifdef LLVM_HAVE_TF_AOT_BWCOMPILEDMODEL +std::unique_ptr +createBW(std::vector> Inputs, + StringRef Decision) { + // Context does not ever seem to be used in the model runner, + // so for now just create an empty context object + LLVMContext Ctx; + return std::make_unique(Ctx, Inputs, Decision); +} +#endif + +#ifdef LLVM_HAVE_TF_AOT_AI4CFHCOMPILEDMODEL +std::unique_ptr +createAI4CFH(std::vector> Inputs, StringRef Decision) { + // Context does not ever seem to be used in the model runner, + // so for 
now just create an empty context object + LLVMContext Ctx; + return std::make_unique(Ctx, Inputs, Decision); +} +#endif + +#ifdef LLVM_HAVE_TF_AOT_AI4CMEMOPCOMPILEDMODEL +std::unique_ptr +createAI4CMEMOP(std::vector> Inputs, StringRef Decision) { + // Context does not ever seem to be used in the model runner, + // so for now just create an empty context object + LLVMContext Ctx; + return std::make_unique(Ctx, Inputs, Decision); +} +#endif + // Generate map using ifdefs for now, in the future we could have this // automatically populate using macros const std::unordered_map OutFile("feature-output", llvm::cl::desc("File for outputting features"), llvm::cl::init("features.csv")); +namespace llvm { + // Defined in ACPOBranchWeightModel.cpp + extern cl::opt EnableACPOBWModel; +} // namespace llvm + namespace { unsigned getMaxInstructionID() { #define LAST_OTHER_INST(NR) return NR; @@ -215,9 +220,12 @@ void ACPOFIExtendedFeatures::updateBBLoopCallsiteBFFeatures( if (!Callee->isDeclaration()) { // Check all the functions that was called and get the max block // frequency. 
- uint64_t EntryFreq = - FAM->getResult(*Callee) - .getEntryFreq(); + uint64_t EntryFreq; + if (EnableACPOBWModel) { + EntryFreq = 0; + } else { + EntryFreq = FAM->getResult(F).getEntryFreq(); + } MaxCallsiteBlockFreq = std::max(EntryFreq, MaxCallsiteBlockFreq); } diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index e2fe3322aef4fa1f24a95ddeed1874e485da1905..f5d0a34f85ae5b6a7beea75a6c886db92a15f8bc 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -265,12 +265,14 @@ #if defined(ENABLE_AUTOTUNER) #include "llvm/Analysis/AutotuningDump.h" #include "llvm/Transforms/Scalar/AutoTuningCompile.h" +#include "llvm/Transforms/Utils/ACPOBranchWeightModel.h" #endif #if defined(ENABLE_ACPO) #include "llvm/Analysis/CallHeight.h" #include "llvm/Analysis/DumpCallsite.h" #include "llvm/Analysis/DumpFeature.h" +#include "llvm/Transforms/Instrumentation/AI4CAnalysis.h" #endif using namespace llvm; diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index de89f5393ba26e64701d03d51d70c1ae109c77d2..df54015763b69e0bba6fc0c942022db72d465b56 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -68,6 +68,7 @@ #include "llvm/Transforms/IPO/SyntheticCountsPropagation.h" #include "llvm/Transforms/IPO/WholeProgramDevirt.h" #include "llvm/Transforms/InstCombine/InstCombine.h" +#include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/Instrumentation/CGProfile.h" #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" #include "llvm/Transforms/Instrumentation/InstrOrderFile.h" @@ -142,6 +143,8 @@ #include "llvm/Analysis/CallHeight.h" #include "llvm/Analysis/DumpCallsite.h" #include "llvm/Analysis/DumpFeature.h" +#include "llvm/Transforms/Utils/ACPOBranchWeightModel.h" +#include "llvm/Transforms/Instrumentation/AI4CAnalysis.h" #endif using namespace llvm; @@ -307,6 +310,7 @@ extern cl::opt 
AutoTuningCompileMode; namespace llvm { extern cl::opt MaxDevirtIterations; extern cl::opt EnableKnowledgeRetention; +extern cl::opt EnableACPOBWModel; } // namespace llvm void PassBuilder::invokePeepholeEPCallbacks(FunctionPassManager &FPM, @@ -585,6 +589,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, invokePeepholeEPCallbacks(FPM, Level); + if (PTO.AI4CAnalysis && !Level.isOptimizingForSize()) FPM.addPass(PGOMemOPSizeOpt()); + // For PGO use pipeline, try to optimize memory intrinsics such as memcpy // using the size value profile. Don't perform this when optimizing for size. if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse && @@ -753,6 +759,47 @@ void PassBuilder::addRequiredLTOPreLinkPasses(ModulePassManager &MPM) { MPM.addPass(NameAnonGlobalPass()); } +void PassBuilder::addAI4CRelatedPassesForO0(ModulePassManager &MPM) { + MPM.addPass(AI4CAnalysis()); + MPM.addPass(RequireAnalysisPass()); +} + +void PassBuilder::addAI4CRelatedPasses(ModulePassManager &MPM, + OptimizationLevel Level, + ThinOrFullLTOPhase LTOPhase) { + assert(Level != OptimizationLevel::O0 && "Not expection O0 here!"); + if (!DisablePreInliner) { + InlineParams IP; + + IP.DefaultThreshold = PreInlineThreshold; + IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325; + ModuleInlinerWrapperPass MIWP( + IP, /* Mandatory First */ true, InlineContext{LTOPhase, InlinePass::EarlyInliner} + ); + CGSCCPassManager &CGPipeline = MIWP.getPM(); + + FunctionPassManager FPM; + //FPM.addPass(ConnectNoAliasDeclPass()); // Do this before SROA + FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies. + + // Propagate and Convert as early as possible. + // But do it after SROA and EaslyCSE ! + //FPM.addPass(PropagateAndConvertNoAliasPass()); + + // Merge and remove basic blocks. + FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); + FPM.addPass(InstCombinePass()); // combine silly sequences. 
+ invokePeepholeEPCallbacks(FPM, Level); + + CGPipeline.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM), PTO.EagerlyInvalidateAnalyses)); + + MPM.addPass(std::move(MIWP)); + + MPM.addPass(AI4CAnalysis()); + MPM.addPass(RequireAnalysisPass()); + } +} + void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, OptimizationLevel Level, bool RunProfileGen, bool IsCS, std::string ProfileFile, @@ -853,6 +900,36 @@ void PassBuilder::addPGOInstrPassesForO0( MPM.addPass(InstrProfiling(Options, IsCS)); } +void PassBuilder::addACPOBWPasses(ModulePassManager &MPM, OptimizationLevel Level, ThinOrFullLTOPhase LTOPhase, bool skipPreInline) { + assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!"); + if (!skipPreInline && !DisablePreInliner) { + InlineParams IP; + IP.DefaultThreshold = PreInlineThreshold; + IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325; + ModuleInlinerWrapperPass MIWP( + IP, true, InlineContext{LTOPhase, InlinePass::EarlyInliner} + ); + CGSCCPassManager &CGPipeline = MIWP.getPM(); + + FunctionPassManager FPM; + FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); + FPM.addPass(EarlyCSEPass()); + + FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); + FPM.addPass(InstCombinePass()); + invokePeepholeEPCallbacks(FPM, Level); + + CGPipeline.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM), PTO.EagerlyInvalidateAnalyses)); + + MPM.addPass(std::move(MIWP)); + + MPM.addPass(GlobalDCEPass()); + } + + MPM.addPass(ACPOBranchWeightModelPass()); + MPM.addPass(RequireAnalysisPass()); +} + static InlineParams getInlineParamsFromOptLevel(OptimizationLevel Level) { return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel()); } @@ -875,6 +952,9 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level, PGOOpt->Action == PGOOptions::SampleUse) IP.HotCallSiteThreshold = 0; + if (PTO.AI4CAnalysis) + IP.EnableDeferral = EnablePGOInlineDeferral; + if (PGOOpt) IP.EnableDeferral = 
EnablePGOInlineDeferral; @@ -975,6 +1055,9 @@ PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level, PGOOpt->Action == PGOOptions::SampleUse) IP.HotCallSiteThreshold = 0; + if (PTO.AI4CAnalysis) + IP.EnableDeferral = EnablePGOInlineDeferral; + if (PGOOpt) IP.EnableDeferral = EnablePGOInlineDeferral; @@ -1127,6 +1210,10 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM), PTO.EagerlyInvalidateAnalyses)); + if (PTO.AI4CAnalysis && Phase != ThinOrFullLTOPhase::ThinLTOPostLink) { + addAI4CRelatedPasses(MPM, Level, Phase); + MPM.addPass(PGOIndirectCallPromotion(false, false)); + } // Add all the requested passes for instrumentation PGO, if requested. if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink && (PGOOpt->Action == PGOOptions::IRInstr || @@ -1150,6 +1237,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, MPM.addPass(SyntheticCountsPropagation()); #if defined(ENABLE_AUTOTUNER) + if (!PGOOpt && EnableACPOBWModel) + addACPOBWPasses(MPM, Level, Phase, false); if (AutoTuningCompileMode) MPM.addPass(AutoTuningCompileModulePass(autotuning::CompileOptionInline)); #endif @@ -1365,6 +1454,9 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, PGOOpt->ProfileRemappingFile, LTOPhase, PGOOpt->FS); } + if (!LTOPreLink && !PGOOpt && EnableACPOBWModel) + addACPOBWPasses(MPM, Level, LTOPhase, true); + // Re-compute GlobalsAA here prior to function passes. This is particularly // useful as the above will have inlined, DCE'ed, and function-attr // propagated everything. 
We should at this point have a reasonably minimal @@ -1858,6 +1950,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, ThinOrFullLTOPhase::FullLTOPostLink, PGOOpt->FS); } + if (!PGOOpt && EnableACPOBWModel) + addACPOBWPasses(MPM, Level, ThinOrFullLTOPhase::FullLTOPostLink, true); + // Break up allocas FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); @@ -1995,6 +2090,9 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, if (PGOOpt && PGOOpt->PseudoProbeForProfiling) MPM.addPass(SampleProfileProbePass(TM)); + if (PTO.AI4CAnalysis) + addAI4CRelatedPassesForO0(MPM); + if (PGOOpt && (PGOOpt->Action == PGOOptions::IRInstr || PGOOpt->Action == PGOOptions::IRUse)) addPGOInstrPassesForO0( @@ -2081,6 +2179,12 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, return MPM; } +ModulePassManager PassBuilder::addAutoTunerLTOPreLinkPasses() { + ModulePassManager MPM; + addRequiredLTOPreLinkPasses(MPM); + return MPM; +} + AAManager PassBuilder::buildDefaultAAPipeline() { AAManager AA; diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 6ef0d6791ff29630400616407c741a574dc57b5c..6efd7c2bb3a805bb6d4b0643b8799c285b6e37a2 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -48,6 +48,7 @@ MODULE_ALIAS_ANALYSIS("globals-aa", GlobalsAA()) #ifndef MODULE_PASS #define MODULE_PASS(NAME, CREATE_PASS) #endif +MODULE_PASS("ai4c-analysis", AI4CAnalysis()) MODULE_PASS("always-inline", AlwaysInlinerPass()) MODULE_PASS("attributor", AttributorPass()) MODULE_PASS("annotation2metadata", Annotation2MetadataPass()) @@ -135,6 +136,7 @@ MODULE_PASS("sanmd-module", SanitizerBinaryMetadataPass()) MODULE_PASS("memprof-module", ModuleMemProfilerPass()) MODULE_PASS("poison-checking", PoisonCheckingPass()) MODULE_PASS("pseudo-probe-update", PseudoProbeUpdatePass()) +MODULE_PASS("acpo-branch-weight-model", ACPOBranchWeightModelPass()); #if defined(ENABLE_AUTOTUNER) 
MODULE_PASS("autotuning-compile-module", AutoTuningCompileModulePass())
#endif
diff --git a/llvm/lib/Transforms/Instrumentation/ACPOAI4CFHModel.cpp b/llvm/lib/Transforms/Instrumentation/ACPOAI4CFHModel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..499410a9024b4a762b452fb2e26c30836e86b015
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/ACPOAI4CFHModel.cpp
@@ -0,0 +1,37 @@
+#include "llvm/Transforms/Instrumentation/ACPOAI4CFHModel.h"
+
+using namespace llvm;
+
+// FIX: removed the stray ';' — an object-like macro must not end in a
+// semicolon, or it is pasted into every expansion of DEBUG_TYPE.
+#define DEBUG_TYPE "acpo-ai4c-fh"
+
+/// Model wrapper around the AOT-compiled AI4C function-hotness (FH) model.
+ACPOAI4CFHModel::ACPOAI4CFHModel(LLVMContext *Context,
+                                 OptimizationRemarkEmitter *ORE)
+    : ACPOModel(ORE, true) {
+  setContextPtr(Context);
+  setMLIF(createPersistentCompiledMLIF());
+}
+
+ACPOAI4CFHModel::~ACPOAI4CFHModel() {}
+
+// NOTE(review): the template arguments in this file were destroyed in the
+// paste; they are reconstructed as string name/value pairs to match the
+// feature records built elsewhere in this patch — confirm against the header.
+void ACPOAI4CFHModel::setMLCustomFeatures(
+    std::vector<std::pair<std::string, std::string>> FeatureValues) {
+  CustomFeatureValues = FeatureValues;
+}
+
+/// Query the AI4CFH model. Returns null when the model cannot be loaded,
+/// initialized or run, so the caller can fall back to the default advisor.
+std::unique_ptr<ACPOAdvice> ACPOAI4CFHModel::getAdviceML() {
+  std::shared_ptr<ACPOMLInterface> MLIF = getMLIF();
+  // Generate result.
+  std::unique_ptr<ACPOAdvice> Advice = std::make_unique<ACPOAdvice>();
+
+  if (!MLIF->loadModel("model-ai4cfh.acpo") ||
+      !MLIF->initializeFeatures("AI4CFH", CustomFeatureValues)) {
+    // FIX: added the missing separator so the two sentences are not glued
+    // together in the emitted diagnostic.
+    outs() << "Model not loaded or features not initialized. "
+           << "Did you export BISHENG_ACPO_DIR to $LLVM_DIR/acpo ?\n"
+           << "Falling back to default advisor.\n";
+    return nullptr;
+  }
+  // FIX: the run status was computed but never consulted; treat a failed
+  // inference like a missing model and fall back.
+  bool ModelRunOK = MLIF->runModel("AI4CFH");
+  if (!ModelRunOK)
+    return nullptr;
+  Hotness = MLIF->getModelResultI("FH");
+  Advice->addField("FH", ConstantInt::get(Type::getInt64Ty(*(getContextPtr())),
+                                          (int64_t)Hotness));
+
+  return Advice;
+}
+
+// No non-ML fallback is provided here; a null advice signals "use default".
+std::unique_ptr<ACPOAdvice> ACPOAI4CFHModel::getAdviceNoML() { return nullptr; }
diff --git a/llvm/lib/Transforms/Instrumentation/ACPOAI4CMEMOPModel.cpp b/llvm/lib/Transforms/Instrumentation/ACPOAI4CMEMOPModel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c2ca8d63dbcc3629bafa05da65e3acaabfd2614d
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/ACPOAI4CMEMOPModel.cpp
@@ -0,0 +1,39 @@
+#include "llvm/Transforms/Instrumentation/ACPOAI4CMEMOPModel.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "acpo-ai4c-memop"
+
+/// Model wrapper around the AOT-compiled AI4C memop-size (MEMOP) model.
+ACPOAI4CMEMOPModel::ACPOAI4CMEMOPModel(LLVMContext *Context,
+                                       OptimizationRemarkEmitter *ORE)
+    : ACPOModel(ORE, true) {
+  setContextPtr(Context);
+  setMLIF(createPersistentCompiledMLIF());
+}
+
+ACPOAI4CMEMOPModel::~ACPOAI4CMEMOPModel() {}
+
+void ACPOAI4CMEMOPModel::setMLCustomFeatures(
+    std::vector<std::pair<std::string, std::string>> FeatureValues) {
+  CustomFeatureValues = FeatureValues;
+}
+
+/// Query the AI4CMEMOP model. Returns null on any load/init/run failure so
+/// the caller can fall back to the profile-based path.
+std::unique_ptr<ACPOAdvice> ACPOAI4CMEMOPModel::getAdviceML() {
+  std::shared_ptr<ACPOMLInterface> MLIF = getMLIF();
+  // Generate result.
+  std::unique_ptr<ACPOAdvice> Advice = std::make_unique<ACPOAdvice>();
+
+  if (!MLIF->loadModel("model-ai4cmemop.acpo") ||
+      !MLIF->initializeFeatures("AI4CMEMOP", CustomFeatureValues)) {
+    // FIX: same missing-separator fix as in ACPOAI4CFHModel.cpp.
+    outs() << "Model not loaded or features not initialized. "
+           << "Did you export BISHENG_ACPO_DIR to $LLVM_DIR/acpo ?\n"
+           << "Falling back to default advisor.\n";
+    return nullptr;
+  }
+  // FIX: check the previously ignored run status.
+  bool ModelRunOK = MLIF->runModel("AI4CMEMOP");
+  if (!ModelRunOK)
+    return nullptr;
+  ShouldOPT = MLIF->getModelResultI("OPT");
+  Advice->addField("OPT", ConstantInt::get(Type::getInt64Ty(*(getContextPtr())),
+                                           (int64_t)ShouldOPT));
+
+  return Advice;
+}
+
+std::unique_ptr<ACPOAdvice> ACPOAI4CMEMOPModel::getAdviceNoML() {
+  return nullptr;
+}
diff --git a/llvm/lib/Transforms/Instrumentation/AI4CAnalysis.cpp b/llvm/lib/Transforms/Instrumentation/AI4CAnalysis.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..19c7849c114c85d9a9c5b19e5081fd717545488e
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/AI4CAnalysis.cpp
@@ -0,0 +1,154 @@
+//===- AI4CAnalysis.cpp - AI4C Class for AOT ML model ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an AOT ML model used to decide function hotness.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/AI4CAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ModelDataCollector.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfileSummary.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/DOTGraphTraits.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation/ACPOAI4CFHModel.h"
+#include "llvm/Transforms/Utils/CallGraphUpdater.h"
+#include "ValueProfileCollector.h"
+// NOTE(review): the target of this include was lost in the paste; <vector>
+// is the most likely candidate given the std::vector uses below — confirm.
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ai4c-analysis"
+
+static cl::opt<std::string> AI4CDumpFile(
+    "ai4c-dump-file", cl::init("-"), cl::Hidden,
+    cl::desc("Name of a file to store AI4C feature/result data in."));
+
+// FIX: "Levarage" -> "Leverage" in the option description.
+cl::opt<bool>
+    EnableAI4CFH("enable-ai4c-fh", cl::init(false), cl::Hidden,
+                 cl::desc("Leverage AOT ML model to decide Function hotness."));
+
+namespace {
+/// Class for collecting AI4C FH model data.
+class ModelDataAI4CFHCollector : public ModelDataCollector {
+public:
+  ModelDataAI4CFHCollector(formatted_raw_ostream &OS,
+                           std::string OutputFileName)
+      : ModelDataCollector(OS, OutputFileName) {}
+
+  /// Re-register and collect the function-scope features for \p GlobalF.
+  void collectFeatures(Function *GlobalF, FunctionAnalysisManager *FAM) {
+    resetRegisteredFeatures();
+    Module *GlobalM = GlobalF->getParent();
+    ACPOCollectFeatures::FeatureInfo GlobalFeatureInfo{
+        ACPOCollectFeatures::FeatureIndex::NumOfFeatures,
+        {FAM, nullptr},
+        {GlobalF, nullptr, nullptr, GlobalM, nullptr}};
+
+    registerFeature({ACPOCollectFeatures::Scope::Function}, GlobalFeatureInfo);
+    ModelDataCollector::collectFeatures();
+  }
+};
+
+// NOTE(review): the element type was lost in the paste and is reconstructed;
+// this set also appears to be unused in this file — consider removing it.
+llvm::SmallDenseSet<std::pair<Function *, Function *>, 4>
+    InlinedInternalEdges =
+        llvm::SmallDenseSet<std::pair<Function *, Function *>, 4>();
+} // end anonymous namespace
+
+/// Run the FH model on \p F and return the predicted hotness class.
+int64_t getACPOAdvice(Function *F, FunctionAnalysisManager *FAM,
+                      ModelDataAI4CFHCollector *MDC) {
+  auto &ORE = FAM->getResult<OptimizationRemarkEmitterAnalysis>(*F);
+  std::unique_ptr<ACPOAI4CFHModel> AI4CFH =
+      std::make_unique<ACPOAI4CFHModel>(&(F->getContext()), &ORE);
+  std::vector<std::pair<std::string, std::string>> Features =
+      MDC->getFeatures();
+  AI4CFH->setMLCustomFeatures(Features);
+  std::unique_ptr<ACPOAdvice> Advice = AI4CFH->getAdvice();
+  // FIX: getAdvice() returns null when the model is unavailable; report
+  // "no opinion" instead of dereferencing null.
+  if (!Advice)
+    return 0;
+  Constant *Val = Advice->getField("FH");
+  assert(Val != nullptr);
+  assert(isa<ConstantInt>(Val));
+  // FIX: the isa<> assert above already guarantees the type — use cast<>.
+  ConstantInt *FH = cast<ConstantInt>(Val);
+  return FH->getSExtValue();
+}
+
+AI4CAnalysis::AI4CAnalysis() {}
+
+/// Functions we never annotate: declarations and no/skip-profile functions.
+static bool skipAnalysis(const Function &F) {
+  if (F.isDeclaration())
+    return true;
+  if (F.hasFnAttribute(llvm::Attribute::NoProfile))
+    return true;
+  if (F.hasFnAttribute(llvm::Attribute::SkipProfile))
+    return true;
+
+  return false;
+}
+
+PreservedAnalyses AI4CAnalysis::run(Module &M, ModuleAnalysisManager &MAM) {
+  if (EnableAI4CFH) {
+    // FIX: dbgs is a function — stream into dbgs(), not the function itself
+    // (the original "dbgs <<" does not compile with assertions enabled).
+    LLVM_DEBUG(dbgs() << "Annotate function hotness by ACPO: ");
+    // Initialize Feature Data Collector
+    std::error_code EC;
+    raw_fd_ostream RawOS(AI4CDumpFile.getValue(), EC, sys::fs::CD_OpenAlways,
+                         sys::fs::FA_Write, sys::fs::OF_Append);
+    formatted_raw_ostream OS(RawOS);
+    ModelDataAI4CFHCollector MDC(OS, AI4CDumpFile);
+    FunctionAnalysisManager &FAM =
+        MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+    std::vector<Function *> HotFunctions;
+    std::vector<Function *> ColdFunctions;
+
+    for (auto &F : M) {
+      if (skipAnalysis(F))
+        continue;
+      MDC.collectFeatures(&F, &FAM);
+      FuncFreqAttr FreqAttr = (FuncFreqAttr)getACPOAdvice(&F, &FAM, &MDC);
+      if (FreqAttr == FFA_Cold)
+        ColdFunctions.push_back(&F);
+      else if (FreqAttr == FFA_Hot)
+        HotFunctions.push_back(&F);
+    }
+
+    for (auto &F : HotFunctions) {
+      // NOTE(review): AlwaysInline is a very strong attribute for "hot";
+      // the log message suggests InlineHint may be intended — confirm.
+      F->addFnAttr(Attribute::AlwaysInline);
+      LLVM_DEBUG(dbgs() << "Set inline attribute to function " << F->getName()
+                        << "\n");
+    }
+    for (auto &F : ColdFunctions) {
+      // Only set when there is no Attribute::Hot set by the user. For hot
+      // attribute, user's annotation has the precedence over the profile.
+      if (F->hasFnAttribute(Attribute::Hot)) {
+        auto &Ctx = M.getContext();
+        std::string Msg = std::string("Function ") + F->getName().str() +
+                          std::string(" is annotated as a hot function but"
+                                      " the profile is cold");
+        Ctx.diagnose(
+            DiagnosticInfoPGOProfile(M.getName().data(), Msg, DS_Warning));
+        continue;
+      }
+      F->addFnAttr(Attribute::Cold);
+      LLVM_DEBUG(dbgs() << "Set cold attribute to function: " << F->getName()
+                        << "\n");
+    }
+
+    return PreservedAnalyses::none();
+  } else {
+    return PreservedAnalyses::all();
+  }
+}
diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
index 955353944b141ac82f908abb99d0ba1ff9835111..ebea2258e0ff882cfe2893ba573be8451ed2f861 100644
--- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -1,4 +1,7 @@
 add_llvm_component_library(LLVMInstrumentation
+  ACPOAI4CFHModel.cpp
+  ACPOAI4CMEMOPModel.cpp
+  AI4CAnalysis.cpp
   AddressSanitizer.cpp
   BoundsChecking.cpp
   CGProfile.cpp
diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
index 2906fe19098401d22fb13ffcecba02ca8658e1fd..919dcecc4ffb5aa83f5035f85c37b77f83b02d8f 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -20,6 +20,7 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/ModelDataCollector.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/BasicBlock.h"
@@ -39,9 +40,13 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
+#include "llvm/Transforms/Instrumentation/ACPOAI4CMEMOPModel.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include <cassert>
 #include <cstdint>
@@ -93,6 +98,135 @@ static cl::opt
     MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128),
                     cl::desc("Optimize the memop size <= this value"));
+cl::opt<bool>
+    EnableAI4CMEMOP("enable-ai4c-memop", cl::init(false), cl::Hidden,
+                    cl::desc("Leverage AOT ML model to optimize memop."));
+
+static cl::opt<std::string>
+    MemOPDumpFile("memop-dump-file", cl::init("-"), cl::Hidden,
+                  cl::desc("Name of a file to store memop data in."));
+
+namespace {
+/// Collects AI4C features for a single memop call site.
+class ModelDataAI4CMemOPCollector : public ModelDataCollector {
+public:
+  ModelDataAI4CMemOPCollector(formatted_raw_ostream &OS,
+                              std::string OutputFileName,
+                              FunctionAnalysisManager *FAM)
+      : ModelDataCollector(OS, OutputFileName), FAM(FAM) {}
+
+  void collectFeatures(Function *F, Instruction *I, const char *op_name) {
+    // Now MemOPSizeOpt could only optimize memcpy and bcmp.
+    // FIX: comparing a const char* against a string literal compares the
+    // pointers, not the contents; use StringRef equality.
+    StringRef OpName(op_name);
+    int8_t type = OpName == "memcpy" ? 1 : OpName == "bcmp" ? 2 : 0;
+    resetRegisteredFeatures();
+    Module *GlobalM = F->getParent();
+    ACPOCollectFeatures::FeatureInfo GlobalFeatureInfo{
+        ACPOCollectFeatures::FeatureIndex::NumOfFeatures,
+        {FAM, nullptr},
+        {F, nullptr, I->getParent(), GlobalM, nullptr}};
+
+    registerFeature({ACPOCollectFeatures::Scope::Function}, GlobalFeatureInfo);
+    registerFeature({ACPOCollectFeatures::Scope::BasicBlock},
+                    GlobalFeatureInfo);
+    ModelDataCollector::collectFeatures();
+
+    // Insert Memop type
+    Features.push_back(std::make_pair("memop_type", std::to_string(type)));
+
+    int8_t dst_align = 0, dst_from = 0, src_align = 0, src_from = 0;
+    // Insert Memop Align info of Dst and Src
+    if (type == 1) {
+      // For ptr param of memcpy.
+      // FIX: use cast<> instead of an unchecked dyn_cast<> — a type
+      // mismatch would yield null and crash on the member access.
+      auto *MC = cast<MemCpyInst>(I);
+      dst_align = MC->getDestAlign().valueOrOne().value();
+      dst_from = getPtrType(MC->getArgOperand(0));
+      src_align = MC->getSourceAlign().valueOrOne().value();
+      src_from = getPtrType(MC->getArgOperand(1));
+    } else if (type == 2) {
+      // For ptr param of bcmp
+      auto *CB = cast<CallBase>(I);
+      auto *DstPtr = CB->getArgOperand(0);
+      auto *SrcPtr = CB->getArgOperand(1);
+      dst_from = getPtrType(DstPtr);
+      src_from = getPtrType(SrcPtr);
+    }
+    Features.push_back(std::make_pair("dst_align", std::to_string(dst_align)));
+    Features.push_back(std::make_pair("dst_from", std::to_string(dst_from)));
+    Features.push_back(std::make_pair("src_align", std::to_string(src_align)));
+    Features.push_back(std::make_pair("src_from", std::to_string(src_from)));
+  }
+
+  // Set how the Dst/Src ptr is from as a feature
+  // 1: Alloca
+  // 2: From call malloc intrinsic
+  // 3: Other function's return
+  // 4: From load
+  // 5: Global variable
+  // 0: Other ways
+  uint16_t getPtrType(Value *Ptr) {
+    if (isa<AllocaInst>(Ptr))
+      return 1;
+    // FIX: perform the cast once instead of three separate dyn_cast calls.
+    if (auto *CB = dyn_cast<CallBase>(Ptr)) {
+      if (CB->getCalledFunction() &&
+          CB->getCalledFunction()->getName().startswith("malloc"))
+        return 2;
+      return 3;
+    }
+    if (isa<LoadInst>(Ptr))
+      return 4;
+    if (isa<GlobalVariable>(Ptr))
+      return 5;
+    return 0;
+  }
+
+public:
+  FunctionAnalysisManager *FAM;
+};
+} // namespace
+
+/// Ask the AI4CMEMOP model which candidate sizes are worth versioning for
+/// the call site whose features are currently held by \p MDC.
+SmallVector<int64_t, 16> getACPOAdvice(Function *F,
+                                       ModelDataAI4CMemOPCollector *MDC) {
+  SmallVector<int64_t, 16> SizeIds;
+  int64_t OPT = 0;
+  auto &ORE = MDC->FAM->getResult<OptimizationRemarkEmitterAnalysis>(*F);
+  std::unique_ptr<ACPOAI4CMEMOPModel> AI4CMEMOP =
+      std::make_unique<ACPOAI4CMEMOPModel>(&(F->getContext()), &ORE);
+  std::vector<std::pair<std::string, std::string>> Features =
+      MDC->getFeatures();
+
+  std::vector<int> PossibleSizes = {0, 1,  2,  3,  4,  5,  6,   7,   8,
+                                    9, 16, 17, 32, 33, 65, 129, 257, 513};
+
+  for (int psize : PossibleSizes) {
+    if (psize == 0)
+      continue;
+    Features.push_back(std::make_pair("opt_size", std::to_string(psize)));
+    AI4CMEMOP->setMLCustomFeatures(Features);
+    std::unique_ptr<ACPOAdvice> Advice = AI4CMEMOP->getAdvice();
+    // FIX: a null advice means the model is unavailable — return no sizes so
+    // the caller falls back, instead of dereferencing null.
+    if (!Advice)
+      return {};
+    Constant *Val = Advice->getField("OPT");
+    assert(Val != nullptr);
+    assert(isa<ConstantInt>(Val));
+    ConstantInt *OPTPtr = cast<ConstantInt>(Val);
+    OPT = OPTPtr->getSExtValue();
+    if (OPT)
+      SizeIds.push_back(psize);
+    Features.pop_back();
+  }
+
+  return SizeIds;
+}
+
 namespace {
 static const char *getMIName(const MemIntrinsic *MI) {
@@ -176,8 +310,9 @@ class MemOPSizeOpt : public InstVisitor {
 public:
   MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
                OptimizationRemarkEmitter &ORE, DominatorTree *DT,
-               TargetLibraryInfo &TLI)
-      : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {
+               TargetLibraryInfo &TLI, ModelDataAI4CMemOPCollector &MDC)
+      : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), MDC(MDC),
+        Changed(false) {
     ValueDataArray =
         std::make_unique<InstrProfValueData[]>(INSTR_PROF_NUM_BUCKETS);
   }
@@ -225,6 +360,8 @@ private:
   // The space to read the profile annotation.
std::unique_ptr ValueDataArray; bool perform(MemOp MO); + std::vector> Records; + ModelDataAI4CMemOPCollector &MDC; }; static bool isProfitable(uint64_t Count, uint64_t TotalCount) { @@ -254,103 +391,119 @@ bool MemOPSizeOpt::perform(MemOp MO) { uint32_t NumVals, MaxNumVals = INSTR_PROF_NUM_BUCKETS; uint64_t TotalCount; - if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals, - ValueDataArray.get(), NumVals, TotalCount)) - return false; - - uint64_t ActualCount = TotalCount; - uint64_t SavedTotalCount = TotalCount; - if (MemOPScaleCount) { - auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent()); - if (!BBEdgeCount) - return false; - ActualCount = *BBEdgeCount; - } - - ArrayRef VDs(ValueDataArray.get(), NumVals); - LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count " - << ActualCount << "\n"); - LLVM_DEBUG( - for (auto &VD - : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; }); - - if (ActualCount < MemOPCountThreshold) - return false; - // Skip if the total value profiled count is 0, in which case we can't - // scale up the counts properly (and there is no profitable transformation). - if (TotalCount == 0) - return false; - - TotalCount = ActualCount; - if (MemOPScaleCount) - LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount - << " denominator = " << SavedTotalCount << "\n"); - - // Keeping track of the count of the default case: - uint64_t RemainCount = TotalCount; - uint64_t SavedRemainCount = SavedTotalCount; + uint64_t ActualCount; + uint64_t SavedTotalCount; + uint64_t RemainCount; + uint64_t SavedRemainCount; SmallVector SizeIds; SmallVector CaseCounts; SmallDenseSet SeenSizeId; uint64_t MaxCount = 0; unsigned Version = 0; - // Default case is in the front -- save the slot here. 
- CaseCounts.push_back(0); SmallVector RemainingVDs; - for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) { - auto &VD = *I; - int64_t V = VD.Value; - uint64_t C = VD.Count; - if (MemOPScaleCount) - C = getScaledCount(C, ActualCount, SavedTotalCount); - - if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) { - RemainingVDs.push_back(VD); - continue; + uint64_t SumForOpt; + const char *op_name = MO.getName(TLI); + if (EnableAI4CMEMOP) { + MDC.collectFeatures(&Func, MO.I, op_name); + SizeIds = getACPOAdvice(MO.I->getFunction(), &MDC); + if (!SizeIds.size()) + return false; + } else { + if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumVals, + ValueDataArray.get(), NumVals, TotalCount)) { + return false; } - - // ValueCounts are sorted on the count. Break at the first un-profitable - // value. - if (!isProfitable(C, RemainCount)) { - RemainingVDs.insert(RemainingVDs.end(), I, E); - break; + ActualCount = TotalCount; + SavedTotalCount = TotalCount; + if (MemOPScaleCount) { + auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent()); + if (!BBEdgeCount) { + return false; + } + ActualCount = *BBEdgeCount; } - if (!SeenSizeId.insert(V).second) { - errs() << "warning: Invalid Profile Data in Function " << Func.getName() - << ": Two identical values in MemOp value counts.\n"; + ArrayRef VDs(ValueDataArray.get(), NumVals); + LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count " + << ActualCount << "\n"); + LLVM_DEBUG(for (auto &VD + : VDs) { + dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; + }); + if (ActualCount < MemOPCountThreshold) { + return false; + } + // Skip if the total value profiled count is 0, in which case we can't + // scale up the counts properly (and there is no profitable transformation). 
+ if (TotalCount == 0) { return false; } - SizeIds.push_back(V); - CaseCounts.push_back(C); - if (C > MaxCount) - MaxCount = C; + TotalCount = ActualCount; + if (MemOPScaleCount) + LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount + << " denominator = " << SavedTotalCount << "\n"); + + // Keeping track of the count of the default case: + RemainCount = TotalCount; + SavedRemainCount = SavedTotalCount; + // Default case is in the front -- save the slot here. + CaseCounts.push_back(0); + for (auto I = VDs.begin(), E = VDs.end(); I != E; ++I) { + auto &VD = *I; + int64_t V = VD.Value; + uint64_t C = VD.Count; + if (MemOPScaleCount) + C = getScaledCount(C, ActualCount, SavedTotalCount); + + if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize) { + RemainingVDs.push_back(VD); + continue; + } - assert(RemainCount >= C); - RemainCount -= C; - assert(SavedRemainCount >= VD.Count); - SavedRemainCount -= VD.Count; + // ValueCounts are sorted on the count. Break at the first un-profitable + // value. 
+ if (!isProfitable(C, RemainCount)) { + RemainingVDs.insert(RemainingVDs.end(), I, E); + break; + } - if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) { - RemainingVDs.insert(RemainingVDs.end(), I + 1, E); - break; - } - } + if (!SeenSizeId.insert(V).second) { + errs() << "warning: Invalid Profile Data in Function " << Func.getName() + << ": Two identical values in MemOp value counts.\n"; + return false; + } - if (Version == 0) - return false; + SizeIds.push_back(V); + CaseCounts.push_back(C); + if (C > MaxCount) + MaxCount = C; - CaseCounts[0] = RemainCount; - if (RemainCount > MaxCount) - MaxCount = RemainCount; + assert(RemainCount >= C); + RemainCount -= C; + assert(SavedRemainCount >= VD.Count); + SavedRemainCount -= VD.Count; - uint64_t SumForOpt = TotalCount - RemainCount; + if (++Version >= MemOPMaxVersion && MemOPMaxVersion != 0) { + RemainingVDs.insert(RemainingVDs.end(), I + 1, E); + break; + } + } - LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version - << " Versions (covering " << SumForOpt << " out of " - << TotalCount << ")\n"); + if (Version == 0) { + return false; + } + + CaseCounts[0] = RemainCount; + if (RemainCount > MaxCount) + MaxCount = RemainCount; + SumForOpt = TotalCount - RemainCount; + LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version + << " Versions (covering " << SumForOpt << " out of " + << TotalCount << ")\n"); + } + // mem_op(..., size) // ==> // switch (size) { @@ -458,13 +611,14 @@ bool MemOPSizeOpt::perform(MemOp MO) { static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, OptimizationRemarkEmitter &ORE, - DominatorTree *DT, TargetLibraryInfo &TLI) { + DominatorTree *DT, TargetLibraryInfo &TLI, + ModelDataAI4CMemOPCollector &MDC) { if (DisableMemOPOPT) return false; if (F.hasFnAttribute(Attribute::OptimizeForSize)) return false; - MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); + MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI, MDC); MemOPSizeOpt.perform(); 
return MemOPSizeOpt.isChanged(); } @@ -475,7 +629,12 @@ PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, auto &ORE = FAM.getResult(F); auto *DT = FAM.getCachedResult(F); auto &TLI = FAM.getResult(F); - bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); + std::error_code EC; + raw_fd_ostream RawOS(MemOPDumpFile.getValue(), EC, sys::fs::CD_OpenAlways, + sys::fs::FA_Write, sys::fs::OF_Append); + formatted_raw_ostream OS(RawOS); + ModelDataAI4CMemOPCollector MDC(OS, MemOPDumpFile, &FAM); + bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI, MDC); if (!Changed) return PreservedAnalyses::all(); auto PA = PreservedAnalyses(); diff --git a/llvm/lib/Transforms/Utils/ACPOBranchWeightModel.cpp b/llvm/lib/Transforms/Utils/ACPOBranchWeightModel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2fd034309142d97bd4884396320e6a667aa34188 --- /dev/null +++ b/llvm/lib/Transforms/Utils/ACPOBranchWeightModel.cpp @@ -0,0 +1,227 @@ +//===- ACPOBranchWeightModel.h - ACPO Branch weight moel ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Expectations. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. +// +//===----------------------------------------------------------------------===// +// +// This pass adds the branch weight metadata. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/ACPOBranchWeightModel.h" +#include "llvm/Analysis/ACPOBWModel.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/ModelDataCollector.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/ProfileSummary.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "acpo-branch-weight-model" + +namespace llvm { + +cl::opt EnableACPOBWModel("use-acpo-bw-model-new-pass", cl::Hidden,cl::init(false), cl::desc("Enable ACPO branch weight model.")); + +static cl::list ExcludedModuleList("exclude-bw-modules", cl::Hidden, cl::CommaSeparated, cl::desc("Comma separated list of functions that will be excluded.")); + +} // namespace llvm + +static cl::opt + BWACPODumpFile("branch-weight-acpo-dump-file-new-pass", + cl::init("bw-acpo-data.csv"), cl::Hidden, + cl::desc("Name of a file to dump branch weight data.")); + +enum BWModelType { acpo }; + +static cl::opt ACPOBWModelType( + "acpo-bw-model-type-new-pass", cl::desc("Choose acpo bw model type:"), + cl::init(acpo), + cl::values(clEnumVal(acpo, "Use ACPO branch weight ML model"))); + +namespace { +// Class for collecting ACPO features +class ModelDataAI4CBWCollector : public ModelDataCollector { +public: + ModelDataAI4CBWCollector(formatted_raw_ostream &OS, std::string OutputFileName) : ModelDataCollector(OS, OutputFileName) {} + + bool collectFeatures(BasicBlock &BB, BasicBlock *Succ, FunctionAnalysisManager *FAM) { + Function *F = BB.getParent(); + Module *M = F->getParent(); + if (!FAM || !M || !F) { + errs() << "One of Module, Function or FAM is nullptr\n"; + 
return false; + } + + resetRegisteredFeatures(); + ACPOCollectFeatures::FeatureInfo GlobalFeatureInfo { + ACPOCollectFeatures::FeatureIndex::NumOfFeatures, + {FAM, nullptr}, + {F, nullptr, &BB, M, nullptr, Succ}}; + + registerFeature({ACPOCollectFeatures::Scope::Function}, GlobalFeatureInfo); + registerFeature({ACPOCollectFeatures::Scope::BasicBlock}, GlobalFeatureInfo); + registerFeature({ACPOCollectFeatures::Scope::Edge}, GlobalFeatureInfo); + ModelDataCollector::collectFeatures(); + return true; + } + + void printBranchWeights(Instruction *TI, unsigned Weight) { + BasicBlock *BB = TI->getParent(); + Function *F = BB->getParent(); + Module *M = F->getParent(); + std::string Out = ""; + + for(unsigned I = 0, E = Features.size(); I != E; ++I) { + if (I) + Out += ","; + Out += Features.at(I).second; + } + + Out += "," + M->getName().str() + ","+ F->getName().str() + "," + BB->getName().str(); + Out += "," + std::to_string(Weight); + Out += "\n"; + ModelDataCollector::setOutput(Out); + return; + } +}; +} // end anonymous namespace + +// Only enable the model for 920B +static bool isCPUTarget920B(Function &F) { + const AttributeList &Attrs = F.getAttributes(); + if (!Attrs.hasFnAttrs()) return false; + + AttributeSet AS = Attrs.getFnAttrs(); + for (const Attribute &Attr : AS) { + if (Attr.isStringAttribute()) { + StringRef AttrStr = Attr.getValueAsString(); + if (AttrStr.contains("hip09")) { + return true; + } + } + } + + return false; +} + +static bool checkMismatchOperandNum(Instruction *I, MDNode *MD) { + if (isa(I)) { + return (MD->getNumOperands() == 2 || MD->getNumOperands() == 3); + } + +unsigned ExpectedNumOperands = 0; + if (BranchInst *BI = dyn_cast(I)) + ExpectedNumOperands = BI->getNumSuccessors(); + else if (SwitchInst *SI = dyn_cast(I)) + ExpectedNumOperands = SI->getNumSuccessors(); + else if (isa(I)) + ExpectedNumOperands = 1; + else if (IndirectBrInst *IBI = dyn_cast(I)) + ExpectedNumOperands = IBI->getNumDestinations(); + else if (isa(I)) + 
ExpectedNumOperands = 2; + else if (CallBrInst *CI = dyn_cast(I)) + ExpectedNumOperands = CI->getNumSuccessors(); + + return (MD->getNumOperands() == 1 + ExpectedNumOperands); +} + +bool ACPOBranchWeightModelPass::applyBranchWeightUsingACPOModel(Module &M, ModuleAnalysisManager &MAM) { + std::error_code EC; + raw_fd_ostream RawOS(BWACPODumpFile, EC, sys::fs::CD_OpenAlways, sys::fs::FA_Write, sys::fs::OF_Append); + + if (EC) { + errs() << "Could not create/open feature dump file: " << EC.message() << '\n'; + return false; + } + formatted_raw_ostream OS(RawOS); + ModelDataAI4CBWCollector MDC(OS, BWACPODumpFile); + FunctionAnalysisManager &FAM = MAM.getResult(M).getManager(); + SmallVector Weights; + bool Changed = false; + for (Function &F : M) { + auto &ORE = FAM.getResult(F); + std::unique_ptr BW = std::make_unique(&(F.getContext()), &ORE); + + for (BasicBlock &BB : F) { + Weights.clear(); + Instruction *TI = BB.getTerminator(); + for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) { + bool printData = MDC.collectFeatures(BB, TI->getSuccessor(I), &FAM); + std::vector> Features = MDC.getFeatures(); + BW->setMLCustomFeatures(Features); + if(MDC.isEmptyOutputFile()) { + MDC.printRow(true); + } + MDC.printRow(); + std::unique_ptr Advice = BW->getAdvice(); + Constant *Val = Advice->getField("BW-BranchWright"); + assert(Val != nullptr); + assert(isa(Val)); + ConstantInt *ACPOBW = dyn_cast(Val); + int64_t BranchWeight = ACPOBW->getSExtValue(); + if (BranchWeight != 100) Weights.push_back(BranchWeight); + } + + // Create and add meta data + if (Weights.empty()) { + LLVM_DEBUG(dbgs() << "No weight data. Skipping."); + } else { + MDBuilder MDB(F.getContext()); + MDNode *MDWeight = MDB.createBranchWeights(Weights); + LLVM_DEBUG(dbgs() << "Instruction before adding metadata" << *TI << "\n"); + LLVM_DEBUG(dbgs() << "Metadata node " << *MDWeight << "\n"); + if (!checkMismatchOperandNum(TI, MDWeight)) { + LLVM_DEBUG(dbgs() << "Mismatch operand number. 
Skipping.\n"); + continue; + } + TI->setMetadata(LLVMContext::MD_prof, MDWeight); + LLVM_DEBUG(dbgs() << "Instruction after adding metadata" << *TI < "\n"); + Changed = true; + } + } + } + return Changed; +} + +PreservedAnalyses ACPOBranchWeightModelPass::run(Module &M, ModuleAnalysisManager &MAM) { + if (!EnableACPOBWModel) return PreservedAnalyses::all(); + + std::unordered_set ExcludedModules(ExcludedModuleList.begin(), ExcludedModuleList.end()); + + if (ExcludedModules.count(M.getName().str())) return PreservedAnalyses::all(); + + LLVM_DEBUG(dbgs() << "Using ACPO Branch Weight Model - " << M.getName() << '\n'); + + bool Changed = false; + + switch (ACPOBWModelType) { + case acpo: + Changed = applyBranchWeightUsingACPOModel(M, MAM); + break; + default: + LLVM_DEBUG(errs() << "Invalid branch weight model type\n"); + break; + } + + if (!Changed) { + return PreservedAnalyses::all(); + } else { + return PreservedAnalyses::none(); + } +} \ No newline at end of file diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt index 8616e7b923c026196379468eabb60cca6adebc65..b663a600fcedbe6ee0345a5688713c8e5e3a31fe 100644 --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_component_library(LLVMTransformUtils + ACPOBranchWeightModel.cpp AddDiscriminators.cpp AMDGPUEmitPrintf.cpp ASanStackFrameLayout.cpp diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp index 5ce9e4fee81fbfa3609dccb0fab56619ffb850e9..0516f7628ba3ac3df7cfb1886f38e473b60e0e4f 100644 --- a/llvm/tools/opt/NewPMDriver.cpp +++ b/llvm/tools/opt/NewPMDriver.cpp @@ -72,6 +72,13 @@ cl::opt enum class DebugLogging { None, Normal, Verbose, Quiet }; +cl::opt ACPORecipe( + "use-acpo-pass_recipe", + cl::desc("Enable textual description of the ACPO phase ordering-recipes " + "inserted as PassPipeline with the new PM "), + cl::init(false) +); + static cl::opt DebugPM( "debug-pass-manager", 
cl::Hidden, cl::ValueOptional, cl::desc("Print pass management debugging information"), diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp index 671a33309a1b50e739f3ec2cc3fbac899b413716..276aee9b538f81e52837acc01e650cb7d4ec2303 100644 --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -62,6 +62,7 @@ using namespace opt_tool; static codegen::RegisterCodeGenFlags CFG; +extern cl::opt ACPORecipe; // The OptimizationList is automatically populated with registered Passes by the // PassNameParser. static cl::list PassList(cl::desc( @@ -721,6 +722,24 @@ int main(int argc, char **argv) { ? OK_OutputAssembly : (OutputThinLTOBC ? OK_OutputThinLTOBitcode : OK_OutputBitcode); + // Use pass ordering defined by ACPO instead (clang frontend part) + // 1. Clang passes with -fPASS should be intact in the compilation pipeline. + // 2. If necessary, Using opt, needed passes shuould be added with -passes="PASS". + // 3. Optimization level should be set at O3 when compiling with clang, e.g.: + // clang -O3 -mllvm -use-acpo-bw-model -fai4c-recipe + + if (ACPORecipe) { + errs() << "ACPO Phase-ordering recipes are activate: 724-DADCB\n"; + if (!OptLevelO3) { + errs() << "Add -O3 to the compilation so it sets the parameters of the passes correctly.\n"; + return 1; + } + std::string ACPOPipeline = ""; + + errs() << "Additional passes are added to the pipeline: " << PassPipeline << '\n'; + Pipeline = PassPipeline + ',' + ACPOPipeline; + } + VerifierKind VK = VK_VerifyOut; if (NoVerify) VK = VK_NoVerifier;