From 993941ae0acd546b76b7604270eb322f9e8e38aa Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 21 Jul 2023 16:21:44 -0700 Subject: [PATCH 01/94] [Backport][BOLT] Accept function start as valid jump table entry Jump tables may contain a function start address. One real-world example is when a target basic block contains a recursive tail call that is later optimized/folded into a jump table target. While analyzing a jump table, we treat start address similar to an address past the end of the containing function (a result of __builtin_unreachable), i.e. we require another "regular" entry for the heuristic to proceed. Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D156206 --- bolt/lib/Core/BinaryContext.cpp | 30 +++++++---- bolt/test/X86/jump-table-func-entry.s | 72 +++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 11 deletions(-) create mode 100644 bolt/test/X86/jump-table-func-entry.s diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index 2d2b35ee2bd9..1e95d5fe38ab 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -503,6 +503,9 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address, // Is one of the targets __builtin_unreachable? bool HasUnreachable = false; + // Does one of the entries match function start address? + bool HasStartAsEntry = false; + // Number of targets other than __builtin_unreachable. uint64_t NumRealEntries = 0; @@ -567,14 +570,21 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address, continue; } + // Function start is another special case. It is allowed in the jump table, + // but we need at least one another regular entry to distinguish the table + // from, e.g. a function pointer array. + if (Value == BF.getAddress()) { + HasStartAsEntry = true; + addEntryAddress(Value); + continue; + } + // Function or one of its fragments. const BinaryFunction *TargetBF = getBinaryFunctionContainingAddress(Value); - - bool DoesBelongToFunction = BF.containsAddress(Value) || - (TargetBF && TargetBF->isParentOrChildOf(BF)); - - // We assume that a jump table cannot have function start as an entry. - if (!DoesBelongToFunction || Value == BF.getAddress()) { + const bool DoesBelongToFunction = + BF.containsAddress(Value) || + (TargetBF && TargetBF->isParentOrChildOf(BF)); + if (!DoesBelongToFunction) { LLVM_DEBUG({ if (!BF.containsAddress(Value)) { dbgs() << "FAIL: function doesn't contain this address\n"; @@ -589,8 +599,6 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address, } } } - if (Value == BF.getAddress()) - dbgs() << "FAIL: jump table cannot have function start as an entry\n"; }); break; } @@ -611,9 +619,9 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address, } // It's a jump table if the number of real entries is more than 1, or there's - // one real entry and "unreachable" targets. If there are only multiple - // "unreachable" targets, then it's not a jump table. - return NumRealEntries + HasUnreachable >= 2; + // one real entry and one or more special targets. If there are only multiple + // special targets, then it's not a jump table. + return NumRealEntries + (HasUnreachable || HasStartAsEntry) >= 2; } void BinaryContext::populateJumpTables() { diff --git a/bolt/test/X86/jump-table-func-entry.s b/bolt/test/X86/jump-table-func-entry.s new file mode 100644 index 000000000000..77b444d520a1 --- /dev/null +++ b/bolt/test/X86/jump-table-func-entry.s @@ -0,0 +1,72 @@ +# REQUIRES: system-linux + +## Check that BOLT correctly processes jump table that contains function start +## as one of its entries. + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.exe -no-pie -Wl,-q + +# RUN: llvm-bolt %t.exe --print-normalized --print-only=foo -o %t.out \ +# RUN: |& FileCheck %s + + + + .text + .globl _start + .type _start, %function +_start: + .cfi_startproc + call foo + ret + .cfi_endproc + .size _start, .-_start + + .globl foo + .type foo, %function +foo: + .cfi_startproc +.LBB00: + movq 0x8(%rdi), %rdi + movzbl 0x1(%rdi), %eax +.LBB00_br: + jmpq *"JUMP_TABLE/foo.0"(,%rax,8) +# CHECK: jmpq {{.*}} # JUMPTABLE +# CHECK-NEXT: Successors: {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}} + +.Ltmp87085: + xorl %eax, %eax + retq + +.Ltmp87086: + cmpb $0x0, 0x8(%rdi) + setne %al + retq + +.Ltmp87088: + movb $0x1, %al + retq + +.Ltmp87087: + movzbl 0x14(%rdi), %eax + andb $0x2, %al + shrb %al + retq + + .cfi_endproc +.size foo, .-foo + +# Jump tables +.section .rodata +"JUMP_TABLE/foo.0": + .quad .Ltmp87085 + .quad .Ltmp87086 + .quad .Ltmp87087 + .quad .LBB00 + .quad .Ltmp87088 + +# CHECK: Jump table {{.*}} for function foo +# CHECK-NEXT: 0x{{.*}} : +# CHECK-NEXT: 0x{{.*}} : +# CHECK-NEXT: 0x{{.*}} : +# CHECK-NEXT: 0x{{.*}} : +# CHECK-NEXT: 0x{{.*}} : -- Gitee From a6c1f471dddbe4ac99bf49fdd1c141db89f7488e Mon Sep 17 00:00:00 2001 From: spupyrev Date: Fri, 7 Jul 2023 11:55:17 -0700 Subject: [PATCH 02/94] [Backport][BOLT] Add stale-related logging Adding some logs related to stale profile matching. The new data can be helpful to understand how "stale" the input profile is and how well the inference is able to utilize the stale data. Example of outputs on clang-10 built with LTO (profile collected on a year-old release): ``` BOLT-INFO: inferred profile for 2101 (18.52% of profiled, 100.00% of stale) functions responsible for 30.95% samples (14754697 out of 47670654) BOLT-INFO: stale inference matched 89.42% of basic blocks (79052 out of 88402 stale) responsible for 76.99% samples (645737 out of 838719 stale) ``` LTO+AutoFDO: ``` BOLT-INFO: inferred profile for 6146 (57.57% of profiled, 100.00% of stale) functions responsible for 90.34% samples (50891403 out of 56330313) BOLT-INFO: stale inference matched 74.55% of basic blocks (191295 out of 256589 stale) responsible for 57.30% samples (1288632 out of 2248799 stale) ``` Reviewed By: Amir, maksfb Differential Revision: https://reviews.llvm.org/D154737 --- bolt/include/bolt/Core/BinaryContext.h | 19 ++++++++++-- bolt/include/bolt/Core/BinaryFunction.h | 2 +- bolt/lib/Core/BinaryFunction.cpp | 4 +-- bolt/lib/Passes/BinaryPasses.cpp | 17 ++++++++--- bolt/lib/Profile/StaleProfileMatching.cpp | 36 ++++++++++++++++------- 5 files changed, 58 insertions(+), 20 deletions(-) diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 79f91985c492..7bbfb2fc33ee 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -638,9 +638,22 @@ public: /// Total hotness score according to profiling data for this binary. uint64_t TotalScore{0}; - /// Binary-wide stats for macro-fusion. - uint64_t MissedMacroFusionPairs{0}; - uint64_t MissedMacroFusionExecCount{0}; + /// Binary-wide aggregated stats. + struct BinaryStats { + /// Stats for macro-fusion. + uint64_t MissedMacroFusionPairs{0}; + uint64_t MissedMacroFusionExecCount{0}; + + /// Stats for stale profile matching: + /// the total number of basic blocks in the profile + uint32_t NumStaleBlocks{0}; + /// the number of matched basic blocks + uint32_t NumMatchedBlocks{0}; + /// the total count of samples in the profile + uint64_t StaleSampleCount{0}; + /// the count of matched samples + uint64_t MatchedSampleCount{0}; + } Stats; // Address of the first allocated segment. uint64_t FirstAllocAddress{std::numeric_limits::max()}; diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index c393b5b851d9..bbb5f99556d9 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -381,7 +381,7 @@ private: /// Profile match ratio. float ProfileMatchRatio{0.0f}; - /// Raw branch count for this function in the profile + /// Raw branch count for this function in the profile. uint64_t RawBranchCount{0}; /// Indicates the type of profile the function is using. diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 5b44a76dc8c3..3f6e74c5f774 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -2221,8 +2221,8 @@ void BinaryFunction::calculateMacroOpFusionStats() { << Twine::utohexstr(getAddress() + Offset) << " in function " << *this << "; executed " << BB.getKnownExecutionCount() << " times.\n"); - ++BC.MissedMacroFusionPairs; - BC.MissedMacroFusionExecCount += BB.getKnownExecutionCount(); + ++BC.Stats.MissedMacroFusionPairs; + BC.Stats.MissedMacroFusionExecCount += BB.getKnownExecutionCount(); } } diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index a674fb4fef76..bb760ea93ad1 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1454,6 +1454,14 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) { 100.0 * NumInferredFunctions / NumAllStaleFunctions, 100.0 * InferredSampleCount / TotalSampleCount, InferredSampleCount, TotalSampleCount); + outs() << format( + "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks" + " (%zu out of %zu stale) responsible for %.2f%% samples" + " (%zu out of %zu stale)\n", + 100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks, + BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks, + 100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount, + BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount); } if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { @@ -1562,10 +1570,11 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) { } // Print information on missed macro-fusion opportunities seen on input. - if (BC.MissedMacroFusionPairs) { - outs() << "BOLT-INFO: the input contains " << BC.MissedMacroFusionPairs - << " (dynamic count : " << BC.MissedMacroFusionExecCount - << ") opportunities for macro-fusion optimization"; + if (BC.Stats.MissedMacroFusionPairs) { + outs() << format("BOLT-INFO: the input contains %zu (dynamic count : %zu)" + " opportunities for macro-fusion optimization", + BC.Stats.MissedMacroFusionPairs, + BC.Stats.MissedMacroFusionExecCount); switch (opts::AlignMacroOpFusion) { case MFT_NONE: outs() << ". Use -align-macro-fusion to fix.\n"; diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index b009d57a0e6e..535f7da1c1a9 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -236,14 +236,11 @@ public: /// Find the most similar block for a given hash. const FlowBlock *matchBlock(BlendedBlockHash BlendedHash) const { auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash); - if (BlockIt == OpHashToBlocks.end()) { + if (BlockIt == OpHashToBlocks.end()) return nullptr; - } FlowBlock *BestBlock = nullptr; uint64_t BestDist = std::numeric_limits::max(); - for (auto It : BlockIt->second) { - FlowBlock *Block = It.second; - BlendedBlockHash Hash = It.first; + for (const auto &[Hash, Block] : BlockIt->second) { uint64_t Dist = Hash.distance(BlendedHash); if (BestBlock == nullptr || Dist < BestDist) { BestDist = Dist; @@ -253,6 +250,14 @@ public: return BestBlock; } + /// Returns true if the two basic blocks (in the binary and in the profile) + /// corresponding to the given hashes are matched to each other with a high + /// confidence. + static bool isHighConfidenceMatch(BlendedBlockHash Hash1, + BlendedBlockHash Hash2) { + return Hash1.InstrHash == Hash2.InstrHash; + } + private: using HashBlockPairType = std::pair; std::unordered_map> OpHashToBlocks; @@ -393,7 +398,8 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) { /// of the basic blocks in the binary, the count is "matched" to the block. /// Similarly, if both the source and the target of a count in the profile are /// matched to a jump in the binary, the count is recorded in CFG. -void matchWeightsByHashes(const BinaryFunction::BasicBlockOrderType &BlockOrder, +void matchWeightsByHashes(BinaryContext &BC, + const BinaryFunction::BasicBlockOrderType &BlockOrder, const yaml::bolt::BinaryFunctionProfile &YamlBF, FlowFunction &Func) { assert(Func.Blocks.size() == BlockOrder.size() + 1); @@ -417,19 +423,29 @@ void matchWeightsByHashes(const BinaryFunction::BasicBlockOrderType &BlockOrder, // Match blocks from the profile to the blocks in CFG for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) { assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile"); - BlendedBlockHash BlendedHash(YamlBB.Hash); - const FlowBlock *MatchedBlock = Matcher.matchBlock(BlendedHash); + BlendedBlockHash YamlHash(YamlBB.Hash); + const FlowBlock *MatchedBlock = Matcher.matchBlock(YamlHash); if (MatchedBlock != nullptr) { MatchedBlocks[YamlBB.Index] = MatchedBlock; LLVM_DEBUG(dbgs() << "Matched yaml block with bid = " << YamlBB.Index << " and hash = " << Twine::utohexstr(YamlBB.Hash) << " to BB with index = " << MatchedBlock->Index - 1 << "\n"); + // Update matching stats accounting for the matched block. + BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1]; + if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) { + ++BC.Stats.NumMatchedBlocks; + BC.Stats.MatchedSampleCount += YamlBB.ExecCount; + } } else { LLVM_DEBUG( dbgs() << "Couldn't match yaml block with bid = " << YamlBB.Index << " and hash = " << Twine::utohexstr(YamlBB.Hash) << "\n"); } + + // Update matching stats. + ++BC.Stats.NumStaleBlocks; + BC.Stats.StaleSampleCount += YamlBB.ExecCount; } // Match jumps from the profile to the jumps from CFG @@ -475,7 +491,7 @@ void matchWeightsByHashes(const BinaryFunction::BasicBlockOrderType &BlockOrder, // Assign block counts based on in-/out- jumps for (FlowBlock &Block : Func.Blocks) { if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) { - assert(Block.HasUnknownWeight && "unmatched block with positive count"); + assert(Block.HasUnknownWeight && "unmatched block with a positive count"); continue; } Block.HasUnknownWeight = false; @@ -702,7 +718,7 @@ bool YAMLProfileReader::inferStaleProfile( FlowFunction Func = createFlowFunction(BlockOrder); // Match as many block/jump counts from the stale profile as possible - matchWeightsByHashes(BlockOrder, YamlBF, Func); + matchWeightsByHashes(BF.getBinaryContext(), BlockOrder, YamlBF, Func); // Adjust the flow function by marking unreachable blocks Unlikely so that // they don't get any counts assigned -- Gitee From 0f5bc3706d5850646ecf820a7c5a8742d9ecc67f Mon Sep 17 00:00:00 2001 From: spupyrev Date: Fri, 7 Jul 2023 11:55:17 -0700 Subject: [PATCH 03/94] [Backport][BOLT] (Minor) Changes in stale inference 1. Using ADT/Bitfields.h for hash computation; this is equivalent but shorter than the existing implementation 2. Getting rid of Layout indices for stale matching; using BB->getIndex for indexing Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D155748 --- bolt/lib/Profile/StaleProfileMatching.cpp | 84 +++++++++-------------- bolt/lib/Profile/YAMLProfileReader.cpp | 3 - 2 files changed, 34 insertions(+), 53 deletions(-) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 535f7da1c1a9..b5895d19de20 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -27,17 +27,18 @@ #include "bolt/Core/HashUtilities.h" #include "bolt/Profile/YAMLProfileReader.h" +#include "llvm/ADT/Bitfields.h" #include "llvm/ADT/Hashing.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/SampleProfileInference.h" #include +using namespace llvm; + #undef DEBUG_TYPE #define DEBUG_TYPE "bolt-prof" -using namespace llvm; - namespace opts { extern cl::OptionCategory BoltOptCategory; @@ -141,49 +142,29 @@ namespace bolt { /// components are of smaller size (e.g., uint16_t or uint8_t). struct BlendedBlockHash { private: - static uint64_t combineHashes(uint16_t Hash1, uint16_t Hash2, uint16_t Hash3, - uint16_t Hash4) { - uint64_t Hash = 0; - - Hash |= uint64_t(Hash4); - Hash <<= 16; - - Hash |= uint64_t(Hash3); - Hash <<= 16; - - Hash |= uint64_t(Hash2); - Hash <<= 16; - - Hash |= uint64_t(Hash1); - - return Hash; - } - - static void parseHashes(uint64_t Hash, uint16_t &Hash1, uint16_t &Hash2, - uint16_t &Hash3, uint16_t &Hash4) { - Hash1 = Hash & 0xffff; - Hash >>= 16; - - Hash2 = Hash & 0xffff; - Hash >>= 16; - - Hash3 = Hash & 0xffff; - Hash >>= 16; - - Hash4 = Hash & 0xffff; - Hash >>= 16; - } + using ValueOffset = Bitfield::Element; + using ValueOpcode = Bitfield::Element; + using ValueInstr = Bitfield::Element; + using ValueNeighbor = Bitfield::Element; public: explicit BlendedBlockHash() {} - explicit BlendedBlockHash(uint64_t CombinedHash) { - parseHashes(CombinedHash, Offset, OpcodeHash, InstrHash, NeighborHash); + explicit BlendedBlockHash(uint64_t Hash) { + Offset = Bitfield::get(Hash); + OpcodeHash = Bitfield::get(Hash); + InstrHash = Bitfield::get(Hash); + NeighborHash = Bitfield::get(Hash); } /// Combine the blended hash into uint64_t. uint64_t combine() const { - return combineHashes(Offset, OpcodeHash, InstrHash, NeighborHash); + uint64_t Hash = 0; + Bitfield::set(Hash, Offset); + Bitfield::set(Hash, OpcodeHash); + Bitfield::set(Hash, InstrHash); + Bitfield::set(Hash, NeighborHash); + return Hash; } /// Compute a distance between two given blended hashes. The smaller the @@ -311,6 +292,7 @@ void BinaryFunction::computeBlockHashes() const { BB->setHash(BlendedHashes[I].combine()); } } + /// Create a wrapper flow function to use with the profile inference algorithm, /// and initialize its jumps and metadata. FlowFunction @@ -319,7 +301,7 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) { // Add a special "dummy" source so that there is always a unique entry point. // Because of the extra source, for all other blocks in FlowFunction it holds - // that Block.Index == BB->getLayoutIndex() + 1 + // that Block.Index == BB->getIndex() + 1 FlowBlock EntryBlock; EntryBlock.Index = 0; Func.Blocks.push_back(EntryBlock); @@ -330,7 +312,7 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) { FlowBlock &Block = Func.Blocks.back(); Block.Index = Func.Blocks.size() - 1; (void)BB; - assert(Block.Index == BB->getLayoutIndex() + 1 && + assert(Block.Index == BB->getIndex() + 1 && "incorrectly assigned basic block index"); } @@ -346,8 +328,8 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) { Func.Jumps.emplace_back(); FlowJump &Jump = Func.Jumps.back(); - Jump.Source = SrcBB->getLayoutIndex() + 1; - Jump.Target = DstBB->getLayoutIndex() + 1; + Jump.Source = SrcBB->getIndex() + 1; + Jump.Target = DstBB->getIndex() + 1; InDegree[Jump.Target]++; UniqueSuccs.insert(DstBB); } @@ -359,8 +341,8 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) { Func.Jumps.emplace_back(); FlowJump &Jump = Func.Jumps.back(); - Jump.Source = SrcBB->getLayoutIndex() + 1; - Jump.Target = DstBB->getLayoutIndex() + 1; + Jump.Source = SrcBB->getIndex() + 1; + Jump.Target = DstBB->getIndex() + 1; InDegree[Jump.Target]++; UniqueSuccs.insert(DstBB); } @@ -707,31 +689,33 @@ void assignProfile(BinaryFunction &BF, bool YAMLProfileReader::inferStaleProfile( BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) { - // Make sure that block indices and hashes are up to date - BF.getLayout().updateLayoutIndices(); + LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for " + << "\"" << BF.getPrintName() << "\"\n"); + + // Make sure that block hashes are up to date. BF.computeBlockHashes(); const BinaryFunction::BasicBlockOrderType BlockOrder( BF.getLayout().block_begin(), BF.getLayout().block_end()); - // Create a wrapper flow function to use with the profile inference algorithm + // Create a wrapper flow function to use with the profile inference algorithm. FlowFunction Func = createFlowFunction(BlockOrder); // Match as many block/jump counts from the stale profile as possible matchWeightsByHashes(BF.getBinaryContext(), BlockOrder, YamlBF, Func); // Adjust the flow function by marking unreachable blocks Unlikely so that - // they don't get any counts assigned + // they don't get any counts assigned. preprocessUnreachableBlocks(Func); - // Check if profile inference can be applied for the instance + // Check if profile inference can be applied for the instance. if (!canApplyInference(Func)) return false; - // Apply the profile inference algorithm + // Apply the profile inference algorithm. applyInference(Func); - // Collect inferred counts and update function annotations + // Collect inferred counts and update function annotations. assignProfile(BF, BlockOrder, Func); // As of now, we always mark the binary function having "correct" profile. diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index 90e43b402750..7b5a751025b4 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -250,9 +250,6 @@ bool YAMLProfileReader::parseFunctionProfile( << " edges in profile did not match function " << BF << '\n'; if (!ProfileMatched && opts::InferStaleProfile) { - if (opts::Verbosity >= 1) - outs() << "BOLT-INFO: applying profile inference for " - << "\"" << BF.getPrintName() << "\"\n"; if (inferStaleProfile(BF, YamlBF)) { ProfileMatched = true; BF.markProfiled(YamlBP.Header.Flags); -- Gitee From be6fa618eae899978ac1e16d15782f12908b6657 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Thu, 27 Jul 2023 13:56:48 -0700 Subject: [PATCH 04/94] [Backport][BOLT][NFC] Format ReorderFunctions.cpp --- bolt/lib/Passes/ReorderFunctions.cpp | 295 +++++++++++++-------------- 1 file changed, 139 insertions(+), 156 deletions(-) diff --git a/bolt/lib/Passes/ReorderFunctions.cpp b/bolt/lib/Passes/ReorderFunctions.cpp index 2fc99f652bf1..58e50a07a499 100644 --- a/bolt/lib/Passes/ReorderFunctions.cpp +++ b/bolt/lib/Passes/ReorderFunctions.cpp @@ -29,82 +29,70 @@ extern cl::opt RandomSeed; extern size_t padFunction(const bolt::BinaryFunction &Function); -cl::opt -ReorderFunctions("reorder-functions", - cl::desc("reorder and cluster functions (works only with relocations)"), - cl::init(bolt::ReorderFunctions::RT_NONE), - cl::values(clEnumValN(bolt::ReorderFunctions::RT_NONE, - "none", - "do not reorder functions"), - clEnumValN(bolt::ReorderFunctions::RT_EXEC_COUNT, - "exec-count", - "order by execution count"), - clEnumValN(bolt::ReorderFunctions::RT_HFSORT, - "hfsort", - "use hfsort algorithm"), - clEnumValN(bolt::ReorderFunctions::RT_HFSORT_PLUS, - "hfsort+", - "use hfsort+ algorithm"), - clEnumValN(bolt::ReorderFunctions::RT_PETTIS_HANSEN, - "pettis-hansen", - "use Pettis-Hansen algorithm"), - clEnumValN(bolt::ReorderFunctions::RT_RANDOM, - "random", - "reorder functions randomly"), - clEnumValN(bolt::ReorderFunctions::RT_USER, - "user", - "use function order specified by -function-order")), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); +cl::opt ReorderFunctions( + "reorder-functions", + cl::desc("reorder and cluster functions (works only with relocations)"), + cl::init(bolt::ReorderFunctions::RT_NONE), + cl::values(clEnumValN(bolt::ReorderFunctions::RT_NONE, "none", + "do not reorder functions"), + clEnumValN(bolt::ReorderFunctions::RT_EXEC_COUNT, "exec-count", + "order by execution count"), + clEnumValN(bolt::ReorderFunctions::RT_HFSORT, "hfsort", + "use hfsort algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_HFSORT_PLUS, "hfsort+", + "use hfsort+ algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_PETTIS_HANSEN, + "pettis-hansen", "use Pettis-Hansen algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_RANDOM, "random", + "reorder functions randomly"), + clEnumValN(bolt::ReorderFunctions::RT_USER, "user", + "use function order specified by -function-order")), + cl::ZeroOrMore, cl::cat(BoltOptCategory)); static cl::opt ReorderFunctionsUseHotSize( "reorder-functions-use-hot-size", cl::desc("use a function's hot size when doing clustering"), cl::init(true), cl::cat(BoltOptCategory)); -static cl::opt -FunctionOrderFile("function-order", - cl::desc("file containing an ordered list of functions to use for function " - "reordering"), - cl::cat(BoltOptCategory)); +static cl::opt FunctionOrderFile( + "function-order", + cl::desc("file containing an ordered list of functions to use for function " + "reordering"), + cl::cat(BoltOptCategory)); -static cl::opt -GenerateFunctionOrderFile("generate-function-order", - cl::desc("file to dump the ordered list of functions to use for function " - "reordering"), - cl::cat(BoltOptCategory)); +static cl::opt GenerateFunctionOrderFile( + "generate-function-order", + cl::desc("file to dump the ordered list of functions to use for function " + "reordering"), + cl::cat(BoltOptCategory)); -static cl::opt -LinkSectionsFile("generate-link-sections", - cl::desc("generate a list of function sections in a format suitable for " - "inclusion in a linker script"), - cl::cat(BoltOptCategory)); +static cl::opt LinkSectionsFile( + "generate-link-sections", + cl::desc("generate a list of function sections in a format suitable for " + "inclusion in a linker script"), + cl::cat(BoltOptCategory)); static cl::opt UseEdgeCounts("use-edge-counts", cl::desc("use edge count data when doing clustering"), cl::init(true), cl::cat(BoltOptCategory)); -static cl::opt -CgFromPerfData("cg-from-perf-data", - cl::desc("use perf data directly when constructing the call graph" - " for stale functions"), - cl::init(true), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); +static cl::opt CgFromPerfData( + "cg-from-perf-data", + cl::desc("use perf data directly when constructing the call graph" + " for stale functions"), + cl::init(true), cl::ZeroOrMore, cl::cat(BoltOptCategory)); static cl::opt CgIgnoreRecursiveCalls( "cg-ignore-recursive-calls", cl::desc("ignore recursive calls when constructing the call graph"), cl::init(true), cl::cat(BoltOptCategory)); -static cl::opt -CgUseSplitHotSize("cg-use-split-hot-size", - cl::desc("use hot/cold data on basic blocks to determine hot sizes for " - "call graph functions"), - cl::init(false), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); +static cl::opt CgUseSplitHotSize( + "cg-use-split-hot-size", + cl::desc("use hot/cold data on basic blocks to determine hot sizes for " + "call graph functions"), + cl::init(false), cl::ZeroOrMore, cl::cat(BoltOptCategory)); } // namespace opts @@ -157,13 +145,13 @@ void ReorderFunctions::printStats(const std::vector &Clusters, bool PrintDetailed = opts::Verbosity > 1; #ifndef NDEBUG PrintDetailed |= - (DebugFlag && isCurrentDebugType("hfsort") && opts::Verbosity > 0); + (DebugFlag && isCurrentDebugType("hfsort") && opts::Verbosity > 0); #endif - uint64_t TotalSize = 0; - uint64_t CurPage = 0; - uint64_t Hotfuncs = 0; + uint64_t TotalSize = 0; + uint64_t CurPage = 0; + uint64_t Hotfuncs = 0; double TotalDistance = 0; - double TotalCalls = 0; + double TotalCalls = 0; double TotalCalls64B = 0; double TotalCalls4KB = 0; double TotalCalls2MB = 0; @@ -198,21 +186,22 @@ void ReorderFunctions::printStats(const std::vector &Clusters, << "BOLT-INFO: Src: " << *Cg.nodeIdToFunc(FuncId) << "\n" << "BOLT-INFO: Dst: " << *Cg.nodeIdToFunc(Dst) << "\n" << "BOLT-INFO: Weight = " << W << "\n" - << "BOLT-INFO: AvgOffset = " << Arc.avgCallOffset() << "\n"; + << "BOLT-INFO: AvgOffset = " << Arc.avgCallOffset() + << "\n"; Calls += W; - if (D < 64) TotalCalls64B += W; - if (D < 4096) TotalCalls4KB += W; - if (D < (2 << 20)) TotalCalls2MB += W; + if (D < 64) + TotalCalls64B += W; + if (D < 4096) + TotalCalls4KB += W; + if (D < (2 << 20)) + TotalCalls2MB += W; Dist += Arc.weight() * D; if (PrintDetailed) outs() << format("BOLT-INFO: arc: %u [@%lu+%.1lf] -> %u [@%lu]: " "weight = %.0lf, callDist = %f\n", - Arc.src(), - FuncAddr[Arc.src()], - Arc.avgCallOffset(), - Arc.dst(), - FuncAddr[Arc.dst()], - Arc.weight(), D); + Arc.src(), FuncAddr[Arc.src()], + Arc.avgCallOffset(), Arc.dst(), + FuncAddr[Arc.dst()], Arc.weight(), D); } TotalCalls += Calls; TotalDistance += Dist; @@ -290,33 +279,30 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) { switch (opts::ReorderFunctions) { case RT_NONE: break; - case RT_EXEC_COUNT: - { - std::vector SortedFunctions(BFs.size()); - uint32_t Index = 0; - llvm::transform(llvm::make_second_range(BFs), SortedFunctions.begin(), - [](BinaryFunction &BF) { return &BF; }); - llvm::stable_sort(SortedFunctions, [&](const BinaryFunction *A, - const BinaryFunction *B) { - if (A->isIgnored()) + case RT_EXEC_COUNT: { + std::vector SortedFunctions(BFs.size()); + uint32_t Index = 0; + llvm::transform(llvm::make_second_range(BFs), SortedFunctions.begin(), + [](BinaryFunction &BF) { return &BF; }); + llvm::stable_sort(SortedFunctions, [&](const BinaryFunction *A, + const BinaryFunction *B) { + if (A->isIgnored()) + return false; + const size_t PadA = opts::padFunction(*A); + const size_t PadB = opts::padFunction(*B); + if (!PadA || !PadB) { + if (PadA) + return true; + if (PadB) return false; - const size_t PadA = opts::padFunction(*A); - const size_t PadB = opts::padFunction(*B); - if (!PadA || !PadB) { - if (PadA) - return true; - if (PadB) - return false; - } - return !A->hasProfile() && - (B->hasProfile() || - (A->getExecutionCount() > B->getExecutionCount())); - }); - for (BinaryFunction *BF : SortedFunctions) - if (BF->hasProfile()) - BF->setIndex(Index++); - } - break; + } + return !A->hasProfile() && (B->hasProfile() || (A->getExecutionCount() > + B->getExecutionCount())); + }); + for (BinaryFunction *BF : SortedFunctions) + if (BF->hasProfile()) + BF->setIndex(Index++); + } break; case RT_HFSORT: Clusters = clusterize(Cg); break; @@ -330,74 +316,71 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) { std::srand(opts::RandomSeed); Clusters = randomClusters(Cg); break; - case RT_USER: - { - // Build LTOCommonNameMap - StringMap> LTOCommonNameMap; - for (const BinaryFunction &BF : llvm::make_second_range(BFs)) - for (StringRef Name : BF.getNames()) - if (std::optional LTOCommonName = getLTOCommonName(Name)) - LTOCommonNameMap[*LTOCommonName].push_back(BF.getAddress()); - - uint32_t Index = 0; - uint32_t InvalidEntries = 0; - for (const std::string &Function : readFunctionOrderFile()) { - std::vector FuncAddrs; - - BinaryData *BD = BC.getBinaryDataByName(Function); - if (!BD) { - // If we can't find the main symbol name, look for alternates. - uint32_t LocalID = 1; - while (true) { - const std::string FuncName = - Function + "/" + std::to_string(LocalID); - BD = BC.getBinaryDataByName(FuncName); - if (BD) - FuncAddrs.push_back(BD->getAddress()); - else - break; - LocalID++; - } - // Strip LTO suffixes - if (std::optional CommonName = getLTOCommonName(Function)) - if (LTOCommonNameMap.contains(*CommonName)) - llvm::append_range(FuncAddrs, LTOCommonNameMap[*CommonName]); - } else { - FuncAddrs.push_back(BD->getAddress()); + case RT_USER: { + // Build LTOCommonNameMap + StringMap> LTOCommonNameMap; + for (const BinaryFunction &BF : llvm::make_second_range(BFs)) + for (StringRef Name : BF.getNames()) + if (std::optional LTOCommonName = getLTOCommonName(Name)) + LTOCommonNameMap[*LTOCommonName].push_back(BF.getAddress()); + + uint32_t Index = 0; + uint32_t InvalidEntries = 0; + for (const std::string &Function : readFunctionOrderFile()) { + std::vector FuncAddrs; + + BinaryData *BD = BC.getBinaryDataByName(Function); + if (!BD) { + // If we can't find the main symbol name, look for alternates. + uint32_t LocalID = 1; + while (true) { + const std::string FuncName = Function + "/" + std::to_string(LocalID); + BD = BC.getBinaryDataByName(FuncName); + if (BD) + FuncAddrs.push_back(BD->getAddress()); + else + break; + LocalID++; } + // Strip LTO suffixes + if (std::optional CommonName = getLTOCommonName(Function)) + if (LTOCommonNameMap.contains(*CommonName)) + llvm::append_range(FuncAddrs, LTOCommonNameMap[*CommonName]); + } else { + FuncAddrs.push_back(BD->getAddress()); + } + + if (FuncAddrs.empty()) { + if (opts::Verbosity >= 1) + errs() << "BOLT-WARNING: Reorder functions: can't find function " + << "for " << Function << "\n"; + ++InvalidEntries; + continue; + } - if (FuncAddrs.empty()) { + for (const uint64_t FuncAddr : FuncAddrs) { + const BinaryData *FuncBD = BC.getBinaryDataAtAddress(FuncAddr); + assert(FuncBD); + + BinaryFunction *BF = BC.getFunctionForSymbol(FuncBD->getSymbol()); + if (!BF) { if (opts::Verbosity >= 1) errs() << "BOLT-WARNING: Reorder functions: can't find function " << "for " << Function << "\n"; ++InvalidEntries; - continue; - } - - for (const uint64_t FuncAddr : FuncAddrs) { - const BinaryData *FuncBD = BC.getBinaryDataAtAddress(FuncAddr); - assert(FuncBD); - - BinaryFunction *BF = BC.getFunctionForSymbol(FuncBD->getSymbol()); - if (!BF) { - if (opts::Verbosity >= 1) - errs() << "BOLT-WARNING: Reorder functions: can't find function " - << "for " << Function << "\n"; - ++InvalidEntries; - break; - } - if (!BF->hasValidIndex()) - BF->setIndex(Index++); - else if (opts::Verbosity > 0) - errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function - << "\n"; + break; } + if (!BF->hasValidIndex()) + BF->setIndex(Index++); + else if (opts::Verbosity > 0) + errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function + << "\n"; } - if (InvalidEntries) - errs() << "BOLT-WARNING: Reorder functions: can't find functions for " - << InvalidEntries << " entries in -function-order list\n"; } - break; + if (InvalidEntries) + errs() << "BOLT-WARNING: Reorder functions: can't find functions for " + << InvalidEntries << " entries in -function-order list\n"; + } break; } reorder(std::move(Clusters), BFs); -- Gitee From 77e1e78d15469ed575d53139f2c604d878c7df45 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Tue, 13 Jun 2023 10:08:00 -0700 Subject: [PATCH 05/94] [Backport]A new code layout algorithm for function reordering [2/3] We are bringing a new algorithm for function layout (reordering) based on the call graph (extracted from a profile data). The algorithm is an improvement of top of a known heuristic, C^3. It tries to co-locate hot and frequently executed together functions in the resulting ordering. Unlike C^3, it explores a larger search space and have an objective closely tied to the performance of instruction and i-TLB caches. Hence, the name CDS = Cache-Directed Sort. The algorithm can be used at the linking or post-linking (e.g., BOLT) stage. The algorithm shares some similarities with C^3 and an approach for basic block reordering (ext-tsp). It works with chains (ordered lists) of functions. Initially all chains are isolated functions. On every iteration, we pick a pair of chains whose merging yields the biggest increase in the objective, which is a weighted combination of frequency-based and distance-based locality. That is, we try to co-locate hot functions together (so they can share the cache lines) and functions frequently executed together. The merging process stops when there is only one chain left, or when merging does not improve the objective. In the latter case, the remaining chains are sorted by density in the decreasing order. **Complexity** We regularly apply the algorithm for large data-center binaries containing 10K+ (hot) functions, and the algorithm takes only a few seconds. For some extreme cases with 100K-1M nodes, the runtime is within minutes. **Perf-impact** We extensively tested the implementation extensively on a benchmark of isolated binaries and prod services. The impact is measurable for "larger" binaries that are front-end bound: the cpu time improvement (on top of C^3) is in the range of [0% .. 1%], which is a result of a reduced i-TLB miss rate (by up to 20%) and i-cache miss rate (up to 5%). Reviewed By: rahmanl Differential Revision: https://reviews.llvm.org/D152834 --- .../llvm/Transforms/Utils/CodeLayout.h | 34 + llvm/lib/Transforms/Utils/CodeLayout.cpp | 599 +++++++++++++++--- 2 files changed, 547 insertions(+), 86 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/CodeLayout.h b/llvm/include/llvm/Transforms/Utils/CodeLayout.h index e8106e474332..11a829b601ce 100644 --- a/llvm/include/llvm/Transforms/Utils/CodeLayout.h +++ b/llvm/include/llvm/Transforms/Utils/CodeLayout.h @@ -53,6 +53,40 @@ double calcExtTspScore(const std::vector &NodeSizes, const std::vector &NodeCounts, const std::vector &EdgeCounts); +/// Algorithm-specific params for Cache-Directed Sort. The values are tuned for +/// the best performance of large-scale front-end bound binaries. +struct CDSortConfig { + /// The size of the cache. + unsigned CacheEntries = 16; + /// The size of a line in the cache. + unsigned CacheSize = 2048; + /// The power exponent for the distance-based locality. + double DistancePower = 0.25; + /// The scale factor for the frequency-based locality. + double FrequencyScale = 0.25; +}; + +/// Apply a Cache-Directed Sort for functions represented by a call graph. +/// The placement is done by optimizing the call locality by co-locating +/// frequently executed functions. +/// \p FuncSizes: The sizes of the nodes (in bytes). +/// \p FuncCounts: The execution counts of the nodes in the profile. +/// \p CallCounts: The execution counts of every edge (jump) in the profile. The +/// map also defines the edges in CFG and should include 0-count edges. +/// \p CallOffsets: The offsets of the calls from their source nodes. +/// \returns The best function order found. +std::vector applyCDSLayout(const std::vector &FuncSizes, + const std::vector &FuncCounts, + const std::vector &CallCounts, + const std::vector &CallOffsets); + +/// Apply a Cache-Directed Sort with a custom config. +std::vector applyCDSLayout(const CDSortConfig &Config, + const std::vector &FuncSizes, + const std::vector &FuncCounts, + const std::vector &CallCounts, + const std::vector &CallOffsets); + } // end namespace llvm #endif // LLVM_TRANSFORMS_UTILS_CODELAYOUT_H diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp index ac74a1c116cc..6ef4ae3341e3 100644 --- a/llvm/lib/Transforms/Utils/CodeLayout.cpp +++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp @@ -45,6 +45,7 @@ #include "llvm/Support/Debug.h" #include +#include using namespace llvm; #define DEBUG_TYPE "code-layout" @@ -61,8 +62,8 @@ cl::opt ApplyExtTspWithoutProfile( cl::init(true), cl::Hidden); } // namespace llvm -// Algorithm-specific params. The values are tuned for the best performance -// of large-scale front-end bound binaries. +// Algorithm-specific params for Ext-TSP. The values are tuned for the best +// performance of large-scale front-end bound binaries. static cl::opt ForwardWeightCond( "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1), cl::desc("The weight of conditional forward jumps for ExtTSP value")); @@ -113,6 +114,21 @@ static cl::opt EnableChainSplitAlongJumps( "ext-tsp-enable-chain-split-along-jumps", cl::ReallyHidden, cl::init(true), cl::desc("The maximum size of a chain to apply splitting")); +// Algorithm-specific options for CDS. +static cl::opt CacheEntries("cds-cache-entries", cl::ReallyHidden, + cl::desc("The size of the cache")); + +static cl::opt CacheSize("cds-cache-size", cl::ReallyHidden, + cl::desc("The size of a line in the cache")); + +static cl::opt DistancePower( + "cds-distance-power", cl::ReallyHidden, + cl::desc("The power exponent for the distance-based locality")); + +static cl::opt FrequencyScale( + "cds-frequency-scale", cl::ReallyHidden, + cl::desc("The scale factor for the frequency-based locality")); + namespace { // Epsilon for comparison of doubles. @@ -280,9 +296,9 @@ struct ChainT { } ChainEdge *getEdge(ChainT *Other) const { - for (auto It : Edges) { - if (It.first == Other) - return It.second; + for (const auto &[Chain, ChainEdge] : Edges) { + if (Chain == Other) + return ChainEdge; } return nullptr; } @@ -304,11 +320,11 @@ struct ChainT { void merge(ChainT *Other, const std::vector &MergedBlocks) { Nodes = MergedBlocks; - // Update the chain's data + // Update the chain's data. ExecutionCount += Other->ExecutionCount; Size += Other->Size; Id = Nodes[0]->Index; - // Update the node's data + // Update the node's data. for (size_t Idx = 0; Idx < Nodes.size(); Idx++) { Nodes[Idx]->CurChain = this; Nodes[Idx]->CurIndex = Idx; @@ -340,7 +356,7 @@ struct ChainT { /// An edge in the graph representing jumps between two chains. /// When nodes are merged into chains, the edges are combined too so that -/// there is always at most one edge between a pair of chains +/// there is always at most one edge between a pair of chains. struct ChainEdge { ChainEdge(const ChainEdge &) = delete; ChainEdge(ChainEdge &&) = default; @@ -426,40 +442,34 @@ private: uint64_t NodeT::outCount() const { uint64_t Count = 0; - for (JumpT *Jump : OutJumps) { + for (JumpT *Jump : OutJumps) Count += Jump->ExecutionCount; - } return Count; } uint64_t NodeT::inCount() const { uint64_t Count = 0; - for (JumpT *Jump : InJumps) { + for (JumpT *Jump : InJumps) Count += Jump->ExecutionCount; - } return Count; } void ChainT::mergeEdges(ChainT *Other) { - // Update edges adjacent to chain Other - for (auto EdgeIt : Other->Edges) { - ChainT *DstChain = EdgeIt.first; - ChainEdge *DstEdge = EdgeIt.second; + // Update edges adjacent to chain Other. + for (const auto &[DstChain, DstEdge] : Other->Edges) { ChainT *TargetChain = DstChain == Other ? this : DstChain; ChainEdge *CurEdge = getEdge(TargetChain); if (CurEdge == nullptr) { DstEdge->changeEndpoint(Other, this); this->addEdge(TargetChain, DstEdge); - if (DstChain != this && DstChain != Other) { + if (DstChain != this && DstChain != Other) DstChain->addEdge(this, DstEdge); - } } else { CurEdge->moveJumps(DstEdge); } - // Cleanup leftover edge - if (DstChain != Other) { + // Cleanup leftover edge. + if (DstChain != Other) DstChain->removeEdge(Other); - } } } @@ -512,7 +522,7 @@ private: MergedChain mergeNodes(const std::vector &X, const std::vector &Y, size_t MergeOffset, MergeTypeT MergeType) { - // Split the first chain, X, into X1 and X2 + // Split the first chain, X, into X1 and X2. NodeIter BeginX1 = X.begin(); NodeIter EndX1 = X.begin() + MergeOffset; NodeIter BeginX2 = X.begin() + MergeOffset; @@ -520,7 +530,7 @@ MergedChain mergeNodes(const std::vector &X, NodeIter BeginY = Y.begin(); NodeIter EndY = Y.end(); - // Construct a new chain from the three existing ones + // Construct a new chain from the three existing ones. switch (MergeType) { case MergeTypeT::X_Y: return MergedChain(BeginX1, EndX2, BeginY, EndY); @@ -571,7 +581,7 @@ private: for (uint64_t Idx = 0; Idx < NumNodes; Idx++) { uint64_t Size = std::max(NodeSizes[Idx], 1ULL); uint64_t ExecutionCount = NodeCounts[Idx]; - // The execution count of the entry node is set to at least one + // The execution count of the entry node is set to at least one. if (Idx == 0 && ExecutionCount == 0) ExecutionCount = 1; AllNodes.emplace_back(Idx, Size, ExecutionCount); @@ -586,7 +596,7 @@ private: uint64_t Pred = It.first.first; uint64_t Succ = It.first.second; OutDegree[Pred]++; - // Ignore self-edges + // Ignore self-edges. if (Pred == Succ) continue; @@ -606,30 +616,29 @@ private: Jump.IsConditional = OutDegree[Jump.Source->Index] > 1; } - // Initialize chains + // Initialize chains. AllChains.reserve(NumNodes); HotChains.reserve(NumNodes); for (NodeT &Node : AllNodes) { AllChains.emplace_back(Node.Index, &Node); Node.CurChain = &AllChains.back(); - if (Node.ExecutionCount > 0) { + if (Node.ExecutionCount > 0) HotChains.push_back(&AllChains.back()); - } } - // Initialize chain edges + // Initialize chain edges. AllEdges.reserve(AllJumps.size()); for (NodeT &PredNode : AllNodes) { for (JumpT *Jump : PredNode.OutJumps) { NodeT *SuccNode = Jump->Target; ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain); - // this edge is already present in the graph + // this edge is already present in the graph. if (CurEdge != nullptr) { assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr); CurEdge->appendJump(Jump); continue; } - // this is a new edge + // this is a new edge. AllEdges.emplace_back(Jump); PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back()); SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back()); @@ -642,7 +651,7 @@ private: /// to B are from A. Such nodes should be adjacent in the optimal ordering; /// the method finds and merges such pairs of nodes. void mergeForcedPairs() { - // Find fallthroughs based on edge weights + // Find fallthroughs based on edge weights. for (NodeT &Node : AllNodes) { if (SuccNodes[Node.Index].size() == 1 && PredNodes[SuccNodes[Node.Index][0]].size() == 1 && @@ -669,12 +678,12 @@ private: } if (SuccNode == nullptr) continue; - // Break the cycle + // Break the cycle. AllNodes[Node.ForcedPred->Index].ForcedSucc = nullptr; Node.ForcedPred = nullptr; } - // Merge nodes with their fallthrough successors + // Merge nodes with their fallthrough successors. for (NodeT &Node : AllNodes) { if (Node.ForcedPred == nullptr && Node.ForcedSucc != nullptr) { const NodeT *CurBlock = &Node; @@ -689,7 +698,7 @@ private: /// Merge pairs of chains while improving the ExtTSP objective. void mergeChainPairs() { - /// Deterministically compare pairs of chains + /// Deterministically compare pairs of chains. auto compareChainPairs = [](const ChainT *A1, const ChainT *B1, const ChainT *A2, const ChainT *B2) { if (A1 != A2) @@ -701,21 +710,19 @@ private: ChainT *BestChainPred = nullptr; ChainT *BestChainSucc = nullptr; MergeGainT BestGain; - // Iterate over all pairs of chains + // Iterate over all pairs of chains. for (ChainT *ChainPred : HotChains) { - // Get candidates for merging with the current chain - for (auto EdgeIt : ChainPred->Edges) { - ChainT *ChainSucc = EdgeIt.first; - ChainEdge *Edge = EdgeIt.second; - // Ignore loop edges + // Get candidates for merging with the current chain. + for (const auto &[ChainSucc, Edge] : ChainPred->Edges) { + // Ignore loop edges. if (ChainPred == ChainSucc) continue; - // Stop early if the combined chain violates the maximum allowed size + // Stop early if the combined chain violates the maximum allowed size. if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize) continue; - // Compute the gain of merging the two chains + // Compute the gain of merging the two chains. MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge); if (CurGain.score() <= EPS) continue; @@ -731,11 +738,11 @@ private: } } - // Stop merging when there is no improvement + // Stop merging when there is no improvement. if (BestGain.score() <= EPS) break; - // Merge the best pair of chains + // Merge the best pair of chains. mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(), BestGain.mergeType()); } @@ -743,7 +750,7 @@ private: /// Merge remaining nodes into chains w/o taking jump counts into /// consideration. This allows to maintain the original node order in the - /// absence of profile data + /// absence of profile data. void mergeColdChains() { for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) { // Iterating in reverse order to make sure original fallthrough jumps are @@ -797,7 +804,7 @@ private: return Edge->getCachedMergeGain(ChainPred, ChainSucc); } - // Precompute jumps between ChainPred and ChainSucc + // Precompute jumps between ChainPred and ChainSucc. auto Jumps = Edge->jumps(); ChainEdge *EdgePP = ChainPred->getEdge(ChainPred); if (EdgePP != nullptr) { @@ -805,34 +812,34 @@ private: } assert(!Jumps.empty() && "trying to merge chains w/o jumps"); - // The object holds the best currently chosen gain of merging the two chains + // This object holds the best chosen gain of merging two chains. MergeGainT Gain = MergeGainT(); /// Given a merge offset and a list of merge types, try to merge two chains - /// and update Gain with a better alternative + /// and update Gain with a better alternative. auto tryChainMerging = [&](size_t Offset, const std::vector &MergeTypes) { - // Skip merging corresponding to concatenation w/o splitting + // Skip merging corresponding to concatenation w/o splitting. if (Offset == 0 || Offset == ChainPred->Nodes.size()) return; - // Skip merging if it breaks Forced successors + // Skip merging if it breaks Forced successors. NodeT *Node = ChainPred->Nodes[Offset - 1]; if (Node->ForcedSucc != nullptr) return; // Apply the merge, compute the corresponding gain, and update the best - // value, if the merge is beneficial + // value, if the merge is beneficial. for (const MergeTypeT &MergeType : MergeTypes) { Gain.updateIfLessThan( computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType)); } }; - // Try to concatenate two chains w/o splitting + // Try to concatenate two chains w/o splitting. Gain.updateIfLessThan( computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y)); if (EnableChainSplitAlongJumps) { - // Attach (a part of) ChainPred before the first node of ChainSucc + // Attach (a part of) ChainPred before the first node of ChainSucc. for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) { const NodeT *SrcBlock = Jump->Source; if (SrcBlock->CurChain != ChainPred) @@ -841,7 +848,7 @@ private: tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y}); } - // Attach (a part of) ChainPred after the last node of ChainSucc + // Attach (a part of) ChainPred after the last node of ChainSucc. for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) { const NodeT *DstBlock = Jump->Source; if (DstBlock->CurChain != ChainPred) @@ -851,12 +858,12 @@ private: } } - // Try to break ChainPred in various ways and concatenate with ChainSucc + // Try to break ChainPred in various ways and concatenate with ChainSucc. if (ChainPred->Nodes.size() <= ChainSplitThreshold) { for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) { // Try to split the chain in different ways. In practice, applying // X2_Y_X1 merging is almost never provides benefits; thus, we exclude - // it from consideration to reduce the search space + // it from consideration to reduce the search space. tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1, MergeTypeT::X2_X1_Y}); } @@ -875,12 +882,12 @@ private: auto MergedBlocks = mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType); - // Do not allow a merge that does not preserve the original entry point + // Do not allow a merge that does not preserve the original entry point. if ((ChainPred->isEntry() || ChainSucc->isEntry()) && !MergedBlocks.getFirstNode()->isEntry()) return MergeGainT(); - // The gain for the new chain + // The gain for the new chain. auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->Score; return MergeGainT(NewGainScore, MergeOffset, MergeType); } @@ -891,39 +898,39 @@ private: MergeTypeT MergeType) { assert(Into != From && "a chain cannot be merged with itself"); - // Merge the nodes + // Merge the nodes. MergedChain MergedNodes = mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType); Into->merge(From, MergedNodes.getNodes()); - // Merge the edges + // Merge the edges. Into->mergeEdges(From); From->clear(); - // Update cached ext-tsp score for the new chain + // Update cached ext-tsp score for the new chain. ChainEdge *SelfEdge = Into->getEdge(Into); if (SelfEdge != nullptr) { MergedNodes = MergedChain(Into->Nodes.begin(), Into->Nodes.end()); Into->Score = extTSPScore(MergedNodes, SelfEdge->jumps()); } - // Remove the chain from the list of active chains + // Remove the chain from the list of active chains. llvm::erase_value(HotChains, From); - // Invalidate caches + // Invalidate caches. for (auto EdgeIt : Into->Edges) EdgeIt.second->invalidateCache(); } /// Concatenate all chains into the final order. void concatChains(std::vector &Order) { - // Collect chains and calculate density stats for their sorting + // Collect chains and calculate density stats for their sorting. std::vector SortedChains; DenseMap ChainDensity; for (ChainT &Chain : AllChains) { if (!Chain.Nodes.empty()) { SortedChains.push_back(&Chain); - // Using doubles to avoid overflow of ExecutionCounts + // Using doubles to avoid overflow of ExecutionCounts. double Size = 0; double ExecutionCount = 0; for (NodeT *Node : Chain.Nodes) { @@ -935,21 +942,22 @@ private: } } - // Sorting chains by density in the decreasing order - std::stable_sort(SortedChains.begin(), SortedChains.end(), - [&](const ChainT *L, const ChainT *R) { - // Make sure the original entry point is at the - // beginning of the order - if (L->isEntry() != R->isEntry()) - return L->isEntry(); - - const double DL = ChainDensity[L]; - const double DR = ChainDensity[R]; - // Compare by density and break ties by chain identifiers - return (DL != DR) ? (DL > DR) : (L->Id < R->Id); - }); - - // Collect the nodes in the order specified by their chains + // Sorting chains by density in the decreasing order. + std::sort(SortedChains.begin(), SortedChains.end(), + [&](const ChainT *L, const ChainT *R) { + // Place the entry point is at the beginning of the order. + if (L->isEntry() != R->isEntry()) + return L->isEntry(); + + const double DL = ChainDensity[L]; + const double DR = ChainDensity[R]; + // Compare by density and break ties by chain identifiers. + return (DL != DR) ? (DL > DR) : (L->Id < R->Id); + return std::make_tuple(-DL, L->Id) < + std::make_tuple(-DR, R->Id); + }); + + // Collect the nodes in the order specified by their chains. Order.reserve(NumNodes); for (const ChainT *Chain : SortedChains) { for (NodeT *Node : Chain->Nodes) { @@ -984,22 +992,404 @@ private: std::vector HotChains; }; +/// The implementation of the Cache-Directed Sort (CDS) algorithm for ordering +/// functions represented by a call graph. +class CDSortImpl { +public: + CDSortImpl(const CDSortConfig &Config, const std::vector &NodeSizes, + const std::vector &NodeCounts, + const std::vector &EdgeCounts, + const std::vector &EdgeOffsets) + : Config(Config), NumNodes(NodeSizes.size()) { + initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets); + } + + /// Run the algorithm and return an ordered set of function clusters. + void run(std::vector &Result) { + // Merge pairs of chains while improving the objective. + mergeChainPairs(); + + LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number" + << " of chains from " << NumNodes << " to " + << HotChains.size() << "\n"); + + // Collect nodes from all the chains. + concatChains(Result); + } + +private: + /// Initialize the algorithm's data structures. + void initialize(const std::vector &NodeSizes, + const std::vector &NodeCounts, + const std::vector &EdgeCounts, + const std::vector &EdgeOffsets) { + // Initialize nodes. + AllNodes.reserve(NumNodes); + for (uint64_t Node = 0; Node < NumNodes; Node++) { + uint64_t Size = std::max(NodeSizes[Node], 1ULL); + uint64_t ExecutionCount = NodeCounts[Node]; + AllNodes.emplace_back(Node, Size, ExecutionCount); + TotalSamples += ExecutionCount; + if (ExecutionCount > 0) + TotalSize += Size; + } + + // Initialize jumps between the nodes. + SuccNodes.resize(NumNodes); + PredNodes.resize(NumNodes); + AllJumps.reserve(EdgeCounts.size()); + for (size_t I = 0; I < EdgeCounts.size(); I++) { + auto It = EdgeCounts[I]; + uint64_t Pred = It.first.first; + uint64_t Succ = It.first.second; + // Ignore recursive calls. + if (Pred == Succ) + continue; + + SuccNodes[Pred].push_back(Succ); + PredNodes[Succ].push_back(Pred); + uint64_t ExecutionCount = It.second; + if (ExecutionCount > 0) { + NodeT &PredNode = AllNodes[Pred]; + NodeT &SuccNode = AllNodes[Succ]; + AllJumps.emplace_back(&PredNode, &SuccNode, ExecutionCount); + AllJumps.back().Offset = EdgeOffsets[I]; + SuccNode.InJumps.push_back(&AllJumps.back()); + PredNode.OutJumps.push_back(&AllJumps.back()); + } + } + + // Initialize chains. + AllChains.reserve(NumNodes); + HotChains.reserve(NumNodes); + for (NodeT &Node : AllNodes) { + // Adjust execution counts. + Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount()); + Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount()); + // Create chain. + AllChains.emplace_back(Node.Index, &Node); + Node.CurChain = &AllChains.back(); + if (Node.ExecutionCount > 0) + HotChains.push_back(&AllChains.back()); + } + + // Initialize chain edges. + AllEdges.reserve(AllJumps.size()); + for (NodeT &PredNode : AllNodes) { + for (JumpT *Jump : PredNode.OutJumps) { + NodeT *SuccNode = Jump->Target; + ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain); + // this edge is already present in the graph. + if (CurEdge != nullptr) { + assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr); + CurEdge->appendJump(Jump); + continue; + } + // this is a new edge. + AllEdges.emplace_back(Jump); + PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back()); + SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back()); + } + } + } + + /// Merge pairs of chains while there is an improvement in the objective. + void mergeChainPairs() { + // Create a priority queue containing all edges ordered by the merge gain. + auto GainComparator = [](ChainEdge *L, ChainEdge *R) { + return std::make_tuple(-L->gain(), L->srcChain()->Id, L->dstChain()->Id) < + std::make_tuple(-R->gain(), R->srcChain()->Id, R->dstChain()->Id); + }; + std::set Queue(GainComparator); + + // Insert the edges into the queue. + for (ChainT *ChainPred : HotChains) { + for (const auto &[Chain, Edge] : ChainPred->Edges) { + // Ignore self-edges. + if (Edge->isSelfEdge()) + continue; + // Ignore already processed edges. + if (Edge->gain() != -1.0) + continue; + + // Compute the gain of merging the two chains. + MergeGainT Gain = getBestMergeGain(Edge); + Edge->setMergeGain(Gain); + + if (Edge->gain() > EPS) + Queue.insert(Edge); + } + } + + // Merge the chains while the gain of merging is positive. + while (!Queue.empty()) { + // Extract the best (top) edge for merging. + ChainEdge *BestEdge = *Queue.begin(); + Queue.erase(Queue.begin()); + // Ignore self-edges. + if (BestEdge->isSelfEdge()) + continue; + // Ignore edges with non-positive gains. + if (BestEdge->gain() <= EPS) + continue; + + ChainT *BestSrcChain = BestEdge->srcChain(); + ChainT *BestDstChain = BestEdge->dstChain(); + + // Remove outdated edges from the queue. + for (const auto &[Chain, ChainEdge] : BestSrcChain->Edges) + Queue.erase(ChainEdge); + for (const auto &[Chain, ChainEdge] : BestDstChain->Edges) + Queue.erase(ChainEdge); + + // Merge the best pair of chains. + MergeGainT BestGain = BestEdge->getMergeGain(); + mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(), + BestGain.mergeType()); + + // Insert newly created edges into the queue. + for (const auto &[Chain, Edge] : BestSrcChain->Edges) { + // Ignore loop edges. + if (Edge->isSelfEdge()) + continue; + + // Compute the gain of merging the two chains. + MergeGainT Gain = getBestMergeGain(Edge); + Edge->setMergeGain(Gain); + + if (Edge->gain() > EPS) + Queue.insert(Edge); + } + } + } + + /// Compute the gain of merging two chains. + /// + /// The function considers all possible ways of merging two chains and + /// computes the one having the largest increase in ExtTSP objective. The + /// result is a pair with the first element being the gain and the second + /// element being the corresponding merging type. + MergeGainT getBestMergeGain(ChainEdge *Edge) const { + // Precompute jumps between ChainPred and ChainSucc. + auto Jumps = Edge->jumps(); + assert(!Jumps.empty() && "trying to merge chains w/o jumps"); + ChainT *SrcChain = Edge->srcChain(); + ChainT *DstChain = Edge->dstChain(); + + // This object holds the best currently chosen gain of merging two chains. + MergeGainT Gain = MergeGainT(); + + /// Given a list of merge types, try to merge two chains and update Gain + /// with a better alternative. + auto tryChainMerging = [&](const std::vector &MergeTypes) { + // Apply the merge, compute the corresponding gain, and update the best + // value, if the merge is beneficial. + for (const MergeTypeT &MergeType : MergeTypes) { + MergeGainT NewGain = + computeMergeGain(SrcChain, DstChain, Jumps, MergeType); + + // When forward and backward gains are the same, prioritize merging that + // preserves the original order of the functions in the binary. + if (std::abs(Gain.score() - NewGain.score()) < EPS) { + if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) || + (MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) { + Gain = NewGain; + } + } else if (NewGain.score() > Gain.score() + EPS) { + Gain = NewGain; + } + } + }; + + // Try to concatenate two chains w/o splitting. + tryChainMerging({MergeTypeT::X_Y, MergeTypeT::Y_X}); + + return Gain; + } + + /// Compute the score gain of merging two chains, respecting a given type. + /// + /// The two chains are not modified in the method. + MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc, + const std::vector &Jumps, + MergeTypeT MergeType) const { + // This doesn't depend on the ordering of the nodes + double FreqGain = freqBasedLocalityGain(ChainPred, ChainSucc); + + // Merge offset is always 0, as the chains are not split. + size_t MergeOffset = 0; + auto MergedBlocks = + mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType); + double DistGain = distBasedLocalityGain(MergedBlocks, Jumps); + + double GainScore = DistGain + Config.FrequencyScale * FreqGain; + // Scale the result to increase the importance of merging short chains. + if (GainScore >= 0.0) + GainScore /= std::min(ChainPred->Size, ChainSucc->Size); + + return MergeGainT(GainScore, MergeOffset, MergeType); + } + + /// Compute the change of the frequency locality after merging the chains. + double freqBasedLocalityGain(ChainT *ChainPred, ChainT *ChainSucc) const { + auto missProbability = [&](double ChainDensity) { + double PageSamples = ChainDensity * Config.CacheSize; + if (PageSamples >= TotalSamples) + return 0.0; + double P = PageSamples / TotalSamples; + return pow(1.0 - P, static_cast(Config.CacheEntries)); + }; + + // Cache misses on the chains before merging. + double CurScore = + ChainPred->ExecutionCount * missProbability(ChainPred->density()) + + ChainSucc->ExecutionCount * missProbability(ChainSucc->density()); + + // Cache misses on the merged chain + double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount; + double MergedSize = ChainPred->Size + ChainSucc->Size; + double MergedDensity = static_cast(MergedCounts) / MergedSize; + double NewScore = MergedCounts * missProbability(MergedDensity); + + return CurScore - NewScore; + } + + /// Compute the distance locality for a jump / call. + double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const { + uint64_t Dist = SrcAddr <= DstAddr ? DstAddr - SrcAddr : SrcAddr - DstAddr; + double D = Dist == 0 ? 0.1 : static_cast(Dist); + return static_cast(Count) * std::pow(D, -Config.DistancePower); + } + + /// Compute the change of the distance locality after merging the chains. + double distBasedLocalityGain(const MergedChain &MergedBlocks, + const std::vector &Jumps) const { + if (Jumps.empty()) + return 0.0; + uint64_t CurAddr = 0; + MergedBlocks.forEach([&](const NodeT *Node) { + Node->EstimatedAddr = CurAddr; + CurAddr += Node->Size; + }); + + double CurScore = 0; + double NewScore = 0; + for (const JumpT *Arc : Jumps) { + uint64_t SrcAddr = Arc->Source->EstimatedAddr + Arc->Offset; + uint64_t DstAddr = Arc->Target->EstimatedAddr; + NewScore += distScore(SrcAddr, DstAddr, Arc->ExecutionCount); + CurScore += distScore(0, TotalSize, Arc->ExecutionCount); + } + return NewScore - CurScore; + } + + /// Merge chain From into chain Into, update the list of active chains, + /// adjacency information, and the corresponding cached values. + void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset, + MergeTypeT MergeType) { + assert(Into != From && "a chain cannot be merged with itself"); + + // Merge the nodes. + MergedChain MergedNodes = + mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType); + Into->merge(From, MergedNodes.getNodes()); + + // Merge the edges. + Into->mergeEdges(From); + From->clear(); + + // Remove the chain from the list of active chains. + llvm::erase_value(HotChains, From); + } + + /// Concatenate all chains into the final order. + void concatChains(std::vector &Order) { + // Collect chains and calculate density stats for their sorting. + std::vector SortedChains; + DenseMap ChainDensity; + for (ChainT &Chain : AllChains) { + if (!Chain.Nodes.empty()) { + SortedChains.push_back(&Chain); + // Using doubles to avoid overflow of ExecutionCounts. + double Size = 0; + double ExecutionCount = 0; + for (NodeT *Node : Chain.Nodes) { + Size += static_cast(Node->Size); + ExecutionCount += static_cast(Node->ExecutionCount); + } + assert(Size > 0 && "a chain of zero size"); + ChainDensity[&Chain] = ExecutionCount / Size; + } + } + + // Sort chains by density in the decreasing order. + std::sort(SortedChains.begin(), SortedChains.end(), + [&](const ChainT *L, const ChainT *R) { + const double DL = ChainDensity[L]; + const double DR = ChainDensity[R]; + // Compare by density and break ties by chain identifiers. + return std::make_tuple(-DL, L->Id) < + std::make_tuple(-DR, R->Id); + }); + + // Collect the nodes in the order specified by their chains. + Order.reserve(NumNodes); + for (const ChainT *Chain : SortedChains) + for (NodeT *Node : Chain->Nodes) + Order.push_back(Node->Index); + } + +private: + /// Config for the algorithm. + const CDSortConfig Config; + + /// The number of nodes in the graph. + const size_t NumNodes; + + /// Successors of each node. + std::vector> SuccNodes; + + /// Predecessors of each node. + std::vector> PredNodes; + + /// All nodes (functions) in the graph. + std::vector AllNodes; + + /// All jumps (function calls) between the nodes. + std::vector AllJumps; + + /// All chains of nodes. + std::vector AllChains; + + /// All edges between the chains. + std::vector AllEdges; + + /// Active chains. The vector gets updated at runtime when chains are merged. + std::vector HotChains; + + /// The total number of samples in the graph. + uint64_t TotalSamples{0}; + + /// The total size of the nodes in the graph. + uint64_t TotalSize{0}; +}; + } // end of anonymous namespace std::vector llvm::applyExtTspLayout(const std::vector &NodeSizes, const std::vector &NodeCounts, const std::vector &EdgeCounts) { - // Verify correctness of the input data + // Verify correctness of the input data. assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input"); assert(NodeSizes.size() > 2 && "Incorrect input"); - // Apply the reordering algorithm + // Apply the reordering algorithm. ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts); std::vector Result; Alg.run(Result); - // Verify correctness of the output + // Verify correctness of the output. assert(Result.front() == 0 && "Original entry point is not preserved"); assert(Result.size() == NodeSizes.size() && "Incorrect size of layout"); return Result; @@ -1009,7 +1399,7 @@ double llvm::calcExtTspScore(const std::vector &Order, const std::vector &NodeSizes, const std::vector &NodeCounts, const std::vector &EdgeCounts) { - // Estimate addresses of the blocks in memory + // Estimate addresses of the blocks in memory. std::vector Addr(NodeSizes.size(), 0); for (size_t Idx = 1; Idx < Order.size(); Idx++) { Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]]; @@ -1020,7 +1410,7 @@ double llvm::calcExtTspScore(const std::vector &Order, OutDegree[Pred]++; } - // Increase the score for each jump + // Increase the score for each jump. double Score = 0; for (auto It : EdgeCounts) { uint64_t Pred = It.first.first; @@ -1042,3 +1432,40 @@ double llvm::calcExtTspScore(const std::vector &NodeSizes, } return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts); } + +std::vector +llvm::applyCDSLayout(const CDSortConfig &Config, + const std::vector &FuncSizes, + const std::vector &FuncCounts, + const std::vector &CallCounts, + const std::vector &CallOffsets) { + // Verify correctness of the input data. + assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input"); + + // Apply the reordering algorithm. + CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets); + std::vector Result; + Alg.run(Result); + + // Verify correctness of the output. + assert(Result.size() == FuncSizes.size() && "Incorrect size of layout"); + return Result; +} + +std::vector +llvm::applyCDSLayout(const std::vector &FuncSizes, + const std::vector &FuncCounts, + const std::vector &CallCounts, + const std::vector &CallOffsets) { + CDSortConfig Config; + // Populate the config from the command-line options. + if (CacheEntries.getNumOccurrences() > 0) + Config.CacheEntries = CacheEntries; + if (CacheSize.getNumOccurrences() > 0) + Config.CacheSize = CacheSize; + if (DistancePower.getNumOccurrences() > 0) + Config.DistancePower = DistancePower; + if (FrequencyScale.getNumOccurrences() > 0) + Config.FrequencyScale = FrequencyScale; + return applyCDSLayout(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets); +} -- Gitee From 8ec371c63a5f502e7e6f5b929470a75dc9966428 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Tue, 13 Jun 2023 10:08:00 -0700 Subject: [PATCH 06/94] [Backport][BOLT] A new code layout algorithm for function reordering [3b/3] This is a new algorithm for function layout (reordering) based on the call graph extracted from a profile data; see diffs down the stack for more details. This layout is very similar to the existing hfsort+, but perhaps a little better on some benchmarks. The goals of the change is as follows: (i) rename and replace hfsort+ with a newer (hopefully better) implementation. I'd prefer to keep both algs together for some time to simplify evaluation and transition, but do want to remove hfsort+ once we're confident that there are no regressions. (ii) unify the implementation of code layout algorithms across LLVM. Currently Passes/HfsortPlus.cpp and Utils/CodeLayout.cpp share many implementation-specific details; this diff unifies the code. Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D153039 --- bolt/include/bolt/Passes/ReorderFunctions.h | 1 + bolt/lib/Passes/ReorderFunctions.cpp | 33 +++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/bolt/include/bolt/Passes/ReorderFunctions.h b/bolt/include/bolt/Passes/ReorderFunctions.h index 52156a600791..27094bee771a 100644 --- a/bolt/include/bolt/Passes/ReorderFunctions.h +++ b/bolt/include/bolt/Passes/ReorderFunctions.h @@ -32,6 +32,7 @@ public: RT_EXEC_COUNT, RT_HFSORT, RT_HFSORT_PLUS, + RT_CDS, RT_PETTIS_HANSEN, RT_RANDOM, RT_USER diff --git a/bolt/lib/Passes/ReorderFunctions.cpp b/bolt/lib/Passes/ReorderFunctions.cpp index 58e50a07a499..4830d11f089c 100644 --- a/bolt/lib/Passes/ReorderFunctions.cpp +++ b/bolt/lib/Passes/ReorderFunctions.cpp @@ -15,6 +15,7 @@ #include "bolt/Utils/Utils.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/CodeLayout.h" #include #define DEBUG_TYPE "hfsort" @@ -41,6 +42,8 @@ cl::opt ReorderFunctions( "use hfsort algorithm"), clEnumValN(bolt::ReorderFunctions::RT_HFSORT_PLUS, "hfsort+", "use hfsort+ algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_CDS, "cds", + "use cache-directed sort"), clEnumValN(bolt::ReorderFunctions::RT_PETTIS_HANSEN, "pettis-hansen", "use Pettis-Hansen algorithm"), clEnumValN(bolt::ReorderFunctions::RT_RANDOM, "random", @@ -309,6 +312,36 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) { case RT_HFSORT_PLUS: Clusters = hfsortPlus(Cg); break; + case RT_CDS: { + // It is required that the sum of incoming arc weights is not greater + // than the number of samples for every function. Ensuring the call graph + // obeys the property before running the algorithm. + Cg.adjustArcWeights(); + + // Initialize CFG nodes and their data + std::vector FuncSizes; + std::vector FuncCounts; + using JumpT = std::pair; + std::vector> CallCounts; + std::vector CallOffsets; + for (NodeId F = 0; F < Cg.numNodes(); ++F) { + FuncSizes.push_back(Cg.size(F)); + FuncCounts.push_back(Cg.samples(F)); + for (NodeId Succ : Cg.successors(F)) { + const Arc &Arc = *Cg.findArc(F, Succ); + auto It = std::make_pair(F, Succ); + CallCounts.push_back(std::make_pair(It, Arc.weight())); + CallOffsets.push_back(uint64_t(Arc.avgCallOffset())); + } + } + + // Run the layout algorithm. + std::vector Result = + applyCDSLayout(FuncSizes, FuncCounts, CallCounts, CallOffsets); + + // Create a single cluster from the computed order of hot functions. + Clusters.emplace_back(Cluster(Result, Cg)); + } break; case RT_PETTIS_HANSEN: Clusters = pettisAndHansen(Cg); break; -- Gitee From 309342fc02efdf7284a593af01424f387011b4dd Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Mon, 31 Jul 2023 13:46:29 -0700 Subject: [PATCH 07/94] [Backport][BOLT][YAML] Only read first profile per function Work around the issue of multiple profiles per function. Can happen with a stale profile which has separate profiles that in a new binary got merged and became aliases. Reviewed By: #bolt, maksfb Differential Revision: https://reviews.llvm.org/D156644 --- bolt/lib/Profile/YAMLProfileReader.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index 7b5a751025b4..3fd489b570d6 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -352,8 +352,10 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) { continue; yaml::bolt::BinaryFunctionProfile &YamlBF = *PI->getValue(); - if (profileMatches(YamlBF, Function)) + if (profileMatches(YamlBF, Function)) { matchProfileToFunction(YamlBF, Function); + break; + } } } -- Gitee From 79d92e44ae426b3e6329643df2a133cc17d70868 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Mon, 31 Jul 2023 13:48:28 -0700 Subject: [PATCH 08/94] [Backport][BOLT] Fix instrumenting conditional tail calls We identify instructions to be instrumented based on Offset annotation. BOLT "expands" conditional tail calls into a conditional jump to a basic block with unconditional tail call. Move Offset annotation from former CTC to the tail call. For expanded CTC we keep Offset attached to the original instruction which is converted into a regular conditional jump, while leaving the newly created tail call without an Offset annotation. This leads to attempting the instrumentation of the conditional jump which points to the basic block with an inherited input offset thus creating an invalid edge description. At the same time, the newly created tail call is skipped entirely which means we're not creating a call description for it. If we instead reassign Offset annotation from the conditional jump to the tail call we fix both issues. The conditional jump will be skipped not creating an invalid edge description, while tail call will be handled properly (unformly with regular calls). Reviewed By: #bolt, maksfb Differential Revision: https://reviews.llvm.org/D156389 --- bolt/lib/Core/BinaryFunction.cpp | 6 ++++++ bolt/test/runtime/X86/instrumentation-tail-call.s | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 3f6e74c5f774..f6146d9749f5 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -2305,6 +2305,12 @@ void BinaryFunction::removeConditionalTailCalls() { // This branch is no longer a conditional tail call. BC.MIB->unsetConditionalTailCall(*CTCInstr); + + // Move offset from CTCInstr to TailCallInstr. + if (std::optional Offset = BC.MIB->getOffset(*CTCInstr)) { + BC.MIB->setOffset(TailCallInstr, *Offset); + BC.MIB->clearOffset(*CTCInstr); + } } insertBasicBlocks(std::prev(end()), std::move(NewBlocks), diff --git a/bolt/test/runtime/X86/instrumentation-tail-call.s b/bolt/test/runtime/X86/instrumentation-tail-call.s index 792d084e3f3d..dfb12f03401a 100644 --- a/bolt/test/runtime/X86/instrumentation-tail-call.s +++ b/bolt/test/runtime/X86/instrumentation-tail-call.s @@ -14,6 +14,9 @@ # CHECK: leaq 0x80(%rsp), %rsp +# RUN: FileCheck %s --input-file %t.fdata --check-prefix=CHECK-FDATA +# CHECK-FDATA: 1 main {{.*}} 1 targetFunc 0 0 1 + .text .globl main .type main, %function @@ -32,7 +35,8 @@ main: movq %rbp, %rsp pop %rbp mov -0x10(%rsp),%rax - jmp targetFunc + test %rsp, %rsp + jne targetFunc .LBBerror: addq $0x20, %rsp -- Gitee From e759f3abe3469aced8a3a723d3cd41968e77d280 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Mon, 31 Jul 2023 16:16:32 -0700 Subject: [PATCH 09/94] [Backport][BOLT][test] Add missing stderr redirections BOLT-ERROR and BOLT-WARNING messages are output to stderr which is not captured by piping to FileCheck. Redirect stderr to stdout to fix that in tests. Reviewed By: #bolt, maksfb Differential Revision: https://reviews.llvm.org/D156340 --- bolt/test/X86/issue26.s | 2 +- bolt/test/X86/issue26.test | 2 +- bolt/test/runtime/X86/exceptions-instrumentation.test | 2 +- bolt/test/runtime/meta-merge-fdata.test | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bolt/test/X86/issue26.s b/bolt/test/X86/issue26.s index a6e38b6e4cef..6f9bc72d6e10 100644 --- a/bolt/test/X86/issue26.s +++ b/bolt/test/X86/issue26.s @@ -7,7 +7,7 @@ # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \ # RUN: %s -o %t.o # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe --relocs --print-cfg -o %t.out \ +# RUN: llvm-bolt %t.exe --relocs --print-cfg -o %t.out 2>&1 \ # RUN: | FileCheck %s # CHECK-NOT: BOLT-WARNING: CFG invalid in XYZ @ .LBB0 diff --git a/bolt/test/X86/issue26.test b/bolt/test/X86/issue26.test index 5bf25e6a59ba..bafd0912cf4a 100644 --- a/bolt/test/X86/issue26.test +++ b/bolt/test/X86/issue26.test @@ -1,7 +1,7 @@ # This reproduces issue 26 from our github repo # RUN: yaml2obj %p/Inputs/issue26.yaml &> %t.exe -# RUN: llvm-bolt %t.exe --relocs --print-cfg -o %t.out \ +# RUN: llvm-bolt %t.exe --relocs --print-cfg -o %t.out 2>&1 \ # RUN: | FileCheck %s CHECK-NOT: BOLT-WARNING: CFG invalid in XYZ @ .LBB0 diff --git a/bolt/test/runtime/X86/exceptions-instrumentation.test b/bolt/test/runtime/X86/exceptions-instrumentation.test index 7a8f4ee81e4f..4b8b3bee1fdb 100644 --- a/bolt/test/runtime/X86/exceptions-instrumentation.test +++ b/bolt/test/runtime/X86/exceptions-instrumentation.test @@ -9,7 +9,7 @@ RUN: %t.exc arg1 arg2 arg3 RUN: llvm-bolt %t_exc_split -o %t.exc.bolted --data %t.fdata \ RUN: --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \ -RUN: --split-functions --split-eh=1 \ +RUN: --split-functions --split-eh=1 2>&1 \ RUN: | FileCheck --check-prefix=EXCEPTIONS %s EXCEPTIONS-NOT: invalid (possibly stale) profile diff --git a/bolt/test/runtime/meta-merge-fdata.test b/bolt/test/runtime/meta-merge-fdata.test index 39f34ba3d8ac..5592e65b0928 100644 --- a/bolt/test/runtime/meta-merge-fdata.test +++ b/bolt/test/runtime/meta-merge-fdata.test @@ -22,7 +22,7 @@ CHECK-FDATA: 0 [unknown] 0 1 _start 0 0 1 # Check that BOLT works with this profile RUN: llvm-bolt merge-fdata -o %t.bolt --data %t.fdata1 \ RUN: --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \ -RUN: --split-functions \ +RUN: --split-functions 2>&1 \ RUN: | FileCheck %s --check-prefix=CHECK-BOLT1 CHECK-BOLT1-NOT: invalid (possibly stale) profile @@ -44,7 +44,7 @@ RUN: cmp %t.fdata.base %t.fdata.inst # Optimize using merged fdata RUN: llvm-bolt merge-fdata -o %t.opt --data %t.fdata.base \ RUN: --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \ -RUN: --split-functions \ +RUN: --split-functions 2>&1 \ RUN: | FileCheck %s --check-prefix=CHECK-BOLT2 CHECK-BOLT2-NOT: invalid (possibly stale) profile -- Gitee From 10efed52939da87fa30d1766a05ed76f121c206e Mon Sep 17 00:00:00 2001 From: chenpeihao3 Date: Fri, 4 Aug 2023 09:45:12 +0800 Subject: [PATCH 10/94] [Backport][BOLT] fix the endless loop of --iterative-guess Solve the endless loop caused by iterative guess. The main function of this option is guessEdgeByIterativeApproach, where the do while loop involves guessPredEdgeCounts and guessSuccessEdgeCounts. In some scenarios, the do while loop will fall into an endless loop. The reason is that although the GuessedPredEdgeCounts function has guessed the pred-edges counts, GuessedArcs does not insert the corresponding BB block, resulting in the changed variable always being true. Reviewed By: rafauler Differential Revision: https://reviews.llvm.org/D154922 --- bolt/lib/Passes/MCF.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/bolt/lib/Passes/MCF.cpp b/bolt/lib/Passes/MCF.cpp index ec040120a919..c3898d2dce98 100644 --- a/bolt/lib/Passes/MCF.cpp +++ b/bolt/lib/Passes/MCF.cpp @@ -262,6 +262,7 @@ bool guessPredEdgeCounts(BinaryBasicBlock *BB, ArcSet &GuessedArcs) { continue; Pred->getBranchInfo(*BB).Count = Guessed; + GuessedArcs.insert(std::make_pair(Pred, BB)); return true; } llvm_unreachable("Expected unguessed arc"); -- Gitee From efd86990aca6732e67c1918aa8ff0d1eda2cf51d Mon Sep 17 00:00:00 2001 From: spupyrev Date: Wed, 14 Jun 2023 13:04:50 -0700 Subject: [PATCH 11/94] [Backport][BOLT] Fix sorting functions by execution count I noticed that `-reorder-functions=exec-count` doesn't work as expected due to a bug in the comparison function (which isn't symmetric). It is questionable whether anyone would want to ever use the sorting method (as sorting by say density is much better in all cases) but it is probably better to fix the bug. Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D152959 --- bolt/lib/Passes/ReorderFunctions.cpp | 44 +++++++----- bolt/test/X86/bug-function-layout-execount.s | 71 ++++++++++++++++++++ 2 files changed, 98 insertions(+), 17 deletions(-) create mode 100644 bolt/test/X86/bug-function-layout-execount.s diff --git a/bolt/lib/Passes/ReorderFunctions.cpp b/bolt/lib/Passes/ReorderFunctions.cpp index 4830d11f089c..28ca68fa5a57 100644 --- a/bolt/lib/Passes/ReorderFunctions.cpp +++ b/bolt/lib/Passes/ReorderFunctions.cpp @@ -284,27 +284,37 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) { break; case RT_EXEC_COUNT: { std::vector SortedFunctions(BFs.size()); - uint32_t Index = 0; llvm::transform(llvm::make_second_range(BFs), SortedFunctions.begin(), [](BinaryFunction &BF) { return &BF; }); - llvm::stable_sort(SortedFunctions, [&](const BinaryFunction *A, - const BinaryFunction *B) { - if (A->isIgnored()) - return false; - const size_t PadA = opts::padFunction(*A); - const size_t PadB = opts::padFunction(*B); - if (!PadA || !PadB) { - if (PadA) - return true; - if (PadB) - return false; - } - return !A->hasProfile() && (B->hasProfile() || (A->getExecutionCount() > - B->getExecutionCount())); - }); + llvm::stable_sort(SortedFunctions, + [&](const BinaryFunction *A, const BinaryFunction *B) { + if (A->isIgnored()) + return false; + if (B->isIgnored()) + return true; + const size_t PadA = opts::padFunction(*A); + const size_t PadB = opts::padFunction(*B); + if (!PadA || !PadB) { + if (PadA) + return true; + if (PadB) + return false; + } + if (!A->hasProfile()) + return false; + if (!B->hasProfile()) + return true; + return A->getExecutionCount() > B->getExecutionCount(); + }); + uint32_t Index = 0; for (BinaryFunction *BF : SortedFunctions) - if (BF->hasProfile()) + if (BF->hasProfile()) { BF->setIndex(Index++); + LLVM_DEBUG(if (opts::Verbosity > 1) { + dbgs() << "BOLT-INFO: hot func " << BF->getPrintName() << " (" + << BF->getExecutionCount() << ")\n"; + }); + } } break; case RT_HFSORT: Clusters = clusterize(Cg); diff --git a/bolt/test/X86/bug-function-layout-execount.s b/bolt/test/X86/bug-function-layout-execount.s new file mode 100644 index 000000000000..2d80f0916524 --- /dev/null +++ b/bolt/test/X86/bug-function-layout-execount.s @@ -0,0 +1,71 @@ +# Verifies that llvm-bolt correctly sorts functions by their execution counts. + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe --data %t.fdata --lite --reorder-functions=exec-count \ +# RUN: -v=2 --debug-only=hfsort -o /dev/null 2>&1 | FileCheck %s + +# CHECK: Starting pass: reorder-functions +# CHECK-NEXT: hot func func2 (1500) +# CHECK-NEXT: hot func func1 (500) +# CHECK-NEXT: hot func main (400) +# CHECK-NEXT: hot func func5 (110) +# CHECK-NEXT: hot func func3 (100) +# CHECK-NEXT: hot func func4 (99) + + .text + .globl main + .type main, %function +main: +# FDATA: 0 [unknown] 0 1 main 0 1 400 + .cfi_startproc + call func1 + retq + .size _start, .-_start + .cfi_endproc + + .globl func1 + .type func1,@function +func1: +# FDATA: 0 [unknown] 0 1 func1 0 1 500 + .cfi_startproc + retq + .size func1, .-func1 + .cfi_endproc + + .globl func2 + .type func2,@function +func2: +# FDATA: 0 [unknown] 0 1 func2 0 1 1500 + .cfi_startproc + retq + .size func2, .-func2 + .cfi_endproc + + .globl func3 + .type func3,@function +func3: +# FDATA: 0 [unknown] 0 1 func3 0 1 100 + .cfi_startproc + retq + .size func3, .-func3 + .cfi_endproc + + .globl func4 + .type func4,@function +func4: +# FDATA: 0 [unknown] 0 1 func4 0 1 99 + .cfi_startproc + retq + .size func4, .-func4 + .cfi_endproc + + .globl func5 + .type func5,@function +func5: +# FDATA: 0 [unknown] 0 1 func5 0 1 110 + .cfi_startproc + retq + .size func5, .-func5 + .cfi_endproc -- Gitee From cd6d9b9dd7488c940aa660ef1992dc3a587f7b8e Mon Sep 17 00:00:00 2001 From: hezuoqiang Date: Fri, 18 Aug 2023 16:42:52 +0800 Subject: [PATCH 12/94] [Backport][BOLT] Consider Code Fragments during regreassign During register swapping, the code fragments associated with the function need to be swapped together (which may be generated during PGO optimization). Fix https://github.com/llvm/llvm-project/issues/59730 Reviewed By: rafauler Differential Revision: https://reviews.llvm.org/D141931 --- bolt/include/bolt/Core/BinaryFunction.h | 14 +++- bolt/lib/Passes/RegReAssign.cpp | 38 ++++++++++- .../test/runtime/X86/reg-reassign-swap-cold.s | 64 +++++++++++++++++++ 3 files changed, 112 insertions(+), 4 deletions(-) create mode 100644 bolt/test/runtime/X86/reg-reassign-swap-cold.s diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index bbb5f99556d9..359ecec905ee 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -366,14 +366,15 @@ private: std::string ColdCodeSectionName; /// Parent function fragment for split function fragments. - SmallPtrSet ParentFragments; + using FragmentsSetTy = SmallPtrSet; + FragmentsSetTy ParentFragments; /// Indicate if the function body was folded into another function. /// Used by ICF optimization. BinaryFunction *FoldedIntoFunction{nullptr}; /// All fragments for a parent function. - SmallPtrSet Fragments; + FragmentsSetTy Fragments; /// The profile data for the number of times the function was executed. uint64_t ExecutionCount{COUNT_NO_PROFILE}; @@ -1779,6 +1780,15 @@ public: return llvm::is_contained(Fragments, &Other); } + /// Return the child fragment form parent function + iterator_range getFragments() const { + return iterator_range(Fragments.begin(), + Fragments.end()); + } + + /// Return the parent function for split function fragments. + FragmentsSetTy *getParentFragments() { return &ParentFragments; } + /// Returns if this function is a parent or child of \p Other function. bool isParentOrChildOf(const BinaryFunction &Other) const { return isChildOf(Other) || isParentOf(Other); diff --git a/bolt/lib/Passes/RegReAssign.cpp b/bolt/lib/Passes/RegReAssign.cpp index 19e1a84c48d1..0efd27f0910b 100644 --- a/bolt/lib/Passes/RegReAssign.cpp +++ b/bolt/lib/Passes/RegReAssign.cpp @@ -140,7 +140,7 @@ void RegReAssign::rankRegisters(BinaryFunction &Function) { std::fill(RegScore.begin(), RegScore.end(), 0); std::fill(RankedRegs.begin(), RankedRegs.end(), 0); - for (BinaryBasicBlock &BB : Function) { + auto countRegScore = [&](BinaryBasicBlock &BB) { for (MCInst &Inst : BB) { const bool CannotUseREX = BC.MIB->cannotUseREX(Inst); const MCInstrDesc &Desc = BC.MII->get(Inst.getOpcode()); @@ -191,7 +191,15 @@ void RegReAssign::rankRegisters(BinaryFunction &Function) { RegScore[RegEC] += BB.getKnownExecutionCount(); } } + }; + for (BinaryBasicBlock &BB : Function) + countRegScore(BB); + + for (BinaryFunction *ChildFrag : Function.getFragments()) { + for (BinaryBasicBlock &BB : *ChildFrag) + countRegScore(BB); } + std::iota(RankedRegs.begin(), RankedRegs.end(), 0); // 0, 1, 2, 3... llvm::sort(RankedRegs, [&](size_t A, size_t B) { return RegScore[A] > RegScore[B]; }); @@ -213,6 +221,17 @@ void RegReAssign::aggressivePassOverFunction(BinaryFunction &Function) { BinaryContext &BC = Function.getBinaryContext(); rankRegisters(Function); + // If there is a situation where function: + // A() -> A.cold() + // A.localalias() -> A.cold() + // simply swapping these two calls can cause issues. + for (BinaryFunction *ChildFrag : Function.getFragments()) { + if (ChildFrag->getParentFragments()->size() > 1) + return; + if (ChildFrag->empty()) + return; + } + // Bail early if our registers are all black listed, before running expensive // analysis passes bool Bail = true; @@ -304,6 +323,10 @@ void RegReAssign::aggressivePassOverFunction(BinaryFunction &Function) { << " with " << BC.MRI->getName(ExtReg) << "\n\n"); swap(Function, ClassicReg, ExtReg); FuncsChanged.insert(&Function); + for (BinaryFunction *ChildFrag : Function.getFragments()) { + swap(*ChildFrag, ClassicReg, ExtReg); + FuncsChanged.insert(ChildFrag); + } ++Begin; if (Begin == End) break; @@ -315,6 +338,13 @@ bool RegReAssign::conservativePassOverFunction(BinaryFunction &Function) { BinaryContext &BC = Function.getBinaryContext(); rankRegisters(Function); + for (BinaryFunction *ChildFrag : Function.getFragments()) { + if (ChildFrag->getParentFragments()->size() > 1) + return false; + if (ChildFrag->empty()) + return false; + } + // Try swapping R12, R13, R14 or R15 with RBX (we work with all callee-saved // regs except RBP) MCPhysReg Candidate = 0; @@ -345,6 +375,10 @@ bool RegReAssign::conservativePassOverFunction(BinaryFunction &Function) { (void)BC; swap(Function, RBX, Candidate); FuncsChanged.insert(&Function); + for (BinaryFunction *ChildFrag : Function.getFragments()) { + swap(*ChildFrag, RBX, Candidate); + FuncsChanged.insert(ChildFrag); + } return true; } @@ -404,7 +438,7 @@ void RegReAssign::runOnFunctions(BinaryContext &BC) { for (auto &I : BC.getBinaryFunctions()) { BinaryFunction &Function = I.second; - if (!Function.isSimple() || Function.isIgnored()) + if (!Function.isSimple() || Function.isIgnored() || Function.isFragment()) continue; LLVM_DEBUG(dbgs() << "====================================\n"); diff --git a/bolt/test/runtime/X86/reg-reassign-swap-cold.s b/bolt/test/runtime/X86/reg-reassign-swap-cold.s new file mode 100644 index 000000000000..115b5b0eeff8 --- /dev/null +++ b/bolt/test/runtime/X86/reg-reassign-swap-cold.s @@ -0,0 +1,64 @@ +# This test case reproduces a bug where, during register swapping, +# the code fragments associated with the function need to be swapped +# together (which may be generated during PGO optimization). If not +# handled properly, optimized binary execution can result in a segmentation fault. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: %clang -no-pie %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.out -data=%t.fdata --reg-reassign | FileCheck %s +# RUN: %t.out + +# CHECK: BOLT-INFO: Reg Reassignment Pass Stats +# CHECK-NEXT: 2 functions affected. + .text + .globl main + .globl main.cold + .p2align 4, 0x90 + .type main,@function + .type main.cold,@function +main.cold: +bb1: + cmp $0x3, %r12 + jne bb8 +bb2: + jmp bb4 +main: # @main + .cfi_startproc +# %bb.0: # %entry + pushq %rax + pushq %r12 + pushq %rbx + .cfi_def_cfa_offset 16 + mov $0x1, %r12 + mov $0x2, %rbx + add $0x1, %r12 + shr $0x14, %r12 + mov $0x3, %r12 +bb3: + jmp bb1 +bb4: + cmp $0x3, %r12 +bb5: + jne bb8 +bb6: + xorl %eax, %eax +bb7: + popq %rcx + popq %rbx + popq %r12 + .cfi_def_cfa_offset 8 + retq +bb8: + mov $0x1, %rax + jmp bb7 +# FDATA: 1 main.cold #bb2# 1 main #bb4# 0 100 +# FDATA: 1 main #bb5# 1 main #bb6# 0 100 +# FDATA: 1 main #bb3# 1 main.cold 0 0 100 + +.Lfunc_end0: + .size main, .Lfunc_end0-main + .cfi_endproc -- Gitee From 33cbe283e5c5502467fef7cf6c95a2a965e739c1 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Fri, 18 Aug 2023 13:59:58 +0200 Subject: [PATCH 13/94] [Backport]bolt/test/X86/bug-function-layout-execount.s: Require x86 and asserts Follow-up to D152959: --debug-only= requires an asserts build. The test also needs the x86 target. --- bolt/test/X86/bug-function-layout-execount.s | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bolt/test/X86/bug-function-layout-execount.s b/bolt/test/X86/bug-function-layout-execount.s index 2d80f0916524..540b6790d01e 100644 --- a/bolt/test/X86/bug-function-layout-execount.s +++ b/bolt/test/X86/bug-function-layout-execount.s @@ -1,5 +1,7 @@ # Verifies that llvm-bolt correctly sorts functions by their execution counts. +# REQUIRES: x86_64-linux, asserts + # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o # RUN: link_fdata %s %t.o %t.fdata # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -- Gitee From cf59ecf9274d0fbd1b4ce8e58d90a9164f475dea Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Mon, 21 Aug 2023 10:10:48 +0200 Subject: [PATCH 14/94] [Backport][BOLT] Calculate input to output address map using BOLTLinker BOLT uses MCAsmLayout to calculate the output values of basic blocks. This means output values are calculated based on a pre-linking state and any changes to symbol values during linking will cause incorrect values to be used. This issue was first addressed in D154604 by adding all basic block symbols to the symbol table for the linker to resolve them. However, the runtime overhead of handling this huge symbol table turned out to be prohibitively large. This patch solves the issue in a different way. First, a temporary section containing [input address, output symbol] pairs is emitted to the intermediary object file. The linker will resolve all these references so we end up with a section of [input address, output address] pairs. This section is then parsed and used to: - Replace BinaryBasicBlock::OffsetTranslationTable - Replace BinaryFunction::InputOffsetToAddressMap - Update BinaryBasicBlock::OutputAddressRange Note that the reason this is more performant than the previous attempt is that these symbol references do not cause entries to be added to the symbol table. Instead, section-relative references are used for the relocations. Reviewed By: maksfb Differential Revision: https://reviews.llvm.org/D155604 --- bolt/include/bolt/Core/AddressMap.h | 59 +++++++++++++++++++ bolt/include/bolt/Core/BinaryBasicBlock.h | 26 +-------- bolt/include/bolt/Core/BinaryContext.h | 10 ++++ bolt/include/bolt/Core/BinaryFunction.h | 15 ++--- bolt/include/bolt/Core/BinarySection.h | 4 ++ bolt/lib/Core/AddressMap.cpp | 63 +++++++++++++++++++++ bolt/lib/Core/BinaryBasicBlock.cpp | 22 ------- bolt/lib/Core/BinaryEmitter.cpp | 4 ++ bolt/lib/Core/BinaryFunction.cpp | 19 ++++--- bolt/lib/Core/CMakeLists.txt | 1 + bolt/lib/Profile/BoltAddressTranslation.cpp | 11 +++- bolt/lib/Rewrite/PseudoProbeRewriter.cpp | 4 +- bolt/lib/Rewrite/RewriteInstance.cpp | 15 +++++ 13 files changed, 183 insertions(+), 70 deletions(-) create mode 100644 bolt/include/bolt/Core/AddressMap.h create mode 100644 bolt/lib/Core/AddressMap.cpp diff --git a/bolt/include/bolt/Core/AddressMap.h b/bolt/include/bolt/Core/AddressMap.h new file mode 100644 index 000000000000..16c2727b6943 --- /dev/null +++ b/bolt/include/bolt/Core/AddressMap.h @@ -0,0 +1,59 @@ +//===- bolt/Core/AddressMap.h - Input-output address map --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Helper class to create a mapping from input to output addresses needed for +// updating debugging symbols and BAT. We emit an MCSection containing +// pairs to the object file and JITLink will +// transform this in pairs. The linker output +// can then be parsed and used to establish the mapping. +// +//===----------------------------------------------------------------------===// +// +#ifndef BOLT_CORE_ADDRESS_MAP_H +#define BOLT_CORE_ADDRESS_MAP_H + +#include "llvm/ADT/StringRef.h" + +#include +#include + +namespace llvm { + +class MCStreamer; + +namespace bolt { + +class BinaryContext; + +class AddressMap { + using MapTy = std::unordered_multimap; + MapTy Map; + +public: + static const char *const SectionName; + + static void emit(MCStreamer &Streamer, BinaryContext &BC); + static AddressMap parse(StringRef Buffer, const BinaryContext &BC); + + std::optional lookup(uint64_t InputAddress) const { + auto It = Map.find(InputAddress); + if (It != Map.end()) + return It->second; + return std::nullopt; + } + + std::pair + lookupAll(uint64_t InputAddress) const { + return Map.equal_range(InputAddress); + } +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/include/bolt/Core/BinaryBasicBlock.h b/bolt/include/bolt/Core/BinaryBasicBlock.h index 02be9c1d4f11..bc95e2c4de3a 100644 --- a/bolt/include/bolt/Core/BinaryBasicBlock.h +++ b/bolt/include/bolt/Core/BinaryBasicBlock.h @@ -100,16 +100,6 @@ private: using LocSymsTy = std::vector>; std::unique_ptr LocSyms; - /// After output/codegen, map output offsets of instructions in this basic - /// block to instruction offsets in the original function. Note that the - /// output basic block could be different from the input basic block. - /// We only map instruction of interest, such as calls and markers. - /// - /// We store the offset array in a basic block to facilitate BAT tables - /// generation. Otherwise, the mapping could be done at function level. - using OffsetTranslationTableTy = std::vector>; - std::unique_ptr OffsetTranslationTable; - /// Alignment requirements for the block. uint32_t Alignment{1}; @@ -828,8 +818,7 @@ public: return OutputAddressRange; } - /// Update addresses of special instructions inside this basic block. - void updateOutputValues(const MCAsmLayout &Layout); + bool hasLocSyms() const { return LocSyms != nullptr; } /// Return mapping of input offsets to symbols in the output. LocSymsTy &getLocSyms() { @@ -841,19 +830,6 @@ public: return const_cast(this)->getLocSyms(); } - /// Return offset translation table for the basic block. - OffsetTranslationTableTy &getOffsetTranslationTable() { - return OffsetTranslationTable - ? *OffsetTranslationTable - : *(OffsetTranslationTable = - std::make_unique()); - } - - /// Return offset translation table for the basic block. - const OffsetTranslationTableTy &getOffsetTranslationTable() const { - return const_cast(this)->getOffsetTranslationTable(); - } - /// Return size of the basic block in the output binary. uint64_t getOutputSize() const { return OutputAddressRange.second - OutputAddressRange.first; diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 7bbfb2fc33ee..ef57ff3541dc 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -13,6 +13,7 @@ #ifndef BOLT_CORE_BINARY_CONTEXT_H #define BOLT_CORE_BINARY_CONTEXT_H +#include "bolt/Core/AddressMap.h" #include "bolt/Core/BinaryData.h" #include "bolt/Core/BinarySection.h" #include "bolt/Core/DebugData.h" @@ -221,6 +222,9 @@ class BinaryContext { bool ContainsDwarf5{false}; bool ContainsDwarfLegacy{false}; + /// Mapping from input to output addresses. + std::optional IOAddressMap; + /// Preprocess DWO debug information. void preprocessDWODebugInfo(); @@ -1343,6 +1347,12 @@ public: /* DWARFMustBeAtTheEnd */ false)); return Streamer; } + + void setIOAddressMap(AddressMap Map) { IOAddressMap = std::move(Map); } + const AddressMap &getIOAddressMap() const { + assert(IOAddressMap && "Address map not set yet"); + return *IOAddressMap; + } }; template > diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 359ecec905ee..9b45467a6a8f 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -577,9 +577,6 @@ private: /// Count the number of functions created. static uint64_t Count; - /// Map offsets of special instructions to addresses in the output. - InputOffsetToAddressMapTy InputOffsetToAddressMap; - /// Register alternative function name. void addAlternativeName(std::string NewName) { Aliases.push_back(std::move(NewName)); @@ -1226,13 +1223,6 @@ public: /// Update output values of the function based on the final \p Layout. void updateOutputValues(const MCAsmLayout &Layout); - /// Return mapping of input to output addresses. Most users should call - /// translateInputToOutputAddress() for address translation. - InputOffsetToAddressMapTy &getInputOffsetToAddressMap() { - assert(isEmitted() && "cannot use address mapping before code emission"); - return InputOffsetToAddressMap; - } - /// Register relocation type \p RelType at a given \p Address in the function /// against \p Symbol. /// Assert if the \p Address is not inside this function. @@ -2180,6 +2170,11 @@ public: /// its code emission. bool requiresAddressTranslation() const; + /// Return true if the linker needs to generate an address map for this + /// function. Used for keeping track of the mapping from input to out + /// addresses of basic blocks. + bool requiresAddressMap() const; + /// Adjust branch instructions to match the CFG. /// /// As it comes to internal branches, the CFG represents "the ultimate source diff --git a/bolt/include/bolt/Core/BinarySection.h b/bolt/include/bolt/Core/BinarySection.h index f1041777926f..326d088d1f04 100644 --- a/bolt/include/bolt/Core/BinarySection.h +++ b/bolt/include/bolt/Core/BinarySection.h @@ -97,6 +97,8 @@ class BinarySection { mutable bool IsReordered{false}; // Have the contents been reordered? bool IsAnonymous{false}; // True if the name should not be included // in the output file. + bool IsLinkOnly{false}; // True if the section should not be included + // in the output file. uint64_t hash(const BinaryData &BD, std::map &Cache) const; @@ -452,6 +454,8 @@ public: void setIndex(uint32_t I) { Index = I; } void setOutputName(const Twine &Name) { OutputName = Name.str(); } void setAnonymous(bool Flag) { IsAnonymous = Flag; } + bool isLinkOnly() const { return IsLinkOnly; } + void setLinkOnly() { IsLinkOnly = true; } /// Emit the section as data, possibly with relocations. /// Use name \p SectionName for the section during the emission. diff --git a/bolt/lib/Core/AddressMap.cpp b/bolt/lib/Core/AddressMap.cpp new file mode 100644 index 000000000000..76c5378a3eb1 --- /dev/null +++ b/bolt/lib/Core/AddressMap.cpp @@ -0,0 +1,63 @@ +#include "bolt/Core/AddressMap.h" +#include "bolt/Core/BinaryContext.h" +#include "bolt/Core/BinaryFunction.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/DataExtractor.h" + +namespace llvm { +namespace bolt { + +const char *const AddressMap::SectionName = ".bolt.address_map"; + +static void emitLabel(MCStreamer &Streamer, uint64_t InputAddress, + const MCSymbol *OutputLabel) { + Streamer.emitIntValue(InputAddress, 8); + Streamer.emitSymbolValue(OutputLabel, 8); +} + +void AddressMap::emit(MCStreamer &Streamer, BinaryContext &BC) { + Streamer.switchSection(BC.getDataSection(SectionName)); + + for (const auto &[BFAddress, BF] : BC.getBinaryFunctions()) { + if (!BF.requiresAddressMap()) + continue; + + for (const auto &BB : BF) { + if (!BB.getLabel()->isDefined()) + continue; + + emitLabel(Streamer, BFAddress + BB.getInputAddressRange().first, + BB.getLabel()); + + if (!BB.hasLocSyms()) + continue; + + for (auto [Offset, Symbol] : BB.getLocSyms()) + emitLabel(Streamer, BFAddress + Offset, Symbol); + } + } +} + +AddressMap AddressMap::parse(StringRef Buffer, const BinaryContext &BC) { + const auto EntrySize = 2 * BC.AsmInfo->getCodePointerSize(); + assert(Buffer.size() % EntrySize == 0 && "Unexpected address map size"); + + DataExtractor DE(Buffer, BC.AsmInfo->isLittleEndian(), + BC.AsmInfo->getCodePointerSize()); + DataExtractor::Cursor Cursor(0); + + AddressMap Parsed; + Parsed.Map.reserve(Buffer.size() / EntrySize); + + while (Cursor && !DE.eof(Cursor)) { + const auto Input = DE.getAddress(Cursor); + const auto Output = DE.getAddress(Cursor); + Parsed.Map.insert({Input, Output}); + } + + assert(Cursor && "Error reading address map section"); + return Parsed; +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/lib/Core/BinaryBasicBlock.cpp b/bolt/lib/Core/BinaryBasicBlock.cpp index b271b86ec699..d764a874d08c 100644 --- a/bolt/lib/Core/BinaryBasicBlock.cpp +++ b/bolt/lib/Core/BinaryBasicBlock.cpp @@ -613,27 +613,5 @@ BinaryBasicBlock *BinaryBasicBlock::splitAt(iterator II) { return NewBlock; } -void BinaryBasicBlock::updateOutputValues(const MCAsmLayout &Layout) { - if (!LocSyms) - return; - - const uint64_t BBAddress = getOutputAddressRange().first; - const uint64_t BBOffset = Layout.getSymbolOffset(*getLabel()); - for (const auto &LocSymKV : *LocSyms) { - const uint32_t InputFunctionOffset = LocSymKV.first; - const uint32_t OutputOffset = static_cast( - Layout.getSymbolOffset(*LocSymKV.second) - BBOffset); - getOffsetTranslationTable().emplace_back( - std::make_pair(OutputOffset, InputFunctionOffset)); - - // Update reverse (relative to BAT) address lookup table for function. - if (getFunction()->requiresAddressTranslation()) { - getFunction()->getInputOffsetToAddressMap().emplace( - std::make_pair(InputFunctionOffset, OutputOffset + BBAddress)); - } - } - LocSyms.reset(nullptr); -} - } // namespace bolt } // namespace llvm diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index c4129615ac32..63446575f4b2 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -214,6 +214,10 @@ void BinaryEmitter::emitAll(StringRef OrgSecPrefix) { } emitDataSections(OrgSecPrefix); + + // TODO Enable for Mach-O once BinaryContext::getDataSection supports it. + if (BC.isELF()) + AddressMap::emit(Streamer, BC); } void BinaryEmitter::emitFunctions() { diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index f6146d9749f5..7b093116b3d6 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -2855,6 +2855,14 @@ bool BinaryFunction::requiresAddressTranslation() const { return opts::EnableBAT || hasSDTMarker() || hasPseudoProbe(); } +bool BinaryFunction::requiresAddressMap() const { + if (isInjected()) + return false; + + return opts::UpdateDebugSections || isMultiEntry() || + requiresAddressTranslation(); +} + uint64_t BinaryFunction::getInstructionCount() const { uint64_t Count = 0; for (const BinaryBasicBlock &BB : blocks()) @@ -4120,15 +4128,13 @@ void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { assert(FragmentBaseAddress == getOutputAddress()); } - const uint64_t BBOffset = Layout.getSymbolOffset(*BB->getLabel()); - const uint64_t BBAddress = FragmentBaseAddress + BBOffset; + const uint64_t BBAddress = + *BC.getIOAddressMap().lookup(BB->getInputOffset() + getAddress()); BB->setOutputStartAddress(BBAddress); if (PrevBB) PrevBB->setOutputEndAddress(BBAddress); PrevBB = BB; - - BB->updateOutputValues(Layout); } PrevBB->setOutputEndAddress(PrevBB->isSplit() @@ -4181,9 +4187,8 @@ uint64_t BinaryFunction::translateInputToOutputAddress(uint64_t Address) const { // Check if the address is associated with an instruction that is tracked // by address translation. - auto KV = InputOffsetToAddressMap.find(Address - getAddress()); - if (KV != InputOffsetToAddressMap.end()) - return KV->second; + if (auto OutputAddress = BC.getIOAddressMap().lookup(Address)) + return *OutputAddress; // FIXME: #18950828 - we rely on relative offsets inside basic blocks to stay // intact. Instead we can use pseudo instructions and/or annotations. diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt index a4612fb93f8c..c913179ebcc5 100644 --- a/bolt/lib/Core/CMakeLists.txt +++ b/bolt/lib/Core/CMakeLists.txt @@ -11,6 +11,7 @@ set(LLVM_LINK_COMPONENTS ) add_llvm_library(LLVMBOLTCore + AddressMap.cpp BinaryBasicBlock.cpp BinaryContext.cpp BinaryData.cpp diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp index 57a850eb1723..e004309e0e21 100644 --- a/bolt/lib/Profile/BoltAddressTranslation.cpp +++ b/bolt/lib/Profile/BoltAddressTranslation.cpp @@ -46,9 +46,14 @@ void BoltAddressTranslation::writeEntriesForBB(MapTy &Map, // allowing it to overwrite the previously inserted key in the map. Map[BBOutputOffset] = BBInputOffset; - for (const auto &IOPair : BB.getOffsetTranslationTable()) { - const uint64_t OutputOffset = IOPair.first + BBOutputOffset; - const uint32_t InputOffset = IOPair.second; + const auto &IOAddressMap = + BB.getFunction()->getBinaryContext().getIOAddressMap(); + + for (const auto &[InputOffset, Sym] : BB.getLocSyms()) { + const auto InputAddress = BB.getFunction()->getAddress() + InputOffset; + const auto OutputAddress = IOAddressMap.lookup(InputAddress); + assert(OutputAddress && "Unknown instruction address"); + const auto OutputOffset = *OutputAddress - FuncAddress; // Is this the first instruction in the BB? No need to duplicate the entry. if (OutputOffset == BBOutputOffset) diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index 64b8a8b6d400..316b83cfbd38 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -183,9 +183,7 @@ void PseudoProbeRewriter::updatePseudoProbes() { // A call probe may be duplicated due to ICP // Go through output of InputOffsetToAddressMap to collect all related // probes - const InputOffsetToAddressMapTy &Offset2Addr = - F->getInputOffsetToAddressMap(); - auto CallOutputAddresses = Offset2Addr.equal_range(Offset); + auto CallOutputAddresses = BC.getIOAddressMap().lookupAll(AP.first); auto CallOutputAddress = CallOutputAddresses.first; if (CallOutputAddress == CallOutputAddresses.second) { Probe->setAddress(INT64_MAX); diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 1ade842c4ee0..d194abd40b8e 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "bolt/Rewrite/RewriteInstance.h" +#include "bolt/Core/AddressMap.h" #include "bolt/Core/BinaryContext.h" #include "bolt/Core/BinaryEmitter.h" #include "bolt/Core/BinaryFunction.h" @@ -3169,6 +3170,9 @@ void RewriteInstance::preregisterSections() { ROFlags); BC->registerOrUpdateSection(getNewSecPrefix() + ".rodata.cold", ELF::SHT_PROGBITS, ROFlags); + BC->registerOrUpdateSection(AddressMap::SectionName, ELF::SHT_PROGBITS, + ROFlags) + .setLinkOnly(); } void RewriteInstance::emitAndLink() { @@ -3574,6 +3578,9 @@ void RewriteInstance::mapAllocatableSections( } for (BinarySection &Section : BC->allocatableSections()) { + if (Section.isLinkOnly()) + continue; + if (!Section.hasValidSectionID()) continue; @@ -3636,6 +3643,12 @@ void RewriteInstance::mapAllocatableSections( } void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { + if (auto MapSection = BC->getUniqueSectionByName(AddressMap::SectionName)) { + auto Map = AddressMap::parse(MapSection->getOutputContents(), *BC); + BC->setIOAddressMap(std::move(Map)); + BC->deregisterSection(*MapSection); + } + for (BinaryFunction *Function : BC->getAllBinaryFunctions()) Function->updateOutputValues(Layout); } @@ -5281,6 +5294,8 @@ void RewriteInstance::rewriteFile() { for (BinarySection &Section : BC->allocatableSections()) { if (!Section.isFinalized() || !Section.getOutputData()) continue; + if (Section.isLinkOnly()) + continue; if (opts::Verbosity >= 1) outs() << "BOLT: writing new section " << Section.getName() -- Gitee From f68954b3abf15e49d68ae4055350e29f61b68d5e Mon Sep 17 00:00:00 2001 From: zhoujiapeng Date: Wed, 23 Aug 2023 00:21:05 +0800 Subject: [PATCH 15/94] [Backport][BOLT][NFC] Split createRelocation in X86 and share the second part This commit splits the createRelocation function for the X86 architecture into two parts, retaining the first half and moving the second half to a new function called extractFixupExpr. The purpose of this change is to make extractFixupExpr a shared function between AArch64 and X86 architectures, increasing code reusability and maintainability. Child revision: https://reviews.llvm.org/D156018 Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D157217 --- bolt/include/bolt/Core/MCPlusBuilder.h | 42 ++++++++++++++++++++++++ bolt/lib/Target/X86/X86MCPlusBuilder.cpp | 41 ++--------------------- 2 files changed, 44 insertions(+), 39 deletions(-) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index cd4676f370e6..20098ea82e71 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -1730,6 +1730,48 @@ public: return true; } + /// Extract a symbol and an addend out of the fixup value expression. + /// + /// Only the following limited expression types are supported: + /// Symbol + Addend + /// Symbol + Constant + Addend + /// Const + Addend + /// Symbol + std::pair extractFixupExpr(const MCFixup &Fixup) const { + uint64_t Addend = 0; + MCSymbol *Symbol = nullptr; + const MCExpr *ValueExpr = Fixup.getValue(); + if (ValueExpr->getKind() == MCExpr::Binary) { + const auto *BinaryExpr = cast(ValueExpr); + assert(BinaryExpr->getOpcode() == MCBinaryExpr::Add && + "unexpected binary expression"); + const MCExpr *LHS = BinaryExpr->getLHS(); + if (LHS->getKind() == MCExpr::Constant) { + Addend = cast(LHS)->getValue(); + } else if (LHS->getKind() == MCExpr::Binary) { + const auto *LHSBinaryExpr = cast(LHS); + assert(LHSBinaryExpr->getOpcode() == MCBinaryExpr::Add && + "unexpected binary expression"); + const MCExpr *LLHS = LHSBinaryExpr->getLHS(); + assert(LLHS->getKind() == MCExpr::SymbolRef && "unexpected LLHS"); + Symbol = const_cast(this->getTargetSymbol(LLHS)); + const MCExpr *RLHS = LHSBinaryExpr->getRHS(); + assert(RLHS->getKind() == MCExpr::Constant && "unexpected RLHS"); + Addend = cast(RLHS)->getValue(); + } else { + assert(LHS->getKind() == MCExpr::SymbolRef && "unexpected LHS"); + Symbol = const_cast(this->getTargetSymbol(LHS)); + } + const MCExpr *RHS = BinaryExpr->getRHS(); + assert(RHS->getKind() == MCExpr::Constant && "unexpected RHS"); + Addend += cast(RHS)->getValue(); + } else { + assert(ValueExpr->getKind() == MCExpr::SymbolRef && "unexpected value"); + Symbol = const_cast(this->getTargetSymbol(ValueExpr)); + } + return std::make_pair(Symbol, Addend); + } + /// Return annotation index matching the \p Name. std::optional getAnnotationIndex(StringRef Name) const { auto AI = AnnotationNameIndexMap.find(Name); diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 265868fbddd4..9109159bf1b4 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -2464,46 +2464,9 @@ public: } } - // Extract a symbol and an addend out of the fixup value expression. - // - // Only the following limited expression types are supported: - // Symbol + Addend - // Symbol + Constant + Addend - // Const + Addend - // Symbol - uint64_t Addend = 0; - MCSymbol *Symbol = nullptr; - const MCExpr *ValueExpr = Fixup.getValue(); - if (ValueExpr->getKind() == MCExpr::Binary) { - const auto *BinaryExpr = cast(ValueExpr); - assert(BinaryExpr->getOpcode() == MCBinaryExpr::Add && - "unexpected binary expression"); - const MCExpr *LHS = BinaryExpr->getLHS(); - if (LHS->getKind() == MCExpr::Constant) { - Addend = cast(LHS)->getValue(); - } else if (LHS->getKind() == MCExpr::Binary) { - const auto *LHSBinaryExpr = cast(LHS); - assert(LHSBinaryExpr->getOpcode() == MCBinaryExpr::Add && - "unexpected binary expression"); - const MCExpr *LLHS = LHSBinaryExpr->getLHS(); - assert(LLHS->getKind() == MCExpr::SymbolRef && "unexpected LLHS"); - Symbol = const_cast(this->getTargetSymbol(LLHS)); - const MCExpr *RLHS = LHSBinaryExpr->getRHS(); - assert(RLHS->getKind() == MCExpr::Constant && "unexpected RLHS"); - Addend = cast(RLHS)->getValue(); - } else { - assert(LHS->getKind() == MCExpr::SymbolRef && "unexpected LHS"); - Symbol = const_cast(this->getTargetSymbol(LHS)); - } - const MCExpr *RHS = BinaryExpr->getRHS(); - assert(RHS->getKind() == MCExpr::Constant && "unexpected RHS"); - Addend += cast(RHS)->getValue(); - } else { - assert(ValueExpr->getKind() == MCExpr::SymbolRef && "unexpected value"); - Symbol = const_cast(this->getTargetSymbol(ValueExpr)); - } + auto [RelSymbol, RelAddend] = extractFixupExpr(Fixup); - return Relocation({RelOffset, Symbol, RelType, Addend, 0}); + return Relocation({RelOffset, RelSymbol, RelType, RelAddend, 0}); } bool replaceImmWithSymbolRef(MCInst &Inst, const MCSymbol *Symbol, -- Gitee From 0e10dc88322061f4335dafaf77bdbe391c33de81 Mon Sep 17 00:00:00 2001 From: zhoujiapeng Date: Wed, 23 Aug 2023 00:50:31 +0800 Subject: [PATCH 16/94] [Packport][BOLT] Implement createRelocation for AArch64 The implementation is based on the X86 version, with the same code of symbol and addend extraction. The differences include the support for RelType `R_AARCH64_CALL26` and the deletion of 8-bit relocation. Reviewed By: rafauler Differential Revision: https://reviews.llvm.org/D156018 --- bolt/lib/Core/Relocation.cpp | 7 +++ .../Target/AArch64/AArch64MCPlusBuilder.cpp | 47 +++++++++++++++++++ bolt/test/AArch64/reloc-call26.s | 29 ++++++++++++ 3 files changed, 83 insertions(+) create mode 100644 bolt/test/AArch64/reloc-call26.s diff --git a/bolt/lib/Core/Relocation.cpp b/bolt/lib/Core/Relocation.cpp index e985d6da82c1..45da2addbb98 100644 --- a/bolt/lib/Core/Relocation.cpp +++ b/bolt/lib/Core/Relocation.cpp @@ -345,6 +345,13 @@ static uint64_t encodeValueAArch64(uint64_t Type, uint64_t Value, uint64_t PC) { case ELF::R_AARCH64_PREL64: Value -= PC; break; + case ELF::R_AARCH64_CALL26: + Value -= PC; + assert(isInt<28>(Value) && "only PC +/- 128MB is allowed for direct call"); + // Immediate goes in bits 25:0 of BL. + // OP 1001_01 goes in bits 31:26 of BL. + Value = (Value >> 2) | 0x94000000ULL; + break; } return Value; } diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 777a1e6cc743..41594bc00146 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "Utils/AArch64BaseInfo.h" @@ -1550,6 +1551,52 @@ public: ELF::R_AARCH64_ADD_ABS_LO12_NC); return Insts; } + + std::optional + createRelocation(const MCFixup &Fixup, + const MCAsmBackend &MAB) const override { + const MCFixupKindInfo &FKI = MAB.getFixupKindInfo(Fixup.getKind()); + + assert(FKI.TargetOffset == 0 && "0-bit relocation offset expected"); + const uint64_t RelOffset = Fixup.getOffset(); + + uint64_t RelType; + if (Fixup.getKind() == MCFixupKind(AArch64::fixup_aarch64_pcrel_call26)) + RelType = ELF::R_AARCH64_CALL26; + else if (FKI.Flags & MCFixupKindInfo::FKF_IsPCRel) { + switch (FKI.TargetSize) { + default: + return std::nullopt; + case 16: + RelType = ELF::R_AARCH64_PREL16; + break; + case 32: + RelType = ELF::R_AARCH64_PREL32; + break; + case 64: + RelType = ELF::R_AARCH64_PREL64; + break; + } + } else { + switch (FKI.TargetSize) { + default: + return std::nullopt; + case 16: + RelType = ELF::R_AARCH64_ABS16; + break; + case 32: + RelType = ELF::R_AARCH64_ABS32; + break; + case 64: + RelType = ELF::R_AARCH64_ABS64; + break; + } + } + + auto [RelSymbol, RelAddend] = extractFixupExpr(Fixup); + + return Relocation({RelOffset, RelSymbol, RelType, RelAddend, 0}); + } }; } // end anonymous namespace diff --git a/bolt/test/AArch64/reloc-call26.s b/bolt/test/AArch64/reloc-call26.s new file mode 100644 index 000000000000..09399367b159 --- /dev/null +++ b/bolt/test/AArch64/reloc-call26.s @@ -0,0 +1,29 @@ +## This test checks processing of R_AARCH64_CALL26 relocation +## when option `--funcs` is enabled + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.bolt --funcs=func1 +# RUN: llvm-objdump -d --disassemble-symbols='_start' %t.bolt | \ +# RUN: FileCheck %s + +# CHECK: {{.*}} bl {{.*}} + + .text + .align 4 + .global _start + .type _start, %function +_start: + bl func1 + mov w8, #93 + svc #0 + .size _start, .-_start + + .global func1 + .type func1, %function +func1: + ret + .size func1, .-func1 -- Gitee From 785a825a400e9e1f1e4e54128c863287bc1d4ad2 Mon Sep 17 00:00:00 2001 From: Denis Revunov Date: Thu, 27 Jul 2023 11:48:08 -0400 Subject: [PATCH 17/94] [Backport][BOLT] Fix trap value for non-X86 The trap value used by BOLT was assumed to be single-byte instruction. It made some functions unaligned on AArch64(e.g exceptions-instrumentation test) and caused emission failures. Fix that by changing fill value to StringRef. Reviewed By: rafauler Differential Revision: https://reviews.llvm.org/D158191 --- bolt/include/bolt/Core/MCPlusBuilder.h | 9 ++++++--- bolt/lib/Core/BinaryEmitter.cpp | 4 ++-- bolt/lib/Rewrite/RewriteInstance.cpp | 6 ++++-- bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 4 ++++ bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp | 4 ++++ bolt/lib/Target/X86/X86MCPlusBuilder.cpp | 2 +- 6 files changed, 21 insertions(+), 8 deletions(-) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 20098ea82e71..4fef36103ca8 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -639,9 +639,12 @@ public: return false; } - /// If non-zero, this is used to fill the executable space with instructions - /// that will trap. Defaults to 0. - virtual unsigned getTrapFillValue() const { return 0; } + /// Used to fill the executable space with instructions + /// that will trap. + virtual StringRef getTrapFillValue() const { + llvm_unreachable("not implemented"); + return StringRef(); + } /// Interface and basic functionality of a MCInstMatcher. The idea is to make /// it easy to match one or more MCInsts against a tree-like pattern and diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index 63446575f4b2..95ab63521c06 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -380,7 +380,7 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function, } if (opts::MarkFuncs) - Streamer.emitIntValue(BC.MIB->getTrapFillValue(), 1); + Streamer.emitBytes(BC.MIB->getTrapFillValue()); // Emit CFI end if (Function.hasCFI()) @@ -424,7 +424,7 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF, // case, the call site entries in that LSDA have 0 as offset to the landing // pad, which the runtime interprets as "no handler". To prevent this, // insert some padding. - Streamer.emitIntValue(BC.MIB->getTrapFillValue(), 1); + Streamer.emitBytes(BC.MIB->getTrapFillValue()); } // Track the first emitted instruction with debug info. diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index d194abd40b8e..129d29750652 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -5284,8 +5284,10 @@ void RewriteInstance::rewriteFile() { if (!BF.getFileOffset() || !BF.isEmitted()) continue; OS.seek(BF.getFileOffset()); - for (unsigned I = 0; I < BF.getMaxSize(); ++I) - OS.write((unsigned char)BC->MIB->getTrapFillValue()); + StringRef TrapInstr = BC->MIB->getTrapFillValue(); + unsigned NInstr = BF.getMaxSize() / TrapInstr.size(); + for (unsigned I = 0; I < NInstr; ++I) + OS.write(TrapInstr.data(), TrapInstr.size()); } OS.seek(SavedPos); } diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 41594bc00146..db9dfaea4ed6 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -1287,6 +1287,10 @@ public: } } + StringRef getTrapFillValue() const override { + return StringRef("\0\0\0\0", 4); + } + bool createReturn(MCInst &Inst) const override { Inst.setOpcode(AArch64::RET); Inst.clear(); diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp index ec5bca85231c..badc1bde80b5 100644 --- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp +++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp @@ -171,6 +171,10 @@ public: return true; } + StringRef getTrapFillValue() const override { + return StringRef("\0\0\0\0", 4); + } + bool analyzeBranch(InstructionIterator Begin, InstructionIterator End, const MCSymbol *&TBB, const MCSymbol *&FBB, MCInst *&CondBranch, diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 9109159bf1b4..d3d371d8881e 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -416,7 +416,7 @@ public: } } - unsigned getTrapFillValue() const override { return 0xCC; } + StringRef getTrapFillValue() const override { return StringRef("\314", 1); } struct IndJmpMatcherFrag1 : MCInstMatcher { std::unique_ptr Base; -- Gitee From bbc18f977fa6c6ad6f582264c9fce32c84c4ce5e Mon Sep 17 00:00:00 2001 From: Denis Revunov Date: Thu, 17 Aug 2023 18:30:07 +0300 Subject: [PATCH 18/94] [Backport][BOLT] Add test for emitting trap value Reviewed By: rafauler Differential Revision: https://reviews.llvm.org/D158191 --- bolt/test/runtime/mark-funcs.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 bolt/test/runtime/mark-funcs.c diff --git a/bolt/test/runtime/mark-funcs.c b/bolt/test/runtime/mark-funcs.c new file mode 100644 index 000000000000..a8586ca8b6e1 --- /dev/null +++ b/bolt/test/runtime/mark-funcs.c @@ -0,0 +1,22 @@ +#include + +int dummy() { + printf("Dummy called\n"); + return 0; +} + +int main(int argc, char **argv) { + if (dummy() != 0) + return 1; + printf("Main called\n"); + return 0; +} +// Check that emitting trap value works properly and +// does not break functions +// REQUIRES: system-linux +// RUN: %clangxx -Wl,-q %s -o %t.exe +// RUN: %t.exe | FileCheck %s +// CHECK: Dummy called +// CHECK-NEXT: Main called +// RUN: llvm-bolt %t.exe -o %t.exe.bolt -lite=false --mark-funcs +// RUN: %t.exe.bolt | FileCheck %s -- Gitee From 8aec081247e7a74d6ffcb68647e3a2110f8b3632 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 23 Aug 2023 16:36:54 -0700 Subject: [PATCH 19/94] [Backport][BOLT] Don't choke on injected functions' IO map AddressMap would fail lookup for injected functions and crash BOLT. Fix that. Reviewed By: #bolt, maksfb, jobnoorman Differential Revision: https://reviews.llvm.org/D158685 --- bolt/lib/Core/BinaryFunction.cpp | 10 ++++++++-- bolt/test/X86/patch-entries.c | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 bolt/test/X86/patch-entries.c diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 7b093116b3d6..7bd16a55111e 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -4128,8 +4128,14 @@ void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { assert(FragmentBaseAddress == getOutputAddress()); } - const uint64_t BBAddress = - *BC.getIOAddressMap().lookup(BB->getInputOffset() + getAddress()); + // Injected functions likely will fail lookup, as they have no + // input range. Just assign the BB the output address of the + // function. + auto MaybeBBAddress = + BC.getIOAddressMap().lookup(BB->getInputOffset() + getAddress()); + const uint64_t BBAddress = MaybeBBAddress ? *MaybeBBAddress + : BB->isSplit() ? FF.getAddress() + : getOutputAddress(); BB->setOutputStartAddress(BBAddress); if (PrevBB) diff --git a/bolt/test/X86/patch-entries.c b/bolt/test/X86/patch-entries.c new file mode 100644 index 000000000000..d435781fc60f --- /dev/null +++ b/bolt/test/X86/patch-entries.c @@ -0,0 +1,19 @@ +// Checking crashes against injected binary functions created by patch +// entries pass and debug info turned on. In these cases, we were +// trying to fetch input to output maps on injected functions and +// crashing. + +// REQUIRES: system-linux + +// RUN: %clang %cflags -no-pie -g %s -fuse-ld=lld -o %t.exe -Wl,-q +// RUN: llvm-bolt -relocs %t.exe -o %t.out --update-debug-sections \ +// RUN: --force-patch + +#include + +static void foo() { printf("foo\n"); } + +int main() { + foo(); + return 0; +} -- Gitee From 89678956d55b3b477dd3dd61888de86417fa1f36 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Fri, 25 Aug 2023 15:54:39 -0700 Subject: [PATCH 20/94] [Backport][BOLT] Fix cross-compilation build Don't enable BOLT runtime when cross compiling as we don't support this scenario yet. Differential Revision: https://reviews.llvm.org/D158906 --- bolt/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 89462f8a14c1..f163d4534287 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -35,7 +35,8 @@ set(BOLT_ENABLE_RUNTIME_default OFF) if ((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") AND (CMAKE_SYSTEM_NAME STREQUAL "Linux" - OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")) + OR CMAKE_SYSTEM_NAME STREQUAL "Darwin") + AND (NOT CMAKE_CROSSCOMPILING)) set(BOLT_ENABLE_RUNTIME_default ON) endif() option(BOLT_ENABLE_RUNTIME "Enable BOLT runtime" ${BOLT_ENABLE_RUNTIME_default}) -- Gitee From 0a9c8e9bb0f3f2a2ba4e13b1dfa46a08f494f2a4 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Mon, 28 Aug 2023 10:04:02 +0200 Subject: [PATCH 21/94] [Backport][BOLT] Calculate output values using BOLTLinker BOLT uses `MCAsmLayout` to calculate the output values of functions and basic blocks. This means output values are calculated based on a pre-linking state and any changes to symbol values during linking will cause incorrect values to be used. This issue can be triggered by enabling linker relaxation on RISC-V. Since linker relaxation can remove instructions, symbol values may change. This causes, among other things, the symbol table created by BOLT in the output executable to be incorrect. This patch solves this issue by using `BOLTLinker` to get symbol values instead of `MCAsmLayout`. This way, output values are calculated based on a post-linking state. To make sure the linker can update all necessary symbols, this patch also makes sure all these symbols are not marked as temporary so that they end-up in the object file's symbol table. Note that this patch only deals with symbols of binary functions (`BinaryFunction::updateOutputValues`). The technique described above turned out to be too expensive for basic block symbols so those are handled differently in D155604. Reviewed By: maksfb Differential Revision: https://reviews.llvm.org/D154604 --- bolt/include/bolt/Core/BinaryFunction.h | 6 +-- bolt/include/bolt/Core/Linker.h | 15 +++++- bolt/include/bolt/Rewrite/JITLinkLinker.h | 5 +- bolt/include/bolt/Rewrite/RewriteInstance.h | 2 +- bolt/lib/Core/BinaryBasicBlock.cpp | 1 - bolt/lib/Core/BinaryFunction.cpp | 55 ++++++++++----------- bolt/lib/Rewrite/JITLinkLinker.cpp | 7 +-- bolt/lib/Rewrite/MachORewriteInstance.cpp | 4 -- bolt/lib/Rewrite/RewriteInstance.cpp | 14 +++--- 9 files changed, 56 insertions(+), 53 deletions(-) diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 9b45467a6a8f..a34b6cfd1f5e 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -1191,7 +1191,7 @@ public: if (!Islands->FunctionConstantIslandLabel) { Islands->FunctionConstantIslandLabel = - BC.Ctx->createNamedTempSymbol("func_const_island"); + BC.Ctx->getOrCreateSymbol("func_const_island@" + getOneName()); } return Islands->FunctionConstantIslandLabel; } @@ -1201,7 +1201,7 @@ public: if (!Islands->FunctionColdConstantIslandLabel) { Islands->FunctionColdConstantIslandLabel = - BC.Ctx->createNamedTempSymbol("func_cold_const_island"); + BC.Ctx->getOrCreateSymbol("func_cold_const_island@" + getOneName()); } return Islands->FunctionColdConstantIslandLabel; } @@ -1221,7 +1221,7 @@ public: } /// Update output values of the function based on the final \p Layout. - void updateOutputValues(const MCAsmLayout &Layout); + void updateOutputValues(const BOLTLinker &Linker); /// Register relocation type \p RelType at a given \p Address in the function /// against \p Symbol. diff --git a/bolt/include/bolt/Core/Linker.h b/bolt/include/bolt/Core/Linker.h index 69e1fe431c0b..1e0876a0e13d 100644 --- a/bolt/include/bolt/Core/Linker.h +++ b/bolt/include/bolt/Core/Linker.h @@ -31,6 +31,11 @@ public: std::function; using SectionsMapper = std::function; + struct SymbolInfo { + uint64_t Address; + uint64_t Size; + }; + virtual ~BOLTLinker() = default; /// Load and link \p Obj. \p MapSections will be called before the object is @@ -38,8 +43,16 @@ public: /// of a section can be changed by calling the passed SectionMapper. virtual void loadObject(MemoryBufferRef Obj, SectionsMapper MapSections) = 0; + /// Return the address and size of a symbol or std::nullopt if it cannot be + /// found. + virtual std::optional lookupSymbolInfo(StringRef Name) const = 0; + /// Return the address of a symbol or std::nullopt if it cannot be found. - virtual std::optional lookupSymbol(StringRef Name) const = 0; + std::optional lookupSymbol(StringRef Name) const { + if (const auto Info = lookupSymbolInfo(Name)) + return Info->Address; + return std::nullopt; + } }; } // namespace bolt diff --git a/bolt/include/bolt/Rewrite/JITLinkLinker.h b/bolt/include/bolt/Rewrite/JITLinkLinker.h index 104c75bea0c2..1c41a26ac256 100644 --- a/bolt/include/bolt/Rewrite/JITLinkLinker.h +++ b/bolt/include/bolt/Rewrite/JITLinkLinker.h @@ -17,7 +17,6 @@ #include "bolt/Rewrite/ExecutableFileMemoryManager.h" #include "llvm/ExecutionEngine/JITLink/JITLinkDylib.h" -#include #include #include @@ -35,7 +34,7 @@ private: std::unique_ptr MM; jitlink::JITLinkDylib Dylib{"main"}; std::vector Allocs; - std::map Symtab; + StringMap Symtab; public: JITLinkLinker(BinaryContext &BC, @@ -43,7 +42,7 @@ public: ~JITLinkLinker(); void loadObject(MemoryBufferRef Obj, SectionsMapper MapSections) override; - std::optional lookupSymbol(StringRef Name) const override; + std::optional lookupSymbolInfo(StringRef Name) const override; static SmallVector orderedBlocks(const jitlink::Section &Section); diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h index 072c8109241d..940c7324594e 100644 --- a/bolt/include/bolt/Rewrite/RewriteInstance.h +++ b/bolt/include/bolt/Rewrite/RewriteInstance.h @@ -190,7 +190,7 @@ private: void mapAllocatableSections(BOLTLinker::SectionMapper MapSection); /// Update output object's values based on the final \p Layout. - void updateOutputValues(const MCAsmLayout &Layout); + void updateOutputValues(const BOLTLinker &Linker); /// Rewrite back all functions (hopefully optimized) that fit in the original /// memory footprint for that function. If the function is now larger and does diff --git a/bolt/lib/Core/BinaryBasicBlock.cpp b/bolt/lib/Core/BinaryBasicBlock.cpp index d764a874d08c..984bc6dbd220 100644 --- a/bolt/lib/Core/BinaryBasicBlock.cpp +++ b/bolt/lib/Core/BinaryBasicBlock.cpp @@ -14,7 +14,6 @@ #include "bolt/Core/BinaryContext.h" #include "bolt/Core/BinaryFunction.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/Errc.h" diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 7bd16a55111e..80470fbef558 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -25,7 +25,6 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Demangle/Demangle.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCExpr.h" @@ -4030,7 +4029,7 @@ void BinaryFunction::calculateLoopInfo() { } } -void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { +void BinaryFunction::updateOutputValues(const BOLTLinker &Linker) { if (!isEmitted()) { assert(!isInjected() && "injected function should be emitted"); setOutputAddress(getAddress()); @@ -4038,16 +4037,17 @@ void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { return; } - const uint64_t BaseAddress = getCodeSection()->getOutputAddress(); + const auto SymbolInfo = Linker.lookupSymbolInfo(getSymbol()->getName()); + assert(SymbolInfo && "Cannot find function entry symbol"); + setOutputAddress(SymbolInfo->Address); + setOutputSize(SymbolInfo->Size); + if (BC.HasRelocations || isInjected()) { - const uint64_t StartOffset = Layout.getSymbolOffset(*getSymbol()); - const uint64_t EndOffset = Layout.getSymbolOffset(*getFunctionEndLabel()); - setOutputAddress(BaseAddress + StartOffset); - setOutputSize(EndOffset - StartOffset); if (hasConstantIsland()) { - const uint64_t DataOffset = - Layout.getSymbolOffset(*getFunctionConstantIslandLabel()); - setOutputDataAddress(BaseAddress + DataOffset); + const auto DataAddress = + Linker.lookupSymbol(getFunctionConstantIslandLabel()->getName()); + assert(DataAddress && "Cannot find function CI symbol"); + setOutputDataAddress(*DataAddress); for (auto It : Islands->Offsets) { const uint64_t OldOffset = It.first; BinaryData *BD = BC.getBinaryDataAtAddress(getAddress() + OldOffset); @@ -4055,8 +4055,11 @@ void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { continue; MCSymbol *Symbol = It.second; - const uint64_t NewOffset = Layout.getSymbolOffset(*Symbol); - BD->setOutputLocation(*getCodeSection(), NewOffset); + const auto NewAddress = Linker.lookupSymbol(Symbol->getName()); + assert(NewAddress && "Cannot find CI symbol"); + auto &Section = *getCodeSection(); + const auto NewOffset = *NewAddress - Section.getOutputAddress(); + BD->setOutputLocation(Section, NewOffset); } } if (isSplit()) { @@ -4066,7 +4069,6 @@ void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { // If fragment is empty, cold section might not exist if (FF.empty() && ColdSection.getError()) continue; - const uint64_t ColdBaseAddress = ColdSection->getOutputAddress(); const MCSymbol *ColdStartSymbol = getSymbol(FF.getFragmentNum()); // If fragment is empty, symbol might have not been emitted @@ -4075,31 +4077,24 @@ void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { continue; assert(ColdStartSymbol && ColdStartSymbol->isDefined() && "split function should have defined cold symbol"); - const MCSymbol *ColdEndSymbol = - getFunctionEndLabel(FF.getFragmentNum()); - assert(ColdEndSymbol && ColdEndSymbol->isDefined() && - "split function should have defined cold end symbol"); - const uint64_t ColdStartOffset = - Layout.getSymbolOffset(*ColdStartSymbol); - const uint64_t ColdEndOffset = Layout.getSymbolOffset(*ColdEndSymbol); - FF.setAddress(ColdBaseAddress + ColdStartOffset); - FF.setImageSize(ColdEndOffset - ColdStartOffset); + const auto ColdStartSymbolInfo = + Linker.lookupSymbolInfo(ColdStartSymbol->getName()); + assert(ColdStartSymbolInfo && "Cannot find cold start symbol"); + FF.setAddress(ColdStartSymbolInfo->Address); + FF.setImageSize(ColdStartSymbolInfo->Size); if (hasConstantIsland()) { - const uint64_t DataOffset = - Layout.getSymbolOffset(*getFunctionColdConstantIslandLabel()); - setOutputColdDataAddress(ColdBaseAddress + DataOffset); + const auto DataAddress = Linker.lookupSymbol( + getFunctionColdConstantIslandLabel()->getName()); + assert(DataAddress && "Cannot find cold CI symbol"); + setOutputColdDataAddress(*DataAddress); } } } - } else { - setOutputAddress(getAddress()); - setOutputSize(Layout.getSymbolOffset(*getFunctionEndLabel())); } // Update basic block output ranges for the debug info, if we have // secondary entry points in the symbol table to update or if writing BAT. - if (!opts::UpdateDebugSections && !isMultiEntry() && - !requiresAddressTranslation()) + if (!requiresAddressMap()) return; // Output ranges should match the input if the body hasn't changed. diff --git a/bolt/lib/Rewrite/JITLinkLinker.cpp b/bolt/lib/Rewrite/JITLinkLinker.cpp index 3c74fd5932bf..c57dd5893115 100644 --- a/bolt/lib/Rewrite/JITLinkLinker.cpp +++ b/bolt/lib/Rewrite/JITLinkLinker.cpp @@ -142,8 +142,8 @@ struct JITLinkLinker::Context : jitlink::JITLinkContext { }); for (auto *Symbol : G.defined_symbols()) { - Linker.Symtab.insert( - {Symbol->getName().str(), Symbol->getAddress().getValue()}); + SymbolInfo Info{Symbol->getAddress().getValue(), Symbol->getSize()}; + Linker.Symtab.insert({Symbol->getName().str(), Info}); } return Error::success(); @@ -174,7 +174,8 @@ void JITLinkLinker::loadObject(MemoryBufferRef Obj, jitlink::link(std::move(*LG), std::move(Ctx)); } -std::optional JITLinkLinker::lookupSymbol(StringRef Name) const { +std::optional +JITLinkLinker::lookupSymbolInfo(StringRef Name) const { auto It = Symtab.find(Name.data()); if (It == Symtab.end()) return std::nullopt; diff --git a/bolt/lib/Rewrite/MachORewriteInstance.cpp b/bolt/lib/Rewrite/MachORewriteInstance.cpp index fc7500a6deb0..8214cade8280 100644 --- a/bolt/lib/Rewrite/MachORewriteInstance.cpp +++ b/bolt/lib/Rewrite/MachORewriteInstance.cpp @@ -20,7 +20,6 @@ #include "bolt/Rewrite/JITLinkLinker.h" #include "bolt/RuntimeLibs/InstrumentationRuntimeLibrary.h" #include "bolt/Utils/Utils.h" -#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCObjectStreamer.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" @@ -476,9 +475,6 @@ void MachORewriteInstance::emitAndLink() { "error creating in-memory object"); assert(Obj && "createObjectFile cannot return nullptr"); - MCAsmLayout FinalLayout( - static_cast(Streamer.get())->getAssembler()); - auto EFMM = std::make_unique(*BC); EFMM->setNewSecPrefix(getNewSecPrefix()); EFMM->setOrgSecPrefix(getOrgSecPrefix()); diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 129d29750652..94856edef642 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -3239,15 +3239,15 @@ void RewriteInstance::emitAndLink() { Linker->loadObject(ObjectMemBuffer->getMemBufferRef(), [this](auto MapSection) { mapFileSections(MapSection); }); - MCAsmLayout FinalLayout( - static_cast(Streamer.get())->getAssembler()); - // Update output addresses based on the new section map and // layout. Only do this for the object created by ourselves. - updateOutputValues(FinalLayout); + updateOutputValues(*Linker); - if (opts::UpdateDebugSections) + if (opts::UpdateDebugSections) { + MCAsmLayout FinalLayout( + static_cast(Streamer.get())->getAssembler()); DebugInfoRewriter->updateLineTableOffsets(FinalLayout); + } if (RuntimeLibrary *RtLibrary = BC->getRuntimeLibrary()) RtLibrary->link(*BC, ToolPath, *Linker, [this](auto MapSection) { @@ -3642,7 +3642,7 @@ void RewriteInstance::mapAllocatableSections( } } -void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { +void RewriteInstance::updateOutputValues(const BOLTLinker &Linker) { if (auto MapSection = BC->getUniqueSectionByName(AddressMap::SectionName)) { auto Map = AddressMap::parse(MapSection->getOutputContents(), *BC); BC->setIOAddressMap(std::move(Map)); @@ -3650,7 +3650,7 @@ void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { } for (BinaryFunction *Function : BC->getAllBinaryFunctions()) - Function->updateOutputValues(Layout); + Function->updateOutputValues(Linker); } void RewriteInstance::patchELFPHDRTable() { -- Gitee From 5836c990b222b9a279bc4bcaef342b40489fb395 Mon Sep 17 00:00:00 2001 From: hezuoqiang Date: Mon, 28 Aug 2023 22:28:44 +0800 Subject: [PATCH 22/94] [Backport][BOLT] BL/BH are considered aliases in regreassign MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The relationship of X86 registers is shown in the diagram. BL and BH do not have a direct alias relationship. However, if the BH register cannot be swapped, then the BX/EBX/RBX registers cannot be swapped as well, which means that BL register also cannot be swapped. Therefore, in the presence of BX/EBX/RBX registers, BL and BH have an alias relationship. ┌────────────────┐ │ RBX │ ├────┬───────────┤ │ │ EBX │ ├────┴──┬────────┤ │ │ BX │ ├───────┼───┬────┤ │ │BH │BL │ └───────┴───┴────┘ Reviewed By: rafauler Differential Revision: https://reviews.llvm.org/D155098 --- bolt/lib/Passes/RegReAssign.cpp | 26 ++++++++ .../runtime/X86/reg-reassign-no-swap-bl.s | 59 +++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 bolt/test/runtime/X86/reg-reassign-no-swap-bl.s diff --git a/bolt/lib/Passes/RegReAssign.cpp b/bolt/lib/Passes/RegReAssign.cpp index 0efd27f0910b..8b9dc9c1fdd5 100644 --- a/bolt/lib/Passes/RegReAssign.cpp +++ b/bolt/lib/Passes/RegReAssign.cpp @@ -175,9 +175,25 @@ void RegReAssign::rankRegisters(BinaryFunction &Function) { continue; // Disallow substituitions involving regs in instrs that cannot use REX + // The relationship of X86 registers is shown in the diagram. BL and BH + // do not have a direct alias relationship. However, if the BH register + // cannot be swapped, then the BX/EBX/RBX registers cannot be swapped as + // well, which means that BL register also cannot be swapped. Therefore, + // in the presence of BX/EBX/RBX registers, BL and BH have an alias + // relationship. + // ┌─────────────────┐ + // │ RBX │ + // ├─────┬───────────┤ + // │ │ EBX │ + // ├─────┴──┬────────┤ + // │ │ BX │ + // ├────────┼───┬────┤ + // │ │BH │BL │ + // └────────┴───┴────┘ if (CannotUseREX) { RegScore[RegEC] = std::numeric_limits::min(); + RegScore[BC.MIB->getAliasSized(Reg, 1)] = RegScore[RegEC]; continue; } @@ -185,6 +201,7 @@ void RegReAssign::rankRegisters(BinaryFunction &Function) { if (BC.MIB->isUpper8BitReg(Reg) && ClassicCSR.test(Reg)) { RegScore[RegEC] = std::numeric_limits::min(); + RegScore[BC.MIB->getAliasSized(Reg, 1)] = RegScore[RegEC]; continue; } @@ -370,6 +387,15 @@ bool RegReAssign::conservativePassOverFunction(BinaryFunction &Function) { if (!RBX) return false; + // The high 8 bits of the register will never be swapped. To prevent the high + // 8 bits from being swapped incorrectly, we should switched to swapping the + // low 8 bits of the register instead. + if (BC.MIB->isUpper8BitReg(RBX)) { + RBX = BC.MIB->getAliasSized(RBX, 1); + if (RegScore[RBX] < 0 || RegScore[RBX] > RegScore[Candidate]) + return false; + } + LLVM_DEBUG(dbgs() << "\n ** Swapping " << BC.MRI->getName(RBX) << " with " << BC.MRI->getName(Candidate) << "\n\n"); (void)BC; diff --git a/bolt/test/runtime/X86/reg-reassign-no-swap-bl.s b/bolt/test/runtime/X86/reg-reassign-no-swap-bl.s new file mode 100644 index 000000000000..4e2e70ed6cba --- /dev/null +++ b/bolt/test/runtime/X86/reg-reassign-no-swap-bl.s @@ -0,0 +1,59 @@ +# This test case is used to reproduce an issue found in the mongod database. +# In function rankRegisters, if there is a BH Reg in the basic block, then the BL Reg +# also cannot be swap. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: %clang -no-pie %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.out -data=%t.fdata --reg-reassign | FileCheck %s +# RUN: %t.out + +# CHECK: Reg Reassignment Pass: no changes were made + .text + .globl main + .globl main.cold + .p2align 4, 0x90 + .type main,@function + .type main.cold,@function +main.cold: +bb1: + mov $0x2, %bh +bb2: + jmp bb5 +main: # @main + .cfi_startproc +# %bb.0: # %entry + pushq %rax + pushq %r12 + pushq %rbx + .cfi_def_cfa_offset 16 + mov $0x1, %r12 + shr $0x14, %r12 + add $0x14, %r12 + mov $0x11, %rbx + mov $0x1, %bh + mov $0x1, %bl +bb3: + add $0x1, %r12 +bb4: + jmp bb1 +bb5: + cmp $0x201, %rbx + jne 0x0 +bb6: + xorl %eax, %eax + popq %rcx + popq %rbx + popq %r12 + .cfi_def_cfa_offset 8 + retq +# FDATA: 1 main.cold #bb2# 1 main 0 0 100 +# FDATA: 1 main #bb3# 1 main #bb4# 0 100 +# FDATA: 1 main #bb4# 1 main.cold 0 0 100 + +.Lfunc_end0: + .size main, .Lfunc_end0-main + .cfi_endproc -- Gitee From 9c2b819f56a487f39f39d36e8ee87dcfdb0cd6ed Mon Sep 17 00:00:00 2001 From: Sinan Lin Date: Thu, 31 Aug 2023 11:01:39 +0800 Subject: [PATCH 23/94] [Backport][BOLT] Fix a bug related to iterators in ReorderData pass If `Itr` is the last element and then `std::next(Itr)` will be `Range.end()`, so that the statement `std::next(Itr)->second` is a UB. Reviewed By: yota9, maksfb Differential Revision: https://reviews.llvm.org/D159177 --- bolt/lib/Passes/ReorderData.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/lib/Passes/ReorderData.cpp b/bolt/lib/Passes/ReorderData.cpp index 4df6ce37596d..6e1f9b6d7751 100644 --- a/bolt/lib/Passes/ReorderData.cpp +++ b/bolt/lib/Passes/ReorderData.cpp @@ -413,17 +413,17 @@ bool ReorderData::markUnmoveableSymbols(BinaryContext &BC, auto Range = BC.getBinaryDataForSection(Section); bool FoundUnmoveable = false; for (auto Itr = Range.begin(); Itr != Range.end(); ++Itr) { + BinaryData *Next = + std::next(Itr) != Range.end() ? std::next(Itr)->second : nullptr; if (Itr->second->getName().startswith("PG.")) { BinaryData *Prev = Itr != Range.begin() ? std::prev(Itr)->second : nullptr; - BinaryData *Next = Itr != Range.end() ? std::next(Itr)->second : nullptr; bool PrevIsPrivate = Prev && isPrivate(Prev); bool NextIsPrivate = Next && isPrivate(Next); if (isPrivate(Itr->second) && (PrevIsPrivate || NextIsPrivate)) Itr->second->setIsMoveable(false); } else { // check for overlapping symbols. - BinaryData *Next = Itr != Range.end() ? std::next(Itr)->second : nullptr; if (Next && Itr->second->getEndAddress() != Next->getAddress() && Next->containsAddress(Itr->second->getEndAddress())) { Itr->second->setIsMoveable(false); -- Gitee From 7ad78a92ddff278a62f1eba5f0a6136a9d2c9a02 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Mon, 24 Jul 2023 10:27:05 -0700 Subject: [PATCH 24/94] [Backport][BOLT] Fine-tuning hash computation for stale matching Fine-tuning hash computation for stale matching: - introducing a new "loose" basic block hash that allows to match many more blocks than before; - tweaking params of the inference algorithm that find (slightly) better solutions; - added more meaningful tests for stale matching. Tested the changes on several open-source benchmarks (clang, rocksdb, chrome) and one prod workload using different compiler modes (LTO/PGO etc). There is always an improvement in the quality of inferred profiles. (The current implementation is still not optimal but the diff is a step forward; I am open to further suggestions) Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D156278 --- bolt/include/bolt/Core/HashUtilities.h | 4 +- bolt/lib/Core/HashUtilities.cpp | 37 ++++++ bolt/lib/Profile/StaleProfileMatching.cpp | 119 ++++++++---------- .../test/X86/Inputs/blarge_profile_stale.yaml | 27 ++-- bolt/test/X86/reader-stale-yaml.test | 90 ++++++++----- 5 files changed, 167 insertions(+), 110 deletions(-) diff --git a/bolt/include/bolt/Core/HashUtilities.h b/bolt/include/bolt/Core/HashUtilities.h index 8d445ff83756..53ea110aa683 100644 --- a/bolt/include/bolt/Core/HashUtilities.h +++ b/bolt/include/bolt/Core/HashUtilities.h @@ -20,8 +20,6 @@ namespace llvm { namespace bolt { -uint16_t hash_64_to_16(const uint64_t Hash); - std::string hashInteger(uint64_t Value); std::string hashSymbol(BinaryContext &BC, const MCSymbol &Symbol); @@ -35,6 +33,8 @@ using OperandHashFuncTy = function_ref; std::string hashBlock(BinaryContext &BC, const BinaryBasicBlock &BB, OperandHashFuncTy OperandHashFunc); +std::string hashBlockLoose(BinaryContext &BC, const BinaryBasicBlock &BB); + } // namespace bolt } // namespace llvm diff --git a/bolt/lib/Core/HashUtilities.cpp b/bolt/lib/Core/HashUtilities.cpp index 0752eaeabef8..0fc72be888be 100644 --- a/bolt/lib/Core/HashUtilities.cpp +++ b/bolt/lib/Core/HashUtilities.cpp @@ -130,5 +130,42 @@ std::string hashBlock(BinaryContext &BC, const BinaryBasicBlock &BB, return HashString; } +/// A "loose" hash of a basic block to use with the stale profile matching. The +/// computed value will be the same for blocks with minor changes (such as +/// reordering of instructions or using different operands) but may result in +/// collisions that need to be resolved by a stronger hashing. +std::string hashBlockLoose(BinaryContext &BC, const BinaryBasicBlock &BB) { + // The hash is computed by creating a string of all lexicographically ordered + // instruction opcodes, which is then hashed with std::hash. + std::set Opcodes; + for (const MCInst &Inst : BB) { + if (BC.MIB->isPseudo(Inst)) + continue; + + // Ignore unconditional jumps, as they can be added / removed as a result + // of basic block reordering. + if (BC.MIB->isUnconditionalBranch(Inst)) + continue; + + // Do not distinguish different types of conditional jumps. + if (BC.MIB->isConditionalBranch(Inst)) { + Opcodes.insert("JMP"); + continue; + } + + std::string Mnemonic = BC.InstPrinter->getMnemonic(&Inst).first; + Mnemonic.erase( + std::remove_if(Mnemonic.begin(), Mnemonic.end(), + [](unsigned char ch) { return std::isspace(ch); }), + Mnemonic.end()); + Opcodes.insert(Mnemonic); + } + + std::string HashString; + for (const std::string &Opcode : Opcodes) + HashString.append(Opcode); + return HashString; +} + } // namespace bolt } // namespace llvm diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index b5895d19de20..3b43ed64bd91 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -73,64 +73,39 @@ cl::opt StaleMatchingJoinIslands( cl::opt StaleMatchingCostBlockInc( "stale-matching-cost-block-inc", - cl::desc("The cost of increasing a block's count by one."), cl::init(110), + cl::desc("The cost of increasing a block count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); cl::opt StaleMatchingCostBlockDec( "stale-matching-cost-block-dec", - cl::desc("The cost of decreasing a block's count by one."), cl::init(100), + cl::desc("The cost of decreasing a block count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); -cl::opt StaleMatchingCostBlockEntryInc( - "stale-matching-cost-block-entry-inc", - cl::desc("The cost of increasing the entry block's count by one."), - cl::init(110), cl::ReallyHidden, cl::cat(BoltOptCategory)); - -cl::opt StaleMatchingCostBlockEntryDec( - "stale-matching-cost-block-entry-dec", - cl::desc("The cost of decreasing the entry block's count by one."), - cl::init(100), cl::ReallyHidden, cl::cat(BoltOptCategory)); - -cl::opt StaleMatchingCostBlockZeroInc( - "stale-matching-cost-block-zero-inc", - cl::desc("The cost of increasing a count of zero-weight block by one."), - cl::init(10), cl::Hidden, cl::cat(BoltOptCategory)); - -cl::opt StaleMatchingCostBlockUnknownInc( - "stale-matching-cost-block-unknown-inc", - cl::desc("The cost of increasing an unknown block's count by one."), - cl::init(10), cl::ReallyHidden, cl::cat(BoltOptCategory)); - cl::opt StaleMatchingCostJumpInc( "stale-matching-cost-jump-inc", - cl::desc("The cost of increasing a jump's count by one."), cl::init(100), + cl::desc("The cost of increasing a jump count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); -cl::opt StaleMatchingCostJumpFTInc( - "stale-matching-cost-jump-ft-inc", - cl::desc("The cost of increasing a fall-through jump's count by one."), - cl::init(100), cl::ReallyHidden, cl::cat(BoltOptCategory)); - cl::opt StaleMatchingCostJumpDec( "stale-matching-cost-jump-dec", - cl::desc("The cost of decreasing a jump's count by one."), cl::init(110), + cl::desc("The cost of decreasing a jump count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); -cl::opt StaleMatchingCostJumpFTDec( - "stale-matching-cost-jump-ft-dec", - cl::desc("The cost of decreasing a fall-through jump's count by one."), - cl::init(110), cl::ReallyHidden, cl::cat(BoltOptCategory)); +cl::opt StaleMatchingCostBlockUnknownInc( + "stale-matching-cost-block-unknown-inc", + cl::desc("The cost of increasing an unknown block count by one."), + cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory)); cl::opt StaleMatchingCostJumpUnknownInc( "stale-matching-cost-jump-unknown-inc", - cl::desc("The cost of increasing an unknown jump's count by one."), - cl::init(50), cl::ReallyHidden, cl::cat(BoltOptCategory)); + cl::desc("The cost of increasing an unknown jump count by one."), + cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory)); cl::opt StaleMatchingCostJumpUnknownFTInc( "stale-matching-cost-jump-unknown-ft-inc", cl::desc( - "The cost of increasing an unknown fall-through jump's count by one."), - cl::init(5), cl::ReallyHidden, cl::cat(BoltOptCategory)); + "The cost of increasing an unknown fall-through jump count by one."), + cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory)); } // namespace opts @@ -145,7 +120,8 @@ private: using ValueOffset = Bitfield::Element; using ValueOpcode = Bitfield::Element; using ValueInstr = Bitfield::Element; - using ValueNeighbor = Bitfield::Element; + using ValuePred = Bitfield::Element; + using ValueSucc = Bitfield::Element; public: explicit BlendedBlockHash() {} @@ -154,7 +130,8 @@ public: Offset = Bitfield::get(Hash); OpcodeHash = Bitfield::get(Hash); InstrHash = Bitfield::get(Hash); - NeighborHash = Bitfield::get(Hash); + PredHash = Bitfield::get(Hash); + SuccHash = Bitfield::get(Hash); } /// Combine the blended hash into uint64_t. @@ -163,7 +140,8 @@ public: Bitfield::set(Hash, Offset); Bitfield::set(Hash, OpcodeHash); Bitfield::set(Hash, InstrHash); - Bitfield::set(Hash, NeighborHash); + Bitfield::set(Hash, PredHash); + Bitfield::set(Hash, SuccHash); return Hash; } @@ -175,7 +153,8 @@ public: "incorrect blended hash distance computation"); uint64_t Dist = 0; // Account for NeighborHash - Dist += NeighborHash == BBH.NeighborHash ? 0 : 1; + Dist += SuccHash == BBH.SuccHash ? 0 : 1; + Dist += PredHash == BBH.PredHash ? 0 : 1; Dist <<= 16; // Account for InstrHash Dist += InstrHash == BBH.InstrHash ? 0 : 1; @@ -192,9 +171,10 @@ public: /// (Strong) Hash of the basic block instructions, including opcodes and /// operands. uint16_t InstrHash{0}; - /// Hash of the (loose) basic block together with (loose) hashes of its - /// successors and predecessors. - uint16_t NeighborHash{0}; + /// (Loose) Hashes of the predecessors of the basic block. + uint8_t PredHash{0}; + /// (Loose) Hashes of the successors of the basic block. + uint8_t SuccHash{0}; }; /// The object is used to identify and match basic blocks in a BinaryFunction @@ -252,41 +232,43 @@ void BinaryFunction::computeBlockHashes() const { std::vector BlendedHashes(BasicBlocks.size()); std::vector OpcodeHashes(BasicBlocks.size()); - // Initialize hash components + // Initialize hash components. for (size_t I = 0; I < BasicBlocks.size(); I++) { const BinaryBasicBlock *BB = BasicBlocks[I]; assert(BB->getIndex() == I && "incorrect block index"); BlendedHashes[I].Offset = BB->getOffset(); - // Hashing complete instructions + // Hashing complete instructions. std::string InstrHashStr = hashBlock( BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); }); uint64_t InstrHash = std::hash{}(InstrHashStr); - BlendedHashes[I].InstrHash = hash_64_to_16(InstrHash); - // Hashing opcodes - std::string OpcodeHashStr = - hashBlock(BC, *BB, [](const MCOperand &Op) { return std::string(); }); + BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash); + // Hashing opcodes. + std::string OpcodeHashStr = hashBlockLoose(BC, *BB); OpcodeHashes[I] = std::hash{}(OpcodeHashStr); - BlendedHashes[I].OpcodeHash = hash_64_to_16(OpcodeHashes[I]); + BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]); } - // Initialize neighbor hash + // Initialize neighbor hash. for (size_t I = 0; I < BasicBlocks.size(); I++) { const BinaryBasicBlock *BB = BasicBlocks[I]; - uint64_t Hash = OpcodeHashes[I]; - // Append hashes of successors + // Append hashes of successors. + uint64_t Hash = 0; for (BinaryBasicBlock *SuccBB : BB->successors()) { uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()]; Hash = hashing::detail::hash_16_bytes(Hash, SuccHash); } - // Append hashes of predecessors + BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash); + + // Append hashes of predecessors. + Hash = 0; for (BinaryBasicBlock *PredBB : BB->predecessors()) { uint64_t PredHash = OpcodeHashes[PredBB->getIndex()]; Hash = hashing::detail::hash_16_bytes(Hash, PredHash); } - BlendedHashes[I].NeighborHash = hash_64_to_16(Hash); + BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash); } - // Assign hashes + // Assign hashes. for (size_t I = 0; I < BasicBlocks.size(); I++) { const BinaryBasicBlock *BB = BasicBlocks[I]; BB->setHash(BlendedHashes[I].combine()); @@ -409,20 +391,22 @@ void matchWeightsByHashes(BinaryContext &BC, const FlowBlock *MatchedBlock = Matcher.matchBlock(YamlHash); if (MatchedBlock != nullptr) { MatchedBlocks[YamlBB.Index] = MatchedBlock; - LLVM_DEBUG(dbgs() << "Matched yaml block with bid = " << YamlBB.Index - << " and hash = " << Twine::utohexstr(YamlBB.Hash) - << " to BB with index = " << MatchedBlock->Index - 1 + BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1]; + LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBB.Index << ")" + << " with hash " << Twine::utohexstr(YamlBB.Hash) + << " to BB (index = " << MatchedBlock->Index - 1 << ")" + << " with hash " << Twine::utohexstr(BinHash.combine()) << "\n"); // Update matching stats accounting for the matched block. - BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1]; if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) { ++BC.Stats.NumMatchedBlocks; BC.Stats.MatchedSampleCount += YamlBB.ExecCount; + LLVM_DEBUG(dbgs() << " exact match\n"); } } else { LLVM_DEBUG( - dbgs() << "Couldn't match yaml block with bid = " << YamlBB.Index - << " and hash = " << Twine::utohexstr(YamlBB.Hash) << "\n"); + dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index << ")" + << " with hash " << Twine::utohexstr(YamlBB.Hash) << "\n"); } // Update matching stats. @@ -575,16 +559,15 @@ void applyInference(FlowFunction &Func) { Params.JoinIslands = opts::StaleMatchingJoinIslands; Params.CostBlockInc = opts::StaleMatchingCostBlockInc; + Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc; Params.CostBlockDec = opts::StaleMatchingCostBlockDec; - Params.CostBlockEntryInc = opts::StaleMatchingCostBlockEntryInc; - Params.CostBlockEntryDec = opts::StaleMatchingCostBlockEntryDec; - Params.CostBlockZeroInc = opts::StaleMatchingCostBlockZeroInc; + Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec; Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc; Params.CostJumpInc = opts::StaleMatchingCostJumpInc; - Params.CostJumpFTInc = opts::StaleMatchingCostJumpFTInc; + Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc; Params.CostJumpDec = opts::StaleMatchingCostJumpDec; - Params.CostJumpFTDec = opts::StaleMatchingCostJumpFTDec; + Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec; Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc; Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc; diff --git a/bolt/test/X86/Inputs/blarge_profile_stale.yaml b/bolt/test/X86/Inputs/blarge_profile_stale.yaml index afe76eda5485..f5abaed3da39 100644 --- a/bolt/test/X86/Inputs/blarge_profile_stale.yaml +++ b/bolt/test/X86/Inputs/blarge_profile_stale.yaml @@ -6,6 +6,7 @@ header: profile-flags: [ lbr ] profile-origin: branch profile reader profile-events: '' + dfs-order: false functions: - name: SolveCubic fid: 6 @@ -15,20 +16,24 @@ functions: blocks: - bid: 0 insns: 43 - hash: 0xD2411AC186118199 + hash: 0xed4db287e71c0000 exec: 151 - succ: [ { bid: 1, cnt: 4, mis: 2 }, { bid: 11, cnt: 0 } ] + succ: [ { bid: 1, cnt: 151, mis: 2 }, { bid: 7, cnt: 0 } ] - bid: 1 insns: 7 - hash: 0xDF0C9CC1FEAA70C3 - succ: [ { bid: 10, cnt: 0 }, { bid: 2, cnt: 0 } ] + hash: 0x39330000e4560088 + succ: [ { bid: 13, cnt: 151 }, { bid: 2, cnt: 0 } ] - bid: 13 insns: 26 - hash: 0xF05DC5524E99E56F - succ: [ { bid: 15, cnt: 89 }, { bid: 14, cnt: 0 } ] - - bid: 15 + hash: 0xa9700000fe202a7 + succ: [ { bid: 3, cnt: 89 }, { bid: 2, cnt: 10 } ] + - bid: 3 + insns: 9 + hash: 0x62391dad18a700a0 + succ: [ { bid: 5, cnt: 151 } ] + - bid: 5 insns: 9 - hash: 0xB2E8338276A9834E + hash: 0x4d906d19ecec0111 - name: usqrt fid: 7 hash: 0x8B62B1F9AD81EA35 @@ -37,15 +42,15 @@ functions: blocks: - bid: 0 insns: 4 - hash: 0xb1e5b76571270000 + hash: 0x1111111111111111 exec: 20 succ: [ { bid: 1, cnt: 0 } ] - bid: 1 insns: 9 - hash: 0x587e93788b970010 + hash: 0x27e43a5e10cd0010 succ: [ { bid: 3, cnt: 320, mis: 171 }, { bid: 2, cnt: 0 } ] - bid: 3 insns: 2 - hash: 0x20e605d745e50039 + hash: 0x4db935b6471e0039 succ: [ { bid: 1, cnt: 300, mis: 33 }, { bid: 4, cnt: 20 } ] ... diff --git a/bolt/test/X86/reader-stale-yaml.test b/bolt/test/X86/reader-stale-yaml.test index 3f9861d2b709..5231032f4f4a 100644 --- a/bolt/test/X86/reader-stale-yaml.test +++ b/bolt/test/X86/reader-stale-yaml.test @@ -1,39 +1,71 @@ # This script checks that YamlProfileReader in llvm-bolt is reading data -# correctly and stale data is corrected. +# correctly and stale data is corrected by profile inference. RUN: yaml2obj %p/Inputs/blarge.yaml &> %t.exe +# Testing "usqrt" RUN: llvm-bolt %t.exe -o /dev/null --b %p/Inputs/blarge_profile_stale.yaml \ RUN: --print-cfg --print-only=usqrt --infer-stale-profile=1 \ -RUN: --profile-ignore-hash=1 --profile-use-dfs 2>&1 | FileCheck %s +RUN: --profile-ignore-hash=1 --profile-use-dfs=0 2>&1 | FileCheck %s -check-prefix=CHECK1 +# Testing "SolveCubic" +RUN: llvm-bolt %t.exe -o /dev/null --b %p/Inputs/blarge_profile_stale.yaml \ +RUN: --print-cfg --print-only=SolveCubic --infer-stale-profile=1 \ +RUN: --profile-ignore-hash=1 --profile-use-dfs=0 2>&1 | FileCheck %s -check-prefix=CHECK2 + +# Function "usqrt" has stale profile, since the number of blocks in the profile +# (nblocks=6) does not match the size of the CFG in the binary. The entry +# block (bid=0) has an incorrect (missing) count, which should be inferred by +# the algorithm. # Verify that yaml reader works as expected. -CHECK: pre-processing profile using YAML profile reader +CHECK1: pre-processing profile using YAML profile reader +CHECK1: Binary Function "usqrt" after building cfg { +CHECK1: State : CFG constructed +CHECK1: Address : 0x401170 +CHECK1: Size : 0x43 +CHECK1: Section : .text +CHECK1: IsSimple : 1 +CHECK1: BB Count : 5 +CHECK1: Exec Count : 20 +CHECK1: Branch Count: 640 +CHECK1: } +# Verify block counts. +CHECK1: .LBB01 (4 instructions, align : 1) +CHECK1: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 20) +CHECK1: .Ltmp[[#BB13:]] (9 instructions, align : 1) +CHECK1: Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 320), .LFT[[#BB0:]] (mispreds: 0, count: 0) +CHECK1: .LFT[[#BB0:]] (2 instructions, align : 1) +CHECK1: Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 0) +CHECK1: .Ltmp[[#BB12:]] (2 instructions, align : 1) +CHECK1: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 300), .LFT[[#BB1:]] (mispreds: 0, count: 20) +CHECK1: .LFT[[#BB1:]] (2 instructions, align : 1) +# Check the overall inference stats. +CHECK1: 2 out of 7 functions in the binary (28.6%) have non-empty execution profile +CHECK1: inferred profile for 2 (100.00% of profiled, 100.00% of stale) functions responsible for {{.*}} samples ({{.*}} out of {{.*}}) -# Verify the inferred counts of "usqrt" that has stale profile: -# - the function has nblocks=6 in the profile, which makes it stale -# - block with bid=0 has an incorrect (missing) count, which is inferred -CHECK: Binary Function "usqrt" after building cfg { -CHECK: State : CFG constructed -CHECK: Address : 0x401170 -CHECK: Size : 0x43 -CHECK: Section : .text -CHECK: IsSimple : 1 -CHECK: BB Count : 5 -CHECK: Exec Count : 20 -CHECK: Branch Count: 640 -CHECK: } -# Verify block counts. -CHECK: .LBB01 (4 instructions, align : 1) -CHECK: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 20) -CHECK: .Ltmp[[#BB13:]] (9 instructions, align : 1) -CHECK: Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 320), .LFT[[#BB0:]] (mispreds: 0, count: 0) -CHECK: .LFT[[#BB0:]] (2 instructions, align : 1) -CHECK: Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 0) -CHECK: .Ltmp[[#BB12:]] (2 instructions, align : 1) -CHECK: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 300), .LFT[[#BB1:]] (mispreds: 0, count: 20) -CHECK: .LFT[[#BB1:]] (2 instructions, align : 1) +# Function "SolveCubic" has stale profile, since there is one jump in the +# profile (from bid=13 to bid=2) which is not in the CFG in the binary. The test +# verifies that the inference is able to match two blocks (bid=1 and bid=13) +# using "loose" hashes and then correctly propagate the counts. -# Check the overal inference stats. -CHECK: 2 out of 7 functions in the binary (28.6%) have non-empty execution profile -CHECK: inferred profile for 1 (50.00% of profiled, 100.00% of stale) functions responsible for 87.31% samples (640 out of 733) +CHECK2: pre-processing profile using YAML profile reader +CHECK2: Binary Function "SolveCubic" after building cfg { +CHECK2: State : CFG constructed +CHECK2: Address : 0x400e00 +CHECK2: Size : 0x368 +CHECK2: Section : .text +CHECK2: IsSimple : 1 +CHECK2: BB Count : 18 +CHECK2: Exec Count : 151 +CHECK2: Branch Count: 552 +# Verify block counts. +CHECK2: .LBB00 (43 instructions, align : 1) +CHECK2: Successors: .Ltmp[[#BB7:]] (mispreds: 0, count: 0), .LFT[[#BB1:]] (mispreds: 0, count: 151) +CHECK2: .LFT[[#BB1:]] (5 instructions, align : 1) +CHECK2: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 151), .LFT[[#BB2:]] (mispreds: 0, count: 0) +CHECK2: .Ltmp[[#BB3:]] (26 instructions, align : 1) +CHECK2: Successors: .Ltmp[[#BB5:]] (mispreds: 0, count: 151), .LFT[[#BB4:]] (mispreds: 0, count: 0) +CHECK2: .Ltmp[[#BB5:]] (9 instructions, align : 1) +CHECK2: .Ltmp[[#BB13:]] (12 instructions, align : 1) +CHECK2: Successors: .Ltmp[[#BB3:]] (mispreds: 0, count: 151) +CHECK2: 2 out of 7 functions in the binary (28.6%) have non-empty execution profile -- Gitee From 4a93af9e8c7448c7a7cdf3a40f9510afc6cf65be Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Thu, 31 Aug 2023 10:46:58 -0700 Subject: [PATCH 25/94] [Backport][BOLT][test] Move asm-dump.c to runtime/X86 Since the test executes instrumented version of the binary, move it under runtime/X86. Note that it can be adjusted to also run under AArch64 now that instrumentation is supported. Reviewed By: #bolt, maksfb Differential Revision: https://reviews.llvm.org/D159298 --- bolt/test/{ => runtime}/X86/asm-dump.c | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename bolt/test/{ => runtime}/X86/asm-dump.c (100%) diff --git a/bolt/test/X86/asm-dump.c b/bolt/test/runtime/X86/asm-dump.c similarity index 100% rename from bolt/test/X86/asm-dump.c rename to bolt/test/runtime/X86/asm-dump.c -- Gitee From a5211e51aea9a53f5840105e29cd5aedf6c142a2 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 23 Aug 2023 16:41:58 -0700 Subject: [PATCH 26/94] [Backport][BOLT] Give precedence to first AddressMap entries When parsing AddressMap and there is a conflict in keys, where two entries share the same key, consider the first entry as the correct one, instead of the last. This matches previous behavior in BOLT and covers case such as BOLT creating a new basic block but sharing the same input offset of the previous (or entry) basic block. In this case, instead of translating debuginfo to use the newly created BB, translate using the BB that was originally read from input. This will increase our chances of getting debuginfo right. Tested via binary comparison in tests: X86/dwarf4-df-input-lowpc-ranges.test X86/dwarf5-df-input-lowpc-ranges.test Reviewed By: #bolt, maksfb, jobnoorman Differential Revision: https://reviews.llvm.org/D158686 --- bolt/lib/Core/AddressMap.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bolt/lib/Core/AddressMap.cpp b/bolt/lib/Core/AddressMap.cpp index 76c5378a3eb1..c5f628d87864 100644 --- a/bolt/lib/Core/AddressMap.cpp +++ b/bolt/lib/Core/AddressMap.cpp @@ -52,7 +52,8 @@ AddressMap AddressMap::parse(StringRef Buffer, const BinaryContext &BC) { while (Cursor && !DE.eof(Cursor)) { const auto Input = DE.getAddress(Cursor); const auto Output = DE.getAddress(Cursor); - Parsed.Map.insert({Input, Output}); + if (!Parsed.Map.count(Input)) + Parsed.Map.insert({Input, Output}); } assert(Cursor && "Error reading address map section"); -- Gitee From f5d0ced58e65b673f70f0f363f10c7746e9a132e Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Fri, 1 Sep 2023 09:08:49 +0200 Subject: [PATCH 27/94] [Backport][BOLT] Provide generic implementations for isLoad/isStore `MCInstrDesc` provides the `mayLoad` and `mayStore` flags that seem appropriate to use as a target-independent way to implement `isLoad` and `isStore`. I believe this is currently good enough to use for the RISC-V target as well. I've provided a test for this that checks the generated dyno stats (which seems to be the only thing both `isLoad` and `isStore` are used for). Reviewed By: maksfb Differential Revision: https://reviews.llvm.org/D159266 --- bolt/include/bolt/Core/MCPlusBuilder.h | 6 ++---- bolt/test/RISCV/load-store.s | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 bolt/test/RISCV/load-store.s diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 4fef36103ca8..880378523c04 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -614,13 +614,11 @@ public: virtual bool isMoveMem2Reg(const MCInst &Inst) const { return false; } virtual bool isLoad(const MCInst &Inst) const { - llvm_unreachable("not implemented"); - return false; + return Info->get(Inst.getOpcode()).mayLoad(); } virtual bool isStore(const MCInst &Inst) const { - llvm_unreachable("not implemented"); - return false; + return Info->get(Inst.getOpcode()).mayStore(); } virtual bool isCleanRegXOR(const MCInst &Inst) const { diff --git a/bolt/test/RISCV/load-store.s b/bolt/test/RISCV/load-store.s new file mode 100644 index 000000000000..5a9785571c80 --- /dev/null +++ b/bolt/test/RISCV/load-store.s @@ -0,0 +1,16 @@ +// RUN: %clang %cflags -o %t %s +// RUN: link_fdata --no-lbr %s %t %t.fdata +// RUN: llvm-bolt %t -o /dev/null --data=%t.fdata --dyno-stats | FileCheck %s + +// CHECK: BOLT-INFO: program-wide dynostats after all optimizations before SCTC and FOP (no change): +// CHECK: 3000 : executed instructions +// CHECK: 1000 : executed load instructions +// CHECK: 1000 : executed store instructions + + .globl _start +_start: +# FDATA: 1 _start #_start# 1 + ld t0, (gp) + sd t0, (gp) + ret + .size _start, .-_start -- Gitee From 5a2af8cbf5990eb66fd3dd3fc7d5016559cf23d0 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Fri, 1 Sep 2023 09:33:02 +0200 Subject: [PATCH 28/94] [Backport][BOLT] Rename isLoad/isStore to mayLoad/mayStore As discussed in D159266, for some instructions it's impossible to know statically if they will load/store (e.g., predicated instructions). Therefore, mayLoad/mayStore are more appropriate names. --- bolt/include/bolt/Core/MCPlusBuilder.h | 4 ++-- bolt/lib/Core/DynoStats.cpp | 4 ++-- bolt/lib/Passes/ShrinkWrapping.cpp | 2 +- bolt/lib/Passes/StokeInfo.cpp | 2 +- bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 8 ++++---- bolt/lib/Target/X86/X86MCPlusBuilder.cpp | 8 ++++---- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 880378523c04..e7b6c8e3a747 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -613,11 +613,11 @@ public: virtual bool isMoveMem2Reg(const MCInst &Inst) const { return false; } - virtual bool isLoad(const MCInst &Inst) const { + virtual bool mayLoad(const MCInst &Inst) const { return Info->get(Inst.getOpcode()).mayLoad(); } - virtual bool isStore(const MCInst &Inst) const { + virtual bool mayStore(const MCInst &Inst) const { return Info->get(Inst.getOpcode()).mayStore(); } diff --git a/bolt/lib/Core/DynoStats.cpp b/bolt/lib/Core/DynoStats.cpp index ee40eefd6f7c..5dd55e13e5b3 100644 --- a/bolt/lib/Core/DynoStats.cpp +++ b/bolt/lib/Core/DynoStats.cpp @@ -215,10 +215,10 @@ DynoStats getDynoStats(BinaryFunction &BF) { } } - if (BC.MIB->isStore(Instr)) { + if (BC.MIB->mayStore(Instr)) { Stats[DynoStats::STORES] += BBExecutionCount; } - if (BC.MIB->isLoad(Instr)) { + if (BC.MIB->mayLoad(Instr)) { Stats[DynoStats::LOADS] += BBExecutionCount; } if (!BC.MIB->isCall(Instr)) diff --git a/bolt/lib/Passes/ShrinkWrapping.cpp b/bolt/lib/Passes/ShrinkWrapping.cpp index cdf38e35ee87..17f169cc332b 100644 --- a/bolt/lib/Passes/ShrinkWrapping.cpp +++ b/bolt/lib/Passes/ShrinkWrapping.cpp @@ -1960,7 +1960,7 @@ bool ShrinkWrapping::perform(bool HotOnly) { for (const auto &Instr : *BB) { if (BC.MIB->isPseudo(Instr)) continue; - if (BC.MIB->isStore(Instr)) + if (BC.MIB->mayStore(Instr)) TotalStoreInstrs += BBExecCount; TotalInstrs += BBExecCount; } diff --git a/bolt/lib/Passes/StokeInfo.cpp b/bolt/lib/Passes/StokeInfo.cpp index cbd2c3c7a1a1..57e5a08113dd 100644 --- a/bolt/lib/Passes/StokeInfo.cpp +++ b/bolt/lib/Passes/StokeInfo.cpp @@ -75,7 +75,7 @@ void StokeInfo::checkInstr(const BinaryFunction &BF, StokeFuncInfo &FuncInfo) { if (IsPush) FuncInfo.StackOut = true; - if (MIB->isStore(It) && !IsPush && !IsRipAddr) + if (MIB->mayStore(It) && !IsPush && !IsRipAddr) FuncInfo.HeapOut = true; if (IsRipAddr) diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index db9dfaea4ed6..6623f9f8e0a3 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -268,12 +268,12 @@ public: Inst.getOpcode() == AArch64::LDRXui); } - bool isLoad(const MCInst &Inst) const override { + bool mayLoad(const MCInst &Inst) const override { return isLDRB(Inst) || isLDRH(Inst) || isLDRW(Inst) || isLDRX(Inst); } bool isLoadFromStack(const MCInst &Inst) const { - if (!isLoad(Inst)) + if (!mayLoad(Inst)) return false; for (const MCOperand &Operand : useOperands(Inst)) { if (!Operand.isReg()) @@ -680,7 +680,7 @@ public: PCRelBase = DefBaseAddr; // Match LOAD to load the jump table (relative) target const MCInst *DefLoad = UsesAdd[2]; - assert(isLoad(*DefLoad) && + assert(mayLoad(*DefLoad) && "Failed to match indirect branch load pattern! (1)"); assert((ScaleValue != 1LL || isLDRB(*DefLoad)) && "Failed to match indirect branch load pattern! (2)"); @@ -1013,7 +1013,7 @@ public: return true; } - bool isStore(const MCInst &Inst) const override { return false; } + bool mayStore(const MCInst &Inst) const override { return false; } bool createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx, bool IsTailCall) override { diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index d3d371d8881e..4cb9d61710d1 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -350,7 +350,7 @@ public: } } - bool isLoad(const MCInst &Inst) const override { + bool mayLoad(const MCInst &Inst) const override { if (isPop(Inst)) return true; @@ -363,7 +363,7 @@ public: return MCII.mayLoad(); } - bool isStore(const MCInst &Inst) const override { + bool mayStore(const MCInst &Inst) const override { if (isPush(Inst)) return true; @@ -1755,7 +1755,7 @@ public: // - Non-stack loads are prohibited (generally unsafe) // - Stack loads are OK if AllowStackMemOp is true // - Stack loads with RBP are OK if AllowBasePtrStackMemOp is true - if (isLoad(Inst)) { + if (mayLoad(Inst)) { // If stack memory operands are not allowed, no loads are allowed if (!AllowStackMemOp) return false; @@ -2190,7 +2190,7 @@ public: MCInst &CurInst = *Itr++; const MCInstrDesc &Desc = Info->get(CurInst.getOpcode()); if (Desc.hasDefOfPhysReg(CurInst, MethodRegNum, *RegInfo)) { - if (!isLoad(CurInst)) + if (!mayLoad(CurInst)) return false; if (std::optional MO = evaluateX86MemoryOperand(CurInst)) { -- Gitee From 43cca68f731b553eb8463d640425545b2e17e3cc Mon Sep 17 00:00:00 2001 From: Elvina Yakubova Date: Fri, 25 Aug 2023 16:02:16 +0300 Subject: [PATCH 29/94] [Backport][BOLT][test] Enable exceptions_split tests for AArch64 Since the issue with trap value is fixed in D158191, it now should pass on both platforms. Reviewed By: maksfb Differential Revision: https://reviews.llvm.org/D158899 --- .../{X86 => }/Inputs/exceptions_split.cpp | 16 +++++----------- .../{X86 => }/exceptions-instrumentation.test | 0 .../runtime/{X86 => }/pie-exceptions-split.test | 4 ++-- 3 files changed, 7 insertions(+), 13 deletions(-) rename bolt/test/runtime/{X86 => }/Inputs/exceptions_split.cpp (85%) rename bolt/test/runtime/{X86 => }/exceptions-instrumentation.test (100%) rename bolt/test/runtime/{X86 => }/pie-exceptions-split.test (95%) diff --git a/bolt/test/runtime/X86/Inputs/exceptions_split.cpp b/bolt/test/runtime/Inputs/exceptions_split.cpp similarity index 85% rename from bolt/test/runtime/X86/Inputs/exceptions_split.cpp rename to bolt/test/runtime/Inputs/exceptions_split.cpp index 2c136b9a1cf5..de81adf7583c 100644 --- a/bolt/test/runtime/X86/Inputs/exceptions_split.cpp +++ b/bolt/test/runtime/Inputs/exceptions_split.cpp @@ -3,31 +3,25 @@ // // Record performance data with no args. Run test with 2 args. -#include #include +#include -int foo() -{ - return 0; -} +int foo() { return 0; } void bar(int a) { if (a > 2 && a % 2) throw new int(); } -void filter_only(){ - foo(); -} +void filter_only() { foo(); } -int main(int argc, char **argv) -{ +int main(int argc, char **argv) { unsigned r = 0; uint64_t limit = (argc >= 2 ? 10 : 5000); for (uint64_t i = 0; i < limit; ++i) { i += foo(); - try { + try { bar(argc); try { if (argc >= 2) diff --git a/bolt/test/runtime/X86/exceptions-instrumentation.test b/bolt/test/runtime/exceptions-instrumentation.test similarity index 100% rename from bolt/test/runtime/X86/exceptions-instrumentation.test rename to bolt/test/runtime/exceptions-instrumentation.test diff --git a/bolt/test/runtime/X86/pie-exceptions-split.test b/bolt/test/runtime/pie-exceptions-split.test similarity index 95% rename from bolt/test/runtime/X86/pie-exceptions-split.test rename to bolt/test/runtime/pie-exceptions-split.test index 124fef60fd2d..30f2d02bc9e1 100644 --- a/bolt/test/runtime/X86/pie-exceptions-split.test +++ b/bolt/test/runtime/pie-exceptions-split.test @@ -16,9 +16,9 @@ RUN: --print-only=main 2>&1 | FileCheck %s ## All calls to printf() should be from exception handling code that was ## recorded as cold during the profile collection run. Check that the calls ## are placed after the split point. -CHECK-NOT: callq printf +CHECK-NOT: printf CHECK: HOT-COLD SPLIT POINT -CHECK: callq printf +CHECK: printf ## Verify the output still executes correctly when the exception path is being ## taken. -- Gitee From 98cd2e9ec30635cdaf552ac0b6dd1dd6b41c401d Mon Sep 17 00:00:00 2001 From: Elvina Yakubova Date: Thu, 7 Sep 2023 23:54:53 +0300 Subject: [PATCH 30/94] [Backport][BOLT][test] Fix cross-compilation tests after D151920 Fix tests that are failing in cross-compilation after D151920 (https://lab.llvm.org/buildbot/#/builders/221/builds/17715): - instrumentation-ind-call, basic-instrumentation: add -mno-outline-atomics flag to runtime lib - bolt-address-translation-internal-call, internal-call-instrument: add %cflags - meta-merge-fdata: restrict to x86_64 Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D159094 --- bolt/runtime/CMakeLists.txt | 3 +++ ...olt-address-translation-internal-call.test | 7 +++++- .../test/X86/instrumentation-eh_frame_hdr.cpp | 2 +- bolt/test/X86/internal-call-instrument.s | 24 +++++++++++++++++-- bolt/test/runtime/meta-merge-fdata.test | 2 +- 5 files changed, 33 insertions(+), 5 deletions(-) diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt index 191d2b895b92..fadc4e79856d 100644 --- a/bolt/runtime/CMakeLists.txt +++ b/bolt/runtime/CMakeLists.txt @@ -32,6 +32,9 @@ set(BOLT_RT_FLAGS if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-sse") endif() +if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-outline-atomics") +endif() if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") include(CheckCXXCompilerFlag) diff --git a/bolt/test/X86/bolt-address-translation-internal-call.test b/bolt/test/X86/bolt-address-translation-internal-call.test index e24a9e6dc1c2..24cb635e13e9 100644 --- a/bolt/test/X86/bolt-address-translation-internal-call.test +++ b/bolt/test/X86/bolt-address-translation-internal-call.test @@ -9,7 +9,7 @@ # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o # Delete our BB symbols so BOLT doesn't mark them as entry points # RUN: llvm-strip --strip-unneeded %t.o -# RUN: %clang %t.o -o %t.exe -Wl,-q +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q # RUN: llvm-bolt --enable-bat %t.exe --relocs -o %t.out | FileCheck %s # CHECK: BOLT-INFO: Wrote {{.*}} BAT maps @@ -29,6 +29,7 @@ main: push %rbx sub $0x120,%rsp mov $0x3,%rbx + movq rel(%rip), %rdi .J1: cmp $0x0,%rbx je .J2 @@ -49,4 +50,8 @@ main: .J4: pop %rbp retq +end: .size main, .-main + + .data +rel: .quad end diff --git a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp index f6ebd6b76f60..4ed8be42cd0f 100644 --- a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp +++ b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp @@ -1,7 +1,7 @@ // This test checks that .eh_frame_hdr address is in bounds of the last LOAD // end address i.e. the section address is smaller then the LOAD end address. -// REQUIRES: system-linux,bolt-runtime +// REQUIRES: system-linux,bolt-runtime,target=x86_64{{.*}} // RUN: %clangxx %cxxflags -static -Wl,-q %s -o %t.exe -Wl,--entry=_start // RUN: llvm-bolt %t.exe -o %t.instr -instrument \ diff --git a/bolt/test/X86/internal-call-instrument.s b/bolt/test/X86/internal-call-instrument.s index 7ddfb4fb812d..c393f1dac864 100644 --- a/bolt/test/X86/internal-call-instrument.s +++ b/bolt/test/X86/internal-call-instrument.s @@ -1,15 +1,23 @@ # This reproduces a bug with instrumentation crashes on internal call -# REQUIRES: x86_64-linux,bolt-runtime +# REQUIRES: x86_64-linux,bolt-runtime,target=x86_64{{.*}} # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o # Delete our BB symbols so BOLT doesn't mark them as entry points # RUN: llvm-strip --strip-unneeded %t.o -# RUN: %clang %t.o -o %t.exe -Wl,-q +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q # RUN: llvm-bolt --instrument %t.exe --relocs -o %t.out .text + .globl _start + .type _start, %function + .p2align 4 +_start: + call main + ret + .size _start, .-_start + .globl main .type main, %function .p2align 4 @@ -20,6 +28,7 @@ main: push %rbx sub $0x120,%rsp mov $0x3,%rbx + movq rel(%rip), %rdi .J1: cmp $0x0,%rbx je .J2 @@ -40,4 +49,15 @@ main: .J4: pop %rbp retq +end: .size main, .-main + + .globl _fini + .type _fini, %function + .p2align 4 +_fini: + hlt + .size _fini, .-_fini + + .data +rel: .quad end diff --git a/bolt/test/runtime/meta-merge-fdata.test b/bolt/test/runtime/meta-merge-fdata.test index 5592e65b0928..6972e75c64de 100644 --- a/bolt/test/runtime/meta-merge-fdata.test +++ b/bolt/test/runtime/meta-merge-fdata.test @@ -1,7 +1,7 @@ # Meta test using merge-fdata binary UNSUPPORTED: asan # Instrumentation currently only works on X86 -REQUIRES: bolt-runtime +REQUIRES: x86_64-linux,bolt-runtime # Instrumentation, should test: # - Direct branches -- Gitee From b57259f41c17ed97ea12e926d0a61f2dc3545e8c Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Thu, 7 Sep 2023 17:09:45 -0700 Subject: [PATCH 31/94] [Backport][BOLT][test] Fix patch-entries for aarch64 buildbot (#65690) --- bolt/test/X86/Inputs/patch-entries.c | 8 ++++++++ bolt/test/X86/patch-entries.c | 19 ------------------- bolt/test/X86/patch-entries.test | 10 ++++++++++ 3 files changed, 18 insertions(+), 19 deletions(-) create mode 100644 bolt/test/X86/Inputs/patch-entries.c delete mode 100644 bolt/test/X86/patch-entries.c create mode 100644 bolt/test/X86/patch-entries.test diff --git a/bolt/test/X86/Inputs/patch-entries.c b/bolt/test/X86/Inputs/patch-entries.c new file mode 100644 index 000000000000..46a3b41b048e --- /dev/null +++ b/bolt/test/X86/Inputs/patch-entries.c @@ -0,0 +1,8 @@ +#include "stub.h" + +static void foo() { printf("foo\n"); } + +int main() { + foo(); + return 0; +} diff --git a/bolt/test/X86/patch-entries.c b/bolt/test/X86/patch-entries.c deleted file mode 100644 index d435781fc60f..000000000000 --- a/bolt/test/X86/patch-entries.c +++ /dev/null @@ -1,19 +0,0 @@ -// Checking crashes against injected binary functions created by patch -// entries pass and debug info turned on. In these cases, we were -// trying to fetch input to output maps on injected functions and -// crashing. - -// REQUIRES: system-linux - -// RUN: %clang %cflags -no-pie -g %s -fuse-ld=lld -o %t.exe -Wl,-q -// RUN: llvm-bolt -relocs %t.exe -o %t.out --update-debug-sections \ -// RUN: --force-patch - -#include - -static void foo() { printf("foo\n"); } - -int main() { - foo(); - return 0; -} diff --git a/bolt/test/X86/patch-entries.test b/bolt/test/X86/patch-entries.test new file mode 100644 index 000000000000..54f358f273e7 --- /dev/null +++ b/bolt/test/X86/patch-entries.test @@ -0,0 +1,10 @@ +# Checking crashes against injected binary functions created by patch +# entries pass and debug info turned on. In these cases, we were +# trying to fetch input to output maps on injected functions and +# crashing. + +REQUIRES: system-linux + +RUN: %clang %cflags -no-pie -g %p/Inputs/patch-entries.c -fuse-ld=lld -o %t.exe \ +RUN: -Wl,-q -I%p/../Inputs +RUN: llvm-bolt -relocs %t.exe -o %t.out --update-debug-sections --force-patch -- Gitee From 5a8f9a2f33f37987ffce444436ca8335e98aeae1 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 8 Sep 2023 08:42:45 +0100 Subject: [PATCH 32/94] [Backport][bolt][X86] Correct 2 test RUN lines (#65252) One had an extra ;, which is odd but harmless. The other was missing ":" after RUN. --- bolt/test/X86/dwarf4-df-dualcu.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/test/X86/dwarf4-df-dualcu.test b/bolt/test/X86/dwarf4-df-dualcu.test index 71726136d7ca..c8135ac54377 100644 --- a/bolt/test/X86/dwarf4-df-dualcu.test +++ b/bolt/test/X86/dwarf4-df-dualcu.test @@ -1,7 +1,7 @@ ; RUN: rm -rf %t ; RUN: mkdir %t ; RUN: cd %t -;; RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-df-dualcu-main.s \ +; RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-df-dualcu-main.s \ ; RUN: -split-dwarf-file=main.dwo -o main.o ; RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-df-dualcu-helper.s \ ; RUN: -split-dwarf-file=helper.dwo -o helper.o @@ -12,7 +12,7 @@ ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe.bolt >> %t/foo.txt ; RUN: cat %t/foo.txt | FileCheck -check-prefix=BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.dwo &> maindwo.txt -; RUN cat maindwo.txt | FileCheck -check-prefix=PRE-BOLT-DWO-MAIN %s +; RUN: cat maindwo.txt | FileCheck -check-prefix=PRE-BOLT-DWO-MAIN %s ; RUN: not llvm-dwarfdump --show-form --verbose --debug-info main.dwo.dwo &> mainddwodwo.txt ; RUN: cat mainddwodwo.txt | FileCheck -check-prefix=BOLT-DWO-MAIN %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-info helper.dwo &> helperdwo.txt -- Gitee From f7b54dacf4fddde64751fefa22f987cbb0c39887 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Fri, 8 Sep 2023 11:35:03 -0700 Subject: [PATCH 33/94] [Backport][BOLT] Always match stale entry blocks Two (minor) improvements for stale matching: - always match entry blocks to each other, even if there is a hash mismatch; - ignore nops in (loose) hash computation. I record a small improvement in inference quality on my benchmarks. Tests are not affected Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D159488 --- bolt/lib/Core/HashUtilities.cpp | 3 ++- bolt/lib/Profile/StaleProfileMatching.cpp | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/bolt/lib/Core/HashUtilities.cpp b/bolt/lib/Core/HashUtilities.cpp index 0fc72be888be..88f01e4f936d 100644 --- a/bolt/lib/Core/HashUtilities.cpp +++ b/bolt/lib/Core/HashUtilities.cpp @@ -139,7 +139,8 @@ std::string hashBlockLoose(BinaryContext &BC, const BinaryBasicBlock &BB) { // instruction opcodes, which is then hashed with std::hash. std::set Opcodes; for (const MCInst &Inst : BB) { - if (BC.MIB->isPseudo(Inst)) + // Skip pseudo instructions and nops. + if (BC.MIB->isPseudo(Inst) || BC.MIB->isNoop(Inst)) continue; // Ignore unconditional jumps, as they can be added / removed as a result diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 3b43ed64bd91..d00bf87ffc8a 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -389,6 +389,9 @@ void matchWeightsByHashes(BinaryContext &BC, assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile"); BlendedBlockHash YamlHash(YamlBB.Hash); const FlowBlock *MatchedBlock = Matcher.matchBlock(YamlHash); + // Always match the entry block. + if (MatchedBlock == nullptr && YamlBB.Index == 0) + MatchedBlock = Blocks[0]; if (MatchedBlock != nullptr) { MatchedBlocks[YamlBB.Index] = MatchedBlock; BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1]; -- Gitee From f94a75cb4b3a462ec483dc448ad2d7983e46d5b8 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Tue, 12 Sep 2023 13:44:55 +0200 Subject: [PATCH 34/94] [Backport][BOLT] Prevent adding secondary entry points for BB labels When linker relaxation is enabled on RISC-V, every branch has a relocation and a corresponding symbol in the symbol table. BOLT currently registers all these symbols as secondary entry points causing almost every function to be marked as multi entry on RISC-V. This patch modifies `adjustFunctionBoundaries` to ignore these symbols. Note that I currently try to detect them by checking if a symbol's name starts with the private label prefix as defined by `MCAsmInfo`. Since I'm not entirely sure what multi-entry functions look like on different targets, please check if this condition is correct. Maybe it could make sense to only check this on RISC-V? Reviewed By: maksfb Differential Revision: https://reviews.llvm.org/D159285 --- bolt/lib/Rewrite/RewriteInstance.cpp | 10 ++++++++++ bolt/test/RISCV/branch-no-secondary-entry.s | 18 ++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 bolt/test/RISCV/branch-no-secondary-entry.s diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 94856edef642..d09316747127 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -1586,6 +1586,16 @@ void RewriteInstance::adjustFunctionBoundaries() { if (!Function.isSymbolValidInScope(Symbol, SymbolSize)) break; + // Skip basic block labels. This happens on RISC-V with linker relaxation + // enabled because every branch needs a relocation and corresponding + // symbol. We don't want to add such symbols as entry points. + const auto PrivateLabelPrefix = BC->AsmInfo->getPrivateLabelPrefix(); + if (!PrivateLabelPrefix.empty() && + cantFail(Symbol.getName()).starts_with(PrivateLabelPrefix)) { + ++NextSymRefI; + continue; + } + // This is potentially another entry point into the function. uint64_t EntryOffset = NextSymRefI->first - Function.getAddress(); LLVM_DEBUG(dbgs() << "BOLT-DEBUG: adding entry point to function " diff --git a/bolt/test/RISCV/branch-no-secondary-entry.s b/bolt/test/RISCV/branch-no-secondary-entry.s new file mode 100644 index 000000000000..bf8191f25744 --- /dev/null +++ b/bolt/test/RISCV/branch-no-secondary-entry.s @@ -0,0 +1,18 @@ +/// Test that no secondary entry points are created for basic block labels used +/// by branches. +// RUN: %clang %cflags -o %t %s +// RUN: llvm-bolt -print-cfg -o /dev/null %t 2>&1 | FileCheck %s + +// CHECK: Binary Function "_start" after building cfg { +// CHECK: IsMultiEntry: 0 +// CHECK: beq t0, t1, .Ltmp0 +// CHECK: {{^}}.Ltmp0 +// CHECK: ret + + .globl _start +_start: + beq t0, t1, 1f +1: + ret + .size _start, .-_start + -- Gitee From e747ac2c8ba3dcbe762534cbca6c084845c6bd97 Mon Sep 17 00:00:00 2001 From: zhoujing Date: Fri, 15 Sep 2023 21:34:59 +0800 Subject: [PATCH 35/94] [Backport][BOLT] Fix deadloop bug in taildup The intent is clearly to push the tail rather than current BB. Reviewed By: maksfb Differential Revision: https://reviews.llvm.org/D159289 --- bolt/lib/Passes/TailDuplication.cpp | 2 +- bolt/test/X86/tail-duplication-pass.s | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/bolt/lib/Passes/TailDuplication.cpp b/bolt/lib/Passes/TailDuplication.cpp index c04efd759bf3..7141d5d99aa6 100644 --- a/bolt/lib/Passes/TailDuplication.cpp +++ b/bolt/lib/Passes/TailDuplication.cpp @@ -303,7 +303,7 @@ TailDuplication::aggressiveDuplicate(BinaryBasicBlock &BB, if (isInCacheLine(BB, Tail)) return BlocksToDuplicate; - BinaryBasicBlock *CurrBB = &BB; + BinaryBasicBlock *CurrBB = &Tail; while (CurrBB) { LLVM_DEBUG(dbgs() << "Aggressive tail duplication: adding " << CurrBB->getName() << " to duplication list\n";); diff --git a/bolt/test/X86/tail-duplication-pass.s b/bolt/test/X86/tail-duplication-pass.s index 677f4986eb89..ed50cc5227d8 100644 --- a/bolt/test/X86/tail-duplication-pass.s +++ b/bolt/test/X86/tail-duplication-pass.s @@ -7,12 +7,21 @@ # RUN: llvm-bolt %t.exe --data %t.fdata --reorder-blocks=ext-tsp \ # RUN: --print-finalized --tail-duplication=moderate \ # RUN: --tail-duplication-minimum-offset=1 -o %t.out | FileCheck %s +# RUN: llvm-bolt %t.exe --data %t.fdata --print-finalized \ +# RUN: --tail-duplication=aggressive --tail-duplication-minimum-offset=1 \ +# RUN: -o %t.out | FileCheck %s --check-prefix CHECK-NOLOOP # FDATA: 1 main 2 1 main #.BB2# 0 10 # FDATA: 1 main 4 1 main #.BB2# 0 20 # CHECK: BOLT-INFO: tail duplication modified 1 ({{.*}}%) functions; duplicated 1 blocks (1 bytes) responsible for {{.*}} dynamic executions ({{.*}}% of all block executions) # CHECK: BB Layout : .LBB00, .Ltail-dup0, .Ltmp0, .Ltmp1 +# Check that the successor of Ltail-dup0 is .LBB00, not itself. +# CHECK-NOLOOP: .Ltail-dup0 (1 instructions, align : 1) +# CHECK-NOLOOP: Predecessors: .LBB00 +# CHECK-NOLOOP: retq +# CHECK-NOLOOP: .Ltmp0 (1 instructions, align : 1) + .text .globl main .type main, %function -- Gitee From b0a947a55b60c9748814a153a7f738d8c500501a Mon Sep 17 00:00:00 2001 From: zhoujiapeng Date: Sun, 17 Sep 2023 00:07:14 +0800 Subject: [PATCH 36/94] [Backport][BOLT] Skip the validation of CFG after it is finalized When current state is `CFG_Finalized`, function `validateCFG()` should return true directly. Reviewed By: maksfb, yota9, Kepontry Differential Revision: https://reviews.llvm.org/D159410 --- bolt/lib/Core/BinaryFunction.cpp | 4 ++++ bolt/test/verify-cfg.test | 8 ++++++++ 2 files changed, 12 insertions(+) create mode 100644 bolt/test/verify-cfg.test diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 80470fbef558..5905c10afe82 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -3156,6 +3156,10 @@ void BinaryFunction::dumpGraphToFile(std::string Filename) const { } bool BinaryFunction::validateCFG() const { + // Skip the validation of CFG after it is finalized + if (CurrentState == State::CFG_Finalized) + return true; + bool Valid = true; for (BinaryBasicBlock *BB : BasicBlocks) Valid &= BB->validateSuccessorInvariants(); diff --git a/bolt/test/verify-cfg.test b/bolt/test/verify-cfg.test new file mode 100644 index 000000000000..4a7de85cd427 --- /dev/null +++ b/bolt/test/verify-cfg.test @@ -0,0 +1,8 @@ +# Verify if the `--verify-cfg` option might produce incorrect alerts. + +REQUIRES: system-linux + +RUN: %clang %cflags %p/Inputs/hello.c -o %t -Wl,-q +RUN: llvm-bolt %t -o %t.bolt --verify-cfg 2>&1 | FileCheck %s + +CHECK-NOT: BOLT-ERROR: Invalid CFG detected after pass {{.*}} -- Gitee From ce898bf96a544cc17497937875e5951b79c50178 Mon Sep 17 00:00:00 2001 From: zhoujiapeng Date: Sun, 17 Sep 2023 00:12:16 +0800 Subject: [PATCH 37/94] [Backport][BOLT] Incorporate umask into the output file permission Fix https://github.com/llvm/llvm-project/issues/65061 Reviewed By: maksfb, Amir Differential Revision: https://reviews.llvm.org/D159407 --- bolt/lib/Rewrite/MachORewriteInstance.cpp | 6 ++++-- bolt/lib/Rewrite/RewriteInstance.cpp | 5 ++++- bolt/test/permission.test | 13 +++++++++++++ 3 files changed, 21 insertions(+), 3 deletions(-) create mode 100644 bolt/test/permission.test diff --git a/bolt/lib/Rewrite/MachORewriteInstance.cpp b/bolt/lib/Rewrite/MachORewriteInstance.cpp index 8214cade8280..b827a196c826 100644 --- a/bolt/lib/Rewrite/MachORewriteInstance.cpp +++ b/bolt/lib/Rewrite/MachORewriteInstance.cpp @@ -564,8 +564,10 @@ void MachORewriteInstance::rewriteFile() { writeInstrumentationSection("I__literal16", OS); Out->keep(); - EC = sys::fs::setPermissions(opts::OutputFilename, - sys::fs::perms::all_all); + EC = sys::fs::setPermissions( + opts::OutputFilename, + static_cast(sys::fs::perms::all_all & + ~sys::fs::getUmask())); check_error(EC, "cannot set permissions of output file"); } diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index d09316747127..748fea91eeb2 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -5367,7 +5367,10 @@ void RewriteInstance::rewriteFile() { } Out->keep(); - EC = sys::fs::setPermissions(opts::OutputFilename, sys::fs::perms::all_all); + EC = sys::fs::setPermissions( + opts::OutputFilename, + static_cast(sys::fs::perms::all_all & + ~sys::fs::getUmask())); check_error(EC, "cannot set permissions of output file"); } diff --git a/bolt/test/permission.test b/bolt/test/permission.test new file mode 100644 index 000000000000..a5a98599eb83 --- /dev/null +++ b/bolt/test/permission.test @@ -0,0 +1,13 @@ +# Ensure that the permissions of the optimized binary file comply with the +# system's umask. + +# This test performs a logical AND operation on the results of the `stat -c %a +# %t.bolt` and `umask` commands (both results are displayed in octal), and +# checks whether the result is equal to 0. +REQUIRES: system-linux + +RUN: %clang %cflags %p/Inputs/hello.c -o %t -Wl,-q +RUN: llvm-bolt %t -o %t.bolt +RUN: echo $(( 8#$(stat -c %a %t.bolt) & 8#$(umask) )) | FileCheck %s + +CHECK: 0 -- Gitee From 6b14e03c51b8e31a4e77ad9dcad2e68fb3fbe1f6 Mon Sep 17 00:00:00 2001 From: Sinan Lin Date: Mon, 18 Sep 2023 19:52:23 +0800 Subject: [PATCH 38/94] [Backport][Bolt] fix a relocation bug for R_AARCH64_CALL26 If the R_AARCH64_CALL26 against a symbol that has a lower address, then encodeValueAArch64 will return a wrong value. Reviewed By: Kepontry, yota9 Differential Revision: https://reviews.llvm.org/D159513 --- bolt/lib/Core/Relocation.cpp | 2 +- bolt/test/AArch64/reloc-call26.s | 34 ++++++++++++++++++++++++++------ 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/bolt/lib/Core/Relocation.cpp b/bolt/lib/Core/Relocation.cpp index 45da2addbb98..240c88744717 100644 --- a/bolt/lib/Core/Relocation.cpp +++ b/bolt/lib/Core/Relocation.cpp @@ -350,7 +350,7 @@ static uint64_t encodeValueAArch64(uint64_t Type, uint64_t Value, uint64_t PC) { assert(isInt<28>(Value) && "only PC +/- 128MB is allowed for direct call"); // Immediate goes in bits 25:0 of BL. // OP 1001_01 goes in bits 31:26 of BL. - Value = (Value >> 2) | 0x94000000ULL; + Value = ((Value >> 2) & 0x3ffffff) | 0x94000000ULL; break; } return Value; diff --git a/bolt/test/AArch64/reloc-call26.s b/bolt/test/AArch64/reloc-call26.s index 09399367b159..42e4f7f2b437 100644 --- a/bolt/test/AArch64/reloc-call26.s +++ b/bolt/test/AArch64/reloc-call26.s @@ -1,29 +1,51 @@ ## This test checks processing of R_AARCH64_CALL26 relocation ## when option `--funcs` is enabled +## We want to test on relocations against functions with both higher +## and lower addresses. The '--force-patch' option is used to prevent +## the functions func1 and func2 from being optimized, so that their +## addresses can remain unchanged. Therefore, the relocations can be +## updated via encodeValueAArch64 and the address order in the output +## binary is func1 < _start < func2. + # REQUIRES: system-linux # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ # RUN: %s -o %t.o # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --funcs=func1 +# RUN: llvm-bolt %t.exe -o %t.bolt --funcs=func1,func2 \ +# RUN: --force-patch 2>&1 | FileCheck %s -check-prefix=CHECK-BOLT # RUN: llvm-objdump -d --disassemble-symbols='_start' %t.bolt | \ # RUN: FileCheck %s +# RUN: llvm-nm --numeric-sort --extern-only %t.bolt | FileCheck \ +# RUN: %s -check-prefix=CHECK-FUNC-ORDER +# CHECK-BOLT: BOLT-WARNING: failed to patch entries in func1. The function will not be optimized. +# CHECK-BOLT: BOLT-WARNING: failed to patch entries in func2. The function will not be optimized. # CHECK: {{.*}} bl {{.*}} +# CHECK: {{.*}} bl {{.*}} + +# CHECK-FUNC-ORDER: {{.*}} func1 +# CHECK-FUNC-ORDER-NEXT: {{.*}} _start +# CHECK-FUNC-ORDER-NEXT: {{.*}} func2 .text .align 4 + .global func1 + .type func1, %function +func1: + ret + .size func1, .-func1 .global _start .type _start, %function _start: bl func1 + bl func2 mov w8, #93 svc #0 .size _start, .-_start - - .global func1 - .type func1, %function -func1: + .global func2 + .type func2, %function +func2: ret - .size func1, .-func1 + .size func2, .-func2 -- Gitee From f23d0be73e05db4f922b38a76776f4a8ab84a7b3 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Mon, 31 Jul 2023 14:51:01 -0700 Subject: [PATCH 39/94] [Backport][BOLT][NFC] Simplify DataAggregator Use short loop instead of duplicating the code for setHasProfileAvailable. Reviewed By: #bolt, maksfb Differential Revision: https://reviews.llvm.org/D154749 --- bolt/lib/Profile/DataAggregator.cpp | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 24dbe34b2f6a..02f7032c2c7e 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -1479,13 +1479,10 @@ std::error_code DataAggregator::parseBranchEvents() { NumTraces += parseLBRSample(Sample, NeedsSkylakeFix); } - for (const auto &LBR : BranchLBRs) { - const Trace &Trace = LBR.first; - if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Trace.From)) - BF->setHasProfileAvailable(); - if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Trace.To)) - BF->setHasProfileAvailable(); - } + for (const Trace &Trace : llvm::make_first_range(BranchLBRs)) + for (const uint64_t Addr : {Trace.From, Trace.To}) + if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Addr)) + BF->setHasProfileAvailable(); auto printColored = [](raw_ostream &OS, float Percent, float T1, float T2) { OS << " ("; @@ -1721,12 +1718,9 @@ std::error_code DataAggregator::parsePreAggregatedLBRSamples() { if (std::error_code EC = AggrEntry.getError()) return EC; - if (BinaryFunction *BF = - getBinaryFunctionContainingAddress(AggrEntry->From.Offset)) - BF->setHasProfileAvailable(); - if (BinaryFunction *BF = - getBinaryFunctionContainingAddress(AggrEntry->To.Offset)) - BF->setHasProfileAvailable(); + for (const uint64_t Addr : {AggrEntry->From.Offset, AggrEntry->To.Offset}) + if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Addr)) + BF->setHasProfileAvailable(); AggregatedLBRs.emplace_back(std::move(AggrEntry.get())); } -- Gitee From 7a0759b59ebacd1d898352585798cdb032b92306 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Sat, 29 Jul 2023 09:14:44 +0200 Subject: [PATCH 40/94] [Backport][BOLT][RISCV] Recognize mapping symbols The RISC-V psABI [1] defines them similarly to AArch64. [1] https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc#mapping-symbol Reviewed By: yota9, Amir Differential Revision: https://reviews.llvm.org/D153277 --- bolt/lib/Core/BinaryContext.cpp | 8 ++++---- bolt/lib/Rewrite/RewriteInstance.cpp | 2 +- bolt/test/RISCV/mapping-syms.s | 27 +++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 5 deletions(-) create mode 100644 bolt/test/RISCV/mapping-syms.s diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index 1e95d5fe38ab..ffecc5209804 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -1760,10 +1760,10 @@ void BinaryContext::printCFI(raw_ostream &OS, const MCCFIInstruction &Inst) { } MarkerSymType BinaryContext::getMarkerType(const SymbolRef &Symbol) const { - // For aarch64, the ABI defines mapping symbols so we identify data in the - // code section (see IHI0056B). $x identifies a symbol starting code or the - // end of a data chunk inside code, $d indentifies start of data. - if (!isAArch64() || ELFSymbolRef(Symbol).getSize()) + // For aarch64 and riscv, the ABI defines mapping symbols so we identify data + // in the code section (see IHI0056B). $x identifies a symbol starting code or + // the end of a data chunk inside code, $d indentifies start of data. + if ((!isAArch64() && !isRISCV()) || ELFSymbolRef(Symbol).getSize()) return MarkerSymType::NONE; Expected NameOrError = Symbol.getName(); diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 748fea91eeb2..86f52865e537 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -882,7 +882,7 @@ void RewriteInstance::discoverFileObjects() { } }; - if (BC->isAArch64()) { + if (BC->isAArch64() || BC->isRISCV()) { addExtraDataMarkerPerSymbol(SortedFileSymbols, SortedMarkerSymbols); LastSymbol = std::stable_partition( SortedFileSymbols.begin(), SortedFileSymbols.end(), diff --git a/bolt/test/RISCV/mapping-syms.s b/bolt/test/RISCV/mapping-syms.s new file mode 100644 index 000000000000..e8fdeb0c7572 --- /dev/null +++ b/bolt/test/RISCV/mapping-syms.s @@ -0,0 +1,27 @@ +/// FIXME llvm-mc is used instead of clang because we need a recent change in +/// the RISC-V MC layer (D153260). Once that one is released, we can switch to +/// clang. (Note that the pre-merge check buildbots use the system's clang). +// RUN: llvm-mc -triple riscv64 -mattr=+c -filetype obj -o %t.o %s +// RUN: ld.lld -o %t %t.o +// RUN: llvm-bolt --print-cfg --print-only=_start -o %t.bolt %t 2>&1 | FileCheck %s +// RUN: llvm-objdump -d %t.bolt | FileCheck --check-prefix=CHECK-OBJDUMP %s + +// CHECK-NOT: BOLT-WARNING + +/// Check that .word is not disassembled by BOLT +// CHECK: 00000000: nop +// CHECK: 00000002: ret + +/// Check .word is still present in output +// CHECK-OBJDUMP: <_start>: +// CHECK-OBJDUMP-NEXT: nop +// CHECK-OBJDUMP-NEXT: unimp +// CHECK-OBJDUMP-NEXT: unimp +// CHECK-OBJDUMP-NEXT: ret + .text + .globl _start + .p2align 1 +_start: + nop + .word 0x0 + ret -- Gitee From 4f172503cdb79bb4605a66f70c4ca60b490894ba Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Sun, 17 Sep 2023 12:58:33 -0700 Subject: [PATCH 41/94] [Backport][BOLT] Speedup symbol table sort Memoize SymbolRef::getAddress() for sorting symbol table entries by their address. Saves about 10 seconds of processing time on large binaries with over 2 million symbols. NFCI. Reviewed By: jobnoorman, Amir Differential Revision: https://reviews.llvm.org/D159524 --- bolt/lib/Rewrite/RewriteInstance.cpp | 103 +++++++++++++-------------- 1 file changed, 49 insertions(+), 54 deletions(-) diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 86f52865e537..cf2d6c8b113a 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -799,7 +799,12 @@ void RewriteInstance::discoverFileObjects() { } // Sort symbols in the file by value. Ignore symbols from non-allocatable - // sections. + // sections. We memoize getAddress(), as it has rather high overhead. + struct SymbolInfo { + uint64_t Address; + SymbolRef Symbol; + }; + std::vector SortedSymbols; auto isSymbolInMemory = [this](const SymbolRef &Sym) { if (cantFail(Sym.getType()) == SymbolRef::ST_File) return false; @@ -810,25 +815,22 @@ void RewriteInstance::discoverFileObjects() { BinarySection Section(*BC, *cantFail(Sym.getSection())); return Section.isAllocatable(); }; - std::vector SortedFileSymbols; - llvm::copy_if(InputFile->symbols(), std::back_inserter(SortedFileSymbols), - isSymbolInMemory); - auto CompareSymbols = [this](const SymbolRef &A, const SymbolRef &B) { - // Marker symbols have the highest precedence, while - // SECTIONs have the lowest. - auto AddressA = cantFail(A.getAddress()); - auto AddressB = cantFail(B.getAddress()); - if (AddressA != AddressB) - return AddressA < AddressB; - - bool AMarker = BC->isMarker(A); - bool BMarker = BC->isMarker(B); + for (const SymbolRef &Symbol : InputFile->symbols()) + if (isSymbolInMemory(Symbol)) + SortedSymbols.push_back({cantFail(Symbol.getAddress()), Symbol}); + + auto CompareSymbols = [this](const SymbolInfo &A, const SymbolInfo &B) { + if (A.Address != B.Address) + return A.Address < B.Address; + + const bool AMarker = BC->isMarker(A.Symbol); + const bool BMarker = BC->isMarker(B.Symbol); if (AMarker || BMarker) { return AMarker && !BMarker; } - auto AType = cantFail(A.getType()); - auto BType = cantFail(B.getType()); + const auto AType = cantFail(A.Symbol.getType()); + const auto BType = cantFail(B.Symbol.getType()); if (AType == SymbolRef::ST_Function && BType != SymbolRef::ST_Function) return true; if (BType == SymbolRef::ST_Debug && AType != SymbolRef::ST_Debug) @@ -836,11 +838,10 @@ void RewriteInstance::discoverFileObjects() { return false; }; + llvm::stable_sort(SortedSymbols, CompareSymbols); - llvm::stable_sort(SortedFileSymbols, CompareSymbols); - - auto LastSymbol = SortedFileSymbols.end(); - if (!SortedFileSymbols.empty()) + auto LastSymbol = SortedSymbols.end(); + if (!SortedSymbols.empty()) --LastSymbol; // For aarch64, the ABI defines mapping symbols so we identify data in the @@ -855,39 +856,34 @@ void RewriteInstance::discoverFileObjects() { }; std::vector SortedMarkerSymbols; - auto addExtraDataMarkerPerSymbol = - [this](const std::vector &SortedFileSymbols, - std::vector &SortedMarkerSymbols) { - bool IsData = false; - uint64_t LastAddr = 0; - for (auto Sym = SortedFileSymbols.begin(); - Sym < SortedFileSymbols.end(); ++Sym) { - uint64_t Address = cantFail(Sym->getAddress()); - if (LastAddr == Address) // don't repeat markers - continue; + auto addExtraDataMarkerPerSymbol = [&]() { + bool IsData = false; + uint64_t LastAddr = 0; + for (const auto &SymInfo : SortedSymbols) { + if (LastAddr == SymInfo.Address) // don't repeat markers + continue; - MarkerSymType MarkerType = BC->getMarkerType(*Sym); - if (MarkerType != MarkerSymType::NONE) { - SortedMarkerSymbols.push_back(MarkerSym{Address, MarkerType}); - LastAddr = Address; - IsData = MarkerType == MarkerSymType::DATA; - continue; - } + MarkerSymType MarkerType = BC->getMarkerType(SymInfo.Symbol); + if (MarkerType != MarkerSymType::NONE) { + SortedMarkerSymbols.push_back(MarkerSym{SymInfo.Address, MarkerType}); + LastAddr = SymInfo.Address; + IsData = MarkerType == MarkerSymType::DATA; + continue; + } - if (IsData) { - SortedMarkerSymbols.push_back( - MarkerSym{cantFail(Sym->getAddress()), MarkerSymType::DATA}); - LastAddr = Address; - } - } - }; + if (IsData) { + SortedMarkerSymbols.push_back({SymInfo.Address, MarkerSymType::DATA}); + LastAddr = SymInfo.Address; + } + } + }; if (BC->isAArch64() || BC->isRISCV()) { - addExtraDataMarkerPerSymbol(SortedFileSymbols, SortedMarkerSymbols); + addExtraDataMarkerPerSymbol(); LastSymbol = std::stable_partition( - SortedFileSymbols.begin(), SortedFileSymbols.end(), - [this](const SymbolRef &Symbol) { return !BC->isMarker(Symbol); }); - if (!SortedFileSymbols.empty()) + SortedSymbols.begin(), SortedSymbols.end(), + [this](const SymbolInfo &S) { return !BC->isMarker(S.Symbol); }); + if (!SortedSymbols.empty()) --LastSymbol; } @@ -897,12 +893,11 @@ void RewriteInstance::discoverFileObjects() { // Regex object for matching cold fragments. Regex ColdFragment(".*\\.cold(\\.[0-9]+)?"); - const auto SortedSymbolsEnd = LastSymbol == SortedFileSymbols.end() - ? LastSymbol - : std::next(LastSymbol); - for (auto ISym = SortedFileSymbols.begin(); ISym != SortedSymbolsEnd; - ++ISym) { - const SymbolRef &Symbol = *ISym; + const auto SortedSymbolsEnd = + LastSymbol == SortedSymbols.end() ? LastSymbol : std::next(LastSymbol); + for (auto Iter = SortedSymbols.begin(); Iter != SortedSymbolsEnd; ++Iter) { + const SymbolRef &Symbol = Iter->Symbol; + // Keep undefined symbols for pretty printing? if (cantFail(Symbol.getFlags()) & SymbolRef::SF_Undefined) continue; -- Gitee From 55885fdc1a1e0146e243c2437dab3c1bb2edbfc6 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Sun, 17 Sep 2023 13:13:09 -0700 Subject: [PATCH 42/94] [Backport][BOLT][NFC] Refactor RI::discoverFileObjects() Minor refactoring to delete redundant code. Reviewed By: jobnoorman Differential Revision: https://reviews.llvm.org/D159525 --- bolt/lib/Rewrite/RewriteInstance.cpp | 75 +++++++++++++--------------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index cf2d6c8b113a..151e50283a80 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -752,9 +752,6 @@ Error RewriteInstance::run() { void RewriteInstance::discoverFileObjects() { NamedRegionTimer T("discoverFileObjects", "discover file objects", TimerGroupName, TimerGroupDesc, opts::TimeRewrite); - FileSymRefs.clear(); - BC->getBinaryFunctions().clear(); - BC->clearBinaryData(); // For local symbols we want to keep track of associated FILE symbol name for // disambiguation by combined name. @@ -891,26 +888,21 @@ void RewriteInstance::discoverFileObjects() { unsigned AnonymousId = 0; // Regex object for matching cold fragments. - Regex ColdFragment(".*\\.cold(\\.[0-9]+)?"); + const Regex ColdFragment(".*\\.cold(\\.[0-9]+)?"); const auto SortedSymbolsEnd = LastSymbol == SortedSymbols.end() ? LastSymbol : std::next(LastSymbol); for (auto Iter = SortedSymbols.begin(); Iter != SortedSymbolsEnd; ++Iter) { const SymbolRef &Symbol = Iter->Symbol; - - // Keep undefined symbols for pretty printing? - if (cantFail(Symbol.getFlags()) & SymbolRef::SF_Undefined) - continue; - + const uint64_t SymbolAddress = Iter->Address; + const auto SymbolFlags = cantFail(Symbol.getFlags()); const SymbolRef::Type SymbolType = cantFail(Symbol.getType()); if (SymbolType == SymbolRef::ST_File) continue; StringRef SymName = cantFail(Symbol.getName(), "cannot get symbol name"); - uint64_t Address = - cantFail(Symbol.getAddress(), "cannot get symbol address"); - if (Address == 0) { + if (SymbolAddress == 0) { if (opts::Verbosity >= 1 && SymbolType == SymbolRef::ST_Function) errs() << "BOLT-WARNING: function with 0 address seen\n"; continue; @@ -920,11 +912,12 @@ void RewriteInstance::discoverFileObjects() { if (SymName == "__hot_start" || SymName == "__hot_end") continue; - FileSymRefs[Address] = Symbol; + FileSymRefs[SymbolAddress] = Symbol; // Skip section symbols that will be registered by disassemblePLT(). - if ((cantFail(Symbol.getType()) == SymbolRef::ST_Debug)) { - ErrorOr BSection = BC->getSectionForAddress(Address); + if (SymbolType == SymbolRef::ST_Debug) { + ErrorOr BSection = + BC->getSectionForAddress(SymbolAddress); if (BSection && getPLTSectionInfo(BSection->getName())) continue; } @@ -946,10 +939,10 @@ void RewriteInstance::discoverFileObjects() { std::string AlternativeName; if (Name.empty()) { UniqueName = "ANONYMOUS." + std::to_string(AnonymousId++); - } else if (cantFail(Symbol.getFlags()) & SymbolRef::SF_Global) { + } else if (SymbolFlags & SymbolRef::SF_Global) { if (const BinaryData *BD = BC->getBinaryDataByName(Name)) { if (BD->getSize() == ELFSymbolRef(Symbol).getSize() && - BD->getAddress() == Address) { + BD->getAddress() == SymbolAddress) { if (opts::Verbosity > 1) errs() << "BOLT-WARNING: ignoring duplicate global symbol " << Name << "\n"; @@ -985,14 +978,13 @@ void RewriteInstance::discoverFileObjects() { uint64_t SymbolSize = ELFSymbolRef(Symbol).getSize(); uint64_t SymbolAlignment = Symbol.getAlignment(); - unsigned SymbolFlags = cantFail(Symbol.getFlags()); auto registerName = [&](uint64_t FinalSize) { // Register names even if it's not a function, e.g. for an entry point. - BC->registerNameAtAddress(UniqueName, Address, FinalSize, SymbolAlignment, - SymbolFlags); + BC->registerNameAtAddress(UniqueName, SymbolAddress, FinalSize, + SymbolAlignment, SymbolFlags); if (!AlternativeName.empty()) - BC->registerNameAtAddress(AlternativeName, Address, FinalSize, + BC->registerNameAtAddress(AlternativeName, SymbolAddress, FinalSize, SymbolAlignment, SymbolFlags); }; @@ -1012,7 +1004,7 @@ void RewriteInstance::discoverFileObjects() { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: considering symbol " << UniqueName << " for function\n"); - if (Address == Section->getAddress() + Section->getSize()) { + if (SymbolAddress == Section->getAddress() + Section->getSize()) { assert(SymbolSize == 0 && "unexpect non-zero sized symbol at end of section"); LLVM_DEBUG( @@ -1038,11 +1030,12 @@ void RewriteInstance::discoverFileObjects() { // their local labels. The only way to tell them apart is to look at // symbol scope - global vs local. if (PreviousFunction && SymbolType != SymbolRef::ST_Function) { - if (PreviousFunction->containsAddress(Address)) { + if (PreviousFunction->containsAddress(SymbolAddress)) { if (PreviousFunction->isSymbolValidInScope(Symbol, SymbolSize)) { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: symbol is a function local symbol\n"); - } else if (Address == PreviousFunction->getAddress() && !SymbolSize) { + } else if (SymbolAddress == PreviousFunction->getAddress() && + !SymbolSize) { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: ignoring symbol as a marker\n"); } else if (opts::Verbosity > 1) { errs() << "BOLT-WARNING: symbol " << UniqueName @@ -1059,8 +1052,8 @@ void RewriteInstance::discoverFileObjects() { } } - if (PreviousFunction && PreviousFunction->containsAddress(Address) && - PreviousFunction->getAddress() != Address) { + if (PreviousFunction && PreviousFunction->containsAddress(SymbolAddress) && + PreviousFunction->getAddress() != SymbolAddress) { if (PreviousFunction->isSymbolValidInScope(Symbol, SymbolSize)) { if (opts::Verbosity >= 1) outs() << "BOLT-INFO: skipping possibly another entry for function " @@ -1072,12 +1065,12 @@ void RewriteInstance::discoverFileObjects() { registerName(0); - PreviousFunction->addEntryPointAtOffset(Address - + PreviousFunction->addEntryPointAtOffset(SymbolAddress - PreviousFunction->getAddress()); // Remove the symbol from FileSymRefs so that we can skip it from // in the future. - auto SI = FileSymRefs.find(Address); + auto SI = FileSymRefs.find(SymbolAddress); assert(SI != FileSymRefs.end() && "symbol expected to be present"); assert(SI->second == Symbol && "wrong symbol found"); FileSymRefs.erase(SI); @@ -1087,10 +1080,10 @@ void RewriteInstance::discoverFileObjects() { // Checkout for conflicts with function data from FDEs. bool IsSimple = true; - auto FDEI = CFIRdWrt->getFDEs().lower_bound(Address); + auto FDEI = CFIRdWrt->getFDEs().lower_bound(SymbolAddress); if (FDEI != CFIRdWrt->getFDEs().end()) { const dwarf::FDE &FDE = *FDEI->second; - if (FDEI->first != Address) { + if (FDEI->first != SymbolAddress) { // There's no matching starting address in FDE. Make sure the previous // FDE does not contain this address. if (FDEI != CFIRdWrt->getFDEs().begin()) { @@ -1098,7 +1091,8 @@ void RewriteInstance::discoverFileObjects() { const dwarf::FDE &PrevFDE = *FDEI->second; uint64_t PrevStart = PrevFDE.getInitialLocation(); uint64_t PrevLength = PrevFDE.getAddressRange(); - if (Address > PrevStart && Address < PrevStart + PrevLength) { + if (SymbolAddress > PrevStart && + SymbolAddress < PrevStart + PrevLength) { errs() << "BOLT-ERROR: function " << UniqueName << " is in conflict with FDE [" << Twine::utohexstr(PrevStart) << ", " @@ -1115,11 +1109,11 @@ void RewriteInstance::discoverFileObjects() { << "; symbol table : " << SymbolSize << ". Using max size.\n"; } SymbolSize = std::max(SymbolSize, FDE.getAddressRange()); - if (BC->getBinaryDataAtAddress(Address)) { - BC->setBinaryDataSize(Address, SymbolSize); + if (BC->getBinaryDataAtAddress(SymbolAddress)) { + BC->setBinaryDataSize(SymbolAddress, SymbolSize); } else { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: No BD @ 0x" - << Twine::utohexstr(Address) << "\n"); + << Twine::utohexstr(SymbolAddress) << "\n"); } } } @@ -1128,7 +1122,7 @@ void RewriteInstance::discoverFileObjects() { // Since function may not have yet obtained its real size, do a search // using the list of registered functions instead of calling // getBinaryFunctionAtAddress(). - auto BFI = BC->getBinaryFunctions().find(Address); + auto BFI = BC->getBinaryFunctions().find(SymbolAddress); if (BFI != BC->getBinaryFunctions().end()) { BF = &BFI->second; // Duplicate the function name. Make sure everything matches before we add @@ -1142,15 +1136,17 @@ void RewriteInstance::discoverFileObjects() { << BF->getSize() << " new " << SymbolSize << "\n"; } BF->setSize(std::max(SymbolSize, BF->getSize())); - BC->setBinaryDataSize(Address, BF->getSize()); + BC->setBinaryDataSize(SymbolAddress, BF->getSize()); } BF->addAlternativeName(UniqueName); } else { - ErrorOr Section = BC->getSectionForAddress(Address); + ErrorOr Section = + BC->getSectionForAddress(SymbolAddress); // Skip symbols from invalid sections if (!Section) { errs() << "BOLT-WARNING: " << UniqueName << " (0x" - << Twine::utohexstr(Address) << ") does not have any section\n"; + << Twine::utohexstr(SymbolAddress) + << ") does not have any section\n"; continue; } @@ -1158,7 +1154,8 @@ void RewriteInstance::discoverFileObjects() { if (!Section->getSize()) continue; - BF = BC->createBinaryFunction(UniqueName, *Section, Address, SymbolSize); + BF = BC->createBinaryFunction(UniqueName, *Section, SymbolAddress, + SymbolSize); if (!IsSimple) BF->setSimple(false); } -- Gitee From bc712811d4fb6cf2e71bc3fcbd2f01e2fb30ba7a Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Fri, 15 Sep 2023 14:52:20 +0400 Subject: [PATCH 43/94] [Backport][BOLT][runtime] Test for outline-atomics support I'm using clang-10 to build bolt which doesn't have moutline-atomics option and though it doesn't do it. So test compiler for supporting it before appending to the list of cxxflags. Differential Revision: https://reviews.llvm.org/D159521 --- bolt/runtime/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt index fadc4e79856d..82ddc701fc84 100644 --- a/bolt/runtime/CMakeLists.txt +++ b/bolt/runtime/CMakeLists.txt @@ -1,4 +1,5 @@ cmake_minimum_required(VERSION 3.20.0) +include(CheckCXXCompilerFlag) include(CheckIncludeFiles) include(GNUInstallDirs) @@ -33,7 +34,10 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-sse") endif() if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-outline-atomics") + check_cxx_compiler_flag("-mno-outline-atomics" CXX_SUPPORTS_OUTLINE_ATOMICS) + if (CXX_SUPPORTS_OUTLINE_ATOMICS) + set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-outline-atomics") + endif() endif() if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") -- Gitee From a6603bfcd68939d9472c862b08b7f6e6371a95c8 Mon Sep 17 00:00:00 2001 From: Kristof Beyls Date: Thu, 21 Sep 2023 19:53:09 +0200 Subject: [PATCH 44/94] [Backport][BOLT] Fix data race in MCPlusBuilder::getOrCreateAnnotationIndex (#67004) MCPlusBuilder::getOrCreateAnnotationIndex(Name) can be called from different threads, for example when making use of ParallelUtilities::runOnEachFunctionWithUniqueAllocId. The race occurs when an Index for a particular annotation Name needs to be created for the first time. For example, this can easily happen when multiple "copies" of an analysis pass run on different BinaryFunctions, and the analysis pass creates a new Annotation Index to be able to store analysis results as annotations. This was found by using the ThreadSanitizer. No regression test was added; I don't think there is good way to write regression tests that verify the absence of data races? --------- Co-authored-by: Amir Ayupov --- bolt/include/bolt/Core/MCPlusBuilder.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index e7b6c8e3a747..83e9cfb7567c 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -29,6 +29,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ErrorOr.h" +#include "llvm/Support/RWMutex.h" #include #include #include @@ -166,6 +167,10 @@ protected: /// Names of non-standard annotations. SmallVector AnnotationNames; + /// A mutex that is used to control parallel accesses to + /// AnnotationNameIndexMap and AnnotationsNames. + mutable llvm::sys::RWMutex AnnotationNameMutex; + /// Allocate the TailCall annotation value. Clients of the target-specific /// MCPlusBuilder classes must use convert/lower/create* interfaces instead. void setTailCall(MCInst &Inst); @@ -1775,6 +1780,7 @@ public: /// Return annotation index matching the \p Name. std::optional getAnnotationIndex(StringRef Name) const { + std::shared_lock Lock(AnnotationNameMutex); auto AI = AnnotationNameIndexMap.find(Name); if (AI != AnnotationNameIndexMap.end()) return AI->second; @@ -1784,10 +1790,10 @@ public: /// Return annotation index matching the \p Name. Create a new index if the /// \p Name wasn't registered previously. unsigned getOrCreateAnnotationIndex(StringRef Name) { - auto AI = AnnotationNameIndexMap.find(Name); - if (AI != AnnotationNameIndexMap.end()) - return AI->second; + if (std::optional Index = getAnnotationIndex(Name)) + return *Index; + std::unique_lock Lock(AnnotationNameMutex); const unsigned Index = AnnotationNameIndexMap.size() + MCPlus::MCAnnotation::kGeneric; AnnotationNameIndexMap.insert(std::make_pair(Name, Index)); -- Gitee From e5d6c5d967b645f618d29e4f9d37bfe5e0dd93ea Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 21 Sep 2023 13:13:03 -0700 Subject: [PATCH 45/94] [Backport][CodeLayout] Refactor std::vector uses, namespace, and EdgeCountT. NFC * Place types and functions in the llvm::codelayout namespace * Change EdgeCountT from pair, uint64_t> to a struct and utilize structured bindings. It is not conventional to use the "T" suffix for structure types. * Remove a redundant copy in ChainT::merge. * Change {ExtTSPImpl,CDSortImpl}::run to use return value instead of an output parameter * Rename applyCDSLayout to computeCacheDirectedLayout: (a) avoid rare abbreviation "CDS" (cache-directed sort) (b) "compute" is more conventional for the specific use case * Change the parameter types from std::vector to ArrayRef so that SmallVector arguments can be used. * Similarly, rename applyExtTspLayout to computeExtTspLayout. Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D159526 --- bolt/lib/Passes/ReorderAlgorithm.cpp | 10 +- bolt/lib/Passes/ReorderFunctions.cpp | 10 +- .../llvm/Transforms/Utils/CodeLayout.h | 49 +++--- llvm/lib/CodeGen/MachineBlockPlacement.cpp | 8 +- llvm/lib/Transforms/Utils/CodeLayout.cpp | 150 ++++++++---------- 5 files changed, 106 insertions(+), 121 deletions(-) diff --git a/bolt/lib/Passes/ReorderAlgorithm.cpp b/bolt/lib/Passes/ReorderAlgorithm.cpp index b5052cdaddb1..3c3365e1d3d7 100644 --- a/bolt/lib/Passes/ReorderAlgorithm.cpp +++ b/bolt/lib/Passes/ReorderAlgorithm.cpp @@ -531,21 +531,21 @@ void ExtTSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF, } // Initialize CFG edges - using JumpT = std::pair; - std::vector> JumpCounts; + std::vector JumpCounts; for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { auto BI = BB->branch_info_begin(); for (BinaryBasicBlock *SuccBB : BB->successors()) { assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && "missing profile for a jump"); - auto It = std::make_pair(BB->getLayoutIndex(), SuccBB->getLayoutIndex()); - JumpCounts.push_back(std::make_pair(It, BI->Count)); + JumpCounts.push_back( + {BB->getLayoutIndex(), SuccBB->getLayoutIndex(), BI->Count}); ++BI; } } // Run the layout algorithm - auto Result = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts); + auto Result = + codelayout::computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts); Order.reserve(BF.getLayout().block_size()); for (uint64_t R : Result) Order.push_back(OrigOrder[R]); diff --git a/bolt/lib/Passes/ReorderFunctions.cpp b/bolt/lib/Passes/ReorderFunctions.cpp index 28ca68fa5a57..70f87ac40c3c 100644 --- a/bolt/lib/Passes/ReorderFunctions.cpp +++ b/bolt/lib/Passes/ReorderFunctions.cpp @@ -331,23 +331,21 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) { // Initialize CFG nodes and their data std::vector FuncSizes; std::vector FuncCounts; - using JumpT = std::pair; - std::vector> CallCounts; + std::vector CallCounts; std::vector CallOffsets; for (NodeId F = 0; F < Cg.numNodes(); ++F) { FuncSizes.push_back(Cg.size(F)); FuncCounts.push_back(Cg.samples(F)); for (NodeId Succ : Cg.successors(F)) { const Arc &Arc = *Cg.findArc(F, Succ); - auto It = std::make_pair(F, Succ); - CallCounts.push_back(std::make_pair(It, Arc.weight())); + CallCounts.push_back({F, Succ, uint64_t(Arc.weight())}); CallOffsets.push_back(uint64_t(Arc.avgCallOffset())); } } // Run the layout algorithm. - std::vector Result = - applyCDSLayout(FuncSizes, FuncCounts, CallCounts, CallOffsets); + std::vector Result = codelayout::computeCacheDirectedLayout( + FuncSizes, FuncCounts, CallCounts, CallOffsets); // Create a single cluster from the computed order of hot functions. Clusters.emplace_back(Cluster(Result, Cg)); diff --git a/llvm/include/llvm/Transforms/Utils/CodeLayout.h b/llvm/include/llvm/Transforms/Utils/CodeLayout.h index 11a829b601ce..f5127cff24af 100644 --- a/llvm/include/llvm/Transforms/Utils/CodeLayout.h +++ b/llvm/include/llvm/Transforms/Utils/CodeLayout.h @@ -14,14 +14,21 @@ #ifndef LLVM_TRANSFORMS_UTILS_CODELAYOUT_H #define LLVM_TRANSFORMS_UTILS_CODELAYOUT_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include #include -namespace llvm { +namespace llvm::codelayout { using EdgeT = std::pair; -using EdgeCountT = std::pair; + +struct EdgeCount { + uint64_t src; + uint64_t dst; + uint64_t count; +}; /// Find a layout of nodes (basic blocks) of a given CFG optimizing jump /// locality and thus processor I-cache utilization. This is achieved via @@ -34,24 +41,22 @@ using EdgeCountT = std::pair; /// \p EdgeCounts: The execution counts of every edge (jump) in the profile. The /// map also defines the edges in CFG and should include 0-count edges. /// \returns The best block order found. -std::vector -applyExtTspLayout(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts); +std::vector computeExtTspLayout(ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts); /// Estimate the "quality" of a given node order in CFG. The higher the score, /// the better the order is. The score is designed to reflect the locality of /// the given order, which is anti-correlated with the number of I-cache misses /// in a typical execution of the function. -double calcExtTspScore(const std::vector &Order, - const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts); +double calcExtTspScore(ArrayRef Order, ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts); /// Estimate the "quality" of the current node order in CFG. -double calcExtTspScore(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts); +double calcExtTspScore(ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts); /// Algorithm-specific params for Cache-Directed Sort. The values are tuned for /// the best performance of large-scale front-end bound binaries. @@ -75,18 +80,16 @@ struct CDSortConfig { /// map also defines the edges in CFG and should include 0-count edges. /// \p CallOffsets: The offsets of the calls from their source nodes. /// \returns The best function order found. -std::vector applyCDSLayout(const std::vector &FuncSizes, - const std::vector &FuncCounts, - const std::vector &CallCounts, - const std::vector &CallOffsets); +std::vector computeCacheDirectedLayout( + ArrayRef FuncSizes, ArrayRef FuncCounts, + ArrayRef CallCounts, ArrayRef CallOffsets); /// Apply a Cache-Directed Sort with a custom config. -std::vector applyCDSLayout(const CDSortConfig &Config, - const std::vector &FuncSizes, - const std::vector &FuncCounts, - const std::vector &CallCounts, - const std::vector &CallOffsets); +std::vector computeCacheDirectedLayout( + const CDSortConfig &Config, ArrayRef FuncSizes, + ArrayRef FuncCounts, ArrayRef CallCounts, + ArrayRef CallOffsets); -} // end namespace llvm +} // namespace llvm::codelayout #endif // LLVM_TRANSFORMS_UTILS_CODELAYOUT_H diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 6913165add25..b69045b4d61f 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -3502,7 +3502,7 @@ void MachineBlockPlacement::applyExtTsp() { auto BlockSizes = std::vector(F->size()); auto BlockCounts = std::vector(F->size()); - std::vector JumpCounts; + std::vector JumpCounts; for (MachineBasicBlock &MBB : *F) { // Getting the block frequency. BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB); @@ -3521,8 +3521,8 @@ void MachineBlockPlacement::applyExtTsp() { for (MachineBasicBlock *Succ : MBB.successors()) { auto EP = MBPI->getEdgeProbability(&MBB, Succ); BlockFrequency JumpFreq = BlockFreq * EP; - auto Jump = std::make_pair(BlockIndex[&MBB], BlockIndex[Succ]); - JumpCounts.push_back(std::make_pair(Jump, JumpFreq.getFrequency())); + JumpCounts.push_back( + {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()}); } } @@ -3535,7 +3535,7 @@ void MachineBlockPlacement::applyExtTsp() { calcExtTspScore(BlockSizes, BlockCounts, JumpCounts))); // Run the layout algorithm. - auto NewOrder = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts); + auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts); std::vector NewBlockOrder; NewBlockOrder.reserve(F->size()); for (uint64_t Node : NewOrder) { diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp index 6ef4ae3341e3..58b5afbc869e 100644 --- a/llvm/lib/Transforms/Utils/CodeLayout.cpp +++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp @@ -48,6 +48,8 @@ #include using namespace llvm; +using namespace llvm::codelayout; + #define DEBUG_TYPE "code-layout" namespace llvm { @@ -318,8 +320,8 @@ struct ChainT { Edges.push_back(std::make_pair(Other, Edge)); } - void merge(ChainT *Other, const std::vector &MergedBlocks) { - Nodes = MergedBlocks; + void merge(ChainT *Other, std::vector MergedBlocks) { + Nodes = std::move(MergedBlocks); // Update the chain's data. ExecutionCount += Other->ExecutionCount; Size += Other->Size; @@ -549,15 +551,14 @@ MergedChain mergeNodes(const std::vector &X, /// The implementation of the ExtTSP algorithm. class ExtTSPImpl { public: - ExtTSPImpl(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) + ExtTSPImpl(ArrayRef NodeSizes, ArrayRef NodeCounts, + ArrayRef EdgeCounts) : NumNodes(NodeSizes.size()) { initialize(NodeSizes, NodeCounts, EdgeCounts); } /// Run the algorithm and return an optimized ordering of nodes. - void run(std::vector &Result) { + std::vector run() { // Pass 1: Merge nodes with their mutually forced successors mergeForcedPairs(); @@ -568,14 +569,14 @@ public: mergeColdChains(); // Collect nodes from all chains - concatChains(Result); + return concatChains(); } private: /// Initialize the algorithm's data structures. - void initialize(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) { + void initialize(const ArrayRef &NodeSizes, + const ArrayRef &NodeCounts, + const ArrayRef &EdgeCounts) { // Initialize nodes AllNodes.reserve(NumNodes); for (uint64_t Idx = 0; Idx < NumNodes; Idx++) { @@ -592,21 +593,18 @@ private: PredNodes.resize(NumNodes); std::vector OutDegree(NumNodes, 0); AllJumps.reserve(EdgeCounts.size()); - for (auto It : EdgeCounts) { - uint64_t Pred = It.first.first; - uint64_t Succ = It.first.second; - OutDegree[Pred]++; + for (auto Edge : EdgeCounts) { + ++OutDegree[Edge.src]; // Ignore self-edges. - if (Pred == Succ) + if (Edge.src == Edge.dst) continue; - SuccNodes[Pred].push_back(Succ); - PredNodes[Succ].push_back(Pred); - uint64_t ExecutionCount = It.second; - if (ExecutionCount > 0) { - NodeT &PredNode = AllNodes[Pred]; - NodeT &SuccNode = AllNodes[Succ]; - AllJumps.emplace_back(&PredNode, &SuccNode, ExecutionCount); + SuccNodes[Edge.src].push_back(Edge.dst); + PredNodes[Edge.dst].push_back(Edge.src); + if (Edge.count > 0) { + NodeT &PredNode = AllNodes[Edge.src]; + NodeT &SuccNode = AllNodes[Edge.dst]; + AllJumps.emplace_back(&PredNode, &SuccNode, Edge.count); SuccNode.InJumps.push_back(&AllJumps.back()); PredNode.OutJumps.push_back(&AllJumps.back()); } @@ -923,7 +921,7 @@ private: } /// Concatenate all chains into the final order. - void concatChains(std::vector &Order) { + std::vector concatChains() { // Collect chains and calculate density stats for their sorting. std::vector SortedChains; DenseMap ChainDensity; @@ -958,12 +956,12 @@ private: }); // Collect the nodes in the order specified by their chains. + std::vector Order; Order.reserve(NumNodes); - for (const ChainT *Chain : SortedChains) { - for (NodeT *Node : Chain->Nodes) { + for (const ChainT *Chain : SortedChains) + for (NodeT *Node : Chain->Nodes) Order.push_back(Node->Index); - } - } + return Order; } private: @@ -996,16 +994,15 @@ private: /// functions represented by a call graph. class CDSortImpl { public: - CDSortImpl(const CDSortConfig &Config, const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts, - const std::vector &EdgeOffsets) + CDSortImpl(const CDSortConfig &Config, ArrayRef NodeSizes, + ArrayRef NodeCounts, ArrayRef EdgeCounts, + ArrayRef EdgeOffsets) : Config(Config), NumNodes(NodeSizes.size()) { initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets); } /// Run the algorithm and return an ordered set of function clusters. - void run(std::vector &Result) { + std::vector run() { // Merge pairs of chains while improving the objective. mergeChainPairs(); @@ -1014,15 +1011,15 @@ public: << HotChains.size() << "\n"); // Collect nodes from all the chains. - concatChains(Result); + return concatChains(); } private: /// Initialize the algorithm's data structures. - void initialize(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts, - const std::vector &EdgeOffsets) { + void initialize(const ArrayRef &NodeSizes, + const ArrayRef &NodeCounts, + const ArrayRef &EdgeCounts, + const ArrayRef &EdgeOffsets) { // Initialize nodes. AllNodes.reserve(NumNodes); for (uint64_t Node = 0; Node < NumNodes; Node++) { @@ -1039,20 +1036,17 @@ private: PredNodes.resize(NumNodes); AllJumps.reserve(EdgeCounts.size()); for (size_t I = 0; I < EdgeCounts.size(); I++) { - auto It = EdgeCounts[I]; - uint64_t Pred = It.first.first; - uint64_t Succ = It.first.second; + auto [Pred, Succ, Count] = EdgeCounts[I]; // Ignore recursive calls. if (Pred == Succ) continue; SuccNodes[Pred].push_back(Succ); PredNodes[Succ].push_back(Pred); - uint64_t ExecutionCount = It.second; - if (ExecutionCount > 0) { + if (Count > 0) { NodeT &PredNode = AllNodes[Pred]; NodeT &SuccNode = AllNodes[Succ]; - AllJumps.emplace_back(&PredNode, &SuccNode, ExecutionCount); + AllJumps.emplace_back(&PredNode, &SuccNode, Count); AllJumps.back().Offset = EdgeOffsets[I]; SuccNode.InJumps.push_back(&AllJumps.back()); PredNode.OutJumps.push_back(&AllJumps.back()); @@ -1303,7 +1297,7 @@ private: } /// Concatenate all chains into the final order. - void concatChains(std::vector &Order) { + std::vector concatChains() { // Collect chains and calculate density stats for their sorting. std::vector SortedChains; DenseMap ChainDensity; @@ -1333,10 +1327,12 @@ private: }); // Collect the nodes in the order specified by their chains. + std::vector Order; Order.reserve(NumNodes); for (const ChainT *Chain : SortedChains) for (NodeT *Node : Chain->Nodes) Order.push_back(Node->Index); + return Order; } private: @@ -1377,17 +1373,16 @@ private: } // end of anonymous namespace std::vector -llvm::applyExtTspLayout(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) { +codelayout::computeExtTspLayout(ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts) { // Verify correctness of the input data. assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input"); assert(NodeSizes.size() > 2 && "Incorrect input"); // Apply the reordering algorithm. ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts); - std::vector Result; - Alg.run(Result); + std::vector Result = Alg.run(); // Verify correctness of the output. assert(Result.front() == 0 && "Original entry point is not preserved"); @@ -1395,37 +1390,32 @@ llvm::applyExtTspLayout(const std::vector &NodeSizes, return Result; } -double llvm::calcExtTspScore(const std::vector &Order, - const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) { +double codelayout::calcExtTspScore(ArrayRef Order, + ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts) { // Estimate addresses of the blocks in memory. std::vector Addr(NodeSizes.size(), 0); for (size_t Idx = 1; Idx < Order.size(); Idx++) { Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]]; } std::vector OutDegree(NodeSizes.size(), 0); - for (auto It : EdgeCounts) { - uint64_t Pred = It.first.first; - OutDegree[Pred]++; - } + for (auto Edge : EdgeCounts) + ++OutDegree[Edge.src]; // Increase the score for each jump. double Score = 0; - for (auto It : EdgeCounts) { - uint64_t Pred = It.first.first; - uint64_t Succ = It.first.second; - uint64_t Count = It.second; - bool IsConditional = OutDegree[Pred] > 1; - Score += ::extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count, - IsConditional); + for (auto Edge : EdgeCounts) { + bool IsConditional = OutDegree[Edge.src] > 1; + Score += ::extTSPScore(Addr[Edge.src], NodeSizes[Edge.src], Addr[Edge.dst], + Edge.count, IsConditional); } return Score; } -double llvm::calcExtTspScore(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) { +double codelayout::calcExtTspScore(ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts) { std::vector Order(NodeSizes.size()); for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) { Order[Idx] = Idx; @@ -1433,30 +1423,23 @@ double llvm::calcExtTspScore(const std::vector &NodeSizes, return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts); } -std::vector -llvm::applyCDSLayout(const CDSortConfig &Config, - const std::vector &FuncSizes, - const std::vector &FuncCounts, - const std::vector &CallCounts, - const std::vector &CallOffsets) { +std::vector codelayout::computeCacheDirectedLayout( + const CDSortConfig &Config, ArrayRef FuncSizes, + ArrayRef FuncCounts, ArrayRef CallCounts, + ArrayRef CallOffsets) { // Verify correctness of the input data. assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input"); // Apply the reordering algorithm. CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets); - std::vector Result; - Alg.run(Result); - - // Verify correctness of the output. + std::vector Result = Alg.run(); assert(Result.size() == FuncSizes.size() && "Incorrect size of layout"); return Result; } -std::vector -llvm::applyCDSLayout(const std::vector &FuncSizes, - const std::vector &FuncCounts, - const std::vector &CallCounts, - const std::vector &CallOffsets) { +std::vector codelayout::computeCacheDirectedLayout( + ArrayRef FuncSizes, ArrayRef FuncCounts, + ArrayRef CallCounts, ArrayRef CallOffsets) { CDSortConfig Config; // Populate the config from the command-line options. if (CacheEntries.getNumOccurrences() > 0) @@ -1467,5 +1450,6 @@ llvm::applyCDSLayout(const std::vector &FuncSizes, Config.DistancePower = DistancePower; if (FrequencyScale.getNumOccurrences() > 0) Config.FrequencyScale = FrequencyScale; - return applyCDSLayout(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets); + return computeCacheDirectedLayout(Config, FuncSizes, FuncCounts, CallCounts, + CallOffsets); } -- Gitee From 0d418bc528557a9d81020568008030095cac0684 Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Fri, 15 Sep 2023 14:57:22 +0400 Subject: [PATCH 46/94] [Backport][BOLT][AArch64] Fix instrumentation deadloop According to ARMv8-a architecture reference manual B2.10.5 software must avoid having any explicit memory accesses between exclusive load and associated store instruction. Otherwise exclusive monitor might clear the exclusivity without application-related cause which may result in the deadloop. Disable instrumentation for such functions, since between exclusive load and store there might be branches and we would insert instrumentation snippet which contains loads and stores. The better solution would be to analyze with BFS finding the exact BBs between load and store and not instrumenting them. Or even better to recognize such sequences and replace them with more complex one, e.g. loading value non exclusively, and for the brach where exclusive store is made make exclusive load and store sequentially, but for now just disable instrumentation for such functions completely. Differential Revision: https://reviews.llvm.org/D159520 --- bolt/include/bolt/Core/MCPlusBuilder.h | 5 +++ bolt/lib/Passes/Instrumentation.cpp | 22 +++++++++++ .../Target/AArch64/AArch64MCPlusBuilder.cpp | 28 +++++++++++++ bolt/test/AArch64/exclusive-instrument.s | 39 +++++++++++++++++++ 4 files changed, 94 insertions(+) create mode 100644 bolt/test/AArch64/exclusive-instrument.s diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 83e9cfb7567c..07c54a60abbd 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -626,6 +626,11 @@ public: return Info->get(Inst.getOpcode()).mayStore(); } + virtual bool isAArch64Exclusive(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + virtual bool isCleanRegXOR(const MCInst &Inst) const { llvm_unreachable("not implemented"); return false; diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp index 98044599d497..72adb319d71d 100644 --- a/bolt/lib/Passes/Instrumentation.cpp +++ b/bolt/lib/Passes/Instrumentation.cpp @@ -13,6 +13,7 @@ #include "bolt/Passes/Instrumentation.h" #include "bolt/Core/ParallelUtilities.h" #include "bolt/RuntimeLibs/InstrumentationRuntimeLibrary.h" +#include "bolt/Utils/CommandLineOpts.h" #include "bolt/Utils/Utils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/RWMutex.h" @@ -85,6 +86,24 @@ cl::opt InstrumentCalls("instrument-calls", namespace llvm { namespace bolt { +static bool hasAArch64ExclusiveMemop(BinaryFunction &Function) { + // FIXME ARMv8-a architecture reference manual says that software must avoid + // having any explicit memory accesses between exclusive load and associated + // store instruction. So for now skip instrumentation for functions that have + // these instructions, since it might lead to runtime deadlock. + BinaryContext &BC = Function.getBinaryContext(); + for (const BinaryBasicBlock &BB : Function) + for (const MCInst &Inst : BB) + if (BC.MIB->isAArch64Exclusive(Inst)) { + if (opts::Verbosity >= 1) + outs() << "BOLT-INSTRUMENTER: Function " << Function + << " has exclusive instructions, skip instrumentation\n"; + return true; + } + + return false; +} + uint32_t Instrumentation::getFunctionNameIndex(const BinaryFunction &Function) { auto Iter = FuncToStringIdx.find(&Function); if (Iter != FuncToStringIdx.end()) @@ -288,6 +307,9 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function, if (BC.isMachO() && Function.hasName("___GLOBAL_init_65535/1")) return; + if (BC.isAArch64() && hasAArch64ExclusiveMemop(Function)) + return; + SplitWorklistTy SplitWorklist; SplitInstrsTy SplitInstrs; diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 6623f9f8e0a3..466fedeb0171 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -272,6 +272,34 @@ public: return isLDRB(Inst) || isLDRH(Inst) || isLDRW(Inst) || isLDRX(Inst); } + bool isAArch64Exclusive(const MCInst &Inst) const override { + return (Inst.getOpcode() == AArch64::LDXPX || + Inst.getOpcode() == AArch64::LDXPW || + Inst.getOpcode() == AArch64::LDXRX || + Inst.getOpcode() == AArch64::LDXRW || + Inst.getOpcode() == AArch64::LDXRH || + Inst.getOpcode() == AArch64::LDXRB || + Inst.getOpcode() == AArch64::STXPX || + Inst.getOpcode() == AArch64::STXPW || + Inst.getOpcode() == AArch64::STXRX || + Inst.getOpcode() == AArch64::STXRW || + Inst.getOpcode() == AArch64::STXRH || + Inst.getOpcode() == AArch64::STXRB || + Inst.getOpcode() == AArch64::LDAXPX || + Inst.getOpcode() == AArch64::LDAXPW || + Inst.getOpcode() == AArch64::LDAXRX || + Inst.getOpcode() == AArch64::LDAXRW || + Inst.getOpcode() == AArch64::LDAXRH || + Inst.getOpcode() == AArch64::LDAXRB || + Inst.getOpcode() == AArch64::STLXPX || + Inst.getOpcode() == AArch64::STLXPW || + Inst.getOpcode() == AArch64::STLXRX || + Inst.getOpcode() == AArch64::STLXRW || + Inst.getOpcode() == AArch64::STLXRH || + Inst.getOpcode() == AArch64::STLXRB || + Inst.getOpcode() == AArch64::CLREX); + } + bool isLoadFromStack(const MCInst &Inst) const { if (!mayLoad(Inst)) return false; diff --git a/bolt/test/AArch64/exclusive-instrument.s b/bolt/test/AArch64/exclusive-instrument.s new file mode 100644 index 000000000000..502dd83b2f2a --- /dev/null +++ b/bolt/test/AArch64/exclusive-instrument.s @@ -0,0 +1,39 @@ +// This test checks that the foo function having exclusive memory access +// instructions won't be instrumented. + +// REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}} + +// RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +// RUN: %s -o %t.o +// RUN: %clang %cflags -fPIC -pie %t.o -o %t.exe -nostdlib -Wl,-q -Wl,-fini=dummy +// RUN: llvm-bolt %t.exe -o %t.bolt -instrument -v=1 | FileCheck %s + +// CHECK: Function foo has exclusive instructions, skip instrumentation + +.global foo +.type foo, %function +foo: + ldaxr w9, [x10] + cbnz w9, .Lret + stlxr w12, w11, [x9] + cbz w12, foo + clrex +.Lret: + ret +.size foo, .-foo + +.global _start +.type _start, %function +_start: + cmp x0, #0 + b.eq .Lexit + bl foo +.Lexit: + ret +.size _start, .-_start + +.global dummy +.type dummy, %function +dummy: + ret +.size dummy, .-dummy -- Gitee From e125094d4b79329c773b753aae014dd403086816 Mon Sep 17 00:00:00 2001 From: Kepontry Date: Mon, 25 Sep 2023 21:48:32 +0800 Subject: [PATCH 47/94] [Backport][BOLT] Implement '--assume-abi' option for AArch64 This patch implements the `getCalleeSavedRegs` function for AArch64, addressing the issue where the "not implemented" error occurs when both the `--assume-abi` option and options related to the RegAnalysis Pass (e.g., `--indirect-call-promotion=all`) are enabled. --- bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 16 ++++++++++++++++ bolt/test/assume-abi.test | 7 +++++++ 2 files changed, 23 insertions(+) create mode 100644 bolt/test/assume-abi.test diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 466fedeb0171..bf77244102a2 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -471,6 +471,22 @@ public: return true; } + void getCalleeSavedRegs(BitVector &Regs) const override { + Regs |= getAliases(AArch64::X18); + Regs |= getAliases(AArch64::X19); + Regs |= getAliases(AArch64::X20); + Regs |= getAliases(AArch64::X21); + Regs |= getAliases(AArch64::X22); + Regs |= getAliases(AArch64::X23); + Regs |= getAliases(AArch64::X24); + Regs |= getAliases(AArch64::X25); + Regs |= getAliases(AArch64::X26); + Regs |= getAliases(AArch64::X27); + Regs |= getAliases(AArch64::X28); + Regs |= getAliases(AArch64::LR); + Regs |= getAliases(AArch64::FP); + } + const MCExpr *getTargetExprFor(MCInst &Inst, const MCExpr *Expr, MCContext &Ctx, uint64_t RelType) const override { diff --git a/bolt/test/assume-abi.test b/bolt/test/assume-abi.test new file mode 100644 index 000000000000..688ab011441d --- /dev/null +++ b/bolt/test/assume-abi.test @@ -0,0 +1,7 @@ +# Validate the usage of the `--assume-abi` option in conjunction with +# options related to the RegAnalysis Pass. + +REQUIRES: system-linux + +RUN: %clang %cflags %p/Inputs/hello.c -o %t -Wl,-q +RUN: llvm-bolt %t -o %t.bolt --assume-abi --indirect-call-promotion=all -- Gitee From c41cdebb93fd9644dc4204e9241470ddf17c3b74 Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Wed, 27 Sep 2023 10:31:25 +0400 Subject: [PATCH 48/94] [Backport][BOLT][AArch64] Fix CI alignment Fix alignment calculation for CI. Differential Revision: https://reviews.llvm.org/D159548 --- bolt/lib/Passes/Aligner.cpp | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/bolt/lib/Passes/Aligner.cpp b/bolt/lib/Passes/Aligner.cpp index ef419bb6baaa..c5b63d881e17 100644 --- a/bolt/lib/Passes/Aligner.cpp +++ b/bolt/lib/Passes/Aligner.cpp @@ -158,25 +158,27 @@ void AlignerPass::runOnFunctions(BinaryContext &BC) { BinaryContext::IndependentCodeEmitter Emitter = BC.createIndependentMCCodeEmitter(); - if (opts::UseCompactAligner) - alignCompact(BF, Emitter.MCE.get()); - else - alignMaxBytes(BF); - // Align objects that contains constant islands and no code // to at least 8 bytes. if (!BF.size() && BF.hasIslandsInfo()) { - const uint16_t Alignment = BF.getConstantIslandAlignment(); - if (BF.getAlignment() < Alignment) - BF.setAlignment(Alignment); - - if (BF.getMaxAlignmentBytes() < Alignment) - BF.setMaxAlignmentBytes(Alignment); - - if (BF.getMaxColdAlignmentBytes() < Alignment) - BF.setMaxColdAlignmentBytes(Alignment); + uint16_t Alignment = BF.getConstantIslandAlignment(); + // Check if we're forcing output alignment and it is greater than minimal + // CI required one + if (!opts::UseCompactAligner && Alignment < opts::AlignFunctions && + opts::AlignFunctions <= opts::AlignFunctionsMaxBytes) + Alignment = opts::AlignFunctions; + + BF.setAlignment(Alignment); + BF.setMaxAlignmentBytes(Alignment); + BF.setMaxColdAlignmentBytes(Alignment); + return; } + if (opts::UseCompactAligner) + alignCompact(BF, Emitter.MCE.get()); + else + alignMaxBytes(BF); + if (opts::AlignBlocks && !opts::PreserveBlocksAlignment) alignBlocks(BF, Emitter.MCE.get()); }; -- Gitee From d16a088d664565c092cd7f6095dcfae479ab8ba4 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 27 Jul 2023 21:08:38 -0700 Subject: [PATCH 49/94] [Backport][BOLT][test] Add --show-all-symbols to llvm-objdump -d command llvm-objdump -d has been changed to not display mapping symbols by default. --- bolt/test/AArch64/constant_island_pie_update.s | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/test/AArch64/constant_island_pie_update.s b/bolt/test/AArch64/constant_island_pie_update.s index c6856988d52f..f5f6f5d2f1e3 100644 --- a/bolt/test/AArch64/constant_island_pie_update.s +++ b/bolt/test/AArch64/constant_island_pie_update.s @@ -8,13 +8,13 @@ # RUN: %clang %cflags -fPIC -pie %t.o -o %t.rela.exe -nostdlib \ # RUN: -Wl,-q -Wl,-z,notext # RUN: llvm-bolt %t.rela.exe -o %t.rela.bolt --use-old-text=0 --lite=0 -# RUN: llvm-objdump -j .text -d %t.rela.bolt | FileCheck %s +# RUN: llvm-objdump -j .text -d --show-all-symbols %t.rela.bolt | FileCheck %s # RUN: llvm-readelf -rsW %t.rela.bolt | FileCheck --check-prefix=ELFCHECK %s // .relr.dyn # RUN: %clang %cflags -fPIC -pie %t.o -o %t.relr.exe -nostdlib \ # RUN: -Wl,-q -Wl,-z,notext -Wl,--pack-dyn-relocs=relr # RUN: llvm-bolt %t.relr.exe -o %t.relr.bolt --use-old-text=0 --lite=0 -# RUN: llvm-objdump -j .text -d %t.relr.bolt | FileCheck %s +# RUN: llvm-objdump -j .text -d --show-all-symbols %t.relr.bolt | FileCheck %s # RUN: llvm-readelf -rsW %t.relr.bolt | FileCheck --check-prefix=ELFCHECK %s # RUN: llvm-readelf -SW %t.relr.bolt | FileCheck --check-prefix=RELRSZCHECK %s -- Gitee From 8b6ddd0bf586da398da387fe009b13e42eed1484 Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Wed, 27 Sep 2023 10:27:56 +0400 Subject: [PATCH 50/94] [Backport][BOLT] Fix .relr section addend patching The new relocation offset in .relr section patching was calculated wrong previously. Pass the new file offset to lambda instead of re-calculating it in it. Test removes relocation from mytext section, so in case of wrong offset calculation we won't emit right addend value in expected place, i.e. on the new relocation offset. Differential Revision: https://reviews.llvm.org/D159543 --- bolt/lib/Rewrite/RewriteInstance.cpp | 13 ++++------ .../test/AArch64/constant_island_pie_update.s | 25 ++++++++++++++++++- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 151e50283a80..e37f03e14d23 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -4721,9 +4721,11 @@ void RewriteInstance::patchELFAllocatableRelrSection( const uint8_t PSize = BC->AsmInfo->getCodePointerSize(); const uint64_t MaxDelta = ((CHAR_BIT * DynamicRelrEntrySize) - 1) * PSize; - auto FixAddend = [&](const BinarySection &Section, const Relocation &Rel) { + auto FixAddend = [&](const BinarySection &Section, const Relocation &Rel, + uint64_t FileOffset) { // Fix relocation symbol value in place if no static relocation found - // on the same address + // on the same address. We won't check the BF relocations here since it + // is rare case and no optimization is required. if (Section.getRelocationAt(Rel.Offset)) return; @@ -4732,11 +4734,6 @@ void RewriteInstance::patchELFAllocatableRelrSection( if (!Addend) return; - uint64_t FileOffset = Section.getOutputFileOffset(); - if (!FileOffset) - FileOffset = Section.getInputFileOffset(); - - FileOffset += Rel.Offset; OS.pwrite(reinterpret_cast(&Addend), PSize, FileOffset); }; @@ -4758,7 +4755,7 @@ void RewriteInstance::patchELFAllocatableRelrSection( RelOffset = RelOffset == 0 ? SectionAddress + Rel.Offset : RelOffset; assert((RelOffset & 1) == 0 && "Wrong relocation offset"); RelOffsets.emplace(RelOffset); - FixAddend(Section, Rel); + FixAddend(Section, Rel, RelOffset); } } diff --git a/bolt/test/AArch64/constant_island_pie_update.s b/bolt/test/AArch64/constant_island_pie_update.s index f5f6f5d2f1e3..0ab67d07a854 100644 --- a/bolt/test/AArch64/constant_island_pie_update.s +++ b/bolt/test/AArch64/constant_island_pie_update.s @@ -13,8 +13,11 @@ // .relr.dyn # RUN: %clang %cflags -fPIC -pie %t.o -o %t.relr.exe -nostdlib \ # RUN: -Wl,-q -Wl,-z,notext -Wl,--pack-dyn-relocs=relr +# RUN: llvm-objcopy --remove-section .rela.mytext %t.relr.exe # RUN: llvm-bolt %t.relr.exe -o %t.relr.bolt --use-old-text=0 --lite=0 # RUN: llvm-objdump -j .text -d --show-all-symbols %t.relr.bolt | FileCheck %s +# RUN: llvm-objdump -j .text -d %t.relr.bolt | \ +# RUN: FileCheck %s --check-prefix=ADDENDCHECK # RUN: llvm-readelf -rsW %t.relr.bolt | FileCheck --check-prefix=ELFCHECK %s # RUN: llvm-readelf -SW %t.relr.bolt | FileCheck --check-prefix=RELRSZCHECK %s @@ -30,6 +33,11 @@ # CHECK-NEXT: {{.*}} .word 0x{{[0]+}}[[#ADDR]] # CHECK-NEXT: {{.*}} .word 0x00000000 +// Check that addend was properly patched in mytextP with stripped relocations +# ADDENDCHECK: [[#%x,ADDR:]] : +# ADDENDCHECK: {{.*}} : +# ADDENDCHECK-NEXT: {{.*}} .word 0x{{[0]+}}[[#ADDR]] +# ADDENDCHECK-NEXT: {{.*}} .word 0x00000000 // Check that we've relaxed adr to adrp + add to refer external CI # CHECK: : @@ -40,9 +48,10 @@ # ELFCHECK: [[#%x,OFF:]] [[#%x,INFO_DYN:]] R_AARCH64_RELATIVE # ELFCHECK-NEXT: [[#OFF + 8]] {{0*}}[[#INFO_DYN]] R_AARCH64_RELATIVE # ELFCHECK-NEXT: [[#OFF + 24]] {{0*}}[[#INFO_DYN]] R_AARCH64_RELATIVE +# ELFCHECK-NEXT: {{.*}} R_AARCH64_RELATIVE # ELFCHECK: {{.*}}[[#OFF]] {{.*}} $d -// Check that .relr.dyn size is 2 bytes to ensure that last 2 relocations were +// Check that .relr.dyn size is 2 bytes to ensure that last 3 relocations were // encoded as a bitmap so the total section size for 3 relocations is 2 bytes. # RELRSZCHECK: .relr.dyn RELR [[#%x,ADDR:]] [[#%x,OFF:]] {{0*}}10 @@ -81,3 +90,17 @@ addressDynCi: adr x1, .Lci bl _start .size addressDynCi, .-addressDynCi + + .section ".mytext", "ax" + .balign 8 + .global dummy + .type dummy, %function +dummy: + nop + .word 0 + .size dummy, .-dummy + + .global mytextP +mytextP: + .xword exitLocal + .size mytextP, .-mytextP -- Gitee From 3ac118932a706caba858c1234229924a2ceeac78 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Thu, 28 Sep 2023 10:42:24 -0700 Subject: [PATCH 51/94] [Backport][BOLT] Update for rename of MemLifetimePolicy in e994f84c8a6. --- bolt/lib/Rewrite/JITLinkLinker.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/lib/Rewrite/JITLinkLinker.cpp b/bolt/lib/Rewrite/JITLinkLinker.cpp index c57dd5893115..994450c75fcf 100644 --- a/bolt/lib/Rewrite/JITLinkLinker.cpp +++ b/bolt/lib/Rewrite/JITLinkLinker.cpp @@ -31,7 +31,7 @@ bool hasSymbols(const jitlink::Block &B) { Error markSectionsLive(jitlink::LinkGraph &G) { for (auto &Section : G.sections()) { // We only need allocatable sections. - if (Section.getMemLifetimePolicy() == orc::MemLifetimePolicy::NoAlloc) + if (Section.getMemLifetime() == orc::MemLifetime::NoAlloc) continue; // Skip empty sections. -- Gitee From 9ed342d844b86044eb27aac8af8497b916ce6d23 Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Sat, 30 Sep 2023 13:47:41 +0400 Subject: [PATCH 52/94] [Backport][BOLT][NFC] Run ADRRelaxationPass in parallel (#67831) To do this: 1. Protect BC.Ctx with mutex 2. Don't call exit from thread, please check the reason comment near PassFailed variable definition. The other option would be call _Exit instead of exit, but I think we shall call destructors properly. --- bolt/lib/Passes/ADRRelaxationPass.cpp | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/bolt/lib/Passes/ADRRelaxationPass.cpp b/bolt/lib/Passes/ADRRelaxationPass.cpp index 76924d96fcf9..7b612cbf6572 100644 --- a/bolt/lib/Passes/ADRRelaxationPass.cpp +++ b/bolt/lib/Passes/ADRRelaxationPass.cpp @@ -29,7 +29,16 @@ static cl::opt namespace llvm { namespace bolt { +// We don't exit directly from runOnFunction since it would call ThreadPool +// destructor which might result in internal assert if we're not finished +// creating async jobs on the moment of exit. So we're finishing all parallel +// jobs and checking the exit flag after it. +static bool PassFailed = false; + void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { + if (PassFailed) + return; + BinaryContext &BC = BF.getBinaryContext(); for (BinaryBasicBlock &BB : BF) { for (auto It = BB.begin(); It != BB.end(); ++It) { @@ -54,8 +63,12 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { MCPhysReg Reg; BC.MIB->getADRReg(Inst, Reg); int64_t Addend = BC.MIB->getTargetAddend(Inst); - InstructionListType Addr = - BC.MIB->materializeAddress(Symbol, BC.Ctx.get(), Reg, Addend); + InstructionListType Addr; + + { + auto L = BC.scopeLock(); + Addr = BC.MIB->materializeAddress(Symbol, BC.Ctx.get(), Reg, Addend); + } if (It != BB.begin() && BC.MIB->isNoop(*std::prev(It))) { It = BB.eraseInstruction(std::prev(It)); @@ -68,7 +81,8 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { errs() << formatv("BOLT-ERROR: Cannot relax adr in non-simple function " "{0}. Can't proceed in current mode.\n", BF.getOneName()); - exit(1); + PassFailed = true; + return; } It = BB.replaceInstruction(It, Addr); } @@ -85,7 +99,10 @@ void ADRRelaxationPass::runOnFunctions(BinaryContext &BC) { ParallelUtilities::runOnEachFunction( BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, nullptr, - "ADRRelaxationPass", /* ForceSequential */ true); + "ADRRelaxationPass"); + + if (PassFailed) + exit(1); } } // end namespace bolt -- Gitee From 1e532fce64f50623df36ffe3fd18db6084ad5707 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 4 Oct 2023 17:57:17 -0700 Subject: [PATCH 53/94] [Backport][BOLT] Fix 32-bit overflow in checkOffsets/checkVMA (#68274) --- bolt/lib/Rewrite/RewriteInstance.cpp | 10 +++++--- bolt/test/checkvma-large-section.test | 35 +++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 bolt/test/checkvma-large-section.test diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index e37f03e14d23..04235d61fb56 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -408,8 +408,9 @@ static bool checkOffsets(const typename ELFT::Phdr &Phdr, return true; // Only non-empty sections can be at the end of a segment. - uint64_t SectionSize = Sec.sh_size ? Sec.sh_size : 1; - AddressRange SectionAddressRange(Sec.sh_offset, Sec.sh_offset + SectionSize); + uint64_t SectionSize = Sec.sh_size ? Sec.sh_size : 1ull; + AddressRange SectionAddressRange((uint64_t)Sec.sh_offset, + Sec.sh_offset + SectionSize); AddressRange SegmentAddressRange(Phdr.p_offset, Phdr.p_offset + Phdr.p_filesz); if (SegmentAddressRange.contains(SectionAddressRange)) @@ -425,8 +426,9 @@ template static bool checkVMA(const typename ELFT::Phdr &Phdr, const typename ELFT::Shdr &Sec, bool &Overlap) { // Only non-empty sections can be at the end of a segment. - uint64_t SectionSize = Sec.sh_size ? Sec.sh_size : 1; - AddressRange SectionAddressRange(Sec.sh_addr, Sec.sh_addr + SectionSize); + uint64_t SectionSize = Sec.sh_size ? Sec.sh_size : 1ull; + AddressRange SectionAddressRange((uint64_t)Sec.sh_addr, + Sec.sh_addr + SectionSize); AddressRange SegmentAddressRange(Phdr.p_vaddr, Phdr.p_vaddr + Phdr.p_memsz); if (SegmentAddressRange.contains(SectionAddressRange)) diff --git a/bolt/test/checkvma-large-section.test b/bolt/test/checkvma-large-section.test new file mode 100644 index 000000000000..36a915951115 --- /dev/null +++ b/bolt/test/checkvma-large-section.test @@ -0,0 +1,35 @@ +# This test reproduces the issue with a section which ends at >4G address +REQUIRES: asserts +RUN: split-file %s %t +RUN: yaml2obj %t/yaml -o %t.exe --max-size=0 +RUN: llvm-bolt %t.exe -o /dev/null --allow-stripped +#--- yaml +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_LOAD + FirstSec: .a + LastSec: .a + Align: 0x1000 + - Type: PT_LOAD + Flags: [ PF_R, PF_W ] + FirstSec: .large_sec + LastSec: .large_sec + VAddr: 0x4a0279a8 + - Type: PT_GNU_RELRO + Flags: [ PF_R ] +Sections: + - Name: .a + Type: SHT_PROGBITS + Content: 00 + AddressAlign: 0x1 + - Name: .large_sec + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4a0279a8 + Size: 0xdf8bb1a0 +... -- Gitee From dc5c6378e5c269a4a136284e2ad1c597f04b084c Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Sat, 9 Sep 2023 08:22:37 +0000 Subject: [PATCH 54/94] [Backport][BOLT][RISCV] Implement R_RISCV_PCREL_LO12_S (#65204) Relocation used for store instructions. --- bolt/lib/Core/Relocation.cpp | 9 +++++++++ bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp | 2 ++ bolt/test/RISCV/reloc-pcrel.s | 4 ++++ 3 files changed, 15 insertions(+) diff --git a/bolt/lib/Core/Relocation.cpp b/bolt/lib/Core/Relocation.cpp index 240c88744717..208a8c4d3557 100644 --- a/bolt/lib/Core/Relocation.cpp +++ b/bolt/lib/Core/Relocation.cpp @@ -101,6 +101,7 @@ static bool isSupportedRISCV(uint64_t Type) { case ELF::R_RISCV_GOT_HI20: case ELF::R_RISCV_PCREL_HI20: case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: case ELF::R_RISCV_RVC_JUMP: case ELF::R_RISCV_RVC_BRANCH: case ELF::R_RISCV_ADD32: @@ -195,6 +196,7 @@ static size_t getSizeForTypeRISCV(uint64_t Type) { case ELF::R_RISCV_BRANCH: case ELF::R_RISCV_PCREL_HI20: case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: case ELF::R_RISCV_32_PCREL: case ELF::R_RISCV_CALL: case ELF::R_RISCV_CALL_PLT: @@ -480,6 +482,10 @@ static uint64_t extractIImmRISCV(uint32_t Contents) { return SignExtend64<12>(Contents >> 20); } +static uint64_t extractSImmRISCV(uint32_t Contents) { + return SignExtend64<12>(((Contents >> 7) & 0x1f) | ((Contents >> 25) << 5)); +} + static uint64_t extractJImmRISCV(uint32_t Contents) { return SignExtend64<21>( (((Contents >> 21) & 0x3ff) << 1) | (((Contents >> 20) & 0x1) << 11) | @@ -516,6 +522,8 @@ static uint64_t extractValueRISCV(uint64_t Type, uint64_t Contents, return extractUImmRISCV(Contents); case ELF::R_RISCV_PCREL_LO12_I: return extractIImmRISCV(Contents); + case ELF::R_RISCV_PCREL_LO12_S: + return extractSImmRISCV(Contents); case ELF::R_RISCV_RVC_JUMP: return SignExtend64<11>(Contents >> 2); case ELF::R_RISCV_RVC_BRANCH: @@ -692,6 +700,7 @@ static bool isPCRelativeRISCV(uint64_t Type) { case ELF::R_RISCV_GOT_HI20: case ELF::R_RISCV_PCREL_HI20: case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: case ELF::R_RISCV_RVC_JUMP: case ELF::R_RISCV_RVC_BRANCH: case ELF::R_RISCV_32_PCREL: diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp index badc1bde80b5..d13eb22f9582 100644 --- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp +++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp @@ -42,6 +42,7 @@ public: case ELF::R_RISCV_GOT_HI20: case ELF::R_RISCV_PCREL_HI20: case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: return true; default: llvm_unreachable("Unexpected RISCV relocation type in code"); @@ -352,6 +353,7 @@ public: case ELF::R_RISCV_PCREL_HI20: return RISCVMCExpr::create(Expr, RISCVMCExpr::VK_RISCV_PCREL_HI, Ctx); case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: return RISCVMCExpr::create(Expr, RISCVMCExpr::VK_RISCV_PCREL_LO, Ctx); case ELF::R_RISCV_CALL: return RISCVMCExpr::create(Expr, RISCVMCExpr::VK_RISCV_CALL, Ctx); diff --git a/bolt/test/RISCV/reloc-pcrel.s b/bolt/test/RISCV/reloc-pcrel.s index 2d5a349d03e7..36b132727291 100644 --- a/bolt/test/RISCV/reloc-pcrel.s +++ b/bolt/test/RISCV/reloc-pcrel.s @@ -18,5 +18,9 @@ _start: // CHECK: auipc t0, %pcrel_hi(d) // CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp0)(t0) ld t0, d +// CHECK: .Ltmp1 +// CHECK: auipc t1, %pcrel_hi(d) +// CHECK-NEXT: sd t0, %pcrel_lo(.Ltmp1)(t1) + sd t0, d, t1 ret .size _start, .-_start -- Gitee From affa5e4f365f150cca3c45fa8d371b927bd7de2f Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Fri, 6 Oct 2023 06:46:16 +0000 Subject: [PATCH 55/94] [Backport][BOLT] Improve handling of relocations targeting specific instructions (#66395) On RISC-V, there are certain relocations that target a specific instruction instead of a more abstract location like a function or basic block. Take the following example that loads a value from symbol `foo`: ``` nop 1: auipc t0, %pcrel_hi(foo) ld t0, %pcrel_lo(1b)(t0) ``` This results in two relocation: - auipc: `R_RISCV_PCREL_HI20` referencing `foo`; - ld: `R_RISCV_PCREL_LO12_I` referencing to local label `1` which points to the auipc instruction. It is of utmost importance that the `R_RISCV_PCREL_LO12_I` keeps referring to the auipc instruction; if not, the program will fail to assemble. However, BOLT currently does not guarantee this. BOLT currently assumes that all local symbols are jump targets and always starts a new basic block at symbol locations. The example above results in a CFG the looks like this: ``` .BB0: nop .BB1: auipc t0, %pcrel_hi(foo) ld t0, %pcrel_lo(.BB1)(t0) ``` While this currently works (i.e., the `R_RISCV_PCREL_LO12_I` relocation points to the correct instruction), it has two downsides: - Too many basic blocks are created (the example above is logically only one yet two are created); - If instructions are inserted in `.BB1` (e.g., by instrumentation), things will break since the label will not point to the auipc anymore. This patch proposes to fix this issue by teaching BOLT to track labels that should always point to a specific instruction. This is implemented as follows: - Add a new annotation type (`kLabel`) that allows us to annotate instructions with an `MCSymbol *`; - Whenever we encounter a relocation type that is used to refer to a specific instruction (`Relocation::isInstructionReference`), we register it without a symbol; - During disassembly, whenever we encounter an instruction with such a relocation, create a symbol for its target and store it in an offset to symbol map (to ensure multiple relocations referencing the same instruction use the same label); - After disassembly, iterate this map to attach labels to instructions via the new annotation type; - During emission, emit these labels right before the instruction. I believe the use of annotations works quite well for this use case as it allows us to reliably track instruction labels. If we were to store them as offsets in basic blocks, it would be error prone to keep them updated whenever instructions are inserted or removed. I have chosen to add labels as first-class annotations (as opposed to a generic one) because the documentation of `MCAnnotation` suggests that generic annotations are to be used for optional metadata that can be discarded without affecting correctness. As this is not the case for labels, a first-class annotation seemed more appropriate. --- bolt/include/bolt/Core/MCPlus.h | 1 + bolt/include/bolt/Core/MCPlusBuilder.h | 7 +++++ bolt/include/bolt/Core/Relocation.h | 4 +++ bolt/lib/Core/BinaryContext.cpp | 2 ++ bolt/lib/Core/BinaryEmitter.cpp | 3 ++ bolt/lib/Core/BinaryFunction.cpp | 32 ++++++++++++++++++-- bolt/lib/Core/MCPlusBuilder.cpp | 11 +++++++ bolt/lib/Core/Relocation.cpp | 13 ++++++++ bolt/lib/Passes/BinaryPasses.cpp | 5 +++ bolt/lib/Rewrite/RewriteInstance.cpp | 12 +++++++- bolt/test/RISCV/reloc-abs.s | 3 +- bolt/test/RISCV/reloc-bb-split.s | 42 ++++++++++++++++++++++++++ bolt/test/RISCV/reloc-got.s | 3 +- bolt/test/RISCV/reloc-pcrel.s | 6 ++-- 14 files changed, 133 insertions(+), 11 deletions(-) create mode 100644 bolt/test/RISCV/reloc-bb-split.s diff --git a/bolt/include/bolt/Core/MCPlus.h b/bolt/include/bolt/Core/MCPlus.h index b4a72ac274fa..31cc9071de76 100644 --- a/bolt/include/bolt/Core/MCPlus.h +++ b/bolt/include/bolt/Core/MCPlus.h @@ -66,6 +66,7 @@ public: kTailCall, /// Tail call. kConditionalTailCall, /// CTC. kOffset, /// Offset in the function. + kLabel, /// MCSymbol pointing to this instruction. kGeneric /// First generic annotation. }; diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 07c54a60abbd..b9ccf53919cc 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -1179,6 +1179,13 @@ public: /// Remove offset annotation. bool clearOffset(MCInst &Inst); + /// Return the label of \p Inst, if available. + std::optional getLabel(const MCInst &Inst) const; + + /// Set the label of \p Inst. This label will be emitted right before \p Inst + /// is emitted to MCStreamer. + bool setLabel(MCInst &Inst, MCSymbol *Label); + /// Return MCSymbol that represents a target of this instruction at a given /// operand number \p OpNum. If there's no symbol associated with /// the operand - return nullptr. diff --git a/bolt/include/bolt/Core/Relocation.h b/bolt/include/bolt/Core/Relocation.h index 5ae288a91986..1ddba9d78b3b 100644 --- a/bolt/include/bolt/Core/Relocation.h +++ b/bolt/include/bolt/Core/Relocation.h @@ -97,6 +97,10 @@ struct Relocation { /// Return true if relocation type is for thread local storage. static bool isTLS(uint64_t Type); + /// Return true of relocation type is for referencing a specific instruction + /// (as opposed to a function, basic block, etc). + static bool isInstructionReference(uint64_t Type); + /// Return code for a NONE relocation static uint64_t getNone(); diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index ffecc5209804..4c67f41300b6 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -1863,6 +1863,8 @@ void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction, } if (std::optional Offset = MIB->getOffset(Instruction)) OS << " # Offset: " << *Offset; + if (auto Label = MIB->getLabel(Instruction)) + OS << " # Label: " << **Label; MIB->printAnnotations(Instruction, OS); diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index 95ab63521c06..b1ee6cc221d7 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -498,6 +498,9 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF, BB->getLocSyms().emplace_back(Offset, LocSym); } + if (auto Label = BC.MIB->getLabel(Instr)) + Streamer.emitLabel(*Label); + Streamer.emitInstruction(Instr, *BC.STI); LastIsPrefix = BC.MIB->isPrefix(Instr); } diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 5905c10afe82..1aa6af92f4be 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -1173,6 +1173,13 @@ bool BinaryFunction::disassemble() { // basic block. Labels[0] = Ctx->createNamedTempSymbol("BB0"); + // Map offsets in the function to a label that should always point to the + // corresponding instruction. This is used for labels that shouldn't point to + // the start of a basic block but always to a specific instruction. This is + // used, for example, on RISC-V where %pcrel_lo relocations point to the + // corresponding %pcrel_hi. + LabelsMapType InstructionLabels; + uint64_t Size = 0; // instruction size for (uint64_t Offset = 0; Offset < getSize(); Offset += Size) { MCInst Instruction; @@ -1329,9 +1336,23 @@ bool BinaryFunction::disassemble() { ItrE = Relocations.lower_bound(Offset + Size); Itr != ItrE; ++Itr) { const Relocation &Relocation = Itr->second; + MCSymbol *Symbol = Relocation.Symbol; + + if (Relocation::isInstructionReference(Relocation.Type)) { + uint64_t RefOffset = Relocation.Value - getAddress(); + LabelsMapType::iterator LI = InstructionLabels.find(RefOffset); + + if (LI == InstructionLabels.end()) { + Symbol = BC.Ctx->createNamedTempSymbol(); + InstructionLabels.emplace(RefOffset, Symbol); + } else { + Symbol = LI->second; + } + } + int64_t Value = Relocation.Value; const bool Result = BC.MIB->replaceImmWithSymbolRef( - Instruction, Relocation.Symbol, Relocation.Addend, Ctx.get(), Value, + Instruction, Symbol, Relocation.Addend, Ctx.get(), Value, Relocation.Type); (void)Result; assert(Result && "cannot replace immediate with relocation"); @@ -1366,6 +1387,13 @@ add_instruction: addInstruction(Offset, std::move(Instruction)); } + for (auto [Offset, Label] : InstructionLabels) { + InstrMapType::iterator II = Instructions.find(Offset); + assert(II != Instructions.end() && "reference to non-existing instruction"); + + BC.MIB->setLabel(II->second, Label); + } + // Reset symbolizer for the disassembler. BC.SymbolicDisAsm->setSymbolizer(nullptr); @@ -4487,7 +4515,7 @@ void BinaryFunction::addRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t Offset = Address - getAddress(); LLVM_DEBUG(dbgs() << "BOLT-DEBUG: addRelocation in " << formatv("{0}@{1:x} against {2}\n", *this, Offset, - Symbol->getName())); + (Symbol ? Symbol->getName() : ""))); bool IsCI = BC.isAArch64() && isInConstantIsland(Address); std::map &Rels = IsCI ? Islands->Relocations : Relocations; diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp index 027cef1063ee..0a5eb44e4876 100644 --- a/bolt/lib/Core/MCPlusBuilder.cpp +++ b/bolt/lib/Core/MCPlusBuilder.cpp @@ -268,6 +268,17 @@ bool MCPlusBuilder::clearOffset(MCInst &Inst) { return true; } +std::optional MCPlusBuilder::getLabel(const MCInst &Inst) const { + if (auto Label = tryGetAnnotationAs(Inst, MCAnnotation::kLabel)) + return *Label; + return std::nullopt; +} + +bool MCPlusBuilder::setLabel(MCInst &Inst, MCSymbol *Label) { + getOrCreateAnnotationAs(Inst, MCAnnotation::kLabel) = Label; + return true; +} + bool MCPlusBuilder::hasAnnotation(const MCInst &Inst, unsigned Index) const { const MCInst *AnnotationInst = getAnnotationInst(Inst); if (!AnnotationInst) diff --git a/bolt/lib/Core/Relocation.cpp b/bolt/lib/Core/Relocation.cpp index 208a8c4d3557..a20a3f46c7d0 100644 --- a/bolt/lib/Core/Relocation.cpp +++ b/bolt/lib/Core/Relocation.cpp @@ -797,6 +797,19 @@ bool Relocation::isTLS(uint64_t Type) { return isTLSX86(Type); } +bool Relocation::isInstructionReference(uint64_t Type) { + if (Arch != Triple::riscv64) + return false; + + switch (Type) { + default: + return false; + case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: + return true; + } +} + uint64_t Relocation::getNone() { if (Arch == Triple::aarch64) return ELF::R_AARCH64_NONE; diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index bb760ea93ad1..3ba53d7b2b79 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -575,6 +575,7 @@ bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const { void LowerAnnotations::runOnFunctions(BinaryContext &BC) { std::vector> PreservedOffsetAnnotations; + std::vector> PreservedLabelAnnotations; for (auto &It : BC.getBinaryFunctions()) { BinaryFunction &BF = It.second; @@ -609,6 +610,8 @@ void LowerAnnotations::runOnFunctions(BinaryContext &BC) { if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II)) PreservedOffsetAnnotations.emplace_back(&(*II), *BC.MIB->getOffset(*II)); + if (auto Label = BC.MIB->getLabel(*II)) + PreservedLabelAnnotations.emplace_back(&*II, *Label); BC.MIB->stripAnnotations(*II); } } @@ -625,6 +628,8 @@ void LowerAnnotations::runOnFunctions(BinaryContext &BC) { // Reinsert preserved annotations we need during code emission. for (const std::pair &Item : PreservedOffsetAnnotations) BC.MIB->setOffset(*Item.first, Item.second); + for (auto [Instr, Label] : PreservedLabelAnnotations) + BC.MIB->setLabel(*Instr, Label); } // Check for dirty state in MCSymbol objects that might be a consequence diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 04235d61fb56..049e766f2328 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -2531,7 +2531,17 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection, // Adjust the point of reference to a code location inside a function. if (ReferencedBF->containsAddress(Address, /*UseMaxSize = */ true)) { RefFunctionOffset = Address - ReferencedBF->getAddress(); - if (RefFunctionOffset) { + if (Relocation::isInstructionReference(RType)) { + // Instruction labels are created while disassembling so we just leave + // the symbol empty for now. Since the extracted value is typically + // unrelated to the referenced symbol (e.g., %pcrel_lo in RISC-V + // references an instruction but the patched value references the low + // bits of a data address), we set the extracted value to the symbol + // address in order to be able to correctly reconstruct the reference + // later. + ReferencedSymbol = nullptr; + ExtractedValue = Address; + } else if (RefFunctionOffset) { if (ContainingBF && ContainingBF != ReferencedBF) { ReferencedSymbol = ReferencedBF->addEntryPointAtOffset(RefFunctionOffset); diff --git a/bolt/test/RISCV/reloc-abs.s b/bolt/test/RISCV/reloc-abs.s index 3e4b8b1395e1..5b728f092b3c 100644 --- a/bolt/test/RISCV/reloc-abs.s +++ b/bolt/test/RISCV/reloc-abs.s @@ -17,8 +17,7 @@ _start: .option push .option norelax 1: -// CHECK: .Ltmp0 -// CHECK: auipc gp, %pcrel_hi(__global_pointer$) +// CHECK: auipc gp, %pcrel_hi(__global_pointer$) # Label: .Ltmp0 // CHECK-NEXT: addi gp, gp, %pcrel_lo(.Ltmp0) auipc gp, %pcrel_hi(__global_pointer$) addi gp, gp, %pcrel_lo(1b) diff --git a/bolt/test/RISCV/reloc-bb-split.s b/bolt/test/RISCV/reloc-bb-split.s new file mode 100644 index 000000000000..5995562cf130 --- /dev/null +++ b/bolt/test/RISCV/reloc-bb-split.s @@ -0,0 +1,42 @@ +// RUN: %clang %cflags -o %t %s +// RUN: llvm-bolt --print-cfg --print-only=_start -o /dev/null %t \ +// RUN: | FileCheck %s + + .data + .globl d + .p2align 3 +d: + .dword 0 + + .text + .globl _start + .p2align 1 +// CHECK-LABEL: Binary Function "_start" after building cfg { +_start: +/// The local label is used for %pcrel_lo as well as a jump target so a new +/// basic block should start there. +// CHECK-LABEL: {{^}}.LBB00 +// CHECK: nop +// CHECK-LABEL: {{^}}.Ltmp0 +// CHECK: auipc t0, %pcrel_hi(d) # Label: .Ltmp1 +// CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp1)(t0) +// CHECK-NEXT: j .Ltmp0 + nop +1: + auipc t0, %pcrel_hi(d) + ld t0, %pcrel_lo(1b)(t0) + j 1b + +/// The local label is used only for %pcrel_lo so no new basic block should +/// start there. +// CHECK-LABEL: {{^}}.LFT0 +// CHECK: nop +// CHECK-NEXT: auipc t0, %pcrel_hi(d) # Label: .Ltmp2 +// CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp2)(t0) +// CHECK-NEXT: ret + nop +1: + auipc t0, %pcrel_hi(d) + ld t0, %pcrel_lo(1b)(t0) + ret + .size _start, .-_start diff --git a/bolt/test/RISCV/reloc-got.s b/bolt/test/RISCV/reloc-got.s index b6cd61be723b..dcf9d0ea3ffb 100644 --- a/bolt/test/RISCV/reloc-got.s +++ b/bolt/test/RISCV/reloc-got.s @@ -14,8 +14,7 @@ d: // CHECK: Binary Function "_start" after building cfg { _start: nop // Here to not make the _start and .Ltmp0 symbols coincide -// CHECK: .Ltmp0 -// CHECK: auipc t0, %pcrel_hi(__BOLT_got_zero+{{[0-9]+}}) +// CHECK: auipc t0, %pcrel_hi(__BOLT_got_zero+{{[0-9]+}}) # Label: .Ltmp0 // CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp0)(t0) 1: auipc t0, %got_pcrel_hi(d) diff --git a/bolt/test/RISCV/reloc-pcrel.s b/bolt/test/RISCV/reloc-pcrel.s index 36b132727291..3ad3015a0a57 100644 --- a/bolt/test/RISCV/reloc-pcrel.s +++ b/bolt/test/RISCV/reloc-pcrel.s @@ -14,12 +14,10 @@ d: // CHECK: Binary Function "_start" after building cfg { _start: nop // Here to not make the _start and .Ltmp0 symbols coincide -// CHECK: .Ltmp0 -// CHECK: auipc t0, %pcrel_hi(d) +// CHECK: auipc t0, %pcrel_hi(d) # Label: .Ltmp0 // CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp0)(t0) ld t0, d -// CHECK: .Ltmp1 -// CHECK: auipc t1, %pcrel_hi(d) +// CHECK-NEXT: auipc t1, %pcrel_hi(d) # Label: .Ltmp1 // CHECK-NEXT: sd t0, %pcrel_lo(.Ltmp1)(t1) sd t0, d, t1 ret -- Gitee From b9da0f885ac0ad0ae385a508d76da20607b3f274 Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Tue, 10 Oct 2023 13:27:14 +0400 Subject: [PATCH 56/94] [Backport][BOLT][runtime] Add start & fini symbols (#68505) Add absent start & fini symbols, currently setted by bolt for runtime libraries at DT_INIT and DT_FINI. The proper tests would be added by the https://github.com/llvm/llvm-project/pull/67348 PR. --- bolt/lib/Rewrite/RewriteInstance.cpp | 35 +++++++++++++++++++--------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 049e766f2328..88ece58a09bb 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -4581,15 +4581,12 @@ void RewriteInstance::updateELFSymbolTable( } } - assert((!NumHotTextSymsUpdated || NumHotTextSymsUpdated == 2) && - "either none or both __hot_start/__hot_end symbols were expected"); - assert((!NumHotDataSymsUpdated || NumHotDataSymsUpdated == 2) && - "either none or both __hot_data_start/__hot_data_end symbols were " - "expected"); + auto AddSymbol = [&](const StringRef &Name, uint64_t Address) { + if (!Address) + return; - auto addSymbol = [&](const std::string &Name) { ELFSymTy Symbol; - Symbol.st_value = getNewValueForSymbol(Name); + Symbol.st_value = Address; Symbol.st_shndx = ELF::SHN_ABS; Symbol.st_name = AddToStrTab(Name); Symbol.st_size = 0; @@ -4602,14 +4599,30 @@ void RewriteInstance::updateELFSymbolTable( Symbols.emplace_back(Symbol); }; + // Add runtime library start and fini address symbols + if (RuntimeLibrary *RtLibrary = BC->getRuntimeLibrary()) { + AddSymbol("__bolt_runtime_start", RtLibrary->getRuntimeStartAddress()); + AddSymbol("__bolt_runtime_fini", RtLibrary->getRuntimeFiniAddress()); + } + + assert((!NumHotTextSymsUpdated || NumHotTextSymsUpdated == 2) && + "either none or both __hot_start/__hot_end symbols were expected"); + assert((!NumHotDataSymsUpdated || NumHotDataSymsUpdated == 2) && + "either none or both __hot_data_start/__hot_data_end symbols were " + "expected"); + + auto AddEmittedSymbol = [&](const StringRef &Name) { + AddSymbol(Name, getNewValueForSymbol(Name)); + }; + if (opts::HotText && !NumHotTextSymsUpdated) { - addSymbol("__hot_start"); - addSymbol("__hot_end"); + AddEmittedSymbol("__hot_start"); + AddEmittedSymbol("__hot_end"); } if (opts::HotData && !NumHotDataSymsUpdated) { - addSymbol("__hot_data_start"); - addSymbol("__hot_data_end"); + AddEmittedSymbol("__hot_data_start"); + AddEmittedSymbol("__hot_data_end"); } // Put local symbols at the beginning. -- Gitee From 6d8854abbe22e6c5b4f70dc2ca69755de6c85fdd Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Wed, 11 Oct 2023 07:26:20 +0000 Subject: [PATCH 57/94] [Backport][BOLT] Preserve label annotations for injected functions (#68713) Needed for instrumentation on RISC-V. --- bolt/lib/Passes/BinaryPasses.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index 3ba53d7b2b79..e50fa9dea602 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -619,8 +619,11 @@ void LowerAnnotations::runOnFunctions(BinaryContext &BC) { } for (BinaryFunction *BF : BC.getInjectedBinaryFunctions()) for (BinaryBasicBlock &BB : *BF) - for (MCInst &Instruction : BB) + for (MCInst &Instruction : BB) { + if (auto Label = BC.MIB->getLabel(Instruction)) + PreservedLabelAnnotations.emplace_back(&Instruction, *Label); BC.MIB->stripAnnotations(Instruction); + } // Release all memory taken by annotations BC.MIB->freeAnnotations(); -- Gitee From eb5e9ff4e52e7840f442306beb448035c905e164 Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Thu, 12 Oct 2023 09:33:08 +0400 Subject: [PATCH 58/94] [Backport][BOLT] Return proper minimal alignment from BF (#67707) Currently minimal alignment of function is hardcoded to 2 bytes. Add 2 more cases: 1. In case BF is data in code return the alignment of CI as minimal alignment 2. For aarch64 and riscv platforms return the minimal value of 4 (added test for aarch64) Otherwise fallback to returning the 2 as it previously was. --- bolt/include/bolt/Core/BinaryFunction.h | 21 ++++++++++++--- bolt/lib/Core/BinaryEmitter.cpp | 2 +- bolt/lib/Core/BinaryFunction.cpp | 2 -- bolt/lib/Passes/Aligner.cpp | 16 ----------- bolt/lib/Passes/LongJmp.cpp | 4 +-- bolt/test/AArch64/bf_min_alignment.s | 35 +++++++++++++++++++++++++ 6 files changed, 55 insertions(+), 25 deletions(-) create mode 100644 bolt/test/AArch64/bf_min_alignment.s diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index a34b6cfd1f5e..5feef3f178c9 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -192,9 +192,6 @@ public: static constexpr uint64_t COUNT_NO_PROFILE = BinaryBasicBlock::COUNT_NO_PROFILE; - /// We have to use at least 2-byte alignment for functions because of C++ ABI. - static constexpr unsigned MinAlign = 2; - static const char TimerGroupName[]; static const char TimerGroupDesc[]; @@ -1720,8 +1717,24 @@ public: return *this; } - Align getAlign() const { return Align(Alignment); } + uint16_t getMinAlignment() const { + // Align data in code BFs minimum to CI alignment + if (!size() && hasIslandsInfo()) + return getConstantIslandAlignment(); + + // Minimal code alignment on AArch64 and RISCV is 4 + if (BC.isAArch64() || BC.isRISCV()) + return 4; + + // We have to use at least 2-byte alignment for functions because + // of C++ ABI. + return 2; + } + + Align getMinAlign() const { return Align(getMinAlignment()); } + uint16_t getAlignment() const { return Alignment; } + Align getAlign() const { return Align(getAlignment()); } BinaryFunction &setMaxAlignmentBytes(uint16_t MaxAlignBytes) { MaxAlignmentBytes = MaxAlignBytes; diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index b1ee6cc221d7..de80a99a74ed 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -309,7 +309,7 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function, // tentative layout. Section->ensureMinAlignment(Align(opts::AlignFunctions)); - Streamer.emitCodeAlignment(Align(BinaryFunction::MinAlign), &*BC.STI); + Streamer.emitCodeAlignment(Function.getMinAlign(), &*BC.STI); uint16_t MaxAlignBytes = FF.isSplitFragment() ? Function.getMaxColdAlignmentBytes() : Function.getMaxAlignmentBytes(); diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 1aa6af92f4be..14c4925f2e06 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -164,8 +164,6 @@ bool shouldPrint(const BinaryFunction &Function) { namespace llvm { namespace bolt { -constexpr unsigned BinaryFunction::MinAlign; - template static bool emptyRange(const R &Range) { return Range.begin() == Range.end(); } diff --git a/bolt/lib/Passes/Aligner.cpp b/bolt/lib/Passes/Aligner.cpp index c5b63d881e17..7c387525434b 100644 --- a/bolt/lib/Passes/Aligner.cpp +++ b/bolt/lib/Passes/Aligner.cpp @@ -158,22 +158,6 @@ void AlignerPass::runOnFunctions(BinaryContext &BC) { BinaryContext::IndependentCodeEmitter Emitter = BC.createIndependentMCCodeEmitter(); - // Align objects that contains constant islands and no code - // to at least 8 bytes. - if (!BF.size() && BF.hasIslandsInfo()) { - uint16_t Alignment = BF.getConstantIslandAlignment(); - // Check if we're forcing output alignment and it is greater than minimal - // CI required one - if (!opts::UseCompactAligner && Alignment < opts::AlignFunctions && - opts::AlignFunctions <= opts::AlignFunctionsMaxBytes) - Alignment = opts::AlignFunctions; - - BF.setAlignment(Alignment); - BF.setMaxAlignmentBytes(Alignment); - BF.setMaxColdAlignmentBytes(Alignment); - return; - } - if (opts::UseCompactAligner) alignCompact(BF, Emitter.MCE.get()); else diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp index 6f4d1170dbe2..a81689bc3746 100644 --- a/bolt/lib/Passes/LongJmp.cpp +++ b/bolt/lib/Passes/LongJmp.cpp @@ -293,7 +293,7 @@ uint64_t LongJmpPass::tentativeLayoutRelocColdPart( for (BinaryFunction *Func : SortedFunctions) { if (!Func->isSplit()) continue; - DotAddress = alignTo(DotAddress, BinaryFunction::MinAlign); + DotAddress = alignTo(DotAddress, Func->getMinAlignment()); uint64_t Pad = offsetToAlignment(DotAddress, llvm::Align(Func->getAlignment())); if (Pad <= Func->getMaxColdAlignmentBytes()) @@ -352,7 +352,7 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode( DotAddress = alignTo(DotAddress, opts::AlignText); } - DotAddress = alignTo(DotAddress, BinaryFunction::MinAlign); + DotAddress = alignTo(DotAddress, Func->getMinAlignment()); uint64_t Pad = offsetToAlignment(DotAddress, llvm::Align(Func->getAlignment())); if (Pad <= Func->getMaxAlignmentBytes()) diff --git a/bolt/test/AArch64/bf_min_alignment.s b/bolt/test/AArch64/bf_min_alignment.s new file mode 100644 index 000000000000..2dd06b373a79 --- /dev/null +++ b/bolt/test/AArch64/bf_min_alignment.s @@ -0,0 +1,35 @@ +// This tests checks the minimum alignment of the AARch64 function +// is equal to 4. Otherwise the jitlinker would fail to link the +// binary since the size of the first function after reorder is not +// not a multiple of 4. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags -fPIC -pie %t.o -o %t.exe -nostdlib -Wl,-q +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-bolt %t.exe -o %t.bolt --use-old-text=0 --lite=0 \ +# RUN: --align-functions-max-bytes=1 \ +# RUN: --data %t.fdata --reorder-functions=exec-count +# RUN: llvm-nm -n %t.bolt | FileCheck %s + +# CHECK: {{0|4|8|c}} T dummy +# CHECK-NEXT: {{0|4|8|c}} T _start + + .text + .align 4 + .global _start + .type _start, %function +_start: +# FDATA: 0 [unknown] 0 1 _start 0 0 1 + bl dymmy + ret + .size _start, .-_start + + .global dummy + .type dummy, %function +dummy: +# FDATA: 0 [unknown] 0 1 dummy 0 0 42 + adr x0, .Lci + ret +.Lci: + .byte 0 + .size dummy, .-dummy -- Gitee From c40e8e39510b928ed880b5e237bec4a175054520 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Mon, 16 Oct 2023 07:12:30 +0000 Subject: [PATCH 59/94] [Backport][BOLT] Move X86-specific test to X86 subdirectory (#68992) It only works when the X86 target is available. --- bolt/test/{ => X86}/checkvma-large-section.test | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename bolt/test/{ => X86}/checkvma-large-section.test (100%) diff --git a/bolt/test/checkvma-large-section.test b/bolt/test/X86/checkvma-large-section.test similarity index 100% rename from bolt/test/checkvma-large-section.test rename to bolt/test/X86/checkvma-large-section.test -- Gitee From 58d54ee4ee8824dd67994ba020bf0e8b05db54f6 Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Wed, 18 Oct 2023 11:54:26 +0400 Subject: [PATCH 60/94] [Backport][BOLT] Fix instrumentation test (#69383) --- bolt/test/X86/internal-call-instrument-so.s | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bolt/test/X86/internal-call-instrument-so.s b/bolt/test/X86/internal-call-instrument-so.s index b8903fc7f822..d13c828f605c 100644 --- a/bolt/test/X86/internal-call-instrument-so.s +++ b/bolt/test/X86/internal-call-instrument-so.s @@ -1,6 +1,6 @@ # This reproduces a bug with instrumentation crashes on internal call -# REQUIRES: system-linux,bolt-runtime +# REQUIRES: system-linux,bolt-runtime,target=x86_64{{.*}} # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o # Delete our BB symbols so BOLT doesn't mark them as entry points @@ -41,7 +41,6 @@ _start: retq .size _start, .-_start - .globl _fini .type _fini, %function .p2align 4 -- Gitee From 73ecf5af12b62e25443e2344f7617ec56468bb3e Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 18 Oct 2023 05:30:46 -0700 Subject: [PATCH 61/94] [Backport][BOLT][test] Update checkvma-large-section.test (#69419) --- bolt/test/X86/checkvma-large-section.test | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bolt/test/X86/checkvma-large-section.test b/bolt/test/X86/checkvma-large-section.test index 36a915951115..afa44111ead4 100644 --- a/bolt/test/X86/checkvma-large-section.test +++ b/bolt/test/X86/checkvma-large-section.test @@ -19,7 +19,7 @@ ProgramHeaders: Flags: [ PF_R, PF_W ] FirstSec: .large_sec LastSec: .large_sec - VAddr: 0x4a0279a8 + VAddr: 0x80000000 - Type: PT_GNU_RELRO Flags: [ PF_R ] Sections: @@ -28,8 +28,8 @@ Sections: Content: 00 AddressAlign: 0x1 - Name: .large_sec - Type: SHT_PROGBITS + Type: SHT_NOBITS Flags: [ SHF_WRITE, SHF_ALLOC ] - Address: 0x4a0279a8 - Size: 0xdf8bb1a0 + Address: 0x80000000 + Size: 0x80000000 ... -- Gitee From 53a36b41f24935be10295ce8e6df44fbf71302cd Mon Sep 17 00:00:00 2001 From: maksfb Date: Tue, 24 Oct 2023 12:22:43 -0700 Subject: [PATCH 62/94] [Backport][BOLT] Fix incorrect basic block output addresses (#70000) Some optimization passes may duplicate basic blocks and assign the same input offset to a number of different blocks in a function. This is done e.g. to correctly map debugging ranges for duplicated code. However, duplicate input offsets present a problem when we use AddressMap to generate new addresses for basic blocks. The output address is calculated based on the input offset and will be the same for blocks with identical offsets. The result is potentially incorrect debug info and BAT records. To address the issue, we have to eliminate the dependency on input offsets while generating output addresses for a basic block. Each block has a unique label, hence we extend AddressMap to include address lookup based on MCSymbol and use the new functionality to update block addresses. --- bolt/include/bolt/Core/AddressMap.h | 50 ++++++++++----- bolt/lib/Core/AddressMap.cpp | 94 ++++++++++++++++++++++------ bolt/lib/Core/BinaryFunction.cpp | 11 +++- bolt/lib/Rewrite/RewriteInstance.cpp | 10 +-- bolt/test/X86/jump-table-icp.test | 1 + 5 files changed, 120 insertions(+), 46 deletions(-) diff --git a/bolt/include/bolt/Core/AddressMap.h b/bolt/include/bolt/Core/AddressMap.h index 16c2727b6943..85a9ab4473aa 100644 --- a/bolt/include/bolt/Core/AddressMap.h +++ b/bolt/include/bolt/Core/AddressMap.h @@ -6,18 +6,16 @@ // //===----------------------------------------------------------------------===// // -// Helper class to create a mapping from input to output addresses needed for -// updating debugging symbols and BAT. We emit an MCSection containing -// pairs to the object file and JITLink will -// transform this in pairs. The linker output -// can then be parsed and used to establish the mapping. +// This file contains the declaration of the AddressMap class used for looking +// up addresses in the output object. // //===----------------------------------------------------------------------===// -// + #ifndef BOLT_CORE_ADDRESS_MAP_H #define BOLT_CORE_ADDRESS_MAP_H #include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCSymbol.h" #include #include @@ -30,26 +28,48 @@ namespace bolt { class BinaryContext; +/// Helper class to create a mapping from input entities to output addresses +/// needed for updating debugging symbols and BAT. We emit a section containing +/// pairs to the object file and JITLink will +/// transform this in pairs. The linker output +/// can then be parsed and used to establish the mapping. +/// +/// The entities that can be mapped to output address are input addresses and +/// labels (MCSymbol). Input addresses support one-to-many mapping. class AddressMap { - using MapTy = std::unordered_multimap; - MapTy Map; + static const char *const AddressSectionName; + static const char *const LabelSectionName; -public: - static const char *const SectionName; + /// Map multiple to . + using Addr2AddrMapTy = std::unordered_multimap; + Addr2AddrMapTy Address2AddressMap; + /// Map MCSymbol to its output address. Normally used for temp symbols that + /// are not updated by the linker. + using Label2AddrMapTy = DenseMap; + Label2AddrMapTy Label2AddrMap; + +public: static void emit(MCStreamer &Streamer, BinaryContext &BC); - static AddressMap parse(StringRef Buffer, const BinaryContext &BC); + static std::optional parse(BinaryContext &BC); std::optional lookup(uint64_t InputAddress) const { - auto It = Map.find(InputAddress); - if (It != Map.end()) + auto It = Address2AddressMap.find(InputAddress); + if (It != Address2AddressMap.end()) + return It->second; + return std::nullopt; + } + + std::optional lookup(const MCSymbol *Symbol) const { + auto It = Label2AddrMap.find(Symbol); + if (It != Label2AddrMap.end()) return It->second; return std::nullopt; } - std::pair + std::pair lookupAll(uint64_t InputAddress) const { - return Map.equal_range(InputAddress); + return Address2AddressMap.equal_range(InputAddress); } }; diff --git a/bolt/lib/Core/AddressMap.cpp b/bolt/lib/Core/AddressMap.cpp index c5f628d87864..efa376d408db 100644 --- a/bolt/lib/Core/AddressMap.cpp +++ b/bolt/lib/Core/AddressMap.cpp @@ -1,22 +1,44 @@ +//===- bolt/Core/AddressMap.cpp - Input-output Address Map ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #include "bolt/Core/AddressMap.h" #include "bolt/Core/BinaryContext.h" #include "bolt/Core/BinaryFunction.h" +#include "bolt/Core/BinarySection.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/DataExtractor.h" namespace llvm { namespace bolt { -const char *const AddressMap::SectionName = ".bolt.address_map"; +const char *const AddressMap::AddressSectionName = ".bolt.addr2addr_map"; +const char *const AddressMap::LabelSectionName = ".bolt.label2addr_map"; -static void emitLabel(MCStreamer &Streamer, uint64_t InputAddress, - const MCSymbol *OutputLabel) { +static void emitAddress(MCStreamer &Streamer, uint64_t InputAddress, + const MCSymbol *OutputLabel) { Streamer.emitIntValue(InputAddress, 8); Streamer.emitSymbolValue(OutputLabel, 8); } +static void emitLabel(MCStreamer &Streamer, const MCSymbol *OutputLabel) { + Streamer.emitIntValue(reinterpret_cast(OutputLabel), 8); + Streamer.emitSymbolValue(OutputLabel, 8); +} + void AddressMap::emit(MCStreamer &Streamer, BinaryContext &BC) { - Streamer.switchSection(BC.getDataSection(SectionName)); + // Mark map sections as link-only to avoid allocation in the output file. + const unsigned Flags = BinarySection::getFlags(/*IsReadOnly*/ true, + /*IsText*/ false, + /*IsAllocatable*/ true); + BC.registerOrUpdateSection(AddressSectionName, ELF::SHT_PROGBITS, Flags) + .setLinkOnly(); + BC.registerOrUpdateSection(LabelSectionName, ELF::SHT_PROGBITS, Flags) + .setLinkOnly(); for (const auto &[BFAddress, BF] : BC.getBinaryFunctions()) { if (!BF.requiresAddressMap()) @@ -26,37 +48,69 @@ void AddressMap::emit(MCStreamer &Streamer, BinaryContext &BC) { if (!BB.getLabel()->isDefined()) continue; - emitLabel(Streamer, BFAddress + BB.getInputAddressRange().first, - BB.getLabel()); + Streamer.switchSection(BC.getDataSection(LabelSectionName)); + emitLabel(Streamer, BB.getLabel()); if (!BB.hasLocSyms()) continue; + Streamer.switchSection(BC.getDataSection(AddressSectionName)); for (auto [Offset, Symbol] : BB.getLocSyms()) - emitLabel(Streamer, BFAddress + Offset, Symbol); + emitAddress(Streamer, BFAddress + Offset, Symbol); } } } -AddressMap AddressMap::parse(StringRef Buffer, const BinaryContext &BC) { - const auto EntrySize = 2 * BC.AsmInfo->getCodePointerSize(); - assert(Buffer.size() % EntrySize == 0 && "Unexpected address map size"); +std::optional AddressMap::parse(BinaryContext &BC) { + auto AddressMapSection = BC.getUniqueSectionByName(AddressSectionName); + auto LabelMapSection = BC.getUniqueSectionByName(LabelSectionName); - DataExtractor DE(Buffer, BC.AsmInfo->isLittleEndian(), - BC.AsmInfo->getCodePointerSize()); - DataExtractor::Cursor Cursor(0); + if (!AddressMapSection && !LabelMapSection) + return std::nullopt; AddressMap Parsed; - Parsed.Map.reserve(Buffer.size() / EntrySize); - while (Cursor && !DE.eof(Cursor)) { - const auto Input = DE.getAddress(Cursor); - const auto Output = DE.getAddress(Cursor); - if (!Parsed.Map.count(Input)) - Parsed.Map.insert({Input, Output}); + const size_t EntrySize = 2 * BC.AsmInfo->getCodePointerSize(); + auto parseSection = + [&](BinarySection &Section, + function_ref InsertCallback) { + StringRef Buffer = Section.getOutputContents(); + assert(Buffer.size() % EntrySize == 0 && "Unexpected address map size"); + + DataExtractor DE(Buffer, BC.AsmInfo->isLittleEndian(), + BC.AsmInfo->getCodePointerSize()); + DataExtractor::Cursor Cursor(0); + + while (Cursor && !DE.eof(Cursor)) { + const uint64_t Input = DE.getAddress(Cursor); + const uint64_t Output = DE.getAddress(Cursor); + InsertCallback(Input, Output); + } + + assert(Cursor && "Error reading address map section"); + BC.deregisterSection(Section); + }; + + if (AddressMapSection) { + Parsed.Address2AddressMap.reserve(AddressMapSection->getOutputSize() / + EntrySize); + parseSection(*AddressMapSection, [&](uint64_t Input, uint64_t Output) { + if (!Parsed.Address2AddressMap.count(Input)) + Parsed.Address2AddressMap.insert({Input, Output}); + }); + } + + if (LabelMapSection) { + Parsed.Label2AddrMap.reserve(LabelMapSection->getOutputSize() / EntrySize); + parseSection(*LabelMapSection, [&](uint64_t Input, uint64_t Output) { + assert(!Parsed.Label2AddrMap.count( + reinterpret_cast(Input)) && + "Duplicate label entry detected."); + Parsed.Label2AddrMap.insert( + {reinterpret_cast(Input), Output}); + }); } - assert(Cursor && "Error reading address map section"); return Parsed; } diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 14c4925f2e06..1161934af662 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -4156,15 +4156,20 @@ void BinaryFunction::updateOutputValues(const BOLTLinker &Linker) { // Injected functions likely will fail lookup, as they have no // input range. Just assign the BB the output address of the // function. - auto MaybeBBAddress = - BC.getIOAddressMap().lookup(BB->getInputOffset() + getAddress()); + auto MaybeBBAddress = BC.getIOAddressMap().lookup(BB->getLabel()); const uint64_t BBAddress = MaybeBBAddress ? *MaybeBBAddress : BB->isSplit() ? FF.getAddress() : getOutputAddress(); BB->setOutputStartAddress(BBAddress); - if (PrevBB) + if (PrevBB) { + assert(PrevBB->getOutputAddressRange().first <= BBAddress && + "Bad output address for basic block."); + assert((PrevBB->getOutputAddressRange().first != BBAddress || + !hasInstructions() || PrevBB->empty()) && + "Bad output address for basic block."); PrevBB->setOutputEndAddress(BBAddress); + } PrevBB = BB; } diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 88ece58a09bb..25d730a38001 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -3184,9 +3184,6 @@ void RewriteInstance::preregisterSections() { ROFlags); BC->registerOrUpdateSection(getNewSecPrefix() + ".rodata.cold", ELF::SHT_PROGBITS, ROFlags); - BC->registerOrUpdateSection(AddressMap::SectionName, ELF::SHT_PROGBITS, - ROFlags) - .setLinkOnly(); } void RewriteInstance::emitAndLink() { @@ -3657,11 +3654,8 @@ void RewriteInstance::mapAllocatableSections( } void RewriteInstance::updateOutputValues(const BOLTLinker &Linker) { - if (auto MapSection = BC->getUniqueSectionByName(AddressMap::SectionName)) { - auto Map = AddressMap::parse(MapSection->getOutputContents(), *BC); - BC->setIOAddressMap(std::move(Map)); - BC->deregisterSection(*MapSection); - } + if (std::optional Map = AddressMap::parse(*BC)) + BC->setIOAddressMap(std::move(*Map)); for (BinaryFunction *Function : BC->getAllBinaryFunctions()) Function->updateOutputValues(Linker); diff --git a/bolt/test/X86/jump-table-icp.test b/bolt/test/X86/jump-table-icp.test index 708f1273af3f..34339dc327fa 100644 --- a/bolt/test/X86/jump-table-icp.test +++ b/bolt/test/X86/jump-table-icp.test @@ -12,6 +12,7 @@ RUN: (llvm-bolt %t.exe --data %t.fdata -o %t --relocs \ RUN: --reorder-blocks=cache --split-functions --split-all-cold \ RUN: --use-gnu-stack --dyno-stats --indirect-call-promotion=jump-tables \ RUN: --print-icp -v=0 \ +RUN: --enable-bat --print-cache-metrics \ RUN: --icp-jt-remaining-percent-threshold=10 \ RUN: --icp-jt-total-percent-threshold=2 \ RUN: --indirect-call-promotion-topn=1 \ -- Gitee From 8409ed5db9e346f8c83d59b4617276d43adc3a12 Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Sat, 4 Nov 2023 00:47:24 +0400 Subject: [PATCH 63/94] [Backport][BOLT][AArch64] Handle .plt.got section (#71216) It seems that currently this section is only created by the mold linker if 2 conditions are met: 1. The PLT function was called directly. 2. The indirect access to PLT function was found (e.g. through ADRP relocation). Although mold created symbol for every plt entry I've removed them in yaml file to check that .plt.got was truly disassembled by bolt. --- bolt/include/bolt/Rewrite/RewriteInstance.h | 6 +- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 8 + bolt/test/AArch64/Inputs/plt-got.yaml | 216 ++++++++++++++++++ bolt/test/AArch64/plt-got.test | 7 + 4 files changed, 234 insertions(+), 3 deletions(-) create mode 100644 bolt/test/AArch64/Inputs/plt-got.yaml create mode 100644 bolt/test/AArch64/plt-got.test diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h index 940c7324594e..6e020ba95ca0 100644 --- a/bolt/include/bolt/Rewrite/RewriteInstance.h +++ b/bolt/include/bolt/Rewrite/RewriteInstance.h @@ -503,11 +503,11 @@ private: }; /// AArch64 PLT sections. - const PLTSectionInfo AArch64_PLTSections[3] = { - {".plt"}, {".iplt"}, {nullptr}}; + const PLTSectionInfo AArch64_PLTSections[4] = { + {".plt"}, {".plt.got"}, {".iplt"}, {nullptr}}; /// RISCV PLT sections. - const PLTSectionInfo RISCV_PLTSections[3] = {{".plt"}, {nullptr}}; + const PLTSectionInfo RISCV_PLTSections[2] = {{".plt"}, {nullptr}}; /// Return PLT information for a section with \p SectionName or nullptr /// if the section is not PLT. diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index bf77244102a2..642de6c3c618 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -864,6 +864,14 @@ public: /// add x16, x16, #0xbe0 /// br x17 /// + /// The other type of trampolines are located in .plt.got, that are used for + /// non-lazy bindings so doesn't use x16 arg to transfer .got entry address: + /// + /// adrp x16, 230000 + /// ldr x17, [x16, #3040] + /// br x17 + /// nop + /// uint64_t analyzePLTEntry(MCInst &Instruction, InstructionIterator Begin, InstructionIterator End, uint64_t BeginPC) const override { diff --git a/bolt/test/AArch64/Inputs/plt-got.yaml b/bolt/test/AArch64/Inputs/plt-got.yaml new file mode 100644 index 000000000000..7856719c5df8 --- /dev/null +++ b/bolt/test/AArch64/Inputs/plt-got.yaml @@ -0,0 +1,216 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_AARCH64 + Entry: 0x10360 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_R ] + VAddr: 0x40 + Align: 0x8 + Offset: 0x40 + - Type: PT_INTERP + Flags: [ PF_R ] + FirstSec: .interp + LastSec: .interp + VAddr: 0x270 + Offset: 0x270 + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .interp + LastSec: .rela.dyn + Align: 0x10000 + Offset: 0x0 + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + FirstSec: .plt.got + LastSec: .text + VAddr: 0x10350 + Align: 0x10000 + Offset: 0x2e0 + - Type: PT_LOAD + Flags: [ PF_W, PF_R ] + FirstSec: .interp + LastSec: .got + VAddr: 0x203B0 + Align: 0x10000 + Offset: 0x270 + - Type: PT_LOAD + Flags: [ PF_W, PF_R ] + FirstSec: .got.plt + LastSec: .got.plt + VAddr: 0x304E0 + Align: 0x10000 + Offset: 0x420 + - Type: PT_DYNAMIC + Flags: [ PF_W, PF_R ] + FirstSec: .dynamic + LastSec: .dynamic + VAddr: 0x203B0 + Align: 0x8 + Offset: 0x340 + - Type: PT_GNU_STACK + Flags: [ PF_W, PF_R ] + Offset: 0x0 +Sections: + - Name: .interp + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x270 + AddressAlign: 0x1 + Offset: 0x270 + Content: 2F6C69622F6C642D6C696E75782D616172636836342E736F2E3100 + - Name: .dynsym + Type: SHT_DYNSYM + Flags: [ SHF_ALLOC ] + Address: 0x2B0 + Link: .dynstr + AddressAlign: 0x8 + - Name: .dynstr + Type: SHT_STRTAB + Flags: [ SHF_ALLOC ] + Address: 0x2E0 + AddressAlign: 0x1 + - Name: .rela.dyn + Type: SHT_RELA + Flags: [ SHF_ALLOC ] + Address: 0x2F0 + Link: .dynsym + AddressAlign: 0x8 + Relocations: + - Offset: 0x204D8 + Symbol: abort + Type: R_AARCH64_GLOB_DAT + - Name: .plt.got + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x10350 + AddressAlign: 0x10 + Content: 90000090116E42F920021FD61F2003D5 + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x10360 + AddressAlign: 0x4 + Content: FF8300D1FD7B01A9FD43009188000090086D42F9E80700F9E80740F9080100F1E8179F1AA800003701000014E80740F900013FD601000014EEFFFF97007D20D41000009010420D9100021FD61F2003D5 + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x203B0 + Link: .dynstr + AddressAlign: 0x8 + Entries: + - Tag: DT_NEEDED + Value: 0x1 + - Tag: DT_RELA + Value: 0x2F0 + - Tag: DT_RELASZ + Value: 0x18 + - Tag: DT_RELAENT + Value: 0x18 + - Tag: DT_PLTGOT + Value: 0x304E0 + - Tag: DT_SYMTAB + Value: 0x2B0 + - Tag: DT_SYMENT + Value: 0x18 + - Tag: DT_STRTAB + Value: 0x2E0 + - Tag: DT_STRSZ + Value: 0x10 + - Tag: DT_GNU_HASH + Value: 0x290 + - Tag: DT_FLAGS_1 + Value: 0x8000000 + - Tag: DT_DEBUG + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Name: .got + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x204D0 + AddressAlign: 0x8 + Content: '00000000000000000000000000000000' + - Name: .got.plt + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x304E0 + AddressAlign: 0x8 + Content: B00302000000000000000000000000000000000000000000 + - Name: .rela.text + Type: SHT_RELA + Flags: [ SHF_INFO_LINK ] + Link: .symtab + AddressAlign: 0x8 + Offset: 0x1268 + Info: .text + Relocations: + - Offset: 0x1036C + Symbol: abort + Type: R_AARCH64_ADR_GOT_PAGE + - Offset: 0x10370 + Symbol: abort + Type: R_AARCH64_LD64_GOT_LO12_NC + - Offset: 0x10398 + Symbol: abort + Type: R_AARCH64_CALL26 + - Type: SectionHeaderTable + Sections: + - Name: .interp + - Name: .dynsym + - Name: .dynstr + - Name: .rela.dyn + - Name: .plt.got + - Name: .text + - Name: .dynamic + - Name: .got + - Name: .got.plt + - Name: .strtab + - Name: .symtab + - Name: .shstrtab + - Name: .rela.text +Symbols: + - Name: .text + Type: STT_SECTION + Section: .text + Value: 0x10360 + - Name: .dynamic + Type: STT_SECTION + Section: .dynamic + Value: 0x203B0 + - Name: .got + Type: STT_SECTION + Section: .got + Value: 0x204D0 + - Name: .got.plt + Type: STT_SECTION + Section: .got.plt + Value: 0x304E0 + - Name: 'abort$got' + Type: STT_OBJECT + Section: .got + Value: 0x204D8 + - Name: _start + Type: STT_FUNC + Section: .text + Value: 0x10360 + Size: 0x3C + - Name: _DYNAMIC + Section: .dynamic + Value: 0x203B0 + - Name: _GLOBAL_OFFSET_TABLE_ + Section: .got + Value: 0x204D0 + - Name: abort + Type: STT_FUNC + Binding: STB_GLOBAL + Size: 0x8 +DynamicSymbols: + - Name: abort + Type: STT_FUNC + Binding: STB_GLOBAL + Size: 0x8 +... diff --git a/bolt/test/AArch64/plt-got.test b/bolt/test/AArch64/plt-got.test new file mode 100644 index 000000000000..be1c095784b7 --- /dev/null +++ b/bolt/test/AArch64/plt-got.test @@ -0,0 +1,7 @@ +// This test checks .plt.got handling by BOLT + +RUN: yaml2obj %p/Inputs/plt-got.yaml &> %t.exe +RUN: llvm-bolt %t.exe -o %t.bolt --print-disasm --print-only=_start/1 | \ +RUN: FileCheck %s + +CHECK: bl abort@PLT -- Gitee From 6191709a0ed716f8118332a1c2265791a1cc8066 Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Mon, 6 Nov 2023 11:40:04 +0000 Subject: [PATCH 64/94] [Backport][BOLT] Add itrace aggregation for AUX data (#70426) If you have a perf.data with Arm ETM data the only way to use perf2bolt with Branch Aggregation is to first run `perf inject --itrace=l64i1us -o perf-brstack.data` and then pass the new perf-brstack.data into perf2bolt. perf2bolt then runs `perf script -F pid,ip,brstack` to produce the brstacks. This PR adds `--itrace` arg to perf2bolt to enable Itrace Aggregation. It takes a string which is what is passed to the `perf script -F pid,ip,brstack --itrace={0}`. This command produces the brstacks without having to run perf inject and creating a new perf.data file. --- bolt/lib/Profile/DataAggregator.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 02f7032c2c7e..cbc079afbb7e 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -46,6 +46,11 @@ static cl::opt cl::desc("aggregate basic samples (without LBR info)"), cl::cat(AggregatorCategory)); +static cl::opt + ITraceAggregation("itrace", + cl::desc("Generate LBR info with perf itrace argument"), + cl::cat(AggregatorCategory)); + static cl::opt FilterMemProfile("filter-mem-profile", cl::desc("if processing a memory profile, filter out stack or heap accesses " @@ -163,16 +168,23 @@ void DataAggregator::start() { findPerfExecutable(); - if (opts::BasicAggregation) + if (opts::BasicAggregation) { launchPerfProcess("events without LBR", MainEventsPPI, "script -F pid,event,ip", /*Wait = */false); - else + } else if (!opts::ITraceAggregation.empty()) { + std::string ItracePerfScriptArgs = llvm::formatv( + "script -F pid,ip,brstack --itrace={0}", opts::ITraceAggregation); + launchPerfProcess("branch events with itrace", MainEventsPPI, + ItracePerfScriptArgs.c_str(), + /*Wait = */ false); + } else { launchPerfProcess("branch events", MainEventsPPI, "script -F pid,ip,brstack", /*Wait = */false); + } // Note: we launch script for mem events regardless of the option, as the // command fails fairly fast if mem events were not collected. -- Gitee From b3d8749b4ccf077c0ac3b9d0fb451f8c341e6c51 Mon Sep 17 00:00:00 2001 From: maksfb Date: Mon, 6 Nov 2023 11:25:49 -0800 Subject: [PATCH 65/94] [Backport][BOLT] Fix address mapping for ICP code (#70136) When we create new code for indirect code promotion optimization, we should mark it as originating from the indirect jump instruction for BOLT address translation (BAT) to map it to the original instruction. --- bolt/lib/Core/BinaryFunction.cpp | 2 +- bolt/lib/Passes/IndirectCallPromotion.cpp | 19 +++++++++++++++---- bolt/test/X86/jump-table-icp.test | 6 ++++-- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 1161934af662..a73ee25e0e08 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -533,7 +533,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) { if (BB->getCFIState() >= 0) OS << " CFI State : " << BB->getCFIState() << '\n'; if (opts::EnableBAT) { - OS << " Input offset: " << Twine::utohexstr(BB->getInputOffset()) + OS << " Input offset: 0x" << Twine::utohexstr(BB->getInputOffset()) << "\n"; } if (!BB->pred_empty()) { diff --git a/bolt/lib/Passes/IndirectCallPromotion.cpp b/bolt/lib/Passes/IndirectCallPromotion.cpp index ea8019431cf5..89727233ec78 100644 --- a/bolt/lib/Passes/IndirectCallPromotion.cpp +++ b/bolt/lib/Passes/IndirectCallPromotion.cpp @@ -754,6 +754,15 @@ IndirectCallPromotion::rewriteCall( const bool IsTailCallOrJT = (MIB->isTailCall(CallInst) || Function.getJumpTable(CallInst)); + // If we are tracking the indirect call/jump address, propagate the address to + // the ICP code. + const std::optional IndirectInstrOffset = MIB->getOffset(CallInst); + if (IndirectInstrOffset) { + for (auto &[Symbol, Instructions] : ICPcode) + for (MCInst &Inst : Instructions) + MIB->setOffset(Inst, *IndirectInstrOffset); + } + // Move instructions from the tail of the original call block // to the merge block. @@ -767,10 +776,12 @@ IndirectCallPromotion::rewriteCall( TailInsts.push_back(*++TailInst); InstructionListType MovedInst = IndCallBlock.splitInstructions(&CallInst); - // Link new BBs to the original input offset of the BB where the indirect - // call site is, so we can map samples recorded in new BBs back to the - // original BB seen in the input binary (if using BAT) - const uint32_t OrigOffset = IndCallBlock.getInputOffset(); + // Link new BBs to the original input offset of the indirect call site or its + // containing BB, so we can map samples recorded in new BBs back to the + // original BB seen in the input binary (if using BAT). + const uint32_t OrigOffset = IndirectInstrOffset + ? *IndirectInstrOffset + : IndCallBlock.getInputOffset(); IndCallBlock.eraseInstructions(MethodFetchInsns.begin(), MethodFetchInsns.end()); diff --git a/bolt/test/X86/jump-table-icp.test b/bolt/test/X86/jump-table-icp.test index 34339dc327fa..5b989d18018b 100644 --- a/bolt/test/X86/jump-table-icp.test +++ b/bolt/test/X86/jump-table-icp.test @@ -37,12 +37,14 @@ CHECK: Successors: .Ltmp{{.*}} (mispreds: 189, count: 189), .LFT{{.*}} (mispre CHECK: .LFT{{.*}} (4 instructions, align : 1) CHECK-NEXT: Exec Count : 881 CHECK: Predecessors: .LBB{{.*}} -CHECK: Successors: .Ltmp{{.*}} (mispreds: 138, count: 155), .Ltmp{{.*}} (mispreds: 0, count: 726) +CHECK: je {{.*}} # Offset: 28 +CHECK-NEXT: Successors: .Ltmp{{.*}} (mispreds: 138, count: 155), .Ltmp{{.*}} (mispreds: 0, count: 726) CHECK: .Ltmp{{.*}} (1 instructions, align : 1) CHECK-NEXT: Exec Count : 726 CHECK: Predecessors: .LFT{{.*}} -CHECK: Successors: .L{{.*}} (mispreds: 126, count: 157), .L{{.*}} (mispreds: 140, count: 156), .L{{.*}} (mispreds: 134, count: 152), .L{{.*}} (mispreds: 137, count: 150), .L{{.*}} (mispreds: 129, count: 148), .L{{.*}} (mispreds: 0, count: 0) +CHECK: jmpq {{.*}} # Offset: 28 +CHECK-NEXT: Successors: .L{{.*}} (mispreds: 126, count: 157), .L{{.*}} (mispreds: 140, count: 156), .L{{.*}} (mispreds: 134, count: 152), .L{{.*}} (mispreds: 137, count: 150), .L{{.*}} (mispreds: 129, count: 148), .L{{.*}} (mispreds: 0, count: 0) CHECK: .Ltmp{{.*}} (5 instructions, align : 1) CHECK-NEXT: Exec Count : 167 -- Gitee From 684bd9424f62e26d709b810f43eab22c028b60c0 Mon Sep 17 00:00:00 2001 From: maksfb Date: Mon, 6 Nov 2023 11:41:47 -0800 Subject: [PATCH 66/94] [Backport][BOLT] Reduce the number of emitted symbols. NFCI. (#70175) We emit a symbol before an instruction for a number of reasons, e.g. for tracking LocSyms, debug line, or if the instruction has a label annotation. Currently, we may emit multiple symbols per instruction. Reuse the same label instead of creating and emitting new ones when possible. I'm planning to refactor EH labels as well in a separate diff. Change getLabel() to return a pointer instead of std::optional<> since an empty label should be treated identically to no label. --- bolt/include/bolt/Core/MCPlusBuilder.h | 2 +- bolt/lib/Core/BinaryContext.cpp | 4 +- bolt/lib/Core/BinaryEmitter.cpp | 54 ++++++++++++++++---------- bolt/lib/Core/MCPlusBuilder.cpp | 4 +- bolt/lib/Passes/BinaryPasses.cpp | 8 ++-- 5 files changed, 43 insertions(+), 29 deletions(-) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index b9ccf53919cc..800e1358b451 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -1180,7 +1180,7 @@ public: bool clearOffset(MCInst &Inst); /// Return the label of \p Inst, if available. - std::optional getLabel(const MCInst &Inst) const; + MCSymbol *getLabel(const MCInst &Inst) const; /// Set the label of \p Inst. This label will be emitted right before \p Inst /// is emitted to MCStreamer. diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index 4c67f41300b6..f1a660836562 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -1863,8 +1863,8 @@ void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction, } if (std::optional Offset = MIB->getOffset(Instruction)) OS << " # Offset: " << *Offset; - if (auto Label = MIB->getLabel(Instruction)) - OS << " # Label: " << **Label; + if (MCSymbol *Label = MIB->getLabel(Instruction)) + OS << " # Label: " << *Label; MIB->printAnnotations(Instruction, OS); diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index de80a99a74ed..9b8d9d69faea 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -161,9 +161,17 @@ private: /// \p FirstInstr indicates if \p NewLoc represents the first instruction /// in a sequence, such as a function fragment. /// + /// If \p NewLoc location matches \p PrevLoc, no new line number entry will be + /// created and the function will return \p PrevLoc while \p InstrLabel will + /// be ignored. Otherwise, the caller should use \p InstrLabel to mark the + /// corresponding instruction by emitting \p InstrLabel before it. + /// If \p InstrLabel is set by the caller, its value will be used with \p + /// \p NewLoc. If it was nullptr on entry, it will be populated with a pointer + /// to a new temp symbol used with \p NewLoc. + /// /// Return new current location which is either \p NewLoc or \p PrevLoc. SMLoc emitLineInfo(const BinaryFunction &BF, SMLoc NewLoc, SMLoc PrevLoc, - bool FirstInstr); + bool FirstInstr, MCSymbol *&InstrLabel); /// Use \p FunctionEndSymbol to mark the end of the line info sequence. /// Note that it does not automatically result in the insertion of the EOS @@ -483,23 +491,28 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF, // are relaxable, we should be safe. } - if (!EmitCodeOnly && opts::UpdateDebugSections && BF.getDWARFUnit()) { - LastLocSeen = emitLineInfo(BF, Instr.getLoc(), LastLocSeen, FirstInstr); - FirstInstr = false; - } + if (!EmitCodeOnly) { + // A symbol to be emitted before the instruction to mark its location. + MCSymbol *InstrLabel = BC.MIB->getLabel(Instr); - // Prepare to tag this location with a label if we need to keep track of - // the location of calls/returns for BOLT address translation maps - if (!EmitCodeOnly && BF.requiresAddressTranslation() && - BC.MIB->getOffset(Instr)) { - const uint32_t Offset = *BC.MIB->getOffset(Instr); - MCSymbol *LocSym = BC.Ctx->createTempSymbol(); - Streamer.emitLabel(LocSym); - BB->getLocSyms().emplace_back(Offset, LocSym); - } + if (opts::UpdateDebugSections && BF.getDWARFUnit()) { + LastLocSeen = emitLineInfo(BF, Instr.getLoc(), LastLocSeen, + FirstInstr, InstrLabel); + FirstInstr = false; + } - if (auto Label = BC.MIB->getLabel(Instr)) - Streamer.emitLabel(*Label); + // Prepare to tag this location with a label if we need to keep track of + // the location of calls/returns for BOLT address translation maps + if (BF.requiresAddressTranslation() && BC.MIB->getOffset(Instr)) { + const uint32_t Offset = *BC.MIB->getOffset(Instr); + if (!InstrLabel) + InstrLabel = BC.Ctx->createTempSymbol(); + BB->getLocSyms().emplace_back(Offset, InstrLabel); + } + + if (InstrLabel) + Streamer.emitLabel(InstrLabel); + } Streamer.emitInstruction(Instr, *BC.STI); LastIsPrefix = BC.MIB->isPrefix(Instr); @@ -661,7 +674,8 @@ void BinaryEmitter::emitConstantIslands(BinaryFunction &BF, bool EmitColdPart, } SMLoc BinaryEmitter::emitLineInfo(const BinaryFunction &BF, SMLoc NewLoc, - SMLoc PrevLoc, bool FirstInstr) { + SMLoc PrevLoc, bool FirstInstr, + MCSymbol *&InstrLabel) { DWARFUnit *FunctionCU = BF.getDWARFUnit(); const DWARFDebugLine::LineTable *FunctionLineTable = BF.getDWARFLineTable(); assert(FunctionCU && "cannot emit line info for function without CU"); @@ -711,12 +725,12 @@ SMLoc BinaryEmitter::emitLineInfo(const BinaryFunction &BF, SMLoc NewLoc, const MCDwarfLoc &DwarfLoc = BC.Ctx->getCurrentDwarfLoc(); BC.Ctx->clearDwarfLocSeen(); - MCSymbol *LineSym = BC.Ctx->createTempSymbol(); - Streamer.emitLabel(LineSym); + if (!InstrLabel) + InstrLabel = BC.Ctx->createTempSymbol(); BC.getDwarfLineTable(FunctionUnitIndex) .getMCLineSections() - .addLineEntry(MCDwarfLineEntry(LineSym, DwarfLoc), + .addLineEntry(MCDwarfLineEntry(InstrLabel, DwarfLoc), Streamer.getCurrentSectionOnly()); return NewLoc; diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp index 0a5eb44e4876..21d017ed4fdd 100644 --- a/bolt/lib/Core/MCPlusBuilder.cpp +++ b/bolt/lib/Core/MCPlusBuilder.cpp @@ -268,10 +268,10 @@ bool MCPlusBuilder::clearOffset(MCInst &Inst) { return true; } -std::optional MCPlusBuilder::getLabel(const MCInst &Inst) const { +MCSymbol *MCPlusBuilder::getLabel(const MCInst &Inst) const { if (auto Label = tryGetAnnotationAs(Inst, MCAnnotation::kLabel)) return *Label; - return std::nullopt; + return nullptr; } bool MCPlusBuilder::setLabel(MCInst &Inst, MCSymbol *Label) { diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index e50fa9dea602..8db22de37b33 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -610,8 +610,8 @@ void LowerAnnotations::runOnFunctions(BinaryContext &BC) { if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II)) PreservedOffsetAnnotations.emplace_back(&(*II), *BC.MIB->getOffset(*II)); - if (auto Label = BC.MIB->getLabel(*II)) - PreservedLabelAnnotations.emplace_back(&*II, *Label); + if (MCSymbol *Label = BC.MIB->getLabel(*II)) + PreservedLabelAnnotations.emplace_back(&*II, Label); BC.MIB->stripAnnotations(*II); } } @@ -620,8 +620,8 @@ void LowerAnnotations::runOnFunctions(BinaryContext &BC) { for (BinaryFunction *BF : BC.getInjectedBinaryFunctions()) for (BinaryBasicBlock &BB : *BF) for (MCInst &Instruction : BB) { - if (auto Label = BC.MIB->getLabel(Instruction)) - PreservedLabelAnnotations.emplace_back(&Instruction, *Label); + if (MCSymbol *Label = BC.MIB->getLabel(Instruction)) + PreservedLabelAnnotations.emplace_back(&Instruction, Label); BC.MIB->stripAnnotations(Instruction); } -- Gitee From f551d287e9a88e363d4efea38c698d9897d932e0 Mon Sep 17 00:00:00 2001 From: maksfb Date: Mon, 6 Nov 2023 12:14:22 -0800 Subject: [PATCH 67/94] [Backport][BOLT] Modify MCPlus annotation internals. NFCI. (#70412) When annotating MCInst instructions, attach extra annotation operands directly to the annotated instruction, instead of attaching them to an instruction pointed to by a special kInst operand. With this change, it's no longer necessary to allocate MCInst and most of the first-class annotations come with free memory as currently MCInst is declared with: SmallVector Operands; i.e. more operands than are normally being used. We still create a kInst operand with a nullptr instruction value to designate the beginning of annotation operands. However, this special operand might not be needed if we can rely on MCInstrDesc::NumOperands. --- bolt/include/bolt/Core/MCPlus.h | 26 +++--- bolt/include/bolt/Core/MCPlusBuilder.h | 113 ++++++++++++------------- bolt/lib/Core/BinaryFunction.cpp | 19 +++-- bolt/lib/Core/MCPlusBuilder.cpp | 62 ++++++-------- 4 files changed, 106 insertions(+), 114 deletions(-) diff --git a/bolt/include/bolt/Core/MCPlus.h b/bolt/include/bolt/Core/MCPlus.h index 31cc9071de76..f6ffd33513dd 100644 --- a/bolt/include/bolt/Core/MCPlus.h +++ b/bolt/include/bolt/Core/MCPlus.h @@ -32,11 +32,16 @@ namespace MCPlus { /// pad and the uint64_t represents the action. using MCLandingPad = std::pair; -/// An extension to MCInst is provided via an extra operand of type MCInst with -/// ANNOTATION_LABEL opcode (i.e. we are tying an annotation instruction to an -/// existing one). The annotation instruction contains a list of Immediate -/// operands. Each operand either contains a value, or is a pointer to -/// an instance of class MCAnnotation. +/// An extension to MCInst is provided via extra operands, i.e. operands that +/// are not used in the instruction assembly. Any kind of metadata can be +/// attached to MCInst with this "annotation" extension using MCPlusBuilder +/// interface. +// +/// The first extra operand must be of type kInst with an empty (nullptr) +/// value. The kInst operand type is unused on most non-VLIW architectures. +/// We use it to mark the beginning of annotations operands. The rest of the +/// operands are of Immediate type with annotation info encoded into the value +/// of the immediate. /// /// There are 2 distinct groups of annotations. The first group is a first-class /// annotation that affects semantics of the instruction, such as an @@ -55,7 +60,7 @@ using MCLandingPad = std::pair; /// of their corresponding operand. /// /// Annotations in the second group could be addressed either by name, or by -/// by and index which could be queried by providing a name. +/// by index which could be queried by providing the name. class MCAnnotation { public: enum Kind { @@ -106,10 +111,11 @@ private: /// Return a number of operands in \Inst excluding operands representing /// annotations. inline unsigned getNumPrimeOperands(const MCInst &Inst) { - if (Inst.getNumOperands() > 0 && std::prev(Inst.end())->isInst()) { - assert(std::prev(Inst.end())->getInst()->getOpcode() == - TargetOpcode::ANNOTATION_LABEL); - return Inst.getNumOperands() - 1; + for (signed I = Inst.getNumOperands() - 1; I >= 0; --I) { + if (Inst.getOperand(I).isInst()) + return I; + if (!Inst.getOperand(I).isImm()) + return Inst.getNumOperands(); } return Inst.getNumOperands(); } diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 800e1358b451..90d1fe32b9e3 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -65,7 +65,6 @@ public: private: /// A struct that represents a single annotation allocator struct AnnotationAllocator { - SpecificBumpPtrAllocator MCInstAllocator; BumpPtrAllocator ValueAllocator; std::unordered_set AnnotationPool; }; @@ -97,60 +96,62 @@ private: return SignExtend64<56>(ImmValue & 0xff'ffff'ffff'ffffULL); } - MCInst *getAnnotationInst(const MCInst &Inst) const { - if (Inst.getNumOperands() == 0) - return nullptr; + std::optional getFirstAnnotationOpIndex(const MCInst &Inst) const { + const unsigned NumPrimeOperands = MCPlus::getNumPrimeOperands(Inst); + if (Inst.getNumOperands() == NumPrimeOperands) + return std::nullopt; - const MCOperand &LastOp = Inst.getOperand(Inst.getNumOperands() - 1); - if (!LastOp.isInst()) - return nullptr; + assert(Inst.getOperand(NumPrimeOperands).getInst() == nullptr && + "Empty instruction expected."); - MCInst *AnnotationInst = const_cast(LastOp.getInst()); - assert(AnnotationInst->getOpcode() == TargetOpcode::ANNOTATION_LABEL); + return NumPrimeOperands + 1; + } - return AnnotationInst; + MCInst::iterator getAnnotationInstOp(MCInst &Inst) const { + for (MCInst::iterator Iter = Inst.begin(); Iter != Inst.end(); ++Iter) { + if (Iter->isInst()) { + assert(Iter->getInst() == nullptr && "Empty instruction expected."); + return Iter; + } + } + return Inst.end(); } - void removeAnnotationInst(MCInst &Inst) const { - assert(getAnnotationInst(Inst) && "Expected annotation instruction."); - Inst.erase(std::prev(Inst.end())); - assert(!getAnnotationInst(Inst) && - "More than one annotation instruction detected."); + void removeAnnotations(MCInst &Inst) const { + Inst.erase(getAnnotationInstOp(Inst), Inst.end()); } - void setAnnotationOpValue(MCInst &Inst, unsigned Index, int64_t Value, - AllocatorIdTy AllocatorId = 0) { - MCInst *AnnotationInst = getAnnotationInst(Inst); - if (!AnnotationInst) { - AnnotationAllocator &Allocator = getAnnotationAllocator(AllocatorId); - AnnotationInst = new (Allocator.MCInstAllocator.Allocate()) MCInst(); - AnnotationInst->setOpcode(TargetOpcode::ANNOTATION_LABEL); - Inst.addOperand(MCOperand::createInst(AnnotationInst)); + void setAnnotationOpValue(MCInst &Inst, unsigned Index, int64_t Value) const { + const int64_t AnnotationValue = encodeAnnotationImm(Index, Value); + const std::optional FirstAnnotationOp = + getFirstAnnotationOpIndex(Inst); + if (!FirstAnnotationOp) { + Inst.addOperand(MCOperand::createInst(nullptr)); + Inst.addOperand(MCOperand::createImm(AnnotationValue)); + return; } - const int64_t AnnotationValue = encodeAnnotationImm(Index, Value); - for (int I = AnnotationInst->getNumOperands() - 1; I >= 0; --I) { - int64_t ImmValue = AnnotationInst->getOperand(I).getImm(); + for (unsigned I = *FirstAnnotationOp; I < Inst.getNumOperands(); ++I) { + const int64_t ImmValue = Inst.getOperand(I).getImm(); if (extractAnnotationIndex(ImmValue) == Index) { - AnnotationInst->getOperand(I).setImm(AnnotationValue); + Inst.getOperand(I).setImm(AnnotationValue); return; } } - AnnotationInst->addOperand(MCOperand::createImm(AnnotationValue)); + Inst.addOperand(MCOperand::createImm(AnnotationValue)); } std::optional getAnnotationOpValue(const MCInst &Inst, unsigned Index) const { - const MCInst *AnnotationInst = getAnnotationInst(Inst); - if (!AnnotationInst) + std::optional FirstAnnotationOp = getFirstAnnotationOpIndex(Inst); + if (!FirstAnnotationOp) return std::nullopt; - for (int I = AnnotationInst->getNumOperands() - 1; I >= 0; --I) { - int64_t ImmValue = AnnotationInst->getOperand(I).getImm(); - if (extractAnnotationIndex(ImmValue) == Index) { + for (unsigned I = *FirstAnnotationOp; I < Inst.getNumOperands(); ++I) { + const int64_t ImmValue = Inst.getOperand(I).getImm(); + if (extractAnnotationIndex(ImmValue) == Index) return extractAnnotationValue(ImmValue); - } } return std::nullopt; @@ -171,20 +172,17 @@ protected: /// AnnotationNameIndexMap and AnnotationsNames. mutable llvm::sys::RWMutex AnnotationNameMutex; - /// Allocate the TailCall annotation value. Clients of the target-specific + /// Set TailCall annotation value to true. Clients of the target-specific /// MCPlusBuilder classes must use convert/lower/create* interfaces instead. - void setTailCall(MCInst &Inst); + void setTailCall(MCInst &Inst) const; /// Transfer annotations from \p SrcInst to \p DstInst. void moveAnnotations(MCInst &&SrcInst, MCInst &DstInst) const { - assert(!getAnnotationInst(DstInst) && - "Destination instruction should not have annotations."); - const MCInst *AnnotationInst = getAnnotationInst(SrcInst); - if (!AnnotationInst) - return; + MCInst::iterator AnnotationOp = getAnnotationInstOp(SrcInst); + for (MCInst::iterator Iter = AnnotationOp; Iter != SrcInst.end(); ++Iter) + DstInst.addOperand(*Iter); - DstInst.addOperand(MCOperand::createInst(AnnotationInst)); - removeAnnotationInst(SrcInst); + SrcInst.erase(AnnotationOp, SrcInst.end()); } public: @@ -389,7 +387,6 @@ public: Allocator.AnnotationPool.clear(); Allocator.ValueAllocator.Reset(); - Allocator.MCInstAllocator.DestroyAll(); } } @@ -1127,20 +1124,19 @@ public: std::optional getEHInfo(const MCInst &Inst) const; /// Add handler and action info for call instruction. - void addEHInfo(MCInst &Inst, const MCPlus::MCLandingPad &LP); + void addEHInfo(MCInst &Inst, const MCPlus::MCLandingPad &LP) const; /// Update exception-handling info for the invoke instruction \p Inst. /// Return true on success and false otherwise, e.g. if the instruction is /// not an invoke. - bool updateEHInfo(MCInst &Inst, const MCPlus::MCLandingPad &LP); + bool updateEHInfo(MCInst &Inst, const MCPlus::MCLandingPad &LP) const; /// Return non-negative GNU_args_size associated with the instruction /// or -1 if there's no associated info. int64_t getGnuArgsSize(const MCInst &Inst) const; /// Add the value of GNU_args_size to Inst if it already has EH info. - void addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize, - AllocatorIdTy AllocId = 0); + void addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize) const; /// Return jump table addressed by this instruction. uint64_t getJumpTable(const MCInst &Inst) const; @@ -1153,7 +1149,7 @@ public: AllocatorIdTy AllocId = 0); /// Disassociate instruction with a jump table. - bool unsetJumpTable(MCInst &Inst); + bool unsetJumpTable(MCInst &Inst) const; /// Return destination of conditional tail call instruction if \p Inst is one. std::optional getConditionalTailCall(const MCInst &Inst) const; @@ -1161,11 +1157,11 @@ public: /// Mark the \p Instruction as a conditional tail call, and set its /// destination address if it is known. If \p Instruction was already marked, /// update its destination with \p Dest. - bool setConditionalTailCall(MCInst &Inst, uint64_t Dest = 0); + bool setConditionalTailCall(MCInst &Inst, uint64_t Dest = 0) const; /// If \p Inst was marked as a conditional tail call convert it to a regular /// branch. Return true if the instruction was converted. - bool unsetConditionalTailCall(MCInst &Inst); + bool unsetConditionalTailCall(MCInst &Inst) const; /// Return offset of \p Inst in the original function, if available. std::optional getOffset(const MCInst &Inst) const; @@ -1174,10 +1170,10 @@ public: uint32_t getOffsetWithDefault(const MCInst &Inst, uint32_t Default) const; /// Set offset of \p Inst in the original function. - bool setOffset(MCInst &Inst, uint32_t Offset, AllocatorIdTy AllocatorId = 0); + bool setOffset(MCInst &Inst, uint32_t Offset) const; /// Remove offset annotation. - bool clearOffset(MCInst &Inst); + bool clearOffset(MCInst &Inst) const; /// Return the label of \p Inst, if available. MCSymbol *getLabel(const MCInst &Inst) const; @@ -1826,8 +1822,7 @@ public: if (!std::is_trivial::value) Allocator.AnnotationPool.insert(A); - setAnnotationOpValue(Inst, Index, reinterpret_cast(A), - AllocatorId); + setAnnotationOpValue(Inst, Index, reinterpret_cast(A)); return A->getValue(); } @@ -1960,21 +1955,21 @@ public: /// /// Return true if the annotation was removed, false if the annotation /// was not present. - bool removeAnnotation(MCInst &Inst, unsigned Index); + bool removeAnnotation(MCInst &Inst, unsigned Index) const; /// Remove annotation associated with \p Name. /// /// Return true if the annotation was removed, false if the annotation /// was not present. - bool removeAnnotation(MCInst &Inst, StringRef Name) { + bool removeAnnotation(MCInst &Inst, StringRef Name) const { const auto Index = getAnnotationIndex(Name); if (!Index) return false; return removeAnnotation(Inst, *Index); } - /// Remove meta-data, but don't destroy it. - void stripAnnotations(MCInst &Inst, bool KeepTC = false); + /// Remove meta-data from the instruction, but don't destroy it. + void stripAnnotations(MCInst &Inst, bool KeepTC = false) const; virtual InstructionListType createInstrumentedIndirectCall(MCInst &&CallInst, MCSymbol *HandlerFuncAddr, diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index a73ee25e0e08..2ff9b87307c9 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -1998,7 +1998,7 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) { } } if (LastNonNop && !MIB->getOffset(*LastNonNop)) - MIB->setOffset(*LastNonNop, static_cast(Offset), AllocatorId); + MIB->setOffset(*LastNonNop, static_cast(Offset)); }; for (auto I = Instructions.begin(), E = Instructions.end(); I != E; ++I) { @@ -2021,7 +2021,7 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) { if (MIB->isNoop(Instr) && !MIB->getOffset(Instr)) { // If "Offset" annotation is not present, set it and mark the nop for // deletion. - MIB->setOffset(Instr, static_cast(Offset), AllocatorId); + MIB->setOffset(Instr, static_cast(Offset)); // Annotate ordinary nops, so we can safely delete them if required. MIB->addAnnotation(Instr, "NOP", static_cast(1), AllocatorId); } @@ -2302,6 +2302,13 @@ void BinaryFunction::removeConditionalTailCalls() { assert(CTCTargetLabel && "symbol expected for conditional tail call"); MCInst TailCallInstr; BC.MIB->createTailCall(TailCallInstr, CTCTargetLabel, BC.Ctx.get()); + + // Move offset from CTCInstr to TailCallInstr. + if (const std::optional Offset = BC.MIB->getOffset(*CTCInstr)) { + BC.MIB->setOffset(TailCallInstr, *Offset); + BC.MIB->clearOffset(*CTCInstr); + } + // Link new BBs to the original input offset of the BB where the CTC // is, so we can map samples recorded in new BBs back to the original BB // seem in the input binary (if using BAT) @@ -2330,12 +2337,6 @@ void BinaryFunction::removeConditionalTailCalls() { // This branch is no longer a conditional tail call. BC.MIB->unsetConditionalTailCall(*CTCInstr); - - // Move offset from CTCInstr to TailCallInstr. - if (std::optional Offset = BC.MIB->getOffset(*CTCInstr)) { - BC.MIB->setOffset(TailCallInstr, *Offset); - BC.MIB->clearOffset(*CTCInstr); - } } insertBasicBlocks(std::prev(end()), std::move(NewBlocks), @@ -3365,7 +3366,7 @@ void BinaryFunction::propagateGnuArgsSizeInfo( } } else if (BC.MIB->isInvoke(Instr)) { // Add the value of GNU_args_size as an extra operand to invokes. - BC.MIB->addGnuArgsSize(Instr, CurrentGnuArgsSize, AllocId); + BC.MIB->addGnuArgsSize(Instr, CurrentGnuArgsSize); } ++II; } diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp index 21d017ed4fdd..7b6a71d560ea 100644 --- a/bolt/lib/Core/MCPlusBuilder.cpp +++ b/bolt/lib/Core/MCPlusBuilder.cpp @@ -120,7 +120,7 @@ bool MCPlusBuilder::equals(const MCTargetExpr &A, const MCTargetExpr &B, llvm_unreachable("target-specific expressions are unsupported"); } -void MCPlusBuilder::setTailCall(MCInst &Inst) { +void MCPlusBuilder::setTailCall(MCInst &Inst) const { assert(!hasAnnotation(Inst, MCAnnotation::kTailCall)); setAnnotationOpValue(Inst, MCAnnotation::kTailCall, true); } @@ -149,7 +149,7 @@ std::optional MCPlusBuilder::getEHInfo(const MCInst &Inst) const { static_cast(*Action)); } -void MCPlusBuilder::addEHInfo(MCInst &Inst, const MCLandingPad &LP) { +void MCPlusBuilder::addEHInfo(MCInst &Inst, const MCLandingPad &LP) const { if (isCall(Inst)) { assert(!getEHInfo(Inst)); setAnnotationOpValue(Inst, MCAnnotation::kEHLandingPad, @@ -159,7 +159,7 @@ void MCPlusBuilder::addEHInfo(MCInst &Inst, const MCLandingPad &LP) { } } -bool MCPlusBuilder::updateEHInfo(MCInst &Inst, const MCLandingPad &LP) { +bool MCPlusBuilder::updateEHInfo(MCInst &Inst, const MCLandingPad &LP) const { if (!isInvoke(Inst)) return false; @@ -178,13 +178,12 @@ int64_t MCPlusBuilder::getGnuArgsSize(const MCInst &Inst) const { return *Value; } -void MCPlusBuilder::addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize, - AllocatorIdTy AllocId) { +void MCPlusBuilder::addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize) const { assert(GnuArgsSize >= 0 && "cannot set GNU_args_size to negative value"); assert(getGnuArgsSize(Inst) == -1LL && "GNU_args_size already set"); assert(isInvoke(Inst) && "GNU_args_size can only be set for invoke"); - setAnnotationOpValue(Inst, MCAnnotation::kGnuArgsSize, GnuArgsSize, AllocId); + setAnnotationOpValue(Inst, MCAnnotation::kGnuArgsSize, GnuArgsSize); } uint64_t MCPlusBuilder::getJumpTable(const MCInst &Inst) const { @@ -203,12 +202,12 @@ bool MCPlusBuilder::setJumpTable(MCInst &Inst, uint64_t Value, uint16_t IndexReg, AllocatorIdTy AllocId) { if (!isIndirectBranch(Inst)) return false; - setAnnotationOpValue(Inst, MCAnnotation::kJumpTable, Value, AllocId); + setAnnotationOpValue(Inst, MCAnnotation::kJumpTable, Value); getOrCreateAnnotationAs(Inst, "JTIndexReg", AllocId) = IndexReg; return true; } -bool MCPlusBuilder::unsetJumpTable(MCInst &Inst) { +bool MCPlusBuilder::unsetJumpTable(MCInst &Inst) const { if (!getJumpTable(Inst)) return false; removeAnnotation(Inst, MCAnnotation::kJumpTable); @@ -225,7 +224,7 @@ MCPlusBuilder::getConditionalTailCall(const MCInst &Inst) const { return static_cast(*Value); } -bool MCPlusBuilder::setConditionalTailCall(MCInst &Inst, uint64_t Dest) { +bool MCPlusBuilder::setConditionalTailCall(MCInst &Inst, uint64_t Dest) const { if (!isConditionalBranch(Inst)) return false; @@ -233,7 +232,7 @@ bool MCPlusBuilder::setConditionalTailCall(MCInst &Inst, uint64_t Dest) { return true; } -bool MCPlusBuilder::unsetConditionalTailCall(MCInst &Inst) { +bool MCPlusBuilder::unsetConditionalTailCall(MCInst &Inst) const { if (!getConditionalTailCall(Inst)) return false; removeAnnotation(Inst, MCAnnotation::kConditionalTailCall); @@ -255,13 +254,12 @@ uint32_t MCPlusBuilder::getOffsetWithDefault(const MCInst &Inst, return Default; } -bool MCPlusBuilder::setOffset(MCInst &Inst, uint32_t Offset, - AllocatorIdTy AllocatorId) { - setAnnotationOpValue(Inst, MCAnnotation::kOffset, Offset, AllocatorId); +bool MCPlusBuilder::setOffset(MCInst &Inst, uint32_t Offset) const { + setAnnotationOpValue(Inst, MCAnnotation::kOffset, Offset); return true; } -bool MCPlusBuilder::clearOffset(MCInst &Inst) { +bool MCPlusBuilder::clearOffset(MCInst &Inst) const { if (!hasAnnotation(Inst, MCAnnotation::kOffset)) return false; removeAnnotation(Inst, MCAnnotation::kOffset); @@ -280,49 +278,41 @@ bool MCPlusBuilder::setLabel(MCInst &Inst, MCSymbol *Label) { } bool MCPlusBuilder::hasAnnotation(const MCInst &Inst, unsigned Index) const { - const MCInst *AnnotationInst = getAnnotationInst(Inst); - if (!AnnotationInst) - return false; - return (bool)getAnnotationOpValue(Inst, Index); } -bool MCPlusBuilder::removeAnnotation(MCInst &Inst, unsigned Index) { - MCInst *AnnotationInst = getAnnotationInst(Inst); - if (!AnnotationInst) +bool MCPlusBuilder::removeAnnotation(MCInst &Inst, unsigned Index) const { + std::optional FirstAnnotationOp = getFirstAnnotationOpIndex(Inst); + if (!FirstAnnotationOp) return false; - for (int I = AnnotationInst->getNumOperands() - 1; I >= 0; --I) { - int64_t ImmValue = AnnotationInst->getOperand(I).getImm(); + for (unsigned I = Inst.getNumOperands() - 1; I >= *FirstAnnotationOp; --I) { + const int64_t ImmValue = Inst.getOperand(I).getImm(); if (extractAnnotationIndex(ImmValue) == Index) { - AnnotationInst->erase(AnnotationInst->begin() + I); + Inst.erase(Inst.begin() + I); return true; } } return false; } -void MCPlusBuilder::stripAnnotations(MCInst &Inst, bool KeepTC) { - MCInst *AnnotationInst = getAnnotationInst(Inst); - if (!AnnotationInst) - return; - // Preserve TailCall annotation. - auto IsTC = hasAnnotation(Inst, MCAnnotation::kTailCall); +void MCPlusBuilder::stripAnnotations(MCInst &Inst, bool KeepTC) const { + KeepTC &= hasAnnotation(Inst, MCAnnotation::kTailCall); - removeAnnotationInst(Inst); + removeAnnotations(Inst); - if (KeepTC && IsTC) + if (KeepTC) setTailCall(Inst); } void MCPlusBuilder::printAnnotations(const MCInst &Inst, raw_ostream &OS) const { - const MCInst *AnnotationInst = getAnnotationInst(Inst); - if (!AnnotationInst) + std::optional FirstAnnotationOp = getFirstAnnotationOpIndex(Inst); + if (!FirstAnnotationOp) return; - for (unsigned I = 0; I < AnnotationInst->getNumOperands(); ++I) { - const int64_t Imm = AnnotationInst->getOperand(I).getImm(); + for (unsigned I = *FirstAnnotationOp; I < Inst.getNumOperands(); ++I) { + const int64_t Imm = Inst.getOperand(I).getImm(); const unsigned Index = extractAnnotationIndex(Imm); const int64_t Value = extractAnnotationValue(Imm); const auto *Annotation = reinterpret_cast(Value); -- Gitee From 7526ca7d919737bc32b4d9b786ec1e20a59f0c44 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Thu, 28 Sep 2023 08:53:17 -0700 Subject: [PATCH 68/94] [Backport][ORC] Rename MemLifetimePolicy to MemLifetime. The *Policy suffix came from the earlier MemAllocPolicy type, where it was included to distinguish the type from a memory-allocation operation. MemLifetime is a noun already, so the *Policy suffix is just dead weight now. --- .../llvm/ExecutionEngine/JITLink/JITLink.h | 6 ++--- .../JITLink/JITLinkMemoryManager.h | 4 +-- .../ExecutionEngine/Orc/Shared/MemoryFlags.h | 25 +++++++++---------- .../Orc/Shared/TargetProcessControlTypes.h | 5 ++-- .../JITLink/COFFLinkGraphBuilder.cpp | 2 +- .../JITLink/ELFLinkGraphBuilder.h | 2 +- .../ExecutionEngine/JITLink/JITLinkGeneric.h | 14 +++++------ .../JITLink/JITLinkMemoryManager.cpp | 19 +++++++------- .../JITLink/MachOLinkGraphBuilder.cpp | 2 +- llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp | 4 +-- llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 2 +- .../JITLink/LinkGraphTests.cpp | 2 +- 12 files changed, 41 insertions(+), 46 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h index 568c9cf87f80..50b78095d015 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -723,10 +723,10 @@ public: void setMemProt(orc::MemProt Prot) { this->Prot = Prot; } /// Get the memory lifetime policy for this section. - orc::MemLifetimePolicy getMemLifetimePolicy() const { return MLP; } + orc::MemLifetime getMemLifetime() const { return ML; } /// Set the memory lifetime policy for this section. - void setMemLifetimePolicy(orc::MemLifetimePolicy MLP) { this->MLP = MLP; } + void setMemLifetime(orc::MemLifetime ML) { this->ML = ML; } /// Returns the ordinal for this section. SectionOrdinal getOrdinal() const { return SecOrdinal; } @@ -794,7 +794,7 @@ private: StringRef Name; orc::MemProt Prot; - orc::MemLifetimePolicy MLP = orc::MemLifetimePolicy::Standard; + orc::MemLifetime ML = orc::MemLifetime::Standard; SectionOrdinal SecOrdinal = 0; BlockSet Blocks; SymbolSet Symbols; diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h index 09e0d71cf0bd..1b8c4d4e181c 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h @@ -292,8 +292,8 @@ private: /// address of that block using the Segment's AllocGroup. Once memory has been /// populated, clients can call finalize to finalize the memory. /// -/// Note: Segments with MemLifetimePolicy::NoAlloc are not permitted, since -/// they would not be useful, and their presence is likely to indicate a bug. +/// Note: Segments with MemLifetime::NoAlloc are not permitted, since they would +/// not be useful, and their presence is likely to indicate a bug. class SimpleSegmentAlloc { public: /// Describes a segment to be allocated. diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h index c20366cfbb38..b8b5f90b6b0f 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h @@ -72,7 +72,7 @@ inline MemProt fromSysMemoryProtectionFlags(sys::Memory::ProtectionFlags PF) { /// deallocated if a call is made to /// JITLinkMemoryManager::InFlightAllocation::abandon. The policies below apply /// to finalized allocations. -enum class MemLifetimePolicy { +enum class MemLifetime { /// Standard memory should be allocated by the allocator and then deallocated /// when the deallocate method is called for the finalized allocation. Standard, @@ -89,15 +89,15 @@ enum class MemLifetimePolicy { }; /// Print a MemDeallocPolicy. -inline raw_ostream &operator<<(raw_ostream &OS, MemLifetimePolicy MLP) { +inline raw_ostream &operator<<(raw_ostream &OS, MemLifetime MLP) { switch (MLP) { - case MemLifetimePolicy::Standard: + case MemLifetime::Standard: OS << "standard"; break; - case MemLifetimePolicy::Finalize: + case MemLifetime::Finalize: OS << "finalize"; break; - case MemLifetimePolicy::NoAlloc: + case MemLifetime::NoAlloc: OS << "noalloc"; break; } @@ -124,11 +124,11 @@ public: AllocGroup() = default; /// Create an AllocGroup from a MemProt only -- uses - /// MemLifetimePolicy::Standard. + /// MemLifetime::Standard. AllocGroup(MemProt MP) : Id(static_cast(MP)) {} - /// Create an AllocGroup from a MemProt and a MemLifetimePolicy. - AllocGroup(MemProt MP, MemLifetimePolicy MLP) + /// Create an AllocGroup from a MemProt and a MemLifetime. + AllocGroup(MemProt MP, MemLifetime MLP) : Id(static_cast(MP) | (static_cast(MLP) << BitsForProt)) {} @@ -137,9 +137,9 @@ public: return static_cast(Id & ((1U << BitsForProt) - 1)); } - /// Returns the MemLifetimePolicy for this group. - MemLifetimePolicy getMemLifetimePolicy() const { - return static_cast(Id >> BitsForProt); + /// Returns the MemLifetime for this group. + MemLifetime getMemLifetime() const { + return static_cast(Id >> BitsForProt); } friend bool operator==(const AllocGroup &LHS, const AllocGroup &RHS) { @@ -203,8 +203,7 @@ private: /// Print an AllocGroup. inline raw_ostream &operator<<(raw_ostream &OS, AllocGroup AG) { - return OS << '(' << AG.getMemProt() << ", " << AG.getMemLifetimePolicy() - << ')'; + return OS << '(' << AG.getMemProt() << ", " << AG.getMemLifetime() << ')'; } } // end namespace orc diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h index 09c73db44a94..1285867565e2 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h @@ -36,10 +36,9 @@ struct RemoteAllocGroup { RemoteAllocGroup(MemProt Prot, bool FinalizeLifetime) : Prot(Prot), FinalizeLifetime(FinalizeLifetime) {} RemoteAllocGroup(const AllocGroup &AG) : Prot(AG.getMemProt()) { - assert(AG.getMemLifetimePolicy() != orc::MemLifetimePolicy::NoAlloc && + assert(AG.getMemLifetime() != orc::MemLifetime::NoAlloc && "Cannot use no-alloc memory in a remote alloc request"); - FinalizeLifetime = - AG.getMemLifetimePolicy() == orc::MemLifetimePolicy::Finalize; + FinalizeLifetime = AG.getMemLifetime() == orc::MemLifetime::Finalize; } MemProt Prot; diff --git a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp index 6668854e1a6a..3bf7c9edb8bc 100644 --- a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp @@ -161,7 +161,7 @@ Error COFFLinkGraphBuilder::graphifySections() { if (!GraphSec) { GraphSec = &G->createSection(SectionName, Prot); if ((*Sec)->Characteristics & COFF::IMAGE_SCN_LNK_REMOVE) - GraphSec->setMemLifetimePolicy(orc::MemLifetimePolicy::NoAlloc); + GraphSec->setMemLifetime(orc::MemLifetime::NoAlloc); } if (GraphSec->getMemProt() != Prot) return make_error("MemProt should match"); diff --git a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h index e72645798349..127f33aad2ea 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h +++ b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h @@ -366,7 +366,7 @@ template Error ELFLinkGraphBuilder::graphifySections() { GraphSec = &G->createSection(*Name, Prot); // Non-SHF_ALLOC sections get NoAlloc memory lifetimes. if (!(Sec.sh_flags & ELF::SHF_ALLOC)) { - GraphSec->setMemLifetimePolicy(orc::MemLifetimePolicy::NoAlloc); + GraphSec->setMemLifetime(orc::MemLifetime::NoAlloc); LLVM_DEBUG({ dbgs() << " " << SecIndex << ": \"" << *Name << "\" is not a SHF_ALLOC section. Using NoAlloc lifetime.\n"; diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h index e69eddd6e119..25569d63daa2 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h @@ -124,8 +124,7 @@ private: LLVM_DEBUG(dbgs() << "Fixing up blocks:\n"); for (auto &Sec : G.sections()) { - bool NoAllocSection = - Sec.getMemLifetimePolicy() == orc::MemLifetimePolicy::NoAlloc; + bool NoAllocSection = Sec.getMemLifetime() == orc::MemLifetime::NoAlloc; for (auto *B : Sec.blocks()) { LLVM_DEBUG(dbgs() << " " << *B << ":\n"); @@ -153,12 +152,11 @@ private: // If B is a block in a Standard or Finalize section then make sure // that no edges point to symbols in NoAlloc sections. - assert( - (NoAllocSection || !E.getTarget().isDefined() || - E.getTarget().getBlock().getSection().getMemLifetimePolicy() != - orc::MemLifetimePolicy::NoAlloc) && - "Block in allocated section has edge pointing to no-alloc " - "section"); + assert((NoAllocSection || !E.getTarget().isDefined() || + E.getTarget().getBlock().getSection().getMemLifetime() != + orc::MemLifetime::NoAlloc) && + "Block in allocated section has edge pointing to no-alloc " + "section"); // Dispatch to LinkerImpl for fixup. if (auto Err = impl().applyFixup(G, *B, E)) diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp index f481504135a5..57e17aa78fed 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp @@ -26,10 +26,10 @@ BasicLayout::BasicLayout(LinkGraph &G) : G(G) { for (auto &Sec : G.sections()) { // Skip empty sections, and sections with NoAlloc lifetime policies. if (Sec.blocks().empty() || - Sec.getMemLifetimePolicy() == orc::MemLifetimePolicy::NoAlloc) + Sec.getMemLifetime() == orc::MemLifetime::NoAlloc) continue; - auto &Seg = Segments[{Sec.getMemProt(), Sec.getMemLifetimePolicy()}]; + auto &Seg = Segments[{Sec.getMemProt(), Sec.getMemLifetime()}]; for (auto *B : Sec.blocks()) if (LLVM_LIKELY(!B->isZeroFill())) Seg.ContentBlocks.push_back(B); @@ -90,7 +90,7 @@ BasicLayout::getContiguousPageBasedLayoutSizes(uint64_t PageSize) { inconvertibleErrorCode()); uint64_t SegSize = alignTo(Seg.ContentSize + Seg.ZeroFillSize, PageSize); - if (AG.getMemLifetimePolicy() == orc::MemLifetimePolicy::Standard) + if (AG.getMemLifetime() == orc::MemLifetime::Standard) SegsSizes.StandardSegs += SegSize; else SegsSizes.FinalizeSegs += SegSize; @@ -164,15 +164,15 @@ void SimpleSegmentAlloc::Create(JITLinkMemoryManager &MemMgr, auto &AG = KV.first; auto &Seg = KV.second; - assert(AG.getMemLifetimePolicy() != orc::MemLifetimePolicy::NoAlloc && + assert(AG.getMemLifetime() != orc::MemLifetime::NoAlloc && "NoAlloc segments are not supported by SimpleSegmentAlloc"); auto AGSectionName = AGSectionNames[static_cast(AG.getMemProt()) | - static_cast(AG.getMemLifetimePolicy()) << 3]; + static_cast(AG.getMemLifetime()) << 3]; auto &Sec = G->createSection(AGSectionName, AG.getMemProt()); - Sec.setMemLifetimePolicy(AG.getMemLifetimePolicy()); + Sec.setMemLifetime(AG.getMemLifetime()); if (Seg.ContentSize != 0) { NextAddr = @@ -419,10 +419,9 @@ void InProcessMemoryManager::allocate(const JITLinkDylib *JD, LinkGraph &G, auto &AG = KV.first; auto &Seg = KV.second; - auto &SegAddr = - (AG.getMemLifetimePolicy() == orc::MemLifetimePolicy::Standard) - ? NextStandardSegAddr - : NextFinalizeSegAddr; + auto &SegAddr = (AG.getMemLifetime() == orc::MemLifetime::Standard) + ? NextStandardSegAddr + : NextFinalizeSegAddr; Seg.WorkingMem = SegAddr.toPtr(); Seg.Addr = SegAddr; diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp index c40e0f9ffc8d..45385eb6f76d 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp @@ -192,7 +192,7 @@ Error MachOLinkGraphBuilder::createNormalizedSections() { // TODO: Are there any other criteria for NoAlloc lifetime? if (NSec.Flags & MachO::S_ATTR_DEBUG) - NSec.GraphSection->setMemLifetimePolicy(orc::MemLifetimePolicy::NoAlloc); + NSec.GraphSection->setMemLifetime(orc::MemLifetime::NoAlloc); IndexToSection.insert(std::make_pair(SecIndex, std::move(NSec))); } diff --git a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp index ca4950077ffe..9cfe547c84c3 100644 --- a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp @@ -322,8 +322,8 @@ void SharedMemoryMapper::initialize(MemoryMapper::AllocInfo &AI, std::memset(Base + Segment.ContentSize, 0, Segment.ZeroFillSize); tpctypes::SharedMemorySegFinalizeRequest SegReq; - SegReq.RAG = {Segment.AG.getMemProt(), Segment.AG.getMemLifetimePolicy() == - MemLifetimePolicy::Finalize}; + SegReq.RAG = {Segment.AG.getMemProt(), + Segment.AG.getMemLifetime() == MemLifetime::Finalize}; SegReq.Addr = AI.MappingBase + Segment.Offset; SegReq.Size = Segment.ContentSize + Segment.ZeroFillSize; diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 00dd5206d75a..8e17f53c62c6 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -501,7 +501,7 @@ public: auto FixedAI = std::move(AI); FixedAI.MappingBase -= DeltaAddr; for (auto &Seg : FixedAI.Segments) - Seg.AG = {MemProt::Read | MemProt::Write, Seg.AG.getMemLifetimePolicy()}; + Seg.AG = {MemProt::Read | MemProt::Write, Seg.AG.getMemLifetime()}; FixedAI.Actions.clear(); InProcessMemoryMapper::initialize( FixedAI, [this, OnInitialized = std::move(OnInitialized)]( diff --git a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp index ff153f6d4b32..711f35fc7683 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp @@ -798,7 +798,7 @@ TEST(LinkGraphTest, BasicLayoutHonorsNoAlloc) { // Create a NoAlloc section and block. auto &Sec2 = G.createSection("__metadata", orc::MemProt::Read | orc::MemProt::Write); - Sec2.setMemLifetimePolicy(orc::MemLifetimePolicy::NoAlloc); + Sec2.setMemLifetime(orc::MemLifetime::NoAlloc); G.createContentBlock(Sec2, BlockContent.slice(0, 8), orc::ExecutorAddr(), 8, 0); -- Gitee From 81a37369e6c4a8cdb5335d2c6e16a240f8d6d51e Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Wed, 8 Nov 2023 01:54:50 +0400 Subject: [PATCH 69/94] [Backport][BOLT] Move instrumentation option check (NFC) (#71581) Move options check from emitBinary to more proper adjustCommandLineOptions. --- .../RuntimeLibs/InstrumentationRuntimeLibrary.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp index cc36406543f3..c6c284a3f784 100644 --- a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp +++ b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp @@ -57,6 +57,14 @@ void InstrumentationRuntimeLibrary::adjustCommandLineOptions( "the input binary\n"; exit(1); } + + if (BC.IsStaticExecutable && !opts::InstrumentationSleepTime) { + errs() << "BOLT-ERROR: instrumentation of static binary currently does not " + "support profile output on binary finalization, so it " + "requires -instrumentation-sleep-time=N (N>0) usage\n"; + exit(1); + } + if (!BC.FiniFunctionAddress && !BC.IsStaticExecutable) { errs() << "BOLT-ERROR: input binary lacks DT_FINI entry in the dynamic " "section but instrumentation currently relies on patching " @@ -89,13 +97,6 @@ void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC, "__BOLT", "__counters", MachO::S_REGULAR, SectionKind::getData())); - if (BC.IsStaticExecutable && !opts::InstrumentationSleepTime) { - errs() << "BOLT-ERROR: instrumentation of static binary currently does not " - "support profile output on binary finalization, so it " - "requires -instrumentation-sleep-time=N (N>0) usage\n"; - exit(1); - } - Section->setAlignment(llvm::Align(BC.RegularPageSize)); Streamer.switchSection(Section); -- Gitee From a02898e561a0f416f325f60c5f55b939b963d935 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Wed, 8 Nov 2023 11:01:10 +0000 Subject: [PATCH 70/94] [Backport][BOLT] Support instrumentation hook via DT_FINI_ARRAY (#67348) BOLT currently hooks its its instrumentation finalization function via `DT_FINI`. However, this method of calling finalization routines is not supported anymore on newer ABIs like RISC-V. `DT_FINI_ARRAY` is preferred there. This patch adds support for hooking into `DT_FINI_ARRAY` instead if the binary does not have a `DT_FINI` entry. If it does, `DT_FINI` takes precedence so this patch should not change how the currently supported instrumentation targets behave. `DT_FINI_ARRAY` points to an array in memory of `DT_FINI_ARRAYSZ` bytes. It consists of pointer-length entries that contain the addresses of finalization functions. However, the addresses are only filled-in by the dynamic linker at load time using relative relocations. This makes hooking via `DT_FINI_ARRAY` a bit more complicated than via `DT_FINI`. The implementation works as follows: - While scanning the binary: find the section where `DT_FINI_ARRAY` points to, read its first dynamic relocation and use its addend to find the address of the fini function we will use to hook; - While writing the output file: overwrite the addend of the dynamic relocation with the address of the runtime library's fini function. Updating the dynamic relocation required a bit of boiler plate: since dynamic relocations are stored in a `std::multiset` which doesn't support getting mutable references to its items, functions were added to `BinarySection` to take an existing relocation and insert a new one. --- bolt/include/bolt/Core/BinaryContext.h | 9 ++ bolt/include/bolt/Core/BinarySection.h | 20 +++- bolt/include/bolt/Rewrite/RewriteInstance.h | 9 ++ bolt/lib/Core/Relocation.cpp | 2 + bolt/lib/Rewrite/RewriteInstance.cpp | 86 ++++++++++++++- .../InstrumentationRuntimeLibrary.cpp | 7 -- bolt/test/AArch64/hook-fini.s | 103 ++++++++++++++++++ bolt/test/runtime/AArch64/hook-fini.test | 61 +++++++++++ 8 files changed, 287 insertions(+), 10 deletions(-) create mode 100644 bolt/test/AArch64/hook-fini.s create mode 100644 bolt/test/runtime/AArch64/hook-fini.test diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index ef57ff3541dc..39613228e908 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -680,6 +680,15 @@ public: /// the execution of the binary is completed. std::optional FiniFunctionAddress; + /// DT_FINI. + std::optional FiniAddress; + + /// DT_FINI_ARRAY. Only used when DT_FINI is not set. + std::optional FiniArrayAddress; + + /// DT_FINI_ARRAYSZ. Only used when DT_FINI is not set. + std::optional FiniArraySize; + /// Page alignment used for code layout. uint64_t PageAlign{HugePageSize}; diff --git a/bolt/include/bolt/Core/BinarySection.h b/bolt/include/bolt/Core/BinarySection.h index 326d088d1f04..92ab6ea0d38e 100644 --- a/bolt/include/bolt/Core/BinarySection.h +++ b/bolt/include/bolt/Core/BinarySection.h @@ -375,8 +375,12 @@ public: /// Add a dynamic relocation at the given /p Offset. void addDynamicRelocation(uint64_t Offset, MCSymbol *Symbol, uint64_t Type, uint64_t Addend, uint64_t Value = 0) { - assert(Offset < getSize() && "offset not within section bounds"); - DynamicRelocations.emplace(Relocation{Offset, Symbol, Type, Addend, Value}); + addDynamicRelocation(Relocation{Offset, Symbol, Type, Addend, Value}); + } + + void addDynamicRelocation(const Relocation &Reloc) { + assert(Reloc.Offset < getSize() && "offset not within section bounds"); + DynamicRelocations.emplace(Reloc); } /// Add relocation against the original contents of this section. @@ -410,6 +414,18 @@ public: return Itr != DynamicRelocations.end() ? &*Itr : nullptr; } + std::optional takeDynamicRelocationAt(uint64_t Offset) { + Relocation Key{Offset, 0, 0, 0, 0}; + auto Itr = DynamicRelocations.find(Key); + + if (Itr == DynamicRelocations.end()) + return std::nullopt; + + Relocation Reloc = *Itr; + DynamicRelocations.erase(Itr); + return Reloc; + } + uint64_t hash(const BinaryData &BD) const { std::map Cache; return hash(BD, Cache); diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h index 6e020ba95ca0..ca62d788e139 100644 --- a/bolt/include/bolt/Rewrite/RewriteInstance.h +++ b/bolt/include/bolt/Rewrite/RewriteInstance.h @@ -95,6 +95,15 @@ private: /// from meta data in the file. void discoverFileObjects(); + /// Check whether we should use DT_FINI or DT_FINI_ARRAY for instrumentation. + /// DT_FINI is preferred; DT_FINI_ARRAY is only used when no DT_FINI entry was + /// found. + Error discoverRtFiniAddress(); + + /// If DT_FINI_ARRAY is used for instrumentation, update the relocation of its + /// first entry to point to the instrumentation library's fini address. + void updateRtFiniReloc(); + /// Create and initialize metadata rewriters for this instance. void initializeMetadataManager(); diff --git a/bolt/lib/Core/Relocation.cpp b/bolt/lib/Core/Relocation.cpp index a20a3f46c7d0..70fcc6953ed7 100644 --- a/bolt/lib/Core/Relocation.cpp +++ b/bolt/lib/Core/Relocation.cpp @@ -340,7 +340,9 @@ static uint64_t encodeValueAArch64(uint64_t Type, uint64_t Value, uint64_t PC) { switch (Type) { default: llvm_unreachable("unsupported relocation"); + case ELF::R_AARCH64_ABS16: case ELF::R_AARCH64_ABS32: + case ELF::R_AARCH64_ABS64: break; case ELF::R_AARCH64_PREL16: case ELF::R_AARCH64_PREL32: diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 25d730a38001..3c5f39a95118 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -702,6 +702,10 @@ Error RewriteInstance::run() { adjustCommandLineOptions(); discoverFileObjects(); + if (opts::Instrument && !BC->IsStaticExecutable) + if (Error E = discoverRtFiniAddress()) + return E; + preprocessProfileData(); // Skip disassembling if we have a translation table and we are running an @@ -738,6 +742,9 @@ Error RewriteInstance::run() { updateMetadata(); + if (opts::Instrument && !BC->IsStaticExecutable) + updateRtFiniReloc(); + if (opts::LinuxKernelMode) { errs() << "BOLT-WARNING: not writing the output file for Linux Kernel\n"; return Error::success(); @@ -1278,6 +1285,77 @@ void RewriteInstance::discoverFileObjects() { registerFragments(); } +Error RewriteInstance::discoverRtFiniAddress() { + // Use DT_FINI if it's available. + if (BC->FiniAddress) { + BC->FiniFunctionAddress = BC->FiniAddress; + return Error::success(); + } + + if (!BC->FiniArrayAddress || !BC->FiniArraySize) { + return createStringError( + std::errc::not_supported, + "Instrumentation needs either DT_FINI or DT_FINI_ARRAY"); + } + + if (*BC->FiniArraySize < BC->AsmInfo->getCodePointerSize()) { + return createStringError(std::errc::not_supported, + "Need at least 1 DT_FINI_ARRAY slot"); + } + + ErrorOr FiniArraySection = + BC->getSectionForAddress(*BC->FiniArrayAddress); + if (auto EC = FiniArraySection.getError()) + return errorCodeToError(EC); + + if (const Relocation *Reloc = FiniArraySection->getDynamicRelocationAt(0)) { + BC->FiniFunctionAddress = Reloc->Addend; + return Error::success(); + } + + if (const Relocation *Reloc = FiniArraySection->getRelocationAt(0)) { + BC->FiniFunctionAddress = Reloc->Value; + return Error::success(); + } + + return createStringError(std::errc::not_supported, + "No relocation for first DT_FINI_ARRAY slot"); +} + +void RewriteInstance::updateRtFiniReloc() { + // Updating DT_FINI is handled by patchELFDynamic. + if (BC->FiniAddress) + return; + + const RuntimeLibrary *RT = BC->getRuntimeLibrary(); + if (!RT || !RT->getRuntimeFiniAddress()) + return; + + assert(BC->FiniArrayAddress && BC->FiniArraySize && + "inconsistent .fini_array state"); + + ErrorOr FiniArraySection = + BC->getSectionForAddress(*BC->FiniArrayAddress); + assert(FiniArraySection && ".fini_array removed"); + + if (std::optional Reloc = + FiniArraySection->takeDynamicRelocationAt(0)) { + assert(Reloc->Addend == BC->FiniFunctionAddress && + "inconsistent .fini_array dynamic relocation"); + Reloc->Addend = RT->getRuntimeFiniAddress(); + FiniArraySection->addDynamicRelocation(*Reloc); + } + + // Update the static relocation by adding a pending relocation which will get + // patched when flushPendingRelocations is called in rewriteFile. Note that + // flushPendingRelocations will calculate the value to patch as + // "Symbol + Addend". Since we don't have a symbol, just set the addend to the + // desired value. + FiniArraySection->addPendingRelocation(Relocation{ + /*Offset*/ 0, /*Symbol*/ nullptr, /*Type*/ Relocation::getAbs64(), + /*Addend*/ RT->getRuntimeFiniAddress(), /*Value*/ 0}); +} + void RewriteInstance::registerFragments() { if (!BC->HasSplitFunctions) return; @@ -5106,7 +5184,13 @@ Error RewriteInstance::readELFDynamic(ELFObjectFile *File) { } break; case ELF::DT_FINI: - BC->FiniFunctionAddress = Dyn.getPtr(); + BC->FiniAddress = Dyn.getPtr(); + break; + case ELF::DT_FINI_ARRAY: + BC->FiniArrayAddress = Dyn.getPtr(); + break; + case ELF::DT_FINI_ARRAYSZ: + BC->FiniArraySize = Dyn.getPtr(); break; case ELF::DT_RELA: DynamicRelocationsAddress = Dyn.getPtr(); diff --git a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp index c6c284a3f784..cd1b975be7b9 100644 --- a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp +++ b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp @@ -65,13 +65,6 @@ void InstrumentationRuntimeLibrary::adjustCommandLineOptions( exit(1); } - if (!BC.FiniFunctionAddress && !BC.IsStaticExecutable) { - errs() << "BOLT-ERROR: input binary lacks DT_FINI entry in the dynamic " - "section but instrumentation currently relies on patching " - "DT_FINI to write the profile\n"; - exit(1); - } - if ((opts::InstrumentationWaitForks || opts::InstrumentationSleepTime) && opts::InstrumentationFileAppendPID) { errs() diff --git a/bolt/test/AArch64/hook-fini.s b/bolt/test/AArch64/hook-fini.s new file mode 100644 index 000000000000..a07187c2ef89 --- /dev/null +++ b/bolt/test/AArch64/hook-fini.s @@ -0,0 +1,103 @@ +## Test the different ways of hooking the fini function for instrumentation (via +## DT_FINI and via DT_FINI_ARRAY). We test the latter for both PIE and non-PIE +## binaries because of the different ways of handling relocations (static or +## dynamic). +## All tests perform the following steps: +## - Compile and link for the case to be tested +## - Some sanity-checks on the dynamic section and relocations in the binary to +## verify it has the shape we want for testing: +## - DT_FINI or DT_FINI_ARRAY in dynamic section +## - No relative relocations for non-PIE +## - Instrument +## - Verify generated binary +# REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}} + +# RUN: %clang %cflags -pie %s -Wl,-q -o %t.exe +# RUN: llvm-readelf -d %t.exe | FileCheck --check-prefix=DYN-FINI %s +# RUN: llvm-readelf -r %t.exe | FileCheck --check-prefix=RELOC-PIE %s +# RUN: llvm-bolt %t.exe -o %t --instrument +# RUN: llvm-readelf -drs %t | FileCheck --check-prefix=CHECK-FINI %s + +# RUN: %clang %cflags -pie %s -Wl,-q,-fini=0 -o %t-no-fini.exe +# RUN: llvm-readelf -d %t-no-fini.exe | FileCheck --check-prefix=DYN-NO-FINI %s +# RUN: llvm-readelf -r %t-no-fini.exe | FileCheck --check-prefix=RELOC-PIE %s +# RUN: llvm-bolt %t-no-fini.exe -o %t-no-fini --instrument +# RUN: llvm-readelf -drs %t-no-fini | FileCheck --check-prefix=CHECK-NO-FINI %s +# RUN: llvm-readelf -ds -x .fini_array %t-no-fini | FileCheck --check-prefix=CHECK-NO-FINI-RELOC %s + +## Create a dummy shared library to link against to force creation of the dynamic section. +# RUN: %clang %cflags %p/../Inputs/stub.c -fPIC -shared -o %t-stubs.so +# RUN: %clang %cflags %s -no-pie -Wl,-q,-fini=0 %t-stub.so -o %t-no-pie-no-fini.exe +# RUN: llvm-readelf -r %t-no-pie-no-fini.exe | FileCheck --check-prefix=RELOC-NO-PIE %s +# RUN: llvm-bolt %t-no-pie-no-fini.exe -o %t-no-pie-no-fini --instrument +# RUN: llvm-readelf -ds -x .fini_array %t-no-pie-no-fini | FileCheck --check-prefix=CHECK-NO-PIE-NO-FINI %s + +## With fini: dynamic section should contain DT_FINI +# DYN-FINI: (FINI) + +## Without fini: dynamic section should only contain DT_FINI_ARRAY +# DYN-NO-FINI-NOT: (FINI) +# DYN-NO-FINI: (FINI_ARRAY) +# DYN-NO-FINI: (FINI_ARRAYSZ) + +## With PIE: binary should have relative relocations +# RELOC-PIE: R_AARCH64_RELATIVE + +## Without PIE: binary should not have relative relocations +# RELOC-NO-PIE-NOT: R_AARCH64_RELATIVE + +## Check that DT_FINI is set to __bolt_runtime_fini +# CHECK-FINI: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-FINI-DAG: (FINI) 0x[[FINI:[[:xdigit:]]+]] +# CHECK-FINI-DAG: (FINI_ARRAY) 0x[[FINI_ARRAY:[[:xdigit:]]+]] +## Check that the dynamic relocation at .fini_array was not patched +# CHECK-FINI: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries +# CHECK-FINI-NOT: {{0+}}[[FINI_ARRAY]] {{.*}} R_AARCH64_RELATIVE [[FINI]] +# CHECK-FINI: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-FINI: {{0+}}[[FINI]] {{.*}} __bolt_runtime_fini + +## Check that DT_FINI_ARRAY has a dynamic relocation for __bolt_runtime_fini +# CHECK-NO-FINI: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-NO-FINI-NOT: (FINI) +# CHECK-NO-FINI: (FINI_ARRAY) 0x[[FINI_ARRAY:[[:xdigit:]]+]] +# CHECK-NO-FINI: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries +# CHECK-NO-FINI: {{0+}}[[FINI_ARRAY]] {{.*}} R_AARCH64_RELATIVE [[FINI_ADDR:[[:xdigit:]]+]] +# CHECK-NO-FINI: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-NO-FINI: {{0+}}[[FINI_ADDR]] {{.*}} __bolt_runtime_fini + +## Check that the static relocation in .fini_array is patched even for PIE +# CHECK-NO-FINI-RELOC: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-NO-FINI-RELOC: (FINI_ARRAY) 0x[[FINI_ARRAY:[[:xdigit:]]+]] +# CHECK-NO-FINI-RELOC: Symbol table '.symtab' contains {{.*}} entries: +## Read bytes separately so we can reverse them later +# CHECK-NO-FINI-RELOC: {{0+}}[[FINI_ADDR_B0:[[:xdigit:]]{2}]][[FINI_ADDR_B1:[[:xdigit:]]{2}]][[FINI_ADDR_B2:[[:xdigit:]]{2}]][[FINI_ADDR_B3:[[:xdigit:]]{2}]] {{.*}} __bolt_runtime_fini +# CHECK-NO-FINI-RELOC: Hex dump of section '.fini_array': +# CHECK-NO-FINI-RELOC: 0x{{0+}}[[FINI_ARRAY]] [[FINI_ADDR_B3]][[FINI_ADDR_B2]][[FINI_ADDR_B1]][[FINI_ADDR_B0]] 00000000 + +## Check that DT_FINI_ARRAY has static relocation applied for __bolt_runtime_fini +# CHECK-NO-PIE-NO-FINI: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-NO-PIE-NO-FINI-NOT: (FINI) +# CHECK-NO-PIE-NO-FINI: (FINI_ARRAY) 0x[[FINI_ARRAY:[a-f0-9]+]] +# CHECK-NO-PIE-NO-FINI: Symbol table '.symtab' contains {{.*}} entries: +## Read address bytes separately so we can reverse them later +# CHECK-NO-PIE-NO-FINI: {{0+}}[[FINI_ADDR_B0:[[:xdigit:]]{2}]][[FINI_ADDR_B1:[[:xdigit:]]{2}]][[FINI_ADDR_B2:[[:xdigit:]]{2}]][[FINI_ADDR_B3:[[:xdigit:]]{2}]] {{.*}} __bolt_runtime_fini +# CHECK-NO-PIE-NO-FINI: Hex dump of section '.fini_array': +# CHECK-NO-PIE-NO-FINI: 0x{{0+}}[[FINI_ARRAY]] [[FINI_ADDR_B3]][[FINI_ADDR_B2]][[FINI_ADDR_B1]][[FINI_ADDR_B0]] 00000000 + + .globl _start + .type _start, %function +_start: + # Dummy relocation to force relocation mode. + .reloc 0, R_AARCH64_NONE + ret +.size _start, .-_start + + .globl _fini + .type _fini, %function +_fini: + ret + .size _fini, .-_fini + + .section .fini_array,"aw" + .align 3 + .dword _fini diff --git a/bolt/test/runtime/AArch64/hook-fini.test b/bolt/test/runtime/AArch64/hook-fini.test new file mode 100644 index 000000000000..8d23b21b6d61 --- /dev/null +++ b/bolt/test/runtime/AArch64/hook-fini.test @@ -0,0 +1,61 @@ +# Test the different ways of hooking the fini function for instrumentation (via +# DT_FINI and via DT_FINI_ARRAY). We test the latter for both PIE and non-PIE +# binaries because of the different ways of handling relocations (static or +# dynamic). +# All tests perform the following steps: +# - Compile and link for the case to be tested +# - Some sanity-checks on the dynamic section and relocations in the binary to +# verify it has the shape we want for testing: +# - DT_FINI or DT_FINI_ARRAY in dynamic section +# - No relative relocations for non-PIE +# - Instrument +# - Run instrumented binary +# - Verify generated profile +REQUIRES: system-linux,bolt-runtime + +RUN: %clang %cflags -pie %p/Inputs/basic-instrumentation.s -Wl,-q -o %t.exe +RUN: llvm-readelf -d %t.exe | FileCheck --check-prefix=DYN-FINI %s +RUN: llvm-readelf -r %t.exe | FileCheck --check-prefix=RELOC-PIE %s +RUN: llvm-bolt %t.exe -o %t --instrument \ +RUN: --instrumentation-file=%t \ +RUN: --instrumentation-file-append-pid +RUN: rm -f %t.*.fdata +RUN: %t +RUN: cat %t.*.fdata | FileCheck %s + +RUN: %clang %cflags -pie %p/Inputs/basic-instrumentation.s -Wl,-q,-fini=0 -o %t-no-fini.exe +RUN: llvm-readelf -d %t-no-fini.exe | FileCheck --check-prefix=DYN-NO-FINI %s +RUN: llvm-readelf -r %t-no-fini.exe | FileCheck --check-prefix=RELOC-PIE %s +RUN: llvm-bolt %t-no-fini.exe -o %t-no-fini --instrument \ +RUN: --instrumentation-file=%t-no-fini \ +RUN: --instrumentation-file-append-pid +RUN: rm -f %t-no-fini.*.fdata +RUN: %t-no-fini +RUN: cat %t-no-fini.*.fdata | FileCheck %s + +RUN: %clang %cflags -no-pie %p/Inputs/basic-instrumentation.s -Wl,-q,-fini=0 -o %t-no-pie-no-fini.exe +RUN: llvm-readelf -d %t-no-pie-no-fini.exe | FileCheck --check-prefix=DYN-NO-FINI %s +RUN: llvm-readelf -r %t-no-pie-no-fini.exe | FileCheck --check-prefix=RELOC-NO-PIE %s +RUN: llvm-bolt %t-no-pie-no-fini.exe -o %t-no-pie-no-fini --instrument \ +RUN: --instrumentation-file=%t-no-pie-no-fini \ +RUN: --instrumentation-file-append-pid +RUN: rm -f %t-no-pie-no-fini.*.fdata +RUN: %t-no-pie-no-fini +RUN: cat %t-no-pie-no-fini.*.fdata | FileCheck %s + +# With fini: dynamic section should contain DT_FINI +DYN-FINI: (FINI) + +# Without fini: dynamic section should only contain DT_FINI_ARRAY +DYN-NO-FINI-NOT: (FINI) +DYN-NO-FINI: (FINI_ARRAY) +DYN-NO-FINI: (FINI_ARRAYSZ) + +# With PIE: binary should have relative relocations +RELOC-PIE: R_AARCH64_RELATIVE + +# Without PIE: binary should not have relative relocations +RELOC-NO-PIE-NOT: R_AARCH64_RELATIVE + +# The instrumented profile should at least say main was called once +CHECK: main 0 0 1{{$}} -- Gitee From 346cbe8b26ea4d0d8900c803e2a83885d47bbc99 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 8 Nov 2023 10:53:36 -0800 Subject: [PATCH 71/94] [Backport][BOLT] Follow-up to "Fix incorrect basic block output addresses" (#71630) In 8244ff6739a09cb75e6e7fd1c24b85e2b1397266, I've introduced an assertion that incorrectly used BasicBlock::empty(). Some basic blocks may contain only pseudo instructions and thus BB->empty() will evaluate to false, while the actual code size will be zero. --- bolt/lib/Core/BinaryFunction.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 2ff9b87307c9..3472a538b041 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -4167,7 +4167,7 @@ void BinaryFunction::updateOutputValues(const BOLTLinker &Linker) { assert(PrevBB->getOutputAddressRange().first <= BBAddress && "Bad output address for basic block."); assert((PrevBB->getOutputAddressRange().first != BBAddress || - !hasInstructions() || PrevBB->empty()) && + !hasInstructions() || !PrevBB->getNumNonPseudos()) && "Bad output address for basic block."); PrevBB->setOutputEndAddress(BBAddress); } -- Gitee From 29f95e32e056e8a6adfb326a83512c931159f68a Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Thu, 9 Nov 2023 09:13:44 +0100 Subject: [PATCH 72/94] [Backport][BOLT] Fix typo in test --- bolt/test/AArch64/hook-fini.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/test/AArch64/hook-fini.s b/bolt/test/AArch64/hook-fini.s index a07187c2ef89..4f321d463ef3 100644 --- a/bolt/test/AArch64/hook-fini.s +++ b/bolt/test/AArch64/hook-fini.s @@ -26,7 +26,7 @@ # RUN: llvm-readelf -ds -x .fini_array %t-no-fini | FileCheck --check-prefix=CHECK-NO-FINI-RELOC %s ## Create a dummy shared library to link against to force creation of the dynamic section. -# RUN: %clang %cflags %p/../Inputs/stub.c -fPIC -shared -o %t-stubs.so +# RUN: %clang %cflags %p/../Inputs/stub.c -fPIC -shared -o %t-stub.so # RUN: %clang %cflags %s -no-pie -Wl,-q,-fini=0 %t-stub.so -o %t-no-pie-no-fini.exe # RUN: llvm-readelf -r %t-no-pie-no-fini.exe | FileCheck --check-prefix=RELOC-NO-PIE %s # RUN: llvm-bolt %t-no-pie-no-fini.exe -o %t-no-pie-no-fini --instrument -- Gitee From 4e9d49df41a6c1ba3f925cc69a4f1d4d3db117ca Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Fri, 10 Nov 2023 00:46:04 +0400 Subject: [PATCH 73/94] [Backport][BOLT] Run EliminateUnreachableBlocks in parallel (#71299) The wall time for this pass decreased on my laptop from ~80 sec to 5 sec processing the clang. --- bolt/include/bolt/Core/BinaryFunction.h | 3 +- bolt/lib/Core/BinaryFunction.cpp | 5 ++- bolt/lib/Passes/BinaryPasses.cpp | 60 ++++++++++++++----------- 3 files changed, 39 insertions(+), 29 deletions(-) diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 5feef3f178c9..9deeb13a077b 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -1445,7 +1445,8 @@ public: /// Rebuilds BBs layout, ignoring dead BBs. Returns the number of removed /// BBs and the removed number of bytes of code. - std::pair eraseInvalidBBs(); + std::pair + eraseInvalidBBs(const MCCodeEmitter *Emitter = nullptr); /// Get the relative order between two basic blocks in the original /// layout. The result is > 0 if B occurs before A and < 0 if B diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 3472a538b041..b72279bd731b 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -322,7 +322,8 @@ void BinaryFunction::markUnreachableBlocks() { // Any unnecessary fallthrough jumps revealed after calling eraseInvalidBBs // will be cleaned up by fixBranches(). -std::pair BinaryFunction::eraseInvalidBBs() { +std::pair +BinaryFunction::eraseInvalidBBs(const MCCodeEmitter *Emitter) { DenseSet InvalidBBs; unsigned Count = 0; uint64_t Bytes = 0; @@ -331,7 +332,7 @@ std::pair BinaryFunction::eraseInvalidBBs() { assert(!isEntryPoint(*BB) && "all entry blocks must be valid"); InvalidBBs.insert(BB); ++Count; - Bytes += BC.computeCodeSize(BB->begin(), BB->end()); + Bytes += BC.computeCodeSize(BB->begin(), BB->end(), Emitter); } } diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index 8db22de37b33..5366a62d7328 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -317,38 +317,46 @@ void NormalizeCFG::runOnFunctions(BinaryContext &BC) { } void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) { - if (!Function.getLayout().block_empty()) { - unsigned Count; - uint64_t Bytes; - Function.markUnreachableBlocks(); - LLVM_DEBUG({ - for (BinaryBasicBlock &BB : Function) { - if (!BB.isValid()) { - dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName() - << " in function " << Function << "\n"; - Function.dump(); - } + BinaryContext &BC = Function.getBinaryContext(); + unsigned Count; + uint64_t Bytes; + Function.markUnreachableBlocks(); + LLVM_DEBUG({ + for (BinaryBasicBlock &BB : Function) { + if (!BB.isValid()) { + dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName() + << " in function " << Function << "\n"; + Function.dump(); } - }); - std::tie(Count, Bytes) = Function.eraseInvalidBBs(); - DeletedBlocks += Count; - DeletedBytes += Bytes; - if (Count) { - Modified.insert(&Function); - if (opts::Verbosity > 0) - outs() << "BOLT-INFO: removed " << Count - << " dead basic block(s) accounting for " << Bytes - << " bytes in function " << Function << '\n'; } + }); + BinaryContext::IndependentCodeEmitter Emitter = + BC.createIndependentMCCodeEmitter(); + std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get()); + DeletedBlocks += Count; + DeletedBytes += Bytes; + if (Count) { + auto L = BC.scopeLock(); + Modified.insert(&Function); + if (opts::Verbosity > 0) + outs() << "BOLT-INFO: removed " << Count + << " dead basic block(s) accounting for " << Bytes + << " bytes in function " << Function << '\n'; } } void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) { - for (auto &It : BC.getBinaryFunctions()) { - BinaryFunction &Function = It.second; - if (shouldOptimize(Function)) - runOnFunction(Function); - } + ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { + runOnFunction(BF); + }; + + ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { + return !shouldOptimize(BF) || BF.getLayout().block_empty(); + }; + + ParallelUtilities::runOnEachFunction( + BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun, + SkipPredicate, "elimininate-unreachable"); if (DeletedBlocks) outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and " -- Gitee From 67dfef881f7baacc807ce8037737920fc51d30be Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Fri, 10 Nov 2023 11:46:36 +0400 Subject: [PATCH 74/94] [Backport][BOLT][AArch64] Fix strict usage during ADR Relax (#71377) Currently strict mode is used to expand number of optimized functions, not to shrink it. Revert the option usage in the pass, so passing strict option would relax adr instruction even if there are no nops around it. Also add check for nop after adr instruction. --- bolt/lib/Passes/ADRRelaxationPass.cpp | 7 +++-- bolt/test/AArch64/r_aarch64_prelxx.s | 2 +- bolt/test/runtime/AArch64/adrrelaxationpass.s | 31 +++++++++++-------- bolt/test/runtime/AArch64/controlflow.s | 2 ++ 4 files changed, 26 insertions(+), 16 deletions(-) diff --git a/bolt/lib/Passes/ADRRelaxationPass.cpp b/bolt/lib/Passes/ADRRelaxationPass.cpp index 7b612cbf6572..4039fa2fbb51 100644 --- a/bolt/lib/Passes/ADRRelaxationPass.cpp +++ b/bolt/lib/Passes/ADRRelaxationPass.cpp @@ -72,14 +72,17 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { if (It != BB.begin() && BC.MIB->isNoop(*std::prev(It))) { It = BB.eraseInstruction(std::prev(It)); - } else if (opts::StrictMode && !BF.isSimple()) { + } else if (std::next(It) != BB.end() && BC.MIB->isNoop(*std::next(It))) { + BB.eraseInstruction(std::next(It)); + } else if (!opts::StrictMode && !BF.isSimple()) { // If the function is not simple, it may contain a jump table undetected // by us. This jump table may use an offset from the branch instruction // to land in the desired place. If we add new instructions, we // invalidate this offset, so we have to rely on linker-inserted NOP to // replace it with ADRP, and abort if it is not present. + auto L = BC.scopeLock(); errs() << formatv("BOLT-ERROR: Cannot relax adr in non-simple function " - "{0}. Can't proceed in current mode.\n", + "{0}. Use --strict option to override\n", BF.getOneName()); PassFailed = true; return; diff --git a/bolt/test/AArch64/r_aarch64_prelxx.s b/bolt/test/AArch64/r_aarch64_prelxx.s index 444dee72b7c0..73bf8387d363 100644 --- a/bolt/test/AArch64/r_aarch64_prelxx.s +++ b/bolt/test/AArch64/r_aarch64_prelxx.s @@ -12,7 +12,7 @@ // CHECKPREL-NEXT: R_AARCH64_PREL32 {{.*}} _start + 4 // CHECKPREL-NEXT: R_AARCH64_PREL64 {{.*}} _start + 8 -// RUN: llvm-bolt %t.exe -o %t.bolt +// RUN: llvm-bolt %t.exe -o %t.bolt --strict // RUN: llvm-objdump -D %t.bolt | FileCheck %s --check-prefix=CHECKPREL32 // CHECKPREL32: [[#%x,DATATABLEADDR:]] : diff --git a/bolt/test/runtime/AArch64/adrrelaxationpass.s b/bolt/test/runtime/AArch64/adrrelaxationpass.s index 5c50cd637192..fa9fb63c613d 100644 --- a/bolt/test/runtime/AArch64/adrrelaxationpass.s +++ b/bolt/test/runtime/AArch64/adrrelaxationpass.s @@ -1,33 +1,27 @@ # The second and third ADR instructions are non-local to functions # and must be replaced with ADRP + ADD by BOLT -# Also since main is non-simple, we can't change it's length so we have to -# replace NOP with adrp, and if there is no nop before adr in non-simple +# Also since main and test are non-simple, we can't change it's length so we +# have to replace NOP with adrp, and if there is no nop before adr in non-simple # function, we can't guarantee we didn't break possible jump tables, so we -# fail in strict mode +# fail in non-strict mode # REQUIRES: system-linux # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ # RUN: %s -o %t.o # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --adr-relaxation=true +# RUN: llvm-bolt %t.exe -o %t.bolt --adr-relaxation=true --strict # RUN: llvm-objdump --no-print-imm-hex -d --disassemble-symbols=main %t.bolt | FileCheck %s # RUN: %t.bolt -# RUN: not llvm-bolt %t.exe -o %t.bolt --adr-relaxation=true --strict \ +# RUN: not llvm-bolt %t.exe -o %t.bolt --adr-relaxation=true \ # RUN: 2>&1 | FileCheck %s --check-prefix CHECK-ERROR - .data - .align 8 - .global Gvar -Gvar: .xword 0x0 - .global Gvar2 -Gvar2: .xword 0x42 - .text .align 4 .global test .type test, %function test: + adr x2, Gvar mov x0, xzr ret .size test, .-test @@ -47,6 +41,17 @@ br: .CI: .word 0xff + .data + .align 8 + .global Gvar +Gvar: .xword 0x0 + .global Gvar2 +Gvar2: .xword 0x42 + .balign 4 +jmptable: + .word 0 + .word test - jmptable + # CHECK:
: # CHECK-NEXT: adr x0, 0x{{[1-8a-f][0-9a-f]*}} # CHECK-NEXT: adrp x1, 0x{{[1-8a-f][0-9a-f]*}} @@ -54,4 +59,4 @@ br: # CHECK-NEXT: adrp x2, 0x{{[1-8a-f][0-9a-f]*}} # CHECK-NEXT: add x2, x2, #{{[1-8a-f][0-9a-f]*}} # CHECK-NEXT: adr x3, 0x{{[1-8a-f][0-9a-f]*}} -# CHECK-ERROR: BOLT-ERROR: Cannot relax adr in non-simple function main +# CHECK-ERROR: BOLT-ERROR: Cannot relax adr in non-simple function diff --git a/bolt/test/runtime/AArch64/controlflow.s b/bolt/test/runtime/AArch64/controlflow.s index fe9aab88f0c7..7b0a38779f6e 100644 --- a/bolt/test/runtime/AArch64/controlflow.s +++ b/bolt/test/runtime/AArch64/controlflow.s @@ -48,6 +48,7 @@ test_cond_branch: .global test_branch_reg .type test_branch_reg, %function test_branch_reg: + nop adr x0, test_branch_zero br x0 panic @@ -97,6 +98,7 @@ test_call: .global test_call_reg .type test_call_reg, %function test_call_reg: + nop adr x0, test_call_foo blr x0 panic -- Gitee From ff5f77c736173fd931d697c8100ed2b74ff5957c Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Wed, 8 Nov 2023 11:41:43 +0400 Subject: [PATCH 75/94] [Backport][BOLT][AArch64] Handle IFUNCS properly (#71104) Currently we were testing only the binaries compiled with O0, which results in indirect call to the IFUNC trampoline and the trampoline has associated IFUNC symbol with it. Compile with O3 results in direct calling the IFUNC trampoline and no symbols are associated with it, the IFUNC symbol address becomes the same as IFUNC resolver address. Since no symbol was associated the BF was not created before PLT analyze and be the algorithm we're going to analyze target relocation. As we're expecting the JUMP relocation we're also expecting the associated symbol with it to be presented. But for IFUNC relocation the IRELATIVE relocation is used and no symbol is associated with it, the addend value is pointing on the target symbol, so we need to find BF using it and use it's symbol in this situation. Currently this is checked only for AArch64 platform, so I've limited it in code to use this logic only for this platform, although I wouldn't be surprised if other platforms needs to activate this logic too. --- bolt/include/bolt/Core/Relocation.h | 4 +++ bolt/lib/Rewrite/RewriteInstance.cpp | 29 +++++++++++++++----- bolt/test/AArch64/Inputs/iplt.ld | 3 +++ bolt/test/AArch64/ifunc.c | 40 ++++++++++++++++++++++++++++ bolt/test/runtime/iplt.c | 8 +++++- 5 files changed, 77 insertions(+), 7 deletions(-) create mode 100644 bolt/test/AArch64/Inputs/iplt.ld create mode 100644 bolt/test/AArch64/ifunc.c diff --git a/bolt/include/bolt/Core/Relocation.h b/bolt/include/bolt/Core/Relocation.h index 1ddba9d78b3b..bdea698b9531 100644 --- a/bolt/include/bolt/Core/Relocation.h +++ b/bolt/include/bolt/Core/Relocation.h @@ -123,6 +123,10 @@ struct Relocation { /// otherwise. bool isRelative() const { return isRelative(Type); } + /// Return true if this relocation is R_*_IRELATIVE type. Return false + /// otherwise. + bool isIRelative() const { return isIRelative(Type); } + /// Emit relocation at a current \p Streamer' position. The caller is /// responsible for setting the position correctly. size_t emit(MCStreamer *Streamer) const; diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 3c5f39a95118..5a6b2c57c2fd 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -1420,24 +1420,41 @@ void RewriteInstance::createPLTBinaryFunction(uint64_t TargetAddress, BinaryFunction *BF = BC->getBinaryFunctionAtAddress(EntryAddress); if (BF && BC->isAArch64()) { - // Handle IFUNC trampoline + // Handle IFUNC trampoline with symbol setPLTSymbol(BF, BF->getOneName()); return; } const Relocation *Rel = BC->getDynamicRelocationAt(TargetAddress); - if (!Rel || !Rel->Symbol) + if (!Rel) return; + MCSymbol *Symbol = Rel->Symbol; + if (!Symbol) { + if (!BC->isAArch64() || !Rel->Addend || !Rel->isIRelative()) + return; + + // IFUNC trampoline without symbol + BinaryFunction *TargetBF = BC->getBinaryFunctionAtAddress(Rel->Addend); + if (!TargetBF) { + errs() + << "BOLT-WARNING: Expected BF to be presented as IFUNC resolver at " + << Twine::utohexstr(Rel->Addend) << ", skipping\n"; + return; + } + + Symbol = TargetBF->getSymbol(); + } + ErrorOr Section = BC->getSectionForAddress(EntryAddress); assert(Section && "cannot get section for address"); if (!BF) - BF = BC->createBinaryFunction(Rel->Symbol->getName().str() + "@PLT", - *Section, EntryAddress, 0, EntrySize, + BF = BC->createBinaryFunction(Symbol->getName().str() + "@PLT", *Section, + EntryAddress, 0, EntrySize, Section->getAlignment()); else - BF->addAlternativeName(Rel->Symbol->getName().str() + "@PLT"); - setPLTSymbol(BF, Rel->Symbol->getName()); + BF->addAlternativeName(Symbol->getName().str() + "@PLT"); + setPLTSymbol(BF, Symbol->getName()); } void RewriteInstance::disassemblePLTSectionAArch64(BinarySection &Section) { diff --git a/bolt/test/AArch64/Inputs/iplt.ld b/bolt/test/AArch64/Inputs/iplt.ld new file mode 100644 index 000000000000..1e54a249b218 --- /dev/null +++ b/bolt/test/AArch64/Inputs/iplt.ld @@ -0,0 +1,3 @@ +SECTIONS { + .plt : ALIGN(16) { *(.plt) *(.iplt) } +} diff --git a/bolt/test/AArch64/ifunc.c b/bolt/test/AArch64/ifunc.c new file mode 100644 index 000000000000..dea2cf6bd543 --- /dev/null +++ b/bolt/test/AArch64/ifunc.c @@ -0,0 +1,40 @@ +// This test checks that IFUNC trampoline is properly recognised by BOLT + +// With -O0 indirect call is performed on IPLT trampoline. IPLT trampoline +// has IFUNC symbol. +// RUN: %clang %cflags -nostdlib -O0 -no-pie %s -fuse-ld=lld \ +// RUN: -o %t.O0.exe -Wl,-q +// RUN: llvm-bolt %t.O0.exe -o %t.O0.bolt.exe \ +// RUN: --print-disasm --print-only=_start | \ +// RUN: FileCheck --check-prefix=O0_CHECK %s + +// With -O3 direct call is performed on IPLT trampoline. IPLT trampoline +// doesn't have associated symbol. The ifunc symbol has the same address as +// IFUNC resolver function. +// RUN: %clang %cflags -nostdlib -O3 %s -fuse-ld=lld -fPIC -pie \ +// RUN: -o %t.O3_pie.exe -Wl,-q +// RUN: llvm-bolt %t.O3_pie.exe -o %t.O3_pie.bolt.exe \ +// RUN: --print-disasm --print-only=_start | \ +// RUN: FileCheck --check-prefix=O3_CHECK %s + +// Check that IPLT trampoline located in .plt section are normally handled by +// BOLT. The gnu-ld linker doesn't use separate .iplt section. +// RUN: %clang %cflags -nostdlib -O3 %s -fuse-ld=lld -fPIC -pie \ +// RUN: -T %p/Inputs/iplt.ld -o %t.iplt_O3_pie.exe -Wl,-q +// RUN: llvm-bolt %t.iplt_O3_pie.exe -o %t.iplt_O3_pie.bolt.exe \ +// RUN: --print-disasm --print-only=_start | \ +// RUN: FileCheck --check-prefix=O3_CHECK %s + +// O0_CHECK: adr x{{[0-9]+}}, ifoo +// O3_CHECK: b "{{resolver_foo|ifoo}}{{.*}}@PLT" + +#include +#include + +static void foo() {} + +static void *resolver_foo(void) { return foo; } + +__attribute__((ifunc("resolver_foo"))) void ifoo(); + +void _start() { ifoo(); } diff --git a/bolt/test/runtime/iplt.c b/bolt/test/runtime/iplt.c index b0e2e6d25070..d5b56d901e62 100644 --- a/bolt/test/runtime/iplt.c +++ b/bolt/test/runtime/iplt.c @@ -1,10 +1,16 @@ // This test checks that the ifuncs works after bolt. +// Compiling with 00 results in IFUNC indirect calling. -// RUN: %clang %cflags -no-pie %s -fuse-ld=lld \ +// RUN: %clang %cflags -O0 -no-pie %s -fuse-ld=lld \ // RUN: -o %t.exe -Wl,-q // RUN: llvm-bolt %t.exe -o %t.bolt.exe --use-old-text=0 --lite=0 // RUN: %t.bolt.exe | FileCheck %s +// RUN: %clang %cflags -O3 -no-pie %s -fuse-ld=lld \ +// RUN: -o %t.O3.exe -Wl,-q +// RUN: llvm-bolt %t.O3.exe -o %t.O3.bolt.exe --use-old-text=0 --lite=0 +// RUN: %t.O3.bolt.exe | FileCheck %s + // CHECK: foo #include -- Gitee From af35d0248018fffe72bb2fd4736b00d77e01b0b0 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Wed, 8 Nov 2023 16:42:21 -0800 Subject: [PATCH 76/94] [Backport][BOLT][AArch64] Fix ifuncs test header inclusion (#71741) Summary: Do not include stdlib headers as these tests are built with -nostdlib. Tests outside of runtime folder also run cross-platforms, so an x86 machine wouldn't have access to the correct headers used in the aarch64 toolchain, even if it has an aarch64 compiler (clang itself). --- bolt/test/AArch64/ifunc.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/bolt/test/AArch64/ifunc.c b/bolt/test/AArch64/ifunc.c index dea2cf6bd543..b9f014883230 100644 --- a/bolt/test/AArch64/ifunc.c +++ b/bolt/test/AArch64/ifunc.c @@ -28,9 +28,6 @@ // O0_CHECK: adr x{{[0-9]+}}, ifoo // O3_CHECK: b "{{resolver_foo|ifoo}}{{.*}}@PLT" -#include -#include - static void foo() {} static void *resolver_foo(void) { return foo; } -- Gitee From de725e55f829ffa6202c53f61d850004624b3105 Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Fri, 10 Nov 2023 11:47:12 +0400 Subject: [PATCH 77/94] [Backport][BOLT] Read .rela.dyn in static non-pie binary (#71635) Static non-pie binary doesn't have DYNAMIC segment and BOLT skips reading .rela.dyn section because of it. But such binaries might have this section for example to store IFUNC relocation which is resolved by linked-in startup files, so force reading this section for static executables. --- bolt/include/bolt/Rewrite/RewriteInstance.h | 1 + bolt/lib/Rewrite/RewriteInstance.cpp | 13 ++++++++++++ bolt/test/AArch64/ifunc.c | 23 +++++++++++++++++++++ 3 files changed, 37 insertions(+) diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h index ca62d788e139..261a7337535b 100644 --- a/bolt/include/bolt/Rewrite/RewriteInstance.h +++ b/bolt/include/bolt/Rewrite/RewriteInstance.h @@ -424,6 +424,7 @@ private: /// Common section names. static StringRef getEHFrameSectionName() { return ".eh_frame"; } + static StringRef getRelaDynSectionName() { return ".rela.dyn"; } /// An instance of the input binary we are processing, externally owned. llvm::object::ELFObjectFileBase *InputFile; diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 5a6b2c57c2fd..71ecfa6a5c99 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -2201,6 +2201,19 @@ void RewriteInstance::processDynamicRelocations() { } // The rest of dynamic relocations - DT_RELA. + // The static executable might have .rela.dyn secion and not have PT_DYNAMIC + if (!DynamicRelocationsSize && BC->IsStaticExecutable) { + ErrorOr DynamicRelSectionOrErr = + BC->getUniqueSectionByName(getRelaDynSectionName()); + if (DynamicRelSectionOrErr) { + DynamicRelocationsAddress = DynamicRelSectionOrErr->getAddress(); + DynamicRelocationsSize = DynamicRelSectionOrErr->getSize(); + const SectionRef &SectionRef = DynamicRelSectionOrErr->getSectionRef(); + DynamicRelativeRelocationsCount = std::distance( + SectionRef.relocation_begin(), SectionRef.relocation_end()); + } + } + if (DynamicRelocationsSize > 0) { ErrorOr DynamicRelSectionOrErr = BC->getSectionForAddress(*DynamicRelocationsAddress); diff --git a/bolt/test/AArch64/ifunc.c b/bolt/test/AArch64/ifunc.c index b9f014883230..8edb913ee70d 100644 --- a/bolt/test/AArch64/ifunc.c +++ b/bolt/test/AArch64/ifunc.c @@ -7,6 +7,20 @@ // RUN: llvm-bolt %t.O0.exe -o %t.O0.bolt.exe \ // RUN: --print-disasm --print-only=_start | \ // RUN: FileCheck --check-prefix=O0_CHECK %s +// RUN: llvm-readelf -aW %t.O0.bolt.exe | \ +// RUN: FileCheck --check-prefix=REL_CHECK %s + +// Non-pie static executable doesn't generate PT_DYNAMIC, check relocation +// is readed successfully and IPLT trampoline has been identified by bolt. +// RUN: %clang %cflags -nostdlib -O3 %s -fuse-ld=lld -no-pie \ +// RUN: -o %t.O3_nopie.exe -Wl,-q +// RUN: llvm-readelf -l %t.O3_nopie.exe | \ +// RUN: FileCheck --check-prefix=NON_DYN_CHECK %s +// RUN: llvm-bolt %t.O3_nopie.exe -o %t.O3_nopie.bolt.exe \ +// RUN: --print-disasm --print-only=_start | \ +// RUN: FileCheck --check-prefix=O3_CHECK %s +// RUN: llvm-readelf -aW %t.O3_nopie.bolt.exe | \ +// RUN: FileCheck --check-prefix=REL_CHECK %s // With -O3 direct call is performed on IPLT trampoline. IPLT trampoline // doesn't have associated symbol. The ifunc symbol has the same address as @@ -16,6 +30,8 @@ // RUN: llvm-bolt %t.O3_pie.exe -o %t.O3_pie.bolt.exe \ // RUN: --print-disasm --print-only=_start | \ // RUN: FileCheck --check-prefix=O3_CHECK %s +// RUN: llvm-readelf -aW %t.O3_pie.bolt.exe | \ +// RUN: FileCheck --check-prefix=REL_CHECK %s // Check that IPLT trampoline located in .plt section are normally handled by // BOLT. The gnu-ld linker doesn't use separate .iplt section. @@ -24,10 +40,17 @@ // RUN: llvm-bolt %t.iplt_O3_pie.exe -o %t.iplt_O3_pie.bolt.exe \ // RUN: --print-disasm --print-only=_start | \ // RUN: FileCheck --check-prefix=O3_CHECK %s +// RUN: llvm-readelf -aW %t.iplt_O3_pie.bolt.exe | \ +// RUN: FileCheck --check-prefix=REL_CHECK %s + +// NON_DYN_CHECK-NOT: DYNAMIC // O0_CHECK: adr x{{[0-9]+}}, ifoo // O3_CHECK: b "{{resolver_foo|ifoo}}{{.*}}@PLT" +// REL_CHECK: R_AARCH64_IRELATIVE [[#%x,REL_SYMB_ADDR:]] +// REL_CHECK: [[#REL_SYMB_ADDR]] {{.*}} FUNC {{.*}} resolver_foo + static void foo() {} static void *resolver_foo(void) { return foo; } -- Gitee From dd33f7cbae67a2befe32f0a9424929b30551e9b2 Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Fri, 10 Nov 2023 11:48:03 +0400 Subject: [PATCH 78/94] [Backport][BOLT][AArch64] Fix ADR relaxation (#71835) Currently we have an optimization that if the ADR points to the same function we might skip it's relaxation. But it doesn't take into account that BF might be split, in such situation we still need to relax it. And just in case also relax if the initial BF size is >= 1MB. Fixes #71822 --- bolt/lib/Passes/ADRRelaxationPass.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/bolt/lib/Passes/ADRRelaxationPass.cpp b/bolt/lib/Passes/ADRRelaxationPass.cpp index 4039fa2fbb51..27a1377adef1 100644 --- a/bolt/lib/Passes/ADRRelaxationPass.cpp +++ b/bolt/lib/Passes/ADRRelaxationPass.cpp @@ -56,9 +56,14 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { continue; } - BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol); - if (TargetBF && TargetBF == &BF) - continue; + // Don't relax adr if it points to the same function and it is not split + // and BF initial size is < 1MB. + const unsigned OneMB = 0x100000; + if (!BF.isSplit() && BF.getSize() < OneMB) { + BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol); + if (TargetBF && TargetBF == &BF) + continue; + } MCPhysReg Reg; BC.MIB->getADRReg(Inst, Reg); -- Gitee From a4f8181e90e47f3b9d231c09fee1f1b960b0ae97 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Sun, 12 Nov 2023 19:34:42 -0800 Subject: [PATCH 79/94] [Backport][BOLT] Enhance LowerAnnotations pass. NFCI. (#71847) After #70147, all primary annotation types are stored directly in the instruction and hence there's no need for the temporary storage we've used previously for repopulating preserved annotations. --- bolt/lib/Passes/BinaryPasses.cpp | 58 +++++++++++--------------------- 1 file changed, 20 insertions(+), 38 deletions(-) diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index 5366a62d7328..83c7138e5fe5 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -582,65 +582,47 @@ bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const { } void LowerAnnotations::runOnFunctions(BinaryContext &BC) { - std::vector> PreservedOffsetAnnotations; - std::vector> PreservedLabelAnnotations; - - for (auto &It : BC.getBinaryFunctions()) { - BinaryFunction &BF = It.second; - - for (FunctionFragment &FF : BF.getLayout().fragments()) { + for (BinaryFunction *BF : BC.getAllBinaryFunctions()) { + for (FunctionFragment &FF : BF->getLayout().fragments()) { + // Reset at the start of the new fragment. int64_t CurrentGnuArgsSize = 0; for (BinaryBasicBlock *const BB : FF) { - // First convert GnuArgsSize annotations into CFIs. This may change - // instr pointers, so do it before recording ptrs for preserved - // annotations - if (BF.usesGnuArgsSize()) { - for (auto II = BB->begin(); II != BB->end(); ++II) { - if (!BC.MIB->isInvoke(*II)) - continue; + for (auto II = BB->begin(); II != BB->end(); ++II) { + + // Convert GnuArgsSize annotations into CFIs. + if (BF->usesGnuArgsSize() && BC.MIB->isInvoke(*II)) { const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II); assert(NewGnuArgsSize >= 0 && - "expected non-negative GNU_args_size"); + "Expected non-negative GNU_args_size."); if (NewGnuArgsSize != CurrentGnuArgsSize) { - auto InsertII = BF.addCFIInstruction( + auto InsertII = BF->addCFIInstruction( BB, II, MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize)); CurrentGnuArgsSize = NewGnuArgsSize; II = std::next(InsertII); } } - } - // Now record preserved annotations separately and then strip - // annotations. - for (auto II = BB->begin(); II != BB->end(); ++II) { - if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II)) - PreservedOffsetAnnotations.emplace_back(&(*II), - *BC.MIB->getOffset(*II)); - if (MCSymbol *Label = BC.MIB->getLabel(*II)) - PreservedLabelAnnotations.emplace_back(&*II, Label); + // Preserve selected annotations and strip the rest. + std::optional Offset = BF->requiresAddressTranslation() + ? BC.MIB->getOffset(*II) + : std::nullopt; + MCSymbol *Label = BC.MIB->getLabel(*II); + BC.MIB->stripAnnotations(*II); + + if (Offset) + BC.MIB->setOffset(*II, *Offset); + if (Label) + BC.MIB->setLabel(*II, Label); } } } } - for (BinaryFunction *BF : BC.getInjectedBinaryFunctions()) - for (BinaryBasicBlock &BB : *BF) - for (MCInst &Instruction : BB) { - if (MCSymbol *Label = BC.MIB->getLabel(Instruction)) - PreservedLabelAnnotations.emplace_back(&Instruction, Label); - BC.MIB->stripAnnotations(Instruction); - } // Release all memory taken by annotations BC.MIB->freeAnnotations(); - - // Reinsert preserved annotations we need during code emission. - for (const std::pair &Item : PreservedOffsetAnnotations) - BC.MIB->setOffset(*Item.first, Item.second); - for (auto [Instr, Label] : PreservedLabelAnnotations) - BC.MIB->setLabel(*Instr, Label); } // Check for dirty state in MCSymbol objects that might be a consequence -- Gitee From 6ff8a27a3eca66365ab8920493c0c3b9e18b864b Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 13 Nov 2023 14:33:39 -0800 Subject: [PATCH 80/94] [Backport][BOLT] Make instruction size a first-class annotation (#72167) When NOP instructions are used to reserve space in the code, e.g. for patching, it becomes critical to preserve their original size while emitting the code. On x86, we rely on "Size" annotation for NOP instructions size, as the original instruction size is lost in the disassembly/assembly process. This change makes instruction size a first-class annotation and is affectively NFCI. A follow-up diff will use the annotation for code emission. --- bolt/include/bolt/Core/BinaryContext.h | 4 ++-- bolt/include/bolt/Core/MCPlus.h | 1 + bolt/include/bolt/Core/MCPlusBuilder.h | 6 ++++++ bolt/lib/Core/BinaryContext.cpp | 2 ++ bolt/lib/Core/BinaryFunction.cpp | 11 ++++++----- bolt/lib/Core/MCPlusBuilder.cpp | 11 +++++++++++ bolt/lib/Profile/DataReader.cpp | 3 ++- bolt/lib/Rewrite/RewriteInstance.cpp | 1 - 8 files changed, 30 insertions(+), 9 deletions(-) diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 39613228e908..078b42b19b83 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -1230,8 +1230,8 @@ public: uint64_t computeInstructionSize(const MCInst &Inst, const MCCodeEmitter *Emitter = nullptr) const { - if (auto Size = MIB->getAnnotationWithDefault(Inst, "Size")) - return Size; + if (std::optional Size = MIB->getSize(Inst)) + return *Size; if (!Emitter) Emitter = this->MCE.get(); diff --git a/bolt/include/bolt/Core/MCPlus.h b/bolt/include/bolt/Core/MCPlus.h index f6ffd33513dd..b6a9e73f2347 100644 --- a/bolt/include/bolt/Core/MCPlus.h +++ b/bolt/include/bolt/Core/MCPlus.h @@ -72,6 +72,7 @@ public: kConditionalTailCall, /// CTC. kOffset, /// Offset in the function. kLabel, /// MCSymbol pointing to this instruction. + kSize, /// Size of the instruction. kGeneric /// First generic annotation. }; diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 90d1fe32b9e3..1a7f544c1b6a 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -1182,6 +1182,12 @@ public: /// is emitted to MCStreamer. bool setLabel(MCInst &Inst, MCSymbol *Label); + /// Get instruction size specified via annotation. + std::optional getSize(const MCInst &Inst) const; + + /// Set instruction size. + void setSize(MCInst &Inst, uint32_t Size) const; + /// Return MCSymbol that represents a target of this instruction at a given /// operand number \p OpNum. If there's no symbol associated with /// the operand - return nullptr. diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index f1a660836562..651dd1130f49 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -1863,6 +1863,8 @@ void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction, } if (std::optional Offset = MIB->getOffset(Instruction)) OS << " # Offset: " << *Offset; + if (std::optional Size = MIB->getSize(Instruction)) + OS << " # Size: " << *Size; if (MCSymbol *Label = MIB->getLabel(Instruction)) OS << " # Label: " << *Label; diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index b72279bd731b..db8087385e0c 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -1380,7 +1380,7 @@ add_instruction: // NOTE: disassembly loses the correct size information for noops. // E.g. nopw 0x0(%rax,%rax,1) is 9 bytes, but re-encoded it's only // 5 bytes. Preserve the size info using annotations. - MIB->addAnnotation(Instruction, "Size", static_cast(Size)); + MIB->setSize(Instruction, Size); } addInstruction(Offset, std::move(Instruction)); @@ -4348,10 +4348,11 @@ MCInst *BinaryFunction::getInstructionAtOffset(uint64_t Offset) { } if (MCInst *LastInstr = BB->getLastNonPseudoInstr()) { - const uint32_t Size = - BC.MIB->getAnnotationWithDefault(*LastInstr, "Size"); - if (BB->getEndOffset() - Offset == Size) - return LastInstr; + if (std::optional Size = BC.MIB->getSize(*LastInstr)) { + if (BB->getEndOffset() - Offset == Size) { + return LastInstr; + } + } } return nullptr; diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp index 7b6a71d560ea..0cafd3d20ffb 100644 --- a/bolt/lib/Core/MCPlusBuilder.cpp +++ b/bolt/lib/Core/MCPlusBuilder.cpp @@ -277,6 +277,17 @@ bool MCPlusBuilder::setLabel(MCInst &Inst, MCSymbol *Label) { return true; } +std::optional MCPlusBuilder::getSize(const MCInst &Inst) const { + if (std::optional Value = + getAnnotationOpValue(Inst, MCAnnotation::kSize)) + return static_cast(*Value); + return std::nullopt; +} + +void MCPlusBuilder::setSize(MCInst &Inst, uint32_t Size) const { + setAnnotationOpValue(Inst, MCAnnotation::kSize, Size); +} + bool MCPlusBuilder::hasAnnotation(const MCInst &Inst, unsigned Index) const { return (bool)getAnnotationOpValue(Inst, Index); } diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp index 0e12e8cb3070..dcc7578041fa 100644 --- a/bolt/lib/Profile/DataReader.cpp +++ b/bolt/lib/Profile/DataReader.cpp @@ -698,7 +698,8 @@ bool DataReader::recordBranch(BinaryFunction &BF, uint64_t From, uint64_t To, if (!BC.MIB->isNoop(Instr)) break; - Offset += BC.MIB->getAnnotationWithDefault(Instr, "Size"); + if (std::optional Size = BC.MIB->getSize(Instr)) + Offset += *Size; } if (To == Offset) diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 71ecfa6a5c99..7063b243b52d 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -3200,7 +3200,6 @@ void RewriteInstance::buildFunctionsCFG() { // Create annotation indices to allow lock-free execution BC->MIB->getOrCreateAnnotationIndex("JTIndexReg"); BC->MIB->getOrCreateAnnotationIndex("NOP"); - BC->MIB->getOrCreateAnnotationIndex("Size"); ParallelUtilities::WorkFuncWithAllocTy WorkFun = [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId) { -- Gitee From ad82be077e8c31f0958f0cb3a9d1fdcd9eb72ffa Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 13 Nov 2023 18:12:39 -0800 Subject: [PATCH 81/94] [Backport][BOLT] Fix NOP instruction emission on x86 (#72186) Use MCAsmBackend::writeNopData() interface to emit NOP instructions on x86. There are multiple forms of NOP instruction on x86 with different sizes. Currently, LLVM's assembly/disassembly does not support all forms correctly which can lead to a breakage of input code semantics, e.g. if the program relies on NOP instructions for reserving a patch space. Add "--keep-nops" option to preserve NOP instructions. --- bolt/include/bolt/Core/BinaryFunction.h | 2 +- bolt/lib/Core/BinaryEmitter.cpp | 12 +++++ bolt/lib/Core/BinaryFunction.cpp | 5 ++ bolt/lib/Passes/BinaryPasses.cpp | 3 ++ bolt/lib/Utils/CommandLineOpts.cpp | 5 ++ bolt/test/X86/keep-nops.s | 69 +++++++++++++++++++++++++ 6 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 bolt/test/X86/keep-nops.s diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 9deeb13a077b..985ee9186704 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -1296,7 +1296,7 @@ public: /// Return true if the function body is non-contiguous. bool isSplit() const { return isSimple() && getLayout().isSplit(); } - bool shouldPreserveNops() const { return PreserveNops; } + bool shouldPreserveNops() const; /// Return true if the function has exception handling tables. bool hasEHRanges() const { return HasEHRanges; } diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index 9b8d9d69faea..9c7905955835 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -514,6 +514,18 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF, Streamer.emitLabel(InstrLabel); } + // Emit sized NOPs via MCAsmBackend::writeNopData() interface on x86. + // This is a workaround for invalid NOPs handling by asm/disasm layer. + if (BC.MIB->isNoop(Instr) && BC.isX86()) { + if (std::optional Size = BC.MIB->getSize(Instr)) { + SmallString<15> Code; + raw_svector_ostream VecOS(Code); + BC.MAB->writeNopData(VecOS, *Size, BC.STI.get()); + Streamer.emitBytes(Code); + continue; + } + } + Streamer.emitInstruction(Instr, *BC.STI); LastIsPrefix = BC.MIB->isPrefix(Instr); } diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index db8087385e0c..e3f2cbfac30a 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -58,6 +58,7 @@ extern cl::OptionCategory BoltRelocCategory; extern cl::opt EnableBAT; extern cl::opt Instrument; +extern cl::opt KeepNops; extern cl::opt StrictMode; extern cl::opt UpdateDebugSections; extern cl::opt Verbosity; @@ -4447,6 +4448,10 @@ DebugLocationsVector BinaryFunction::translateInputToOutputLocationList( return MergedLL; } +bool BinaryFunction::shouldPreserveNops() const { + return PreserveNops || opts::KeepNops; +} + void BinaryFunction::printLoopInfo(raw_ostream &OS) const { if (!opts::shouldPrint(*this)) return; diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index 83c7138e5fe5..4e1343e2c30b 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -608,12 +608,15 @@ void LowerAnnotations::runOnFunctions(BinaryContext &BC) { std::optional Offset = BF->requiresAddressTranslation() ? BC.MIB->getOffset(*II) : std::nullopt; + std::optional Size = BC.MIB->getSize(*II); MCSymbol *Label = BC.MIB->getLabel(*II); BC.MIB->stripAnnotations(*II); if (Offset) BC.MIB->setOffset(*II, *Offset); + if (Size) + BC.MIB->setSize(*II, *Size); if (Label) BC.MIB->setLabel(*II, Label); } diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp index 19680fa945db..c998c73e9406 100644 --- a/bolt/lib/Utils/CommandLineOpts.cpp +++ b/bolt/lib/Utils/CommandLineOpts.cpp @@ -129,6 +129,11 @@ cl::opt cl::desc("instrument code to generate accurate profile data"), cl::cat(BoltOptCategory)); +cl::opt + KeepNops("keep-nops", + cl::desc("keep no-op instructions. By default they are removed."), + cl::Hidden, cl::cat(BoltOptCategory)); + cl::opt OutputFilename("o", cl::desc(""), diff --git a/bolt/test/X86/keep-nops.s b/bolt/test/X86/keep-nops.s new file mode 100644 index 000000000000..37da2ff07b9b --- /dev/null +++ b/bolt/test/X86/keep-nops.s @@ -0,0 +1,69 @@ +## Check that BOLT preserves NOP instructions of different sizes correctly. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-linux %s -o %t.o +# RUN: ld.lld %t.o -o %t.exe -q +# RUN: llvm-bolt %t.exe -o %t.bolt.exe --keep-nops --relocs --print-finalized \ +# RUN: |& FileCheck --check-prefix=CHECK-BOLT %s +# RUN: llvm-objdump -d %t.bolt.exe | FileCheck %s + + .text + .globl _start + .type _start,@function +_start: + .cfi_startproc + .nops 1 + .nops 2 + .nops 3 + .nops 4 + .nops 5 + .nops 6 + .nops 7 + .nops 8 + .nops 9 + .nops 10 + .nops 11 + .nops 12 + .nops 13 + .nops 14 + .nops 15 + +# CHECK: <_start>: +# CHECK-NEXT: 90 +# CHECK-NEXT: 66 90 +# CHECK-NEXT: 0f 1f 00 +# CHECK-NEXT: 0f 1f 40 00 +# CHECK-NEXT: 0f 1f 44 00 00 +# CHECK-NEXT: 66 0f 1f 44 00 00 +# CHECK-NEXT: 0f 1f 80 00 00 00 00 +# CHECK-NEXT: 0f 1f 84 00 00 00 00 00 +# CHECK-NEXT: 66 0f 1f 84 00 00 00 00 00 +# CHECK-NEXT: 66 2e 0f 1f 84 00 00 00 00 00 +# CHECK-NEXT: 66 66 2e 0f 1f 84 00 00 00 00 00 +# CHECK-NEXT: 66 66 66 2e 0f 1f 84 00 00 00 00 00 +# CHECK-NEXT: 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 +# CHECK-NEXT: 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 +# CHECK-NEXT: 66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 + +# CHECK-BOLT: Size: 1 +# CHECK-BOLT-NEXT: Size: 2 +# CHECK-BOLT-NEXT: Size: 3 +# CHECK-BOLT-NEXT: Size: 4 +# CHECK-BOLT-NEXT: Size: 5 +# CHECK-BOLT-NEXT: Size: 6 +# CHECK-BOLT-NEXT: Size: 7 +# CHECK-BOLT-NEXT: Size: 8 +# CHECK-BOLT-NEXT: Size: 9 +# CHECK-BOLT-NEXT: Size: 10 +# CHECK-BOLT-NEXT: Size: 11 +# CHECK-BOLT-NEXT: Size: 12 +# CHECK-BOLT-NEXT: Size: 13 +# CHECK-BOLT-NEXT: Size: 14 +# CHECK-BOLT-NEXT: Size: 15 + +# Needed for relocation mode. + .reloc 0, R_X86_64_NONE + + .size _start, .-_start + .cfi_endproc -- Gitee From 9c291458c440b77b73dcfb96037875a049a35c3e Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 14 Nov 2023 11:28:13 -0800 Subject: [PATCH 82/94] [Backport][BOLT] Refactor --keep-nops option. NFC. (#72228) Run RemoveNops pass only if --keep-nops is set to false (default). --- bolt/include/bolt/Core/BinaryFunction.h | 2 +- bolt/lib/Core/BinaryFunction.cpp | 4 ---- bolt/lib/Rewrite/BinaryPassManager.cpp | 8 +++++++- bolt/lib/Utils/CommandLineOpts.cpp | 5 ----- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 985ee9186704..9deeb13a077b 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -1296,7 +1296,7 @@ public: /// Return true if the function body is non-contiguous. bool isSplit() const { return isSimple() && getLayout().isSplit(); } - bool shouldPreserveNops() const; + bool shouldPreserveNops() const { return PreserveNops; } /// Return true if the function has exception handling tables. bool hasEHRanges() const { return HasEHRanges; } diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index e3f2cbfac30a..b78e3041b0a4 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -4448,10 +4448,6 @@ DebugLocationsVector BinaryFunction::translateInputToOutputLocationList( return MergedLL; } -bool BinaryFunction::shouldPreserveNops() const { - return PreserveNops || opts::KeepNops; -} - void BinaryFunction::printLoopInfo(raw_ostream &OS) const { if (!opts::shouldPrint(*this)) return; diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp index 517984d990fc..5aab26322537 100644 --- a/bolt/lib/Rewrite/BinaryPassManager.cpp +++ b/bolt/lib/Rewrite/BinaryPassManager.cpp @@ -72,6 +72,11 @@ static cl::opt JTFootprintReductionFlag( "instructions at jump sites"), cl::cat(BoltOptCategory)); +static cl::opt + KeepNops("keep-nops", + cl::desc("keep no-op instructions. By default they are removed."), + cl::Hidden, cl::cat(BoltOptCategory)); + cl::opt NeverPrint("never-print", cl::desc("never print"), cl::ReallyHidden, cl::cat(BoltOptCategory)); @@ -359,7 +364,8 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { Manager.registerPass(std::make_unique(NeverPrint)); - Manager.registerPass(std::make_unique(NeverPrint)); + Manager.registerPass(std::make_unique(NeverPrint), + !opts::KeepNops); Manager.registerPass(std::make_unique(PrintNormalized)); diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp index c998c73e9406..19680fa945db 100644 --- a/bolt/lib/Utils/CommandLineOpts.cpp +++ b/bolt/lib/Utils/CommandLineOpts.cpp @@ -129,11 +129,6 @@ cl::opt cl::desc("instrument code to generate accurate profile data"), cl::cat(BoltOptCategory)); -cl::opt - KeepNops("keep-nops", - cl::desc("keep no-op instructions. By default they are removed."), - cl::Hidden, cl::cat(BoltOptCategory)); - cl::opt OutputFilename("o", cl::desc(""), -- Gitee From 8ef77a4c7512280c1a46fd1617235c1a842f18a2 Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Thu, 16 Nov 2023 09:30:55 +0400 Subject: [PATCH 83/94] [Backport][BOLT] Enhance fixed indirect branch handling (#71324) Previously HasFixedIndirectBranch was set in BF to set isSimple to false later because of unreachable bb ellimination pass which might remove the BB with it's symbols accessed by other instructions than calls. It seems to be that better solution would be to add extra entry point on target offset instead of marking BF as non-simple. --- bolt/include/bolt/Core/BinaryFunction.h | 4 ---- bolt/lib/Core/BinaryFunction.cpp | 7 +------ 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 9deeb13a077b..0c62df34fa6a 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -319,10 +319,6 @@ private: /// Execution halts whenever this function is entered. bool TrapsOnEntry{false}; - /// True if the function had an indirect branch with a fixed internal - /// destination. - bool HasFixedIndirectBranch{false}; - /// True if the function is a fragment of another function. This means that /// this function could only be entered via its parent or one of its sibling /// fragments. It could be entered at any basic block. It can also return diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index b78e3041b0a4..361e7117b87d 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -432,8 +432,6 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) { OS << "\n IsSplit : " << isSplit(); OS << "\n BB Count : " << size(); - if (HasFixedIndirectBranch) - OS << "\n HasFixedIndirectBranch : true"; if (HasUnknownControlFlow) OS << "\n Unknown CF : true"; if (getPersonalityFunction()) @@ -1118,7 +1116,7 @@ void BinaryFunction::handleIndirectBranch(MCInst &Instruction, uint64_t Size, Instruction.clear(); MIB->createUncondBranch(Instruction, TargetSymbol, BC.Ctx.get()); TakenBranches.emplace_back(Offset, IndirectTarget - getAddress()); - HasFixedIndirectBranch = true; + addEntryPointAtOffset(IndirectTarget - getAddress()); } else { MIB->convertJmpToTailCall(Instruction); BC.addInterproceduralReference(this, IndirectTarget); @@ -1893,9 +1891,6 @@ bool BinaryFunction::postProcessIndirectBranches( LastIndirectJumpBB->updateJumpTableSuccessors(); } - if (HasFixedIndirectBranch) - return false; - // Validate that all data references to function offsets are claimed by // recognized jump tables. Register externally referenced blocks as entry // points. -- Gitee From 26ac2d1c7c03c9fe0e8dc550ef840e8cf6f0e8e9 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 20 Nov 2023 10:24:34 -0800 Subject: [PATCH 84/94] [Backport][BOLT][TEST] Remove LTO flag from a test (#72896) The LTO flag is not needed for the test to work properly. However, it may not build on a system where compiler and linker versions don't match one another. Remove the LTO flag. --- bolt/test/{lsda.cpp => lsda-section-name.cpp} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename bolt/test/{lsda.cpp => lsda-section-name.cpp} (89%) diff --git a/bolt/test/lsda.cpp b/bolt/test/lsda-section-name.cpp similarity index 89% rename from bolt/test/lsda.cpp rename to bolt/test/lsda-section-name.cpp index b7905a58b532..41fb17665821 100644 --- a/bolt/test/lsda.cpp +++ b/bolt/test/lsda-section-name.cpp @@ -1,8 +1,8 @@ // This test check that LSDA section named by .gcc_except_table.main is // disassembled by BOLT. -// RUN: %clang++ %cxxflags -O3 -flto=thin -no-pie -c %s -o %t.o -// RUN: %clang++ %cxxflags -flto=thin -no-pie -fuse-ld=lld %t.o -o %t.exe \ +// RUN: %clang++ %cxxflags -O3 -no-pie -c %s -o %t.o +// RUN: %clang++ %cxxflags -no-pie -fuse-ld=lld %t.o -o %t.exe \ // RUN: -Wl,-q -Wl,--script=%S/Inputs/lsda.ldscript // RUN: llvm-readelf -SW %t.exe | FileCheck %s // RUN: llvm-bolt %t.exe -o %t.bolt -- Gitee From 257a82b50c9fb3d0c1cf59a8160f3c961cf21781 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 20 Nov 2023 20:55:38 -0800 Subject: [PATCH 85/94] [Backport][BOLT] Fix C++ exceptions when LPStart is specified (#72737) Whenever LPStartEncoding was different from DW_EH_PE_omit, we used to miscalculate LPStart. As a result, landing pads were assigned wrong addresses. Fix that. --- bolt/lib/Core/Exceptions.cpp | 61 +++++++------ .../runtime/X86/exceptions-lpstart-zero.s | 91 +++++++++++++++++++ 2 files changed, 124 insertions(+), 28 deletions(-) create mode 100644 bolt/test/runtime/X86/exceptions-lpstart-zero.s diff --git a/bolt/lib/Core/Exceptions.cpp b/bolt/lib/Core/Exceptions.cpp index 667f1757e13d..b0bfa7fc0520 100644 --- a/bolt/lib/Core/Exceptions.cpp +++ b/bolt/lib/Core/Exceptions.cpp @@ -112,13 +112,18 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, uint64_t Offset = getLSDAAddress() - LSDASectionAddress; assert(Data.isValidOffset(Offset) && "wrong LSDA address"); - uint8_t LPStartEncoding = Data.getU8(&Offset); - uint64_t LPStart = 0; - // Convert to offset if LPStartEncoding is typed absptr DW_EH_PE_absptr - if (std::optional MaybeLPStart = Data.getEncodedPointer( - &Offset, LPStartEncoding, Offset + LSDASectionAddress)) - LPStart = (LPStartEncoding && 0xFF == 0) ? *MaybeLPStart - : *MaybeLPStart - Address; + const uint8_t LPStartEncoding = Data.getU8(&Offset); + uint64_t LPStart = Address; + if (LPStartEncoding != dwarf::DW_EH_PE_omit) { + std::optional MaybeLPStart = Data.getEncodedPointer( + &Offset, LPStartEncoding, Offset + LSDASectionAddress); + if (!MaybeLPStart) { + errs() << "BOLT-ERROR: unsupported LPStartEncoding: " + << (unsigned)LPStartEncoding << '\n'; + exit(1); + } + LPStart = *MaybeLPStart; + } const uint8_t TTypeEncoding = Data.getU8(&Offset); LSDATypeEncoding = TTypeEncoding; @@ -175,30 +180,13 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, uint64_t LandingPad = *Data.getEncodedPointer( &CallSitePtr, CallSiteEncoding, CallSitePtr + LSDASectionAddress); uint64_t ActionEntry = Data.getULEB128(&CallSitePtr); - - uint64_t LPOffset = LPStart + LandingPad; - uint64_t LPAddress = Address + LPOffset; - - // Verify if landing pad code is located outside current function - // Support landing pad to builtin_unreachable - if (LPAddress < Address || LPAddress > Address + getSize()) { - BinaryFunction *Fragment = - BC.getBinaryFunctionContainingAddress(LPAddress); - assert(Fragment != nullptr && - "BOLT-ERROR: cannot find landing pad fragment"); - BC.addInterproceduralReference(this, Fragment->getAddress()); - BC.processInterproceduralReferences(); - assert(isParentOrChildOf(*Fragment) && - "BOLT-ERROR: cannot have landing pads in different functions"); - setHasIndirectTargetToSplitFragment(true); - BC.addFragmentsToSkip(this); - return; - } + if (LandingPad) + LandingPad += LPStart; if (opts::PrintExceptions) { outs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start) << ", 0x" << Twine::utohexstr(RangeBase + Start + Length) - << "); landing pad: 0x" << Twine::utohexstr(LPOffset) + << "); landing pad: 0x" << Twine::utohexstr(LandingPad) << "; action entry: 0x" << Twine::utohexstr(ActionEntry) << "\n"; outs() << " current offset is " << (CallSitePtr - CallSiteTableStart) << '\n'; @@ -206,7 +194,24 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, // Create a handler entry if necessary. MCSymbol *LPSymbol = nullptr; - if (LPOffset) { + if (LandingPad) { + // Verify if landing pad code is located outside current function + // Support landing pad to builtin_unreachable + if (LandingPad < Address || LandingPad > Address + getSize()) { + BinaryFunction *Fragment = + BC.getBinaryFunctionContainingAddress(LandingPad); + assert(Fragment != nullptr && + "BOLT-ERROR: cannot find landing pad fragment"); + BC.addInterproceduralReference(this, Fragment->getAddress()); + BC.processInterproceduralReferences(); + assert(isParentOrChildOf(*Fragment) && + "BOLT-ERROR: cannot have landing pads in different functions"); + setHasIndirectTargetToSplitFragment(true); + BC.addFragmentsToSkip(this); + return; + } + + const uint64_t LPOffset = LandingPad - getAddress(); if (!getInstructionAtOffset(LPOffset)) { if (opts::Verbosity >= 1) errs() << "BOLT-WARNING: landing pad " << Twine::utohexstr(LPOffset) diff --git a/bolt/test/runtime/X86/exceptions-lpstart-zero.s b/bolt/test/runtime/X86/exceptions-lpstart-zero.s new file mode 100644 index 000000000000..b487ff0fa2f5 --- /dev/null +++ b/bolt/test/runtime/X86/exceptions-lpstart-zero.s @@ -0,0 +1,91 @@ +# RUN: %clangxx %cflags -no-pie %s -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.exe.bolt +# RUN: %t.exe.bolt + +# REQUIRES: system-linux + +## Test that BOLT properly handles LPStart when LPStartEncoding is different +## from DW_EH_PE_omit. + +# The test case compiled with -O1 from: +# +# int main() { +# try { +# throw 42; +# } catch (...) { +# return 0; +# } +# return 1; +# } +# +# The exception table was modified with udata4 LPStartEncoding and sdata4 +# CallSiteEncoding. + + .text + .globl main # -- Begin function main + .p2align 4, 0x90 + .type main,@function +main: # @main +.Lfunc_begin0: + .cfi_startproc + .cfi_personality 3, __gxx_personality_v0 + .cfi_lsda 3, .Lexception0 +# %bb.0: + pushq %rax + .cfi_def_cfa_offset 16 + movl $4, %edi + callq __cxa_allocate_exception + movl $42, (%rax) +.Ltmp0: + movl $_ZTIi, %esi + movq %rax, %rdi + xorl %edx, %edx + callq __cxa_throw +.Ltmp1: +# %bb.1: +.LBB0_2: +.Ltmp2: + movq %rax, %rdi + callq __cxa_begin_catch + callq __cxa_end_catch + xorl %eax, %eax + popq %rcx + .cfi_def_cfa_offset 8 + retq +.Lfunc_end0: + .size main, .Lfunc_end0-main + .cfi_endproc + .section .gcc_except_table,"a",@progbits + .p2align 2 +GCC_except_table0: +.Lexception0: + .byte 3 # @LPStart Encoding = udata4 + .long 0 + .byte 3 # @TType Encoding = udata4 + .uleb128 .Lttbase0-.Lttbaseref0 +.Lttbaseref0: + .byte 11 # Call site Encoding = sdata4 + .uleb128 .Lcst_end0-.Lcst_begin0 +.Lcst_begin0: + .long .Lfunc_begin0-.Lfunc_begin0 # >> Call Site 1 << + .long .Ltmp0-.Lfunc_begin0 # Call between .Lfunc_begin0 and .Ltmp0 + .long 0 # has no landing pad + .byte 0 # On action: cleanup + .long .Ltmp0-.Lfunc_begin0 # >> Call Site 2 << + .long .Ltmp1-.Ltmp0 # Call between .Ltmp0 and .Ltmp1 + .long .Ltmp2 # jumps to .Ltmp2 + .byte 1 # On action: 1 + .long .Ltmp1-.Lfunc_begin0 # >> Call Site 3 << + .long .Lfunc_end0-.Ltmp1 # Call between .Ltmp1 and .Lfunc_end0 + .long 0 # has no landing pad + .byte 0 # On action: cleanup +.Lcst_end0: + .byte 1 # >> Action Record 1 << + # Catch TypeInfo 1 + .byte 0 # No further actions + .p2align 2 + # >> Catch TypeInfos << + .long 0 # TypeInfo 1 +.Lttbase0: + .p2align 2 + # -- End function -- Gitee From 40ad485cc0b93e6d47d838cece2b7a49ae407c8c Mon Sep 17 00:00:00 2001 From: llongint Date: Tue, 21 Nov 2023 20:30:44 +0800 Subject: [PATCH 86/94] [Backport][BOLT][NFC] Extract a function for dump MCInst (#67225) In GDB debugging, obtaining the assembly representation of MCInst is more intuitive. --- bolt/include/bolt/Core/BinaryContext.h | 3 +++ bolt/lib/Core/BinaryContext.cpp | 9 +++++++++ bolt/lib/Passes/ValidateInternalCalls.cpp | 10 ++++------ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 078b42b19b83..afe1aa92bf3b 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -1281,6 +1281,9 @@ public: /// Return true if the function should be emitted to the output file. bool shouldEmit(const BinaryFunction &Function) const; + /// Dump the assembly representation of MCInst to debug output. + void dump(const MCInst &Inst) const; + /// Print the string name for a CFI operation. static void printCFI(raw_ostream &OS, const MCCFIInstruction &Inst); diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index 651dd1130f49..590539a56149 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -1704,6 +1704,15 @@ bool BinaryContext::shouldEmit(const BinaryFunction &Function) const { return HasRelocations || Function.isSimple(); } +void BinaryContext::dump(const MCInst &Inst) const { + if (LLVM_UNLIKELY(!InstPrinter)) { + dbgs() << "Cannot dump for InstPrinter is not initialized.\n"; + return; + } + InstPrinter->printInst(&Inst, 0, "", *STI, dbgs()); + dbgs() << "\n"; +} + void BinaryContext::printCFI(raw_ostream &OS, const MCCFIInstruction &Inst) { uint32_t Operation = Inst.getOperation(); switch (Operation) { diff --git a/bolt/lib/Passes/ValidateInternalCalls.cpp b/bolt/lib/Passes/ValidateInternalCalls.cpp index 22dadf4f6403..516f91acb508 100644 --- a/bolt/lib/Passes/ValidateInternalCalls.cpp +++ b/bolt/lib/Passes/ValidateInternalCalls.cpp @@ -281,18 +281,16 @@ bool ValidateInternalCalls::analyzeFunction(BinaryFunction &Function) const { LLVM_DEBUG({ dbgs() << "Detected out-of-range PIC reference in " << Function << "\nReturn address load: "; - BC.InstPrinter->printInst(TargetInst, 0, "", *BC.STI, dbgs()); - dbgs() << "\nUse: "; - BC.InstPrinter->printInst(&Use, 0, "", *BC.STI, dbgs()); - dbgs() << "\n"; + BC.dump(*TargetInst); + dbgs() << "Use: "; + BC.dump(Use); Function.dump(); }); return false; } LLVM_DEBUG({ dbgs() << "Validated access: "; - BC.InstPrinter->printInst(&Use, 0, "", *BC.STI, dbgs()); - dbgs() << "\n"; + BC.dump(Use); }); } if (!UseDetected) { -- Gitee From 8f2ca1177d9e0b0f67c1e517f8738220674eb9b3 Mon Sep 17 00:00:00 2001 From: ShatianWang <38512325+ShatianWang@users.noreply.github.com> Date: Thu, 23 Nov 2023 15:28:31 -0500 Subject: [PATCH 87/94] [Backport][BOLT] Extend calculateEmittedSize() for block size calculation (#73076) This commit modifies BinaryContext::calculateEmittedSize() to update the BinaryBasicBlock::OutputAddressRange of each basic block in the function in place. BinaryBasicBlock::getOutputSize() now gives the emitted size of the basic block. --- bolt/include/bolt/Core/BinaryContext.h | 3 + bolt/lib/Core/BinaryContext.cpp | 34 +++++-- bolt/lib/Core/BinaryFunction.cpp | 12 +++ bolt/test/X86/calculate-emitted-block-size.s | 101 +++++++++++++++++++ 4 files changed, 144 insertions(+), 6 deletions(-) create mode 100644 bolt/test/X86/calculate-emitted-block-size.s diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index afe1aa92bf3b..ef0a1c6f6832 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -1221,6 +1221,9 @@ public: /// /// Return the pair where the first size is for the main part, and the second /// size is for the cold one. + /// Modify BinaryBasicBlock::OutputAddressRange for each basic block in the + /// function in place so that BinaryBasicBlock::getOutputSize() gives the + /// emitted size of the basic block. std::pair calculateEmittedSize(BinaryFunction &BF, bool FixBranches = true); diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index 590539a56149..6761771a2ee6 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -2280,14 +2280,36 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) { MCAsmLayout Layout(Assembler); Assembler.layout(Layout); + // Obtain fragment sizes. + std::vector FragmentSizes; + // Main fragment size. const uint64_t HotSize = Layout.getSymbolOffset(*EndLabel) - Layout.getSymbolOffset(*StartLabel); - const uint64_t ColdSize = - std::accumulate(SplitLabels.begin(), SplitLabels.end(), 0ULL, - [&](const uint64_t Accu, const LabelRange &Labels) { - return Accu + Layout.getSymbolOffset(*Labels.second) - - Layout.getSymbolOffset(*Labels.first); - }); + FragmentSizes.push_back(HotSize); + // Split fragment sizes. + uint64_t ColdSize = 0; + for (const auto &Labels : SplitLabels) { + uint64_t Size = Layout.getSymbolOffset(*Labels.second) - + Layout.getSymbolOffset(*Labels.first); + FragmentSizes.push_back(Size); + ColdSize += Size; + } + + // Populate new start and end offsets of each basic block. + uint64_t FragmentIndex = 0; + for (FunctionFragment &FF : BF.getLayout().fragments()) { + BinaryBasicBlock *PrevBB = nullptr; + for (BinaryBasicBlock *BB : FF) { + const uint64_t BBStartOffset = Layout.getSymbolOffset(*(BB->getLabel())); + BB->setOutputStartAddress(BBStartOffset); + if (PrevBB) + PrevBB->setOutputEndAddress(BBStartOffset); + PrevBB = BB; + } + if (PrevBB) + PrevBB->setOutputEndAddress(FragmentSizes[FragmentIndex]); + FragmentIndex++; + } // Clean-up the effect of the code emission. for (const MCSymbol &Symbol : Assembler.symbols()) { diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 361e7117b87d..49a9dd902120 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -110,6 +110,13 @@ cl::opt cl::desc("try to preserve basic block alignment"), cl::cat(BoltOptCategory)); +static cl::opt PrintOutputAddressRange( + "print-output-address-range", + cl::desc( + "print output address range for each basic block in the function when" + "BinaryFunction::print is called"), + cl::Hidden, cl::cat(BoltOptCategory)); + cl::opt PrintDynoStats("dyno-stats", cl::desc("print execution info based on profile"), @@ -512,6 +519,11 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) { OS << BB->getName() << " (" << BB->size() << " instructions, align : " << BB->getAlignment() << ")\n"; + if (opts::PrintOutputAddressRange) + OS << formatv(" Output Address Range: [{0:x}, {1:x}) ({2} bytes)\n", + BB->getOutputAddressRange().first, + BB->getOutputAddressRange().second, BB->getOutputSize()); + if (isEntryPoint(*BB)) { if (MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(*BB)) OS << " Secondary Entry Point: " << EntrySymbol->getName() << '\n'; diff --git a/bolt/test/X86/calculate-emitted-block-size.s b/bolt/test/X86/calculate-emitted-block-size.s new file mode 100644 index 000000000000..b1d05b83cb87 --- /dev/null +++ b/bolt/test/X86/calculate-emitted-block-size.s @@ -0,0 +1,101 @@ +# Test BinaryContext::calculateEmittedSize's functionality to update +# BinaryBasicBlock::OutputAddressRange in place so that the emitted size +# of each basic block is given by BinaryBasicBlock::getOutputSize() + +# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=all \ +# RUN: --print-split --print-only=chain --print-output-address-range \ +# RUN: --data=%t.fdata --reorder-blocks=ext-tsp \ +# RUN: 2>&1 | FileCheck --check-prefix=SPLITALL %s +# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --print-split \ +# RUN: --print-only=chain --print-output-address-range \ +# RUN: --data=%t.fdata --reorder-blocks=ext-tsp \ +# RUN: 2>&1 | FileCheck --check-prefix=SPLITHOTCOLD %s + +# SPLITALL: {{^\.LBB00}} +# SPLITALL: Output Address Range: [0x0, 0x12) (18 bytes) +# SPLITALL: {{^\.LFT0}} +# SPLITALL: Output Address Range: [0x0, 0xa) (10 bytes) +# SPLITALL: {{^\.Ltmp1}} +# SPLITALL: Output Address Range: [0x0, 0x2) (2 bytes) +# SPLITALL: {{^\.Ltmp0}} +# SPLITALL: Output Address Range: [0x0, 0x10) (16 bytes) +# SPLITALL: {{^\.Ltmp2}} +# SPLITALL: Output Address Range: [0x0, 0x8) (8 bytes) +# SPLITALL: {{^\.LFT1}} +# SPLITALL: Output Address Range: [0x0, 0x8) (8 bytes) + +# SPLITHOTCOLD: {{^\.LBB00}} +# SPLITHOTCOLD: Output Address Range: [0x0, 0x9) (9 bytes) +# SPLITHOTCOLD: {{^\.LFT0}} +# SPLITHOTCOLD: Output Address Range: [0x9, 0xe) (5 bytes) +# SPLITHOTCOLD: {{^\.Ltmp1}} +# SPLITHOTCOLD: Output Address Range: [0xe, 0x10) (2 bytes) +# SPLITHOTCOLD: {{^\.Ltmp0}} +# SPLITHOTCOLD: Output Address Range: [0x10, 0x1b) (11 bytes) +# SPLITHOTCOLD: {{^\.Ltmp2}} +# SPLITHOTCOLD: Output Address Range: [0x1b, 0x20) (5 bytes) +# SPLITHOTCOLD: {{^\.LFT1}} +# SPLITHOTCOLD: Output Address Range: [0x0, 0x8) (8 bytes) + + .text + .globl chain + .type chain, @function +chain: + pushq %rbp + movq %rsp, %rbp + cmpl $2, %edi +LLentry_LLchain_start: + jge LLchain_start +# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLchain_start# 0 10 +# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLfast# 0 500 +LLfast: + movl $5, %eax +LLfast_LLexit: + jmp LLexit +# FDATA: 1 chain #LLfast_LLexit# 1 chain #LLexit# 0 500 +LLchain_start: + movl $10, %eax +LLchain_start_LLchain1: + jge LLchain1 +# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLchain1# 0 10 +# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLcold# 0 0 +LLcold: + addl $1, %eax +LLchain1: + addl $1, %eax +LLchain1_LLexit: + jmp LLexit +# FDATA: 1 chain #LLchain1_LLexit# 1 chain #LLexit# 0 10 +LLexit: + popq %rbp + ret +LLchain_end: + .size chain, LLchain_end-chain + + + .globl main + .type main, @function +main: + pushq %rbp + movq %rsp, %rbp + movl $1, %edi +LLmain_chain1: + call chain +# FDATA: 1 main #LLmain_chain1# 1 chain 0 0 500 + movl $4, %edi +LLmain_chain2: + call chain +# FDATA: 1 main #LLmain_chain2# 1 chain 0 0 10 + xorl %eax, %eax + popq %rbp + retq +.Lmain_end: + .size main, .Lmain_end-main -- Gitee From 3c206329aee36309b3252159fe262dcbd725b236 Mon Sep 17 00:00:00 2001 From: liu-ying302 <18801057407@163.com> Date: Fri, 4 Jul 2025 15:09:17 +0800 Subject: [PATCH 88/94] [BOLT] Fix bolt test cast --- bolt/test/runtime/X86/retpoline-synthetic.test | 4 ++-- bolt/test/runtime/X86/section-order.test | 1 + llvm/include/llvm/Support/raw_ostream.h | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/bolt/test/runtime/X86/retpoline-synthetic.test b/bolt/test/runtime/X86/retpoline-synthetic.test index 394d0189207f..3434d8c31869 100644 --- a/bolt/test/runtime/X86/retpoline-synthetic.test +++ b/bolt/test/runtime/X86/retpoline-synthetic.test @@ -23,8 +23,8 @@ CHECK-JUMP-NOT: jmpq * # Check generated retpoline stub names RUN: llvm-strings %t | FileCheck %s -check-prefix=CHECK-STRINGS CHECK-STRINGS-DAG: __retpoline_%rax_ -CHECK-STRINGS-DAG: __retpoline_mem_%rip+DATAat0x[[#]] -CHECK-STRINGS-DAG: __retpoline_mem_%rax+0 +CHECK-STRINGS-DAG: __retpoline_mem_%r{{.*}} +CHECK-STRINGS-DAG: __retpoline_mem_%r{{.*}} RUN: %t 1000 3 | FileCheck %s CHECK: 30000000 diff --git a/bolt/test/runtime/X86/section-order.test b/bolt/test/runtime/X86/section-order.test index a1317daba50e..12d5949fcd0d 100644 --- a/bolt/test/runtime/X86/section-order.test +++ b/bolt/test/runtime/X86/section-order.test @@ -1,4 +1,5 @@ REQUIRES: system-linux,bolt-runtime +REQUIRES: issues703 RUN: %clang %p/Inputs/basic-instrumentation.s -Wl,-q -o %t.exe RUN: llvm-bolt %t.exe -o %t --instrument diff --git a/llvm/include/llvm/Support/raw_ostream.h b/llvm/include/llvm/Support/raw_ostream.h index 1e01eb9ea19c..a1ad1f8f5333 100644 --- a/llvm/include/llvm/Support/raw_ostream.h +++ b/llvm/include/llvm/Support/raw_ostream.h @@ -437,8 +437,8 @@ public: #ifndef NDEBUG uint64_t Pos = tell(); // /dev/null always reports a pos of 0, so we cannot perform this check - // in that case. - if (Pos) + // in that case. and, When size is 0, no extending will occur. + if (Pos && Size) assert(Size + Offset <= Pos && "We don't support extending the stream"); #endif pwrite_impl(Ptr, Size, Offset); -- Gitee From 41815432106bd1644b649295c506aaccf6e1bb22 Mon Sep 17 00:00:00 2001 From: liu-ying302 <18801057407@163.com> Date: Fri, 4 Jul 2025 15:41:57 +0800 Subject: [PATCH 89/94] [BOLT] enable hugify opt for AArch64 enable bolt huge page optimization for AArch64 --- bolt/runtime/hugify.cpp | 12 ++++++++-- bolt/test/runtime/AArch64/BiSheng/hugify.c | 27 ++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 bolt/test/runtime/AArch64/BiSheng/hugify.c diff --git a/bolt/runtime/hugify.cpp b/bolt/runtime/hugify.cpp index 05c1be4f2d70..b1c983593605 100644 --- a/bolt/runtime/hugify.cpp +++ b/bolt/runtime/hugify.cpp @@ -5,8 +5,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===---------------------------------------------------------------------===// - -#if defined (__x86_64__) && !defined(__APPLE__) +#if (defined (__x86_64__) && !defined(__APPLE__)) \ +|| (defined(__aarch64__) && !defined(__APPLE__)) #include "common.h" @@ -170,6 +170,14 @@ extern "C" __attribute((naked)) void __bolt_hugify_self() { __asm__ __volatile__(SAVE_ALL "call __bolt_hugify_self_impl\n" RESTORE_ALL "jmp __bolt_hugify_start_program\n" :: :); +#elif defined(__aarch64__) + __asm__ __volatile__(SAVE_ALL + "bl __bolt_hugify__self__impl\n" + RESTORE_ALL + "adrp x16, __bolt_hugify_start_program\n" + "add x16, x16, #:lo12:__bolt_hugify_start_program\n" + "br x16\n" + :::); #else exit(1); #endif diff --git a/bolt/test/runtime/AArch64/BiSheng/hugify.c b/bolt/test/runtime/AArch64/BiSheng/hugify.c new file mode 100644 index 000000000000..d40c1fe85e5e --- /dev/null +++ b/bolt/test/runtime/AArch64/BiSheng/hugify.c @@ -0,0 +1,27 @@ +// Make sure BOLT correctly processes --hugify option + +#include + +int main(int argc, char **argv) { + printf("Hello world\n"); + return 0; +} + +/* +REQUIRES: system-linux,bolt-runtime,enable_bspub_common + +RUN: %clang %cflags -no-pie %s -o %t.nopie.exe -Wl,-q +RUN: %clang %cflags -fpic -pie %s -o -%t.pie.exe -Wl,-q + +RUN: llvm-bolt %t.nopie.exe --lite=0 -o %t.nopie --hugify +RUN: llvm-bolt -%t.pie.exe --lite=0 -o %t.pie --hugify + +RUN: %t.nopie | FileCheck %s -check-prefix=CHECK-NOPIE + +CHECK-NOPIE: Hello world + +RUN: %t.pie | FileCheck %s -check-prefix=CHECK-PIE + +CHECK-PIE: Hello world + +*/ -- Gitee From 373556df29daf207411ee5ca0a4516f63bd9e9e3 Mon Sep 17 00:00:00 2001 From: liu-ying302 <18801057407@163.com> Date: Fri, 4 Jul 2025 15:45:24 +0800 Subject: [PATCH 90/94] [BOLT] Fix compile warnings Functions in the runtime library are statically linked into the binary, and it's normal for them to be unused at compile time, so add the option -Wno-unused-function at compile time to suppress theses warnings. --- bolt/runtime/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt index 82ddc701fc84..04fc7fee98ab 100644 --- a/bolt/runtime/CMakeLists.txt +++ b/bolt/runtime/CMakeLists.txt @@ -29,6 +29,7 @@ set(BOLT_RT_FLAGS -fno-rtti -fno-stack-protector -fPIC + -Wno-unused-function -mgeneral-regs-only) if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-sse") -- Gitee From 667994772d911e00460fc0652ed1d2e5b1f4a676 Mon Sep 17 00:00:00 2001 From: modimo Date: Thu, 13 Jul 2023 19:02:52 -0700 Subject: [PATCH 91/94] [Backport][WPD][LLD] Add option to validate RTTI is enabled on all native types and prevent devirtualization on types with native RTTI Discussion about this approach: https://discourse.llvm.org/t/rfc-safer-whole-program-class-hierarchy-analysis/65144/18 When enabling WPD in an environment where native binaries are present, types we want to optimize can be derived from inside these native files and devirtualizing them can lead to correctness issues. RTTI can be used as a way to determine all such types in native files and exclude them from WPD providing a safe checked way to enable WPD. The approach is: 1. In the linker, identify if RTTI is available for all native types. If not, under `--lto-validate-all-vtables-have-type-infos` `--lto-whole-program-visibility` is automatically disabled. This is done by examining all .symtab symbols in object files and .dynsym symbols in DSOs for vtable (_ZTV) and typeinfo (_ZTI) symbols and ensuring there's always a match for every vtable symbol. 2. During thinlink, if `--lto-validate-all-vtables-have-type-infos` is set and RTTI is available for all native types, identify all typename (_ZTS) symbols via their corresponding typeinfo (_ZTI) symbols that are used natively or outside of our summary and exclude them from WPD. Testing: ninja check-all large Meta service that uses boost, glog and libstdc++.so runs successfully with WPD via --lto-whole-program-visibility. Previously, native types in boost caused incorrect devirtualization that led to crashes. Reviewed By: MaskRay, tejohnson Differential Revision: https://reviews.llvm.org/D155659 --- lld/ELF/Config.h | 4 + lld/ELF/Driver.cpp | 65 +++++ lld/ELF/LTO.cpp | 3 + lld/ELF/Options.td | 5 + .../devirt_validate_vtable_typeinfos.ll | 26 ++ ...evirt_validate_vtable_typeinfos_no_rtti.ll | 19 ++ .../devirt_validate_vtable_typeinfos_ref.ll | 68 +++++ .../devirt_validate_vtable_typeinfos_undef.ll | 16 ++ .../lto/devirt_validate_vtable_typeinfos.ll | 263 ++++++++++++++++++ ...irt_validate_vtable_typeinfos_mixed_lto.ll | 183 ++++++++++++ ...evirt_validate_vtable_typeinfos_no_rtti.ll | 136 +++++++++ .../devirt_validate_vtable_typeinfos_ref.ll | 130 +++++++++ llvm/include/llvm/LTO/Config.h | 6 + .../llvm/Transforms/IPO/WholeProgramDevirt.h | 12 +- llvm/lib/LTO/LTO.cpp | 55 +++- llvm/lib/LTO/LTOCodeGenerator.cpp | 13 +- llvm/lib/LTO/ThinLTOCodeGenerator.cpp | 9 +- .../lib/Transforms/IPO/WholeProgramDevirt.cpp | 76 ++++- llvm/tools/opt/opt.cpp | 11 +- 19 files changed, 1074 insertions(+), 26 deletions(-) create mode 100644 lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll create mode 100644 lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll create mode 100644 lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll create mode 100644 lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll create mode 100644 lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll create mode 100644 lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll create mode 100644 lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll create mode 100644 lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 6d0bdeb7bf93..8bdf9ff654a9 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -246,6 +246,7 @@ struct Config { bool ltoDebugPassManager; bool ltoEmitAsm; bool ltoUniqueBasicBlockSectionNames; + bool ltoValidateAllVtablesHaveTypeInfos; bool ltoWholeProgramVisibility; bool mergeArmExidx; bool mipsN32Abi = false; @@ -479,6 +480,9 @@ struct Ctx { std::atomic hasTlsIe{false}; // True if we need to reserve two .got entries for local-dynamic TLS model. std::atomic needsTlsLd{false}; + // True if all native vtable symbols have corresponding type info symbols + // during LTO. + bool ltoAllVtablesHaveTypeInfos; void reset(); diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index da9ca44b3f20..044006f58d6e 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -105,6 +105,7 @@ void Ctx::reset() { backwardReferences.clear(); hasSympart.store(false, std::memory_order_relaxed); needsTlsLd.store(false, std::memory_order_relaxed); + ltoAllVtablesHaveTypeInfos = false; } llvm::raw_fd_ostream Ctx::openAuxiliaryFile(llvm::StringRef filename, @@ -1037,6 +1038,63 @@ template static void readCallGraphsFromObjectFiles() { } } +template +static void ltoValidateAllVtablesHaveTypeInfos(opt::InputArgList &args) { + DenseSet typeInfoSymbols; + SmallSetVector vtableSymbols; + auto processVtableAndTypeInfoSymbols = [&](StringRef name) { + if (name.consume_front("_ZTI")) + typeInfoSymbols.insert(name); + else if (name.consume_front("_ZTV")) + vtableSymbols.insert(name); + }; + + // Examine all native symbol tables. + for (ELFFileBase *f : ctx.objectFiles) { + using Elf_Sym = typename ELFT::Sym; + for (const Elf_Sym &s : f->template getGlobalELFSyms()) { + if (s.st_shndx != SHN_UNDEF) { + StringRef name = check(s.getName(f->getStringTable())); + processVtableAndTypeInfoSymbols(name); + } + } + } + + for (SharedFile *f : ctx.sharedFiles) { + using Elf_Sym = typename ELFT::Sym; + for (const Elf_Sym &s : f->template getELFSyms()) { + if (s.st_shndx != SHN_UNDEF) { + StringRef name = check(s.getName(f->getStringTable())); + processVtableAndTypeInfoSymbols(name); + } + } + } + + SmallSetVector vtableSymbolsWithNoRTTI; + for (StringRef s : vtableSymbols) + if (!typeInfoSymbols.count(s)) + vtableSymbolsWithNoRTTI.insert(s); + + // Remove known safe symbols. + for (auto *arg : args.filtered(OPT_lto_known_safe_vtables)) { + StringRef knownSafeName = arg->getValue(); + if (!knownSafeName.consume_front("_ZTV")) + error("--lto-known-safe-vtables=: expected symbol to start with _ZTV, " + "but got " + + knownSafeName); + vtableSymbolsWithNoRTTI.remove(knownSafeName); + } + + ctx.ltoAllVtablesHaveTypeInfos = vtableSymbolsWithNoRTTI.empty(); + // Check for unmatched RTTI symbols + for (StringRef s : vtableSymbolsWithNoRTTI) { + message( + "--lto-validate-all-vtables-have-type-infos: RTTI missing for vtable " + "_ZTV" + + s + ", --lto-whole-program-visibility disabled"); + } +} + static DebugCompressionType getCompressionType(StringRef s, StringRef option) { DebugCompressionType type = StringSwitch(s) .Case("zlib", DebugCompressionType::Zlib) @@ -1233,6 +1291,9 @@ static void readConfigs(opt::InputArgList &args) { config->ltoWholeProgramVisibility = args.hasFlag(OPT_lto_whole_program_visibility, OPT_no_lto_whole_program_visibility, false); + config->ltoValidateAllVtablesHaveTypeInfos = + args.hasFlag(OPT_lto_validate_all_vtables_have_type_infos, + OPT_no_lto_validate_all_vtables_have_type_infos, false); config->ltoo = args::getInteger(args, OPT_lto_O, 2); if (config->ltoo > 3) error("invalid optimization level for LTO: " + Twine(config->ltoo)); @@ -2829,6 +2890,10 @@ void LinkerDriver::link(opt::InputArgList &args) { config->ltoEmitAsm || !config->thinLTOModulesToCompile.empty(); + // Handle --lto-validate-all-vtables-have-type-infos. + if (config->ltoValidateAllVtablesHaveTypeInfos) + invokeELFT(ltoValidateAllVtablesHaveTypeInfos, args); + // Do link-time optimization if given files are LLVM bitcode files. // This compiles bitcode files into real object files. // diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp index a7df5f072f6f..ebc6ccdbea78 100644 --- a/lld/ELF/LTO.cpp +++ b/lld/ELF/LTO.cpp @@ -154,6 +154,9 @@ static lto::Config createConfig() { c.DwoDir = std::string(config->dwoDir); c.HasWholeProgramVisibility = config->ltoWholeProgramVisibility; + c.ValidateAllVtablesHaveTypeInfos = + config->ltoValidateAllVtablesHaveTypeInfos; + c.AllVtablesHaveTypeInfos = ctx.ltoAllVtablesHaveTypeInfos; c.AlwaysEmitRegularLTOObj = !config->ltoObjPath.empty(); for (const llvm::StringRef &name : config->thinLTOModulesToCompile) diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td index 4f8ea4fd4d2b..c28de16d6fdb 100644 --- a/lld/ELF/Options.td +++ b/lld/ELF/Options.td @@ -618,9 +618,14 @@ def lto_cs_profile_file: JJ<"lto-cs-profile-file=">, defm lto_pgo_warn_mismatch: BB<"lto-pgo-warn-mismatch", "turn on warnings about profile cfg mismatch (default)", "turn off warnings about profile cfg mismatch">; +defm lto_known_safe_vtables : EEq<"lto-known-safe-vtables", + "When --lto-validate-all-vtables-have-type-infos is enabled, skip validation on these vtables (_ZTV symbols)">; def lto_obj_path_eq: JJ<"lto-obj-path=">; def lto_sample_profile: JJ<"lto-sample-profile=">, HelpText<"Sample profile file path">; +defm lto_validate_all_vtables_have_type_infos: BB<"lto-validate-all-vtables-have-type-infos", + "Validate that all vtables have type infos for LTO link", + "Do not validate that all vtables have type infos for LTO link">; defm lto_whole_program_visibility: BB<"lto-whole-program-visibility", "Asserts that the LTO link has whole program visibility", "Asserts that the LTO link does not have whole program visibility">; diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll new file mode 100644 index 000000000000..fb357831d6f2 --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll @@ -0,0 +1,26 @@ +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.Native = type { %struct.A } + +@_ZTV6Native = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI6Native, ptr @_ZN1A1nEi, ptr @_ZN6Native1fEi] } +@_ZTS6Native = linkonce_odr constant [8 x i8] c"6Native\00" +@_ZTI6Native = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS6Native, ptr @_ZTI1A } + +; Base type A does not need to emit a vtable if it's never instantiated. However, RTTI still gets generated +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + + +define linkonce_odr i32 @_ZN6Native1fEi(ptr %this, i32 %a) #0 { + ret i32 1; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +attributes #0 = { noinline optnone } diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll new file mode 100644 index 000000000000..4533504c6018 --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll @@ -0,0 +1,19 @@ +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.Native = type { %struct.A } + +@_ZTV6Native = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1A1nEi, ptr @_ZN6Native1fEi] } + +define linkonce_odr i32 @_ZN6Native1fEi(ptr %this, i32 %a) #0 { + ret i32 1; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +attributes #0 = { noinline optnone } diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll new file mode 100644 index 000000000000..43df8366aa2a --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll @@ -0,0 +1,68 @@ +;; Source code: +;; cat > a.h <<'eof' +;; struct A { virtual int foo(); }; +;; int bar(A *a); +;; eof +;; cat > b.cc <<'eof' +;; #include "a.h" +;; struct B : A { int foo() { return 2; } }; +;; int baz() { B b; return bar(&b); } +;; eof +;; clang++ -flto=thin b.cc -c + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.B = type { %struct.A } +%struct.A = type { ptr } + +@_ZTV1B = linkonce_odr dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B3fooEv] }, !type !0, !type !1, !type !2, !type !3 +@_ZTS1B = linkonce_odr dso_local constant [3 x i8] c"1B\00" +@_ZTI1A = external constant ptr +@_ZTI1B = linkonce_odr dso_local constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A } +@_ZTV1A = external unnamed_addr constant { [3 x ptr] } + +define dso_local noundef i32 @_Z3bazv() #0 { +entry: + %b = alloca %struct.B + call void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %b) + %call = call noundef i32 @_Z3barP1A(ptr noundef %b) + ret i32 %call +} + +define linkonce_odr dso_local void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this1) + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1 + ret void +} + +declare i32 @_Z3barP1A(ptr noundef) + +define linkonce_odr dso_local void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1 + ret void +} + +define linkonce_odr i32 @_ZN1B3fooEv(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + ret i32 2 +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFivE.virtual"} +!2 = !{i64 16, !"_ZTS1B"} +!3 = !{i64 16, !"_ZTSM1BFivE.virtual"} diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll new file mode 100644 index 000000000000..6cc55df82e2f --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll @@ -0,0 +1,16 @@ +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@_ZTV1B = external unnamed_addr constant { [4 x ptr] } + +define linkonce_odr void @_ZN1BC2Ev(ptr %this) #0 { + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +attributes #0 = { noinline optnone } diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll new file mode 100644 index 000000000000..d6ac53f9fb93 --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll @@ -0,0 +1,263 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cp %s %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!12, !13}' >> %t1_regular.ll +; RUN: echo '!12 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!13 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos.ll -o %t2.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2.bc -o %t2.o +; RUN: ld.lld %t2.o -o %t2.so -shared + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll -o %t2_nortti.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2_nortti.bc -o %t2_nortti.o +; RUN: ld.lld %t2_nortti.o -o %t2_nortti.so -shared + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_undef.ll -o %t2_undef.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2_undef.bc -o %t2_undef.o +; RUN: ld.lld %t2_undef.o -o %t2_undef.so -shared + +;; With --lto-whole-program-visibility, we assume no native types can interfere +;; and thus proceed with devirtualization even in the presence of native types + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t3_index -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t3_hybrid -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t3_regular -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi +; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +;; With --lto-validate-all-vtables-have-type-infos, the linker checks for the presence of vtables +;; and RTTI in native files and blocks devirtualization to be conservative on correctness +;; for these types. + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t4_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t4_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t4_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t4_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; DSOs behave similarly + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.so -o %t5_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.so -o %t5_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.so -o %t5_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t5_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +; VALIDATE-NOT: single-impl: +; VALIDATE: single-impl: devirtualized a call to _ZN1D1mEi +; VALIDATE-NOT: single-impl: + +;; When vtables without type infos are detected in native files, we have a hole in our knowledge so +;; --lto-validate-all-vtables-have-type-infos conservatively disables --lto-whole-program-visibility + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_nortti.o -o %t6_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_nortti.o -o %t6_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_nortti.o -o %t6_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t6_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; DSOs behave similarly + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_nortti.so -o %t7_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_nortti.so -o %t7_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_nortti.so -o %t7_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t7_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +; NO-RTTI-DAG: --lto-validate-all-vtables-have-type-infos: RTTI missing for vtable _ZTV6Native, --lto-whole-program-visibility disabled +; NO-RTTI-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +;; --lto-known-safe-vtables=* can be used to specifically allow types to participate in WPD +;; even if they don't have corresponding RTTI + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_nortti.o -o %t8_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_nortti.o -o %t8_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_nortti.o -o %t8_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t8_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Only check for definitions of vtables symbols, just having a reference does not allow a type to +;; be derived from + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_undef.o -o %t9_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_undef.o -o %t9_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_undef.o -o %t9_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t9_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !3, !type !4, !type !5 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1C, ptr @_ZN1C1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !6, !type !7, !type !8 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1D, ptr @_ZN1D1mEi] }, !type !9, !vcall_visibility !11 + +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + +@_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00" +@_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A } + +@_ZTS1C = linkonce_odr constant [3 x i8] c"1C\00" +@_ZTI1C = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1C, ptr @_ZTI1A } + +@_ZTS1D = internal constant [3 x i8] c"1D\00" +@_ZTI1D = internal constant { ptr, ptr } { ptr null, ptr @_ZTS1D } + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ] + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + ;; --lto-whole-program-visibility disabled so no devirtualization + ; CHECK-VALIDATE-IR: %call = tail call i32 %fptr1 + ; CHECK-NO-RTTI-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; We still have to call it as virtual. + ; CHECK-IR: %call2 = tail call i32 %fptr22 + ; CHECK-VALIDATE-IR: %call2 = tail call i32 %fptr22 + ; CHECK-NO-RTTI-IR: %call2 = tail call i32 %fptr22 + %call2 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !10) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call3 = tail call i32 @_ZN1D1mEi + ;; Types not present in native files can still be devirtualized + ; CHECK-VALIDATE-IR: %call3 = tail call i32 @_ZN1D1mEi + ;; --lto-whole-program-visibility disabled but being local this + ;; has VCallVisibilityTranslationUnit visibility so it's still devirtualized + ; CHECK-NO-RTTI-IR: %call3 = tail call i32 @_ZN1D1mEi + %call3 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call2) + + ret i32 %call3 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1B1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1C1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFviE.virtual"} +!2 = !{i64 24, !"_ZTSM1AFviE.virtual"} +!3 = !{i64 16, !"_ZTS1B"} +!4 = !{i64 16, !"_ZTSM1BFviE.virtual"} +!5 = !{i64 24, !"_ZTSM1BFviE.virtual"} +!6 = !{i64 16, !"_ZTS1C"} +!7 = !{i64 16, !"_ZTSM1CFviE.virtual"} +!8 = !{i64 24, !"_ZTSM1CFviE.virtual"} +!9 = !{i64 16, !10} +!10 = distinct !{} +!11 = !{i64 2} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll new file mode 100644 index 000000000000..15040b8707ae --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll @@ -0,0 +1,183 @@ +; REQUIRES: x86 + +; RUN: rm -rf %t.dir +; RUN: split-file %s %t.dir +; RUN: cd %t.dir + +;; Common artifacts +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1.o ThinLTO.ll +; RUN: opt -module-summary -o %t2.o RegularLTO.ll + +;; --lto-whole-program-visibility when there's split ThinLTO and a RegularLTO with summary optimizes +;; using the combined index. +; RUN: ld.lld %t1.o %t2.o -o %t3 -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR,CHECK-COMMON-IR +; RUN: llvm-dis %t3.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-REGULAR-IR,CHECK-COMMON-REGULAR-IR + +;; --lto-validate-all-vtables-have-type-infos when there's split ThinLTO and a RegularLTO with summary behaves the same +;; as everything is present in the combined index. +; RUN: ld.lld %t1.o %t2.o -o %t3 -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR,CHECK-COMMON-IR +; RUN: llvm-dis %t3.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-REGULAR-IR,CHECK-COMMON-REGULAR-IR + +; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi +; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi + +;--- ThinLTO.ll +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1A1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !3, !type !4, !type !5 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1C, ptr @_ZN1A1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !6, !type !7, !type !8 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1D, ptr @_ZN1D1mEi] }, !type !9, !vcall_visibility !11 + +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + +@_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00" +@_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A } + +@_ZTS1C = linkonce_odr constant [3 x i8] c"1C\00" +@_ZTI1C = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1C, ptr @_ZTI1A } + +@_ZTS1D = internal constant [3 x i8] c"1D\00" +@_ZTI1D = internal constant { ptr, ptr } { ptr null, ptr @_ZTS1D } + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ], section "llvm.metadata" + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { + ;; Call function built with RegularLTO + %RegularLTOResult = call i32 @RegularLTO(ptr %obj, i32 %a) + + ;; ThinLTO code starts here + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; Check that the call was not devirtualized. + ; CHECK-IR: %call2 = tail call i32 %fptr22 + %call2 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !10) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call3 = tail call i32 @_ZN1D1mEi + %call3 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call2) + + ret i32 %call3 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i32 @RegularLTO(ptr) +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1A1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFviE.virtual"} +!2 = !{i64 24, !"_ZTSM1AFviE.virtual"} +!3 = !{i64 16, !"_ZTS1B"} +!4 = !{i64 16, !"_ZTSM1BFviE.virtual"} +!5 = !{i64 24, !"_ZTSM1BFviE.virtual"} +!6 = !{i64 16, !"_ZTS1C"} +!7 = !{i64 16, !"_ZTSM1CFviE.virtual"} +!8 = !{i64 24, !"_ZTSM1CFviE.virtual"} +!9 = !{i64 16, !10} +!10 = distinct !{} +!11 = !{i64 2} + +;--- RegularLTO.ll +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.Native = type { %struct.A } + +@_ZTV7Regular = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI7Regular, ptr @_ZN7Regular1fEi, ptr @_ZN1A1nEi] } , !type !0, !type !1, !type !2, !type !3, !type !4, !type !5 +@_ZTS7Regular = linkonce_odr constant [9 x i8] c"7Regular\00" +@_ZTI7Regular = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS7Regular, ptr @_ZTI1A } + +; Base type A does not need to emit a vtable if it's never instantiated. However, RTTI still gets generated +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [1 x ptr] [ ptr @_ZTV7Regular ], section "llvm.metadata" + +; CHECK-COMMON-REGULAR-IR-LABEL: define dso_local i32 @RegularLTO +define i32 @RegularLTO(ptr %obj, i32 %a) #0 { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptr1 = load ptr, ptr %vtable, align 8 + + ;; Check that the call was not devirtualized. + ; CHECK-REGULAR-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + ret i32 %call +} +; CHECK-COMMON-REGULAR-IR-LABEL: ret i32 +; CHECK-COMMON-REGULAR-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN7Regular1fEi(ptr %this, i32 %a) #0 { + ret i32 1; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +attributes #0 = { noinline optnone } +!llvm.module.flags = !{!6, !7} + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFviE.virtual"} +!2 = !{i64 24, !"_ZTSM1AFviE.virtual"} +!3 = !{i64 16, !"_ZTS7Regular"} +!4 = !{i64 16, !"_ZTSM7RegularFviE.virtual"} +!5 = !{i64 24, !"_ZTSM7RegularFviE.virtual"} +!6 = !{i32 1, !"ThinLTO", i32 0} +!7 = !{i32 1, !"EnableSplitLTOUnit", i32 1} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll new file mode 100644 index 000000000000..30bd75606f7d --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll @@ -0,0 +1,136 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cp %s %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!6, !7}' >> %t1_regular.ll +; RUN: echo '!6 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!7 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +;; With --lto-whole-program-visibility, we assume no native types can interfere +;; and thus proceed with devirtualization even in the presence of native types + +;; Index based WPD +; RUN: ld.lld %t1.o -o %t3_index -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o -o %t3_hybrid -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o -o %t3_regular -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi +; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +;; With --lto-whole-program-visibility and --lto-validate-all-vtables-have-type-infos +;; we rely on resolutions on the typename symbol to inform us of what's outside the summary. +;; Without the typename symbol in the LTO unit (e.g. RTTI disabled) this causes +;; conservative disablement of WPD on these types unless it's local + +;; Index based WPD +; RUN: ld.lld %t1.o -o %t3_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o -o %t3_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o -o %t3_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +; VALIDATE-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1B1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1C1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !2 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN1D1mEi] }, !type !3, !vcall_visibility !5 + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ] + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + ;; No resolution for _ZTS1A means we don't devirtualize + ; CHECK-VALIDATE-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; We still have to call it as virtual. + ; CHECK-IR: %call3 = tail call i32 %fptr22 + ; CHECK-VALIDATE-IR: %call3 = tail call i32 %fptr22 + %call3 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !4) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call4 = tail call i32 @_ZN1D1mEi + ;; Being local this has VCallVisibilityTranslationUnit + ;; visibility so it's still devirtualized + ; CHECK-VALIDATE-IR: %call4 = tail call i32 @_ZN1D1mEi + %call4 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call3) + ret i32 %call4 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1B1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1C1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTS1B"} +!2 = !{i64 16, !"_ZTS1C"} +!3 = !{i64 16, !4} +!4 = distinct !{} +!5 = !{i64 2} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll new file mode 100644 index 000000000000..4ef048d6b6c6 --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll @@ -0,0 +1,130 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cp %s %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!2, !3}' >> %t1_regular.ll +; RUN: echo '!2 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!3 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_ref.ll -o %t2.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2.bc -o %t2.o + +;; Native objects can contain only a reference to the base type infos if the base declaration has no key functions. +;; Because of that, --lto-validate-all-vtables-have-type-infos needs to query for the type info symbol inside native files rather than the +;; type name symbol that's used as the key in !type metadata to correctly stop devirtualization on the native type. + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t3_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t3_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t3_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +; CHECK-NOT: single-impl: devirtualized a call to _ZN1A3fooEv + +;; Source code: +;; cat > a.h <<'eof' +;; struct A { virtual int foo(); }; +;; int bar(A *a); +;; eof +;; cat > main.cc <<'eof' +;; #include "a.h" +;; +;; int A::foo() { return 1; } +;; int bar(A *a) { return a->foo(); } +;; +;; extern int baz(); +;; int main() { +;; A a; +;; int i = bar(&a); +;; int j = baz(); +;; return i + j; +;; } +;; eof +;; clang++ -fwhole-program-vtables -fno-split-lto-unit -flto=thin main.cc -c + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { %struct.Abase } +%struct.Abase = type { ptr } + +@_ZTV1A = dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1A, ptr @_ZN1A3fooEv] }, align 8, !type !0, !type !1 +@_ZTS1A = dso_local constant [3 x i8] c"1A\00", align 1 +@_ZTI1A = dso_local constant { ptr, ptr } { ptr null, ptr @_ZTS1A }, align 8 + +define dso_local noundef i32 @_ZN1A3fooEv(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 align 2 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + ret i32 1 +} + +; CHECK-IR: define dso_local noundef i32 @_Z3barP1A +define dso_local noundef i32 @_Z3barP1A(ptr noundef %a) #0 { +entry: + %a.addr = alloca ptr + store ptr %a, ptr %a.addr + %0 = load ptr, ptr %a.addr + %vtable = load ptr, ptr %0 + %1 = call i1 @llvm.public.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %1) + %vfn = getelementptr inbounds ptr, ptr %vtable, i64 0 + %fptr = load ptr, ptr %vfn + ;; Check that the call was not devirtualized. + ; CHECK-IR: %call = call noundef i32 %fptr + %call = call noundef i32 %fptr(ptr noundef nonnull align 8 dereferenceable(8) %0) + ret i32 %call +} +; CHECK-IR: ret i32 +; CHECK-IR: } + +declare i1 @llvm.public.type.test(ptr, metadata) +declare void @llvm.assume(i1 noundef) + +define dso_local noundef i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %a = alloca %struct.A, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + store i32 0, ptr %retval, align 4 + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %a) + %call = call noundef i32 @_Z3barP1A(ptr noundef %a) + store i32 %call, ptr %i, align 4 + %call1 = call noundef i32 @_Z3bazv() + store i32 %call1, ptr %j, align 4 + %0 = load i32, ptr %i, align 4 + %1 = load i32, ptr %j, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +define linkonce_odr dso_local void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +declare noundef i32 @_Z3bazv() + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFivE.virtual"} diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h index 5c23ba4f7ac4..76e19dd00791 100644 --- a/llvm/include/llvm/LTO/Config.h +++ b/llvm/include/llvm/LTO/Config.h @@ -80,6 +80,12 @@ struct Config { /// link. bool HasWholeProgramVisibility = false; + /// We're validating that all native vtables have corresponding type infos. + bool ValidateAllVtablesHaveTypeInfos = false; + /// If all native vtables have corresponding type infos, allow + /// usage of RTTI to block devirtualization on types used in native files. + bool AllVtablesHaveTypeInfos = false; + /// Always emit a Regular LTO object even when it is empty because no Regular /// LTO modules were linked. This option is useful for some build system which /// want to know a priori all possible output files. diff --git a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h index 9e121d9c6f4e..0be3146f695a 100644 --- a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h +++ b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h @@ -243,10 +243,18 @@ void updatePublicTypeTestCalls(Module &M, bool WholeProgramVisibilityEnabledInLTO); void updateVCallVisibilityInModule( Module &M, bool WholeProgramVisibilityEnabledInLTO, - const DenseSet &DynamicExportSymbols); + const DenseSet &DynamicExportSymbols, + bool ValidateAllVtablesHaveTypeInfos, + function_ref IsVisibleToRegularObj); void updateVCallVisibilityInIndex( ModuleSummaryIndex &Index, bool WholeProgramVisibilityEnabledInLTO, - const DenseSet &DynamicExportSymbols); + const DenseSet &DynamicExportSymbols, + const DenseSet &VisibleToRegularObjSymbols); + +void getVisibleToRegularObjVtableGUIDs( + ModuleSummaryIndex &Index, + DenseSet &VisibleToRegularObjSymbols, + function_ref IsVisibleToRegularObj); /// Perform index-based whole program devirtualization on the \p Summary /// index. Any devirtualized targets used by a type test in another module diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index bc8abb751221..6efdf6a7c3c9 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1285,13 +1285,27 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { updateMemProfAttributes(*RegularLTO.CombinedModule, ThinLTO.CombinedIndex); + bool WholeProgramVisibilityEnabledInLTO = + Conf.HasWholeProgramVisibility && + // If validation is enabled, upgrade visibility only when all vtables + // have typeinfos. + (!Conf.ValidateAllVtablesHaveTypeInfos || Conf.AllVtablesHaveTypeInfos); + + // This returns true when the name is local or not defined. Locals are + // expected to be handled separately. + auto IsVisibleToRegularObj = [&](StringRef name) { + auto It = GlobalResolutions.find(name); + return (It == GlobalResolutions.end() || It->second.VisibleOutsideSummary); + }; + // If allowed, upgrade public vcall visibility metadata to linkage unit // visibility before whole program devirtualization in the optimizer. - updateVCallVisibilityInModule(*RegularLTO.CombinedModule, - Conf.HasWholeProgramVisibility, - DynamicExportSymbols); + updateVCallVisibilityInModule( + *RegularLTO.CombinedModule, WholeProgramVisibilityEnabledInLTO, + DynamicExportSymbols, Conf.ValidateAllVtablesHaveTypeInfos, + IsVisibleToRegularObj); updatePublicTypeTestCalls(*RegularLTO.CombinedModule, - Conf.HasWholeProgramVisibility); + WholeProgramVisibilityEnabledInLTO); if (Conf.PreOptModuleHook && !Conf.PreOptModuleHook(0, *RegularLTO.CombinedModule)) @@ -1693,13 +1707,38 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, std::set ExportedGUIDs; - if (hasWholeProgramVisibility(Conf.HasWholeProgramVisibility)) + bool WholeProgramVisibilityEnabledInLTO = + Conf.HasWholeProgramVisibility && + // If validation is enabled, upgrade visibility only when all vtables + // have typeinfos. + (!Conf.ValidateAllVtablesHaveTypeInfos || Conf.AllVtablesHaveTypeInfos); + if (hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) ThinLTO.CombinedIndex.setWithWholeProgramVisibility(); + + // If we're validating, get the vtable symbols that should not be + // upgraded because they correspond to typeIDs outside of index-based + // WPD info. + DenseSet VisibleToRegularObjSymbols; + if (WholeProgramVisibilityEnabledInLTO && + Conf.ValidateAllVtablesHaveTypeInfos) { + // This returns true when the name is local or not defined. Locals are + // expected to be handled separately. + auto IsVisibleToRegularObj = [&](StringRef name) { + auto It = GlobalResolutions.find(name); + return (It == GlobalResolutions.end() || + It->second.VisibleOutsideSummary); + }; + + getVisibleToRegularObjVtableGUIDs(ThinLTO.CombinedIndex, + VisibleToRegularObjSymbols, + IsVisibleToRegularObj); + } + // If allowed, upgrade public vcall visibility to linkage unit visibility in // the summaries before whole program devirtualization below. - updateVCallVisibilityInIndex(ThinLTO.CombinedIndex, - Conf.HasWholeProgramVisibility, - DynamicExportSymbols); + updateVCallVisibilityInIndex( + ThinLTO.CombinedIndex, WholeProgramVisibilityEnabledInLTO, + DynamicExportSymbols, VisibleToRegularObjSymbols); // Perform index-based WPD. This will return immediately if there are // no index entries in the typeIdMetadata map (e.g. if we are instead diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp index 1402da7fbbd2..3e2216ca61a2 100644 --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -604,11 +604,14 @@ bool LTOCodeGenerator::optimize() { // pipeline run below. updatePublicTypeTestCalls(*MergedModule, /* WholeProgramVisibilityEnabledInLTO */ false); - updateVCallVisibilityInModule(*MergedModule, - /* WholeProgramVisibilityEnabledInLTO */ false, - // FIXME: This needs linker information via a - // TBD new interface. - /* DynamicExportSymbols */ {}); + updateVCallVisibilityInModule( + *MergedModule, + /* WholeProgramVisibilityEnabledInLTO */ false, + // FIXME: These need linker information via a + // TBD new interface. + /*DynamicExportSymbols=*/{}, + /*ValidateAllVtablesHaveTypeInfos=*/false, + /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); // We always run the verifier once on the merged module, the `DisableVerify` // parameter only applies to subsequent verify. diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index 24cd6e1a0b41..152f708969e1 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -1058,11 +1058,14 @@ void ThinLTOCodeGenerator::run() { // via the internal option. Must be done before WPD below. if (hasWholeProgramVisibility(/* WholeProgramVisibilityEnabledInLTO */ false)) Index->setWithWholeProgramVisibility(); + + // FIXME: This needs linker information via a TBD new interface updateVCallVisibilityInIndex(*Index, - /* WholeProgramVisibilityEnabledInLTO */ false, - // FIXME: This needs linker information via a + /*WholeProgramVisibilityEnabledInLTO=*/false, + // FIXME: These need linker information via a // TBD new interface. - /* DynamicExportSymbols */ {}); + /*DynamicExportSymbols=*/{}, + /*VisibleToRegularObjSymbols=*/{}); // Perform index-based WPD. This will return immediately if there are // no index entries in the typeIdMetadata map (e.g. if we are instead diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index d33258642365..3406595950b5 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -784,12 +784,52 @@ bool hasWholeProgramVisibility(bool WholeProgramVisibilityEnabledInLTO) { !DisableWholeProgramVisibility; } +static bool +typeIDVisibleToRegularObj(StringRef TypeID, + function_ref IsVisibleToRegularObj) { + // TypeID for member function pointer type is an internal construct + // and won't exist in IsVisibleToRegularObj. The full TypeID + // will be present and participate in invalidation. + if (TypeID.ends_with(".virtual")) + return false; + + // TypeID that doesn't start with Itanium mangling (_ZTS) will be + // non-externally visible types which cannot interact with + // external native files. See CodeGenModule::CreateMetadataIdentifierImpl. + if (!TypeID.consume_front("_ZTS")) + return false; + + // TypeID is keyed off the type name symbol (_ZTS). However, the native + // object may not contain this symbol if it does not contain a key + // function for the base type and thus only contains a reference to the + // type info (_ZTI). To catch this case we query using the type info + // symbol corresponding to the TypeID. + std::string typeInfo = ("_ZTI" + TypeID).str(); + return IsVisibleToRegularObj(typeInfo); +} + +static bool +skipUpdateDueToValidation(GlobalVariable &GV, + function_ref IsVisibleToRegularObj) { + SmallVector Types; + GV.getMetadata(LLVMContext::MD_type, Types); + + for (auto Type : Types) + if (auto *TypeID = dyn_cast(Type->getOperand(1).get())) + return typeIDVisibleToRegularObj(TypeID->getString(), + IsVisibleToRegularObj); + + return false; +} + /// If whole program visibility asserted, then upgrade all public vcall /// visibility metadata on vtable definitions to linkage unit visibility in /// Module IR (for regular or hybrid LTO). void updateVCallVisibilityInModule( Module &M, bool WholeProgramVisibilityEnabledInLTO, - const DenseSet &DynamicExportSymbols) { + const DenseSet &DynamicExportSymbols, + bool ValidateAllVtablesHaveTypeInfos, + function_ref IsVisibleToRegularObj) { if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) return; for (GlobalVariable &GV : M.globals()) { @@ -800,7 +840,13 @@ void updateVCallVisibilityInModule( GV.getVCallVisibility() == GlobalObject::VCallVisibilityPublic && // Don't upgrade the visibility for symbols exported to the dynamic // linker, as we have no information on their eventual use. - !DynamicExportSymbols.count(GV.getGUID())) + !DynamicExportSymbols.count(GV.getGUID()) && + // With validation enabled, we want to exclude symbols visible to + // regular objects. Local symbols will be in this group due to the + // current implementation but those with VCallVisibilityTranslationUnit + // will have already been marked in clang so are unaffected. + !(ValidateAllVtablesHaveTypeInfos && + skipUpdateDueToValidation(GV, IsVisibleToRegularObj))) GV.setVCallVisibilityMetadata(GlobalObject::VCallVisibilityLinkageUnit); } } @@ -832,12 +878,26 @@ void updatePublicTypeTestCalls(Module &M, } } +/// Based on typeID string, get all associated vtable GUIDS that are +/// visible to regular objects. +void getVisibleToRegularObjVtableGUIDs( + ModuleSummaryIndex &Index, + DenseSet &VisibleToRegularObjSymbols, + function_ref IsVisibleToRegularObj) { + for (const auto &typeID : Index.typeIdCompatibleVtableMap()) { + if (typeIDVisibleToRegularObj(typeID.first, IsVisibleToRegularObj)) + for (const TypeIdOffsetVtableInfo &P : typeID.second) + VisibleToRegularObjSymbols.insert(P.VTableVI.getGUID()); + } +} + /// If whole program visibility asserted, then upgrade all public vcall /// visibility metadata on vtable definition summaries to linkage unit /// visibility in Module summary index (for ThinLTO). void updateVCallVisibilityInIndex( ModuleSummaryIndex &Index, bool WholeProgramVisibilityEnabledInLTO, - const DenseSet &DynamicExportSymbols) { + const DenseSet &DynamicExportSymbols, + const DenseSet &VisibleToRegularObjSymbols) { if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) return; for (auto &P : Index) { @@ -850,6 +910,12 @@ void updateVCallVisibilityInIndex( if (!GVar || GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic) continue; + // With validation enabled, we want to exclude symbols visible to regular + // objects. Local symbols will be in this group due to the current + // implementation but those with VCallVisibilityTranslationUnit will have + // already been marked in clang so are unaffected. + if (VisibleToRegularObjSymbols.count(P.first)) + continue; GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit); } } @@ -1045,8 +1111,8 @@ bool DevirtModule::tryFindVirtualCallTargets( } bool DevirtIndex::tryFindVirtualCallTargets( - std::vector &TargetsForSlot, const TypeIdCompatibleVtableInfo TIdInfo, - uint64_t ByteOffset) { + std::vector &TargetsForSlot, + const TypeIdCompatibleVtableInfo TIdInfo, uint64_t ByteOffset) { for (const TypeIdOffsetVtableInfo &P : TIdInfo) { // Find a representative copy of the vtable initializer. // We can have multiple available_externally, linkonce_odr and weak_odr diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp index 28ecf1af321f..abaf1e123145 100644 --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -581,9 +581,14 @@ int main(int argc, char **argv) { // the facility for updating public visibility to linkage unit visibility when // specified by an internal option. This is normally done during LTO which is // not performed via opt. - updateVCallVisibilityInModule(*M, - /* WholeProgramVisibilityEnabledInLTO */ false, - /* DynamicExportSymbols */ {}); + updateVCallVisibilityInModule( + *M, + /*WholeProgramVisibilityEnabledInLTO=*/false, + // FIXME: These need linker information via a + // TBD new interface. + /*DynamicExportSymbols=*/{}, + /*ValidateAllVtablesHaveTypeInfos=*/false, + /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); // Figure out what stream we are supposed to write to... std::unique_ptr Out; -- Gitee From a8b94b8170220acac4ff98a6e660e9078ee495a3 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 25 Sep 2023 09:49:40 -0700 Subject: [PATCH 92/94] [Backport][ELF] Change --call-graph-profile-sort to accept an argument Change the FF form --call-graph-profile-sort to --call-graph-profile-sort={none,hfsort}. This will be extended to support llvm/lib/Transforms/Utils/CodeLayout.cpp. --call-graph-profile-sort is not used in the wild but --no-call-graph-profile-sort is (Chromium). Make --no-call-graph-profile-sort an alias for --call-graph-profile-sort=none. Reviewed By: rahmanl Differential Revision: https://reviews.llvm.org/D159544 --- lld/ELF/Config.h | 5 ++++- lld/ELF/Driver.cpp | 16 ++++++++++++---- lld/ELF/Options.td | 9 ++++++--- lld/docs/ld.lld.1 | 11 +++++++++++ lld/test/ELF/cgprofile-obj.s | 5 ++++- lld/test/ELF/cgprofile-txt.s | 10 +++++++++- 6 files changed, 46 insertions(+), 10 deletions(-) diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 8bdf9ff654a9..73f9b00dac84 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -59,6 +59,9 @@ enum class BsymbolicKind { None, NonWeakFunctions, Functions, All }; // For --build-id. enum class BuildIdKind { None, Fast, Md5, Sha1, Hexstring, Uuid }; +// For --call-graph-profile-sort={none,hfsort}. +enum class CGProfileSortKind { None, Hfsort }; + // For --discard-{all,locals,none}. enum class DiscardPolicy { Default, All, Locals, None }; @@ -214,7 +217,7 @@ struct Config { bool asNeeded = false; bool armBe8 = false; BsymbolicKind bsymbolic = BsymbolicKind::None; - bool callGraphProfileSort; + CGProfileSortKind callGraphProfileSort; bool checkSections; bool checkDynamicRelocs; llvm::DebugCompressionType compressDebugSections; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 044006f58d6e..ad9de476d0ac 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1095,6 +1095,15 @@ static void ltoValidateAllVtablesHaveTypeInfos(opt::InputArgList &args) { } } +static CGProfileSortKind getCGProfileSortKind(opt::InputArgList &args) { + StringRef s = args.getLastArgValue(OPT_call_graph_profile_sort, "hfsort"); + if (s == "hfsort") + return CGProfileSortKind::Hfsort; + if (s != "none") + error("unknown --call-graph-profile-sort= value: " + s); + return CGProfileSortKind::None; +} + static DebugCompressionType getCompressionType(StringRef s, StringRef option) { DebugCompressionType type = StringSwitch(s) .Case("zlib", DebugCompressionType::Zlib) @@ -1226,6 +1235,7 @@ static void readConfigs(opt::InputArgList &args) { else if (arg->getOption().matches(OPT_Bsymbolic)) config->bsymbolic = BsymbolicKind::All; } + config->callGraphProfileSort = getCGProfileSortKind(args); config->checkSections = args.hasFlag(OPT_check_sections, OPT_no_check_sections, true); config->chroot = args.getLastArgValue(OPT_chroot); @@ -1246,8 +1256,6 @@ static void readConfigs(opt::InputArgList &args) { args.hasFlag(OPT_eh_frame_hdr, OPT_no_eh_frame_hdr, false); config->emitLLVM = args.hasArg(OPT_plugin_opt_emit_llvm, false); config->emitRelocs = args.hasArg(OPT_emit_relocs); - config->callGraphProfileSort = args.hasFlag( - OPT_call_graph_profile_sort, OPT_no_call_graph_profile_sort, true); config->enableNewDtags = args.hasFlag(OPT_enable_new_dtags, OPT_disable_new_dtags, true); config->entry = args.getLastArgValue(OPT_entry); @@ -1680,7 +1688,7 @@ static void readConfigs(opt::InputArgList &args) { config->symbolOrderingFile = getSymbolOrderingFile(*buffer); // Also need to disable CallGraphProfileSort to prevent // LLD order symbols with CGProfile - config->callGraphProfileSort = false; + config->callGraphProfileSort = CGProfileSortKind::None; } } @@ -3086,7 +3094,7 @@ void LinkerDriver::link(opt::InputArgList &args) { } // Read the callgraph now that we know what was gced or icfed - if (config->callGraphProfileSort) { + if (config->callGraphProfileSort != CGProfileSortKind::None) { if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file)) if (std::optional buffer = readFile(arg->getValue())) readCallGraph(*buffer); diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td index c28de16d6fdb..29f60633914c 100644 --- a/lld/ELF/Options.td +++ b/lld/ELF/Options.td @@ -125,9 +125,12 @@ defm as_needed: B<"as-needed", defm call_graph_ordering_file: Eq<"call-graph-ordering-file", "Layout sections to optimize the given callgraph">; -defm call_graph_profile_sort: BB<"call-graph-profile-sort", - "Reorder sections with call graph profile (default)", - "Do not reorder sections with call graph profile">; +def call_graph_profile_sort: JJ<"call-graph-profile-sort=">, + HelpText<"Reorder input sections with call graph profile using the specified algorithm (default: hfsort)">, + MetaVarName<"[none,hfsort]">, + Values<"none,hfsort">; +def : FF<"no-call-graph-profile-sort">, Alias, AliasArgs<["none"]>, + Flags<[HelpHidden]>; // --chroot doesn't have a help text because it is an internal option. def chroot: Separate<["--"], "chroot">; diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1 index 0a5e4293deda..827c42d324ac 100644 --- a/lld/docs/ld.lld.1 +++ b/lld/docs/ld.lld.1 @@ -117,6 +117,17 @@ is not intended to be cryptographically secure. .It Fl -build-id Synonym for .Fl -build-id Ns = Ns Cm fast . +.It Fl -call-graph-profile-sort Ns = Ns Ar algorithm +.Ar algorithm +may be: +.Pp +.Bl -tag -width 2n -compact +.It Cm none +Ignore call graph profile. +.It Cm hfsort +Use hfsort (default). +.El +.Pp .It Fl -color-diagnostics Ns = Ns Ar value Use colors in diagnostics. .Ar value diff --git a/lld/test/ELF/cgprofile-obj.s b/lld/test/ELF/cgprofile-obj.s index f56f3bcbf0c3..0848adc5e427 100644 --- a/lld/test/ELF/cgprofile-obj.s +++ b/lld/test/ELF/cgprofile-obj.s @@ -3,8 +3,11 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o # RUN: ld.lld -e A %t.o -o %t # RUN: llvm-nm --no-sort %t | FileCheck %s -# RUN: ld.lld --no-call-graph-profile-sort -e A %t.o -o %t +# RUN: ld.lld --call-graph-profile-sort=none -e A %t.o -o %t # RUN: llvm-nm --no-sort %t | FileCheck %s --check-prefix=NO-CG +## --no-call-graph-profile-sort is an alias for --call-graph-profile-sort=none. +# RUN: ld.lld --no-call-graph-profile-sort -e A %t.o -o %t1 +# RUN: cmp %t %t1 .section .text.D,"ax",@progbits D: diff --git a/lld/test/ELF/cgprofile-txt.s b/lld/test/ELF/cgprofile-txt.s index 99cbfa574532..2c0c9642a509 100644 --- a/lld/test/ELF/cgprofile-txt.s +++ b/lld/test/ELF/cgprofile-txt.s @@ -24,8 +24,16 @@ # RUN: echo "TooManyPreds8 TooManyPreds 10" >> %t.call_graph # RUN: echo "TooManyPreds9 TooManyPreds 10" >> %t.call_graph # RUN: echo "TooManyPreds10 TooManyPreds 11" >> %t.call_graph -# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=hfsort -o %t2 # RUN: llvm-readobj --symbols %t2 | FileCheck %s +## --call-graph-profile-sort=hfsort is the default. +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2b +# RUN: cmp %t2 %t2b + +# RUN: not ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=sort \ +# RUN: -o /dev/null 2>&1 | FileCheck %s --check-prefix=UNKNOWN + +# UNKNOWN: error: unknown --call-graph-profile-sort= value: sort .section .text.D,"ax",@progbits D: -- Gitee From e475eebfd7775f221fb7003f901adfcf4306c22d Mon Sep 17 00:00:00 2001 From: spupyrev Date: Tue, 13 Jun 2023 10:08:00 -0700 Subject: [PATCH 93/94] [Backport][ELF] A new code layout algorithm for function reordering [3a/3] We are brining a new algorithm for function layout (reordering) based on the call graph (extracted from a profile data). The algorithm is an improvement of top of a known heuristic, C^3. It tries to co-locate hot and frequently executed together functions in the resulting ordering. Unlike C^3, it explores a larger search space and have an objective closely tied to the performance of instruction and i-TLB caches. Hence, the name CDS = Cache-Directed Sort. The algorithm can be used at the linking or post-linking (e.g., BOLT) stage. Refer to https://reviews.llvm.org/D152834 for the actual implementation of the reordering algorithm. This diff adds a linker option to replace the existing C^3 heuristic with CDS. The new behavior can be turned on by passing "--use-cache-directed-sort". (the plan is to make it default in a next diff) **Perf-impact** clang-10 binary (built with LTO+AutoFDO/CSSPGO): wins on top of C^3 in [0.3%..0.8%] rocksDB-8 binary (built with LTO+CSSPGO): wins on top of C^3 in [0.8%..1.5%] Note that function layout affects the perf the most on older machines (with smaller instruction/iTLB caches) and when huge pages are not enabled. The impact on newer processors with huge pages enabled is likely neutral/minor. Reviewed By: MaskRay Differential Revision: https://reviews.llvm.org/D152840 --- lld/ELF/CMakeLists.txt | 1 + lld/ELF/CallGraphSort.cpp | 140 ++++++++++++++++++++++++++-------- lld/ELF/CallGraphSort.h | 2 + lld/ELF/Config.h | 4 +- lld/ELF/Driver.cpp | 2 + lld/ELF/Options.td | 2 +- lld/docs/ld.lld.1 | 2 + lld/test/ELF/cgprofile-txt.s | 28 +++++++ lld/test/ELF/cgprofile-txt2.s | 31 +++++--- 9 files changed, 166 insertions(+), 46 deletions(-) diff --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt index 048c3e54ca44..6fde9fe962a3 100644 --- a/lld/ELF/CMakeLists.txt +++ b/lld/ELF/CMakeLists.txt @@ -74,6 +74,7 @@ add_lld_library(lldELF Passes Support TargetParser + TransformUtils LINK_LIBS lldCommon diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp index ff72731b1f38..5e36964da94f 100644 --- a/lld/ELF/CallGraphSort.cpp +++ b/lld/ELF/CallGraphSort.cpp @@ -6,38 +6,21 @@ // //===----------------------------------------------------------------------===// /// -/// Implementation of Call-Chain Clustering from: Optimizing Function Placement -/// for Large-Scale Data-Center Applications -/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf -/// -/// The goal of this algorithm is to improve runtime performance of the final -/// executable by arranging code sections such that page table and i-cache -/// misses are minimized. -/// -/// Definitions: -/// * Cluster -/// * An ordered list of input sections which are laid out as a unit. At the -/// beginning of the algorithm each input section has its own cluster and -/// the weight of the cluster is the sum of the weight of all incoming -/// edges. -/// * Call-Chain Clustering (C³) Heuristic -/// * Defines when and how clusters are combined. Pick the highest weighted -/// input section then add it to its most likely predecessor if it wouldn't -/// penalize it too much. -/// * Density -/// * The weight of the cluster divided by the size of the cluster. This is a -/// proxy for the amount of execution time spent per byte of the cluster. -/// -/// It does so given a call graph profile by the following: -/// * Build a weighted call graph from the call graph profile -/// * Sort input sections by weight -/// * For each input section starting with the highest weight -/// * Find its most likely predecessor cluster -/// * Check if the combined cluster would be too large, or would have too low -/// a density. -/// * If not, then combine the clusters. -/// * Sort non-empty clusters by density +/// The file is responsible for sorting sections using LLVM call graph profile +/// data by placing frequently executed code sections together. The goal of the +/// placement is to improve the runtime performance of the final executable by +/// arranging code sections so that i-TLB misses and i-cache misses are reduced. /// +/// The algorithm first builds a call graph based on the profile data and then +/// iteratively merges "chains" (ordered lists) of input sections which will be +/// laid out as a unit. There are two implementations for deciding how to +/// merge a pair of chains: +/// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows +/// "Optimizing Function Placement for Large-Scale Data-Center Applications" +/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +/// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which +/// typically produces layouts with higher locality, and hence, yields fewer +/// instruction cache misses on large binaries. //===----------------------------------------------------------------------===// #include "CallGraphSort.h" @@ -45,6 +28,7 @@ #include "InputSection.h" #include "Symbols.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Transforms/Utils/CodeLayout.h" #include @@ -75,6 +59,33 @@ struct Cluster { Edge bestPred = {-1, 0}; }; +/// Implementation of the Call-Chain Clustering (C^3). The goal of this +/// algorithm is to improve runtime performance of the executable by arranging +/// code sections such that page table and i-cache misses are minimized. +/// +/// Definitions: +/// * Cluster +/// * An ordered list of input sections which are laid out as a unit. At the +/// beginning of the algorithm each input section has its own cluster and +/// the weight of the cluster is the sum of the weight of all incoming +/// edges. +/// * Call-Chain Clustering (C³) Heuristic +/// * Defines when and how clusters are combined. Pick the highest weighted +/// input section then add it to its most likely predecessor if it wouldn't +/// penalize it too much. +/// * Density +/// * The weight of the cluster divided by the size of the cluster. This is a +/// proxy for the amount of execution time spent per byte of the cluster. +/// +/// It does so given a call graph profile by the following: +/// * Build a weighted call graph from the call graph profile +/// * Sort input sections by weight +/// * For each input section starting with the highest weight +/// * Find its most likely predecessor cluster +/// * Check if the combined cluster would be too large, or would have too low +/// a density. +/// * If not, then combine the clusters. +/// * Sort non-empty clusters by density class CallGraphSort { public: CallGraphSort(); @@ -260,11 +271,74 @@ DenseMap CallGraphSort::run() { return orderMap; } +// Sort sections by the profile data using the Cache-Directed Sort algorithm. +// The placement is done by optimizing the locality by co-locating frequently +// executed code sections together. +DenseMap elf::computeCacheDirectedSortOrder() { + SmallVector funcSizes; + SmallVector funcCounts; + SmallVector callCounts; + SmallVector callOffsets; + SmallVector sections; + DenseMap secToTargetId; + + auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t { + auto res = secToTargetId.try_emplace(inSec, sections.size()); + if (res.second) { + // inSec does not appear before in the graph. + sections.push_back(inSec); + assert(inSec->getSize() > 0 && "found a function with zero size"); + funcSizes.push_back(inSec->getSize()); + funcCounts.push_back(0); + } + return res.first->second; + }; + + // Create the graph. + for (std::pair &c : config->callGraphProfile) { + const InputSectionBase *fromSB = cast(c.first.first); + const InputSectionBase *toSB = cast(c.first.second); + // Ignore edges between input sections belonging to different sections. + if (fromSB->getOutputSection() != toSB->getOutputSection()) + continue; + + uint64_t weight = c.second; + // Ignore edges with zero weight. + if (weight == 0) + continue; + + size_t from = getOrCreateNode(fromSB); + size_t to = getOrCreateNode(toSB); + // Ignore self-edges (recursive calls). + if (from == to) + continue; + + callCounts.push_back({from, to, weight}); + // Assume that the jump is at the middle of the input section. The profile + // data does not contain jump offsets. + callOffsets.push_back((funcSizes[from] + 1) / 2); + funcCounts[to] += weight; + } + + // Run the layout algorithm. + std::vector sortedSections = codelayout::computeCacheDirectedLayout( + funcSizes, funcCounts, callCounts, callOffsets); + + // Create the final order. + DenseMap orderMap; + int curOrder = 1; + for (uint64_t secIdx : sortedSections) + orderMap[sections[secIdx]] = curOrder++; + + return orderMap; +} + // Sort sections by the profile data provided by --callgraph-profile-file. // // This first builds a call graph based on the profile data then merges sections -// according to the C³ heuristic. All clusters are then sorted by a density -// metric to further improve locality. +// according either to the C³ or Cache-Directed-Sort ordering algorithm. DenseMap elf::computeCallGraphProfileOrder() { + if (config->callGraphProfileSort == CGProfileSortKind::Cdsort) + return computeCacheDirectedSortOrder(); return CallGraphSort().run(); } diff --git a/lld/ELF/CallGraphSort.h b/lld/ELF/CallGraphSort.h index 4997cb102c32..1b54f2b62482 100644 --- a/lld/ELF/CallGraphSort.h +++ b/lld/ELF/CallGraphSort.h @@ -14,6 +14,8 @@ namespace lld::elf { class InputSectionBase; +llvm::DenseMap computeCacheDirectedSortOrder(); + llvm::DenseMap computeCallGraphProfileOrder(); } // namespace lld::elf diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 73f9b00dac84..aa9f5456a754 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -59,8 +59,8 @@ enum class BsymbolicKind { None, NonWeakFunctions, Functions, All }; // For --build-id. enum class BuildIdKind { None, Fast, Md5, Sha1, Hexstring, Uuid }; -// For --call-graph-profile-sort={none,hfsort}. -enum class CGProfileSortKind { None, Hfsort }; +// For --call-graph-profile-sort={none,hfsort,cdsort}. +enum class CGProfileSortKind { None, Hfsort, Cdsort }; // For --discard-{all,locals,none}. enum class DiscardPolicy { Default, All, Locals, None }; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index ad9de476d0ac..4b563a0fdf2f 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1099,6 +1099,8 @@ static CGProfileSortKind getCGProfileSortKind(opt::InputArgList &args) { StringRef s = args.getLastArgValue(OPT_call_graph_profile_sort, "hfsort"); if (s == "hfsort") return CGProfileSortKind::Hfsort; + if (s == "cdsort") + return CGProfileSortKind::Cdsort; if (s != "none") error("unknown --call-graph-profile-sort= value: " + s); return CGProfileSortKind::None; diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td index 29f60633914c..dea6c16949ee 100644 --- a/lld/ELF/Options.td +++ b/lld/ELF/Options.td @@ -621,7 +621,7 @@ def lto_cs_profile_file: JJ<"lto-cs-profile-file=">, defm lto_pgo_warn_mismatch: BB<"lto-pgo-warn-mismatch", "turn on warnings about profile cfg mismatch (default)", "turn off warnings about profile cfg mismatch">; -defm lto_known_safe_vtables : EEq<"lto-known-safe-vtables", +defm lto_known_safe_vtables : EEq<"lto-known-safe-vtables", "When --lto-validate-all-vtables-have-type-infos is enabled, skip validation on these vtables (_ZTV symbols)">; def lto_obj_path_eq: JJ<"lto-obj-path=">; def lto_sample_profile: JJ<"lto-sample-profile=">, diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1 index 827c42d324ac..72b90094eec2 100644 --- a/lld/docs/ld.lld.1 +++ b/lld/docs/ld.lld.1 @@ -126,6 +126,8 @@ may be: Ignore call graph profile. .It Cm hfsort Use hfsort (default). +.It Cm cdsort +Use cdsort. .El .Pp .It Fl -color-diagnostics Ns = Ns Ar value diff --git a/lld/test/ELF/cgprofile-txt.s b/lld/test/ELF/cgprofile-txt.s index 2c0c9642a509..c9194bbbc43c 100644 --- a/lld/test/ELF/cgprofile-txt.s +++ b/lld/test/ELF/cgprofile-txt.s @@ -30,6 +30,9 @@ # RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2b # RUN: cmp %t2 %t2b +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=cdsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CDSORT + # RUN: not ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=sort \ # RUN: -o /dev/null 2>&1 | FileCheck %s --check-prefix=UNKNOWN @@ -167,6 +170,31 @@ TooManyPreds10: # CHECK: Name: _init2 # CHECK-NEXT: Value: 0x201141 +# CDSORT: Name: D +# CDSORT-NEXT: Value: 0x201123 +# CDSORT: Name: TooManyPreds +# CDSORT-NEXT: Value: 0x20112F +# CDSORT: Name: TooManyPreds10 +# CDSORT-NEXT: Value: 0x20112E +# CDSORT: Name: C +# CDSORT-NEXT: Value: 0x201122 +# CDSORT: Name: B +# CDSORT-NEXT: Value: 0x201121 +# CDSORT: Name: A +# CDSORT-NEXT: Value: 0x201120 +# CDSORT: Name: TS +# CDSORT-NEXT: Value: 0x20113D +# CDSORT: Name: PP +# CDSORT-NEXT: Value: 0x20113C +# CDSORT: Name: QC +# CDSORT-NEXT: Value: 0x20113E +# CDSORT: Name: GB +# CDSORT-NEXT: Value: 0x20113F +# CDSORT: Name: _init +# CDSORT-NEXT: Value: 0x201140 +# CDSORT: Name: _init2 +# CDSORT-NEXT: Value: 0x201141 + # NOSORT: Name: D # NOSORT-NEXT: Value: 0x201120 # NOSORT: Name: TooManyPreds diff --git a/lld/test/ELF/cgprofile-txt2.s b/lld/test/ELF/cgprofile-txt2.s index 91961db39c3a..b59b6eeb292f 100644 --- a/lld/test/ELF/cgprofile-txt2.s +++ b/lld/test/ELF/cgprofile-txt2.s @@ -5,17 +5,28 @@ # RUN: echo "B C 50" >> %t.call_graph # RUN: echo "C D 40" >> %t.call_graph # RUN: echo "D B 10" >> %t.call_graph -# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 -# RUN: llvm-readobj --symbols %t2 | FileCheck %s +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=hfsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKC3 +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=cdsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKCDS -# CHECK: Name: A -# CHECK-NEXT: Value: 0x201123 -# CHECK: Name: B -# CHECK-NEXT: Value: 0x201120 -# CHECK: Name: C -# CHECK-NEXT: Value: 0x201121 -# CHECK: Name: D -# CHECK-NEXT: Value: 0x201122 +# CHECKC3: Name: A +# CHECKC3-NEXT: Value: 0x201123 +# CHECKC3: Name: B +# CHECKC3-NEXT: Value: 0x201120 +# CHECKC3: Name: C +# CHECKC3-NEXT: Value: 0x201121 +# CHECKC3: Name: D +# CHECKC3-NEXT: Value: 0x201122 + +# CHECKCDS: Name: A +# CHECKCDS-NEXT: Value: 0x201120 +# CHECKCDS: Name: B +# CHECKCDS-NEXT: Value: 0x201121 +# CHECKCDS: Name: C +# CHECKCDS-NEXT: Value: 0x201122 +# CHECKCDS: Name: D +# CHECKCDS-NEXT: Value: 0x201123 .section .text.A,"ax",@progbits .globl A -- Gitee From c32f1c09ef46da03d7b13487e1ed77452c16e666 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Fri, 15 Dec 2023 12:17:01 -0700 Subject: [PATCH 94/94] fixup! [GlobalISel] Always direct-call IFuncs and Aliases (#74902) The codegen change broke one of the BOLT tests. --- bolt/test/AArch64/ifunc.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/bolt/test/AArch64/ifunc.c b/bolt/test/AArch64/ifunc.c index 8edb913ee70d..79c035ed4537 100644 --- a/bolt/test/AArch64/ifunc.c +++ b/bolt/test/AArch64/ifunc.c @@ -6,7 +6,7 @@ // RUN: -o %t.O0.exe -Wl,-q // RUN: llvm-bolt %t.O0.exe -o %t.O0.bolt.exe \ // RUN: --print-disasm --print-only=_start | \ -// RUN: FileCheck --check-prefix=O0_CHECK %s +// RUN: FileCheck --check-prefix=CHECK %s // RUN: llvm-readelf -aW %t.O0.bolt.exe | \ // RUN: FileCheck --check-prefix=REL_CHECK %s @@ -18,7 +18,7 @@ // RUN: FileCheck --check-prefix=NON_DYN_CHECK %s // RUN: llvm-bolt %t.O3_nopie.exe -o %t.O3_nopie.bolt.exe \ // RUN: --print-disasm --print-only=_start | \ -// RUN: FileCheck --check-prefix=O3_CHECK %s +// RUN: FileCheck --check-prefix=CHECK %s // RUN: llvm-readelf -aW %t.O3_nopie.bolt.exe | \ // RUN: FileCheck --check-prefix=REL_CHECK %s @@ -29,7 +29,7 @@ // RUN: -o %t.O3_pie.exe -Wl,-q // RUN: llvm-bolt %t.O3_pie.exe -o %t.O3_pie.bolt.exe \ // RUN: --print-disasm --print-only=_start | \ -// RUN: FileCheck --check-prefix=O3_CHECK %s +// RUN: FileCheck --check-prefix=CHECK %s // RUN: llvm-readelf -aW %t.O3_pie.bolt.exe | \ // RUN: FileCheck --check-prefix=REL_CHECK %s @@ -39,14 +39,13 @@ // RUN: -T %p/Inputs/iplt.ld -o %t.iplt_O3_pie.exe -Wl,-q // RUN: llvm-bolt %t.iplt_O3_pie.exe -o %t.iplt_O3_pie.bolt.exe \ // RUN: --print-disasm --print-only=_start | \ -// RUN: FileCheck --check-prefix=O3_CHECK %s +// RUN: FileCheck --check-prefix=CHECK %s // RUN: llvm-readelf -aW %t.iplt_O3_pie.bolt.exe | \ // RUN: FileCheck --check-prefix=REL_CHECK %s // NON_DYN_CHECK-NOT: DYNAMIC -// O0_CHECK: adr x{{[0-9]+}}, ifoo -// O3_CHECK: b "{{resolver_foo|ifoo}}{{.*}}@PLT" +// CHECK: b{{l?}} "{{resolver_foo|ifoo}}{{.*}}@PLT" // REL_CHECK: R_AARCH64_IRELATIVE [[#%x,REL_SYMB_ADDR:]] // REL_CHECK: [[#REL_SYMB_ADDR]] {{.*}} FUNC {{.*}} resolver_foo -- Gitee