diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt
index 89462f8a14c1441a4150907279163196eee607ea..f163d45342874d0e8dd2a64a292e735ddf3cd626 100644
--- a/bolt/CMakeLists.txt
+++ b/bolt/CMakeLists.txt
@@ -35,7 +35,8 @@ set(BOLT_ENABLE_RUNTIME_default OFF)
 if ((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64"
      OR CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
     AND (CMAKE_SYSTEM_NAME STREQUAL "Linux"
-         OR CMAKE_SYSTEM_NAME STREQUAL "Darwin"))
+         OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+    AND (NOT CMAKE_CROSSCOMPILING))
   set(BOLT_ENABLE_RUNTIME_default ON)
 endif()
 option(BOLT_ENABLE_RUNTIME "Enable BOLT runtime" ${BOLT_ENABLE_RUNTIME_default})
diff --git a/bolt/include/bolt/Core/AddressMap.h b/bolt/include/bolt/Core/AddressMap.h
new file mode 100644
index 0000000000000000000000000000000000000000..85a9ab4473aafedd192145acaa334208e22ac6e6
--- /dev/null
+++ b/bolt/include/bolt/Core/AddressMap.h
@@ -0,0 +1,79 @@
+//===- bolt/Core/AddressMap.h - Input-output address map --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the AddressMap class used for looking
+// up addresses in the output object.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BOLT_CORE_ADDRESS_MAP_H
+#define BOLT_CORE_ADDRESS_MAP_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCSymbol.h"
+
+#include <optional>
+#include <unordered_map>
+
+namespace llvm {
+
+class MCStreamer;
+
+namespace bolt {
+
+class BinaryContext;
+
+/// Helper class to create a mapping from input entities to output addresses
+/// needed for updating debugging symbols and BAT. We emit a section containing
+/// <input entity, output MCSymbol> pairs to the object file, and JITLink will
+/// transform this into <input entity, output address> pairs. The linker output
+/// can then be parsed and used to establish the mapping.
+///
+/// The entities that can be mapped to output addresses are input addresses and
+/// labels (MCSymbol). Input addresses support one-to-many mapping.
+class AddressMap {
+  static const char *const AddressSectionName;
+  static const char *const LabelSectionName;
+
+  /// Map multiple <input address> to <output address>.
+  using Addr2AddrMapTy = std::unordered_multimap<uint64_t, uint64_t>;
+  Addr2AddrMapTy Address2AddressMap;
+
+  /// Map MCSymbol to its output address. Normally used for temp symbols that
+  /// are not updated by the linker.
+  using Label2AddrMapTy = DenseMap<const MCSymbol *, uint64_t>;
+  Label2AddrMapTy Label2AddrMap;
+
+public:
+  static void emit(MCStreamer &Streamer, BinaryContext &BC);
+  static std::optional<AddressMap> parse(BinaryContext &BC);
+
+  std::optional<uint64_t> lookup(uint64_t InputAddress) const {
+    auto It = Address2AddressMap.find(InputAddress);
+    if (It != Address2AddressMap.end())
+      return It->second;
+    return std::nullopt;
+  }
+
+  std::optional<uint64_t> lookup(const MCSymbol *Symbol) const {
+    auto It = Label2AddrMap.find(Symbol);
+    if (It != Label2AddrMap.end())
+      return It->second;
+    return std::nullopt;
+  }
+
+  std::pair<Addr2AddrMapTy::const_iterator, Addr2AddrMapTy::const_iterator>
+  lookupAll(uint64_t InputAddress) const {
+    return Address2AddressMap.equal_range(InputAddress);
+  }
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/include/bolt/Core/BinaryBasicBlock.h b/bolt/include/bolt/Core/BinaryBasicBlock.h
index 02be9c1d4f118d9edc65202395bb879177cb5412..bc95e2c4de3a11ec2b94e9faf7575ce696856611 100644
--- a/bolt/include/bolt/Core/BinaryBasicBlock.h
+++ b/bolt/include/bolt/Core/BinaryBasicBlock.h
@@ -100,16 +100,6 @@ private:
   using LocSymsTy = std::vector<std::pair<uint32_t, const MCSymbol *>>;
   std::unique_ptr<LocSymsTy> LocSyms;
 
-  /// After output/codegen, map output offsets of instructions in this basic
-  /// block to instruction offsets in the original function. Note that the
-  /// output basic block could be different from the input basic block.
-  /// We only map instructions of interest, such as calls and markers.
-  ///
-  /// We store the offset array in a basic block to facilitate BAT tables
-  /// generation. Otherwise, the mapping could be done at function level.
-  using OffsetTranslationTableTy = std::vector<std::pair<uint32_t, uint32_t>>;
-  std::unique_ptr<OffsetTranslationTableTy> OffsetTranslationTable;
-
   /// Alignment requirements for the block.
   uint32_t Alignment{1};
@@ -828,8 +818,7 @@ public:
     return OutputAddressRange;
   }
 
-  /// Update addresses of special instructions inside this basic block.
-  void updateOutputValues(const MCAsmLayout &Layout);
+  bool hasLocSyms() const { return LocSyms != nullptr; }
 
   /// Return mapping of input offsets to symbols in the output.
   LocSymsTy &getLocSyms() {
@@ -841,19 +830,6 @@
     return const_cast<BinaryBasicBlock *>(this)->getLocSyms();
   }
 
-  /// Return offset translation table for the basic block.
-  OffsetTranslationTableTy &getOffsetTranslationTable() {
-    return OffsetTranslationTable
-               ? *OffsetTranslationTable
-               : *(OffsetTranslationTable =
-                       std::make_unique<OffsetTranslationTableTy>());
-  }
-
-  /// Return offset translation table for the basic block.
-  const OffsetTranslationTableTy &getOffsetTranslationTable() const {
-    return const_cast<BinaryBasicBlock *>(this)->getOffsetTranslationTable();
-  }
-
   /// Return size of the basic block in the output binary.
   uint64_t getOutputSize() const {
     return OutputAddressRange.second - OutputAddressRange.first;
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index 79f91985c4920c521540223141e908644d58701f..ef0a1c6f68320bc74bb43b179d21db09ad52c13a 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -13,6 +13,7 @@
 #ifndef BOLT_CORE_BINARY_CONTEXT_H
 #define BOLT_CORE_BINARY_CONTEXT_H
 
+#include "bolt/Core/AddressMap.h"
 #include "bolt/Core/BinaryData.h"
 #include "bolt/Core/BinarySection.h"
 #include "bolt/Core/DebugData.h"
@@ -221,6 +222,9 @@ class BinaryContext {
   bool ContainsDwarf5{false};
   bool ContainsDwarfLegacy{false};
 
+  /// Mapping from input to output addresses.
+  std::optional<AddressMap> IOAddressMap;
+
   /// Preprocess DWO debug information.
   void preprocessDWODebugInfo();
 
@@ -638,9 +642,22 @@ public:
   /// Total hotness score according to profiling data for this binary.
   uint64_t TotalScore{0};
 
-  /// Binary-wide stats for macro-fusion.
-  uint64_t MissedMacroFusionPairs{0};
-  uint64_t MissedMacroFusionExecCount{0};
+  /// Binary-wide aggregated stats.
+  struct BinaryStats {
+    /// Stats for macro-fusion.
+    uint64_t MissedMacroFusionPairs{0};
+    uint64_t MissedMacroFusionExecCount{0};
+
+    /// Stats for stale profile matching:
+    ///   the total number of basic blocks in the profile
+    uint32_t NumStaleBlocks{0};
+    ///   the number of matched basic blocks
+    uint32_t NumMatchedBlocks{0};
+    ///   the total count of samples in the profile
+    uint64_t StaleSampleCount{0};
+    ///   the count of matched samples
+    uint64_t MatchedSampleCount{0};
+  } Stats;
 
   // Address of the first allocated segment.
   uint64_t FirstAllocAddress{std::numeric_limits<uint64_t>::max()};
@@ -663,6 +680,15 @@ public:
   /// the execution of the binary is completed.
   std::optional<uint64_t> FiniFunctionAddress;
 
+  /// DT_FINI.
+  std::optional<uint64_t> FiniAddress;
+
+  /// DT_FINI_ARRAY. Only used when DT_FINI is not set.
+  std::optional<uint64_t> FiniArrayAddress;
+
+  /// DT_FINI_ARRAYSZ. Only used when DT_FINI is not set.
+  std::optional<uint64_t> FiniArraySize;
+
   /// Page alignment used for code layout.
   uint64_t PageAlign{HugePageSize};
@@ -1195,6 +1221,9 @@ public:
   ///
   /// Return the pair where the first size is for the main part, and the second
   /// size is for the cold one.
+  /// Modify BinaryBasicBlock::OutputAddressRange for each basic block in the
+  /// function in place so that BinaryBasicBlock::getOutputSize() gives the
+  /// emitted size of the basic block.
   std::pair<size_t, size_t> calculateEmittedSize(BinaryFunction &BF,
                                                  bool FixBranches = true);
@@ -1204,8 +1233,8 @@ public:
   uint64_t computeInstructionSize(const MCInst &Inst,
                                   const MCCodeEmitter *Emitter = nullptr) const {
-    if (auto Size = MIB->getAnnotationWithDefault<uint32_t>(Inst, "Size"))
-      return Size;
+    if (std::optional<uint32_t> Size = MIB->getSize(Inst))
+      return *Size;
 
     if (!Emitter)
       Emitter = this->MCE.get();
@@ -1255,6 +1284,9 @@ public:
   /// Return true if the function should be emitted to the output file.
   bool shouldEmit(const BinaryFunction &Function) const;
 
+  /// Dump the assembly representation of MCInst to debug output.
+  void dump(const MCInst &Inst) const;
+
   /// Print the string name for a CFI operation.
   static void printCFI(raw_ostream &OS, const MCCFIInstruction &Inst);
@@ -1330,6 +1362,12 @@ public:
                                    /* DWARFMustBeAtTheEnd */ false));
     return Streamer;
   }
+
+  void setIOAddressMap(AddressMap Map) { IOAddressMap = std::move(Map); }
+  const AddressMap &getIOAddressMap() const {
+    assert(IOAddressMap && "Address map not set yet");
+    return *IOAddressMap;
+  }
 };
 
 template >
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index c393b5b851d99bcce6962c9c50be14a153b1b597..0c62df34fa6ac94f43fff4fa88776c11329ccb18 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -192,9 +192,6 @@ public:
   static constexpr uint64_t COUNT_NO_PROFILE =
       BinaryBasicBlock::COUNT_NO_PROFILE;
 
-  /// We have to use at least 2-byte alignment for functions because of C++ ABI.
-  static constexpr unsigned MinAlign = 2;
-
   static const char TimerGroupName[];
   static const char TimerGroupDesc[];
@@ -322,10 +319,6 @@ private:
   /// Execution halts whenever this function is entered.
   bool TrapsOnEntry{false};
 
-  /// True if the function had an indirect branch with a fixed internal
-  /// destination.
-  bool HasFixedIndirectBranch{false};
-
   /// True if the function is a fragment of another function. This means that
   /// this function could only be entered via its parent or one of its sibling
   /// fragments. It could be entered at any basic block. It can also return
@@ -366,14 +359,15 @@ private:
   std::string ColdCodeSectionName;
 
   /// Parent function fragment for split function fragments.
-  SmallPtrSet<BinaryFunction *, 1> ParentFragments;
+  using FragmentsSetTy = SmallPtrSet<BinaryFunction *, 1>;
+  FragmentsSetTy ParentFragments;
 
   /// Indicate if the function body was folded into another function.
   /// Used by ICF optimization.
   BinaryFunction *FoldedIntoFunction{nullptr};
 
   /// All fragments for a parent function.
-  SmallPtrSet<BinaryFunction *, 1> Fragments;
+  FragmentsSetTy Fragments;
 
   /// The profile data for the number of times the function was executed.
   uint64_t ExecutionCount{COUNT_NO_PROFILE};
@@ -381,7 +375,7 @@ private:
   /// Profile match ratio.
   float ProfileMatchRatio{0.0f};
 
-  /// Raw branch count for this function in the profile
+  /// Raw branch count for this function in the profile.
   uint64_t RawBranchCount{0};
 
   /// Indicates the type of profile the function is using.
@@ -576,9 +570,6 @@ private:
   /// Count the number of functions created.
   static uint64_t Count;
 
-  /// Map offsets of special instructions to addresses in the output.
-  InputOffsetToAddressMapTy InputOffsetToAddressMap;
-
   /// Register alternative function name.
   void addAlternativeName(std::string NewName) {
     Aliases.push_back(std::move(NewName));
@@ -1193,7 +1184,7 @@ public:
 
     if (!Islands->FunctionConstantIslandLabel) {
       Islands->FunctionConstantIslandLabel =
-          BC.Ctx->createNamedTempSymbol("func_const_island");
+          BC.Ctx->getOrCreateSymbol("func_const_island@" + getOneName());
     }
     return Islands->FunctionConstantIslandLabel;
   }
@@ -1203,7 +1194,7 @@ public:
 
     if (!Islands->FunctionColdConstantIslandLabel) {
       Islands->FunctionColdConstantIslandLabel =
-          BC.Ctx->createNamedTempSymbol("func_cold_const_island");
+          BC.Ctx->getOrCreateSymbol("func_cold_const_island@" + getOneName());
     }
     return Islands->FunctionColdConstantIslandLabel;
   }
@@ -1223,14 +1214,7 @@ public:
   }
 
   /// Update output values of the function based on the final \p Layout.
-  void updateOutputValues(const MCAsmLayout &Layout);
-
-  /// Return mapping of input to output addresses. Most users should call
-  /// translateInputToOutputAddress() for address translation.
-  InputOffsetToAddressMapTy &getInputOffsetToAddressMap() {
-    assert(isEmitted() && "cannot use address mapping before code emission");
-    return InputOffsetToAddressMap;
-  }
+  void updateOutputValues(const BOLTLinker &Linker);
 
   /// Register relocation type \p RelType at a given \p Address in the function
   /// against \p Symbol.
@@ -1457,7 +1441,8 @@ public:
   /// Rebuilds BBs layout, ignoring dead BBs. Returns the number of removed
   /// BBs and the removed number of bytes of code.
-  std::pair<unsigned, uint64_t> eraseInvalidBBs();
+  std::pair<unsigned, uint64_t>
+  eraseInvalidBBs(const MCCodeEmitter *Emitter = nullptr);
 
   /// Get the relative order between two basic blocks in the original
   /// layout. The result is > 0 if B occurs before A and < 0 if B
@@ -1729,8 +1714,24 @@ public:
     return *this;
   }
 
-  Align getAlign() const { return Align(Alignment); }
+  uint16_t getMinAlignment() const {
+    // Align data in code BFs minimum to CI alignment
+    if (!size() && hasIslandsInfo())
+      return getConstantIslandAlignment();
+
+    // Minimal code alignment on AArch64 and RISC-V is 4.
+    if (BC.isAArch64() || BC.isRISCV())
+      return 4;
+
+    // We have to use at least 2-byte alignment for functions because
+    // of the C++ ABI.
+    return 2;
+  }
+
+  Align getMinAlign() const { return Align(getMinAlignment()); }
+
   uint16_t getAlignment() const { return Alignment; }
+  Align getAlign() const { return Align(getAlignment()); }
 
   BinaryFunction &setMaxAlignmentBytes(uint16_t MaxAlignBytes) {
     MaxAlignmentBytes = MaxAlignBytes;
@@ -1779,6 +1780,15 @@ public:
     return llvm::is_contained(Fragments, &Other);
   }
 
+  /// Return the child fragments from the parent function.
+  iterator_range<FragmentsSetTy::const_iterator> getFragments() const {
+    return iterator_range<FragmentsSetTy::const_iterator>(Fragments.begin(),
+                                                          Fragments.end());
+  }
+
+  /// Return the parent fragments for split function fragments.
+  FragmentsSetTy *getParentFragments() { return &ParentFragments; }
+
   /// Returns if this function is a parent or child of \p Other function.
   bool isParentOrChildOf(const BinaryFunction &Other) const {
     return isChildOf(Other) || isParentOf(Other);
@@ -2170,6 +2180,11 @@ public:
   /// its code emission.
   bool requiresAddressTranslation() const;
 
+  /// Return true if the linker needs to generate an address map for this
+  /// function. Used for keeping track of the mapping from input to output
+  /// addresses of basic blocks.
+  bool requiresAddressMap() const;
+
   /// Adjust branch instructions to match the CFG.
   ///
   /// As it comes to internal branches, the CFG represents "the ultimate source
diff --git a/bolt/include/bolt/Core/BinarySection.h b/bolt/include/bolt/Core/BinarySection.h
index f1041777926fde1ee06fb3f0bdd4c8e33f1f72c0..92ab6ea0d38e14eabd0f47cc7bb82fa11257dae1 100644
--- a/bolt/include/bolt/Core/BinarySection.h
+++ b/bolt/include/bolt/Core/BinarySection.h
@@ -97,6 +97,8 @@ class BinarySection {
   mutable bool IsReordered{false}; // Have the contents been reordered?
   bool IsAnonymous{false};         // True if the name should not be included
                                    // in the output file.
+  bool IsLinkOnly{false};          // True if the section should not be included
+                                   // in the output file.
 
   uint64_t hash(const BinaryData &BD,
                 std::map<const BinaryData *, uint64_t> &Cache) const;
@@ -373,8 +375,12 @@ public:
   /// Add a dynamic relocation at the given \p Offset.
   void addDynamicRelocation(uint64_t Offset, MCSymbol *Symbol, uint64_t Type,
                             uint64_t Addend, uint64_t Value = 0) {
-    assert(Offset < getSize() && "offset not within section bounds");
-    DynamicRelocations.emplace(Relocation{Offset, Symbol, Type, Addend, Value});
+    addDynamicRelocation(Relocation{Offset, Symbol, Type, Addend, Value});
+  }
+
+  void addDynamicRelocation(const Relocation &Reloc) {
+    assert(Reloc.Offset < getSize() && "offset not within section bounds");
+    DynamicRelocations.emplace(Reloc);
   }
 
   /// Add relocation against the original contents of this section.
@@ -408,6 +414,18 @@ public:
     return Itr != DynamicRelocations.end() ? &*Itr : nullptr;
   }
 
+  std::optional<Relocation> takeDynamicRelocationAt(uint64_t Offset) {
+    Relocation Key{Offset, 0, 0, 0, 0};
+    auto Itr = DynamicRelocations.find(Key);
+
+    if (Itr == DynamicRelocations.end())
+      return std::nullopt;
+
+    Relocation Reloc = *Itr;
+    DynamicRelocations.erase(Itr);
+    return Reloc;
+  }
+
   uint64_t hash(const BinaryData &BD) const {
     std::map<const BinaryData *, uint64_t> Cache;
     return hash(BD, Cache);
@@ -452,6 +470,8 @@ public:
   void setIndex(uint32_t I) { Index = I; }
   void setOutputName(const Twine &Name) { OutputName = Name.str(); }
   void setAnonymous(bool Flag) { IsAnonymous = Flag; }
+  bool isLinkOnly() const { return IsLinkOnly; }
+  void setLinkOnly() { IsLinkOnly = true; }
 
   /// Emit the section as data, possibly with relocations.
   /// Use name \p SectionName for the section during the emission.
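For orientation, a minimal sketch of how the new take-and-re-add interface above might be used to retarget a dynamic relocation; the names Sec, FiniArrayOffset, and NewSym are hypothetical, not part of the patch, and this is presumably the pattern behind updateRtFiniReloc() declared later in RewriteInstance.h:

  // Remove the relocation at FiniArrayOffset and re-add it against NewSym,
  // keeping the original offset, type, and addend.
  if (std::optional<Relocation> Reloc =
          Sec.takeDynamicRelocationAt(FiniArrayOffset)) {
    Reloc->Symbol = NewSym;
    Sec.addDynamicRelocation(*Reloc);
  }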
diff --git a/bolt/include/bolt/Core/HashUtilities.h b/bolt/include/bolt/Core/HashUtilities.h
index 8d445ff837564a3431c8737fdede1f4bae6f75ce..53ea110aa683b9f8501b4287d1e7be5519db076c 100644
--- a/bolt/include/bolt/Core/HashUtilities.h
+++ b/bolt/include/bolt/Core/HashUtilities.h
@@ -20,8 +20,6 @@
 namespace llvm {
 namespace bolt {
 
-uint16_t hash_64_to_16(const uint64_t Hash);
-
 std::string hashInteger(uint64_t Value);
 
 std::string hashSymbol(BinaryContext &BC, const MCSymbol &Symbol);
@@ -35,6 +33,8 @@ using OperandHashFuncTy = function_ref<std::string(const MCOperand &)>;
 std::string hashBlock(BinaryContext &BC, const BinaryBasicBlock &BB,
                       OperandHashFuncTy OperandHashFunc);
 
+std::string hashBlockLoose(BinaryContext &BC, const BinaryBasicBlock &BB);
+
 } // namespace bolt
 } // namespace llvm
 
diff --git a/bolt/include/bolt/Core/Linker.h b/bolt/include/bolt/Core/Linker.h
index 69e1fe431c0b8f6223a01f3a1e26f41ae226449f..1e0876a0e13d9d419b55be8d9ff3978254d490b8 100644
--- a/bolt/include/bolt/Core/Linker.h
+++ b/bolt/include/bolt/Core/Linker.h
@@ -31,6 +31,11 @@ public:
       std::function<void(const BinarySection &Section, uint64_t Address)>;
   using SectionsMapper = std::function<void(SectionMapper)>;
 
+  struct SymbolInfo {
+    uint64_t Address;
+    uint64_t Size;
+  };
+
   virtual ~BOLTLinker() = default;
 
   /// Load and link \p Obj. \p MapSections will be called before the object is
@@ -38,8 +43,16 @@ public:
   /// of a section can be changed by calling the passed SectionMapper.
   virtual void loadObject(MemoryBufferRef Obj, SectionsMapper MapSections) = 0;
 
+  /// Return the address and size of a symbol, or std::nullopt if it cannot be
+  /// found.
+  virtual std::optional<SymbolInfo> lookupSymbolInfo(StringRef Name) const = 0;
+
   /// Return the address of a symbol or std::nullopt if it cannot be found.
-  virtual std::optional<uint64_t> lookupSymbol(StringRef Name) const = 0;
+  std::optional<uint64_t> lookupSymbol(StringRef Name) const {
+    if (const auto Info = lookupSymbolInfo(Name))
+      return Info->Address;
+    return std::nullopt;
+  }
 };
 
 } // namespace bolt
diff --git a/bolt/include/bolt/Core/MCPlus.h b/bolt/include/bolt/Core/MCPlus.h
index b4a72ac274fade22ad9036b47206a10f937b2692..b6a9e73f2347e71faa09beb4ae2aad9ff7fc5747 100644
--- a/bolt/include/bolt/Core/MCPlus.h
+++ b/bolt/include/bolt/Core/MCPlus.h
@@ -32,11 +32,16 @@ namespace MCPlus {
 /// pad and the uint64_t represents the action.
 using MCLandingPad = std::pair<const MCSymbol *, uint64_t>;
 
-/// An extension to MCInst is provided via an extra operand of type MCInst with
-/// ANNOTATION_LABEL opcode (i.e. we are tying an annotation instruction to an
-/// existing one). The annotation instruction contains a list of Immediate
-/// operands. Each operand either contains a value, or is a pointer to
-/// an instance of class MCAnnotation.
+/// An extension to MCInst is provided via extra operands, i.e. operands that
+/// are not used in the instruction assembly. Any kind of metadata can be
+/// attached to MCInst with this "annotation" extension using the MCPlusBuilder
+/// interface.
+///
+/// The first extra operand must be of type kInst with an empty (nullptr)
+/// value. The kInst operand type is unused on most non-VLIW architectures.
+/// We use it to mark the beginning of annotation operands. The rest of the
+/// operands are of Immediate type with annotation info encoded into the value
+/// of the immediate.
 ///
 /// There are 2 distinct groups of annotations. The first group is a first-class
 /// annotation that affects semantics of the instruction, such as an
@@ -55,7 +60,7 @@ using MCLandingPad = std::pair<const MCSymbol *, uint64_t>;
 /// of their corresponding operand.
 ///
 /// Annotations in the second group could be addressed either by name, or by
-/// by and index which could be queried by providing a name.
+/// index which could be queried by providing the name.
 class MCAnnotation {
 public:
   enum Kind {
@@ -66,6 +71,8 @@ public:
     kTailCall,            /// Tail call.
     kConditionalTailCall, /// CTC.
     kOffset,              /// Offset in the function.
+    kLabel,               /// MCSymbol pointing to this instruction.
+    kSize,                /// Size of the instruction.
    kGeneric              /// First generic annotation.
   };
@@ -105,10 +112,11 @@ private:
 /// Return a number of operands in \Inst excluding operands representing
 /// annotations.
 inline unsigned getNumPrimeOperands(const MCInst &Inst) {
-  if (Inst.getNumOperands() > 0 && std::prev(Inst.end())->isInst()) {
-    assert(std::prev(Inst.end())->getInst()->getOpcode() ==
-           TargetOpcode::ANNOTATION_LABEL);
-    return Inst.getNumOperands() - 1;
+  for (signed I = Inst.getNumOperands() - 1; I >= 0; --I) {
+    if (Inst.getOperand(I).isInst())
+      return I;
+    if (!Inst.getOperand(I).isImm())
+      return Inst.getNumOperands();
   }
   return Inst.getNumOperands();
 }
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index cd4676f370e64e27dbfad76e21e2d839f8a8b1b4..1a7f544c1b6a6acf0aa5569111e62e9e5936f208 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -29,6 +29,7 @@
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/RWMutex.h"
 #include 
 #include 
 #include 
@@ -64,7 +65,6 @@ public:
 private:
   /// A struct that represents a single annotation allocator
   struct AnnotationAllocator {
-    SpecificBumpPtrAllocator<MCInst> MCInstAllocator;
    BumpPtrAllocator ValueAllocator;
     std::unordered_set<MCPlus::MCAnnotation *> AnnotationPool;
   };
@@ -96,60 +96,62 @@ private:
     return SignExtend64<56>(ImmValue & 0xff'ffff'ffff'ffffULL);
   }
 
-  MCInst *getAnnotationInst(const MCInst &Inst) const {
-    if (Inst.getNumOperands() == 0)
-      return nullptr;
+  std::optional<unsigned> getFirstAnnotationOpIndex(const MCInst &Inst) const {
+    const unsigned NumPrimeOperands = MCPlus::getNumPrimeOperands(Inst);
+    if (Inst.getNumOperands() == NumPrimeOperands)
+      return std::nullopt;
 
-    const MCOperand &LastOp = Inst.getOperand(Inst.getNumOperands() - 1);
-    if (!LastOp.isInst())
-      return nullptr;
+    assert(Inst.getOperand(NumPrimeOperands).getInst() == nullptr &&
+           "Empty instruction expected.");
 
-    MCInst *AnnotationInst = const_cast<MCInst *>(LastOp.getInst());
-    assert(AnnotationInst->getOpcode() == TargetOpcode::ANNOTATION_LABEL);
+    return NumPrimeOperands + 1;
+  }
 
-    return AnnotationInst;
+  MCInst::iterator getAnnotationInstOp(MCInst &Inst) const {
+    for (MCInst::iterator Iter = Inst.begin(); Iter != Inst.end(); ++Iter) {
+      if (Iter->isInst()) {
+        assert(Iter->getInst() == nullptr && "Empty instruction expected.");
+        return Iter;
+      }
+    }
+    return Inst.end();
   }
 
-  void removeAnnotationInst(MCInst &Inst) const {
-    assert(getAnnotationInst(Inst) && "Expected annotation instruction.");
-    Inst.erase(std::prev(Inst.end()));
-    assert(!getAnnotationInst(Inst) &&
-           "More than one annotation instruction detected.");
+  void removeAnnotations(MCInst &Inst) const {
+    Inst.erase(getAnnotationInstOp(Inst), Inst.end());
   }
 
-  void setAnnotationOpValue(MCInst &Inst, unsigned Index, int64_t Value,
-                            AllocatorIdTy AllocatorId = 0) {
-    MCInst *AnnotationInst = getAnnotationInst(Inst);
-    if (!AnnotationInst) {
-      AnnotationAllocator &Allocator = getAnnotationAllocator(AllocatorId);
-      AnnotationInst = new (Allocator.MCInstAllocator.Allocate()) MCInst();
-      AnnotationInst->setOpcode(TargetOpcode::ANNOTATION_LABEL);
-      Inst.addOperand(MCOperand::createInst(AnnotationInst));
+  void setAnnotationOpValue(MCInst &Inst, unsigned Index, int64_t Value) const {
+    const int64_t AnnotationValue = encodeAnnotationImm(Index, Value);
+    const std::optional<unsigned> FirstAnnotationOp =
+        getFirstAnnotationOpIndex(Inst);
+    if (!FirstAnnotationOp) {
+      Inst.addOperand(MCOperand::createInst(nullptr));
+      Inst.addOperand(MCOperand::createImm(AnnotationValue));
+      return;
     }
 
-    const int64_t AnnotationValue = encodeAnnotationImm(Index, Value);
-    for (int I = AnnotationInst->getNumOperands() - 1; I >= 0; --I) {
-      int64_t ImmValue = AnnotationInst->getOperand(I).getImm();
+    for (unsigned I = *FirstAnnotationOp; I < Inst.getNumOperands(); ++I) {
+      const int64_t ImmValue = Inst.getOperand(I).getImm();
       if (extractAnnotationIndex(ImmValue) == Index) {
-        AnnotationInst->getOperand(I).setImm(AnnotationValue);
+        Inst.getOperand(I).setImm(AnnotationValue);
         return;
       }
     }
 
-    AnnotationInst->addOperand(MCOperand::createImm(AnnotationValue));
+    Inst.addOperand(MCOperand::createImm(AnnotationValue));
   }
 
   std::optional<int64_t> getAnnotationOpValue(const MCInst &Inst,
                                               unsigned Index) const {
-    const MCInst *AnnotationInst = getAnnotationInst(Inst);
-    if (!AnnotationInst)
+    std::optional<unsigned> FirstAnnotationOp = getFirstAnnotationOpIndex(Inst);
+    if (!FirstAnnotationOp)
       return std::nullopt;
 
-    for (int I = AnnotationInst->getNumOperands() - 1; I >= 0; --I) {
-      int64_t ImmValue = AnnotationInst->getOperand(I).getImm();
-      if (extractAnnotationIndex(ImmValue) == Index) {
+    for (unsigned I = *FirstAnnotationOp; I < Inst.getNumOperands(); ++I) {
+      const int64_t ImmValue = Inst.getOperand(I).getImm();
+      if (extractAnnotationIndex(ImmValue) == Index)
        return extractAnnotationValue(ImmValue);
-      }
     }
 
     return std::nullopt;
@@ -166,20 +168,21 @@ protected:
   /// Names of non-standard annotations.
   SmallVector AnnotationNames;
 
-  /// Allocate the TailCall annotation value. Clients of the target-specific
+  /// A mutex that is used to control parallel accesses to
+  /// AnnotationNameIndexMap and AnnotationNames.
+  mutable llvm::sys::RWMutex AnnotationNameMutex;
+
+  /// Set the TailCall annotation value to true. Clients of the target-specific
   /// MCPlusBuilder classes must use convert/lower/create* interfaces instead.
-  void setTailCall(MCInst &Inst);
+  void setTailCall(MCInst &Inst) const;
 
   /// Transfer annotations from \p SrcInst to \p DstInst.
   void moveAnnotations(MCInst &&SrcInst, MCInst &DstInst) const {
-    assert(!getAnnotationInst(DstInst) &&
-           "Destination instruction should not have annotations.");
-    const MCInst *AnnotationInst = getAnnotationInst(SrcInst);
-    if (!AnnotationInst)
-      return;
+    MCInst::iterator AnnotationOp = getAnnotationInstOp(SrcInst);
+    for (MCInst::iterator Iter = AnnotationOp; Iter != SrcInst.end(); ++Iter)
+      DstInst.addOperand(*Iter);
 
-    DstInst.addOperand(MCOperand::createInst(AnnotationInst));
-    removeAnnotationInst(SrcInst);
+    SrcInst.erase(AnnotationOp, SrcInst.end());
   }
 
 public:
@@ -384,7 +387,6 @@ public:
       Allocator.AnnotationPool.clear();
       Allocator.ValueAllocator.Reset();
-      Allocator.MCInstAllocator.DestroyAll();
     }
   }
 
@@ -613,12 +615,15 @@ public:
   virtual bool isMoveMem2Reg(const MCInst &Inst) const { return false; }
 
-  virtual bool isLoad(const MCInst &Inst) const {
-    llvm_unreachable("not implemented");
-    return false;
+  virtual bool mayLoad(const MCInst &Inst) const {
+    return Info->get(Inst.getOpcode()).mayLoad();
+  }
+
+  virtual bool mayStore(const MCInst &Inst) const {
+    return Info->get(Inst.getOpcode()).mayStore();
   }
 
-  virtual bool isStore(const MCInst &Inst) const {
+  virtual bool isAArch64Exclusive(const MCInst &Inst) const {
     llvm_unreachable("not implemented");
     return false;
   }
@@ -639,9 +644,12 @@ public:
     return false;
   }
 
-  /// If non-zero, this is used to fill the executable space with instructions
-  /// that will trap. Defaults to 0.
-  virtual unsigned getTrapFillValue() const { return 0; }
+  /// Used to fill the executable space with instructions
+  /// that will trap.
+  virtual StringRef getTrapFillValue() const {
+    llvm_unreachable("not implemented");
+    return StringRef();
+  }
 
   /// Interface and basic functionality of a MCInstMatcher. The idea is to make
   /// it easy to match one or more MCInsts against a tree-like pattern and
@@ -1116,20 +1124,19 @@ public:
   std::optional<MCPlus::MCLandingPad> getEHInfo(const MCInst &Inst) const;
 
   /// Add handler and action info for call instruction.
-  void addEHInfo(MCInst &Inst, const MCPlus::MCLandingPad &LP);
+  void addEHInfo(MCInst &Inst, const MCPlus::MCLandingPad &LP) const;
 
   /// Update exception-handling info for the invoke instruction \p Inst.
   /// Return true on success and false otherwise, e.g. if the instruction is
   /// not an invoke.
-  bool updateEHInfo(MCInst &Inst, const MCPlus::MCLandingPad &LP);
+  bool updateEHInfo(MCInst &Inst, const MCPlus::MCLandingPad &LP) const;
 
   /// Return non-negative GNU_args_size associated with the instruction
   /// or -1 if there's no associated info.
   int64_t getGnuArgsSize(const MCInst &Inst) const;
 
   /// Add the value of GNU_args_size to Inst if it already has EH info.
-  void addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize,
-                      AllocatorIdTy AllocId = 0);
+  void addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize) const;
 
   /// Return jump table addressed by this instruction.
   uint64_t getJumpTable(const MCInst &Inst) const;
@@ -1142,7 +1149,7 @@ public:
                     AllocatorIdTy AllocId = 0);
 
   /// Disassociate instruction with a jump table.
-  bool unsetJumpTable(MCInst &Inst);
+  bool unsetJumpTable(MCInst &Inst) const;
 
   /// Return destination of conditional tail call instruction if \p Inst is one.
   std::optional<uint64_t> getConditionalTailCall(const MCInst &Inst) const;
 
   /// Mark the \p Instruction as a conditional tail call, and set its
   /// destination address if it is known. If \p Instruction was already marked,
   /// update its destination with \p Dest.
-  bool setConditionalTailCall(MCInst &Inst, uint64_t Dest = 0);
+  bool setConditionalTailCall(MCInst &Inst, uint64_t Dest = 0) const;
 
   /// If \p Inst was marked as a conditional tail call convert it to a regular
   /// branch. Return true if the instruction was converted.
-  bool unsetConditionalTailCall(MCInst &Inst);
+  bool unsetConditionalTailCall(MCInst &Inst) const;
 
   /// Return offset of \p Inst in the original function, if available.
   std::optional<uint32_t> getOffset(const MCInst &Inst) const;
 
   /// Return offset of \p Inst, or \p Default if offset is not set.
   uint32_t getOffsetWithDefault(const MCInst &Inst, uint32_t Default) const;
 
   /// Set offset of \p Inst in the original function.
-  bool setOffset(MCInst &Inst, uint32_t Offset, AllocatorIdTy AllocatorId = 0);
+  bool setOffset(MCInst &Inst, uint32_t Offset) const;
 
   /// Remove offset annotation.
-  bool clearOffset(MCInst &Inst);
+  bool clearOffset(MCInst &Inst) const;
+
+  /// Return the label of \p Inst, if available.
+  MCSymbol *getLabel(const MCInst &Inst) const;
+
+  /// Set the label of \p Inst. This label will be emitted right before \p Inst
+  /// is emitted to MCStreamer.
+  bool setLabel(MCInst &Inst, MCSymbol *Label);
+
+  /// Get instruction size specified via annotation.
+  std::optional<uint32_t> getSize(const MCInst &Inst) const;
+
+  /// Set instruction size.
+  void setSize(MCInst &Inst, uint32_t Size) const;
 
   /// Return MCSymbol that represents a target of this instruction at a given
   /// operand number \p OpNum. If there's no symbol associated with
@@ -1730,8 +1750,51 @@ public:
     return true;
   }
 
+  /// Extract a symbol and an addend out of the fixup value expression.
+  ///
+  /// Only the following limited expression types are supported:
+  ///   Symbol + Addend
+  ///   Symbol + Constant + Addend
+  ///   Const + Addend
+  ///   Symbol
+  std::pair<MCSymbol *, uint64_t> extractFixupExpr(const MCFixup &Fixup) const {
+    uint64_t Addend = 0;
+    MCSymbol *Symbol = nullptr;
+    const MCExpr *ValueExpr = Fixup.getValue();
+    if (ValueExpr->getKind() == MCExpr::Binary) {
+      const auto *BinaryExpr = cast<MCBinaryExpr>(ValueExpr);
+      assert(BinaryExpr->getOpcode() == MCBinaryExpr::Add &&
+             "unexpected binary expression");
+      const MCExpr *LHS = BinaryExpr->getLHS();
+      if (LHS->getKind() == MCExpr::Constant) {
+        Addend = cast<MCConstantExpr>(LHS)->getValue();
+      } else if (LHS->getKind() == MCExpr::Binary) {
+        const auto *LHSBinaryExpr = cast<MCBinaryExpr>(LHS);
+        assert(LHSBinaryExpr->getOpcode() == MCBinaryExpr::Add &&
+               "unexpected binary expression");
+        const MCExpr *LLHS = LHSBinaryExpr->getLHS();
+        assert(LLHS->getKind() == MCExpr::SymbolRef && "unexpected LLHS");
+        Symbol = const_cast<MCSymbol *>(this->getTargetSymbol(LLHS));
+        const MCExpr *RLHS = LHSBinaryExpr->getRHS();
+        assert(RLHS->getKind() == MCExpr::Constant && "unexpected RLHS");
+        Addend = cast<MCConstantExpr>(RLHS)->getValue();
+      } else {
+        assert(LHS->getKind() == MCExpr::SymbolRef && "unexpected LHS");
+        Symbol = const_cast<MCSymbol *>(this->getTargetSymbol(LHS));
+      }
+      const MCExpr *RHS = BinaryExpr->getRHS();
+      assert(RHS->getKind() == MCExpr::Constant && "unexpected RHS");
+      Addend += cast<MCConstantExpr>(RHS)->getValue();
+    } else {
+      assert(ValueExpr->getKind() == MCExpr::SymbolRef && "unexpected value");
+      Symbol = const_cast<MCSymbol *>(this->getTargetSymbol(ValueExpr));
+    }
+    return std::make_pair(Symbol, Addend);
+  }
+
   /// Return annotation index matching the \p Name.
   std::optional<unsigned> getAnnotationIndex(StringRef Name) const {
+    std::shared_lock<llvm::sys::RWMutex> Lock(AnnotationNameMutex);
     auto AI = AnnotationNameIndexMap.find(Name);
     if (AI != AnnotationNameIndexMap.end())
       return AI->second;
     return std::nullopt;
   }
 
   /// Return annotation index matching the \p Name. Create a new index if the
   /// \p Name wasn't registered previously.
   unsigned getOrCreateAnnotationIndex(StringRef Name) {
-    auto AI = AnnotationNameIndexMap.find(Name);
-    if (AI != AnnotationNameIndexMap.end())
-      return AI->second;
+    if (std::optional<unsigned> Index = getAnnotationIndex(Name))
+      return *Index;
 
+    std::unique_lock<llvm::sys::RWMutex> Lock(AnnotationNameMutex);
     const unsigned Index =
         AnnotationNameIndexMap.size() + MCPlus::MCAnnotation::kGeneric;
     AnnotationNameIndexMap.insert(std::make_pair(Name, Index));
@@ -1765,8 +1828,7 @@ public:
     if (!std::is_trivial<ValueType>::value)
       Allocator.AnnotationPool.insert(A);
-    setAnnotationOpValue(Inst, Index, reinterpret_cast<int64_t>(A),
-                         AllocatorId);
+    setAnnotationOpValue(Inst, Index, reinterpret_cast<int64_t>(A));
     return A->getValue();
   }
@@ -1899,21 +1961,21 @@ public:
   ///
   /// Return true if the annotation was removed, false if the annotation
   /// was not present.
-  bool removeAnnotation(MCInst &Inst, unsigned Index);
+  bool removeAnnotation(MCInst &Inst, unsigned Index) const;
 
   /// Remove annotation associated with \p Name.
   ///
   /// Return true if the annotation was removed, false if the annotation
   /// was not present.
-  bool removeAnnotation(MCInst &Inst, StringRef Name) {
+  bool removeAnnotation(MCInst &Inst, StringRef Name) const {
     const auto Index = getAnnotationIndex(Name);
     if (!Index)
       return false;
     return removeAnnotation(Inst, *Index);
   }
 
-  /// Remove meta-data, but don't destroy it.
-  void stripAnnotations(MCInst &Inst, bool KeepTC = false);
+  /// Remove meta-data from the instruction, but don't destroy it.
+  void stripAnnotations(MCInst &Inst, bool KeepTC = false) const;
 
   virtual InstructionListType
   createInstrumentedIndirectCall(MCInst &&CallInst, MCSymbol *HandlerFuncAddr,
diff --git a/bolt/include/bolt/Core/Relocation.h b/bolt/include/bolt/Core/Relocation.h
index 5ae288a91986e523e93f506dd1bf4f5143925d92..bdea698b9531bb84c16cd0cf82a6e511c44d3843 100644
--- a/bolt/include/bolt/Core/Relocation.h
+++ b/bolt/include/bolt/Core/Relocation.h
@@ -97,6 +97,10 @@ struct Relocation {
   /// Return true if relocation type is for thread local storage.
   static bool isTLS(uint64_t Type);
 
+  /// Return true if relocation type is for referencing a specific instruction
+  /// (as opposed to a function, basic block, etc.).
+  static bool isInstructionReference(uint64_t Type);
+
   /// Return code for a NONE relocation
   static uint64_t getNone();
 
@@ -119,6 +123,10 @@ struct Relocation {
   /// otherwise.
   bool isRelative() const { return isRelative(Type); }
 
+  /// Return true if this relocation is R_*_IRELATIVE type. Return false
+  /// otherwise.
+  bool isIRelative() const { return isIRelative(Type); }
+
   /// Emit relocation at the current \p Streamer's position. The caller is
   /// responsible for setting the position correctly.
   size_t emit(MCStreamer *Streamer) const;
diff --git a/bolt/include/bolt/Passes/ReorderFunctions.h b/bolt/include/bolt/Passes/ReorderFunctions.h
index 52156a600791cb6871bb6ad34cedfa51574b3896..27094bee771ad5293693d553d1d4e59eff31029f 100644
--- a/bolt/include/bolt/Passes/ReorderFunctions.h
+++ b/bolt/include/bolt/Passes/ReorderFunctions.h
@@ -32,6 +32,7 @@ public:
     RT_EXEC_COUNT,
     RT_HFSORT,
     RT_HFSORT_PLUS,
+    RT_CDS,
     RT_PETTIS_HANSEN,
     RT_RANDOM,
     RT_USER
diff --git a/bolt/include/bolt/Rewrite/JITLinkLinker.h b/bolt/include/bolt/Rewrite/JITLinkLinker.h
index 104c75bea0c26934343fcd50283e854ba4fc8cca..1c41a26ac256350c09f9c3a78c0bc7a2bb869298 100644
--- a/bolt/include/bolt/Rewrite/JITLinkLinker.h
+++ b/bolt/include/bolt/Rewrite/JITLinkLinker.h
@@ -17,7 +17,6 @@
 #include "bolt/Rewrite/ExecutableFileMemoryManager.h"
 #include "llvm/ExecutionEngine/JITLink/JITLinkDylib.h"
 
-#include <map>
 #include <memory>
 #include <vector>
 
@@ -35,7 +34,7 @@ private:
   std::unique_ptr<ExecutableFileMemoryManager> MM;
   jitlink::JITLinkDylib Dylib{"main"};
   std::vector<jitlink::JITLinkMemoryManager::FinalizedAlloc> Allocs;
-  std::map<std::string, uint64_t> Symtab;
+  StringMap<SymbolInfo> Symtab;
 
 public:
   JITLinkLinker(BinaryContext &BC,
@@ -43,7 +42,7 @@
   ~JITLinkLinker();
 
   void loadObject(MemoryBufferRef Obj, SectionsMapper MapSections) override;
-  std::optional<uint64_t> lookupSymbol(StringRef Name) const override;
+  std::optional<SymbolInfo> lookupSymbolInfo(StringRef Name) const override;
 
   static SmallVector orderedBlocks(const jitlink::Section &Section);
diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h
index 072c8109241d21d041d1d3db2907f1e6b99ffaee..261a7337535b29b989316ad6a55f85fd4e910a7b 100644
--- a/bolt/include/bolt/Rewrite/RewriteInstance.h
+++ b/bolt/include/bolt/Rewrite/RewriteInstance.h
@@ -95,6 +95,15 @@ private:
   /// from meta data in the file.
   void discoverFileObjects();
 
+  /// Check whether we should use DT_FINI or DT_FINI_ARRAY for instrumentation.
+  /// DT_FINI is preferred; DT_FINI_ARRAY is only used when no DT_FINI entry
+  /// was found.
+  Error discoverRtFiniAddress();
+
+  /// If DT_FINI_ARRAY is used for instrumentation, update the relocation of
+  /// its first entry to point to the instrumentation library's fini address.
+  void updateRtFiniReloc();
+
   /// Create and initialize metadata rewriters for this instance.
   void initializeMetadataManager();
@@ -190,7 +199,7 @@ private:
   void mapAllocatableSections(BOLTLinker::SectionMapper MapSection);
 
   /// Update output object's values based on the final \p Layout.
-  void updateOutputValues(const MCAsmLayout &Layout);
+  void updateOutputValues(const BOLTLinker &Linker);
 
   /// Rewrite back all functions (hopefully optimized) that fit in the original
   /// memory footprint for that function. If the function is now larger and does
@@ -415,6 +424,7 @@ private:
 
   /// Common section names.
   static StringRef getEHFrameSectionName() { return ".eh_frame"; }
+  static StringRef getRelaDynSectionName() { return ".rela.dyn"; }
 
   /// An instance of the input binary we are processing, externally owned.
   llvm::object::ELFObjectFileBase *InputFile;
@@ -503,11 +513,11 @@ private:
   };
 
   /// AArch64 PLT sections.
-  const PLTSectionInfo AArch64_PLTSections[3] = {
-      {".plt"}, {".iplt"}, {nullptr}};
+  const PLTSectionInfo AArch64_PLTSections[4] = {
+      {".plt"}, {".plt.got"}, {".iplt"}, {nullptr}};
 
   /// RISCV PLT sections.
-  const PLTSectionInfo RISCV_PLTSections[3] = {{".plt"}, {nullptr}};
+  const PLTSectionInfo RISCV_PLTSections[2] = {{".plt"}, {nullptr}};
 
   /// Return PLT information for a section with \p SectionName or nullptr
   /// if the section is not PLT.
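Before the implementation below, a minimal sketch of the round trip described in AddressMap.h (Streamer, BC, and InputAddress are assumed to be in scope; emit/parse/lookup are the interfaces declared above): BOLT writes the map sections at code-emission time, JITLink resolves the symbol halves to output addresses, and the rewriter parses the result back:

  // 1. During code emission, write <input, output-symbol> pairs into the
  //    link-only map sections.
  AddressMap::emit(Streamer, BC);
  // 2. After JITLink has laid out the code, recover the resolved pairs.
  if (std::optional<AddressMap> Map = AddressMap::parse(BC))
    BC.setIOAddressMap(std::move(*Map));
  // 3. Translate a single input address to its output address.
  if (std::optional<uint64_t> Out = BC.getIOAddressMap().lookup(InputAddress))
    outs() << "0x" << Twine::utohexstr(*Out) << "\n";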
diff --git a/bolt/lib/Core/AddressMap.cpp b/bolt/lib/Core/AddressMap.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..efa376d408db882b2fb3b937cab287063b5c79fc
--- /dev/null
+++ b/bolt/lib/Core/AddressMap.cpp
@@ -0,0 +1,118 @@
+//===- bolt/Core/AddressMap.cpp - Input-output Address Map ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Core/AddressMap.h"
+#include "bolt/Core/BinaryContext.h"
+#include "bolt/Core/BinaryFunction.h"
+#include "bolt/Core/BinarySection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/DataExtractor.h"
+
+namespace llvm {
+namespace bolt {
+
+const char *const AddressMap::AddressSectionName = ".bolt.addr2addr_map";
+const char *const AddressMap::LabelSectionName = ".bolt.label2addr_map";
+
+static void emitAddress(MCStreamer &Streamer, uint64_t InputAddress,
+                        const MCSymbol *OutputLabel) {
+  Streamer.emitIntValue(InputAddress, 8);
+  Streamer.emitSymbolValue(OutputLabel, 8);
+}
+
+static void emitLabel(MCStreamer &Streamer, const MCSymbol *OutputLabel) {
+  Streamer.emitIntValue(reinterpret_cast<uint64_t>(OutputLabel), 8);
+  Streamer.emitSymbolValue(OutputLabel, 8);
+}
+
+void AddressMap::emit(MCStreamer &Streamer, BinaryContext &BC) {
+  // Mark map sections as link-only to avoid allocation in the output file.
+  const unsigned Flags = BinarySection::getFlags(/*IsReadOnly*/ true,
+                                                 /*IsText*/ false,
+                                                 /*IsAllocatable*/ true);
+  BC.registerOrUpdateSection(AddressSectionName, ELF::SHT_PROGBITS, Flags)
+      .setLinkOnly();
+  BC.registerOrUpdateSection(LabelSectionName, ELF::SHT_PROGBITS, Flags)
+      .setLinkOnly();
+
+  for (const auto &[BFAddress, BF] : BC.getBinaryFunctions()) {
+    if (!BF.requiresAddressMap())
+      continue;
+
+    for (const auto &BB : BF) {
+      if (!BB.getLabel()->isDefined())
+        continue;
+
+      Streamer.switchSection(BC.getDataSection(LabelSectionName));
+      emitLabel(Streamer, BB.getLabel());
+
+      if (!BB.hasLocSyms())
+        continue;
+
+      Streamer.switchSection(BC.getDataSection(AddressSectionName));
+      for (auto [Offset, Symbol] : BB.getLocSyms())
+        emitAddress(Streamer, BFAddress + Offset, Symbol);
+    }
+  }
+}
+
+std::optional<AddressMap> AddressMap::parse(BinaryContext &BC) {
+  auto AddressMapSection = BC.getUniqueSectionByName(AddressSectionName);
+  auto LabelMapSection = BC.getUniqueSectionByName(LabelSectionName);
+
+  if (!AddressMapSection && !LabelMapSection)
+    return std::nullopt;
+
+  AddressMap Parsed;
+
+  const size_t EntrySize = 2 * BC.AsmInfo->getCodePointerSize();
+  auto parseSection =
+      [&](BinarySection &Section,
+          function_ref<void(uint64_t, uint64_t)> InsertCallback) {
+        StringRef Buffer = Section.getOutputContents();
+        assert(Buffer.size() % EntrySize == 0 && "Unexpected address map size");
+
+        DataExtractor DE(Buffer, BC.AsmInfo->isLittleEndian(),
+                         BC.AsmInfo->getCodePointerSize());
+        DataExtractor::Cursor Cursor(0);
+
+        while (Cursor && !DE.eof(Cursor)) {
+          const uint64_t Input = DE.getAddress(Cursor);
+          const uint64_t Output = DE.getAddress(Cursor);
+          InsertCallback(Input, Output);
+        }
+
+        assert(Cursor && "Error reading address map section");
+        BC.deregisterSection(Section);
+      };
+
+  if (AddressMapSection) {
+    Parsed.Address2AddressMap.reserve(AddressMapSection->getOutputSize() /
+                                      EntrySize);
+    parseSection(*AddressMapSection, [&](uint64_t Input, uint64_t Output) {
+      if (!Parsed.Address2AddressMap.count(Input))
+        Parsed.Address2AddressMap.insert({Input, Output});
+    });
+  }
+
+  if (LabelMapSection) {
+    Parsed.Label2AddrMap.reserve(LabelMapSection->getOutputSize() / EntrySize);
+    parseSection(*LabelMapSection, [&](uint64_t Input, uint64_t Output) {
+      assert(!Parsed.Label2AddrMap.count(
+                 reinterpret_cast<const MCSymbol *>(Input)) &&
+             "Duplicate label entry detected.");
+      Parsed.Label2AddrMap.insert(
+          {reinterpret_cast<const MCSymbol *>(Input), Output});
+    });
+  }
+
+  return Parsed;
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/lib/Core/BinaryBasicBlock.cpp b/bolt/lib/Core/BinaryBasicBlock.cpp
index b271b86ec69920456088371dd80efc9f062be813..984bc6dbd220ab05904e3ee909e46f56983c2304 100644
--- a/bolt/lib/Core/BinaryBasicBlock.cpp
+++ b/bolt/lib/Core/BinaryBasicBlock.cpp
@@ -14,7 +14,6 @@
 #include "bolt/Core/BinaryContext.h"
 #include "bolt/Core/BinaryFunction.h"
 #include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/Errc.h"
 
@@ -613,27 +612,5 @@ BinaryBasicBlock *BinaryBasicBlock::splitAt(iterator II) {
   return NewBlock;
 }
 
-void BinaryBasicBlock::updateOutputValues(const MCAsmLayout &Layout) {
-  if (!LocSyms)
-    return;
-
-  const uint64_t BBAddress = getOutputAddressRange().first;
-  const uint64_t BBOffset = Layout.getSymbolOffset(*getLabel());
-  for (const auto &LocSymKV : *LocSyms) {
-    const uint32_t InputFunctionOffset = LocSymKV.first;
-    const uint32_t OutputOffset = static_cast<uint32_t>(
-        Layout.getSymbolOffset(*LocSymKV.second) - BBOffset);
-    getOffsetTranslationTable().emplace_back(
-        std::make_pair(OutputOffset, InputFunctionOffset));
-
-    // Update reverse (relative to BAT) address lookup table for function.
-    if (getFunction()->requiresAddressTranslation()) {
-      getFunction()->getInputOffsetToAddressMap().emplace(
-          std::make_pair(InputFunctionOffset, OutputOffset + BBAddress));
-    }
-  }
-  LocSyms.reset(nullptr);
-}
-
 } // namespace bolt
 } // namespace llvm
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index 2d2b35ee2bd9c919fc0ee6c3c9310932d84aa125..6761771a2ee6f18f224398de5c4b5a3e70d7176f 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -503,6 +503,9 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address,
   // Is one of the targets __builtin_unreachable?
   bool HasUnreachable = false;
 
+  // Does one of the entries match the function start address?
+  bool HasStartAsEntry = false;
+
   // Number of targets other than __builtin_unreachable.
   uint64_t NumRealEntries = 0;
 
@@ -567,14 +570,21 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address,
       continue;
     }
 
+    // Function start is another special case. It is allowed in the jump
+    // table, but we need at least one other regular entry to distinguish the
+    // table from, e.g., a function pointer array.
+    if (Value == BF.getAddress()) {
+      HasStartAsEntry = true;
+      addEntryAddress(Value);
+      continue;
+    }
+
     // Function or one of its fragments.
     const BinaryFunction *TargetBF = getBinaryFunctionContainingAddress(Value);
-
-    bool DoesBelongToFunction = BF.containsAddress(Value) ||
-                                (TargetBF && TargetBF->isParentOrChildOf(BF));
-
-    // We assume that a jump table cannot have function start as an entry.
-    if (!DoesBelongToFunction || Value == BF.getAddress()) {
+    const bool DoesBelongToFunction =
+        BF.containsAddress(Value) ||
+        (TargetBF && TargetBF->isParentOrChildOf(BF));
+    if (!DoesBelongToFunction) {
       LLVM_DEBUG({
         if (!BF.containsAddress(Value)) {
           dbgs() << "FAIL: function doesn't contain this address\n";
@@ -589,8 +599,6 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address,
             }
           }
         }
-        if (Value == BF.getAddress())
-          dbgs() << "FAIL: jump table cannot have function start as an entry\n";
       });
       break;
     }
@@ -611,9 +619,9 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address,
   }
 
   // It's a jump table if the number of real entries is more than 1, or there's
-  // one real entry and "unreachable" targets. If there are only multiple
-  // "unreachable" targets, then it's not a jump table.
-  return NumRealEntries + HasUnreachable >= 2;
+  // one real entry and one or more special targets. If there are only
+  // multiple special targets, then it's not a jump table.
+  return NumRealEntries + (HasUnreachable || HasStartAsEntry) >= 2;
 }
 
 void BinaryContext::populateJumpTables() {
@@ -1696,6 +1704,15 @@ bool BinaryContext::shouldEmit(const BinaryFunction &Function) const {
   return HasRelocations || Function.isSimple();
 }
 
+void BinaryContext::dump(const MCInst &Inst) const {
+  if (LLVM_UNLIKELY(!InstPrinter)) {
+    dbgs() << "Cannot dump: InstPrinter is not initialized.\n";
+    return;
+  }
+  InstPrinter->printInst(&Inst, 0, "", *STI, dbgs());
+  dbgs() << "\n";
+}
+
 void BinaryContext::printCFI(raw_ostream &OS, const MCCFIInstruction &Inst) {
   uint32_t Operation = Inst.getOperation();
   switch (Operation) {
@@ -1752,10 +1769,10 @@ void BinaryContext::printCFI(raw_ostream &OS, const MCCFIInstruction &Inst) {
 }
 
 MarkerSymType BinaryContext::getMarkerType(const SymbolRef &Symbol) const {
-  // For aarch64, the ABI defines mapping symbols so we identify data in the
-  // code section (see IHI0056B). $x identifies a symbol starting code or the
-  // end of a data chunk inside code, $d identifies the start of data.
-  if (!isAArch64() || ELFSymbolRef(Symbol).getSize())
+  // For aarch64 and riscv, the ABI defines mapping symbols so we identify
+  // data in the code section (see IHI0056B). $x identifies a symbol starting
+  // code or the end of a data chunk inside code, $d identifies the start of
+  // data.
+  if ((!isAArch64() && !isRISCV()) || ELFSymbolRef(Symbol).getSize())
     return MarkerSymType::NONE;
 
   Expected<StringRef> NameOrError = Symbol.getName();
@@ -1855,6 +1872,10 @@ void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction,
   }
   if (std::optional<uint32_t> Offset = MIB->getOffset(Instruction))
     OS << " # Offset: " << *Offset;
+  if (std::optional<uint32_t> Size = MIB->getSize(Instruction))
+    OS << " # Size: " << *Size;
+  if (MCSymbol *Label = MIB->getLabel(Instruction))
+    OS << " # Label: " << *Label;
 
   MIB->printAnnotations(Instruction, OS);
 
@@ -2259,14 +2280,36 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) {
   MCAsmLayout Layout(Assembler);
   Assembler.layout(Layout);
 
+  // Obtain fragment sizes.
+  std::vector<uint64_t> FragmentSizes;
+  // Main fragment size.
   const uint64_t HotSize =
       Layout.getSymbolOffset(*EndLabel) - Layout.getSymbolOffset(*StartLabel);
-  const uint64_t ColdSize =
-      std::accumulate(SplitLabels.begin(), SplitLabels.end(), 0ULL,
-                      [&](const uint64_t Accu, const LabelRange &Labels) {
-                        return Accu + Layout.getSymbolOffset(*Labels.second) -
-                               Layout.getSymbolOffset(*Labels.first);
-                      });
+  FragmentSizes.push_back(HotSize);
+  // Split fragment sizes.
+  uint64_t ColdSize = 0;
+  for (const auto &Labels : SplitLabels) {
+    uint64_t Size = Layout.getSymbolOffset(*Labels.second) -
+                    Layout.getSymbolOffset(*Labels.first);
+    FragmentSizes.push_back(Size);
+    ColdSize += Size;
+  }
+
+  // Populate new start and end offsets of each basic block.
+  uint64_t FragmentIndex = 0;
+  for (FunctionFragment &FF : BF.getLayout().fragments()) {
+    BinaryBasicBlock *PrevBB = nullptr;
+    for (BinaryBasicBlock *BB : FF) {
+      const uint64_t BBStartOffset = Layout.getSymbolOffset(*(BB->getLabel()));
+      BB->setOutputStartAddress(BBStartOffset);
+      if (PrevBB)
+        PrevBB->setOutputEndAddress(BBStartOffset);
+      PrevBB = BB;
+    }
+    if (PrevBB)
+      PrevBB->setOutputEndAddress(FragmentSizes[FragmentIndex]);
+    FragmentIndex++;
+  }
 
   // Clean-up the effect of the code emission.
   for (const MCSymbol &Symbol : Assembler.symbols()) {
diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index c4129615ac32de564b095a68a555dbf14cf7cf95..9c7905955835bfbd6bd3f8a114b8f791b5226651 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -161,9 +161,17 @@ private:
   /// \p FirstInstr indicates if \p NewLoc represents the first instruction
   /// in a sequence, such as a function fragment.
   ///
+  /// If the \p NewLoc location matches \p PrevLoc, no new line number entry
+  /// will be created, the function will return \p PrevLoc, and \p InstrLabel
+  /// will be ignored. Otherwise, the caller should use \p InstrLabel to mark
+  /// the corresponding instruction by emitting \p InstrLabel before it.
+  /// If \p InstrLabel is set by the caller, its value will be used with
+  /// \p NewLoc. If it was nullptr on entry, it will be populated with a
+  /// pointer to a new temp symbol used with \p NewLoc.
+  ///
   /// Return new current location which is either \p NewLoc or \p PrevLoc.
   SMLoc emitLineInfo(const BinaryFunction &BF, SMLoc NewLoc, SMLoc PrevLoc,
-                     bool FirstInstr);
+                     bool FirstInstr, MCSymbol *&InstrLabel);
 
   /// Use \p FunctionEndSymbol to mark the end of the line info sequence.
   /// Note that it does not automatically result in the insertion of the EOS
@@ -214,6 +222,10 @@ void BinaryEmitter::emitAll(StringRef OrgSecPrefix) {
   }
 
   emitDataSections(OrgSecPrefix);
+
+  // TODO: Enable for Mach-O once BinaryContext::getDataSection supports it.
+  if (BC.isELF())
+    AddressMap::emit(Streamer, BC);
 }
 
 void BinaryEmitter::emitFunctions() {
@@ -305,7 +317,7 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function,
   // tentative layout.
   Section->ensureMinAlignment(Align(opts::AlignFunctions));
 
-  Streamer.emitCodeAlignment(Align(BinaryFunction::MinAlign), &*BC.STI);
+  Streamer.emitCodeAlignment(Function.getMinAlign(), &*BC.STI);
   uint16_t MaxAlignBytes = FF.isSplitFragment()
                                ? Function.getMaxColdAlignmentBytes()
                                : Function.getMaxAlignmentBytes();
@@ -376,7 +388,7 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function,
   }
 
   if (opts::MarkFuncs)
-    Streamer.emitIntValue(BC.MIB->getTrapFillValue(), 1);
+    Streamer.emitBytes(BC.MIB->getTrapFillValue());
 
   // Emit CFI end
   if (Function.hasCFI())
@@ -420,7 +432,7 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF,
     // case, the call site entries in that LSDA have 0 as offset to the landing
     // pad, which the runtime interprets as "no handler". To prevent this,
     // insert some padding.
-    Streamer.emitIntValue(BC.MIB->getTrapFillValue(), 1);
+    Streamer.emitBytes(BC.MIB->getTrapFillValue());
   }
 
   // Track the first emitted instruction with debug info.
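Note that getTrapFillValue() now returns the byte sequence itself, emitted via emitBytes(), rather than a single-byte integer, so targets with multi-byte trap encodings can be supported. A hypothetical override for illustration only (0xCC is the x86 INT3 encoding):

  // Illustrative sketch of a target override, not part of this patch.
  StringRef getTrapFillValue() const override { return StringRef("\xcc", 1); }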
@@ -479,19 +491,39 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF,
       // are relaxable, we should be safe.
     }
 
-    if (!EmitCodeOnly && opts::UpdateDebugSections && BF.getDWARFUnit()) {
-      LastLocSeen = emitLineInfo(BF, Instr.getLoc(), LastLocSeen, FirstInstr);
-      FirstInstr = false;
+    if (!EmitCodeOnly) {
+      // A symbol to be emitted before the instruction to mark its location.
+      MCSymbol *InstrLabel = BC.MIB->getLabel(Instr);
+
+      if (opts::UpdateDebugSections && BF.getDWARFUnit()) {
+        LastLocSeen = emitLineInfo(BF, Instr.getLoc(), LastLocSeen,
+                                   FirstInstr, InstrLabel);
+        FirstInstr = false;
+      }
+
+      // Prepare to tag this location with a label if we need to keep track of
+      // the location of calls/returns for BOLT address translation maps.
+      if (BF.requiresAddressTranslation() && BC.MIB->getOffset(Instr)) {
+        const uint32_t Offset = *BC.MIB->getOffset(Instr);
+        if (!InstrLabel)
+          InstrLabel = BC.Ctx->createTempSymbol();
+        BB->getLocSyms().emplace_back(Offset, InstrLabel);
+      }
+
+      if (InstrLabel)
+        Streamer.emitLabel(InstrLabel);
     }
 
-    // Prepare to tag this location with a label if we need to keep track of
-    // the location of calls/returns for BOLT address translation maps
-    if (!EmitCodeOnly && BF.requiresAddressTranslation() &&
-        BC.MIB->getOffset(Instr)) {
-      const uint32_t Offset = *BC.MIB->getOffset(Instr);
-      MCSymbol *LocSym = BC.Ctx->createTempSymbol();
-      Streamer.emitLabel(LocSym);
-      BB->getLocSyms().emplace_back(Offset, LocSym);
+    // Emit sized NOPs via the MCAsmBackend::writeNopData() interface on x86.
+    // This is a workaround for invalid NOPs handling by the asm/disasm layer.
+    if (BC.MIB->isNoop(Instr) && BC.isX86()) {
+      if (std::optional<uint32_t> Size = BC.MIB->getSize(Instr)) {
+        SmallString<15> Code;
+        raw_svector_ostream VecOS(Code);
+        BC.MAB->writeNopData(VecOS, *Size, BC.STI.get());
+        Streamer.emitBytes(Code);
+        continue;
+      }
     }
 
     Streamer.emitInstruction(Instr, *BC.STI);
@@ -654,7 +686,8 @@ void BinaryEmitter::emitConstantIslands(BinaryFunction &BF, bool EmitColdPart,
 }
 
 SMLoc BinaryEmitter::emitLineInfo(const BinaryFunction &BF, SMLoc NewLoc,
-                                  SMLoc PrevLoc, bool FirstInstr) {
+                                  SMLoc PrevLoc, bool FirstInstr,
+                                  MCSymbol *&InstrLabel) {
   DWARFUnit *FunctionCU = BF.getDWARFUnit();
   const DWARFDebugLine::LineTable *FunctionLineTable = BF.getDWARFLineTable();
   assert(FunctionCU && "cannot emit line info for function without CU");
@@ -704,12 +737,12 @@ SMLoc BinaryEmitter::emitLineInfo(const BinaryFunction &BF, SMLoc NewLoc,
   const MCDwarfLoc &DwarfLoc = BC.Ctx->getCurrentDwarfLoc();
   BC.Ctx->clearDwarfLocSeen();
 
-  MCSymbol *LineSym = BC.Ctx->createTempSymbol();
-  Streamer.emitLabel(LineSym);
+  if (!InstrLabel)
+    InstrLabel = BC.Ctx->createTempSymbol();
 
   BC.getDwarfLineTable(FunctionUnitIndex)
       .getMCLineSections()
-      .addLineEntry(MCDwarfLineEntry(LineSym, DwarfLoc),
+      .addLineEntry(MCDwarfLineEntry(InstrLabel, DwarfLoc),
                     Streamer.getCurrentSectionOnly());
 
   return NewLoc;
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 5b44a76dc8c385fa3d8f3b492b5008240e554acb..49a9dd902120dc3f5cf22c9980bc1d9c096e517c 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -25,7 +25,6 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Demangle/Demangle.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/MC/MCExpr.h"
@@ -59,6 +58,7 @@
 extern cl::OptionCategory BoltRelocCategory;
 
 extern cl::opt<bool> EnableBAT;
 extern cl::opt<bool> Instrument;
+extern cl::opt<bool> KeepNops;
 extern cl::opt<bool> StrictMode;
 extern cl::opt<bool> UpdateDebugSections;
 extern cl::opt<unsigned> Verbosity;
@@ -110,6 +110,13 @@ cl::opt<bool>
     cl::desc("try to preserve basic block alignment"),
     cl::cat(BoltOptCategory));
 
+static cl::opt<bool> PrintOutputAddressRange(
+    "print-output-address-range",
+    cl::desc(
+        "print output address range for each basic block in the function when "
+        "BinaryFunction::print is called"),
+    cl::Hidden, cl::cat(BoltOptCategory));
+
 cl::opt<bool>
     PrintDynoStats("dyno-stats",
                    cl::desc("print execution info based on profile"),
@@ -165,8 +172,6 @@ bool shouldPrint(const BinaryFunction &Function) {
 namespace llvm {
 namespace bolt {
 
-constexpr unsigned BinaryFunction::MinAlign;
-
 template <typename R> static bool emptyRange(const R &Range) {
   return Range.begin() == Range.end();
 }
@@ -325,7 +330,8 @@ void BinaryFunction::markUnreachableBlocks() {
 // Any unnecessary fallthrough jumps revealed after calling eraseInvalidBBs
 // will be cleaned up by fixBranches().
-std::pair<unsigned, uint64_t> BinaryFunction::eraseInvalidBBs() {
+std::pair<unsigned, uint64_t>
+BinaryFunction::eraseInvalidBBs(const MCCodeEmitter *Emitter) {
   DenseSet<const BinaryBasicBlock *> InvalidBBs;
   unsigned Count = 0;
   uint64_t Bytes = 0;
@@ -334,7 +340,7 @@ std::pair<unsigned, uint64_t> BinaryFunction::eraseInvalidBBs() {
       assert(!isEntryPoint(*BB) && "all entry blocks must be valid");
       InvalidBBs.insert(BB);
       ++Count;
-      Bytes += BC.computeCodeSize(BB->begin(), BB->end());
+      Bytes += BC.computeCodeSize(BB->begin(), BB->end(), Emitter);
     }
   }
 
@@ -433,8 +439,6 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) {
   OS << "\n  IsSplit     : " << isSplit();
   OS << "\n  BB Count    : " << size();
 
-  if (HasFixedIndirectBranch)
-    OS << "\n  HasFixedIndirectBranch : true";
   if (HasUnknownControlFlow)
     OS << "\n  Unknown CF  : true";
   if (getPersonalityFunction())
@@ -515,6 +519,11 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) {
     OS << BB->getName() << " (" << BB->size()
        << " instructions, align : " << BB->getAlignment() << ")\n";
 
+    if (opts::PrintOutputAddressRange)
+      OS << formatv("  Output Address Range: [{0:x}, {1:x}) ({2} bytes)\n",
+                    BB->getOutputAddressRange().first,
+                    BB->getOutputAddressRange().second, BB->getOutputSize());
+
     if (isEntryPoint(*BB)) {
       if (MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(*BB))
         OS << "  Secondary Entry Point: " << EntrySymbol->getName() << '\n';
@@ -536,7 +545,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) {
     if (BB->getCFIState() >= 0)
       OS << "  CFI State : " << BB->getCFIState() << '\n';
     if (opts::EnableBAT) {
-      OS << "  Input offset: " << Twine::utohexstr(BB->getInputOffset())
+      OS << "  Input offset: 0x" << Twine::utohexstr(BB->getInputOffset())
          << "\n";
     }
     if (!BB->pred_empty()) {
@@ -1119,7 +1128,7 @@ void BinaryFunction::handleIndirectBranch(MCInst &Instruction, uint64_t Size,
     Instruction.clear();
     MIB->createUncondBranch(Instruction, TargetSymbol, BC.Ctx.get());
     TakenBranches.emplace_back(Offset, IndirectTarget - getAddress());
-    HasFixedIndirectBranch = true;
+    addEntryPointAtOffset(IndirectTarget - getAddress());
   } else {
     MIB->convertJmpToTailCall(Instruction);
     BC.addInterproceduralReference(this, IndirectTarget);
@@ -1174,6 +1183,13 @@ bool BinaryFunction::disassemble() {
   // basic block.
   Labels[0] = Ctx->createNamedTempSymbol("BB0");
 
+  // Map offsets in the function to a label that should always point to the
+  // corresponding instruction. This is used for labels that shouldn't point to
+  // the start of a basic block but always to a specific instruction. This is
This is + // used, for example, on RISC-V where %pcrel_lo relocations point to the + // corresponding %pcrel_hi. + LabelsMapType InstructionLabels; + uint64_t Size = 0; // instruction size for (uint64_t Offset = 0; Offset < getSize(); Offset += Size) { MCInst Instruction; @@ -1330,9 +1346,23 @@ bool BinaryFunction::disassemble() { ItrE = Relocations.lower_bound(Offset + Size); Itr != ItrE; ++Itr) { const Relocation &Relocation = Itr->second; + MCSymbol *Symbol = Relocation.Symbol; + + if (Relocation::isInstructionReference(Relocation.Type)) { + uint64_t RefOffset = Relocation.Value - getAddress(); + LabelsMapType::iterator LI = InstructionLabels.find(RefOffset); + + if (LI == InstructionLabels.end()) { + Symbol = BC.Ctx->createNamedTempSymbol(); + InstructionLabels.emplace(RefOffset, Symbol); + } else { + Symbol = LI->second; + } + } + int64_t Value = Relocation.Value; const bool Result = BC.MIB->replaceImmWithSymbolRef( - Instruction, Relocation.Symbol, Relocation.Addend, Ctx.get(), Value, + Instruction, Symbol, Relocation.Addend, Ctx.get(), Value, Relocation.Type); (void)Result; assert(Result && "cannot replace immediate with relocation"); @@ -1361,12 +1391,19 @@ add_instruction: // NOTE: disassembly loses the correct size information for noops. // E.g. nopw 0x0(%rax,%rax,1) is 9 bytes, but re-encoded it's only // 5 bytes. Preserve the size info using annotations. - MIB->addAnnotation(Instruction, "Size", static_cast<uint32_t>(Size)); + MIB->setSize(Instruction, Size); } addInstruction(Offset, std::move(Instruction)); } + for (auto [Offset, Label] : InstructionLabels) { + InstrMapType::iterator II = Instructions.find(Offset); + assert(II != Instructions.end() && "reference to non-existing instruction"); + + BC.MIB->setLabel(II->second, Label); + } + // Reset symbolizer for the disassembler. BC.SymbolicDisAsm->setSymbolizer(nullptr); @@ -1866,9 +1903,6 @@ bool BinaryFunction::postProcessIndirectBranches( LastIndirectJumpBB->updateJumpTableSuccessors(); } - if (HasFixedIndirectBranch) - return false; - // Validate that all data references to function offsets are claimed by // recognized jump tables. Register externally referenced blocks as entry // points. @@ -1973,7 +2007,7 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) { } } if (LastNonNop && !MIB->getOffset(*LastNonNop)) - MIB->setOffset(*LastNonNop, static_cast<uint32_t>(Offset), AllocatorId); + MIB->setOffset(*LastNonNop, static_cast<uint32_t>(Offset)); }; for (auto I = Instructions.begin(), E = Instructions.end(); I != E; ++I) { @@ -1996,7 +2030,7 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) { if (MIB->isNoop(Instr) && !MIB->getOffset(Instr)) { // If "Offset" annotation is not present, set it and mark the nop for // deletion. - MIB->setOffset(Instr, static_cast<uint32_t>(Offset), AllocatorId); + MIB->setOffset(Instr, static_cast<uint32_t>(Offset)); // Annotate ordinary nops, so we can safely delete them if required. 
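The InstructionLabels map above implements a two-pass scheme: during disassembly, at most one temporary symbol is created per referenced offset, and after the loop every symbol is bound to the instruction at that offset via setLabel. A minimal standalone sketch of the same idea, with a hypothetical Insn struct and string labels standing in for MCInst and MCSymbol:

#include <cassert>
#include <cstdint>
#include <map>
#include <string>

struct Insn {
  std::string Mnemonic;
  std::string Label; // empty until a label is bound
};

int main() {
  std::map<uint64_t, Insn> Instructions = {{0x0, {"auipc", ""}},
                                           {0x4, {"lw", ""}}};
  // Pass 1: a %pcrel_lo at offset 0x4 references the %pcrel_hi instruction
  // at offset 0x0; create at most one label per referenced offset.
  std::map<uint64_t, std::string> InstructionLabels;
  const uint64_t RefOffset = 0x0;
  auto LI = InstructionLabels.find(RefOffset);
  if (LI == InstructionLabels.end())
    LI = InstructionLabels.emplace(RefOffset, ".Ltmp0").first;
  // Pass 2: bind every collected label to its instruction.
  for (const auto &[Offset, Label] : InstructionLabels) {
    auto II = Instructions.find(Offset);
    assert(II != Instructions.end() && "reference to non-existing instruction");
    II->second.Label = Label;
  }
  assert(Instructions.at(0x0).Label == ".Ltmp0");
  return 0;
}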
MIB->addAnnotation(Instr, "NOP", static_cast<uint64_t>(1), AllocatorId); } @@ -2221,8 +2255,8 @@ void BinaryFunction::calculateMacroOpFusionStats() { << Twine::utohexstr(getAddress() + Offset) << " in function " << *this << "; executed " << BB.getKnownExecutionCount() << " times.\n"); - ++BC.MissedMacroFusionPairs; - BC.MissedMacroFusionExecCount += BB.getKnownExecutionCount(); + ++BC.Stats.MissedMacroFusionPairs; + BC.Stats.MissedMacroFusionExecCount += BB.getKnownExecutionCount(); } } @@ -2277,6 +2311,13 @@ void BinaryFunction::removeConditionalTailCalls() { assert(CTCTargetLabel && "symbol expected for conditional tail call"); MCInst TailCallInstr; BC.MIB->createTailCall(TailCallInstr, CTCTargetLabel, BC.Ctx.get()); + + // Move offset from CTCInstr to TailCallInstr. + if (const std::optional<uint32_t> Offset = BC.MIB->getOffset(*CTCInstr)) { + BC.MIB->setOffset(TailCallInstr, *Offset); + BC.MIB->clearOffset(*CTCInstr); + } + // Link new BBs to the original input offset of the BB where the CTC // is, so we can map samples recorded in new BBs back to the original BB // seen in the input binary (if using BAT) @@ -2849,6 +2890,14 @@ bool BinaryFunction::requiresAddressTranslation() const { return opts::EnableBAT || hasSDTMarker() || hasPseudoProbe(); } +bool BinaryFunction::requiresAddressMap() const { + if (isInjected()) + return false; + + return opts::UpdateDebugSections || isMultiEntry() || + requiresAddressTranslation(); +} + uint64_t BinaryFunction::getInstructionCount() const { uint64_t Count = 0; for (const BinaryBasicBlock &BB : blocks()) @@ -3143,6 +3192,10 @@ void BinaryFunction::dumpGraphToFile(std::string Filename) const { } bool BinaryFunction::validateCFG() const { + // Skip the validation of CFG after it is finalized. + if (CurrentState == State::CFG_Finalized) + return true; + bool Valid = true; for (BinaryBasicBlock *BB : BasicBlocks) Valid &= BB->validateSuccessorInvariants(); @@ -3322,7 +3375,7 @@ void BinaryFunction::propagateGnuArgsSizeInfo( } } else if (BC.MIB->isInvoke(Instr)) { // Add the value of GNU_args_size as an extra operand to invokes. 
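The GNU_args_size propagation that follows keeps the value as an annotation on each invoke and only materializes a CFI directive where the value actually changes within a fragment. A standalone sketch of that run-length style of emission (the values are purely illustrative):

#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

int main() {
  // One entry per invoke, in layout order; nullopt = no annotation present.
  std::vector<std::optional<int64_t>> GnuArgsSize = {16, 16, std::nullopt, 0};
  int64_t CurrentGnuArgsSize = 0; // reset at the start of each fragment
  for (const std::optional<int64_t> &S : GnuArgsSize) {
    if (!S || *S == CurrentGnuArgsSize)
      continue; // no CFI needed: value unchanged at this call site
    std::cout << "emit DW_CFA_GNU_args_size " << *S << '\n';
    CurrentGnuArgsSize = *S;
  }
  return 0;
}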
- BC.MIB->addGnuArgsSize(Instr, CurrentGnuArgsSize, AllocId); + BC.MIB->addGnuArgsSize(Instr, CurrentGnuArgsSize); } ++II; } @@ -4016,7 +4069,7 @@ void BinaryFunction::calculateLoopInfo() { } } -void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { +void BinaryFunction::updateOutputValues(const BOLTLinker &Linker) { if (!isEmitted()) { assert(!isInjected() && "injected function should be emitted"); setOutputAddress(getAddress()); @@ -4024,16 +4077,17 @@ void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { return; } - const uint64_t BaseAddress = getCodeSection()->getOutputAddress(); + const auto SymbolInfo = Linker.lookupSymbolInfo(getSymbol()->getName()); + assert(SymbolInfo && "Cannot find function entry symbol"); + setOutputAddress(SymbolInfo->Address); + setOutputSize(SymbolInfo->Size); + if (BC.HasRelocations || isInjected()) { - const uint64_t StartOffset = Layout.getSymbolOffset(*getSymbol()); - const uint64_t EndOffset = Layout.getSymbolOffset(*getFunctionEndLabel()); - setOutputAddress(BaseAddress + StartOffset); - setOutputSize(EndOffset - StartOffset); if (hasConstantIsland()) { - const uint64_t DataOffset = - Layout.getSymbolOffset(*getFunctionConstantIslandLabel()); - setOutputDataAddress(BaseAddress + DataOffset); + const auto DataAddress = + Linker.lookupSymbol(getFunctionConstantIslandLabel()->getName()); + assert(DataAddress && "Cannot find function CI symbol"); + setOutputDataAddress(*DataAddress); for (auto It : Islands->Offsets) { const uint64_t OldOffset = It.first; BinaryData *BD = BC.getBinaryDataAtAddress(getAddress() + OldOffset); @@ -4041,8 +4095,11 @@ void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { continue; MCSymbol *Symbol = It.second; - const uint64_t NewOffset = Layout.getSymbolOffset(*Symbol); - BD->setOutputLocation(*getCodeSection(), NewOffset); + const auto NewAddress = Linker.lookupSymbol(Symbol->getName()); + assert(NewAddress && "Cannot find CI symbol"); + auto &Section = *getCodeSection(); + const auto NewOffset = *NewAddress - Section.getOutputAddress(); + BD->setOutputLocation(Section, NewOffset); } } if (isSplit()) { @@ -4052,7 +4109,6 @@ void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { // If fragment is empty, cold section might not exist if (FF.empty() && ColdSection.getError()) continue; - const uint64_t ColdBaseAddress = ColdSection->getOutputAddress(); const MCSymbol *ColdStartSymbol = getSymbol(FF.getFragmentNum()); // If fragment is empty, symbol might have not been emitted @@ -4061,31 +4117,24 @@ void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { continue; assert(ColdStartSymbol && ColdStartSymbol->isDefined() && "split function should have defined cold symbol"); - const MCSymbol *ColdEndSymbol = - getFunctionEndLabel(FF.getFragmentNum()); - assert(ColdEndSymbol && ColdEndSymbol->isDefined() && - "split function should have defined cold end symbol"); - const uint64_t ColdStartOffset = - Layout.getSymbolOffset(*ColdStartSymbol); - const uint64_t ColdEndOffset = Layout.getSymbolOffset(*ColdEndSymbol); - FF.setAddress(ColdBaseAddress + ColdStartOffset); - FF.setImageSize(ColdEndOffset - ColdStartOffset); + const auto ColdStartSymbolInfo = + Linker.lookupSymbolInfo(ColdStartSymbol->getName()); + assert(ColdStartSymbolInfo && "Cannot find cold start symbol"); + FF.setAddress(ColdStartSymbolInfo->Address); + FF.setImageSize(ColdStartSymbolInfo->Size); if (hasConstantIsland()) { - const uint64_t DataOffset = - 
Layout.getSymbolOffset(*getFunctionColdConstantIslandLabel()); - setOutputColdDataAddress(ColdBaseAddress + DataOffset); + const auto DataAddress = Linker.lookupSymbol( + getFunctionColdConstantIslandLabel()->getName()); + assert(DataAddress && "Cannot find cold CI symbol"); + setOutputColdDataAddress(*DataAddress); } } } - } else { - setOutputAddress(getAddress()); - setOutputSize(Layout.getSymbolOffset(*getFunctionEndLabel())); } // Update basic block output ranges for the debug info, if we have // secondary entry points in the symbol table to update or if writing BAT. - if (!opts::UpdateDebugSections && !isMultiEntry() && - !requiresAddressTranslation()) + if (!requiresAddressMap()) return; // Output ranges should match the input if the body hasn't changed. @@ -4114,15 +4163,24 @@ void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { assert(FragmentBaseAddress == getOutputAddress()); } - const uint64_t BBOffset = Layout.getSymbolOffset(*BB->getLabel()); - const uint64_t BBAddress = FragmentBaseAddress + BBOffset; + // Injected functions likely will fail lookup, as they have no + // input range. Just assign the BB the output address of the + // function. + auto MaybeBBAddress = BC.getIOAddressMap().lookup(BB->getLabel()); + const uint64_t BBAddress = MaybeBBAddress ? *MaybeBBAddress + : BB->isSplit() ? FF.getAddress() + : getOutputAddress(); BB->setOutputStartAddress(BBAddress); - if (PrevBB) + if (PrevBB) { + assert(PrevBB->getOutputAddressRange().first <= BBAddress && + "Bad output address for basic block."); + assert((PrevBB->getOutputAddressRange().first != BBAddress || + !hasInstructions() || !PrevBB->getNumNonPseudos()) && + "Bad output address for basic block."); PrevBB->setOutputEndAddress(BBAddress); + } PrevBB = BB; - - BB->updateOutputValues(Layout); } PrevBB->setOutputEndAddress(PrevBB->isSplit() @@ -4175,9 +4233,8 @@ uint64_t BinaryFunction::translateInputToOutputAddress(uint64_t Address) const { // Check if the address is associated with an instruction that is tracked // by address translation. - auto KV = InputOffsetToAddressMap.find(Address - getAddress()); - if (KV != InputOffsetToAddressMap.end()) - return KV->second; + if (auto OutputAddress = BC.getIOAddressMap().lookup(Address)) + return *OutputAddress; // FIXME: #18950828 - we rely on relative offsets inside basic blocks to stay // intact. Instead we can use pseudo instructions and/or annotations. @@ -4299,10 +4356,11 @@ MCInst *BinaryFunction::getInstructionAtOffset(uint64_t Offset) { } if (MCInst *LastInstr = BB->getLastNonPseudoInstr()) { - const uint32_t Size = - BC.MIB->getAnnotationWithDefault(*LastInstr, "Size"); - if (BB->getEndOffset() - Offset == Size) - return LastInstr; + if (std::optional Size = BC.MIB->getSize(*LastInstr)) { + if (BB->getEndOffset() - Offset == Size) { + return LastInstr; + } + } } return nullptr; @@ -4471,7 +4529,7 @@ void BinaryFunction::addRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t Offset = Address - getAddress(); LLVM_DEBUG(dbgs() << "BOLT-DEBUG: addRelocation in " << formatv("{0}@{1:x} against {2}\n", *this, Offset, - Symbol->getName())); + (Symbol ? Symbol->getName() : ""))); bool IsCI = BC.isAArch64() && isInConstantIsland(Address); std::map &Rels = IsCI ? 
Islands->Relocations : Relocations; diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt index a4612fb93f8c349e3d1f9c39fbf1373b95042f93..c913179ebcc517ef17650ea5de30a48a12c3185f 100644 --- a/bolt/lib/Core/CMakeLists.txt +++ b/bolt/lib/Core/CMakeLists.txt @@ -11,6 +11,7 @@ set(LLVM_LINK_COMPONENTS ) add_llvm_library(LLVMBOLTCore + AddressMap.cpp BinaryBasicBlock.cpp BinaryContext.cpp BinaryData.cpp diff --git a/bolt/lib/Core/DynoStats.cpp b/bolt/lib/Core/DynoStats.cpp index ee40eefd6f7c24e1691ae8b7156d7e5832f9985f..5dd55e13e5b31fdea55eb3071d8e62f8c5368b54 100644 --- a/bolt/lib/Core/DynoStats.cpp +++ b/bolt/lib/Core/DynoStats.cpp @@ -215,10 +215,10 @@ DynoStats getDynoStats(BinaryFunction &BF) { } } - if (BC.MIB->isStore(Instr)) { + if (BC.MIB->mayStore(Instr)) { Stats[DynoStats::STORES] += BBExecutionCount; } - if (BC.MIB->isLoad(Instr)) { + if (BC.MIB->mayLoad(Instr)) { Stats[DynoStats::LOADS] += BBExecutionCount; } if (!BC.MIB->isCall(Instr)) diff --git a/bolt/lib/Core/Exceptions.cpp b/bolt/lib/Core/Exceptions.cpp index 667f1757e13d7101859025ec9b67bd25a865b099..b0bfa7fc052085acf540b0692b492ee7abc735e1 100644 --- a/bolt/lib/Core/Exceptions.cpp +++ b/bolt/lib/Core/Exceptions.cpp @@ -112,13 +112,18 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, uint64_t Offset = getLSDAAddress() - LSDASectionAddress; assert(Data.isValidOffset(Offset) && "wrong LSDA address"); - uint8_t LPStartEncoding = Data.getU8(&Offset); - uint64_t LPStart = 0; - // Convert to offset if LPStartEncoding is typed absptr DW_EH_PE_absptr - if (std::optional MaybeLPStart = Data.getEncodedPointer( - &Offset, LPStartEncoding, Offset + LSDASectionAddress)) - LPStart = (LPStartEncoding && 0xFF == 0) ? *MaybeLPStart - : *MaybeLPStart - Address; + const uint8_t LPStartEncoding = Data.getU8(&Offset); + uint64_t LPStart = Address; + if (LPStartEncoding != dwarf::DW_EH_PE_omit) { + std::optional MaybeLPStart = Data.getEncodedPointer( + &Offset, LPStartEncoding, Offset + LSDASectionAddress); + if (!MaybeLPStart) { + errs() << "BOLT-ERROR: unsupported LPStartEncoding: " + << (unsigned)LPStartEncoding << '\n'; + exit(1); + } + LPStart = *MaybeLPStart; + } const uint8_t TTypeEncoding = Data.getU8(&Offset); LSDATypeEncoding = TTypeEncoding; @@ -175,30 +180,13 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, uint64_t LandingPad = *Data.getEncodedPointer( &CallSitePtr, CallSiteEncoding, CallSitePtr + LSDASectionAddress); uint64_t ActionEntry = Data.getULEB128(&CallSitePtr); - - uint64_t LPOffset = LPStart + LandingPad; - uint64_t LPAddress = Address + LPOffset; - - // Verify if landing pad code is located outside current function - // Support landing pad to builtin_unreachable - if (LPAddress < Address || LPAddress > Address + getSize()) { - BinaryFunction *Fragment = - BC.getBinaryFunctionContainingAddress(LPAddress); - assert(Fragment != nullptr && - "BOLT-ERROR: cannot find landing pad fragment"); - BC.addInterproceduralReference(this, Fragment->getAddress()); - BC.processInterproceduralReferences(); - assert(isParentOrChildOf(*Fragment) && - "BOLT-ERROR: cannot have landing pads in different functions"); - setHasIndirectTargetToSplitFragment(true); - BC.addFragmentsToSkip(this); - return; - } + if (LandingPad) + LandingPad += LPStart; if (opts::PrintExceptions) { outs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start) << ", 0x" << Twine::utohexstr(RangeBase + Start + Length) - << "); landing pad: 0x" << Twine::utohexstr(LPOffset) + << "); landing pad: 0x" << 
Twine::utohexstr(LandingPad) << "; action entry: 0x" << Twine::utohexstr(ActionEntry) << "\n"; outs() << " current offset is " << (CallSitePtr - CallSiteTableStart) << '\n'; @@ -206,7 +194,24 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, // Create a handler entry if necessary. MCSymbol *LPSymbol = nullptr; - if (LPOffset) { + if (LandingPad) { + // Verify if landing pad code is located outside current function + // Support landing pad to builtin_unreachable + if (LandingPad < Address || LandingPad > Address + getSize()) { + BinaryFunction *Fragment = + BC.getBinaryFunctionContainingAddress(LandingPad); + assert(Fragment != nullptr && + "BOLT-ERROR: cannot find landing pad fragment"); + BC.addInterproceduralReference(this, Fragment->getAddress()); + BC.processInterproceduralReferences(); + assert(isParentOrChildOf(*Fragment) && + "BOLT-ERROR: cannot have landing pads in different functions"); + setHasIndirectTargetToSplitFragment(true); + BC.addFragmentsToSkip(this); + return; + } + + const uint64_t LPOffset = LandingPad - getAddress(); if (!getInstructionAtOffset(LPOffset)) { if (opts::Verbosity >= 1) errs() << "BOLT-WARNING: landing pad " << Twine::utohexstr(LPOffset) diff --git a/bolt/lib/Core/HashUtilities.cpp b/bolt/lib/Core/HashUtilities.cpp index 0752eaeabef85069cfa3a6dbebe7326a49446630..88f01e4f936d30c2a34b415b911a91c12bb8393f 100644 --- a/bolt/lib/Core/HashUtilities.cpp +++ b/bolt/lib/Core/HashUtilities.cpp @@ -130,5 +130,43 @@ std::string hashBlock(BinaryContext &BC, const BinaryBasicBlock &BB, return HashString; } +/// A "loose" hash of a basic block to use with the stale profile matching. The +/// computed value will be the same for blocks with minor changes (such as +/// reordering of instructions or using different operands) but may result in +/// collisions that need to be resolved by a stronger hashing. +std::string hashBlockLoose(BinaryContext &BC, const BinaryBasicBlock &BB) { + // The hash is computed by creating a string of all lexicographically ordered + // instruction opcodes, which is then hashed with std::hash. + std::set Opcodes; + for (const MCInst &Inst : BB) { + // Skip pseudo instructions and nops. + if (BC.MIB->isPseudo(Inst) || BC.MIB->isNoop(Inst)) + continue; + + // Ignore unconditional jumps, as they can be added / removed as a result + // of basic block reordering. + if (BC.MIB->isUnconditionalBranch(Inst)) + continue; + + // Do not distinguish different types of conditional jumps. 
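hashBlockLoose, as its comment says, is deliberately order-insensitive: mnemonics are collected into an ordered set, nops and unconditional jumps are dropped, and every conditional jump (handled just below) collapses to the same token. A self-contained sketch of the whole function, with a toy Insn/Kind pair standing in for BOLT's MCInst queries:

#include <algorithm>
#include <cctype>
#include <iostream>
#include <set>
#include <string>
#include <vector>

enum class Kind { Regular, CondBranch, UncondBranch, Nop };
struct Insn { std::string Mnemonic; Kind K; };

// "Loose" hash: stable under instruction reordering and branch inversion,
// at the cost of collisions a stronger hash must later resolve.
std::string hashBlockLoose(const std::vector<Insn> &Block) {
  std::set<std::string> Opcodes;
  for (const Insn &I : Block) {
    if (I.K == Kind::Nop || I.K == Kind::UncondBranch)
      continue;
    if (I.K == Kind::CondBranch) {
      Opcodes.insert("JMP");
      continue;
    }
    std::string M = I.Mnemonic;
    M.erase(std::remove_if(M.begin(), M.end(),
                           [](unsigned char C) { return std::isspace(C); }),
            M.end());
    Opcodes.insert(M);
  }
  std::string Hash;
  for (const std::string &Op : Opcodes)
    Hash += Op;
  return Hash;
}

int main() {
  std::vector<Insn> A = {{"mov", Kind::Regular}, {"add", Kind::Regular},
                         {"je", Kind::CondBranch}};
  std::vector<Insn> B = {{"add", Kind::Regular}, {"jne", Kind::CondBranch},
                         {"mov", Kind::Regular}};
  std::cout << (hashBlockLoose(A) == hashBlockLoose(B)) << '\n'; // prints 1
}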
+ if (BC.MIB->isConditionalBranch(Inst)) { + Opcodes.insert("JMP"); + continue; + } + + std::string Mnemonic = BC.InstPrinter->getMnemonic(&Inst).first; + Mnemonic.erase( + std::remove_if(Mnemonic.begin(), Mnemonic.end(), + [](unsigned char ch) { return std::isspace(ch); }), + Mnemonic.end()); + Opcodes.insert(Mnemonic); + } + + std::string HashString; + for (const std::string &Opcode : Opcodes) + HashString.append(Opcode); + return HashString; +} + } // namespace bolt } // namespace llvm diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp index 027cef1063ee3ebe9181423156e804739f52305e..0cafd3d20ffb95a33c750459ee4166e2b372377e 100644 --- a/bolt/lib/Core/MCPlusBuilder.cpp +++ b/bolt/lib/Core/MCPlusBuilder.cpp @@ -120,7 +120,7 @@ bool MCPlusBuilder::equals(const MCTargetExpr &A, const MCTargetExpr &B, llvm_unreachable("target-specific expressions are unsupported"); } -void MCPlusBuilder::setTailCall(MCInst &Inst) { +void MCPlusBuilder::setTailCall(MCInst &Inst) const { assert(!hasAnnotation(Inst, MCAnnotation::kTailCall)); setAnnotationOpValue(Inst, MCAnnotation::kTailCall, true); } @@ -149,7 +149,7 @@ std::optional MCPlusBuilder::getEHInfo(const MCInst &Inst) const { static_cast(*Action)); } -void MCPlusBuilder::addEHInfo(MCInst &Inst, const MCLandingPad &LP) { +void MCPlusBuilder::addEHInfo(MCInst &Inst, const MCLandingPad &LP) const { if (isCall(Inst)) { assert(!getEHInfo(Inst)); setAnnotationOpValue(Inst, MCAnnotation::kEHLandingPad, @@ -159,7 +159,7 @@ void MCPlusBuilder::addEHInfo(MCInst &Inst, const MCLandingPad &LP) { } } -bool MCPlusBuilder::updateEHInfo(MCInst &Inst, const MCLandingPad &LP) { +bool MCPlusBuilder::updateEHInfo(MCInst &Inst, const MCLandingPad &LP) const { if (!isInvoke(Inst)) return false; @@ -178,13 +178,12 @@ int64_t MCPlusBuilder::getGnuArgsSize(const MCInst &Inst) const { return *Value; } -void MCPlusBuilder::addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize, - AllocatorIdTy AllocId) { +void MCPlusBuilder::addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize) const { assert(GnuArgsSize >= 0 && "cannot set GNU_args_size to negative value"); assert(getGnuArgsSize(Inst) == -1LL && "GNU_args_size already set"); assert(isInvoke(Inst) && "GNU_args_size can only be set for invoke"); - setAnnotationOpValue(Inst, MCAnnotation::kGnuArgsSize, GnuArgsSize, AllocId); + setAnnotationOpValue(Inst, MCAnnotation::kGnuArgsSize, GnuArgsSize); } uint64_t MCPlusBuilder::getJumpTable(const MCInst &Inst) const { @@ -203,12 +202,12 @@ bool MCPlusBuilder::setJumpTable(MCInst &Inst, uint64_t Value, uint16_t IndexReg, AllocatorIdTy AllocId) { if (!isIndirectBranch(Inst)) return false; - setAnnotationOpValue(Inst, MCAnnotation::kJumpTable, Value, AllocId); + setAnnotationOpValue(Inst, MCAnnotation::kJumpTable, Value); getOrCreateAnnotationAs(Inst, "JTIndexReg", AllocId) = IndexReg; return true; } -bool MCPlusBuilder::unsetJumpTable(MCInst &Inst) { +bool MCPlusBuilder::unsetJumpTable(MCInst &Inst) const { if (!getJumpTable(Inst)) return false; removeAnnotation(Inst, MCAnnotation::kJumpTable); @@ -225,7 +224,7 @@ MCPlusBuilder::getConditionalTailCall(const MCInst &Inst) const { return static_cast(*Value); } -bool MCPlusBuilder::setConditionalTailCall(MCInst &Inst, uint64_t Dest) { +bool MCPlusBuilder::setConditionalTailCall(MCInst &Inst, uint64_t Dest) const { if (!isConditionalBranch(Inst)) return false; @@ -233,7 +232,7 @@ bool MCPlusBuilder::setConditionalTailCall(MCInst &Inst, uint64_t Dest) { return true; } -bool MCPlusBuilder::unsetConditionalTailCall(MCInst 
&Inst) { +bool MCPlusBuilder::unsetConditionalTailCall(MCInst &Inst) const { if (!getConditionalTailCall(Inst)) return false; removeAnnotation(Inst, MCAnnotation::kConditionalTailCall); @@ -255,63 +254,76 @@ uint32_t MCPlusBuilder::getOffsetWithDefault(const MCInst &Inst, return Default; } -bool MCPlusBuilder::setOffset(MCInst &Inst, uint32_t Offset, - AllocatorIdTy AllocatorId) { - setAnnotationOpValue(Inst, MCAnnotation::kOffset, Offset, AllocatorId); +bool MCPlusBuilder::setOffset(MCInst &Inst, uint32_t Offset) const { + setAnnotationOpValue(Inst, MCAnnotation::kOffset, Offset); return true; } -bool MCPlusBuilder::clearOffset(MCInst &Inst) { +bool MCPlusBuilder::clearOffset(MCInst &Inst) const { if (!hasAnnotation(Inst, MCAnnotation::kOffset)) return false; removeAnnotation(Inst, MCAnnotation::kOffset); return true; } -bool MCPlusBuilder::hasAnnotation(const MCInst &Inst, unsigned Index) const { - const MCInst *AnnotationInst = getAnnotationInst(Inst); - if (!AnnotationInst) - return false; +MCSymbol *MCPlusBuilder::getLabel(const MCInst &Inst) const { + if (auto Label = tryGetAnnotationAs(Inst, MCAnnotation::kLabel)) + return *Label; + return nullptr; +} + +bool MCPlusBuilder::setLabel(MCInst &Inst, MCSymbol *Label) { + getOrCreateAnnotationAs(Inst, MCAnnotation::kLabel) = Label; + return true; +} +std::optional MCPlusBuilder::getSize(const MCInst &Inst) const { + if (std::optional Value = + getAnnotationOpValue(Inst, MCAnnotation::kSize)) + return static_cast(*Value); + return std::nullopt; +} + +void MCPlusBuilder::setSize(MCInst &Inst, uint32_t Size) const { + setAnnotationOpValue(Inst, MCAnnotation::kSize, Size); +} + +bool MCPlusBuilder::hasAnnotation(const MCInst &Inst, unsigned Index) const { return (bool)getAnnotationOpValue(Inst, Index); } -bool MCPlusBuilder::removeAnnotation(MCInst &Inst, unsigned Index) { - MCInst *AnnotationInst = getAnnotationInst(Inst); - if (!AnnotationInst) +bool MCPlusBuilder::removeAnnotation(MCInst &Inst, unsigned Index) const { + std::optional FirstAnnotationOp = getFirstAnnotationOpIndex(Inst); + if (!FirstAnnotationOp) return false; - for (int I = AnnotationInst->getNumOperands() - 1; I >= 0; --I) { - int64_t ImmValue = AnnotationInst->getOperand(I).getImm(); + for (unsigned I = Inst.getNumOperands() - 1; I >= *FirstAnnotationOp; --I) { + const int64_t ImmValue = Inst.getOperand(I).getImm(); if (extractAnnotationIndex(ImmValue) == Index) { - AnnotationInst->erase(AnnotationInst->begin() + I); + Inst.erase(Inst.begin() + I); return true; } } return false; } -void MCPlusBuilder::stripAnnotations(MCInst &Inst, bool KeepTC) { - MCInst *AnnotationInst = getAnnotationInst(Inst); - if (!AnnotationInst) - return; - // Preserve TailCall annotation. 
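All of these accessors reduce to reading and writing packed annotation operands carried on the instruction itself. A rough standalone sketch, assuming an 8-bit annotation index packed into the top byte of a 64-bit immediate with the value below it (an assumption about the spirit of the encoding; the real layout lives in MCPlus.h), including the back-to-front scan that removeAnnotation uses:

#include <cassert>
#include <cstdint>
#include <vector>

// Assumed encoding: 8-bit annotation index in the top byte, 56-bit value.
constexpr int64_t pack(unsigned Index, int64_t Value) {
  return int64_t((uint64_t(Index) << 56) |
                 (uint64_t(Value) & 0xffffffffffffffULL));
}
constexpr unsigned extractIndex(int64_t Imm) {
  return unsigned(uint64_t(Imm) >> 56);
}
constexpr int64_t extractValue(int64_t Imm) {
  return int64_t(uint64_t(Imm) & 0xffffffffffffffULL);
}

int main() {
  // Trailing "operands" of an instruction: two annotations.
  std::vector<int64_t> Ops = {pack(3, 42), pack(7, 1)};
  // removeAnnotation-style scan, back to front, for index 3.
  for (int I = int(Ops.size()) - 1; I >= 0; --I)
    if (extractIndex(Ops[I]) == 3)
      Ops.erase(Ops.begin() + I);
  assert(Ops.size() == 1 && extractIndex(Ops[0]) == 7 &&
         extractValue(Ops[0]) == 1);
  return 0;
}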
- auto IsTC = hasAnnotation(Inst, MCAnnotation::kTailCall); +void MCPlusBuilder::stripAnnotations(MCInst &Inst, bool KeepTC) const { + KeepTC &= hasAnnotation(Inst, MCAnnotation::kTailCall); - removeAnnotationInst(Inst); + removeAnnotations(Inst); - if (KeepTC && IsTC) + if (KeepTC) setTailCall(Inst); } void MCPlusBuilder::printAnnotations(const MCInst &Inst, raw_ostream &OS) const { - const MCInst *AnnotationInst = getAnnotationInst(Inst); - if (!AnnotationInst) + std::optional FirstAnnotationOp = getFirstAnnotationOpIndex(Inst); + if (!FirstAnnotationOp) return; - for (unsigned I = 0; I < AnnotationInst->getNumOperands(); ++I) { - const int64_t Imm = AnnotationInst->getOperand(I).getImm(); + for (unsigned I = *FirstAnnotationOp; I < Inst.getNumOperands(); ++I) { + const int64_t Imm = Inst.getOperand(I).getImm(); const unsigned Index = extractAnnotationIndex(Imm); const int64_t Value = extractAnnotationValue(Imm); const auto *Annotation = reinterpret_cast(Value); diff --git a/bolt/lib/Core/Relocation.cpp b/bolt/lib/Core/Relocation.cpp index e985d6da82c197781d4fba7707f7766caaede793..70fcc6953ed71bc0fe0f8f4bf899c72844cf7200 100644 --- a/bolt/lib/Core/Relocation.cpp +++ b/bolt/lib/Core/Relocation.cpp @@ -101,6 +101,7 @@ static bool isSupportedRISCV(uint64_t Type) { case ELF::R_RISCV_GOT_HI20: case ELF::R_RISCV_PCREL_HI20: case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: case ELF::R_RISCV_RVC_JUMP: case ELF::R_RISCV_RVC_BRANCH: case ELF::R_RISCV_ADD32: @@ -195,6 +196,7 @@ static size_t getSizeForTypeRISCV(uint64_t Type) { case ELF::R_RISCV_BRANCH: case ELF::R_RISCV_PCREL_HI20: case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: case ELF::R_RISCV_32_PCREL: case ELF::R_RISCV_CALL: case ELF::R_RISCV_CALL_PLT: @@ -338,13 +340,22 @@ static uint64_t encodeValueAArch64(uint64_t Type, uint64_t Value, uint64_t PC) { switch (Type) { default: llvm_unreachable("unsupported relocation"); + case ELF::R_AARCH64_ABS16: case ELF::R_AARCH64_ABS32: + case ELF::R_AARCH64_ABS64: break; case ELF::R_AARCH64_PREL16: case ELF::R_AARCH64_PREL32: case ELF::R_AARCH64_PREL64: Value -= PC; break; + case ELF::R_AARCH64_CALL26: + Value -= PC; + assert(isInt<28>(Value) && "only PC +/- 128MB is allowed for direct call"); + // Immediate goes in bits 25:0 of BL. + // OP 1001_01 goes in bits 31:26 of BL. 
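The R_AARCH64_CALL26 encoding this comment describes (the Value computation follows just below) can be checked in isolation: the PC-relative offset must fit a signed 28-bit, word-aligned range, and the low 26 bits of the word offset are merged with the BL opcode. A self-contained encode/decode round-trip using the constants from the patch:

#include <cassert>
#include <cstdint>

uint32_t encodeBL(uint64_t Target, uint64_t PC) {
  const int64_t Delta = int64_t(Target - PC);
  // Mirrors isInt<28>: direct calls reach PC +/- 128MB only.
  assert(Delta >= -(1LL << 27) && Delta < (1LL << 27) && "out of BL range");
  // Immediate goes in bits 25:0; opcode 1001_01 in bits 31:26.
  return uint32_t(((Delta >> 2) & 0x3ffffff) | 0x94000000u);
}

int64_t decodeBLOffset(uint32_t Insn) {
  int64_t Imm = Insn & 0x3ffffff;
  if (Imm & (1 << 25)) // sign-extend imm26
    Imm -= (1LL << 26);
  return Imm << 2;
}

int main() {
  const uint64_t PC = 0x400000, Target = 0x401234;
  assert(decodeBLOffset(encodeBL(Target, PC)) == int64_t(Target - PC));
  return 0;
}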
+ Value = ((Value >> 2) & 0x3ffffff) | 0x94000000ULL; + break; } return Value; } @@ -473,6 +484,10 @@ static uint64_t extractIImmRISCV(uint32_t Contents) { return SignExtend64<12>(Contents >> 20); } +static uint64_t extractSImmRISCV(uint32_t Contents) { + return SignExtend64<12>(((Contents >> 7) & 0x1f) | ((Contents >> 25) << 5)); +} + static uint64_t extractJImmRISCV(uint32_t Contents) { return SignExtend64<21>( (((Contents >> 21) & 0x3ff) << 1) | (((Contents >> 20) & 0x1) << 11) | @@ -509,6 +524,8 @@ static uint64_t extractValueRISCV(uint64_t Type, uint64_t Contents, return extractUImmRISCV(Contents); case ELF::R_RISCV_PCREL_LO12_I: return extractIImmRISCV(Contents); + case ELF::R_RISCV_PCREL_LO12_S: + return extractSImmRISCV(Contents); case ELF::R_RISCV_RVC_JUMP: return SignExtend64<11>(Contents >> 2); case ELF::R_RISCV_RVC_BRANCH: @@ -685,6 +702,7 @@ static bool isPCRelativeRISCV(uint64_t Type) { case ELF::R_RISCV_GOT_HI20: case ELF::R_RISCV_PCREL_HI20: case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: case ELF::R_RISCV_RVC_JUMP: case ELF::R_RISCV_RVC_BRANCH: case ELF::R_RISCV_32_PCREL: @@ -781,6 +799,19 @@ bool Relocation::isTLS(uint64_t Type) { return isTLSX86(Type); } +bool Relocation::isInstructionReference(uint64_t Type) { + if (Arch != Triple::riscv64) + return false; + + switch (Type) { + default: + return false; + case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: + return true; + } +} + uint64_t Relocation::getNone() { if (Arch == Triple::aarch64) return ELF::R_AARCH64_NONE; diff --git a/bolt/lib/Passes/ADRRelaxationPass.cpp b/bolt/lib/Passes/ADRRelaxationPass.cpp index 76924d96fcf9bc5795cef91326a8b904f5a8545f..27a1377adef1641848188f31b55b47ac5f143764 100644 --- a/bolt/lib/Passes/ADRRelaxationPass.cpp +++ b/bolt/lib/Passes/ADRRelaxationPass.cpp @@ -29,7 +29,16 @@ static cl::opt namespace llvm { namespace bolt { +// We don't exit directly from runOnFunction since it would call ThreadPool +// destructor which might result in internal assert if we're not finished +// creating async jobs on the moment of exit. So we're finishing all parallel +// jobs and checking the exit flag after it. +static bool PassFailed = false; + void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { + if (PassFailed) + return; + BinaryContext &BC = BF.getBinaryContext(); for (BinaryBasicBlock &BB : BF) { for (auto It = BB.begin(); It != BB.end(); ++It) { @@ -47,28 +56,41 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { continue; } - BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol); - if (TargetBF && TargetBF == &BF) - continue; + // Don't relax adr if it points to the same function and it is not split + // and BF initial size is < 1MB. 
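The 1MB threshold in this comment comes from the reach of ADR itself: its 21-bit signed immediate spans PC +/- 1MB, so an adr targeting its own unsplit, sub-1MB function can never go out of range and needs no ADRP+ADD expansion. A sketch of the underlying reachability test (a simplification of what the pass decides per instruction):

#include <cassert>
#include <cstdint>

// ADR materializes PC +/- 1MB directly; beyond that, relaxation rewrites
// it as ADRP + ADD, which covers a +/- 4GB page-relative range.
bool adrReaches(uint64_t PC, uint64_t Target) {
  const int64_t Delta = int64_t(Target) - int64_t(PC);
  return Delta >= -(1LL << 20) && Delta < (1LL << 20);
}

int main() {
  assert(adrReaches(0x400000, 0x4FFFFF));  // under 1MB away: keep adr
  assert(!adrReaches(0x400000, 0x600000)); // 2MB away: needs ADRP+ADD
  return 0;
}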
+ const unsigned OneMB = 0x100000; + if (!BF.isSplit() && BF.getSize() < OneMB) { + BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol); + if (TargetBF && TargetBF == &BF) + continue; + } MCPhysReg Reg; BC.MIB->getADRReg(Inst, Reg); int64_t Addend = BC.MIB->getTargetAddend(Inst); - InstructionListType Addr = - BC.MIB->materializeAddress(Symbol, BC.Ctx.get(), Reg, Addend); + InstructionListType Addr; + + { + auto L = BC.scopeLock(); + Addr = BC.MIB->materializeAddress(Symbol, BC.Ctx.get(), Reg, Addend); + } if (It != BB.begin() && BC.MIB->isNoop(*std::prev(It))) { It = BB.eraseInstruction(std::prev(It)); - } else if (opts::StrictMode && !BF.isSimple()) { + } else if (std::next(It) != BB.end() && BC.MIB->isNoop(*std::next(It))) { + BB.eraseInstruction(std::next(It)); + } else if (!opts::StrictMode && !BF.isSimple()) { // If the function is not simple, it may contain a jump table undetected // by us. This jump table may use an offset from the branch instruction // to land in the desired place. If we add new instructions, we // invalidate this offset, so we have to rely on linker-inserted NOP to // replace it with ADRP, and abort if it is not present. + auto L = BC.scopeLock(); errs() << formatv("BOLT-ERROR: Cannot relax adr in non-simple function " - "{0}. Can't proceed in current mode.\n", + "{0}. Use --strict option to override\n", BF.getOneName()); - exit(1); + PassFailed = true; + return; } It = BB.replaceInstruction(It, Addr); } @@ -85,7 +107,10 @@ void ADRRelaxationPass::runOnFunctions(BinaryContext &BC) { ParallelUtilities::runOnEachFunction( BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, nullptr, - "ADRRelaxationPass", /* ForceSequential */ true); + "ADRRelaxationPass"); + + if (PassFailed) + exit(1); } } // end namespace bolt diff --git a/bolt/lib/Passes/Aligner.cpp b/bolt/lib/Passes/Aligner.cpp index ef419bb6baaa21068ab2aedd4f0537df010e07df..7c387525434bd39bc847ebe15349784435a6ea4f 100644 --- a/bolt/lib/Passes/Aligner.cpp +++ b/bolt/lib/Passes/Aligner.cpp @@ -163,20 +163,6 @@ void AlignerPass::runOnFunctions(BinaryContext &BC) { else alignMaxBytes(BF); - // Align objects that contains constant islands and no code - // to at least 8 bytes. 
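The PassFailed flag introduced for ADRRelaxationPass above is the deferred-failure pattern for thread pools: a worker must not call exit() while sibling jobs may still be scheduled, so it records the failure and the driver acts on it once the pool drains. A standalone sketch (using std::atomic and plain std::thread for safety; the pass itself relies on the pool's own sequencing):

#include <atomic>
#include <iostream>
#include <thread>
#include <vector>

std::atomic<bool> PassFailed{false};

void runOnFunction(int Id) {
  if (PassFailed)
    return; // stop doing useful work once any worker has failed
  if (Id == 3)
    PassFailed = true; // report the error; never exit() mid-pool
}

int main() {
  std::vector<std::thread> Workers;
  for (int Id = 0; Id < 8; ++Id)
    Workers.emplace_back(runOnFunction, Id);
  for (std::thread &T : Workers)
    T.join(); // all parallel jobs finished; now it is safe to bail
  if (PassFailed)
    return 1;
  std::cout << "pass succeeded\n";
  return 0;
}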
- if (!BF.size() && BF.hasIslandsInfo()) { - const uint16_t Alignment = BF.getConstantIslandAlignment(); - if (BF.getAlignment() < Alignment) - BF.setAlignment(Alignment); - - if (BF.getMaxAlignmentBytes() < Alignment) - BF.setMaxAlignmentBytes(Alignment); - - if (BF.getMaxColdAlignmentBytes() < Alignment) - BF.setMaxColdAlignmentBytes(Alignment); - } - if (opts::AlignBlocks && !opts::PreserveBlocksAlignment) alignBlocks(BF, Emitter.MCE.get()); }; diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index a674fb4fef76a352c386ceace5733446dad9f2b7..4e1343e2c30be56dc2d9c719790a276b3b38469b 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -317,38 +317,46 @@ void NormalizeCFG::runOnFunctions(BinaryContext &BC) { } void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) { - if (!Function.getLayout().block_empty()) { - unsigned Count; - uint64_t Bytes; - Function.markUnreachableBlocks(); - LLVM_DEBUG({ - for (BinaryBasicBlock &BB : Function) { - if (!BB.isValid()) { - dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName() - << " in function " << Function << "\n"; - Function.dump(); - } + BinaryContext &BC = Function.getBinaryContext(); + unsigned Count; + uint64_t Bytes; + Function.markUnreachableBlocks(); + LLVM_DEBUG({ + for (BinaryBasicBlock &BB : Function) { + if (!BB.isValid()) { + dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName() + << " in function " << Function << "\n"; + Function.dump(); } - }); - std::tie(Count, Bytes) = Function.eraseInvalidBBs(); - DeletedBlocks += Count; - DeletedBytes += Bytes; - if (Count) { - Modified.insert(&Function); - if (opts::Verbosity > 0) - outs() << "BOLT-INFO: removed " << Count - << " dead basic block(s) accounting for " << Bytes - << " bytes in function " << Function << '\n'; } + }); + BinaryContext::IndependentCodeEmitter Emitter = + BC.createIndependentMCCodeEmitter(); + std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get()); + DeletedBlocks += Count; + DeletedBytes += Bytes; + if (Count) { + auto L = BC.scopeLock(); + Modified.insert(&Function); + if (opts::Verbosity > 0) + outs() << "BOLT-INFO: removed " << Count + << " dead basic block(s) accounting for " << Bytes + << " bytes in function " << Function << '\n'; } } void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) { - for (auto &It : BC.getBinaryFunctions()) { - BinaryFunction &Function = It.second; - if (shouldOptimize(Function)) - runOnFunction(Function); - } + ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { + runOnFunction(BF); + }; + + ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { + return !shouldOptimize(BF) || BF.getLayout().block_empty(); + }; + + ParallelUtilities::runOnEachFunction( + BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun, + SkipPredicate, "eliminate-unreachable"); if (DeletedBlocks) outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and " @@ -574,57 +582,50 @@ bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const { } void LowerAnnotations::runOnFunctions(BinaryContext &BC) { - std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations; - - for (auto &It : BC.getBinaryFunctions()) { - BinaryFunction &BF = It.second; - - for (FunctionFragment &FF : BF.getLayout().fragments()) { + for (BinaryFunction *BF : BC.getAllBinaryFunctions()) { + for (FunctionFragment &FF : BF->getLayout().fragments()) { + // Reset at the start of the new fragment. 
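EliminateUnreachableBlocks above is a classic mark-and-sweep over the CFG: markUnreachableBlocks() walks forward from the entry points, and eraseInvalidBBs() sweeps whatever the walk never reached. A minimal standalone sketch with an adjacency-list CFG:

#include <cassert>
#include <vector>

int main() {
  // Successor lists: block 0 -> 1, 1 -> 3; block 2 is unreachable.
  std::vector<std::vector<unsigned>> Succs = {{1}, {3}, {3}, {}};
  std::vector<bool> Reachable(Succs.size(), false);
  std::vector<unsigned> Stack = {0}; // entry blocks seed the walk
  while (!Stack.empty()) {
    const unsigned BB = Stack.back();
    Stack.pop_back();
    if (Reachable[BB])
      continue;
    Reachable[BB] = true;
    for (unsigned Succ : Succs[BB])
      Stack.push_back(Succ);
  }
  unsigned Erased = 0;
  for (bool R : Reachable)
    Erased += !R;
  assert(Erased == 1); // only block 2 would be erased
  return 0;
}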
int64_t CurrentGnuArgsSize = 0; for (BinaryBasicBlock *const BB : FF) { - // First convert GnuArgsSize annotations into CFIs. This may change - // instr pointers, so do it before recording ptrs for preserved - // annotations - if (BF.usesGnuArgsSize()) { - for (auto II = BB->begin(); II != BB->end(); ++II) { - if (!BC.MIB->isInvoke(*II)) - continue; + for (auto II = BB->begin(); II != BB->end(); ++II) { + + // Convert GnuArgsSize annotations into CFIs. + if (BF->usesGnuArgsSize() && BC.MIB->isInvoke(*II)) { const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II); assert(NewGnuArgsSize >= 0 && - "expected non-negative GNU_args_size"); + "Expected non-negative GNU_args_size."); if (NewGnuArgsSize != CurrentGnuArgsSize) { - auto InsertII = BF.addCFIInstruction( + auto InsertII = BF->addCFIInstruction( BB, II, MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize)); CurrentGnuArgsSize = NewGnuArgsSize; II = std::next(InsertII); } } - } - // Now record preserved annotations separately and then strip - // annotations. - for (auto II = BB->begin(); II != BB->end(); ++II) { - if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II)) - PreservedOffsetAnnotations.emplace_back(&(*II), - *BC.MIB->getOffset(*II)); + // Preserve selected annotations and strip the rest. + std::optional Offset = BF->requiresAddressTranslation() + ? BC.MIB->getOffset(*II) + : std::nullopt; + std::optional Size = BC.MIB->getSize(*II); + MCSymbol *Label = BC.MIB->getLabel(*II); + BC.MIB->stripAnnotations(*II); + + if (Offset) + BC.MIB->setOffset(*II, *Offset); + if (Size) + BC.MIB->setSize(*II, *Size); + if (Label) + BC.MIB->setLabel(*II, Label); } } } } - for (BinaryFunction *BF : BC.getInjectedBinaryFunctions()) - for (BinaryBasicBlock &BB : *BF) - for (MCInst &Instruction : BB) - BC.MIB->stripAnnotations(Instruction); // Release all memory taken by annotations BC.MIB->freeAnnotations(); - - // Reinsert preserved annotations we need during code emission. - for (const std::pair &Item : PreservedOffsetAnnotations) - BC.MIB->setOffset(*Item.first, Item.second); } // Check for dirty state in MCSymbol objects that might be a consequence @@ -1454,6 +1455,14 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) { 100.0 * NumInferredFunctions / NumAllStaleFunctions, 100.0 * InferredSampleCount / TotalSampleCount, InferredSampleCount, TotalSampleCount); + outs() << format( + "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks" + " (%zu out of %zu stale) responsible for %.2f%% samples" + " (%zu out of %zu stale)\n", + 100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks, + BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks, + 100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount, + BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount); } if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { @@ -1562,10 +1571,11 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) { } // Print information on missed macro-fusion opportunities seen on input. 
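The per-instruction loop above replaces the old PreservedOffsetAnnotations side table: the annotations worth keeping (Offset, Size, Label) are read off the instruction, everything is stripped, and the survivors are written straight back. A toy version of that save/strip/restore shape, with a string-keyed map standing in for MCPlus annotations:

#include <cassert>
#include <cstdint>
#include <map>
#include <optional>
#include <string>

struct Inst {
  std::map<std::string, uint32_t> Annotations;
};

int main() {
  Inst I;
  I.Annotations = {{"Offset", 0x40}, {"Size", 5}, {"NOP", 1}};
  // Save the annotations that must survive lowering.
  std::optional<uint32_t> Offset, Size;
  if (auto It = I.Annotations.find("Offset"); It != I.Annotations.end())
    Offset = It->second;
  if (auto It = I.Annotations.find("Size"); It != I.Annotations.end())
    Size = It->second;
  I.Annotations.clear(); // stripAnnotations
  // Reinsert the preserved ones in place.
  if (Offset)
    I.Annotations["Offset"] = *Offset;
  if (Size)
    I.Annotations["Size"] = *Size;
  assert(I.Annotations.size() == 2 && !I.Annotations.count("NOP"));
  return 0;
}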
- if (BC.MissedMacroFusionPairs) { - outs() << "BOLT-INFO: the input contains " << BC.MissedMacroFusionPairs - << " (dynamic count : " << BC.MissedMacroFusionExecCount - << ") opportunities for macro-fusion optimization"; + if (BC.Stats.MissedMacroFusionPairs) { + outs() << format("BOLT-INFO: the input contains %zu (dynamic count : %zu)" + " opportunities for macro-fusion optimization", + BC.Stats.MissedMacroFusionPairs, + BC.Stats.MissedMacroFusionExecCount); switch (opts::AlignMacroOpFusion) { case MFT_NONE: outs() << ". Use -align-macro-fusion to fix.\n"; diff --git a/bolt/lib/Passes/IndirectCallPromotion.cpp b/bolt/lib/Passes/IndirectCallPromotion.cpp index ea8019431cf52f96b6febe92c24488d474b0c94e..89727233ec78b05f35bb79a4361b64507d232262 100644 --- a/bolt/lib/Passes/IndirectCallPromotion.cpp +++ b/bolt/lib/Passes/IndirectCallPromotion.cpp @@ -754,6 +754,15 @@ IndirectCallPromotion::rewriteCall( const bool IsTailCallOrJT = (MIB->isTailCall(CallInst) || Function.getJumpTable(CallInst)); + // If we are tracking the indirect call/jump address, propagate the address to + // the ICP code. + const std::optional IndirectInstrOffset = MIB->getOffset(CallInst); + if (IndirectInstrOffset) { + for (auto &[Symbol, Instructions] : ICPcode) + for (MCInst &Inst : Instructions) + MIB->setOffset(Inst, *IndirectInstrOffset); + } + // Move instructions from the tail of the original call block // to the merge block. @@ -767,10 +776,12 @@ IndirectCallPromotion::rewriteCall( TailInsts.push_back(*++TailInst); InstructionListType MovedInst = IndCallBlock.splitInstructions(&CallInst); - // Link new BBs to the original input offset of the BB where the indirect - // call site is, so we can map samples recorded in new BBs back to the - // original BB seen in the input binary (if using BAT) - const uint32_t OrigOffset = IndCallBlock.getInputOffset(); + // Link new BBs to the original input offset of the indirect call site or its + // containing BB, so we can map samples recorded in new BBs back to the + // original BB seen in the input binary (if using BAT). + const uint32_t OrigOffset = IndirectInstrOffset + ? *IndirectInstrOffset + : IndCallBlock.getInputOffset(); IndCallBlock.eraseInstructions(MethodFetchInsns.begin(), MethodFetchInsns.end()); diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp index 98044599d497e71bf35dbf9ed1ce784a3b5271b8..72adb319d71dc0e437981a31c9ba682523f9b065 100644 --- a/bolt/lib/Passes/Instrumentation.cpp +++ b/bolt/lib/Passes/Instrumentation.cpp @@ -13,6 +13,7 @@ #include "bolt/Passes/Instrumentation.h" #include "bolt/Core/ParallelUtilities.h" #include "bolt/RuntimeLibs/InstrumentationRuntimeLibrary.h" +#include "bolt/Utils/CommandLineOpts.h" #include "bolt/Utils/Utils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/RWMutex.h" @@ -85,6 +86,24 @@ cl::opt InstrumentCalls("instrument-calls", namespace llvm { namespace bolt { +static bool hasAArch64ExclusiveMemop(BinaryFunction &Function) { + // FIXME ARMv8-a architecture reference manual says that software must avoid + // having any explicit memory accesses between exclusive load and associated + // store instruction. So for now skip instrumentation for functions that have + // these instructions, since it might lead to runtime deadlock. 
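A standalone sketch of the exclusive-memop guard described in the FIXME above, with an illustrative (not exhaustive) mnemonic set standing in for MIB->isAArch64Exclusive():

#include <iostream>
#include <set>
#include <string>
#include <vector>

// Refuse to instrument a function containing exclusive loads/stores:
// counter updates inserted between LDXR and STXR are explicit memory
// accesses that can make the store-exclusive fail forever.
bool hasExclusiveMemop(const std::vector<std::string> &Mnemonics) {
  static const std::set<std::string> Exclusive = {"ldxr", "ldaxr", "stxr",
                                                  "stlxr"};
  for (const std::string &M : Mnemonics)
    if (Exclusive.count(M))
      return true;
  return false;
}

int main() {
  const std::vector<std::string> Fn = {"ldaxr", "cmp", "stlxr"};
  if (hasExclusiveMemop(Fn))
    std::cout << "skip instrumentation\n";
  return 0;
}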
+ BinaryContext &BC = Function.getBinaryContext(); + for (const BinaryBasicBlock &BB : Function) + for (const MCInst &Inst : BB) + if (BC.MIB->isAArch64Exclusive(Inst)) { + if (opts::Verbosity >= 1) + outs() << "BOLT-INSTRUMENTER: Function " << Function + << " has exclusive instructions, skip instrumentation\n"; + return true; + } + + return false; +} + uint32_t Instrumentation::getFunctionNameIndex(const BinaryFunction &Function) { auto Iter = FuncToStringIdx.find(&Function); if (Iter != FuncToStringIdx.end()) @@ -288,6 +307,9 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function, if (BC.isMachO() && Function.hasName("___GLOBAL_init_65535/1")) return; + if (BC.isAArch64() && hasAArch64ExclusiveMemop(Function)) + return; + SplitWorklistTy SplitWorklist; SplitInstrsTy SplitInstrs; diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp index 6f4d1170dbe2a4aa52bed58f14062f5f998943f2..a81689bc37469a43877afe948a07ed6049fc2b12 100644 --- a/bolt/lib/Passes/LongJmp.cpp +++ b/bolt/lib/Passes/LongJmp.cpp @@ -293,7 +293,7 @@ uint64_t LongJmpPass::tentativeLayoutRelocColdPart( for (BinaryFunction *Func : SortedFunctions) { if (!Func->isSplit()) continue; - DotAddress = alignTo(DotAddress, BinaryFunction::MinAlign); + DotAddress = alignTo(DotAddress, Func->getMinAlignment()); uint64_t Pad = offsetToAlignment(DotAddress, llvm::Align(Func->getAlignment())); if (Pad <= Func->getMaxColdAlignmentBytes()) @@ -352,7 +352,7 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode( DotAddress = alignTo(DotAddress, opts::AlignText); } - DotAddress = alignTo(DotAddress, BinaryFunction::MinAlign); + DotAddress = alignTo(DotAddress, Func->getMinAlignment()); uint64_t Pad = offsetToAlignment(DotAddress, llvm::Align(Func->getAlignment())); if (Pad <= Func->getMaxAlignmentBytes()) diff --git a/bolt/lib/Passes/MCF.cpp b/bolt/lib/Passes/MCF.cpp index ec040120a919fb3658c14a0179d967ca4d4f7d6e..c3898d2dce989efdd7f3f77149b5417578ad678a 100644 --- a/bolt/lib/Passes/MCF.cpp +++ b/bolt/lib/Passes/MCF.cpp @@ -262,6 +262,7 @@ bool guessPredEdgeCounts(BinaryBasicBlock *BB, ArcSet &GuessedArcs) { continue; Pred->getBranchInfo(*BB).Count = Guessed; + GuessedArcs.insert(std::make_pair(Pred, BB)); return true; } llvm_unreachable("Expected unguessed arc"); diff --git a/bolt/lib/Passes/RegReAssign.cpp b/bolt/lib/Passes/RegReAssign.cpp index 19e1a84c48d1b5f16875f1968e17b5247f430b49..8b9dc9c1fdd506c89e9cb97b2ca1612687d6e407 100644 --- a/bolt/lib/Passes/RegReAssign.cpp +++ b/bolt/lib/Passes/RegReAssign.cpp @@ -140,7 +140,7 @@ void RegReAssign::rankRegisters(BinaryFunction &Function) { std::fill(RegScore.begin(), RegScore.end(), 0); std::fill(RankedRegs.begin(), RankedRegs.end(), 0); - for (BinaryBasicBlock &BB : Function) { + auto countRegScore = [&](BinaryBasicBlock &BB) { for (MCInst &Inst : BB) { const bool CannotUseREX = BC.MIB->cannotUseREX(Inst); const MCInstrDesc &Desc = BC.MII->get(Inst.getOpcode()); @@ -175,9 +175,25 @@ void RegReAssign::rankRegisters(BinaryFunction &Function) { continue; // Disallow substituitions involving regs in instrs that cannot use REX + // The relationship of X86 registers is shown in the diagram. BL and BH + // do not have a direct alias relationship. However, if the BH register + // cannot be swapped, then the BX/EBX/RBX registers cannot be swapped as + // well, which means that BL register also cannot be swapped. Therefore, + // in the presence of BX/EBX/RBX registers, BL and BH have an alias + // relationship. 
+ // ┌─────────────────┐ + // │ RBX │ + // ├─────┬───────────┤ + // │ │ EBX │ + // ├─────┴──┬────────┤ + // │ │ BX │ + // ├────────┼───┬────┤ + // │ │BH │BL │ + // └────────┴───┴────┘ if (CannotUseREX) { RegScore[RegEC] = std::numeric_limits::min(); + RegScore[BC.MIB->getAliasSized(Reg, 1)] = RegScore[RegEC]; continue; } @@ -185,13 +201,22 @@ void RegReAssign::rankRegisters(BinaryFunction &Function) { if (BC.MIB->isUpper8BitReg(Reg) && ClassicCSR.test(Reg)) { RegScore[RegEC] = std::numeric_limits::min(); + RegScore[BC.MIB->getAliasSized(Reg, 1)] = RegScore[RegEC]; continue; } RegScore[RegEC] += BB.getKnownExecutionCount(); } } + }; + for (BinaryBasicBlock &BB : Function) + countRegScore(BB); + + for (BinaryFunction *ChildFrag : Function.getFragments()) { + for (BinaryBasicBlock &BB : *ChildFrag) + countRegScore(BB); } + std::iota(RankedRegs.begin(), RankedRegs.end(), 0); // 0, 1, 2, 3... llvm::sort(RankedRegs, [&](size_t A, size_t B) { return RegScore[A] > RegScore[B]; }); @@ -213,6 +238,17 @@ void RegReAssign::aggressivePassOverFunction(BinaryFunction &Function) { BinaryContext &BC = Function.getBinaryContext(); rankRegisters(Function); + // If there is a situation where function: + // A() -> A.cold() + // A.localalias() -> A.cold() + // simply swapping these two calls can cause issues. + for (BinaryFunction *ChildFrag : Function.getFragments()) { + if (ChildFrag->getParentFragments()->size() > 1) + return; + if (ChildFrag->empty()) + return; + } + // Bail early if our registers are all black listed, before running expensive // analysis passes bool Bail = true; @@ -304,6 +340,10 @@ void RegReAssign::aggressivePassOverFunction(BinaryFunction &Function) { << " with " << BC.MRI->getName(ExtReg) << "\n\n"); swap(Function, ClassicReg, ExtReg); FuncsChanged.insert(&Function); + for (BinaryFunction *ChildFrag : Function.getFragments()) { + swap(*ChildFrag, ClassicReg, ExtReg); + FuncsChanged.insert(ChildFrag); + } ++Begin; if (Begin == End) break; @@ -315,6 +355,13 @@ bool RegReAssign::conservativePassOverFunction(BinaryFunction &Function) { BinaryContext &BC = Function.getBinaryContext(); rankRegisters(Function); + for (BinaryFunction *ChildFrag : Function.getFragments()) { + if (ChildFrag->getParentFragments()->size() > 1) + return false; + if (ChildFrag->empty()) + return false; + } + // Try swapping R12, R13, R14 or R15 with RBX (we work with all callee-saved // regs except RBP) MCPhysReg Candidate = 0; @@ -340,11 +387,24 @@ bool RegReAssign::conservativePassOverFunction(BinaryFunction &Function) { if (!RBX) return false; + // The high 8 bits of the register will never be swapped. To prevent the high + // 8 bits from being swapped incorrectly, we should switched to swapping the + // low 8 bits of the register instead. 
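The alias propagation shown in the diagram reduces to a paired score update: poisoning one 8-bit sub-register also poisons its one-byte sibling, because any swap of the containing BX/EBX/RBX moves both halves at once. A toy model with hypothetical register names and score tables:

#include <cassert>
#include <cstdint>
#include <limits>
#include <map>
#include <string>

int main() {
  std::map<std::string, int64_t> RegScore = {{"BH", 100}, {"BL", 80}};
  const std::map<std::string, std::string> AliasSized1 = {{"BH", "BL"},
                                                          {"BL", "BH"}};
  // E.g. BH is used where REX prefixes are forbidden, so it cannot move.
  const std::string Blocked = "BH";
  RegScore[Blocked] = std::numeric_limits<int64_t>::min();
  // Propagate the blacklist to the 1-byte sibling, as getAliasSized(Reg, 1)
  // does above: if BH is pinned, BL is pinned too.
  RegScore[AliasSized1.at(Blocked)] = RegScore[Blocked];
  assert(RegScore["BL"] == std::numeric_limits<int64_t>::min());
  return 0;
}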
+ if (BC.MIB->isUpper8BitReg(RBX)) { + RBX = BC.MIB->getAliasSized(RBX, 1); + if (RegScore[RBX] < 0 || RegScore[RBX] > RegScore[Candidate]) + return false; + } + LLVM_DEBUG(dbgs() << "\n ** Swapping " << BC.MRI->getName(RBX) << " with " << BC.MRI->getName(Candidate) << "\n\n"); (void)BC; swap(Function, RBX, Candidate); FuncsChanged.insert(&Function); + for (BinaryFunction *ChildFrag : Function.getFragments()) { + swap(*ChildFrag, RBX, Candidate); + FuncsChanged.insert(ChildFrag); + } return true; } @@ -404,7 +464,7 @@ void RegReAssign::runOnFunctions(BinaryContext &BC) { for (auto &I : BC.getBinaryFunctions()) { BinaryFunction &Function = I.second; - if (!Function.isSimple() || Function.isIgnored()) + if (!Function.isSimple() || Function.isIgnored() || Function.isFragment()) continue; LLVM_DEBUG(dbgs() << "====================================\n"); diff --git a/bolt/lib/Passes/ReorderAlgorithm.cpp b/bolt/lib/Passes/ReorderAlgorithm.cpp index b5052cdaddb13e38fd8b8d7a3f3d5b999ad90ad9..3c3365e1d3d711321c3eda520012d5cbb64e0507 100644 --- a/bolt/lib/Passes/ReorderAlgorithm.cpp +++ b/bolt/lib/Passes/ReorderAlgorithm.cpp @@ -531,21 +531,21 @@ void ExtTSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF, } // Initialize CFG edges - using JumpT = std::pair; - std::vector> JumpCounts; + std::vector JumpCounts; for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { auto BI = BB->branch_info_begin(); for (BinaryBasicBlock *SuccBB : BB->successors()) { assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && "missing profile for a jump"); - auto It = std::make_pair(BB->getLayoutIndex(), SuccBB->getLayoutIndex()); - JumpCounts.push_back(std::make_pair(It, BI->Count)); + JumpCounts.push_back( + {BB->getLayoutIndex(), SuccBB->getLayoutIndex(), BI->Count}); ++BI; } } // Run the layout algorithm - auto Result = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts); + auto Result = + codelayout::computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts); Order.reserve(BF.getLayout().block_size()); for (uint64_t R : Result) Order.push_back(OrigOrder[R]); diff --git a/bolt/lib/Passes/ReorderData.cpp b/bolt/lib/Passes/ReorderData.cpp index 4df6ce37596d71a3132d259873745690567411a3..6e1f9b6d77512e12572ad2a6198bb261de95ad81 100644 --- a/bolt/lib/Passes/ReorderData.cpp +++ b/bolt/lib/Passes/ReorderData.cpp @@ -413,17 +413,17 @@ bool ReorderData::markUnmoveableSymbols(BinaryContext &BC, auto Range = BC.getBinaryDataForSection(Section); bool FoundUnmoveable = false; for (auto Itr = Range.begin(); Itr != Range.end(); ++Itr) { + BinaryData *Next = + std::next(Itr) != Range.end() ? std::next(Itr)->second : nullptr; if (Itr->second->getName().startswith("PG.")) { BinaryData *Prev = Itr != Range.begin() ? std::prev(Itr)->second : nullptr; - BinaryData *Next = Itr != Range.end() ? std::next(Itr)->second : nullptr; bool PrevIsPrivate = Prev && isPrivate(Prev); bool NextIsPrivate = Next && isPrivate(Next); if (isPrivate(Itr->second) && (PrevIsPrivate || NextIsPrivate)) Itr->second->setIsMoveable(false); } else { // check for overlapping symbols. - BinaryData *Next = Itr != Range.end() ? 
std::next(Itr)->second : nullptr; if (Next && Itr->second->getEndAddress() != Next->getAddress() && Next->containsAddress(Itr->second->getEndAddress())) { Itr->second->setIsMoveable(false); diff --git a/bolt/lib/Passes/ReorderFunctions.cpp b/bolt/lib/Passes/ReorderFunctions.cpp index 2fc99f652bf1c375abaa73ff5712720daa0ea10c..70f87ac40c3c149b0079b2038d077a41bbf382d5 100644 --- a/bolt/lib/Passes/ReorderFunctions.cpp +++ b/bolt/lib/Passes/ReorderFunctions.cpp @@ -15,6 +15,7 @@ #include "bolt/Utils/Utils.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/CodeLayout.h" #include #define DEBUG_TYPE "hfsort" @@ -29,82 +30,72 @@ extern cl::opt RandomSeed; extern size_t padFunction(const bolt::BinaryFunction &Function); -cl::opt -ReorderFunctions("reorder-functions", - cl::desc("reorder and cluster functions (works only with relocations)"), - cl::init(bolt::ReorderFunctions::RT_NONE), - cl::values(clEnumValN(bolt::ReorderFunctions::RT_NONE, - "none", - "do not reorder functions"), - clEnumValN(bolt::ReorderFunctions::RT_EXEC_COUNT, - "exec-count", - "order by execution count"), - clEnumValN(bolt::ReorderFunctions::RT_HFSORT, - "hfsort", - "use hfsort algorithm"), - clEnumValN(bolt::ReorderFunctions::RT_HFSORT_PLUS, - "hfsort+", - "use hfsort+ algorithm"), - clEnumValN(bolt::ReorderFunctions::RT_PETTIS_HANSEN, - "pettis-hansen", - "use Pettis-Hansen algorithm"), - clEnumValN(bolt::ReorderFunctions::RT_RANDOM, - "random", - "reorder functions randomly"), - clEnumValN(bolt::ReorderFunctions::RT_USER, - "user", - "use function order specified by -function-order")), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); +cl::opt ReorderFunctions( + "reorder-functions", + cl::desc("reorder and cluster functions (works only with relocations)"), + cl::init(bolt::ReorderFunctions::RT_NONE), + cl::values(clEnumValN(bolt::ReorderFunctions::RT_NONE, "none", + "do not reorder functions"), + clEnumValN(bolt::ReorderFunctions::RT_EXEC_COUNT, "exec-count", + "order by execution count"), + clEnumValN(bolt::ReorderFunctions::RT_HFSORT, "hfsort", + "use hfsort algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_HFSORT_PLUS, "hfsort+", + "use hfsort+ algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_CDS, "cds", + "use cache-directed sort"), + clEnumValN(bolt::ReorderFunctions::RT_PETTIS_HANSEN, + "pettis-hansen", "use Pettis-Hansen algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_RANDOM, "random", + "reorder functions randomly"), + clEnumValN(bolt::ReorderFunctions::RT_USER, "user", + "use function order specified by -function-order")), + cl::ZeroOrMore, cl::cat(BoltOptCategory)); static cl::opt ReorderFunctionsUseHotSize( "reorder-functions-use-hot-size", cl::desc("use a function's hot size when doing clustering"), cl::init(true), cl::cat(BoltOptCategory)); -static cl::opt -FunctionOrderFile("function-order", - cl::desc("file containing an ordered list of functions to use for function " - "reordering"), - cl::cat(BoltOptCategory)); +static cl::opt FunctionOrderFile( + "function-order", + cl::desc("file containing an ordered list of functions to use for function " + "reordering"), + cl::cat(BoltOptCategory)); -static cl::opt -GenerateFunctionOrderFile("generate-function-order", - cl::desc("file to dump the ordered list of functions to use for function " - "reordering"), - cl::cat(BoltOptCategory)); +static cl::opt GenerateFunctionOrderFile( + "generate-function-order", + cl::desc("file to dump the ordered list of functions to use for function " + "reordering"), + 
cl::cat(BoltOptCategory)); -static cl::opt -LinkSectionsFile("generate-link-sections", - cl::desc("generate a list of function sections in a format suitable for " - "inclusion in a linker script"), - cl::cat(BoltOptCategory)); +static cl::opt LinkSectionsFile( + "generate-link-sections", + cl::desc("generate a list of function sections in a format suitable for " + "inclusion in a linker script"), + cl::cat(BoltOptCategory)); static cl::opt UseEdgeCounts("use-edge-counts", cl::desc("use edge count data when doing clustering"), cl::init(true), cl::cat(BoltOptCategory)); -static cl::opt -CgFromPerfData("cg-from-perf-data", - cl::desc("use perf data directly when constructing the call graph" - " for stale functions"), - cl::init(true), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); +static cl::opt CgFromPerfData( + "cg-from-perf-data", + cl::desc("use perf data directly when constructing the call graph" + " for stale functions"), + cl::init(true), cl::ZeroOrMore, cl::cat(BoltOptCategory)); static cl::opt CgIgnoreRecursiveCalls( "cg-ignore-recursive-calls", cl::desc("ignore recursive calls when constructing the call graph"), cl::init(true), cl::cat(BoltOptCategory)); -static cl::opt -CgUseSplitHotSize("cg-use-split-hot-size", - cl::desc("use hot/cold data on basic blocks to determine hot sizes for " - "call graph functions"), - cl::init(false), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); +static cl::opt CgUseSplitHotSize( + "cg-use-split-hot-size", + cl::desc("use hot/cold data on basic blocks to determine hot sizes for " + "call graph functions"), + cl::init(false), cl::ZeroOrMore, cl::cat(BoltOptCategory)); } // namespace opts @@ -157,13 +148,13 @@ void ReorderFunctions::printStats(const std::vector &Clusters, bool PrintDetailed = opts::Verbosity > 1; #ifndef NDEBUG PrintDetailed |= - (DebugFlag && isCurrentDebugType("hfsort") && opts::Verbosity > 0); + (DebugFlag && isCurrentDebugType("hfsort") && opts::Verbosity > 0); #endif - uint64_t TotalSize = 0; - uint64_t CurPage = 0; - uint64_t Hotfuncs = 0; + uint64_t TotalSize = 0; + uint64_t CurPage = 0; + uint64_t Hotfuncs = 0; double TotalDistance = 0; - double TotalCalls = 0; + double TotalCalls = 0; double TotalCalls64B = 0; double TotalCalls4KB = 0; double TotalCalls2MB = 0; @@ -198,21 +189,22 @@ void ReorderFunctions::printStats(const std::vector &Clusters, << "BOLT-INFO: Src: " << *Cg.nodeIdToFunc(FuncId) << "\n" << "BOLT-INFO: Dst: " << *Cg.nodeIdToFunc(Dst) << "\n" << "BOLT-INFO: Weight = " << W << "\n" - << "BOLT-INFO: AvgOffset = " << Arc.avgCallOffset() << "\n"; + << "BOLT-INFO: AvgOffset = " << Arc.avgCallOffset() + << "\n"; Calls += W; - if (D < 64) TotalCalls64B += W; - if (D < 4096) TotalCalls4KB += W; - if (D < (2 << 20)) TotalCalls2MB += W; + if (D < 64) + TotalCalls64B += W; + if (D < 4096) + TotalCalls4KB += W; + if (D < (2 << 20)) + TotalCalls2MB += W; Dist += Arc.weight() * D; if (PrintDetailed) outs() << format("BOLT-INFO: arc: %u [@%lu+%.1lf] -> %u [@%lu]: " "weight = %.0lf, callDist = %f\n", - Arc.src(), - FuncAddr[Arc.src()], - Arc.avgCallOffset(), - Arc.dst(), - FuncAddr[Arc.dst()], - Arc.weight(), D); + Arc.src(), FuncAddr[Arc.src()], + Arc.avgCallOffset(), Arc.dst(), + FuncAddr[Arc.dst()], Arc.weight(), D); } TotalCalls += Calls; TotalDistance += Dist; @@ -290,39 +282,74 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) { switch (opts::ReorderFunctions) { case RT_NONE: break; - case RT_EXEC_COUNT: - { - std::vector SortedFunctions(BFs.size()); - uint32_t Index = 0; - 
llvm::transform(llvm::make_second_range(BFs), SortedFunctions.begin(),
-                      [](BinaryFunction &BF) { return &BF; });
-      llvm::stable_sort(SortedFunctions, [&](const BinaryFunction *A,
-                                             const BinaryFunction *B) {
-        if (A->isIgnored())
-          return false;
-        const size_t PadA = opts::padFunction(*A);
-        const size_t PadB = opts::padFunction(*B);
-        if (!PadA || !PadB) {
-          if (PadA)
-            return true;
-          if (PadB)
-            return false;
-        }
-        return !A->hasProfile() &&
-               (B->hasProfile() ||
-                (A->getExecutionCount() > B->getExecutionCount()));
-      });
-      for (BinaryFunction *BF : SortedFunctions)
-        if (BF->hasProfile())
-          BF->setIndex(Index++);
-    }
-    break;
+  case RT_EXEC_COUNT: {
+    std::vector<BinaryFunction *> SortedFunctions(BFs.size());
+    llvm::transform(llvm::make_second_range(BFs), SortedFunctions.begin(),
+                    [](BinaryFunction &BF) { return &BF; });
+    llvm::stable_sort(SortedFunctions,
+                      [&](const BinaryFunction *A, const BinaryFunction *B) {
+                        if (A->isIgnored())
+                          return false;
+                        if (B->isIgnored())
+                          return true;
+                        const size_t PadA = opts::padFunction(*A);
+                        const size_t PadB = opts::padFunction(*B);
+                        if (!PadA || !PadB) {
+                          if (PadA)
+                            return true;
+                          if (PadB)
+                            return false;
+                        }
+                        if (!A->hasProfile())
+                          return false;
+                        if (!B->hasProfile())
+                          return true;
+                        return A->getExecutionCount() > B->getExecutionCount();
+                      });
+    uint32_t Index = 0;
+    for (BinaryFunction *BF : SortedFunctions)
+      if (BF->hasProfile()) {
+        BF->setIndex(Index++);
+        LLVM_DEBUG(if (opts::Verbosity > 1) {
+          dbgs() << "BOLT-INFO: hot func " << BF->getPrintName() << " ("
+                 << BF->getExecutionCount() << ")\n";
+        });
+      }
+  } break;
   case RT_HFSORT:
     Clusters = clusterize(Cg);
     break;
   case RT_HFSORT_PLUS:
     Clusters = hfsortPlus(Cg);
     break;
+  case RT_CDS: {
+    // It is required that the sum of incoming arc weights is not greater
+    // than the number of samples for every function. Ensure the call graph
+    // obeys this property before running the algorithm.
+    Cg.adjustArcWeights();
+
+    // Initialize call graph nodes and their data.
+    std::vector<uint64_t> FuncSizes;
+    std::vector<uint64_t> FuncCounts;
+    std::vector<codelayout::EdgeCount> CallCounts;
+    std::vector<uint64_t> CallOffsets;
+    for (NodeId F = 0; F < Cg.numNodes(); ++F) {
+      FuncSizes.push_back(Cg.size(F));
+      FuncCounts.push_back(Cg.samples(F));
+      for (NodeId Succ : Cg.successors(F)) {
+        const Arc &Arc = *Cg.findArc(F, Succ);
+        CallCounts.push_back({F, Succ, uint64_t(Arc.weight())});
+        CallOffsets.push_back(uint64_t(Arc.avgCallOffset()));
+      }
+    }
+
+    // Run the layout algorithm.
+    std::vector<uint64_t> Result = codelayout::computeCacheDirectedLayout(
+        FuncSizes, FuncCounts, CallCounts, CallOffsets);
+
+    // Create a single cluster from the computed order of hot functions.
+    Clusters.emplace_back(Cluster(Result, Cg));
+  } break;
   case RT_PETTIS_HANSEN:
     Clusters = pettisAndHansen(Cg);
     break;
@@ -330,74 +357,71 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) {
     std::srand(opts::RandomSeed);
     Clusters = randomClusters(Cg);
     break;
-  case RT_USER:
-    {
-      // Build LTOCommonNameMap
-      StringMap<std::vector<uint64_t>> LTOCommonNameMap;
-      for (const BinaryFunction &BF : llvm::make_second_range(BFs))
-        for (StringRef Name : BF.getNames())
-          if (std::optional<StringRef> LTOCommonName = getLTOCommonName(Name))
-            LTOCommonNameMap[*LTOCommonName].push_back(BF.getAddress());
-
-      uint32_t Index = 0;
-      uint32_t InvalidEntries = 0;
-      for (const std::string &Function : readFunctionOrderFile()) {
-        std::vector<uint64_t> FuncAddrs;
-
-        BinaryData *BD = BC.getBinaryDataByName(Function);
-        if (!BD) {
-          // If we can't find the main symbol name, look for alternates.
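// The rewritten exec-count comparator above must be a strict weak ordering:
// ignored functions sink to the back, padded functions float to the front,
// and profiled functions come before unprofiled ones, hottest first. A
// minimal standalone sketch of the same ordering over a toy record (names
// and fields are illustrative, not BOLT's):

#include <algorithm>
#include <cstdint>
#include <vector>

struct ToyFunc {
  bool Ignored;
  bool Padded; // stand-in for opts::padFunction(BF) != 0
  bool HasProfile;
  uint64_t ExecCount;
};

static bool toyBefore(const ToyFunc &A, const ToyFunc &B) {
  if (A.Ignored)
    return false; // ignored functions never precede anything
  if (B.Ignored)
    return true;
  if (A.Padded != B.Padded)
    return A.Padded; // padded functions first
  if (!A.HasProfile)
    return false; // profiled functions precede unprofiled ones
  if (!B.HasProfile)
    return true;
  return A.ExecCount > B.ExecCount; // hottest first
}

int main() {
  std::vector<ToyFunc> Funcs = {{false, false, true, 10},
                                {true, false, true, 99},
                                {false, true, false, 0},
                                {false, false, true, 42}};
  std::stable_sort(Funcs.begin(), Funcs.end(), toyBefore);
  // Resulting order: padded, exec=42, exec=10, ignored.
  return 0;
}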
- uint32_t LocalID = 1; - while (true) { - const std::string FuncName = - Function + "/" + std::to_string(LocalID); - BD = BC.getBinaryDataByName(FuncName); - if (BD) - FuncAddrs.push_back(BD->getAddress()); - else - break; - LocalID++; - } - // Strip LTO suffixes - if (std::optional CommonName = getLTOCommonName(Function)) - if (LTOCommonNameMap.contains(*CommonName)) - llvm::append_range(FuncAddrs, LTOCommonNameMap[*CommonName]); - } else { - FuncAddrs.push_back(BD->getAddress()); + case RT_USER: { + // Build LTOCommonNameMap + StringMap> LTOCommonNameMap; + for (const BinaryFunction &BF : llvm::make_second_range(BFs)) + for (StringRef Name : BF.getNames()) + if (std::optional LTOCommonName = getLTOCommonName(Name)) + LTOCommonNameMap[*LTOCommonName].push_back(BF.getAddress()); + + uint32_t Index = 0; + uint32_t InvalidEntries = 0; + for (const std::string &Function : readFunctionOrderFile()) { + std::vector FuncAddrs; + + BinaryData *BD = BC.getBinaryDataByName(Function); + if (!BD) { + // If we can't find the main symbol name, look for alternates. + uint32_t LocalID = 1; + while (true) { + const std::string FuncName = Function + "/" + std::to_string(LocalID); + BD = BC.getBinaryDataByName(FuncName); + if (BD) + FuncAddrs.push_back(BD->getAddress()); + else + break; + LocalID++; } + // Strip LTO suffixes + if (std::optional CommonName = getLTOCommonName(Function)) + if (LTOCommonNameMap.contains(*CommonName)) + llvm::append_range(FuncAddrs, LTOCommonNameMap[*CommonName]); + } else { + FuncAddrs.push_back(BD->getAddress()); + } - if (FuncAddrs.empty()) { + if (FuncAddrs.empty()) { + if (opts::Verbosity >= 1) + errs() << "BOLT-WARNING: Reorder functions: can't find function " + << "for " << Function << "\n"; + ++InvalidEntries; + continue; + } + + for (const uint64_t FuncAddr : FuncAddrs) { + const BinaryData *FuncBD = BC.getBinaryDataAtAddress(FuncAddr); + assert(FuncBD); + + BinaryFunction *BF = BC.getFunctionForSymbol(FuncBD->getSymbol()); + if (!BF) { if (opts::Verbosity >= 1) errs() << "BOLT-WARNING: Reorder functions: can't find function " << "for " << Function << "\n"; ++InvalidEntries; - continue; - } - - for (const uint64_t FuncAddr : FuncAddrs) { - const BinaryData *FuncBD = BC.getBinaryDataAtAddress(FuncAddr); - assert(FuncBD); - - BinaryFunction *BF = BC.getFunctionForSymbol(FuncBD->getSymbol()); - if (!BF) { - if (opts::Verbosity >= 1) - errs() << "BOLT-WARNING: Reorder functions: can't find function " - << "for " << Function << "\n"; - ++InvalidEntries; - break; - } - if (!BF->hasValidIndex()) - BF->setIndex(Index++); - else if (opts::Verbosity > 0) - errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function - << "\n"; + break; } + if (!BF->hasValidIndex()) + BF->setIndex(Index++); + else if (opts::Verbosity > 0) + errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function + << "\n"; } - if (InvalidEntries) - errs() << "BOLT-WARNING: Reorder functions: can't find functions for " - << InvalidEntries << " entries in -function-order list\n"; } - break; + if (InvalidEntries) + errs() << "BOLT-WARNING: Reorder functions: can't find functions for " + << InvalidEntries << " entries in -function-order list\n"; + } break; } reorder(std::move(Clusters), BFs); diff --git a/bolt/lib/Passes/ShrinkWrapping.cpp b/bolt/lib/Passes/ShrinkWrapping.cpp index cdf38e35ee87da5b3c222707e69dbdc8cf5d3406..17f169cc332b644e3a4ee99fb00c33fb873dab8e 100644 --- a/bolt/lib/Passes/ShrinkWrapping.cpp +++ b/bolt/lib/Passes/ShrinkWrapping.cpp @@ -1960,7 +1960,7 @@ bool 
ShrinkWrapping::perform(bool HotOnly) { for (const auto &Instr : *BB) { if (BC.MIB->isPseudo(Instr)) continue; - if (BC.MIB->isStore(Instr)) + if (BC.MIB->mayStore(Instr)) TotalStoreInstrs += BBExecCount; TotalInstrs += BBExecCount; } diff --git a/bolt/lib/Passes/StokeInfo.cpp b/bolt/lib/Passes/StokeInfo.cpp index cbd2c3c7a1a1255c818de812e6eff78ee0d3d1ca..57e5a08113dd0f11bee6f5d2391e50c3d2a80a2b 100644 --- a/bolt/lib/Passes/StokeInfo.cpp +++ b/bolt/lib/Passes/StokeInfo.cpp @@ -75,7 +75,7 @@ void StokeInfo::checkInstr(const BinaryFunction &BF, StokeFuncInfo &FuncInfo) { if (IsPush) FuncInfo.StackOut = true; - if (MIB->isStore(It) && !IsPush && !IsRipAddr) + if (MIB->mayStore(It) && !IsPush && !IsRipAddr) FuncInfo.HeapOut = true; if (IsRipAddr) diff --git a/bolt/lib/Passes/TailDuplication.cpp b/bolt/lib/Passes/TailDuplication.cpp index c04efd759bf3030f0b818b1d38ad42f49355d3f6..7141d5d99aa65e188ddffa72a6065b9550ee658a 100644 --- a/bolt/lib/Passes/TailDuplication.cpp +++ b/bolt/lib/Passes/TailDuplication.cpp @@ -303,7 +303,7 @@ TailDuplication::aggressiveDuplicate(BinaryBasicBlock &BB, if (isInCacheLine(BB, Tail)) return BlocksToDuplicate; - BinaryBasicBlock *CurrBB = &BB; + BinaryBasicBlock *CurrBB = &Tail; while (CurrBB) { LLVM_DEBUG(dbgs() << "Aggressive tail duplication: adding " << CurrBB->getName() << " to duplication list\n";); diff --git a/bolt/lib/Passes/ValidateInternalCalls.cpp b/bolt/lib/Passes/ValidateInternalCalls.cpp index 22dadf4f6403be3699cf0a0a2dc7521ced10e49d..516f91acb5084417e4844cd4948e525f79cefaa0 100644 --- a/bolt/lib/Passes/ValidateInternalCalls.cpp +++ b/bolt/lib/Passes/ValidateInternalCalls.cpp @@ -281,18 +281,16 @@ bool ValidateInternalCalls::analyzeFunction(BinaryFunction &Function) const { LLVM_DEBUG({ dbgs() << "Detected out-of-range PIC reference in " << Function << "\nReturn address load: "; - BC.InstPrinter->printInst(TargetInst, 0, "", *BC.STI, dbgs()); - dbgs() << "\nUse: "; - BC.InstPrinter->printInst(&Use, 0, "", *BC.STI, dbgs()); - dbgs() << "\n"; + BC.dump(*TargetInst); + dbgs() << "Use: "; + BC.dump(Use); Function.dump(); }); return false; } LLVM_DEBUG({ dbgs() << "Validated access: "; - BC.InstPrinter->printInst(&Use, 0, "", *BC.STI, dbgs()); - dbgs() << "\n"; + BC.dump(Use); }); } if (!UseDetected) { diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp index 57a850eb17234c98a86a85d26f75ff5e31c1de7b..e004309e0e21365008774d033f25b65588f3ff24 100644 --- a/bolt/lib/Profile/BoltAddressTranslation.cpp +++ b/bolt/lib/Profile/BoltAddressTranslation.cpp @@ -46,9 +46,14 @@ void BoltAddressTranslation::writeEntriesForBB(MapTy &Map, // allowing it to overwrite the previously inserted key in the map. Map[BBOutputOffset] = BBInputOffset; - for (const auto &IOPair : BB.getOffsetTranslationTable()) { - const uint64_t OutputOffset = IOPair.first + BBOutputOffset; - const uint32_t InputOffset = IOPair.second; + const auto &IOAddressMap = + BB.getFunction()->getBinaryContext().getIOAddressMap(); + + for (const auto &[InputOffset, Sym] : BB.getLocSyms()) { + const auto InputAddress = BB.getFunction()->getAddress() + InputOffset; + const auto OutputAddress = IOAddressMap.lookup(InputAddress); + assert(OutputAddress && "Unknown instruction address"); + const auto OutputOffset = *OutputAddress - FuncAddress; // Is this the first instruction in the BB? No need to duplicate the entry. 
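// With the per-block offset translation tables gone, the BAT entries above
// are derived from LocSyms plus the global input->output address map. A
// minimal sketch of that arithmetic, with std::unordered_multimap standing
// in for AddressMap and made-up addresses:

#include <cassert>
#include <cstdint>
#include <optional>
#include <unordered_map>

using Addr2AddrMapTy = std::unordered_multimap<uint64_t, uint64_t>;

std::optional<uint64_t> lookup(const Addr2AddrMapTy &Map, uint64_t InputAddr) {
  auto It = Map.find(InputAddr);
  if (It != Map.end())
    return It->second;
  return std::nullopt;
}

int main() {
  const uint64_t FuncInputAddress = 0x1000;  // function address in the input
  const uint64_t FuncOutputAddress = 0x5000; // function address after rewriting
  Addr2AddrMapTy Map;
  Map.insert({0x1010, 0x5020}); // an instruction moved by the rewriter

  const uint64_t InputOffset = 0x10; // offset recorded in LocSyms
  std::optional<uint64_t> OutputAddress =
      lookup(Map, FuncInputAddress + InputOffset);
  assert(OutputAddress && "unknown instruction address");
  const uint64_t OutputOffset = *OutputAddress - FuncOutputAddress; // 0x20
  (void)OutputOffset;
  return 0;
}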
if (OutputOffset == BBOutputOffset) diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 24dbe34b2f6a0da974392788eb8e3c34664e6038..cbc079afbb7e4f00afcde7680fd80a9acd825893 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -46,6 +46,11 @@ static cl::opt cl::desc("aggregate basic samples (without LBR info)"), cl::cat(AggregatorCategory)); +static cl::opt + ITraceAggregation("itrace", + cl::desc("Generate LBR info with perf itrace argument"), + cl::cat(AggregatorCategory)); + static cl::opt FilterMemProfile("filter-mem-profile", cl::desc("if processing a memory profile, filter out stack or heap accesses " @@ -163,16 +168,23 @@ void DataAggregator::start() { findPerfExecutable(); - if (opts::BasicAggregation) + if (opts::BasicAggregation) { launchPerfProcess("events without LBR", MainEventsPPI, "script -F pid,event,ip", /*Wait = */false); - else + } else if (!opts::ITraceAggregation.empty()) { + std::string ItracePerfScriptArgs = llvm::formatv( + "script -F pid,ip,brstack --itrace={0}", opts::ITraceAggregation); + launchPerfProcess("branch events with itrace", MainEventsPPI, + ItracePerfScriptArgs.c_str(), + /*Wait = */ false); + } else { launchPerfProcess("branch events", MainEventsPPI, "script -F pid,ip,brstack", /*Wait = */false); + } // Note: we launch script for mem events regardless of the option, as the // command fails fairly fast if mem events were not collected. @@ -1479,13 +1491,10 @@ std::error_code DataAggregator::parseBranchEvents() { NumTraces += parseLBRSample(Sample, NeedsSkylakeFix); } - for (const auto &LBR : BranchLBRs) { - const Trace &Trace = LBR.first; - if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Trace.From)) - BF->setHasProfileAvailable(); - if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Trace.To)) - BF->setHasProfileAvailable(); - } + for (const Trace &Trace : llvm::make_first_range(BranchLBRs)) + for (const uint64_t Addr : {Trace.From, Trace.To}) + if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Addr)) + BF->setHasProfileAvailable(); auto printColored = [](raw_ostream &OS, float Percent, float T1, float T2) { OS << " ("; @@ -1721,12 +1730,9 @@ std::error_code DataAggregator::parsePreAggregatedLBRSamples() { if (std::error_code EC = AggrEntry.getError()) return EC; - if (BinaryFunction *BF = - getBinaryFunctionContainingAddress(AggrEntry->From.Offset)) - BF->setHasProfileAvailable(); - if (BinaryFunction *BF = - getBinaryFunctionContainingAddress(AggrEntry->To.Offset)) - BF->setHasProfileAvailable(); + for (const uint64_t Addr : {AggrEntry->From.Offset, AggrEntry->To.Offset}) + if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Addr)) + BF->setHasProfileAvailable(); AggregatedLBRs.emplace_back(std::move(AggrEntry.get())); } diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp index 0e12e8cb307002d37d75751c59d3d40f199d2bd9..dcc7578041fae6bfff826f98ecfa7a8ec01f98bf 100644 --- a/bolt/lib/Profile/DataReader.cpp +++ b/bolt/lib/Profile/DataReader.cpp @@ -698,7 +698,8 @@ bool DataReader::recordBranch(BinaryFunction &BF, uint64_t From, uint64_t To, if (!BC.MIB->isNoop(Instr)) break; - Offset += BC.MIB->getAnnotationWithDefault(Instr, "Size"); + if (std::optional Size = BC.MIB->getSize(Instr)) + Offset += *Size; } if (To == Offset) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index b009d57a0e6e43a3362b454acad0e8e62b245d9c..d00bf87ffc8ad871ecba876be40dc8b822a5503b 
100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -27,17 +27,18 @@ #include "bolt/Core/HashUtilities.h" #include "bolt/Profile/YAMLProfileReader.h" +#include "llvm/ADT/Bitfields.h" #include "llvm/ADT/Hashing.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/SampleProfileInference.h" #include +using namespace llvm; + #undef DEBUG_TYPE #define DEBUG_TYPE "bolt-prof" -using namespace llvm; - namespace opts { extern cl::OptionCategory BoltOptCategory; @@ -72,64 +73,39 @@ cl::opt StaleMatchingJoinIslands( cl::opt StaleMatchingCostBlockInc( "stale-matching-cost-block-inc", - cl::desc("The cost of increasing a block's count by one."), cl::init(110), + cl::desc("The cost of increasing a block count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); cl::opt StaleMatchingCostBlockDec( "stale-matching-cost-block-dec", - cl::desc("The cost of decreasing a block's count by one."), cl::init(100), + cl::desc("The cost of decreasing a block count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); -cl::opt StaleMatchingCostBlockEntryInc( - "stale-matching-cost-block-entry-inc", - cl::desc("The cost of increasing the entry block's count by one."), - cl::init(110), cl::ReallyHidden, cl::cat(BoltOptCategory)); - -cl::opt StaleMatchingCostBlockEntryDec( - "stale-matching-cost-block-entry-dec", - cl::desc("The cost of decreasing the entry block's count by one."), - cl::init(100), cl::ReallyHidden, cl::cat(BoltOptCategory)); - -cl::opt StaleMatchingCostBlockZeroInc( - "stale-matching-cost-block-zero-inc", - cl::desc("The cost of increasing a count of zero-weight block by one."), - cl::init(10), cl::Hidden, cl::cat(BoltOptCategory)); - -cl::opt StaleMatchingCostBlockUnknownInc( - "stale-matching-cost-block-unknown-inc", - cl::desc("The cost of increasing an unknown block's count by one."), - cl::init(10), cl::ReallyHidden, cl::cat(BoltOptCategory)); - cl::opt StaleMatchingCostJumpInc( "stale-matching-cost-jump-inc", - cl::desc("The cost of increasing a jump's count by one."), cl::init(100), + cl::desc("The cost of increasing a jump count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); -cl::opt StaleMatchingCostJumpFTInc( - "stale-matching-cost-jump-ft-inc", - cl::desc("The cost of increasing a fall-through jump's count by one."), - cl::init(100), cl::ReallyHidden, cl::cat(BoltOptCategory)); - cl::opt StaleMatchingCostJumpDec( "stale-matching-cost-jump-dec", - cl::desc("The cost of decreasing a jump's count by one."), cl::init(110), + cl::desc("The cost of decreasing a jump count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); -cl::opt StaleMatchingCostJumpFTDec( - "stale-matching-cost-jump-ft-dec", - cl::desc("The cost of decreasing a fall-through jump's count by one."), - cl::init(110), cl::ReallyHidden, cl::cat(BoltOptCategory)); +cl::opt StaleMatchingCostBlockUnknownInc( + "stale-matching-cost-block-unknown-inc", + cl::desc("The cost of increasing an unknown block count by one."), + cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory)); cl::opt StaleMatchingCostJumpUnknownInc( "stale-matching-cost-jump-unknown-inc", - cl::desc("The cost of increasing an unknown jump's count by one."), - cl::init(50), cl::ReallyHidden, cl::cat(BoltOptCategory)); + cl::desc("The cost of increasing an unknown jump count by one."), + cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory)); cl::opt StaleMatchingCostJumpUnknownFTInc( 
"stale-matching-cost-jump-unknown-ft-inc", cl::desc( - "The cost of increasing an unknown fall-through jump's count by one."), - cl::init(5), cl::ReallyHidden, cl::cat(BoltOptCategory)); + "The cost of increasing an unknown fall-through jump count by one."), + cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory)); } // namespace opts @@ -141,49 +117,32 @@ namespace bolt { /// components are of smaller size (e.g., uint16_t or uint8_t). struct BlendedBlockHash { private: - static uint64_t combineHashes(uint16_t Hash1, uint16_t Hash2, uint16_t Hash3, - uint16_t Hash4) { - uint64_t Hash = 0; - - Hash |= uint64_t(Hash4); - Hash <<= 16; - - Hash |= uint64_t(Hash3); - Hash <<= 16; - - Hash |= uint64_t(Hash2); - Hash <<= 16; - - Hash |= uint64_t(Hash1); - - return Hash; - } - - static void parseHashes(uint64_t Hash, uint16_t &Hash1, uint16_t &Hash2, - uint16_t &Hash3, uint16_t &Hash4) { - Hash1 = Hash & 0xffff; - Hash >>= 16; - - Hash2 = Hash & 0xffff; - Hash >>= 16; - - Hash3 = Hash & 0xffff; - Hash >>= 16; - - Hash4 = Hash & 0xffff; - Hash >>= 16; - } + using ValueOffset = Bitfield::Element; + using ValueOpcode = Bitfield::Element; + using ValueInstr = Bitfield::Element; + using ValuePred = Bitfield::Element; + using ValueSucc = Bitfield::Element; public: explicit BlendedBlockHash() {} - explicit BlendedBlockHash(uint64_t CombinedHash) { - parseHashes(CombinedHash, Offset, OpcodeHash, InstrHash, NeighborHash); + explicit BlendedBlockHash(uint64_t Hash) { + Offset = Bitfield::get(Hash); + OpcodeHash = Bitfield::get(Hash); + InstrHash = Bitfield::get(Hash); + PredHash = Bitfield::get(Hash); + SuccHash = Bitfield::get(Hash); } /// Combine the blended hash into uint64_t. uint64_t combine() const { - return combineHashes(Offset, OpcodeHash, InstrHash, NeighborHash); + uint64_t Hash = 0; + Bitfield::set(Hash, Offset); + Bitfield::set(Hash, OpcodeHash); + Bitfield::set(Hash, InstrHash); + Bitfield::set(Hash, PredHash); + Bitfield::set(Hash, SuccHash); + return Hash; } /// Compute a distance between two given blended hashes. The smaller the @@ -194,7 +153,8 @@ public: "incorrect blended hash distance computation"); uint64_t Dist = 0; // Account for NeighborHash - Dist += NeighborHash == BBH.NeighborHash ? 0 : 1; + Dist += SuccHash == BBH.SuccHash ? 0 : 1; + Dist += PredHash == BBH.PredHash ? 0 : 1; Dist <<= 16; // Account for InstrHash Dist += InstrHash == BBH.InstrHash ? 0 : 1; @@ -211,9 +171,10 @@ public: /// (Strong) Hash of the basic block instructions, including opcodes and /// operands. uint16_t InstrHash{0}; - /// Hash of the (loose) basic block together with (loose) hashes of its - /// successors and predecessors. - uint16_t NeighborHash{0}; + /// (Loose) Hashes of the predecessors of the basic block. + uint8_t PredHash{0}; + /// (Loose) Hashes of the successors of the basic block. + uint8_t SuccHash{0}; }; /// The object is used to identify and match basic blocks in a BinaryFunction @@ -236,14 +197,11 @@ public: /// Find the most similar block for a given hash. 
  const FlowBlock *matchBlock(BlendedBlockHash BlendedHash) const {
     auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
-    if (BlockIt == OpHashToBlocks.end()) {
+    if (BlockIt == OpHashToBlocks.end())
       return nullptr;
-    }
     FlowBlock *BestBlock = nullptr;
     uint64_t BestDist = std::numeric_limits<uint64_t>::max();
-    for (auto It : BlockIt->second) {
-      FlowBlock *Block = It.second;
-      BlendedBlockHash Hash = It.first;
+    for (const auto &[Hash, Block] : BlockIt->second) {
       uint64_t Dist = Hash.distance(BlendedHash);
       if (BestBlock == nullptr || Dist < BestDist) {
         BestDist = Dist;
@@ -253,6 +211,14 @@ public:
     return BestBlock;
   }
 
+  /// Returns true if the two basic blocks (in the binary and in the profile)
+  /// corresponding to the given hashes are matched to each other with a high
+  /// confidence.
+  static bool isHighConfidenceMatch(BlendedBlockHash Hash1,
+                                    BlendedBlockHash Hash2) {
+    return Hash1.InstrHash == Hash2.InstrHash;
+  }
+
 private:
   using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
   std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
@@ -266,46 +232,49 @@ void BinaryFunction::computeBlockHashes() const {
   std::vector<BlendedBlockHash> BlendedHashes(BasicBlocks.size());
   std::vector<uint64_t> OpcodeHashes(BasicBlocks.size());
-  // Initialize hash components
+  // Initialize hash components.
   for (size_t I = 0; I < BasicBlocks.size(); I++) {
     const BinaryBasicBlock *BB = BasicBlocks[I];
     assert(BB->getIndex() == I && "incorrect block index");
     BlendedHashes[I].Offset = BB->getOffset();
-    // Hashing complete instructions
+    // Hashing complete instructions.
     std::string InstrHashStr = hashBlock(
         BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); });
     uint64_t InstrHash = std::hash<std::string>{}(InstrHashStr);
-    BlendedHashes[I].InstrHash = hash_64_to_16(InstrHash);
-    // Hashing opcodes
-    std::string OpcodeHashStr =
-        hashBlock(BC, *BB, [](const MCOperand &Op) { return std::string(); });
+    BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash);
+    // Hashing opcodes.
+    std::string OpcodeHashStr = hashBlockLoose(BC, *BB);
     OpcodeHashes[I] = std::hash<std::string>{}(OpcodeHashStr);
-    BlendedHashes[I].OpcodeHash = hash_64_to_16(OpcodeHashes[I]);
+    BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]);
   }
 
-  // Initialize neighbor hash
+  // Initialize neighbor hash.
   for (size_t I = 0; I < BasicBlocks.size(); I++) {
     const BinaryBasicBlock *BB = BasicBlocks[I];
-    uint64_t Hash = OpcodeHashes[I];
-    // Append hashes of successors
+    // Append hashes of successors.
+    uint64_t Hash = 0;
     for (BinaryBasicBlock *SuccBB : BB->successors()) {
       uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()];
       Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
     }
-    // Append hashes of predecessors
+    BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash);
+
+    // Append hashes of predecessors.
+    Hash = 0;
     for (BinaryBasicBlock *PredBB : BB->predecessors()) {
       uint64_t PredHash = OpcodeHashes[PredBB->getIndex()];
       Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
     }
-    BlendedHashes[I].NeighborHash = hash_64_to_16(Hash);
+    BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash);
  }
 
-  // Assign hashes
+  // Assign hashes.
   for (size_t I = 0; I < BasicBlocks.size(); I++) {
     const BinaryBasicBlock *BB = BasicBlocks[I];
     BB->setHash(BlendedHashes[I].combine());
   }
 }
+
 /// Create a wrapper flow function to use with the profile inference algorithm,
 /// and initialize its jumps and metadata.
 FlowFunction
@@ -314,7 +283,7 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
   // Add a special "dummy" source so that there is always a unique entry point.
// Because of the extra source, for all other blocks in FlowFunction it holds - // that Block.Index == BB->getLayoutIndex() + 1 + // that Block.Index == BB->getIndex() + 1 FlowBlock EntryBlock; EntryBlock.Index = 0; Func.Blocks.push_back(EntryBlock); @@ -325,7 +294,7 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) { FlowBlock &Block = Func.Blocks.back(); Block.Index = Func.Blocks.size() - 1; (void)BB; - assert(Block.Index == BB->getLayoutIndex() + 1 && + assert(Block.Index == BB->getIndex() + 1 && "incorrectly assigned basic block index"); } @@ -341,8 +310,8 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) { Func.Jumps.emplace_back(); FlowJump &Jump = Func.Jumps.back(); - Jump.Source = SrcBB->getLayoutIndex() + 1; - Jump.Target = DstBB->getLayoutIndex() + 1; + Jump.Source = SrcBB->getIndex() + 1; + Jump.Target = DstBB->getIndex() + 1; InDegree[Jump.Target]++; UniqueSuccs.insert(DstBB); } @@ -354,8 +323,8 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) { Func.Jumps.emplace_back(); FlowJump &Jump = Func.Jumps.back(); - Jump.Source = SrcBB->getLayoutIndex() + 1; - Jump.Target = DstBB->getLayoutIndex() + 1; + Jump.Source = SrcBB->getIndex() + 1; + Jump.Target = DstBB->getIndex() + 1; InDegree[Jump.Target]++; UniqueSuccs.insert(DstBB); } @@ -393,7 +362,8 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) { /// of the basic blocks in the binary, the count is "matched" to the block. /// Similarly, if both the source and the target of a count in the profile are /// matched to a jump in the binary, the count is recorded in CFG. -void matchWeightsByHashes(const BinaryFunction::BasicBlockOrderType &BlockOrder, +void matchWeightsByHashes(BinaryContext &BC, + const BinaryFunction::BasicBlockOrderType &BlockOrder, const yaml::bolt::BinaryFunctionProfile &YamlBF, FlowFunction &Func) { assert(Func.Blocks.size() == BlockOrder.size() + 1); @@ -417,19 +387,34 @@ void matchWeightsByHashes(const BinaryFunction::BasicBlockOrderType &BlockOrder, // Match blocks from the profile to the blocks in CFG for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) { assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile"); - BlendedBlockHash BlendedHash(YamlBB.Hash); - const FlowBlock *MatchedBlock = Matcher.matchBlock(BlendedHash); + BlendedBlockHash YamlHash(YamlBB.Hash); + const FlowBlock *MatchedBlock = Matcher.matchBlock(YamlHash); + // Always match the entry block. + if (MatchedBlock == nullptr && YamlBB.Index == 0) + MatchedBlock = Blocks[0]; if (MatchedBlock != nullptr) { MatchedBlocks[YamlBB.Index] = MatchedBlock; - LLVM_DEBUG(dbgs() << "Matched yaml block with bid = " << YamlBB.Index - << " and hash = " << Twine::utohexstr(YamlBB.Hash) - << " to BB with index = " << MatchedBlock->Index - 1 + BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1]; + LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBB.Index << ")" + << " with hash " << Twine::utohexstr(YamlBB.Hash) + << " to BB (index = " << MatchedBlock->Index - 1 << ")" + << " with hash " << Twine::utohexstr(BinHash.combine()) << "\n"); + // Update matching stats accounting for the matched block. 
+ if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) { + ++BC.Stats.NumMatchedBlocks; + BC.Stats.MatchedSampleCount += YamlBB.ExecCount; + LLVM_DEBUG(dbgs() << " exact match\n"); + } } else { LLVM_DEBUG( - dbgs() << "Couldn't match yaml block with bid = " << YamlBB.Index - << " and hash = " << Twine::utohexstr(YamlBB.Hash) << "\n"); + dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index << ")" + << " with hash " << Twine::utohexstr(YamlBB.Hash) << "\n"); } + + // Update matching stats. + ++BC.Stats.NumStaleBlocks; + BC.Stats.StaleSampleCount += YamlBB.ExecCount; } // Match jumps from the profile to the jumps from CFG @@ -475,7 +460,7 @@ void matchWeightsByHashes(const BinaryFunction::BasicBlockOrderType &BlockOrder, // Assign block counts based on in-/out- jumps for (FlowBlock &Block : Func.Blocks) { if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) { - assert(Block.HasUnknownWeight && "unmatched block with positive count"); + assert(Block.HasUnknownWeight && "unmatched block with a positive count"); continue; } Block.HasUnknownWeight = false; @@ -577,16 +562,15 @@ void applyInference(FlowFunction &Func) { Params.JoinIslands = opts::StaleMatchingJoinIslands; Params.CostBlockInc = opts::StaleMatchingCostBlockInc; + Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc; Params.CostBlockDec = opts::StaleMatchingCostBlockDec; - Params.CostBlockEntryInc = opts::StaleMatchingCostBlockEntryInc; - Params.CostBlockEntryDec = opts::StaleMatchingCostBlockEntryDec; - Params.CostBlockZeroInc = opts::StaleMatchingCostBlockZeroInc; + Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec; Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc; Params.CostJumpInc = opts::StaleMatchingCostJumpInc; - Params.CostJumpFTInc = opts::StaleMatchingCostJumpFTInc; + Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc; Params.CostJumpDec = opts::StaleMatchingCostJumpDec; - Params.CostJumpFTDec = opts::StaleMatchingCostJumpFTDec; + Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec; Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc; Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc; @@ -691,31 +675,33 @@ void assignProfile(BinaryFunction &BF, bool YAMLProfileReader::inferStaleProfile( BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) { - // Make sure that block indices and hashes are up to date - BF.getLayout().updateLayoutIndices(); + LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for " + << "\"" << BF.getPrintName() << "\"\n"); + + // Make sure that block hashes are up to date. BF.computeBlockHashes(); const BinaryFunction::BasicBlockOrderType BlockOrder( BF.getLayout().block_begin(), BF.getLayout().block_end()); - // Create a wrapper flow function to use with the profile inference algorithm + // Create a wrapper flow function to use with the profile inference algorithm. FlowFunction Func = createFlowFunction(BlockOrder); // Match as many block/jump counts from the stale profile as possible - matchWeightsByHashes(BlockOrder, YamlBF, Func); + matchWeightsByHashes(BF.getBinaryContext(), BlockOrder, YamlBF, Func); // Adjust the flow function by marking unreachable blocks Unlikely so that - // they don't get any counts assigned + // they don't get any counts assigned. preprocessUnreachableBlocks(Func); - // Check if profile inference can be applied for the instance + // Check if profile inference can be applied for the instance. 
if (!canApplyInference(Func)) return false; - // Apply the profile inference algorithm + // Apply the profile inference algorithm. applyInference(Func); - // Collect inferred counts and update function annotations + // Collect inferred counts and update function annotations. assignProfile(BF, BlockOrder, Func); // As of now, we always mark the binary function having "correct" profile. diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index 90e43b402750d89617774ebccac2b1bce7ecec74..3fd489b570d6c17a3591d97dc7bdd032c6348960 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -250,9 +250,6 @@ bool YAMLProfileReader::parseFunctionProfile( << " edges in profile did not match function " << BF << '\n'; if (!ProfileMatched && opts::InferStaleProfile) { - if (opts::Verbosity >= 1) - outs() << "BOLT-INFO: applying profile inference for " - << "\"" << BF.getPrintName() << "\"\n"; if (inferStaleProfile(BF, YamlBF)) { ProfileMatched = true; BF.markProfiled(YamlBP.Header.Flags); @@ -355,8 +352,10 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) { continue; yaml::bolt::BinaryFunctionProfile &YamlBF = *PI->getValue(); - if (profileMatches(YamlBF, Function)) + if (profileMatches(YamlBF, Function)) { matchProfileToFunction(YamlBF, Function); + break; + } } } diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp index 517984d990fc5a3ecfe8a2d84b910e69d00c3311..5aab26322537292216ce4abb640bc30b337165b6 100644 --- a/bolt/lib/Rewrite/BinaryPassManager.cpp +++ b/bolt/lib/Rewrite/BinaryPassManager.cpp @@ -72,6 +72,11 @@ static cl::opt JTFootprintReductionFlag( "instructions at jump sites"), cl::cat(BoltOptCategory)); +static cl::opt + KeepNops("keep-nops", + cl::desc("keep no-op instructions. By default they are removed."), + cl::Hidden, cl::cat(BoltOptCategory)); + cl::opt NeverPrint("never-print", cl::desc("never print"), cl::ReallyHidden, cl::cat(BoltOptCategory)); @@ -359,7 +364,8 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { Manager.registerPass(std::make_unique(NeverPrint)); - Manager.registerPass(std::make_unique(NeverPrint)); + Manager.registerPass(std::make_unique(NeverPrint), + !opts::KeepNops); Manager.registerPass(std::make_unique(PrintNormalized)); diff --git a/bolt/lib/Rewrite/JITLinkLinker.cpp b/bolt/lib/Rewrite/JITLinkLinker.cpp index 3c74fd5932bf001db04b6b93adceab82c039eaab..994450c75fcfb311037fa9ba08b582e85e72f1cd 100644 --- a/bolt/lib/Rewrite/JITLinkLinker.cpp +++ b/bolt/lib/Rewrite/JITLinkLinker.cpp @@ -31,7 +31,7 @@ bool hasSymbols(const jitlink::Block &B) { Error markSectionsLive(jitlink::LinkGraph &G) { for (auto &Section : G.sections()) { // We only need allocatable sections. - if (Section.getMemLifetimePolicy() == orc::MemLifetimePolicy::NoAlloc) + if (Section.getMemLifetime() == orc::MemLifetime::NoAlloc) continue; // Skip empty sections. 
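// In the JITLinkLinker hunk below, the linker's symbol table starts keeping
// a size next to each address, which is why lookupSymbol is renamed to
// lookupSymbolInfo. A minimal sketch of that table shape (SymbolInfo fields
// inferred from the usage in the patch; toy types, not BOLT's linker):

#include <cstdint>
#include <optional>
#include <string>
#include <unordered_map>

struct SymbolInfo {
  uint64_t Address;
  uint64_t Size;
};

class ToyLinker {
  std::unordered_map<std::string, SymbolInfo> Symtab;

public:
  void define(std::string Name, uint64_t Address, uint64_t Size) {
    Symtab.insert({std::move(Name), SymbolInfo{Address, Size}});
  }

  std::optional<SymbolInfo> lookupSymbolInfo(const std::string &Name) const {
    auto It = Symtab.find(Name);
    if (It == Symtab.end())
      return std::nullopt;
    return It->second;
  }
};

int main() {
  ToyLinker L;
  L.define("__bolt_runtime_start", 0x400000, 16);
  return L.lookupSymbolInfo("__bolt_runtime_start") ? 0 : 1;
}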
@@ -142,8 +142,8 @@ struct JITLinkLinker::Context : jitlink::JITLinkContext { }); for (auto *Symbol : G.defined_symbols()) { - Linker.Symtab.insert( - {Symbol->getName().str(), Symbol->getAddress().getValue()}); + SymbolInfo Info{Symbol->getAddress().getValue(), Symbol->getSize()}; + Linker.Symtab.insert({Symbol->getName().str(), Info}); } return Error::success(); @@ -174,7 +174,8 @@ void JITLinkLinker::loadObject(MemoryBufferRef Obj, jitlink::link(std::move(*LG), std::move(Ctx)); } -std::optional JITLinkLinker::lookupSymbol(StringRef Name) const { +std::optional +JITLinkLinker::lookupSymbolInfo(StringRef Name) const { auto It = Symtab.find(Name.data()); if (It == Symtab.end()) return std::nullopt; diff --git a/bolt/lib/Rewrite/MachORewriteInstance.cpp b/bolt/lib/Rewrite/MachORewriteInstance.cpp index fc7500a6deb08b399e1c14536fe1475cc997a6e7..b827a196c82653aaa79bf0b065314d9befc571f2 100644 --- a/bolt/lib/Rewrite/MachORewriteInstance.cpp +++ b/bolt/lib/Rewrite/MachORewriteInstance.cpp @@ -20,7 +20,6 @@ #include "bolt/Rewrite/JITLinkLinker.h" #include "bolt/RuntimeLibs/InstrumentationRuntimeLibrary.h" #include "bolt/Utils/Utils.h" -#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCObjectStreamer.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" @@ -476,9 +475,6 @@ void MachORewriteInstance::emitAndLink() { "error creating in-memory object"); assert(Obj && "createObjectFile cannot return nullptr"); - MCAsmLayout FinalLayout( - static_cast(Streamer.get())->getAssembler()); - auto EFMM = std::make_unique(*BC); EFMM->setNewSecPrefix(getNewSecPrefix()); EFMM->setOrgSecPrefix(getOrgSecPrefix()); @@ -568,8 +564,10 @@ void MachORewriteInstance::rewriteFile() { writeInstrumentationSection("I__literal16", OS); Out->keep(); - EC = sys::fs::setPermissions(opts::OutputFilename, - sys::fs::perms::all_all); + EC = sys::fs::setPermissions( + opts::OutputFilename, + static_cast(sys::fs::perms::all_all & + ~sys::fs::getUmask())); check_error(EC, "cannot set permissions of output file"); } diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index 64b8a8b6d400fe85efd2d3e7cc507347461ea7c3..316b83cfbd38a52c822ca1eaa6840199b9c12296 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -183,9 +183,7 @@ void PseudoProbeRewriter::updatePseudoProbes() { // A call probe may be duplicated due to ICP // Go through output of InputOffsetToAddressMap to collect all related // probes - const InputOffsetToAddressMapTy &Offset2Addr = - F->getInputOffsetToAddressMap(); - auto CallOutputAddresses = Offset2Addr.equal_range(Offset); + auto CallOutputAddresses = BC.getIOAddressMap().lookupAll(AP.first); auto CallOutputAddress = CallOutputAddresses.first; if (CallOutputAddress == CallOutputAddresses.second) { Probe->setAddress(INT64_MAX); diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 1ade842c4ee0539513d85a20385ddbc5f1793ace..7063b243b52dcc7f5fb5dc06a6c3773efd2bbb76 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "bolt/Rewrite/RewriteInstance.h" +#include "bolt/Core/AddressMap.h" #include "bolt/Core/BinaryContext.h" #include "bolt/Core/BinaryEmitter.h" #include "bolt/Core/BinaryFunction.h" @@ -407,8 +408,9 @@ static bool checkOffsets(const typename ELFT::Phdr &Phdr, return true; // Only non-empty sections can be at the 
end of a segment. - uint64_t SectionSize = Sec.sh_size ? Sec.sh_size : 1; - AddressRange SectionAddressRange(Sec.sh_offset, Sec.sh_offset + SectionSize); + uint64_t SectionSize = Sec.sh_size ? Sec.sh_size : 1ull; + AddressRange SectionAddressRange((uint64_t)Sec.sh_offset, + Sec.sh_offset + SectionSize); AddressRange SegmentAddressRange(Phdr.p_offset, Phdr.p_offset + Phdr.p_filesz); if (SegmentAddressRange.contains(SectionAddressRange)) @@ -424,8 +426,9 @@ template static bool checkVMA(const typename ELFT::Phdr &Phdr, const typename ELFT::Shdr &Sec, bool &Overlap) { // Only non-empty sections can be at the end of a segment. - uint64_t SectionSize = Sec.sh_size ? Sec.sh_size : 1; - AddressRange SectionAddressRange(Sec.sh_addr, Sec.sh_addr + SectionSize); + uint64_t SectionSize = Sec.sh_size ? Sec.sh_size : 1ull; + AddressRange SectionAddressRange((uint64_t)Sec.sh_addr, + Sec.sh_addr + SectionSize); AddressRange SegmentAddressRange(Phdr.p_vaddr, Phdr.p_vaddr + Phdr.p_memsz); if (SegmentAddressRange.contains(SectionAddressRange)) @@ -699,6 +702,10 @@ Error RewriteInstance::run() { adjustCommandLineOptions(); discoverFileObjects(); + if (opts::Instrument && !BC->IsStaticExecutable) + if (Error E = discoverRtFiniAddress()) + return E; + preprocessProfileData(); // Skip disassembling if we have a translation table and we are running an @@ -735,6 +742,9 @@ Error RewriteInstance::run() { updateMetadata(); + if (opts::Instrument && !BC->IsStaticExecutable) + updateRtFiniReloc(); + if (opts::LinuxKernelMode) { errs() << "BOLT-WARNING: not writing the output file for Linux Kernel\n"; return Error::success(); @@ -751,9 +761,6 @@ Error RewriteInstance::run() { void RewriteInstance::discoverFileObjects() { NamedRegionTimer T("discoverFileObjects", "discover file objects", TimerGroupName, TimerGroupDesc, opts::TimeRewrite); - FileSymRefs.clear(); - BC->getBinaryFunctions().clear(); - BC->clearBinaryData(); // For local symbols we want to keep track of associated FILE symbol name for // disambiguation by combined name. @@ -798,7 +805,12 @@ void RewriteInstance::discoverFileObjects() { } // Sort symbols in the file by value. Ignore symbols from non-allocatable - // sections. + // sections. We memoize getAddress(), as it has rather high overhead. + struct SymbolInfo { + uint64_t Address; + SymbolRef Symbol; + }; + std::vector SortedSymbols; auto isSymbolInMemory = [this](const SymbolRef &Sym) { if (cantFail(Sym.getType()) == SymbolRef::ST_File) return false; @@ -809,25 +821,22 @@ void RewriteInstance::discoverFileObjects() { BinarySection Section(*BC, *cantFail(Sym.getSection())); return Section.isAllocatable(); }; - std::vector SortedFileSymbols; - llvm::copy_if(InputFile->symbols(), std::back_inserter(SortedFileSymbols), - isSymbolInMemory); - auto CompareSymbols = [this](const SymbolRef &A, const SymbolRef &B) { - // Marker symbols have the highest precedence, while - // SECTIONs have the lowest. 
- auto AddressA = cantFail(A.getAddress()); - auto AddressB = cantFail(B.getAddress()); - if (AddressA != AddressB) - return AddressA < AddressB; - - bool AMarker = BC->isMarker(A); - bool BMarker = BC->isMarker(B); + for (const SymbolRef &Symbol : InputFile->symbols()) + if (isSymbolInMemory(Symbol)) + SortedSymbols.push_back({cantFail(Symbol.getAddress()), Symbol}); + + auto CompareSymbols = [this](const SymbolInfo &A, const SymbolInfo &B) { + if (A.Address != B.Address) + return A.Address < B.Address; + + const bool AMarker = BC->isMarker(A.Symbol); + const bool BMarker = BC->isMarker(B.Symbol); if (AMarker || BMarker) { return AMarker && !BMarker; } - auto AType = cantFail(A.getType()); - auto BType = cantFail(B.getType()); + const auto AType = cantFail(A.Symbol.getType()); + const auto BType = cantFail(B.Symbol.getType()); if (AType == SymbolRef::ST_Function && BType != SymbolRef::ST_Function) return true; if (BType == SymbolRef::ST_Debug && AType != SymbolRef::ST_Debug) @@ -835,11 +844,10 @@ void RewriteInstance::discoverFileObjects() { return false; }; + llvm::stable_sort(SortedSymbols, CompareSymbols); - llvm::stable_sort(SortedFileSymbols, CompareSymbols); - - auto LastSymbol = SortedFileSymbols.end(); - if (!SortedFileSymbols.empty()) + auto LastSymbol = SortedSymbols.end(); + if (!SortedSymbols.empty()) --LastSymbol; // For aarch64, the ABI defines mapping symbols so we identify data in the @@ -854,39 +862,34 @@ void RewriteInstance::discoverFileObjects() { }; std::vector SortedMarkerSymbols; - auto addExtraDataMarkerPerSymbol = - [this](const std::vector &SortedFileSymbols, - std::vector &SortedMarkerSymbols) { - bool IsData = false; - uint64_t LastAddr = 0; - for (auto Sym = SortedFileSymbols.begin(); - Sym < SortedFileSymbols.end(); ++Sym) { - uint64_t Address = cantFail(Sym->getAddress()); - if (LastAddr == Address) // don't repeat markers - continue; + auto addExtraDataMarkerPerSymbol = [&]() { + bool IsData = false; + uint64_t LastAddr = 0; + for (const auto &SymInfo : SortedSymbols) { + if (LastAddr == SymInfo.Address) // don't repeat markers + continue; - MarkerSymType MarkerType = BC->getMarkerType(*Sym); - if (MarkerType != MarkerSymType::NONE) { - SortedMarkerSymbols.push_back(MarkerSym{Address, MarkerType}); - LastAddr = Address; - IsData = MarkerType == MarkerSymType::DATA; - continue; - } + MarkerSymType MarkerType = BC->getMarkerType(SymInfo.Symbol); + if (MarkerType != MarkerSymType::NONE) { + SortedMarkerSymbols.push_back(MarkerSym{SymInfo.Address, MarkerType}); + LastAddr = SymInfo.Address; + IsData = MarkerType == MarkerSymType::DATA; + continue; + } - if (IsData) { - SortedMarkerSymbols.push_back( - MarkerSym{cantFail(Sym->getAddress()), MarkerSymType::DATA}); - LastAddr = Address; - } - } - }; + if (IsData) { + SortedMarkerSymbols.push_back({SymInfo.Address, MarkerSymType::DATA}); + LastAddr = SymInfo.Address; + } + } + }; - if (BC->isAArch64()) { - addExtraDataMarkerPerSymbol(SortedFileSymbols, SortedMarkerSymbols); + if (BC->isAArch64() || BC->isRISCV()) { + addExtraDataMarkerPerSymbol(); LastSymbol = std::stable_partition( - SortedFileSymbols.begin(), SortedFileSymbols.end(), - [this](const SymbolRef &Symbol) { return !BC->isMarker(Symbol); }); - if (!SortedFileSymbols.empty()) + SortedSymbols.begin(), SortedSymbols.end(), + [this](const SymbolInfo &S) { return !BC->isMarker(S.Symbol); }); + if (!SortedSymbols.empty()) --LastSymbol; } @@ -894,27 +897,21 @@ void RewriteInstance::discoverFileObjects() { unsigned AnonymousId = 0; // Regex object for matching 
cold fragments. - Regex ColdFragment(".*\\.cold(\\.[0-9]+)?"); - - const auto SortedSymbolsEnd = LastSymbol == SortedFileSymbols.end() - ? LastSymbol - : std::next(LastSymbol); - for (auto ISym = SortedFileSymbols.begin(); ISym != SortedSymbolsEnd; - ++ISym) { - const SymbolRef &Symbol = *ISym; - // Keep undefined symbols for pretty printing? - if (cantFail(Symbol.getFlags()) & SymbolRef::SF_Undefined) - continue; - + const Regex ColdFragment(".*\\.cold(\\.[0-9]+)?"); + + const auto SortedSymbolsEnd = + LastSymbol == SortedSymbols.end() ? LastSymbol : std::next(LastSymbol); + for (auto Iter = SortedSymbols.begin(); Iter != SortedSymbolsEnd; ++Iter) { + const SymbolRef &Symbol = Iter->Symbol; + const uint64_t SymbolAddress = Iter->Address; + const auto SymbolFlags = cantFail(Symbol.getFlags()); const SymbolRef::Type SymbolType = cantFail(Symbol.getType()); if (SymbolType == SymbolRef::ST_File) continue; StringRef SymName = cantFail(Symbol.getName(), "cannot get symbol name"); - uint64_t Address = - cantFail(Symbol.getAddress(), "cannot get symbol address"); - if (Address == 0) { + if (SymbolAddress == 0) { if (opts::Verbosity >= 1 && SymbolType == SymbolRef::ST_Function) errs() << "BOLT-WARNING: function with 0 address seen\n"; continue; @@ -924,11 +921,12 @@ void RewriteInstance::discoverFileObjects() { if (SymName == "__hot_start" || SymName == "__hot_end") continue; - FileSymRefs[Address] = Symbol; + FileSymRefs[SymbolAddress] = Symbol; // Skip section symbols that will be registered by disassemblePLT(). - if ((cantFail(Symbol.getType()) == SymbolRef::ST_Debug)) { - ErrorOr BSection = BC->getSectionForAddress(Address); + if (SymbolType == SymbolRef::ST_Debug) { + ErrorOr BSection = + BC->getSectionForAddress(SymbolAddress); if (BSection && getPLTSectionInfo(BSection->getName())) continue; } @@ -950,10 +948,10 @@ void RewriteInstance::discoverFileObjects() { std::string AlternativeName; if (Name.empty()) { UniqueName = "ANONYMOUS." + std::to_string(AnonymousId++); - } else if (cantFail(Symbol.getFlags()) & SymbolRef::SF_Global) { + } else if (SymbolFlags & SymbolRef::SF_Global) { if (const BinaryData *BD = BC->getBinaryDataByName(Name)) { if (BD->getSize() == ELFSymbolRef(Symbol).getSize() && - BD->getAddress() == Address) { + BD->getAddress() == SymbolAddress) { if (opts::Verbosity > 1) errs() << "BOLT-WARNING: ignoring duplicate global symbol " << Name << "\n"; @@ -989,14 +987,13 @@ void RewriteInstance::discoverFileObjects() { uint64_t SymbolSize = ELFSymbolRef(Symbol).getSize(); uint64_t SymbolAlignment = Symbol.getAlignment(); - unsigned SymbolFlags = cantFail(Symbol.getFlags()); auto registerName = [&](uint64_t FinalSize) { // Register names even if it's not a function, e.g. for an entry point. 
- BC->registerNameAtAddress(UniqueName, Address, FinalSize, SymbolAlignment, - SymbolFlags); + BC->registerNameAtAddress(UniqueName, SymbolAddress, FinalSize, + SymbolAlignment, SymbolFlags); if (!AlternativeName.empty()) - BC->registerNameAtAddress(AlternativeName, Address, FinalSize, + BC->registerNameAtAddress(AlternativeName, SymbolAddress, FinalSize, SymbolAlignment, SymbolFlags); }; @@ -1016,7 +1013,7 @@ void RewriteInstance::discoverFileObjects() { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: considering symbol " << UniqueName << " for function\n"); - if (Address == Section->getAddress() + Section->getSize()) { + if (SymbolAddress == Section->getAddress() + Section->getSize()) { assert(SymbolSize == 0 && "unexpect non-zero sized symbol at end of section"); LLVM_DEBUG( @@ -1042,11 +1039,12 @@ void RewriteInstance::discoverFileObjects() { // their local labels. The only way to tell them apart is to look at // symbol scope - global vs local. if (PreviousFunction && SymbolType != SymbolRef::ST_Function) { - if (PreviousFunction->containsAddress(Address)) { + if (PreviousFunction->containsAddress(SymbolAddress)) { if (PreviousFunction->isSymbolValidInScope(Symbol, SymbolSize)) { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: symbol is a function local symbol\n"); - } else if (Address == PreviousFunction->getAddress() && !SymbolSize) { + } else if (SymbolAddress == PreviousFunction->getAddress() && + !SymbolSize) { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: ignoring symbol as a marker\n"); } else if (opts::Verbosity > 1) { errs() << "BOLT-WARNING: symbol " << UniqueName @@ -1063,8 +1061,8 @@ void RewriteInstance::discoverFileObjects() { } } - if (PreviousFunction && PreviousFunction->containsAddress(Address) && - PreviousFunction->getAddress() != Address) { + if (PreviousFunction && PreviousFunction->containsAddress(SymbolAddress) && + PreviousFunction->getAddress() != SymbolAddress) { if (PreviousFunction->isSymbolValidInScope(Symbol, SymbolSize)) { if (opts::Verbosity >= 1) outs() << "BOLT-INFO: skipping possibly another entry for function " @@ -1076,12 +1074,12 @@ void RewriteInstance::discoverFileObjects() { registerName(0); - PreviousFunction->addEntryPointAtOffset(Address - + PreviousFunction->addEntryPointAtOffset(SymbolAddress - PreviousFunction->getAddress()); // Remove the symbol from FileSymRefs so that we can skip it from // in the future. - auto SI = FileSymRefs.find(Address); + auto SI = FileSymRefs.find(SymbolAddress); assert(SI != FileSymRefs.end() && "symbol expected to be present"); assert(SI->second == Symbol && "wrong symbol found"); FileSymRefs.erase(SI); @@ -1091,10 +1089,10 @@ void RewriteInstance::discoverFileObjects() { // Checkout for conflicts with function data from FDEs. bool IsSimple = true; - auto FDEI = CFIRdWrt->getFDEs().lower_bound(Address); + auto FDEI = CFIRdWrt->getFDEs().lower_bound(SymbolAddress); if (FDEI != CFIRdWrt->getFDEs().end()) { const dwarf::FDE &FDE = *FDEI->second; - if (FDEI->first != Address) { + if (FDEI->first != SymbolAddress) { // There's no matching starting address in FDE. Make sure the previous // FDE does not contain this address. 
if (FDEI != CFIRdWrt->getFDEs().begin()) { @@ -1102,7 +1100,8 @@ void RewriteInstance::discoverFileObjects() { const dwarf::FDE &PrevFDE = *FDEI->second; uint64_t PrevStart = PrevFDE.getInitialLocation(); uint64_t PrevLength = PrevFDE.getAddressRange(); - if (Address > PrevStart && Address < PrevStart + PrevLength) { + if (SymbolAddress > PrevStart && + SymbolAddress < PrevStart + PrevLength) { errs() << "BOLT-ERROR: function " << UniqueName << " is in conflict with FDE [" << Twine::utohexstr(PrevStart) << ", " @@ -1119,11 +1118,11 @@ void RewriteInstance::discoverFileObjects() { << "; symbol table : " << SymbolSize << ". Using max size.\n"; } SymbolSize = std::max(SymbolSize, FDE.getAddressRange()); - if (BC->getBinaryDataAtAddress(Address)) { - BC->setBinaryDataSize(Address, SymbolSize); + if (BC->getBinaryDataAtAddress(SymbolAddress)) { + BC->setBinaryDataSize(SymbolAddress, SymbolSize); } else { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: No BD @ 0x" - << Twine::utohexstr(Address) << "\n"); + << Twine::utohexstr(SymbolAddress) << "\n"); } } } @@ -1132,7 +1131,7 @@ void RewriteInstance::discoverFileObjects() { // Since function may not have yet obtained its real size, do a search // using the list of registered functions instead of calling // getBinaryFunctionAtAddress(). - auto BFI = BC->getBinaryFunctions().find(Address); + auto BFI = BC->getBinaryFunctions().find(SymbolAddress); if (BFI != BC->getBinaryFunctions().end()) { BF = &BFI->second; // Duplicate the function name. Make sure everything matches before we add @@ -1146,15 +1145,17 @@ void RewriteInstance::discoverFileObjects() { << BF->getSize() << " new " << SymbolSize << "\n"; } BF->setSize(std::max(SymbolSize, BF->getSize())); - BC->setBinaryDataSize(Address, BF->getSize()); + BC->setBinaryDataSize(SymbolAddress, BF->getSize()); } BF->addAlternativeName(UniqueName); } else { - ErrorOr Section = BC->getSectionForAddress(Address); + ErrorOr Section = + BC->getSectionForAddress(SymbolAddress); // Skip symbols from invalid sections if (!Section) { errs() << "BOLT-WARNING: " << UniqueName << " (0x" - << Twine::utohexstr(Address) << ") does not have any section\n"; + << Twine::utohexstr(SymbolAddress) + << ") does not have any section\n"; continue; } @@ -1162,7 +1163,8 @@ void RewriteInstance::discoverFileObjects() { if (!Section->getSize()) continue; - BF = BC->createBinaryFunction(UniqueName, *Section, Address, SymbolSize); + BF = BC->createBinaryFunction(UniqueName, *Section, SymbolAddress, + SymbolSize); if (!IsSimple) BF->setSimple(false); } @@ -1283,6 +1285,77 @@ void RewriteInstance::discoverFileObjects() { registerFragments(); } +Error RewriteInstance::discoverRtFiniAddress() { + // Use DT_FINI if it's available. 
+ if (BC->FiniAddress) { + BC->FiniFunctionAddress = BC->FiniAddress; + return Error::success(); + } + + if (!BC->FiniArrayAddress || !BC->FiniArraySize) { + return createStringError( + std::errc::not_supported, + "Instrumentation needs either DT_FINI or DT_FINI_ARRAY"); + } + + if (*BC->FiniArraySize < BC->AsmInfo->getCodePointerSize()) { + return createStringError(std::errc::not_supported, + "Need at least 1 DT_FINI_ARRAY slot"); + } + + ErrorOr FiniArraySection = + BC->getSectionForAddress(*BC->FiniArrayAddress); + if (auto EC = FiniArraySection.getError()) + return errorCodeToError(EC); + + if (const Relocation *Reloc = FiniArraySection->getDynamicRelocationAt(0)) { + BC->FiniFunctionAddress = Reloc->Addend; + return Error::success(); + } + + if (const Relocation *Reloc = FiniArraySection->getRelocationAt(0)) { + BC->FiniFunctionAddress = Reloc->Value; + return Error::success(); + } + + return createStringError(std::errc::not_supported, + "No relocation for first DT_FINI_ARRAY slot"); +} + +void RewriteInstance::updateRtFiniReloc() { + // Updating DT_FINI is handled by patchELFDynamic. + if (BC->FiniAddress) + return; + + const RuntimeLibrary *RT = BC->getRuntimeLibrary(); + if (!RT || !RT->getRuntimeFiniAddress()) + return; + + assert(BC->FiniArrayAddress && BC->FiniArraySize && + "inconsistent .fini_array state"); + + ErrorOr FiniArraySection = + BC->getSectionForAddress(*BC->FiniArrayAddress); + assert(FiniArraySection && ".fini_array removed"); + + if (std::optional Reloc = + FiniArraySection->takeDynamicRelocationAt(0)) { + assert(Reloc->Addend == BC->FiniFunctionAddress && + "inconsistent .fini_array dynamic relocation"); + Reloc->Addend = RT->getRuntimeFiniAddress(); + FiniArraySection->addDynamicRelocation(*Reloc); + } + + // Update the static relocation by adding a pending relocation which will get + // patched when flushPendingRelocations is called in rewriteFile. Note that + // flushPendingRelocations will calculate the value to patch as + // "Symbol + Addend". Since we don't have a symbol, just set the addend to the + // desired value. 
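// The pending relocation emitted below relies on flushPendingRelocations
// patching "Symbol + Addend": with a null symbol the written value is just
// the addend, i.e. the runtime library's fini address. A toy version of that
// value computation (illustrative types, not BOLT's Relocation):

#include <cstdint>

struct ToyRelocation {
  uint64_t Offset;
  const void *Symbol; // null here: no symbol contributes to the value
  uint64_t Addend;
};

// What the flush step conceptually writes at Reloc.Offset.
uint64_t patchValue(const ToyRelocation &Reloc, uint64_t SymbolAddress) {
  const uint64_t Base = Reloc.Symbol ? SymbolAddress : 0;
  return Base + Reloc.Addend; // with Symbol == nullptr this is just the addend
}

int main() {
  ToyRelocation R{/*Offset=*/0, /*Symbol=*/nullptr, /*Addend=*/0x5000};
  return patchValue(R, /*SymbolAddress=*/0) == 0x5000 ? 0 : 1;
}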
+ FiniArraySection->addPendingRelocation(Relocation{ + /*Offset*/ 0, /*Symbol*/ nullptr, /*Type*/ Relocation::getAbs64(), + /*Addend*/ RT->getRuntimeFiniAddress(), /*Value*/ 0}); +} + void RewriteInstance::registerFragments() { if (!BC->HasSplitFunctions) return; @@ -1347,24 +1420,41 @@ void RewriteInstance::createPLTBinaryFunction(uint64_t TargetAddress, BinaryFunction *BF = BC->getBinaryFunctionAtAddress(EntryAddress); if (BF && BC->isAArch64()) { - // Handle IFUNC trampoline + // Handle IFUNC trampoline with symbol setPLTSymbol(BF, BF->getOneName()); return; } const Relocation *Rel = BC->getDynamicRelocationAt(TargetAddress); - if (!Rel || !Rel->Symbol) + if (!Rel) return; + MCSymbol *Symbol = Rel->Symbol; + if (!Symbol) { + if (!BC->isAArch64() || !Rel->Addend || !Rel->isIRelative()) + return; + + // IFUNC trampoline without symbol + BinaryFunction *TargetBF = BC->getBinaryFunctionAtAddress(Rel->Addend); + if (!TargetBF) { + errs() + << "BOLT-WARNING: Expected BF to be presented as IFUNC resolver at " + << Twine::utohexstr(Rel->Addend) << ", skipping\n"; + return; + } + + Symbol = TargetBF->getSymbol(); + } + ErrorOr Section = BC->getSectionForAddress(EntryAddress); assert(Section && "cannot get section for address"); if (!BF) - BF = BC->createBinaryFunction(Rel->Symbol->getName().str() + "@PLT", - *Section, EntryAddress, 0, EntrySize, + BF = BC->createBinaryFunction(Symbol->getName().str() + "@PLT", *Section, + EntryAddress, 0, EntrySize, Section->getAlignment()); else - BF->addAlternativeName(Rel->Symbol->getName().str() + "@PLT"); - setPLTSymbol(BF, Rel->Symbol->getName()); + BF->addAlternativeName(Symbol->getName().str() + "@PLT"); + setPLTSymbol(BF, Symbol->getName()); } void RewriteInstance::disassemblePLTSectionAArch64(BinarySection &Section) { @@ -1585,6 +1675,16 @@ void RewriteInstance::adjustFunctionBoundaries() { if (!Function.isSymbolValidInScope(Symbol, SymbolSize)) break; + // Skip basic block labels. This happens on RISC-V with linker relaxation + // enabled because every branch needs a relocation and corresponding + // symbol. We don't want to add such symbols as entry points. + const auto PrivateLabelPrefix = BC->AsmInfo->getPrivateLabelPrefix(); + if (!PrivateLabelPrefix.empty() && + cantFail(Symbol.getName()).starts_with(PrivateLabelPrefix)) { + ++NextSymRefI; + continue; + } + // This is potentially another entry point into the function. uint64_t EntryOffset = NextSymRefI->first - Function.getAddress(); LLVM_DEBUG(dbgs() << "BOLT-DEBUG: adding entry point to function " @@ -2101,6 +2201,19 @@ void RewriteInstance::processDynamicRelocations() { } // The rest of dynamic relocations - DT_RELA. + // The static executable might have .rela.dyn secion and not have PT_DYNAMIC + if (!DynamicRelocationsSize && BC->IsStaticExecutable) { + ErrorOr DynamicRelSectionOrErr = + BC->getUniqueSectionByName(getRelaDynSectionName()); + if (DynamicRelSectionOrErr) { + DynamicRelocationsAddress = DynamicRelSectionOrErr->getAddress(); + DynamicRelocationsSize = DynamicRelSectionOrErr->getSize(); + const SectionRef &SectionRef = DynamicRelSectionOrErr->getSectionRef(); + DynamicRelativeRelocationsCount = std::distance( + SectionRef.relocation_begin(), SectionRef.relocation_end()); + } + } + if (DynamicRelocationsSize > 0) { ErrorOr DynamicRelSectionOrErr = BC->getSectionForAddress(*DynamicRelocationsAddress); @@ -2526,7 +2639,17 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection, // Adjust the point of reference to a code location inside a function. 
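// For a symbol-less IRELATIVE slot, createPLTBinaryFunction above recovers
// the PLT entry's name from the resolver function that the relocation addend
// points at. A compact sketch of that fallback, with a toy address-to-name
// map standing in for BC->getBinaryFunctionAtAddress():

#include <cstdint>
#include <map>
#include <optional>
#include <string>

struct ToyReloc {
  std::optional<std::string> Symbol; // dynamic relocs may carry no symbol
  uint64_t Addend;                   // for IRELATIVE: address of the resolver
  bool IsIRelative;
};

std::optional<std::string>
pltSymbolName(const ToyReloc &Rel,
              const std::map<uint64_t, std::string> &FuncsByAddress) {
  if (Rel.Symbol)
    return *Rel.Symbol + "@PLT"; // normal case: name from the relocation
  if (!Rel.IsIRelative || !Rel.Addend)
    return std::nullopt; // nothing to name the entry after
  auto It = FuncsByAddress.find(Rel.Addend);
  if (It == FuncsByAddress.end())
    return std::nullopt; // resolver unknown: BOLT warns and skips
  return It->second + "@PLT";
}

int main() {
  std::map<uint64_t, std::string> Funcs{{0x1234, "ifunc_resolver"}};
  ToyReloc R{std::nullopt, 0x1234, /*IsIRelative=*/true};
  return pltSymbolName(R, Funcs) ==
                 std::optional<std::string>("ifunc_resolver@PLT")
             ? 0
             : 1;
}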
if (ReferencedBF->containsAddress(Address, /*UseMaxSize = */ true)) { RefFunctionOffset = Address - ReferencedBF->getAddress(); - if (RefFunctionOffset) { + if (Relocation::isInstructionReference(RType)) { + // Instruction labels are created while disassembling so we just leave + // the symbol empty for now. Since the extracted value is typically + // unrelated to the referenced symbol (e.g., %pcrel_lo in RISC-V + // references an instruction but the patched value references the low + // bits of a data address), we set the extracted value to the symbol + // address in order to be able to correctly reconstruct the reference + // later. + ReferencedSymbol = nullptr; + ExtractedValue = Address; + } else if (RefFunctionOffset) { if (ContainingBF && ContainingBF != ReferencedBF) { ReferencedSymbol = ReferencedBF->addEntryPointAtOffset(RefFunctionOffset); @@ -3077,7 +3200,6 @@ void RewriteInstance::buildFunctionsCFG() { // Create annotation indices to allow lock-free execution BC->MIB->getOrCreateAnnotationIndex("JTIndexReg"); BC->MIB->getOrCreateAnnotationIndex("NOP"); - BC->MIB->getOrCreateAnnotationIndex("Size"); ParallelUtilities::WorkFuncWithAllocTy WorkFun = [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId) { @@ -3235,15 +3357,15 @@ void RewriteInstance::emitAndLink() { Linker->loadObject(ObjectMemBuffer->getMemBufferRef(), [this](auto MapSection) { mapFileSections(MapSection); }); - MCAsmLayout FinalLayout( - static_cast(Streamer.get())->getAssembler()); - // Update output addresses based on the new section map and // layout. Only do this for the object created by ourselves. - updateOutputValues(FinalLayout); + updateOutputValues(*Linker); - if (opts::UpdateDebugSections) + if (opts::UpdateDebugSections) { + MCAsmLayout FinalLayout( + static_cast(Streamer.get())->getAssembler()); DebugInfoRewriter->updateLineTableOffsets(FinalLayout); + } if (RuntimeLibrary *RtLibrary = BC->getRuntimeLibrary()) RtLibrary->link(*BC, ToolPath, *Linker, [this](auto MapSection) { @@ -3574,6 +3696,9 @@ void RewriteInstance::mapAllocatableSections( } for (BinarySection &Section : BC->allocatableSections()) { + if (Section.isLinkOnly()) + continue; + if (!Section.hasValidSectionID()) continue; @@ -3635,9 +3760,12 @@ void RewriteInstance::mapAllocatableSections( } } -void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { +void RewriteInstance::updateOutputValues(const BOLTLinker &Linker) { + if (std::optional Map = AddressMap::parse(*BC)) + BC->setIOAddressMap(std::move(*Map)); + for (BinaryFunction *Function : BC->getAllBinaryFunctions()) - Function->updateOutputValues(Layout); + Function->updateOutputValues(Linker); } void RewriteInstance::patchELFPHDRTable() { @@ -4554,15 +4682,12 @@ void RewriteInstance::updateELFSymbolTable( } } - assert((!NumHotTextSymsUpdated || NumHotTextSymsUpdated == 2) && - "either none or both __hot_start/__hot_end symbols were expected"); - assert((!NumHotDataSymsUpdated || NumHotDataSymsUpdated == 2) && - "either none or both __hot_data_start/__hot_data_end symbols were " - "expected"); + auto AddSymbol = [&](const StringRef &Name, uint64_t Address) { + if (!Address) + return; - auto addSymbol = [&](const std::string &Name) { ELFSymTy Symbol; - Symbol.st_value = getNewValueForSymbol(Name); + Symbol.st_value = Address; Symbol.st_shndx = ELF::SHN_ABS; Symbol.st_name = AddToStrTab(Name); Symbol.st_size = 0; @@ -4575,14 +4700,30 @@ void RewriteInstance::updateELFSymbolTable( Symbols.emplace_back(Symbol); }; + // Add runtime library start and fini address 
symbols
+  if (RuntimeLibrary *RtLibrary = BC->getRuntimeLibrary()) {
+    AddSymbol("__bolt_runtime_start", RtLibrary->getRuntimeStartAddress());
+    AddSymbol("__bolt_runtime_fini", RtLibrary->getRuntimeFiniAddress());
+  }
+
+  assert((!NumHotTextSymsUpdated || NumHotTextSymsUpdated == 2) &&
+         "either none or both __hot_start/__hot_end symbols were expected");
+  assert((!NumHotDataSymsUpdated || NumHotDataSymsUpdated == 2) &&
+         "either none or both __hot_data_start/__hot_data_end symbols were "
+         "expected");
+
+  auto AddEmittedSymbol = [&](const StringRef &Name) {
+    AddSymbol(Name, getNewValueForSymbol(Name));
+  };
+
   if (opts::HotText && !NumHotTextSymsUpdated) {
-    addSymbol("__hot_start");
-    addSymbol("__hot_end");
+    AddEmittedSymbol("__hot_start");
+    AddEmittedSymbol("__hot_end");
   }
 
   if (opts::HotData && !NumHotDataSymsUpdated) {
-    addSymbol("__hot_data_start");
-    addSymbol("__hot_data_end");
+    AddEmittedSymbol("__hot_data_start");
+    AddEmittedSymbol("__hot_data_end");
   }
 
   // Put local symbols at the beginning.
@@ -4706,9 +4847,11 @@ void RewriteInstance::patchELFAllocatableRelrSection(
   const uint8_t PSize = BC->AsmInfo->getCodePointerSize();
   const uint64_t MaxDelta = ((CHAR_BIT * DynamicRelrEntrySize) - 1) * PSize;
 
-  auto FixAddend = [&](const BinarySection &Section, const Relocation &Rel) {
+  auto FixAddend = [&](const BinarySection &Section, const Relocation &Rel,
+                       uint64_t FileOffset) {
     // Fix relocation symbol value in place if no static relocation found
-    // on the same address
+    // on the same address. We won't check the BF relocations here since it
+    // is a rare case and no optimization is required.
     if (Section.getRelocationAt(Rel.Offset))
       return;
 
@@ -4717,11 +4860,6 @@ void RewriteInstance::patchELFAllocatableRelrSection(
     if (!Addend)
      return;
 
-    uint64_t FileOffset = Section.getOutputFileOffset();
-    if (!FileOffset)
-      FileOffset = Section.getInputFileOffset();
-
-    FileOffset += Rel.Offset;
     OS.pwrite(reinterpret_cast<const char *>(&Addend), PSize, FileOffset);
   };
 
@@ -4743,7 +4881,7 @@ void RewriteInstance::patchELFAllocatableRelrSection(
       RelOffset = RelOffset == 0 ?
SectionAddress + Rel.Offset : RelOffset; assert((RelOffset & 1) == 0 && "Wrong relocation offset"); RelOffsets.emplace(RelOffset); - FixAddend(Section, Rel); + FixAddend(Section, Rel, RelOffset); } } @@ -5075,7 +5213,13 @@ Error RewriteInstance::readELFDynamic(ELFObjectFile *File) { } break; case ELF::DT_FINI: - BC->FiniFunctionAddress = Dyn.getPtr(); + BC->FiniAddress = Dyn.getPtr(); + break; + case ELF::DT_FINI_ARRAY: + BC->FiniArrayAddress = Dyn.getPtr(); + break; + case ELF::DT_FINI_ARRAYSZ: + BC->FiniArraySize = Dyn.getPtr(); break; case ELF::DT_RELA: DynamicRelocationsAddress = Dyn.getPtr(); @@ -5271,8 +5415,10 @@ void RewriteInstance::rewriteFile() { if (!BF.getFileOffset() || !BF.isEmitted()) continue; OS.seek(BF.getFileOffset()); - for (unsigned I = 0; I < BF.getMaxSize(); ++I) - OS.write((unsigned char)BC->MIB->getTrapFillValue()); + StringRef TrapInstr = BC->MIB->getTrapFillValue(); + unsigned NInstr = BF.getMaxSize() / TrapInstr.size(); + for (unsigned I = 0; I < NInstr; ++I) + OS.write(TrapInstr.data(), TrapInstr.size()); } OS.seek(SavedPos); } @@ -5281,6 +5427,8 @@ void RewriteInstance::rewriteFile() { for (BinarySection &Section : BC->allocatableSections()) { if (!Section.isFinalized() || !Section.getOutputData()) continue; + if (Section.isLinkOnly()) + continue; if (opts::Verbosity >= 1) outs() << "BOLT: writing new section " << Section.getName() @@ -5340,7 +5488,10 @@ void RewriteInstance::rewriteFile() { } Out->keep(); - EC = sys::fs::setPermissions(opts::OutputFilename, sys::fs::perms::all_all); + EC = sys::fs::setPermissions( + opts::OutputFilename, + static_cast(sys::fs::perms::all_all & + ~sys::fs::getUmask())); check_error(EC, "cannot set permissions of output file"); } diff --git a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp index cc36406543f399543526e552bc6a0d06802625fc..cd1b975be7b90e636ca4d5d6877cb309994e1207 100644 --- a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp +++ b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp @@ -57,10 +57,11 @@ void InstrumentationRuntimeLibrary::adjustCommandLineOptions( "the input binary\n"; exit(1); } - if (!BC.FiniFunctionAddress && !BC.IsStaticExecutable) { - errs() << "BOLT-ERROR: input binary lacks DT_FINI entry in the dynamic " - "section but instrumentation currently relies on patching " - "DT_FINI to write the profile\n"; + + if (BC.IsStaticExecutable && !opts::InstrumentationSleepTime) { + errs() << "BOLT-ERROR: instrumentation of static binary currently does not " + "support profile output on binary finalization, so it " + "requires -instrumentation-sleep-time=N (N>0) usage\n"; exit(1); } @@ -89,13 +90,6 @@ void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC, "__BOLT", "__counters", MachO::S_REGULAR, SectionKind::getData())); - if (BC.IsStaticExecutable && !opts::InstrumentationSleepTime) { - errs() << "BOLT-ERROR: instrumentation of static binary currently does not " - "support profile output on binary finalization, so it " - "requires -instrumentation-sleep-time=N (N>0) usage\n"; - exit(1); - } - Section->setAlignment(llvm::Align(BC.RegularPageSize)); Streamer.switchSection(Section); diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 777a1e6cc743ba54fd37e26a14073ad8f1ec6740..642de6c3c618233ae08b799515fc3bae55cfcaeb 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -11,6 +11,7 @@ 
//===----------------------------------------------------------------------===// #include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "Utils/AArch64BaseInfo.h" @@ -267,12 +268,40 @@ public: Inst.getOpcode() == AArch64::LDRXui); } - bool isLoad(const MCInst &Inst) const override { + bool mayLoad(const MCInst &Inst) const override { return isLDRB(Inst) || isLDRH(Inst) || isLDRW(Inst) || isLDRX(Inst); } + bool isAArch64Exclusive(const MCInst &Inst) const override { + return (Inst.getOpcode() == AArch64::LDXPX || + Inst.getOpcode() == AArch64::LDXPW || + Inst.getOpcode() == AArch64::LDXRX || + Inst.getOpcode() == AArch64::LDXRW || + Inst.getOpcode() == AArch64::LDXRH || + Inst.getOpcode() == AArch64::LDXRB || + Inst.getOpcode() == AArch64::STXPX || + Inst.getOpcode() == AArch64::STXPW || + Inst.getOpcode() == AArch64::STXRX || + Inst.getOpcode() == AArch64::STXRW || + Inst.getOpcode() == AArch64::STXRH || + Inst.getOpcode() == AArch64::STXRB || + Inst.getOpcode() == AArch64::LDAXPX || + Inst.getOpcode() == AArch64::LDAXPW || + Inst.getOpcode() == AArch64::LDAXRX || + Inst.getOpcode() == AArch64::LDAXRW || + Inst.getOpcode() == AArch64::LDAXRH || + Inst.getOpcode() == AArch64::LDAXRB || + Inst.getOpcode() == AArch64::STLXPX || + Inst.getOpcode() == AArch64::STLXPW || + Inst.getOpcode() == AArch64::STLXRX || + Inst.getOpcode() == AArch64::STLXRW || + Inst.getOpcode() == AArch64::STLXRH || + Inst.getOpcode() == AArch64::STLXRB || + Inst.getOpcode() == AArch64::CLREX); + } + bool isLoadFromStack(const MCInst &Inst) const { - if (!isLoad(Inst)) + if (!mayLoad(Inst)) return false; for (const MCOperand &Operand : useOperands(Inst)) { if (!Operand.isReg()) @@ -442,6 +471,22 @@ public: return true; } + void getCalleeSavedRegs(BitVector &Regs) const override { + Regs |= getAliases(AArch64::X18); + Regs |= getAliases(AArch64::X19); + Regs |= getAliases(AArch64::X20); + Regs |= getAliases(AArch64::X21); + Regs |= getAliases(AArch64::X22); + Regs |= getAliases(AArch64::X23); + Regs |= getAliases(AArch64::X24); + Regs |= getAliases(AArch64::X25); + Regs |= getAliases(AArch64::X26); + Regs |= getAliases(AArch64::X27); + Regs |= getAliases(AArch64::X28); + Regs |= getAliases(AArch64::LR); + Regs |= getAliases(AArch64::FP); + } + const MCExpr *getTargetExprFor(MCInst &Inst, const MCExpr *Expr, MCContext &Ctx, uint64_t RelType) const override { @@ -679,7 +724,7 @@ public: PCRelBase = DefBaseAddr; // Match LOAD to load the jump table (relative) target const MCInst *DefLoad = UsesAdd[2]; - assert(isLoad(*DefLoad) && + assert(mayLoad(*DefLoad) && "Failed to match indirect branch load pattern! (1)"); assert((ScaleValue != 1LL || isLDRB(*DefLoad)) && "Failed to match indirect branch load pattern! 
(2)"); @@ -819,6 +864,14 @@ public: /// add x16, x16, #0xbe0 /// br x17 /// + /// The other type of trampolines are located in .plt.got, that are used for + /// non-lazy bindings so doesn't use x16 arg to transfer .got entry address: + /// + /// adrp x16, 230000 + /// ldr x17, [x16, #3040] + /// br x17 + /// nop + /// uint64_t analyzePLTEntry(MCInst &Instruction, InstructionIterator Begin, InstructionIterator End, uint64_t BeginPC) const override { @@ -1012,7 +1065,7 @@ public: return true; } - bool isStore(const MCInst &Inst) const override { return false; } + bool mayStore(const MCInst &Inst) const override { return false; } bool createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx, bool IsTailCall) override { @@ -1286,6 +1339,10 @@ public: } } + StringRef getTrapFillValue() const override { + return StringRef("\0\0\0\0", 4); + } + bool createReturn(MCInst &Inst) const override { Inst.setOpcode(AArch64::RET); Inst.clear(); @@ -1550,6 +1607,52 @@ public: ELF::R_AARCH64_ADD_ABS_LO12_NC); return Insts; } + + std::optional + createRelocation(const MCFixup &Fixup, + const MCAsmBackend &MAB) const override { + const MCFixupKindInfo &FKI = MAB.getFixupKindInfo(Fixup.getKind()); + + assert(FKI.TargetOffset == 0 && "0-bit relocation offset expected"); + const uint64_t RelOffset = Fixup.getOffset(); + + uint64_t RelType; + if (Fixup.getKind() == MCFixupKind(AArch64::fixup_aarch64_pcrel_call26)) + RelType = ELF::R_AARCH64_CALL26; + else if (FKI.Flags & MCFixupKindInfo::FKF_IsPCRel) { + switch (FKI.TargetSize) { + default: + return std::nullopt; + case 16: + RelType = ELF::R_AARCH64_PREL16; + break; + case 32: + RelType = ELF::R_AARCH64_PREL32; + break; + case 64: + RelType = ELF::R_AARCH64_PREL64; + break; + } + } else { + switch (FKI.TargetSize) { + default: + return std::nullopt; + case 16: + RelType = ELF::R_AARCH64_ABS16; + break; + case 32: + RelType = ELF::R_AARCH64_ABS32; + break; + case 64: + RelType = ELF::R_AARCH64_ABS64; + break; + } + } + + auto [RelSymbol, RelAddend] = extractFixupExpr(Fixup); + + return Relocation({RelOffset, RelSymbol, RelType, RelAddend, 0}); + } }; } // end anonymous namespace diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp index ec5bca85231c0e0ef5d73fb3c538120950fd39be..d13eb22f95826411a5b3db991a9ab6fc7c7c93d1 100644 --- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp +++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp @@ -42,6 +42,7 @@ public: case ELF::R_RISCV_GOT_HI20: case ELF::R_RISCV_PCREL_HI20: case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: return true; default: llvm_unreachable("Unexpected RISCV relocation type in code"); @@ -171,6 +172,10 @@ public: return true; } + StringRef getTrapFillValue() const override { + return StringRef("\0\0\0\0", 4); + } + bool analyzeBranch(InstructionIterator Begin, InstructionIterator End, const MCSymbol *&TBB, const MCSymbol *&FBB, MCInst *&CondBranch, @@ -348,6 +353,7 @@ public: case ELF::R_RISCV_PCREL_HI20: return RISCVMCExpr::create(Expr, RISCVMCExpr::VK_RISCV_PCREL_HI, Ctx); case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: return RISCVMCExpr::create(Expr, RISCVMCExpr::VK_RISCV_PCREL_LO, Ctx); case ELF::R_RISCV_CALL: return RISCVMCExpr::create(Expr, RISCVMCExpr::VK_RISCV_CALL, Ctx); diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 265868fbddd41a604cc42192966b1bb13962cc46..4cb9d61710d1da7d85d7ca56c91a7f1a94500cb9 100644 --- 
a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -350,7 +350,7 @@ public: } } - bool isLoad(const MCInst &Inst) const override { + bool mayLoad(const MCInst &Inst) const override { if (isPop(Inst)) return true; @@ -363,7 +363,7 @@ public: return MCII.mayLoad(); } - bool isStore(const MCInst &Inst) const override { + bool mayStore(const MCInst &Inst) const override { if (isPush(Inst)) return true; @@ -416,7 +416,7 @@ public: } } - unsigned getTrapFillValue() const override { return 0xCC; } + StringRef getTrapFillValue() const override { return StringRef("\314", 1); } struct IndJmpMatcherFrag1 : MCInstMatcher { std::unique_ptr Base; @@ -1755,7 +1755,7 @@ public: // - Non-stack loads are prohibited (generally unsafe) // - Stack loads are OK if AllowStackMemOp is true // - Stack loads with RBP are OK if AllowBasePtrStackMemOp is true - if (isLoad(Inst)) { + if (mayLoad(Inst)) { // If stack memory operands are not allowed, no loads are allowed if (!AllowStackMemOp) return false; @@ -2190,7 +2190,7 @@ public: MCInst &CurInst = *Itr++; const MCInstrDesc &Desc = Info->get(CurInst.getOpcode()); if (Desc.hasDefOfPhysReg(CurInst, MethodRegNum, *RegInfo)) { - if (!isLoad(CurInst)) + if (!mayLoad(CurInst)) return false; if (std::optional MO = evaluateX86MemoryOperand(CurInst)) { @@ -2464,46 +2464,9 @@ public: } } - // Extract a symbol and an addend out of the fixup value expression. - // - // Only the following limited expression types are supported: - // Symbol + Addend - // Symbol + Constant + Addend - // Const + Addend - // Symbol - uint64_t Addend = 0; - MCSymbol *Symbol = nullptr; - const MCExpr *ValueExpr = Fixup.getValue(); - if (ValueExpr->getKind() == MCExpr::Binary) { - const auto *BinaryExpr = cast(ValueExpr); - assert(BinaryExpr->getOpcode() == MCBinaryExpr::Add && - "unexpected binary expression"); - const MCExpr *LHS = BinaryExpr->getLHS(); - if (LHS->getKind() == MCExpr::Constant) { - Addend = cast(LHS)->getValue(); - } else if (LHS->getKind() == MCExpr::Binary) { - const auto *LHSBinaryExpr = cast(LHS); - assert(LHSBinaryExpr->getOpcode() == MCBinaryExpr::Add && - "unexpected binary expression"); - const MCExpr *LLHS = LHSBinaryExpr->getLHS(); - assert(LLHS->getKind() == MCExpr::SymbolRef && "unexpected LLHS"); - Symbol = const_cast(this->getTargetSymbol(LLHS)); - const MCExpr *RLHS = LHSBinaryExpr->getRHS(); - assert(RLHS->getKind() == MCExpr::Constant && "unexpected RLHS"); - Addend = cast(RLHS)->getValue(); - } else { - assert(LHS->getKind() == MCExpr::SymbolRef && "unexpected LHS"); - Symbol = const_cast(this->getTargetSymbol(LHS)); - } - const MCExpr *RHS = BinaryExpr->getRHS(); - assert(RHS->getKind() == MCExpr::Constant && "unexpected RHS"); - Addend += cast(RHS)->getValue(); - } else { - assert(ValueExpr->getKind() == MCExpr::SymbolRef && "unexpected value"); - Symbol = const_cast(this->getTargetSymbol(ValueExpr)); - } + auto [RelSymbol, RelAddend] = extractFixupExpr(Fixup); - return Relocation({RelOffset, Symbol, RelType, Addend, 0}); + return Relocation({RelOffset, RelSymbol, RelType, RelAddend, 0}); } bool replaceImmWithSymbolRef(MCInst &Inst, const MCSymbol *Symbol, diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt index 191d2b895b926d0018bbaaaf3195da452adfbbf1..04fc7fee98ab9c070379cf1259028e81a29fe44b 100644 --- a/bolt/runtime/CMakeLists.txt +++ b/bolt/runtime/CMakeLists.txt @@ -1,4 +1,5 @@ cmake_minimum_required(VERSION 3.20.0) +include(CheckCXXCompilerFlag) include(CheckIncludeFiles) include(GNUInstallDirs) 
@@ -28,10 +29,17 @@ set(BOLT_RT_FLAGS
   -fno-rtti
   -fno-stack-protector
   -fPIC
+  -Wno-unused-function
   -mgeneral-regs-only)
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
   set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-sse")
 endif()
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+  check_cxx_compiler_flag("-mno-outline-atomics" CXX_SUPPORTS_OUTLINE_ATOMICS)
+  if (CXX_SUPPORTS_OUTLINE_ATOMICS)
+    set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-outline-atomics")
+  endif()
+endif()
 
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
   include(CheckCXXCompilerFlag)
diff --git a/bolt/runtime/hugify.cpp b/bolt/runtime/hugify.cpp
index 05c1be4f2d70ca6ffb144838e504085a99925b6c..b1c9835936052f9cd67a39efdaa94de1bee80911 100644
--- a/bolt/runtime/hugify.cpp
+++ b/bolt/runtime/hugify.cpp
@@ -5,8 +5,8 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===---------------------------------------------------------------------===//
-
-#if defined (__x86_64__) && !defined(__APPLE__)
+#if (defined (__x86_64__) && !defined(__APPLE__)) \
+|| (defined(__aarch64__) && !defined(__APPLE__))
 
 #include "common.h"
 
@@ -170,6 +170,14 @@ extern "C" __attribute((naked)) void __bolt_hugify_self() {
   __asm__ __volatile__(SAVE_ALL
                        "call __bolt_hugify_self_impl\n"
                        RESTORE_ALL
                        "jmp __bolt_hugify_start_program\n"
                        :: :);
+#elif defined(__aarch64__)
+  __asm__ __volatile__(SAVE_ALL
+                       "bl __bolt_hugify_self_impl\n"
+                       RESTORE_ALL
+                       "adrp x16, __bolt_hugify_start_program\n"
+                       "add x16, x16, #:lo12:__bolt_hugify_start_program\n"
+                       "br x16\n"
+                       :::);
 #else
   exit(1);
 #endif
diff --git a/bolt/test/AArch64/Inputs/iplt.ld b/bolt/test/AArch64/Inputs/iplt.ld
new file mode 100644
index 0000000000000000000000000000000000000000..1e54a249b2182e0ae72a4716e05bc4351bf71329
--- /dev/null
+++ b/bolt/test/AArch64/Inputs/iplt.ld
@@ -0,0 +1,3 @@
+SECTIONS {
+  .plt : ALIGN(16) { *(.plt) *(.iplt) }
+}
diff --git a/bolt/test/AArch64/Inputs/plt-got.yaml b/bolt/test/AArch64/Inputs/plt-got.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7856719c5df83d60ef5a329b2828667de574f297
--- /dev/null
+++ b/bolt/test/AArch64/Inputs/plt-got.yaml
@@ -0,0 +1,216 @@
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data: ELFDATA2LSB
+  Type: ET_DYN
+  Machine: EM_AARCH64
+  Entry: 0x10360
+ProgramHeaders:
+  - Type: PT_PHDR
+    Flags: [ PF_R ]
+    VAddr: 0x40
+    Align: 0x8
+    Offset: 0x40
+  - Type: PT_INTERP
+    Flags: [ PF_R ]
+    FirstSec: .interp
+    LastSec: .interp
+    VAddr: 0x270
+    Offset: 0x270
+  - Type: PT_LOAD
+    Flags: [ PF_R ]
+    FirstSec: .interp
+    LastSec: .rela.dyn
+    Align: 0x10000
+    Offset: 0x0
+  - Type: PT_LOAD
+    Flags: [ PF_X, PF_R ]
+    FirstSec: .plt.got
+    LastSec: .text
+    VAddr: 0x10350
+    Align: 0x10000
+    Offset: 0x2e0
+  - Type: PT_LOAD
+    Flags: [ PF_W, PF_R ]
+    FirstSec: .interp
+    LastSec: .got
+    VAddr: 0x203B0
+    Align: 0x10000
+    Offset: 0x270
+  - Type: PT_LOAD
+    Flags: [ PF_W, PF_R ]
+    FirstSec: .got.plt
+    LastSec: .got.plt
+    VAddr: 0x304E0
+    Align: 0x10000
+    Offset: 0x420
+  - Type: PT_DYNAMIC
+    Flags: [ PF_W, PF_R ]
+    FirstSec: .dynamic
+    LastSec: .dynamic
+    VAddr: 0x203B0
+    Align: 0x8
+    Offset: 0x340
+  - Type: PT_GNU_STACK
+    Flags: [ PF_W, PF_R ]
+    Offset: 0x0
+Sections:
+  - Name: .interp
+    Type: SHT_PROGBITS
+    Flags: [ SHF_ALLOC ]
+    Address: 0x270
+    AddressAlign: 0x1
+    Offset: 0x270
+    Content: 2F6C69622F6C642D6C696E75782D616172636836342E736F2E3100
+  - Name: .dynsym
+    Type: SHT_DYNSYM
+    Flags: [ SHF_ALLOC ]
+    Address: 0x2B0
+    Link: .dynstr
+    AddressAlign: 0x8
+  - Name: .dynstr
+    Type: SHT_STRTAB
+    Flags: [ SHF_ALLOC ]
+    Address: 0x2E0
+    AddressAlign: 0x1
+  - Name:
.rela.dyn
+    Type: SHT_RELA
+    Flags: [ SHF_ALLOC ]
+    Address: 0x2F0
+    Link: .dynsym
+    AddressAlign: 0x8
+    Relocations:
+      - Offset: 0x204D8
+        Symbol: abort
+        Type: R_AARCH64_GLOB_DAT
+  - Name: .plt.got
+    Type: SHT_PROGBITS
+    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address: 0x10350
+    AddressAlign: 0x10
+    Content: 90000090116E42F920021FD61F2003D5
+  - Name: .text
+    Type: SHT_PROGBITS
+    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address: 0x10360
+    AddressAlign: 0x4
+    Content: FF8300D1FD7B01A9FD43009188000090086D42F9E80700F9E80740F9080100F1E8179F1AA800003701000014E80740F900013FD601000014EEFFFF97007D20D41000009010420D9100021FD61F2003D5
+  - Name: .dynamic
+    Type: SHT_DYNAMIC
+    Flags: [ SHF_WRITE, SHF_ALLOC ]
+    Address: 0x203B0
+    Link: .dynstr
+    AddressAlign: 0x8
+    Entries:
+      - Tag: DT_NEEDED
+        Value: 0x1
+      - Tag: DT_RELA
+        Value: 0x2F0
+      - Tag: DT_RELASZ
+        Value: 0x18
+      - Tag: DT_RELAENT
+        Value: 0x18
+      - Tag: DT_PLTGOT
+        Value: 0x304E0
+      - Tag: DT_SYMTAB
+        Value: 0x2B0
+      - Tag: DT_SYMENT
+        Value: 0x18
+      - Tag: DT_STRTAB
+        Value: 0x2E0
+      - Tag: DT_STRSZ
+        Value: 0x10
+      - Tag: DT_GNU_HASH
+        Value: 0x290
+      - Tag: DT_FLAGS_1
+        Value: 0x8000000
+      - Tag: DT_DEBUG
+        Value: 0x0
+      - Tag: DT_NULL
+        Value: 0x0
+  - Name: .got
+    Type: SHT_PROGBITS
+    Flags: [ SHF_WRITE, SHF_ALLOC ]
+    Address: 0x204D0
+    AddressAlign: 0x8
+    Content: '00000000000000000000000000000000'
+  - Name: .got.plt
+    Type: SHT_PROGBITS
+    Flags: [ SHF_WRITE, SHF_ALLOC ]
+    Address: 0x304E0
+    AddressAlign: 0x8
+    Content: B00302000000000000000000000000000000000000000000
+  - Name: .rela.text
+    Type: SHT_RELA
+    Flags: [ SHF_INFO_LINK ]
+    Link: .symtab
+    AddressAlign: 0x8
+    Offset: 0x1268
+    Info: .text
+    Relocations:
+      - Offset: 0x1036C
+        Symbol: abort
+        Type: R_AARCH64_ADR_GOT_PAGE
+      - Offset: 0x10370
+        Symbol: abort
+        Type: R_AARCH64_LD64_GOT_LO12_NC
+      - Offset: 0x10398
+        Symbol: abort
+        Type: R_AARCH64_CALL26
+  - Type: SectionHeaderTable
+    Sections:
+      - Name: .interp
+      - Name: .dynsym
+      - Name: .dynstr
+      - Name: .rela.dyn
+      - Name: .plt.got
+      - Name: .text
+      - Name: .dynamic
+      - Name: .got
+      - Name: .got.plt
+      - Name: .strtab
+      - Name: .symtab
+      - Name: .shstrtab
+      - Name: .rela.text
+Symbols:
+  - Name: .text
+    Type: STT_SECTION
+    Section: .text
+    Value: 0x10360
+  - Name: .dynamic
+    Type: STT_SECTION
+    Section: .dynamic
+    Value: 0x203B0
+  - Name: .got
+    Type: STT_SECTION
+    Section: .got
+    Value: 0x204D0
+  - Name: .got.plt
+    Type: STT_SECTION
+    Section: .got.plt
+    Value: 0x304E0
+  - Name: 'abort$got'
+    Type: STT_OBJECT
+    Section: .got
+    Value: 0x204D8
+  - Name: _start
+    Type: STT_FUNC
+    Section: .text
+    Value: 0x10360
+    Size: 0x3C
+  - Name: _DYNAMIC
+    Section: .dynamic
+    Value: 0x203B0
+  - Name: _GLOBAL_OFFSET_TABLE_
+    Section: .got
+    Value: 0x204D0
+  - Name: abort
+    Type: STT_FUNC
+    Binding: STB_GLOBAL
+    Size: 0x8
+DynamicSymbols:
+  - Name: abort
+    Type: STT_FUNC
+    Binding: STB_GLOBAL
+    Size: 0x8
+...
diff --git a/bolt/test/AArch64/bf_min_alignment.s b/bolt/test/AArch64/bf_min_alignment.s
new file mode 100644
index 0000000000000000000000000000000000000000..2dd06b373a798010820ed43738134e3c871d62c9
--- /dev/null
+++ b/bolt/test/AArch64/bf_min_alignment.s
@@ -0,0 +1,35 @@
+// This test checks that the minimum alignment of an AArch64 function
+// is 4. Otherwise, JITLink would fail to link the binary, since the
+// size of the first function after reordering would not be a multiple
+// of 4.
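+// (Background: AArch64 instructions are a fixed 4 bytes wide, so any
+// function emitted into the text section has to keep 4-byte alignment.)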
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags -fPIC -pie %t.o -o %t.exe -nostdlib -Wl,-q
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-bolt %t.exe -o %t.bolt --use-old-text=0 --lite=0 \
+# RUN:   --align-functions-max-bytes=1 \
+# RUN:   --data %t.fdata --reorder-functions=exec-count
+# RUN: llvm-nm -n %t.bolt | FileCheck %s
+
+# CHECK: {{0|4|8|c}} T dummy
+# CHECK-NEXT: {{0|4|8|c}} T _start
+
+  .text
+  .align 4
+  .global _start
+  .type _start, %function
+_start:
+# FDATA: 0 [unknown] 0 1 _start 0 0 1
+  bl dummy
+  ret
+  .size _start, .-_start
+
+  .global dummy
+  .type dummy, %function
+dummy:
+# FDATA: 0 [unknown] 0 1 dummy 0 0 42
+  adr x0, .Lci
+  ret
+.Lci:
+  .byte 0
+  .size dummy, .-dummy
diff --git a/bolt/test/AArch64/constant_island_pie_update.s b/bolt/test/AArch64/constant_island_pie_update.s
index c6856988d52f737162103695b9c532c37e74f559..0ab67d07a854ec49da550bfd0aa882c875a672b1 100644
--- a/bolt/test/AArch64/constant_island_pie_update.s
+++ b/bolt/test/AArch64/constant_island_pie_update.s
@@ -8,13 +8,16 @@
 # RUN: %clang %cflags -fPIC -pie %t.o -o %t.rela.exe -nostdlib \
 # RUN:   -Wl,-q -Wl,-z,notext
 # RUN: llvm-bolt %t.rela.exe -o %t.rela.bolt --use-old-text=0 --lite=0
-# RUN: llvm-objdump -j .text -d %t.rela.bolt | FileCheck %s
+# RUN: llvm-objdump -j .text -d --show-all-symbols %t.rela.bolt | FileCheck %s
 # RUN: llvm-readelf -rsW %t.rela.bolt | FileCheck --check-prefix=ELFCHECK %s
 // .relr.dyn
 # RUN: %clang %cflags -fPIC -pie %t.o -o %t.relr.exe -nostdlib \
 # RUN:   -Wl,-q -Wl,-z,notext -Wl,--pack-dyn-relocs=relr
+# RUN: llvm-objcopy --remove-section .rela.mytext %t.relr.exe
 # RUN: llvm-bolt %t.relr.exe -o %t.relr.bolt --use-old-text=0 --lite=0
-# RUN: llvm-objdump -j .text -d %t.relr.bolt | FileCheck %s
+# RUN: llvm-objdump -j .text -d --show-all-symbols %t.relr.bolt | FileCheck %s
+# RUN: llvm-objdump -j .text -d %t.relr.bolt | \
+# RUN:   FileCheck %s --check-prefix=ADDENDCHECK
 # RUN: llvm-readelf -rsW %t.relr.bolt | FileCheck --check-prefix=ELFCHECK %s
 # RUN: llvm-readelf -SW %t.relr.bolt | FileCheck --check-prefix=RELRSZCHECK %s
@@ -30,6 +33,11 @@
 # CHECK-NEXT: {{.*}} .word 0x{{[0]+}}[[#ADDR]]
 # CHECK-NEXT: {{.*}} .word 0x00000000
 
+// Check that the addend was properly patched in mytextP with stripped relocations
+# ADDENDCHECK: [[#%x,ADDR:]] :
+# ADDENDCHECK: {{.*}} :
+# ADDENDCHECK-NEXT: {{.*}} .word 0x{{[0]+}}[[#ADDR]]
+# ADDENDCHECK-NEXT: {{.*}} .word 0x00000000
 
 // Check that we've relaxed adr to adrp + add to refer external CI
 # CHECK: :
@@ -40,9 +48,10 @@
 # ELFCHECK: [[#%x,OFF:]] [[#%x,INFO_DYN:]] R_AARCH64_RELATIVE
 # ELFCHECK-NEXT: [[#OFF + 8]] {{0*}}[[#INFO_DYN]] R_AARCH64_RELATIVE
 # ELFCHECK-NEXT: [[#OFF + 24]] {{0*}}[[#INFO_DYN]] R_AARCH64_RELATIVE
+# ELFCHECK-NEXT: {{.*}} R_AARCH64_RELATIVE
 # ELFCHECK: {{.*}}[[#OFF]] {{.*}} $d
 
-// Check that .relr.dyn size is 2 bytes to ensure that last 2 relocations were
+// encoded as a bitmap so the total section size for 3 relocations is 2 bytes.
# RELRSZCHECK: .relr.dyn RELR [[#%x,ADDR:]] [[#%x,OFF:]] {{0*}}10
@@ -81,3 +90,17 @@ addressDynCi:
   adr x1, .Lci
   bl _start
   .size addressDynCi, .-addressDynCi
+
+  .section ".mytext", "ax"
+  .balign 8
+  .global dummy
+  .type dummy, %function
+dummy:
+  nop
+  .word 0
+  .size dummy, .-dummy
+
+  .global mytextP
+mytextP:
+  .xword exitLocal
+  .size mytextP, .-mytextP
diff --git a/bolt/test/AArch64/exclusive-instrument.s b/bolt/test/AArch64/exclusive-instrument.s
new file mode 100644
index 0000000000000000000000000000000000000000..502dd83b2f2a5b8e8f111309d7fce38ad0fad19c
--- /dev/null
+++ b/bolt/test/AArch64/exclusive-instrument.s
@@ -0,0 +1,39 @@
+// This test checks that the foo function, which contains exclusive memory
+// access instructions, won't be instrumented.
+
+// REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}}
+
+// RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \
+// RUN:   %s -o %t.o
+// RUN: %clang %cflags -fPIC -pie %t.o -o %t.exe -nostdlib -Wl,-q -Wl,-fini=dummy
+// RUN: llvm-bolt %t.exe -o %t.bolt -instrument -v=1 | FileCheck %s
+
+// CHECK: Function foo has exclusive instructions, skip instrumentation
+
+.global foo
+.type foo, %function
+foo:
+  ldaxr w9, [x10]
+  cbnz w9, .Lret
+  stlxr w12, w11, [x9]
+  cbz w12, foo
+  clrex
+.Lret:
+  ret
+.size foo, .-foo
+
+.global _start
+.type _start, %function
+_start:
+  cmp x0, #0
+  b.eq .Lexit
+  bl foo
+.Lexit:
+  ret
+.size _start, .-_start
+
+.global dummy
+.type dummy, %function
+dummy:
+  ret
+.size dummy, .-dummy
diff --git a/bolt/test/AArch64/hook-fini.s b/bolt/test/AArch64/hook-fini.s
new file mode 100644
index 0000000000000000000000000000000000000000..4f321d463ef322b541768f2f839e7045ac3d1a83
--- /dev/null
+++ b/bolt/test/AArch64/hook-fini.s
@@ -0,0 +1,103 @@
+## Test the different ways of hooking the fini function for instrumentation (via
+## DT_FINI and via DT_FINI_ARRAY). We test the latter for both PIE and non-PIE
+## binaries because of the different ways of handling relocations (static or
+## dynamic).
+## All tests perform the following steps:
+## - Compile and link for the case to be tested
+## - Some sanity-checks on the dynamic section and relocations in the binary to
+##   verify it has the shape we want for testing:
+##   - DT_FINI or DT_FINI_ARRAY in dynamic section
+##   - No relative relocations for non-PIE
+## - Instrument
+## - Verify generated binary
# REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}}
+
+# RUN: %clang %cflags -pie %s -Wl,-q -o %t.exe
+# RUN: llvm-readelf -d %t.exe | FileCheck --check-prefix=DYN-FINI %s
+# RUN: llvm-readelf -r %t.exe | FileCheck --check-prefix=RELOC-PIE %s
+# RUN: llvm-bolt %t.exe -o %t --instrument
+# RUN: llvm-readelf -drs %t | FileCheck --check-prefix=CHECK-FINI %s
+
+# RUN: %clang %cflags -pie %s -Wl,-q,-fini=0 -o %t-no-fini.exe
+# RUN: llvm-readelf -d %t-no-fini.exe | FileCheck --check-prefix=DYN-NO-FINI %s
+# RUN: llvm-readelf -r %t-no-fini.exe | FileCheck --check-prefix=RELOC-PIE %s
+# RUN: llvm-bolt %t-no-fini.exe -o %t-no-fini --instrument
+# RUN: llvm-readelf -drs %t-no-fini | FileCheck --check-prefix=CHECK-NO-FINI %s
+# RUN: llvm-readelf -ds -x .fini_array %t-no-fini | FileCheck --check-prefix=CHECK-NO-FINI-RELOC %s
+
+## Create a dummy shared library to link against to force creation of the dynamic section.
+# RUN: %clang %cflags %p/../Inputs/stub.c -fPIC -shared -o %t-stub.so +# RUN: %clang %cflags %s -no-pie -Wl,-q,-fini=0 %t-stub.so -o %t-no-pie-no-fini.exe +# RUN: llvm-readelf -r %t-no-pie-no-fini.exe | FileCheck --check-prefix=RELOC-NO-PIE %s +# RUN: llvm-bolt %t-no-pie-no-fini.exe -o %t-no-pie-no-fini --instrument +# RUN: llvm-readelf -ds -x .fini_array %t-no-pie-no-fini | FileCheck --check-prefix=CHECK-NO-PIE-NO-FINI %s + +## With fini: dynamic section should contain DT_FINI +# DYN-FINI: (FINI) + +## Without fini: dynamic section should only contain DT_FINI_ARRAY +# DYN-NO-FINI-NOT: (FINI) +# DYN-NO-FINI: (FINI_ARRAY) +# DYN-NO-FINI: (FINI_ARRAYSZ) + +## With PIE: binary should have relative relocations +# RELOC-PIE: R_AARCH64_RELATIVE + +## Without PIE: binary should not have relative relocations +# RELOC-NO-PIE-NOT: R_AARCH64_RELATIVE + +## Check that DT_FINI is set to __bolt_runtime_fini +# CHECK-FINI: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-FINI-DAG: (FINI) 0x[[FINI:[[:xdigit:]]+]] +# CHECK-FINI-DAG: (FINI_ARRAY) 0x[[FINI_ARRAY:[[:xdigit:]]+]] +## Check that the dynamic relocation at .fini_array was not patched +# CHECK-FINI: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries +# CHECK-FINI-NOT: {{0+}}[[FINI_ARRAY]] {{.*}} R_AARCH64_RELATIVE [[FINI]] +# CHECK-FINI: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-FINI: {{0+}}[[FINI]] {{.*}} __bolt_runtime_fini + +## Check that DT_FINI_ARRAY has a dynamic relocation for __bolt_runtime_fini +# CHECK-NO-FINI: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-NO-FINI-NOT: (FINI) +# CHECK-NO-FINI: (FINI_ARRAY) 0x[[FINI_ARRAY:[[:xdigit:]]+]] +# CHECK-NO-FINI: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries +# CHECK-NO-FINI: {{0+}}[[FINI_ARRAY]] {{.*}} R_AARCH64_RELATIVE [[FINI_ADDR:[[:xdigit:]]+]] +# CHECK-NO-FINI: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-NO-FINI: {{0+}}[[FINI_ADDR]] {{.*}} __bolt_runtime_fini + +## Check that the static relocation in .fini_array is patched even for PIE +# CHECK-NO-FINI-RELOC: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-NO-FINI-RELOC: (FINI_ARRAY) 0x[[FINI_ARRAY:[[:xdigit:]]+]] +# CHECK-NO-FINI-RELOC: Symbol table '.symtab' contains {{.*}} entries: +## Read bytes separately so we can reverse them later +# CHECK-NO-FINI-RELOC: {{0+}}[[FINI_ADDR_B0:[[:xdigit:]]{2}]][[FINI_ADDR_B1:[[:xdigit:]]{2}]][[FINI_ADDR_B2:[[:xdigit:]]{2}]][[FINI_ADDR_B3:[[:xdigit:]]{2}]] {{.*}} __bolt_runtime_fini +# CHECK-NO-FINI-RELOC: Hex dump of section '.fini_array': +# CHECK-NO-FINI-RELOC: 0x{{0+}}[[FINI_ARRAY]] [[FINI_ADDR_B3]][[FINI_ADDR_B2]][[FINI_ADDR_B1]][[FINI_ADDR_B0]] 00000000 + +## Check that DT_FINI_ARRAY has static relocation applied for __bolt_runtime_fini +# CHECK-NO-PIE-NO-FINI: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-NO-PIE-NO-FINI-NOT: (FINI) +# CHECK-NO-PIE-NO-FINI: (FINI_ARRAY) 0x[[FINI_ARRAY:[a-f0-9]+]] +# CHECK-NO-PIE-NO-FINI: Symbol table '.symtab' contains {{.*}} entries: +## Read address bytes separately so we can reverse them later +# CHECK-NO-PIE-NO-FINI: {{0+}}[[FINI_ADDR_B0:[[:xdigit:]]{2}]][[FINI_ADDR_B1:[[:xdigit:]]{2}]][[FINI_ADDR_B2:[[:xdigit:]]{2}]][[FINI_ADDR_B3:[[:xdigit:]]{2}]] {{.*}} __bolt_runtime_fini +# CHECK-NO-PIE-NO-FINI: Hex dump of section '.fini_array': +# CHECK-NO-PIE-NO-FINI: 0x{{0+}}[[FINI_ARRAY]] [[FINI_ADDR_B3]][[FINI_ADDR_B2]][[FINI_ADDR_B1]][[FINI_ADDR_B0]] 00000000 + + .globl _start + .type _start, %function +_start: 
+  # Dummy relocation to force relocation mode.
+  .reloc 0, R_AARCH64_NONE
+  ret
+.size _start, .-_start
+
+  .globl _fini
+  .type _fini, %function
+_fini:
+  ret
+  .size _fini, .-_fini
+
+  .section .fini_array,"aw"
+  .align 3
+  .dword _fini
diff --git a/bolt/test/AArch64/ifunc.c b/bolt/test/AArch64/ifunc.c
new file mode 100644
index 0000000000000000000000000000000000000000..79c035ed45373c70f19e2af0730ca6f5e3e5ef2c
--- /dev/null
+++ b/bolt/test/AArch64/ifunc.c
@@ -0,0 +1,59 @@
+// This test checks that an IFUNC trampoline is properly recognised by BOLT.
+
+// With -O0, the indirect call is performed on an IPLT trampoline, and the
+// IPLT trampoline has an IFUNC symbol.
+// RUN: %clang %cflags -nostdlib -O0 -no-pie %s -fuse-ld=lld \
+// RUN:    -o %t.O0.exe -Wl,-q
+// RUN: llvm-bolt %t.O0.exe -o %t.O0.bolt.exe \
+// RUN:   --print-disasm --print-only=_start | \
+// RUN:   FileCheck --check-prefix=CHECK %s
+// RUN: llvm-readelf -aW %t.O0.bolt.exe | \
+// RUN:   FileCheck --check-prefix=REL_CHECK %s
+
+// A non-PIE static executable doesn't generate PT_DYNAMIC; check that the
+// relocation is read successfully and the IPLT trampoline is identified by
+// BOLT.
+// RUN: %clang %cflags -nostdlib -O3 %s -fuse-ld=lld -no-pie \
+// RUN:   -o %t.O3_nopie.exe -Wl,-q
+// RUN: llvm-readelf -l %t.O3_nopie.exe | \
+// RUN:   FileCheck --check-prefix=NON_DYN_CHECK %s
+// RUN: llvm-bolt %t.O3_nopie.exe -o %t.O3_nopie.bolt.exe \
+// RUN:   --print-disasm --print-only=_start | \
+// RUN:   FileCheck --check-prefix=CHECK %s
+// RUN: llvm-readelf -aW %t.O3_nopie.bolt.exe | \
+// RUN:   FileCheck --check-prefix=REL_CHECK %s
+
+// With -O3, a direct call is performed on the IPLT trampoline. The IPLT
+// trampoline doesn't have an associated symbol; the ifunc symbol has the
+// same address as the IFUNC resolver function.
+// RUN: %clang %cflags -nostdlib -O3 %s -fuse-ld=lld -fPIC -pie \
+// RUN:   -o %t.O3_pie.exe -Wl,-q
+// RUN: llvm-bolt %t.O3_pie.exe -o %t.O3_pie.bolt.exe \
+// RUN:   --print-disasm --print-only=_start | \
+// RUN:   FileCheck --check-prefix=CHECK %s
+// RUN: llvm-readelf -aW %t.O3_pie.bolt.exe | \
+// RUN:   FileCheck --check-prefix=REL_CHECK %s
+
+// Check that IPLT trampolines located in the .plt section are handled
+// normally by BOLT. The GNU ld linker doesn't use a separate .iplt section.
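+// (For reference, an IPLT trampoline here is expected to look much like the
+// PLT entries described in AArch64MCPlusBuilder -- e.g. adrp x16, ...;
+// ldr x17, [x16, ...]; br x17 -- with an R_AARCH64_IRELATIVE relocation
+// naming the resolver.)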
+// RUN: %clang %cflags -nostdlib -O3 %s -fuse-ld=lld -fPIC -pie \
+// RUN:   -T %p/Inputs/iplt.ld -o %t.iplt_O3_pie.exe -Wl,-q
+// RUN: llvm-bolt %t.iplt_O3_pie.exe -o %t.iplt_O3_pie.bolt.exe \
+// RUN:   --print-disasm --print-only=_start | \
+// RUN:   FileCheck --check-prefix=CHECK %s
+// RUN: llvm-readelf -aW %t.iplt_O3_pie.bolt.exe | \
+// RUN:   FileCheck --check-prefix=REL_CHECK %s
+
+// NON_DYN_CHECK-NOT: DYNAMIC
+
+// CHECK: b{{l?}} "{{resolver_foo|ifoo}}{{.*}}@PLT"
+
+// REL_CHECK: R_AARCH64_IRELATIVE [[#%x,REL_SYMB_ADDR:]]
+// REL_CHECK: [[#REL_SYMB_ADDR]] {{.*}} FUNC {{.*}} resolver_foo
+
+static void foo() {}
+
+static void *resolver_foo(void) { return foo; }
+
+__attribute__((ifunc("resolver_foo"))) void ifoo();
+
+void _start() { ifoo(); }
diff --git a/bolt/test/AArch64/plt-got.test b/bolt/test/AArch64/plt-got.test
new file mode 100644
index 0000000000000000000000000000000000000000..be1c095784b7090bdea01fb5492479a4abdb9938
--- /dev/null
+++ b/bolt/test/AArch64/plt-got.test
@@ -0,0 +1,7 @@
+// This test checks .plt.got handling by BOLT.
+
+RUN: yaml2obj %p/Inputs/plt-got.yaml &> %t.exe
+RUN: llvm-bolt %t.exe -o %t.bolt --print-disasm --print-only=_start/1 | \
+RUN:   FileCheck %s
+
+CHECK: bl abort@PLT
diff --git a/bolt/test/AArch64/r_aarch64_prelxx.s b/bolt/test/AArch64/r_aarch64_prelxx.s
index 444dee72b7c04eb0172c85b01979592d954ce840..73bf8387d3634518278b12ee168c0ce44eba8f92 100644
--- a/bolt/test/AArch64/r_aarch64_prelxx.s
+++ b/bolt/test/AArch64/r_aarch64_prelxx.s
@@ -12,7 +12,7 @@
 // CHECKPREL-NEXT: R_AARCH64_PREL32 {{.*}} _start + 4
 // CHECKPREL-NEXT: R_AARCH64_PREL64 {{.*}} _start + 8
 
-// RUN: llvm-bolt %t.exe -o %t.bolt
+// RUN: llvm-bolt %t.exe -o %t.bolt --strict
 
 // RUN: llvm-objdump -D %t.bolt | FileCheck %s --check-prefix=CHECKPREL32
 
 // CHECKPREL32: [[#%x,DATATABLEADDR:]] :
diff --git a/bolt/test/AArch64/reloc-call26.s b/bolt/test/AArch64/reloc-call26.s
new file mode 100644
index 0000000000000000000000000000000000000000..42e4f7f2b43786fcdd72ba4a614e93562b380fe8
--- /dev/null
+++ b/bolt/test/AArch64/reloc-call26.s
@@ -0,0 +1,51 @@
+## This test checks the processing of the R_AARCH64_CALL26 relocation
+## when the `--funcs` option is enabled.
+
+## We want to test relocations against functions with both higher and
+## lower addresses. The '--force-patch' option is used to prevent the
+## functions func1 and func2 from being optimized, so that their
+## addresses remain unchanged. The relocations can therefore be updated
+## via encodeValueAArch64, and the address order in the output binary is
+## func1 < _start < func2.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \
+# RUN:   %s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --funcs=func1,func2 \
+# RUN:   --force-patch 2>&1 | FileCheck %s -check-prefix=CHECK-BOLT
+# RUN: llvm-objdump -d --disassemble-symbols='_start' %t.bolt | \
+# RUN:   FileCheck %s
+# RUN: llvm-nm --numeric-sort --extern-only %t.bolt | FileCheck \
+# RUN:   %s -check-prefix=CHECK-FUNC-ORDER
+
+# CHECK-BOLT: BOLT-WARNING: failed to patch entries in func1. The function will not be optimized.
+# CHECK-BOLT: BOLT-WARNING: failed to patch entries in func2. The function will not be optimized.
+# CHECK: {{.*}} bl {{.*}} +# CHECK: {{.*}} bl {{.*}} + +# CHECK-FUNC-ORDER: {{.*}} func1 +# CHECK-FUNC-ORDER-NEXT: {{.*}} _start +# CHECK-FUNC-ORDER-NEXT: {{.*}} func2 + + .text + .align 4 + .global func1 + .type func1, %function +func1: + ret + .size func1, .-func1 + .global _start + .type _start, %function +_start: + bl func1 + bl func2 + mov w8, #93 + svc #0 + .size _start, .-_start + .global func2 + .type func2, %function +func2: + ret + .size func2, .-func2 diff --git a/bolt/test/RISCV/branch-no-secondary-entry.s b/bolt/test/RISCV/branch-no-secondary-entry.s new file mode 100644 index 0000000000000000000000000000000000000000..bf8191f25744c9049d0dbf19ad8e499dd31fc031 --- /dev/null +++ b/bolt/test/RISCV/branch-no-secondary-entry.s @@ -0,0 +1,18 @@ +/// Test that no secondary entry points are created for basic block labels used +/// by branches. +// RUN: %clang %cflags -o %t %s +// RUN: llvm-bolt -print-cfg -o /dev/null %t 2>&1 | FileCheck %s + +// CHECK: Binary Function "_start" after building cfg { +// CHECK: IsMultiEntry: 0 +// CHECK: beq t0, t1, .Ltmp0 +// CHECK: {{^}}.Ltmp0 +// CHECK: ret + + .globl _start +_start: + beq t0, t1, 1f +1: + ret + .size _start, .-_start + diff --git a/bolt/test/RISCV/load-store.s b/bolt/test/RISCV/load-store.s new file mode 100644 index 0000000000000000000000000000000000000000..5a9785571c808118b87109c87e4facfa88bafaf7 --- /dev/null +++ b/bolt/test/RISCV/load-store.s @@ -0,0 +1,16 @@ +// RUN: %clang %cflags -o %t %s +// RUN: link_fdata --no-lbr %s %t %t.fdata +// RUN: llvm-bolt %t -o /dev/null --data=%t.fdata --dyno-stats | FileCheck %s + +// CHECK: BOLT-INFO: program-wide dynostats after all optimizations before SCTC and FOP (no change): +// CHECK: 3000 : executed instructions +// CHECK: 1000 : executed load instructions +// CHECK: 1000 : executed store instructions + + .globl _start +_start: +# FDATA: 1 _start #_start# 1 + ld t0, (gp) + sd t0, (gp) + ret + .size _start, .-_start diff --git a/bolt/test/RISCV/mapping-syms.s b/bolt/test/RISCV/mapping-syms.s new file mode 100644 index 0000000000000000000000000000000000000000..e8fdeb0c7572dea6c7b0d2b7293121b40237c511 --- /dev/null +++ b/bolt/test/RISCV/mapping-syms.s @@ -0,0 +1,27 @@ +/// FIXME llvm-mc is used instead of clang because we need a recent change in +/// the RISC-V MC layer (D153260). Once that one is released, we can switch to +/// clang. (Note that the pre-merge check buildbots use the system's clang). 
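+/// (The test relies on the assembler emitting $x/$d mapping symbols around
+/// the .word below; those symbols are what let BOLT tell code from data.)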
+// RUN: llvm-mc -triple riscv64 -mattr=+c -filetype obj -o %t.o %s +// RUN: ld.lld -o %t %t.o +// RUN: llvm-bolt --print-cfg --print-only=_start -o %t.bolt %t 2>&1 | FileCheck %s +// RUN: llvm-objdump -d %t.bolt | FileCheck --check-prefix=CHECK-OBJDUMP %s + +// CHECK-NOT: BOLT-WARNING + +/// Check that .word is not disassembled by BOLT +// CHECK: 00000000: nop +// CHECK: 00000002: ret + +/// Check .word is still present in output +// CHECK-OBJDUMP: <_start>: +// CHECK-OBJDUMP-NEXT: nop +// CHECK-OBJDUMP-NEXT: unimp +// CHECK-OBJDUMP-NEXT: unimp +// CHECK-OBJDUMP-NEXT: ret + .text + .globl _start + .p2align 1 +_start: + nop + .word 0x0 + ret diff --git a/bolt/test/RISCV/reloc-abs.s b/bolt/test/RISCV/reloc-abs.s index 3e4b8b1395e1ff8312dadd0d4abbabf4359fa7b5..5b728f092b3c9f587d063db38f85cda7605b0866 100644 --- a/bolt/test/RISCV/reloc-abs.s +++ b/bolt/test/RISCV/reloc-abs.s @@ -17,8 +17,7 @@ _start: .option push .option norelax 1: -// CHECK: .Ltmp0 -// CHECK: auipc gp, %pcrel_hi(__global_pointer$) +// CHECK: auipc gp, %pcrel_hi(__global_pointer$) # Label: .Ltmp0 // CHECK-NEXT: addi gp, gp, %pcrel_lo(.Ltmp0) auipc gp, %pcrel_hi(__global_pointer$) addi gp, gp, %pcrel_lo(1b) diff --git a/bolt/test/RISCV/reloc-bb-split.s b/bolt/test/RISCV/reloc-bb-split.s new file mode 100644 index 0000000000000000000000000000000000000000..5995562cf130b07083c967d4797a141a6e9472e3 --- /dev/null +++ b/bolt/test/RISCV/reloc-bb-split.s @@ -0,0 +1,42 @@ +// RUN: %clang %cflags -o %t %s +// RUN: llvm-bolt --print-cfg --print-only=_start -o /dev/null %t \ +// RUN: | FileCheck %s + + .data + .globl d + .p2align 3 +d: + .dword 0 + + .text + .globl _start + .p2align 1 +// CHECK-LABEL: Binary Function "_start" after building cfg { +_start: +/// The local label is used for %pcrel_lo as well as a jump target so a new +/// basic block should start there. +// CHECK-LABEL: {{^}}.LBB00 +// CHECK: nop +// CHECK-LABEL: {{^}}.Ltmp0 +// CHECK: auipc t0, %pcrel_hi(d) # Label: .Ltmp1 +// CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp1)(t0) +// CHECK-NEXT: j .Ltmp0 + nop +1: + auipc t0, %pcrel_hi(d) + ld t0, %pcrel_lo(1b)(t0) + j 1b + +/// The local label is used only for %pcrel_lo so no new basic block should +/// start there. 
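+/// (Recall the idiom: %pcrel_lo(1b) refers back to the auipc at local label
+/// 1, not to `d` itself, so that label is an instruction reference rather
+/// than a branch target.)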
+// CHECK-LABEL: {{^}}.LFT0 +// CHECK: nop +// CHECK-NEXT: auipc t0, %pcrel_hi(d) # Label: .Ltmp2 +// CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp2)(t0) +// CHECK-NEXT: ret + nop +1: + auipc t0, %pcrel_hi(d) + ld t0, %pcrel_lo(1b)(t0) + ret + .size _start, .-_start diff --git a/bolt/test/RISCV/reloc-got.s b/bolt/test/RISCV/reloc-got.s index b6cd61be723bfa7c7012eee5a4dd484152a1921d..dcf9d0ea3ffbf23b999f1dfb2013a448a42e9739 100644 --- a/bolt/test/RISCV/reloc-got.s +++ b/bolt/test/RISCV/reloc-got.s @@ -14,8 +14,7 @@ d: // CHECK: Binary Function "_start" after building cfg { _start: nop // Here to not make the _start and .Ltmp0 symbols coincide -// CHECK: .Ltmp0 -// CHECK: auipc t0, %pcrel_hi(__BOLT_got_zero+{{[0-9]+}}) +// CHECK: auipc t0, %pcrel_hi(__BOLT_got_zero+{{[0-9]+}}) # Label: .Ltmp0 // CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp0)(t0) 1: auipc t0, %got_pcrel_hi(d) diff --git a/bolt/test/RISCV/reloc-pcrel.s b/bolt/test/RISCV/reloc-pcrel.s index 2d5a349d03e788ee1bba2a6586f9b5ad9cd6f45c..3ad3015a0a57fadaa65fa8e2902b63a76f6e9f8d 100644 --- a/bolt/test/RISCV/reloc-pcrel.s +++ b/bolt/test/RISCV/reloc-pcrel.s @@ -14,9 +14,11 @@ d: // CHECK: Binary Function "_start" after building cfg { _start: nop // Here to not make the _start and .Ltmp0 symbols coincide -// CHECK: .Ltmp0 -// CHECK: auipc t0, %pcrel_hi(d) +// CHECK: auipc t0, %pcrel_hi(d) # Label: .Ltmp0 // CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp0)(t0) ld t0, d +// CHECK-NEXT: auipc t1, %pcrel_hi(d) # Label: .Ltmp1 +// CHECK-NEXT: sd t0, %pcrel_lo(.Ltmp1)(t1) + sd t0, d, t1 ret .size _start, .-_start diff --git a/bolt/test/X86/Inputs/blarge_profile_stale.yaml b/bolt/test/X86/Inputs/blarge_profile_stale.yaml index afe76eda5485005569330f21adbf4289cbfc0423..f5abaed3da39412588473a90779761c9d245a4dd 100644 --- a/bolt/test/X86/Inputs/blarge_profile_stale.yaml +++ b/bolt/test/X86/Inputs/blarge_profile_stale.yaml @@ -6,6 +6,7 @@ header: profile-flags: [ lbr ] profile-origin: branch profile reader profile-events: '' + dfs-order: false functions: - name: SolveCubic fid: 6 @@ -15,20 +16,24 @@ functions: blocks: - bid: 0 insns: 43 - hash: 0xD2411AC186118199 + hash: 0xed4db287e71c0000 exec: 151 - succ: [ { bid: 1, cnt: 4, mis: 2 }, { bid: 11, cnt: 0 } ] + succ: [ { bid: 1, cnt: 151, mis: 2 }, { bid: 7, cnt: 0 } ] - bid: 1 insns: 7 - hash: 0xDF0C9CC1FEAA70C3 - succ: [ { bid: 10, cnt: 0 }, { bid: 2, cnt: 0 } ] + hash: 0x39330000e4560088 + succ: [ { bid: 13, cnt: 151 }, { bid: 2, cnt: 0 } ] - bid: 13 insns: 26 - hash: 0xF05DC5524E99E56F - succ: [ { bid: 15, cnt: 89 }, { bid: 14, cnt: 0 } ] - - bid: 15 + hash: 0xa9700000fe202a7 + succ: [ { bid: 3, cnt: 89 }, { bid: 2, cnt: 10 } ] + - bid: 3 + insns: 9 + hash: 0x62391dad18a700a0 + succ: [ { bid: 5, cnt: 151 } ] + - bid: 5 insns: 9 - hash: 0xB2E8338276A9834E + hash: 0x4d906d19ecec0111 - name: usqrt fid: 7 hash: 0x8B62B1F9AD81EA35 @@ -37,15 +42,15 @@ functions: blocks: - bid: 0 insns: 4 - hash: 0xb1e5b76571270000 + hash: 0x1111111111111111 exec: 20 succ: [ { bid: 1, cnt: 0 } ] - bid: 1 insns: 9 - hash: 0x587e93788b970010 + hash: 0x27e43a5e10cd0010 succ: [ { bid: 3, cnt: 320, mis: 171 }, { bid: 2, cnt: 0 } ] - bid: 3 insns: 2 - hash: 0x20e605d745e50039 + hash: 0x4db935b6471e0039 succ: [ { bid: 1, cnt: 300, mis: 33 }, { bid: 4, cnt: 20 } ] ... 
diff --git a/bolt/test/X86/Inputs/patch-entries.c b/bolt/test/X86/Inputs/patch-entries.c new file mode 100644 index 0000000000000000000000000000000000000000..46a3b41b048e466e6440f24b01c5ce299bc47ac4 --- /dev/null +++ b/bolt/test/X86/Inputs/patch-entries.c @@ -0,0 +1,8 @@ +#include "stub.h" + +static void foo() { printf("foo\n"); } + +int main() { + foo(); + return 0; +} diff --git a/bolt/test/X86/bolt-address-translation-internal-call.test b/bolt/test/X86/bolt-address-translation-internal-call.test index e24a9e6dc1c2272050f2c7a75d89dac48dcf8014..24cb635e13e9830e4a0f103b6aadcb83426cbc97 100644 --- a/bolt/test/X86/bolt-address-translation-internal-call.test +++ b/bolt/test/X86/bolt-address-translation-internal-call.test @@ -9,7 +9,7 @@ # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o # Delete our BB symbols so BOLT doesn't mark them as entry points # RUN: llvm-strip --strip-unneeded %t.o -# RUN: %clang %t.o -o %t.exe -Wl,-q +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q # RUN: llvm-bolt --enable-bat %t.exe --relocs -o %t.out | FileCheck %s # CHECK: BOLT-INFO: Wrote {{.*}} BAT maps @@ -29,6 +29,7 @@ main: push %rbx sub $0x120,%rsp mov $0x3,%rbx + movq rel(%rip), %rdi .J1: cmp $0x0,%rbx je .J2 @@ -49,4 +50,8 @@ main: .J4: pop %rbp retq +end: .size main, .-main + + .data +rel: .quad end diff --git a/bolt/test/X86/bug-function-layout-execount.s b/bolt/test/X86/bug-function-layout-execount.s new file mode 100644 index 0000000000000000000000000000000000000000..540b6790d01e900a77ccd1b2740b13ea24042108 --- /dev/null +++ b/bolt/test/X86/bug-function-layout-execount.s @@ -0,0 +1,73 @@ +# Verifies that llvm-bolt correctly sorts functions by their execution counts. + +# REQUIRES: x86_64-linux, asserts + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe --data %t.fdata --lite --reorder-functions=exec-count \ +# RUN: -v=2 --debug-only=hfsort -o /dev/null 2>&1 | FileCheck %s + +# CHECK: Starting pass: reorder-functions +# CHECK-NEXT: hot func func2 (1500) +# CHECK-NEXT: hot func func1 (500) +# CHECK-NEXT: hot func main (400) +# CHECK-NEXT: hot func func5 (110) +# CHECK-NEXT: hot func func3 (100) +# CHECK-NEXT: hot func func4 (99) + + .text + .globl main + .type main, %function +main: +# FDATA: 0 [unknown] 0 1 main 0 1 400 + .cfi_startproc + call func1 + retq + .size _start, .-_start + .cfi_endproc + + .globl func1 + .type func1,@function +func1: +# FDATA: 0 [unknown] 0 1 func1 0 1 500 + .cfi_startproc + retq + .size func1, .-func1 + .cfi_endproc + + .globl func2 + .type func2,@function +func2: +# FDATA: 0 [unknown] 0 1 func2 0 1 1500 + .cfi_startproc + retq + .size func2, .-func2 + .cfi_endproc + + .globl func3 + .type func3,@function +func3: +# FDATA: 0 [unknown] 0 1 func3 0 1 100 + .cfi_startproc + retq + .size func3, .-func3 + .cfi_endproc + + .globl func4 + .type func4,@function +func4: +# FDATA: 0 [unknown] 0 1 func4 0 1 99 + .cfi_startproc + retq + .size func4, .-func4 + .cfi_endproc + + .globl func5 + .type func5,@function +func5: +# FDATA: 0 [unknown] 0 1 func5 0 1 110 + .cfi_startproc + retq + .size func5, .-func5 + .cfi_endproc diff --git a/bolt/test/X86/calculate-emitted-block-size.s b/bolt/test/X86/calculate-emitted-block-size.s new file mode 100644 index 0000000000000000000000000000000000000000..b1d05b83cb87c74d8794066504331e47899452d0 --- /dev/null +++ b/bolt/test/X86/calculate-emitted-block-size.s @@ -0,0 +1,101 @@ +# Test 
BinaryContext::calculateEmittedSize's functionality to update +# BinaryBasicBlock::OutputAddressRange in place so that the emitted size +# of each basic block is given by BinaryBasicBlock::getOutputSize() + +# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=all \ +# RUN: --print-split --print-only=chain --print-output-address-range \ +# RUN: --data=%t.fdata --reorder-blocks=ext-tsp \ +# RUN: 2>&1 | FileCheck --check-prefix=SPLITALL %s +# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --print-split \ +# RUN: --print-only=chain --print-output-address-range \ +# RUN: --data=%t.fdata --reorder-blocks=ext-tsp \ +# RUN: 2>&1 | FileCheck --check-prefix=SPLITHOTCOLD %s + +# SPLITALL: {{^\.LBB00}} +# SPLITALL: Output Address Range: [0x0, 0x12) (18 bytes) +# SPLITALL: {{^\.LFT0}} +# SPLITALL: Output Address Range: [0x0, 0xa) (10 bytes) +# SPLITALL: {{^\.Ltmp1}} +# SPLITALL: Output Address Range: [0x0, 0x2) (2 bytes) +# SPLITALL: {{^\.Ltmp0}} +# SPLITALL: Output Address Range: [0x0, 0x10) (16 bytes) +# SPLITALL: {{^\.Ltmp2}} +# SPLITALL: Output Address Range: [0x0, 0x8) (8 bytes) +# SPLITALL: {{^\.LFT1}} +# SPLITALL: Output Address Range: [0x0, 0x8) (8 bytes) + +# SPLITHOTCOLD: {{^\.LBB00}} +# SPLITHOTCOLD: Output Address Range: [0x0, 0x9) (9 bytes) +# SPLITHOTCOLD: {{^\.LFT0}} +# SPLITHOTCOLD: Output Address Range: [0x9, 0xe) (5 bytes) +# SPLITHOTCOLD: {{^\.Ltmp1}} +# SPLITHOTCOLD: Output Address Range: [0xe, 0x10) (2 bytes) +# SPLITHOTCOLD: {{^\.Ltmp0}} +# SPLITHOTCOLD: Output Address Range: [0x10, 0x1b) (11 bytes) +# SPLITHOTCOLD: {{^\.Ltmp2}} +# SPLITHOTCOLD: Output Address Range: [0x1b, 0x20) (5 bytes) +# SPLITHOTCOLD: {{^\.LFT1}} +# SPLITHOTCOLD: Output Address Range: [0x0, 0x8) (8 bytes) + + .text + .globl chain + .type chain, @function +chain: + pushq %rbp + movq %rsp, %rbp + cmpl $2, %edi +LLentry_LLchain_start: + jge LLchain_start +# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLchain_start# 0 10 +# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLfast# 0 500 +LLfast: + movl $5, %eax +LLfast_LLexit: + jmp LLexit +# FDATA: 1 chain #LLfast_LLexit# 1 chain #LLexit# 0 500 +LLchain_start: + movl $10, %eax +LLchain_start_LLchain1: + jge LLchain1 +# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLchain1# 0 10 +# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLcold# 0 0 +LLcold: + addl $1, %eax +LLchain1: + addl $1, %eax +LLchain1_LLexit: + jmp LLexit +# FDATA: 1 chain #LLchain1_LLexit# 1 chain #LLexit# 0 10 +LLexit: + popq %rbp + ret +LLchain_end: + .size chain, LLchain_end-chain + + + .globl main + .type main, @function +main: + pushq %rbp + movq %rsp, %rbp + movl $1, %edi +LLmain_chain1: + call chain +# FDATA: 1 main #LLmain_chain1# 1 chain 0 0 500 + movl $4, %edi +LLmain_chain2: + call chain +# FDATA: 1 main #LLmain_chain2# 1 chain 0 0 10 + xorl %eax, %eax + popq %rbp + retq +.Lmain_end: + .size main, .Lmain_end-main diff --git a/bolt/test/X86/checkvma-large-section.test b/bolt/test/X86/checkvma-large-section.test new file mode 100644 index 0000000000000000000000000000000000000000..afa44111ead49e5d7d46f13a8578144cb6311fd9 --- /dev/null +++ 
b/bolt/test/X86/checkvma-large-section.test @@ -0,0 +1,35 @@ +# This test reproduces the issue with a section which ends at >4G address +REQUIRES: asserts +RUN: split-file %s %t +RUN: yaml2obj %t/yaml -o %t.exe --max-size=0 +RUN: llvm-bolt %t.exe -o /dev/null --allow-stripped +#--- yaml +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_LOAD + FirstSec: .a + LastSec: .a + Align: 0x1000 + - Type: PT_LOAD + Flags: [ PF_R, PF_W ] + FirstSec: .large_sec + LastSec: .large_sec + VAddr: 0x80000000 + - Type: PT_GNU_RELRO + Flags: [ PF_R ] +Sections: + - Name: .a + Type: SHT_PROGBITS + Content: 00 + AddressAlign: 0x1 + - Name: .large_sec + Type: SHT_NOBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x80000000 + Size: 0x80000000 +... diff --git a/bolt/test/X86/dwarf4-df-dualcu.test b/bolt/test/X86/dwarf4-df-dualcu.test index 71726136d7ca5fd947dc4418a001be6b6bae6fa8..c8135ac54377f845fe9514bf9dddb44633a1a9b5 100644 --- a/bolt/test/X86/dwarf4-df-dualcu.test +++ b/bolt/test/X86/dwarf4-df-dualcu.test @@ -1,7 +1,7 @@ ; RUN: rm -rf %t ; RUN: mkdir %t ; RUN: cd %t -;; RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-df-dualcu-main.s \ +; RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-df-dualcu-main.s \ ; RUN: -split-dwarf-file=main.dwo -o main.o ; RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-df-dualcu-helper.s \ ; RUN: -split-dwarf-file=helper.dwo -o helper.o @@ -12,7 +12,7 @@ ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe.bolt >> %t/foo.txt ; RUN: cat %t/foo.txt | FileCheck -check-prefix=BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.dwo &> maindwo.txt -; RUN cat maindwo.txt | FileCheck -check-prefix=PRE-BOLT-DWO-MAIN %s +; RUN: cat maindwo.txt | FileCheck -check-prefix=PRE-BOLT-DWO-MAIN %s ; RUN: not llvm-dwarfdump --show-form --verbose --debug-info main.dwo.dwo &> mainddwodwo.txt ; RUN: cat mainddwodwo.txt | FileCheck -check-prefix=BOLT-DWO-MAIN %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-info helper.dwo &> helperdwo.txt diff --git a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp index f6ebd6b76f60acc63ce87f045b791198ba08cb3a..4ed8be42cd0f37fdfa6e02e165059dc33d943cc6 100644 --- a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp +++ b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp @@ -1,7 +1,7 @@ // This test checks that .eh_frame_hdr address is in bounds of the last LOAD // end address i.e. the section address is smaller then the LOAD end address. 
-// REQUIRES: system-linux,bolt-runtime +// REQUIRES: system-linux,bolt-runtime,target=x86_64{{.*}} // RUN: %clangxx %cxxflags -static -Wl,-q %s -o %t.exe -Wl,--entry=_start // RUN: llvm-bolt %t.exe -o %t.instr -instrument \ diff --git a/bolt/test/X86/internal-call-instrument-so.s b/bolt/test/X86/internal-call-instrument-so.s index b8903fc7f8223c5545b41fbaec0d70f6b7e2ffbe..d13c828f605c3e3d5d8d5b4af93f03ebe198b05f 100644 --- a/bolt/test/X86/internal-call-instrument-so.s +++ b/bolt/test/X86/internal-call-instrument-so.s @@ -1,6 +1,6 @@ # This reproduces a bug with instrumentation crashes on internal call -# REQUIRES: system-linux,bolt-runtime +# REQUIRES: system-linux,bolt-runtime,target=x86_64{{.*}} # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o # Delete our BB symbols so BOLT doesn't mark them as entry points @@ -41,7 +41,6 @@ _start: retq .size _start, .-_start - .globl _fini .type _fini, %function .p2align 4 diff --git a/bolt/test/X86/internal-call-instrument.s b/bolt/test/X86/internal-call-instrument.s index 7ddfb4fb812d3528c6f2ca7fe01925ea18dfb664..c393f1dac864718c38dc6080394d495b6d119171 100644 --- a/bolt/test/X86/internal-call-instrument.s +++ b/bolt/test/X86/internal-call-instrument.s @@ -1,15 +1,23 @@ # This reproduces a bug with instrumentation crashes on internal call -# REQUIRES: x86_64-linux,bolt-runtime +# REQUIRES: x86_64-linux,bolt-runtime,target=x86_64{{.*}} # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o # Delete our BB symbols so BOLT doesn't mark them as entry points # RUN: llvm-strip --strip-unneeded %t.o -# RUN: %clang %t.o -o %t.exe -Wl,-q +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q # RUN: llvm-bolt --instrument %t.exe --relocs -o %t.out .text + .globl _start + .type _start, %function + .p2align 4 +_start: + call main + ret + .size _start, .-_start + .globl main .type main, %function .p2align 4 @@ -20,6 +28,7 @@ main: push %rbx sub $0x120,%rsp mov $0x3,%rbx + movq rel(%rip), %rdi .J1: cmp $0x0,%rbx je .J2 @@ -40,4 +49,15 @@ main: .J4: pop %rbp retq +end: .size main, .-main + + .globl _fini + .type _fini, %function + .p2align 4 +_fini: + hlt + .size _fini, .-_fini + + .data +rel: .quad end diff --git a/bolt/test/X86/issue26.s b/bolt/test/X86/issue26.s index a6e38b6e4ceffd9aaddb482aee4c09f9126925a5..6f9bc72d6e10dcff385e6d5e920407b0b02ae2cf 100644 --- a/bolt/test/X86/issue26.s +++ b/bolt/test/X86/issue26.s @@ -7,7 +7,7 @@ # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \ # RUN: %s -o %t.o # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe --relocs --print-cfg -o %t.out \ +# RUN: llvm-bolt %t.exe --relocs --print-cfg -o %t.out 2>&1 \ # RUN: | FileCheck %s # CHECK-NOT: BOLT-WARNING: CFG invalid in XYZ @ .LBB0 diff --git a/bolt/test/X86/issue26.test b/bolt/test/X86/issue26.test index 5bf25e6a59bab40c0fe1ddefa92d1eebf3acb67f..bafd0912cf4a48e7f98e86e0653ab09ec8640c02 100644 --- a/bolt/test/X86/issue26.test +++ b/bolt/test/X86/issue26.test @@ -1,7 +1,7 @@ # This reproduces issue 26 from our github repo # RUN: yaml2obj %p/Inputs/issue26.yaml &> %t.exe -# RUN: llvm-bolt %t.exe --relocs --print-cfg -o %t.out \ +# RUN: llvm-bolt %t.exe --relocs --print-cfg -o %t.out 2>&1 \ # RUN: | FileCheck %s CHECK-NOT: BOLT-WARNING: CFG invalid in XYZ @ .LBB0 diff --git a/bolt/test/X86/jump-table-func-entry.s b/bolt/test/X86/jump-table-func-entry.s new file mode 100644 index 0000000000000000000000000000000000000000..77b444d520a1f105c86c6ea8f5210e29bc3a847e --- /dev/null +++ b/bolt/test/X86/jump-table-func-entry.s @@ 
-0,0 +1,72 @@ +# REQUIRES: system-linux + +## Check that BOLT correctly processes jump table that contains function start +## as one of its entries. + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.exe -no-pie -Wl,-q + +# RUN: llvm-bolt %t.exe --print-normalized --print-only=foo -o %t.out \ +# RUN: |& FileCheck %s + + + + .text + .globl _start + .type _start, %function +_start: + .cfi_startproc + call foo + ret + .cfi_endproc + .size _start, .-_start + + .globl foo + .type foo, %function +foo: + .cfi_startproc +.LBB00: + movq 0x8(%rdi), %rdi + movzbl 0x1(%rdi), %eax +.LBB00_br: + jmpq *"JUMP_TABLE/foo.0"(,%rax,8) +# CHECK: jmpq {{.*}} # JUMPTABLE +# CHECK-NEXT: Successors: {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}} + +.Ltmp87085: + xorl %eax, %eax + retq + +.Ltmp87086: + cmpb $0x0, 0x8(%rdi) + setne %al + retq + +.Ltmp87088: + movb $0x1, %al + retq + +.Ltmp87087: + movzbl 0x14(%rdi), %eax + andb $0x2, %al + shrb %al + retq + + .cfi_endproc +.size foo, .-foo + +# Jump tables +.section .rodata +"JUMP_TABLE/foo.0": + .quad .Ltmp87085 + .quad .Ltmp87086 + .quad .Ltmp87087 + .quad .LBB00 + .quad .Ltmp87088 + +# CHECK: Jump table {{.*}} for function foo +# CHECK-NEXT: 0x{{.*}} : +# CHECK-NEXT: 0x{{.*}} : +# CHECK-NEXT: 0x{{.*}} : +# CHECK-NEXT: 0x{{.*}} : +# CHECK-NEXT: 0x{{.*}} : diff --git a/bolt/test/X86/jump-table-icp.test b/bolt/test/X86/jump-table-icp.test index 708f1273af3f19600fe1ac6cf06b6ca375194cb1..5b989d18018b0505f7293c6d00d238e5b1e101d6 100644 --- a/bolt/test/X86/jump-table-icp.test +++ b/bolt/test/X86/jump-table-icp.test @@ -12,6 +12,7 @@ RUN: (llvm-bolt %t.exe --data %t.fdata -o %t --relocs \ RUN: --reorder-blocks=cache --split-functions --split-all-cold \ RUN: --use-gnu-stack --dyno-stats --indirect-call-promotion=jump-tables \ RUN: --print-icp -v=0 \ +RUN: --enable-bat --print-cache-metrics \ RUN: --icp-jt-remaining-percent-threshold=10 \ RUN: --icp-jt-total-percent-threshold=2 \ RUN: --indirect-call-promotion-topn=1 \ @@ -36,12 +37,14 @@ CHECK: Successors: .Ltmp{{.*}} (mispreds: 189, count: 189), .LFT{{.*}} (mispre CHECK: .LFT{{.*}} (4 instructions, align : 1) CHECK-NEXT: Exec Count : 881 CHECK: Predecessors: .LBB{{.*}} -CHECK: Successors: .Ltmp{{.*}} (mispreds: 138, count: 155), .Ltmp{{.*}} (mispreds: 0, count: 726) +CHECK: je {{.*}} # Offset: 28 +CHECK-NEXT: Successors: .Ltmp{{.*}} (mispreds: 138, count: 155), .Ltmp{{.*}} (mispreds: 0, count: 726) CHECK: .Ltmp{{.*}} (1 instructions, align : 1) CHECK-NEXT: Exec Count : 726 CHECK: Predecessors: .LFT{{.*}} -CHECK: Successors: .L{{.*}} (mispreds: 126, count: 157), .L{{.*}} (mispreds: 140, count: 156), .L{{.*}} (mispreds: 134, count: 152), .L{{.*}} (mispreds: 137, count: 150), .L{{.*}} (mispreds: 129, count: 148), .L{{.*}} (mispreds: 0, count: 0) +CHECK: jmpq {{.*}} # Offset: 28 +CHECK-NEXT: Successors: .L{{.*}} (mispreds: 126, count: 157), .L{{.*}} (mispreds: 140, count: 156), .L{{.*}} (mispreds: 134, count: 152), .L{{.*}} (mispreds: 137, count: 150), .L{{.*}} (mispreds: 129, count: 148), .L{{.*}} (mispreds: 0, count: 0) CHECK: .Ltmp{{.*}} (5 instructions, align : 1) CHECK-NEXT: Exec Count : 167 diff --git a/bolt/test/X86/keep-nops.s b/bolt/test/X86/keep-nops.s new file mode 100644 index 0000000000000000000000000000000000000000..37da2ff07b9b79827a3ff43e43897bc53b792fe3 --- /dev/null +++ b/bolt/test/X86/keep-nops.s @@ -0,0 +1,69 @@ +## Check that BOLT preserves NOP instructions of different sizes correctly. 
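Background for the byte sequences checked below: x86-64 defines one recommended NOP encoding per length from 1 to 15 bytes. The following illustrative C++ table (not part of the test) mirrors exactly the encodings that the llvm-objdump CHECK lines verify:

#include <cstdint>
#include <cstdio>
#include <vector>

// Canonical x86-64 NOP encodings for lengths 1..15, as emitted by LLVM and
// as checked by this test's CHECK lines.
static const std::vector<std::vector<uint8_t>> Nops = {
    {0x90},
    {0x66, 0x90},
    {0x0f, 0x1f, 0x00},
    {0x0f, 0x1f, 0x40, 0x00},
    {0x0f, 0x1f, 0x44, 0x00, 0x00},
    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
    {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
    {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x66, 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x66, 0x66, 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
};

int main() {
  for (size_t i = 0; i < Nops.size(); ++i) {
    printf("nop%zu:", i + 1);
    for (uint8_t b : Nops[i])
      printf(" %02x", b);
    printf("\n");
  }
}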
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-linux %s -o %t.o
+# RUN: ld.lld %t.o -o %t.exe -q
+# RUN: llvm-bolt %t.exe -o %t.bolt.exe --keep-nops --relocs --print-finalized \
+# RUN:   |& FileCheck --check-prefix=CHECK-BOLT %s
+# RUN: llvm-objdump -d %t.bolt.exe | FileCheck %s
+
+  .text
+  .globl _start
+  .type _start,@function
+_start:
+  .cfi_startproc
+  .nops 1
+  .nops 2
+  .nops 3
+  .nops 4
+  .nops 5
+  .nops 6
+  .nops 7
+  .nops 8
+  .nops 9
+  .nops 10
+  .nops 11
+  .nops 12
+  .nops 13
+  .nops 14
+  .nops 15
+
+# CHECK: <_start>:
+# CHECK-NEXT: 90
+# CHECK-NEXT: 66 90
+# CHECK-NEXT: 0f 1f 00
+# CHECK-NEXT: 0f 1f 40 00
+# CHECK-NEXT: 0f 1f 44 00 00
+# CHECK-NEXT: 66 0f 1f 44 00 00
+# CHECK-NEXT: 0f 1f 80 00 00 00 00
+# CHECK-NEXT: 0f 1f 84 00 00 00 00 00
+# CHECK-NEXT: 66 0f 1f 84 00 00 00 00 00
+# CHECK-NEXT: 66 2e 0f 1f 84 00 00 00 00 00
+# CHECK-NEXT: 66 66 2e 0f 1f 84 00 00 00 00 00
+# CHECK-NEXT: 66 66 66 2e 0f 1f 84 00 00 00 00 00
+# CHECK-NEXT: 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
+# CHECK-NEXT: 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
+# CHECK-NEXT: 66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
+
+# CHECK-BOLT: Size: 1
+# CHECK-BOLT-NEXT: Size: 2
+# CHECK-BOLT-NEXT: Size: 3
+# CHECK-BOLT-NEXT: Size: 4
+# CHECK-BOLT-NEXT: Size: 5
+# CHECK-BOLT-NEXT: Size: 6
+# CHECK-BOLT-NEXT: Size: 7
+# CHECK-BOLT-NEXT: Size: 8
+# CHECK-BOLT-NEXT: Size: 9
+# CHECK-BOLT-NEXT: Size: 10
+# CHECK-BOLT-NEXT: Size: 11
+# CHECK-BOLT-NEXT: Size: 12
+# CHECK-BOLT-NEXT: Size: 13
+# CHECK-BOLT-NEXT: Size: 14
+# CHECK-BOLT-NEXT: Size: 15
+
+# Needed for relocation mode.
+  .reloc 0, R_X86_64_NONE
+
+  .size _start, .-_start
+  .cfi_endproc
diff --git a/bolt/test/X86/patch-entries.test b/bolt/test/X86/patch-entries.test
new file mode 100644
index 0000000000000000000000000000000000000000..54f358f273e793c30da84fe3a6134e77d4cb4759
--- /dev/null
+++ b/bolt/test/X86/patch-entries.test
@@ -0,0 +1,10 @@
+# Check that BOLT does not crash on binary functions injected by the
+# patch-entries pass when debug info is turned on. Previously, we were
+# trying to fetch input-to-output maps for the injected functions and
+# crashing.
+
+REQUIRES: system-linux
+
+RUN: %clang %cflags -no-pie -g %p/Inputs/patch-entries.c -fuse-ld=lld -o %t.exe \
+RUN:   -Wl,-q -I%p/../Inputs
+RUN: llvm-bolt -relocs %t.exe -o %t.out --update-debug-sections --force-patch
diff --git a/bolt/test/X86/reader-stale-yaml.test b/bolt/test/X86/reader-stale-yaml.test
index 3f9861d2b7092301518c103db31dabb094e7de0e..5231032f4f4a75da976c093865295d61fd026bdf 100644
--- a/bolt/test/X86/reader-stale-yaml.test
+++ b/bolt/test/X86/reader-stale-yaml.test
@@ -1,39 +1,71 @@
 # This script checks that YamlProfileReader in llvm-bolt is reading data
-# correctly and stale data is corrected.
+# correctly and stale data is corrected by profile inference.
RUN: yaml2obj %p/Inputs/blarge.yaml &> %t.exe +# Testing "usqrt" RUN: llvm-bolt %t.exe -o /dev/null --b %p/Inputs/blarge_profile_stale.yaml \ RUN: --print-cfg --print-only=usqrt --infer-stale-profile=1 \ -RUN: --profile-ignore-hash=1 --profile-use-dfs 2>&1 | FileCheck %s +RUN: --profile-ignore-hash=1 --profile-use-dfs=0 2>&1 | FileCheck %s -check-prefix=CHECK1 +# Testing "SolveCubic" +RUN: llvm-bolt %t.exe -o /dev/null --b %p/Inputs/blarge_profile_stale.yaml \ +RUN: --print-cfg --print-only=SolveCubic --infer-stale-profile=1 \ +RUN: --profile-ignore-hash=1 --profile-use-dfs=0 2>&1 | FileCheck %s -check-prefix=CHECK2 + +# Function "usqrt" has stale profile, since the number of blocks in the profile +# (nblocks=6) does not match the size of the CFG in the binary. The entry +# block (bid=0) has an incorrect (missing) count, which should be inferred by +# the algorithm. # Verify that yaml reader works as expected. -CHECK: pre-processing profile using YAML profile reader +CHECK1: pre-processing profile using YAML profile reader +CHECK1: Binary Function "usqrt" after building cfg { +CHECK1: State : CFG constructed +CHECK1: Address : 0x401170 +CHECK1: Size : 0x43 +CHECK1: Section : .text +CHECK1: IsSimple : 1 +CHECK1: BB Count : 5 +CHECK1: Exec Count : 20 +CHECK1: Branch Count: 640 +CHECK1: } +# Verify block counts. +CHECK1: .LBB01 (4 instructions, align : 1) +CHECK1: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 20) +CHECK1: .Ltmp[[#BB13:]] (9 instructions, align : 1) +CHECK1: Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 320), .LFT[[#BB0:]] (mispreds: 0, count: 0) +CHECK1: .LFT[[#BB0:]] (2 instructions, align : 1) +CHECK1: Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 0) +CHECK1: .Ltmp[[#BB12:]] (2 instructions, align : 1) +CHECK1: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 300), .LFT[[#BB1:]] (mispreds: 0, count: 20) +CHECK1: .LFT[[#BB1:]] (2 instructions, align : 1) +# Check the overall inference stats. +CHECK1: 2 out of 7 functions in the binary (28.6%) have non-empty execution profile +CHECK1: inferred profile for 2 (100.00% of profiled, 100.00% of stale) functions responsible for {{.*}} samples ({{.*}} out of {{.*}}) -# Verify the inferred counts of "usqrt" that has stale profile: -# - the function has nblocks=6 in the profile, which makes it stale -# - block with bid=0 has an incorrect (missing) count, which is inferred -CHECK: Binary Function "usqrt" after building cfg { -CHECK: State : CFG constructed -CHECK: Address : 0x401170 -CHECK: Size : 0x43 -CHECK: Section : .text -CHECK: IsSimple : 1 -CHECK: BB Count : 5 -CHECK: Exec Count : 20 -CHECK: Branch Count: 640 -CHECK: } -# Verify block counts. -CHECK: .LBB01 (4 instructions, align : 1) -CHECK: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 20) -CHECK: .Ltmp[[#BB13:]] (9 instructions, align : 1) -CHECK: Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 320), .LFT[[#BB0:]] (mispreds: 0, count: 0) -CHECK: .LFT[[#BB0:]] (2 instructions, align : 1) -CHECK: Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 0) -CHECK: .Ltmp[[#BB12:]] (2 instructions, align : 1) -CHECK: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 300), .LFT[[#BB1:]] (mispreds: 0, count: 20) -CHECK: .LFT[[#BB1:]] (2 instructions, align : 1) +# Function "SolveCubic" has stale profile, since there is one jump in the +# profile (from bid=13 to bid=2) which is not in the CFG in the binary. The test +# verifies that the inference is able to match two blocks (bid=1 and bid=13) +# using "loose" hashes and then correctly propagate the counts. 
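To clarify the mechanism referenced above, here is a deliberately simplified, illustrative C++ sketch of the matching step; all names are hypothetical and this is not BOLT's actual stale-matching implementation, which additionally runs a flow solver over the CFG. Blocks are matched by an exact hash first, then by a weaker "loose" hash (e.g. opcodes only), and the matched counts seed the inference.

#include <cstdint>
#include <unordered_map>
#include <vector>

struct Block {
  uint64_t strictHash; // hash of opcodes and operands
  uint64_t looseHash;  // hash of opcodes only
  uint64_t count = 0;  // execution count to be filled in
};

struct ProfileBlock {
  uint64_t strictHash;
  uint64_t looseHash;
  uint64_t count;
};

// Match profile blocks to CFG blocks and seed their counts.
void matchAndSeed(std::vector<Block> &cfg,
                  const std::vector<ProfileBlock> &profile) {
  std::unordered_map<uint64_t, Block *> byStrict, byLoose;
  for (Block &b : cfg) {
    byStrict.emplace(b.strictHash, &b);
    byLoose.emplace(b.looseHash, &b);
  }
  for (const ProfileBlock &p : profile) {
    if (auto it = byStrict.find(p.strictHash); it != byStrict.end())
      it->second->count = p.count; // exact match
    else if (auto it2 = byLoose.find(p.looseHash); it2 != byLoose.end())
      it2->second->count = p.count; // "loose" match
    // Anything still unmatched is left to the flow-inference step, which
    // adjusts block/edge counts so flow conservation holds across the CFG.
  }
}

int main() { return 0; } // placeholder driver; real inference runs inside BOLT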
-# Check the overal inference stats. -CHECK: 2 out of 7 functions in the binary (28.6%) have non-empty execution profile -CHECK: inferred profile for 1 (50.00% of profiled, 100.00% of stale) functions responsible for 87.31% samples (640 out of 733) +CHECK2: pre-processing profile using YAML profile reader +CHECK2: Binary Function "SolveCubic" after building cfg { +CHECK2: State : CFG constructed +CHECK2: Address : 0x400e00 +CHECK2: Size : 0x368 +CHECK2: Section : .text +CHECK2: IsSimple : 1 +CHECK2: BB Count : 18 +CHECK2: Exec Count : 151 +CHECK2: Branch Count: 552 +# Verify block counts. +CHECK2: .LBB00 (43 instructions, align : 1) +CHECK2: Successors: .Ltmp[[#BB7:]] (mispreds: 0, count: 0), .LFT[[#BB1:]] (mispreds: 0, count: 151) +CHECK2: .LFT[[#BB1:]] (5 instructions, align : 1) +CHECK2: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 151), .LFT[[#BB2:]] (mispreds: 0, count: 0) +CHECK2: .Ltmp[[#BB3:]] (26 instructions, align : 1) +CHECK2: Successors: .Ltmp[[#BB5:]] (mispreds: 0, count: 151), .LFT[[#BB4:]] (mispreds: 0, count: 0) +CHECK2: .Ltmp[[#BB5:]] (9 instructions, align : 1) +CHECK2: .Ltmp[[#BB13:]] (12 instructions, align : 1) +CHECK2: Successors: .Ltmp[[#BB3:]] (mispreds: 0, count: 151) +CHECK2: 2 out of 7 functions in the binary (28.6%) have non-empty execution profile diff --git a/bolt/test/X86/tail-duplication-pass.s b/bolt/test/X86/tail-duplication-pass.s index 677f4986eb89021ceb23ce81144f0908be22869e..ed50cc5227d8557dc39a71db4a5b481c71f9971a 100644 --- a/bolt/test/X86/tail-duplication-pass.s +++ b/bolt/test/X86/tail-duplication-pass.s @@ -7,12 +7,21 @@ # RUN: llvm-bolt %t.exe --data %t.fdata --reorder-blocks=ext-tsp \ # RUN: --print-finalized --tail-duplication=moderate \ # RUN: --tail-duplication-minimum-offset=1 -o %t.out | FileCheck %s +# RUN: llvm-bolt %t.exe --data %t.fdata --print-finalized \ +# RUN: --tail-duplication=aggressive --tail-duplication-minimum-offset=1 \ +# RUN: -o %t.out | FileCheck %s --check-prefix CHECK-NOLOOP # FDATA: 1 main 2 1 main #.BB2# 0 10 # FDATA: 1 main 4 1 main #.BB2# 0 20 # CHECK: BOLT-INFO: tail duplication modified 1 ({{.*}}%) functions; duplicated 1 blocks (1 bytes) responsible for {{.*}} dynamic executions ({{.*}}% of all block executions) # CHECK: BB Layout : .LBB00, .Ltail-dup0, .Ltmp0, .Ltmp1 +# Check that the successor of Ltail-dup0 is .LBB00, not itself. +# CHECK-NOLOOP: .Ltail-dup0 (1 instructions, align : 1) +# CHECK-NOLOOP: Predecessors: .LBB00 +# CHECK-NOLOOP: retq +# CHECK-NOLOOP: .Ltmp0 (1 instructions, align : 1) + .text .globl main .type main, %function diff --git a/bolt/test/assume-abi.test b/bolt/test/assume-abi.test new file mode 100644 index 0000000000000000000000000000000000000000..688ab011441d3a428c3358d99d10856fbdc9d771 --- /dev/null +++ b/bolt/test/assume-abi.test @@ -0,0 +1,7 @@ +# Validate the usage of the `--assume-abi` option in conjunction with +# options related to the RegAnalysis Pass. + +REQUIRES: system-linux + +RUN: %clang %cflags %p/Inputs/hello.c -o %t -Wl,-q +RUN: llvm-bolt %t -o %t.bolt --assume-abi --indirect-call-promotion=all diff --git a/bolt/test/lsda.cpp b/bolt/test/lsda-section-name.cpp similarity index 89% rename from bolt/test/lsda.cpp rename to bolt/test/lsda-section-name.cpp index b7905a58b532daba5926142bf95413a0fee913fd..41fb17665821911bdf15080730f27b5333d06e0c 100644 --- a/bolt/test/lsda.cpp +++ b/bolt/test/lsda-section-name.cpp @@ -1,8 +1,8 @@ // This test check that LSDA section named by .gcc_except_table.main is // disassembled by BOLT. 
-// RUN: %clang++ %cxxflags -O3 -flto=thin -no-pie -c %s -o %t.o
-// RUN: %clang++ %cxxflags -flto=thin -no-pie -fuse-ld=lld %t.o -o %t.exe \
+// RUN: %clang++ %cxxflags -O3 -no-pie -c %s -o %t.o
+// RUN: %clang++ %cxxflags -no-pie -fuse-ld=lld %t.o -o %t.exe \
 // RUN:   -Wl,-q -Wl,--script=%S/Inputs/lsda.ldscript
 // RUN: llvm-readelf -SW %t.exe | FileCheck %s
 // RUN: llvm-bolt %t.exe -o %t.bolt
diff --git a/bolt/test/permission.test b/bolt/test/permission.test
new file mode 100644
index 0000000000000000000000000000000000000000..a5a98599eb83b40c7dac555e0258cde156641763
--- /dev/null
+++ b/bolt/test/permission.test
@@ -0,0 +1,13 @@
+# Ensure that the permissions of the optimized binary file comply with the
+# system's umask.
+
+# This test performs a logical AND operation on the results of the `stat -c %a
+# %t.bolt` and `umask` commands (both results are displayed in octal), and
+# checks whether the result is equal to 0.
+REQUIRES: system-linux
+
+RUN: %clang %cflags %p/Inputs/hello.c -o %t -Wl,-q
+RUN: llvm-bolt %t -o %t.bolt
+RUN: echo $(( 8#$(stat -c %a %t.bolt) & 8#$(umask) )) | FileCheck %s
+
+CHECK: 0
diff --git a/bolt/test/runtime/AArch64/BiSheng/hugify.c b/bolt/test/runtime/AArch64/BiSheng/hugify.c
new file mode 100644
index 0000000000000000000000000000000000000000..d40c1fe85e5ed26cf84cd50e2ef3a3651fbb101a
--- /dev/null
+++ b/bolt/test/runtime/AArch64/BiSheng/hugify.c
@@ -0,0 +1,27 @@
+// Make sure BOLT correctly processes --hugify option
+
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  printf("Hello world\n");
+  return 0;
+}
+
+/*
+REQUIRES: system-linux,bolt-runtime,enable_bspub_common
+
+RUN: %clang %cflags -no-pie %s -o %t.nopie.exe -Wl,-q
+RUN: %clang %cflags -fpic -pie %s -o %t.pie.exe -Wl,-q
+
+RUN: llvm-bolt %t.nopie.exe --lite=0 -o %t.nopie --hugify
+RUN: llvm-bolt %t.pie.exe --lite=0 -o %t.pie --hugify
+
+RUN: %t.nopie | FileCheck %s -check-prefix=CHECK-NOPIE
+
+CHECK-NOPIE: Hello world
+
+RUN: %t.pie | FileCheck %s -check-prefix=CHECK-PIE
+
+CHECK-PIE: Hello world
+
+*/
diff --git a/bolt/test/runtime/AArch64/adrrelaxationpass.s b/bolt/test/runtime/AArch64/adrrelaxationpass.s
index 5c50cd6371926cbe23175ded354dd7e719ef6338..fa9fb63c613dc1345fc7a81e14da721e91e42345 100644
--- a/bolt/test/runtime/AArch64/adrrelaxationpass.s
+++ b/bolt/test/runtime/AArch64/adrrelaxationpass.s
@@ -1,33 +1,27 @@
 # The second and third ADR instructions are non-local to functions
 # and must be replaced with ADRP + ADD by BOLT
-# Also since main is non-simple, we can't change it's length so we have to
-# replace NOP with adrp, and if there is no nop before adr in non-simple
+# Also since main and test are non-simple, we can't change their lengths so we
+# have to replace NOP with adrp, and if there is no nop before adr in a non-simple
 # function, we can't guarantee we didn't break possible jump tables, so we
-# fail in strict mode
+# fail in non-strict mode

 # REQUIRES: system-linux

 # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \
 # RUN:   %s -o %t.o
 # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
-# RUN: llvm-bolt %t.exe -o %t.bolt --adr-relaxation=true
+# RUN: llvm-bolt %t.exe -o %t.bolt --adr-relaxation=true --strict
 # RUN: llvm-objdump --no-print-imm-hex -d --disassemble-symbols=main %t.bolt | FileCheck %s
 # RUN: %t.bolt
-# RUN: not llvm-bolt %t.exe -o %t.bolt --adr-relaxation=true --strict \
+# RUN: not llvm-bolt %t.exe -o %t.bolt --adr-relaxation=true \
 # RUN:   2>&1 | FileCheck %s --check-prefix CHECK-ERROR

-  .data
-  .align 8
-  .global Gvar
-Gvar: .xword 0x0
-  .global Gvar2
-Gvar2: .xword 0x42
-
   .text
   .align 4
   .global test
   .type test, %function
 test:
+  adr x2, Gvar
   mov x0, xzr
   ret
   .size test, .-test
@@ -47,6 +41,17 @@ br:
 .CI:
   .word 0xff

+  .data
+  .align 8
+  .global Gvar
+Gvar: .xword 0x0
+  .global Gvar2
+Gvar2: .xword 0x42
+  .balign 4
+jmptable:
+  .word 0
+  .word test - jmptable
+
 # CHECK: <main>:
 # CHECK-NEXT: adr x0, 0x{{[1-8a-f][0-9a-f]*}}
 # CHECK-NEXT: adrp x1, 0x{{[1-8a-f][0-9a-f]*}}
@@ -54,4 +59,4 @@ br:
 # CHECK-NEXT: adrp x2, 0x{{[1-8a-f][0-9a-f]*}}
 # CHECK-NEXT: add x2, x2, #{{[1-8a-f][0-9a-f]*}}
 # CHECK-NEXT: adr x3, 0x{{[1-8a-f][0-9a-f]*}}
-# CHECK-ERROR: BOLT-ERROR: Cannot relax adr in non-simple function main
+# CHECK-ERROR: BOLT-ERROR: Cannot relax adr in non-simple function
diff --git a/bolt/test/runtime/AArch64/controlflow.s b/bolt/test/runtime/AArch64/controlflow.s
index fe9aab88f0c74047633ea5b9ee7e279d85ef8a00..7b0a38779f6e9cfb90027ad9adcd84a76568f4c8 100644
--- a/bolt/test/runtime/AArch64/controlflow.s
+++ b/bolt/test/runtime/AArch64/controlflow.s
@@ -48,6 +48,7 @@ test_cond_branch:
 .global test_branch_reg
 .type test_branch_reg, %function
 test_branch_reg:
+    nop
     adr x0, test_branch_zero
     br x0
     panic
@@ -97,6 +98,7 @@ test_call:
 .global test_call_reg
 .type test_call_reg, %function
 test_call_reg:
+    nop
     adr x0, test_call_foo
     blr x0
     panic
diff --git a/bolt/test/runtime/AArch64/hook-fini.test b/bolt/test/runtime/AArch64/hook-fini.test
new file mode 100644
index 0000000000000000000000000000000000000000..8d23b21b6d612f5556608848237edc2c1af6f4de
--- /dev/null
+++ b/bolt/test/runtime/AArch64/hook-fini.test
@@ -0,0 +1,61 @@
+# Test the different ways of hooking the fini function for instrumentation (via
+# DT_FINI and via DT_FINI_ARRAY). We test the latter for both PIE and non-PIE
+# binaries because of the different ways of handling relocations (static or
+# dynamic).
+# All tests perform the following steps:
+# - Compile and link for the case to be tested
+# - Some sanity-checks on the dynamic section and relocations in the binary to
+#   verify it has the shape we want for testing:
+#   - DT_FINI or DT_FINI_ARRAY in dynamic section
+#   - No relative relocations for non-PIE
+# - Instrument
+# - Run instrumented binary
+# - Verify generated profile
+REQUIRES: system-linux,bolt-runtime
+
+RUN: %clang %cflags -pie %p/Inputs/basic-instrumentation.s -Wl,-q -o %t.exe
+RUN: llvm-readelf -d %t.exe | FileCheck --check-prefix=DYN-FINI %s
+RUN: llvm-readelf -r %t.exe | FileCheck --check-prefix=RELOC-PIE %s
+RUN: llvm-bolt %t.exe -o %t --instrument \
+RUN:   --instrumentation-file=%t \
+RUN:   --instrumentation-file-append-pid
+RUN: rm -f %t.*.fdata
+RUN: %t
+RUN: cat %t.*.fdata | FileCheck %s
+
+RUN: %clang %cflags -pie %p/Inputs/basic-instrumentation.s -Wl,-q,-fini=0 -o %t-no-fini.exe
+RUN: llvm-readelf -d %t-no-fini.exe | FileCheck --check-prefix=DYN-NO-FINI %s
+RUN: llvm-readelf -r %t-no-fini.exe | FileCheck --check-prefix=RELOC-PIE %s
+RUN: llvm-bolt %t-no-fini.exe -o %t-no-fini --instrument \
+RUN:   --instrumentation-file=%t-no-fini \
+RUN:   --instrumentation-file-append-pid
+RUN: rm -f %t-no-fini.*.fdata
+RUN: %t-no-fini
+RUN: cat %t-no-fini.*.fdata | FileCheck %s
+
+RUN: %clang %cflags -no-pie %p/Inputs/basic-instrumentation.s -Wl,-q,-fini=0 -o %t-no-pie-no-fini.exe
+RUN: llvm-readelf -d %t-no-pie-no-fini.exe | FileCheck --check-prefix=DYN-NO-FINI %s
+RUN: llvm-readelf -r %t-no-pie-no-fini.exe | FileCheck --check-prefix=RELOC-NO-PIE %s
+RUN: llvm-bolt %t-no-pie-no-fini.exe -o %t-no-pie-no-fini --instrument \
+RUN:   --instrumentation-file=%t-no-pie-no-fini \
+RUN:   --instrumentation-file-append-pid
+RUN: rm -f %t-no-pie-no-fini.*.fdata
+RUN: %t-no-pie-no-fini
+RUN: cat %t-no-pie-no-fini.*.fdata | FileCheck %s
+
+# With fini: dynamic section should contain DT_FINI
+DYN-FINI: (FINI)
+
+# Without fini: dynamic section should only contain DT_FINI_ARRAY
+DYN-NO-FINI-NOT: (FINI)
+DYN-NO-FINI:
(FINI_ARRAY) +DYN-NO-FINI: (FINI_ARRAYSZ) + +# With PIE: binary should have relative relocations +RELOC-PIE: R_AARCH64_RELATIVE + +# Without PIE: binary should not have relative relocations +RELOC-NO-PIE-NOT: R_AARCH64_RELATIVE + +# The instrumented profile should at least say main was called once +CHECK: main 0 0 1{{$}} diff --git a/bolt/test/runtime/X86/Inputs/exceptions_split.cpp b/bolt/test/runtime/Inputs/exceptions_split.cpp similarity index 85% rename from bolt/test/runtime/X86/Inputs/exceptions_split.cpp rename to bolt/test/runtime/Inputs/exceptions_split.cpp index 2c136b9a1cf5c958d175df17e629504357ca1287..de81adf7583ca3da8b8255e45809f4f7d2ad899f 100644 --- a/bolt/test/runtime/X86/Inputs/exceptions_split.cpp +++ b/bolt/test/runtime/Inputs/exceptions_split.cpp @@ -3,31 +3,25 @@ // // Record performance data with no args. Run test with 2 args. -#include #include +#include -int foo() -{ - return 0; -} +int foo() { return 0; } void bar(int a) { if (a > 2 && a % 2) throw new int(); } -void filter_only(){ - foo(); -} +void filter_only() { foo(); } -int main(int argc, char **argv) -{ +int main(int argc, char **argv) { unsigned r = 0; uint64_t limit = (argc >= 2 ? 10 : 5000); for (uint64_t i = 0; i < limit; ++i) { i += foo(); - try { + try { bar(argc); try { if (argc >= 2) diff --git a/bolt/test/X86/asm-dump.c b/bolt/test/runtime/X86/asm-dump.c similarity index 100% rename from bolt/test/X86/asm-dump.c rename to bolt/test/runtime/X86/asm-dump.c diff --git a/bolt/test/runtime/X86/exceptions-lpstart-zero.s b/bolt/test/runtime/X86/exceptions-lpstart-zero.s new file mode 100644 index 0000000000000000000000000000000000000000..b487ff0fa2f59114705657d5c262d9a78d94faad --- /dev/null +++ b/bolt/test/runtime/X86/exceptions-lpstart-zero.s @@ -0,0 +1,91 @@ +# RUN: %clangxx %cflags -no-pie %s -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.exe.bolt +# RUN: %t.exe.bolt + +# REQUIRES: system-linux + +## Test that BOLT properly handles LPStart when LPStartEncoding is different +## from DW_EH_PE_omit. + +# The test case compiled with -O1 from: +# +# int main() { +# try { +# throw 42; +# } catch (...) { +# return 0; +# } +# return 1; +# } +# +# The exception table was modified with udata4 LPStartEncoding and sdata4 +# CallSiteEncoding. 
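For readers unfamiliar with these encodings, a minimal illustrative decoder of the LSDA prefix follows (a sketch only, using the DWARF EH-pointer encoding values udata4 = 0x03, sdata4 = 0x0b, omit = 0xff that this test exercises; it is not BOLT's parser):

#include <cstdint>
#include <cstring>
#include <optional>

// DWARF exception-handling pointer encodings used by this test.
constexpr uint8_t DW_EH_PE_udata4 = 0x03;
constexpr uint8_t DW_EH_PE_sdata4 = 0x0b;
constexpr uint8_t DW_EH_PE_omit = 0xff;

// Read the start of an LSDA: the LPStart encoding byte, followed by the
// LPStart value itself when the encoding is not DW_EH_PE_omit.
std::optional<uint64_t> readLPStart(const uint8_t *&p) {
  uint8_t encoding = *p++;
  if (encoding == DW_EH_PE_omit)
    return std::nullopt; // landing pads are relative to the function start
  if (encoding == DW_EH_PE_udata4) {
    uint32_t v;
    std::memcpy(&v, p, sizeof(v));
    p += sizeof(v);
    return v; // landing pads are relative to this explicit LPStart
  }
  // Other encodings (uleb128, sdata4, ...) are omitted in this sketch.
  return std::nullopt;
}

int main() {
  // Mirrors the table in this test: LPStart encoding = udata4, LPStart = 0.
  const uint8_t lsda[] = {DW_EH_PE_udata4, 0, 0, 0, 0};
  const uint8_t *p = lsda;
  auto lpStart = readLPStart(p);
  return lpStart && *lpStart == 0 ? 0 : 1;
}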
+
+  .text
+  .globl main                            # -- Begin function main
+  .p2align 4, 0x90
+  .type main,@function
+main:                                   # @main
+.Lfunc_begin0:
+  .cfi_startproc
+  .cfi_personality 3, __gxx_personality_v0
+  .cfi_lsda 3, .Lexception0
+# %bb.0:
+  pushq %rax
+  .cfi_def_cfa_offset 16
+  movl $4, %edi
+  callq __cxa_allocate_exception
+  movl $42, (%rax)
+.Ltmp0:
+  movl $_ZTIi, %esi
+  movq %rax, %rdi
+  xorl %edx, %edx
+  callq __cxa_throw
+.Ltmp1:
+# %bb.1:
+.LBB0_2:
+.Ltmp2:
+  movq %rax, %rdi
+  callq __cxa_begin_catch
+  callq __cxa_end_catch
+  xorl %eax, %eax
+  popq %rcx
+  .cfi_def_cfa_offset 8
+  retq
+.Lfunc_end0:
+  .size main, .Lfunc_end0-main
+  .cfi_endproc
+  .section .gcc_except_table,"a",@progbits
+  .p2align 2
+GCC_except_table0:
+.Lexception0:
+  .byte 3                               # @LPStart Encoding = udata4
+  .long 0
+  .byte 3                               # @TType Encoding = udata4
+  .uleb128 .Lttbase0-.Lttbaseref0
+.Lttbaseref0:
+  .byte 11                              # Call site Encoding = sdata4
+  .uleb128 .Lcst_end0-.Lcst_begin0
+.Lcst_begin0:
+  .long .Lfunc_begin0-.Lfunc_begin0     # >> Call Site 1 <<
+  .long .Ltmp0-.Lfunc_begin0            #   Call between .Lfunc_begin0 and .Ltmp0
+  .long 0                               #     has no landing pad
+  .byte 0                               #   On action: cleanup
+  .long .Ltmp0-.Lfunc_begin0            # >> Call Site 2 <<
+  .long .Ltmp1-.Ltmp0                   #   Call between .Ltmp0 and .Ltmp1
+  .long .Ltmp2                          #     jumps to .Ltmp2
+  .byte 1                               #   On action: 1
+  .long .Ltmp1-.Lfunc_begin0            # >> Call Site 3 <<
+  .long .Lfunc_end0-.Ltmp1              #   Call between .Ltmp1 and .Lfunc_end0
+  .long 0                               #     has no landing pad
+  .byte 0                               #   On action: cleanup
+.Lcst_end0:
+  .byte 1                               # >> Action Record 1 <<
+                                        #   Catch TypeInfo 1
+  .byte 0                               #   No further actions
+  .p2align 2
+                                        # >> Catch TypeInfos <<
+  .long 0                               # TypeInfo 1
+.Lttbase0:
+  .p2align 2
+                                        # -- End function
diff --git a/bolt/test/runtime/X86/instrumentation-tail-call.s b/bolt/test/runtime/X86/instrumentation-tail-call.s
index 792d084e3f3d44b77bdbfce3e6a5260d0d15e4f3..dfb12f03401a3ab27d8bd6a55bc1c1416baa5e1e 100644
--- a/bolt/test/runtime/X86/instrumentation-tail-call.s
+++ b/bolt/test/runtime/X86/instrumentation-tail-call.s
@@ -14,6 +14,9 @@
 # CHECK: leaq 0x80(%rsp), %rsp

+# RUN: FileCheck %s --input-file %t.fdata --check-prefix=CHECK-FDATA
+# CHECK-FDATA: 1 main {{.*}} 1 targetFunc 0 0 1
+
   .text
   .globl main
   .type main, %function
@@ -32,7 +35,8 @@ main:
   movq %rbp, %rsp
   pop %rbp
   mov -0x10(%rsp),%rax
-  jmp targetFunc
+  test %rsp, %rsp
+  jne targetFunc

 .LBBerror:
   addq $0x20, %rsp
diff --git a/bolt/test/runtime/X86/reg-reassign-no-swap-bl.s b/bolt/test/runtime/X86/reg-reassign-no-swap-bl.s
new file mode 100644
index 0000000000000000000000000000000000000000..4e2e70ed6cba9e43cdf23671fe3e53398d2237c7
--- /dev/null
+++ b/bolt/test/runtime/X86/reg-reassign-no-swap-bl.s
@@ -0,0 +1,59 @@
+# This test case is used to reproduce an issue found in the mongod database.
+# In function rankRegisters, if there is a BH Reg in the basic block, then the
+# BL Reg also cannot be swapped.
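The constraint can be summarized with a tiny illustrative check (hypothetical names only; this is not BOLT's rankRegisters code): x86-64 high-byte registers such as BH exist only for the four legacy GPRs, so any use of BH forbids swapping RBX/BL with an extended register like R12, which has no high-byte alias.

#include <initializer_list>

// Hypothetical register ids for the sketch.
enum class Reg { RBX, BL, BH, R12, R12B /* R12 has no high-byte alias */ };

// Returns true when swapping the RBX register class with an extended register
// is legal: any use of the high-byte alias BH makes the swap illegal, because
// extended registers (R8-R15) have no AH/BH/CH/DH-style sub-register.
bool canSwapRbxWith(Reg candidate, std::initializer_list<Reg> usedSubRegs) {
  for (Reg r : usedSubRegs)
    if (r == Reg::BH)
      return false; // BH has no counterpart in the candidate register
  (void)candidate;
  return true;
}

int main() {
  // Mirrors this test: main.cold writes BH, so RBX <-> R12 must be rejected.
  bool ok = canSwapRbxWith(Reg::R12, {Reg::BL, Reg::BH});
  return ok ? 1 : 0; // expect the swap to be rejected (ok == false)
}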
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang -no-pie %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.out -data=%t.fdata --reg-reassign | FileCheck %s
+# RUN: %t.out
+
+# CHECK: Reg Reassignment Pass: no changes were made
+  .text
+  .globl main
+  .globl main.cold
+  .p2align 4, 0x90
+  .type main,@function
+  .type main.cold,@function
+main.cold:
+bb1:
+  mov $0x2, %bh
+bb2:
+  jmp bb5
+main:                                   # @main
+  .cfi_startproc
+# %bb.0:                                # %entry
+  pushq %rax
+  pushq %r12
+  pushq %rbx
+  .cfi_def_cfa_offset 16
+  mov $0x1, %r12
+  shr $0x14, %r12
+  add $0x14, %r12
+  mov $0x11, %rbx
+  mov $0x1, %bh
+  mov $0x1, %bl
+bb3:
+  add $0x1, %r12
+bb4:
+  jmp bb1
+bb5:
+  cmp $0x201, %rbx
+  jne 0x0
+bb6:
+  xorl %eax, %eax
+  popq %rcx
+  popq %rbx
+  popq %r12
+  .cfi_def_cfa_offset 8
+  retq
+# FDATA: 1 main.cold #bb2# 1 main 0 0 100
+# FDATA: 1 main #bb3# 1 main #bb4# 0 100
+# FDATA: 1 main #bb4# 1 main.cold 0 0 100
+
+.Lfunc_end0:
+  .size main, .Lfunc_end0-main
+  .cfi_endproc
diff --git a/bolt/test/runtime/X86/reg-reassign-swap-cold.s b/bolt/test/runtime/X86/reg-reassign-swap-cold.s
new file mode 100644
index 0000000000000000000000000000000000000000..115b5b0eeff8b504f31e2e629dca7eea870ba064
--- /dev/null
+++ b/bolt/test/runtime/X86/reg-reassign-swap-cold.s
@@ -0,0 +1,64 @@
+# This test case reproduces a bug where the code fragments associated with a
+# function (which may be generated during PGO optimization) need to be swapped
+# together during register swapping. If this is not handled properly, running
+# the optimized binary can result in a segmentation fault.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang -no-pie %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.out -data=%t.fdata --reg-reassign | FileCheck %s
+# RUN: %t.out
+
+# CHECK: BOLT-INFO: Reg Reassignment Pass Stats
+# CHECK-NEXT: 2 functions affected.
+  .text
+  .globl main
+  .globl main.cold
+  .p2align 4, 0x90
+  .type main,@function
+  .type main.cold,@function
+main.cold:
+bb1:
+  cmp $0x3, %r12
+  jne bb8
+bb2:
+  jmp bb4
+main:                                   # @main
+  .cfi_startproc
+# %bb.0:                                # %entry
+  pushq %rax
+  pushq %r12
+  pushq %rbx
+  .cfi_def_cfa_offset 16
+  mov $0x1, %r12
+  mov $0x2, %rbx
+  add $0x1, %r12
+  shr $0x14, %r12
+  mov $0x3, %r12
+bb3:
+  jmp bb1
+bb4:
+  cmp $0x3, %r12
+bb5:
+  jne bb8
+bb6:
+  xorl %eax, %eax
+bb7:
+  popq %rcx
+  popq %rbx
+  popq %r12
+  .cfi_def_cfa_offset 8
+  retq
+bb8:
+  mov $0x1, %rax
+  jmp bb7
+# FDATA: 1 main.cold #bb2# 1 main #bb4# 0 100
+# FDATA: 1 main #bb5# 1 main #bb6# 0 100
+# FDATA: 1 main #bb3# 1 main.cold 0 0 100
+
+.Lfunc_end0:
+  .size main, .Lfunc_end0-main
+  .cfi_endproc
diff --git a/bolt/test/runtime/X86/retpoline-synthetic.test b/bolt/test/runtime/X86/retpoline-synthetic.test
index 394d0189207fbd7d2de8fbe937b3b389da674979..3434d8c31869e257228f14c8b85c0c2ad5a336dc 100644
--- a/bolt/test/runtime/X86/retpoline-synthetic.test
+++ b/bolt/test/runtime/X86/retpoline-synthetic.test
@@ -23,8 +23,8 @@ CHECK-JUMP-NOT: jmpq *
 # Check generated retpoline stub names
 RUN: llvm-strings %t | FileCheck %s -check-prefix=CHECK-STRINGS
 CHECK-STRINGS-DAG: __retpoline_%rax_
-CHECK-STRINGS-DAG: __retpoline_mem_%rip+DATAat0x[[#]]
-CHECK-STRINGS-DAG: __retpoline_mem_%rax+0
+CHECK-STRINGS-DAG: __retpoline_mem_%r{{.*}}
+CHECK-STRINGS-DAG: __retpoline_mem_%r{{.*}}

 RUN: %t 1000 3 | FileCheck %s
 CHECK: 30000000
diff --git a/bolt/test/runtime/X86/section-order.test b/bolt/test/runtime/X86/section-order.test
index a1317daba50e8d25265da1f251e68b33285ab626..12d5949fcd0d99366283d60a34aac84e7e39f859 100644
--- a/bolt/test/runtime/X86/section-order.test
+++ b/bolt/test/runtime/X86/section-order.test
@@ -1,4 +1,5 @@
 REQUIRES: system-linux,bolt-runtime
+REQUIRES: issues703

 RUN: %clang %p/Inputs/basic-instrumentation.s -Wl,-q -o %t.exe
 RUN: llvm-bolt %t.exe -o %t --instrument
diff --git a/bolt/test/runtime/X86/exceptions-instrumentation.test b/bolt/test/runtime/exceptions-instrumentation.test
similarity index 95%
rename from bolt/test/runtime/X86/exceptions-instrumentation.test
rename to bolt/test/runtime/exceptions-instrumentation.test
index 7a8f4ee81e4fc5cd8b7270d40b6209253ea831e2..4b8b3bee1fdb62deb80d273a9fd2c7e8cdd882af 100644
--- a/bolt/test/runtime/X86/exceptions-instrumentation.test
+++ b/bolt/test/runtime/exceptions-instrumentation.test
@@ -9,7 +9,7 @@ RUN: %t.exc arg1 arg2 arg3

 RUN: llvm-bolt %t_exc_split -o %t.exc.bolted --data %t.fdata \
 RUN:   --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \
-RUN:   --split-functions --split-eh=1 \
+RUN:   --split-functions --split-eh=1 2>&1 \
 RUN:   | FileCheck --check-prefix=EXCEPTIONS %s

 EXCEPTIONS-NOT: invalid (possibly stale) profile
diff --git a/bolt/test/runtime/iplt.c b/bolt/test/runtime/iplt.c
index b0e2e6d250700c9a90af842b3954a98338844b91..d5b56d901e6227d90bfb210683bd3883145d5210 100644
--- a/bolt/test/runtime/iplt.c
+++ b/bolt/test/runtime/iplt.c
@@ -1,10 +1,16 @@
 // This test checks that the ifuncs works after bolt.
+// Compiling with -O0 results in an indirect call to the IFUNC.
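For context, a generic GNU IFUNC example follows (illustrative only; it assumes a Linux/ELF toolchain with GCC or Clang and is not this test's input source). At -O0 the compiler does not fold the call, so it is dispatched indirectly through the IFUNC's PLT entry, which the dynamic loader patches with the resolver's answer.

#include <cstdio>

// The concrete implementation selected by the resolver.
extern "C" void foo_impl() { puts("foo"); }

// IFUNC resolver: runs during relocation processing and returns the address
// of the implementation to use.
extern "C" void *foo_resolver() { return reinterpret_cast<void *>(foo_impl); }

// 'foo' is a GNU indirect function bound to the resolver above.
extern "C" void foo() __attribute__((ifunc("foo_resolver")));

int main() {
  foo(); // at -O0 this stays an indirect call through the IPLT
  return 0;
}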
-// RUN: %clang %cflags -no-pie %s -fuse-ld=lld \
+// RUN: %clang %cflags -O0 -no-pie %s -fuse-ld=lld \
 // RUN:   -o %t.exe -Wl,-q
 // RUN: llvm-bolt %t.exe -o %t.bolt.exe --use-old-text=0 --lite=0
 // RUN: %t.bolt.exe | FileCheck %s

+// RUN: %clang %cflags -O3 -no-pie %s -fuse-ld=lld \
+// RUN:   -o %t.O3.exe -Wl,-q
+// RUN: llvm-bolt %t.O3.exe -o %t.O3.bolt.exe --use-old-text=0 --lite=0
+// RUN: %t.O3.bolt.exe | FileCheck %s
+
 // CHECK: foo

 #include <stdio.h>
diff --git a/bolt/test/runtime/mark-funcs.c b/bolt/test/runtime/mark-funcs.c
new file mode 100644
index 0000000000000000000000000000000000000000..a8586ca8b6e1dfd32970967d13df660a928cd357
--- /dev/null
+++ b/bolt/test/runtime/mark-funcs.c
@@ -0,0 +1,22 @@
+#include <stdio.h>
+
+int dummy() {
+  printf("Dummy called\n");
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  if (dummy() != 0)
+    return 1;
+  printf("Main called\n");
+  return 0;
+}
+// Check that emitting a trap value works properly and
+// does not break functions.
+// REQUIRES: system-linux
+// RUN: %clangxx -Wl,-q %s -o %t.exe
+// RUN: %t.exe | FileCheck %s
+// CHECK: Dummy called
+// CHECK-NEXT: Main called
+// RUN: llvm-bolt %t.exe -o %t.exe.bolt -lite=false --mark-funcs
+// RUN: %t.exe.bolt | FileCheck %s
diff --git a/bolt/test/runtime/meta-merge-fdata.test b/bolt/test/runtime/meta-merge-fdata.test
index 39f34ba3d8ac06d124b945a7ebf3cea8c26ce27d..6972e75c64de7b2b72957671a3f1a09e92595c84 100644
--- a/bolt/test/runtime/meta-merge-fdata.test
+++ b/bolt/test/runtime/meta-merge-fdata.test
@@ -1,7 +1,7 @@
 # Meta test using merge-fdata binary
 UNSUPPORTED: asan
 # Instrumentation currently only works on X86
-REQUIRES: bolt-runtime
+REQUIRES: x86_64-linux,bolt-runtime

 # Instrumentation, should test:
 # - Direct branches
@@ -22,7 +22,7 @@ CHECK-FDATA: 0 [unknown] 0 1 _start 0 0 1

 # Check that BOLT works with this profile
 RUN: llvm-bolt merge-fdata -o %t.bolt --data %t.fdata1 \
 RUN:   --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \
-RUN:   --split-functions \
+RUN:   --split-functions 2>&1 \
 RUN:   | FileCheck %s --check-prefix=CHECK-BOLT1

 CHECK-BOLT1-NOT: invalid (possibly stale) profile
@@ -44,7 +44,7 @@ RUN: cmp %t.fdata.base %t.fdata.inst
 # Optimize using merged fdata
 RUN: llvm-bolt merge-fdata -o %t.opt --data %t.fdata.base \
 RUN:   --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \
-RUN:   --split-functions \
+RUN:   --split-functions 2>&1 \
 RUN:   | FileCheck %s --check-prefix=CHECK-BOLT2

 CHECK-BOLT2-NOT: invalid (possibly stale) profile
diff --git a/bolt/test/runtime/X86/pie-exceptions-split.test b/bolt/test/runtime/pie-exceptions-split.test
similarity index 95%
rename from bolt/test/runtime/X86/pie-exceptions-split.test
rename to bolt/test/runtime/pie-exceptions-split.test
index 124fef60fd2dadf8d5dd2c23eb0c907de429eff0..30f2d02bc9e10b68c01ec9dd79d1f342683e3822 100644
--- a/bolt/test/runtime/X86/pie-exceptions-split.test
+++ b/bolt/test/runtime/pie-exceptions-split.test
@@ -16,9 +16,9 @@ RUN:   --print-only=main 2>&1 | FileCheck %s

 ## All calls to printf() should be from exception handling code that was
 ## recorded as cold during the profile collection run. Check that the calls
 ## are placed after the split point.
-CHECK-NOT: callq printf
+CHECK-NOT: printf
 CHECK: HOT-COLD SPLIT POINT
-CHECK: callq printf
+CHECK: printf

 ## Verify the output still executes correctly when the exception path is being
 ## taken.
diff --git a/bolt/test/verify-cfg.test b/bolt/test/verify-cfg.test
new file mode 100644
index 0000000000000000000000000000000000000000..4a7de85cd427ab5892abb63607e089023bf7309b
--- /dev/null
+++ b/bolt/test/verify-cfg.test
@@ -0,0 +1,8 @@
+# Verify that the `--verify-cfg` option does not produce spurious alerts.
+
+REQUIRES: system-linux
+
+RUN: %clang %cflags %p/Inputs/hello.c -o %t -Wl,-q
+RUN: llvm-bolt %t -o %t.bolt --verify-cfg 2>&1 | FileCheck %s
+
+CHECK-NOT: BOLT-ERROR: Invalid CFG detected after pass {{.*}}
diff --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt
index 048c3e54ca4482a3c60624e82305a094e0e8ed82..6fde9fe962a304b86bd7e47e20d91cb402445705 100644
--- a/lld/ELF/CMakeLists.txt
+++ b/lld/ELF/CMakeLists.txt
@@ -74,6 +74,7 @@ add_lld_library(lldELF
     Passes
     Support
     TargetParser
+    TransformUtils

   LINK_LIBS
     lldCommon
diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp
index ff72731b1f38d65a6896b109b62e72b20aea94fc..5e36964da94fc52328f66d978a65ee6d18a1e0f8 100644
--- a/lld/ELF/CallGraphSort.cpp
+++ b/lld/ELF/CallGraphSort.cpp
@@ -6,38 +6,21 @@
 //
 //===----------------------------------------------------------------------===//
 ///
-/// Implementation of Call-Chain Clustering from: Optimizing Function Placement
-/// for Large-Scale Data-Center Applications
-/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
-///
-/// The goal of this algorithm is to improve runtime performance of the final
-/// executable by arranging code sections such that page table and i-cache
-/// misses are minimized.
-///
-/// Definitions:
-/// * Cluster
-///   * An ordered list of input sections which are laid out as a unit. At the
-///     beginning of the algorithm each input section has its own cluster and
-///     the weight of the cluster is the sum of the weight of all incoming
-///     edges.
-/// * Call-Chain Clustering (C³) Heuristic
-///   * Defines when and how clusters are combined. Pick the highest weighted
-///     input section then add it to its most likely predecessor if it wouldn't
-///     penalize it too much.
-/// * Density
-///   * The weight of the cluster divided by the size of the cluster. This is a
-///     proxy for the amount of execution time spent per byte of the cluster.
-///
-/// It does so given a call graph profile by the following:
-/// * Build a weighted call graph from the call graph profile
-/// * Sort input sections by weight
-/// * For each input section starting with the highest weight
-///   * Find its most likely predecessor cluster
-///   * Check if the combined cluster would be too large, or would have too low
-///     a density.
-///   * If not, then combine the clusters.
-/// * Sort non-empty clusters by density
+/// This file is responsible for sorting sections using LLVM call graph profile
+/// data by placing frequently executed code sections together. The goal of the
+/// placement is to improve the runtime performance of the final executable by
+/// arranging code sections so that i-TLB misses and i-cache misses are reduced.
 ///
+/// The algorithm first builds a call graph based on the profile data and then
+/// iteratively merges "chains" (ordered lists) of input sections which will be
+/// laid out as a unit.
+/// There are two implementations for deciding how to merge a pair of chains:
+/// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows
+///   "Optimizing Function Placement for Large-Scale Data-Center Applications"
+///   https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
+/// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which
+///   typically produces layouts with higher locality, and hence, yields fewer
+///   instruction cache misses on large binaries.
 //===----------------------------------------------------------------------===//

 #include "CallGraphSort.h"
@@ -45,6 +28,7 @@
 #include "InputSection.h"
 #include "Symbols.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Transforms/Utils/CodeLayout.h"
 #include <numeric>

@@ -75,6 +59,33 @@ struct Cluster {
   Edge bestPred = {-1, 0};
 };

+/// Implementation of the Call-Chain Clustering (C^3). The goal of this
+/// algorithm is to improve runtime performance of the executable by arranging
+/// code sections such that page table and i-cache misses are minimized.
+///
+/// Definitions:
+/// * Cluster
+///   * An ordered list of input sections which are laid out as a unit. At the
+///     beginning of the algorithm each input section has its own cluster and
+///     the weight of the cluster is the sum of the weight of all incoming
+///     edges.
+/// * Call-Chain Clustering (C³) Heuristic
+///   * Defines when and how clusters are combined. Pick the highest weighted
+///     input section then add it to its most likely predecessor if it wouldn't
+///     penalize it too much.
+/// * Density
+///   * The weight of the cluster divided by the size of the cluster. This is a
+///     proxy for the amount of execution time spent per byte of the cluster.
+///
+/// It does so given a call graph profile by the following:
+/// * Build a weighted call graph from the call graph profile
+/// * Sort input sections by weight
+/// * For each input section starting with the highest weight
+///   * Find its most likely predecessor cluster
+///   * Check if the combined cluster would be too large, or would have too low
+///     a density.
+///   * If not, then combine the clusters.
+/// * Sort non-empty clusters by density
 class CallGraphSort {
 public:
   CallGraphSort();
@@ -260,11 +271,74 @@ DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
   return orderMap;
 }

+// Sort sections by the profile data using the Cache-Directed Sort algorithm.
+// The placement is done by optimizing the locality by co-locating frequently
+// executed code sections together.
+DenseMap<const InputSectionBase *, int> elf::computeCacheDirectedSortOrder() {
+  SmallVector<uint64_t, 0> funcSizes;
+  SmallVector<uint64_t, 0> funcCounts;
+  SmallVector<codelayout::EdgeCount, 0> callCounts;
+  SmallVector<uint64_t, 0> callOffsets;
+  SmallVector<const InputSectionBase *, 0> sections;
+  DenseMap<const InputSectionBase *, size_t> secToTargetId;
+
+  auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t {
+    auto res = secToTargetId.try_emplace(inSec, sections.size());
+    if (res.second) {
+      // inSec does not appear before in the graph.
+      sections.push_back(inSec);
+      assert(inSec->getSize() > 0 && "found a function with zero size");
+      funcSizes.push_back(inSec->getSize());
+      funcCounts.push_back(0);
+    }
+    return res.first->second;
+  };
+
+  // Create the graph.
+  for (std::pair<SectionPair, uint64_t> &c : config->callGraphProfile) {
+    const InputSectionBase *fromSB = cast<InputSectionBase>(c.first.first);
+    const InputSectionBase *toSB = cast<InputSectionBase>(c.first.second);
+    // Ignore edges between input sections belonging to different output
+    // sections.
+    if (fromSB->getOutputSection() != toSB->getOutputSection())
+      continue;
+
+    uint64_t weight = c.second;
+    // Ignore edges with zero weight.
+    if (weight == 0)
+      continue;
+
+    size_t from = getOrCreateNode(fromSB);
+    size_t to = getOrCreateNode(toSB);
+    // Ignore self-edges (recursive calls).
+    if (from == to)
+      continue;
+
+    callCounts.push_back({from, to, weight});
+    // Assume that the jump is at the middle of the input section. The profile
+    // data does not contain jump offsets.
+    callOffsets.push_back((funcSizes[from] + 1) / 2);
+    funcCounts[to] += weight;
+  }
+
+  // Run the layout algorithm.
+  std::vector<uint64_t> sortedSections = codelayout::computeCacheDirectedLayout(
+      funcSizes, funcCounts, callCounts, callOffsets);
+
+  // Create the final order.
+  DenseMap<const InputSectionBase *, int> orderMap;
+  int curOrder = 1;
+  for (uint64_t secIdx : sortedSections)
+    orderMap[sections[secIdx]] = curOrder++;
+
+  return orderMap;
+}
+
 // Sort sections by the profile data provided by --callgraph-profile-file.
 //
 // This first builds a call graph based on the profile data then merges sections
-// according to the C³ heuristic. All clusters are then sorted by a density
-// metric to further improve locality.
+// according to either the C³ or the Cache-Directed-Sort ordering algorithm.
 DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
+  if (config->callGraphProfileSort == CGProfileSortKind::Cdsort)
+    return computeCacheDirectedSortOrder();
   return CallGraphSort().run();
 }
diff --git a/lld/ELF/CallGraphSort.h b/lld/ELF/CallGraphSort.h
index 4997cb102c326402480c3c418e0b34a2f652bba0..1b54f2b62482284bb2d02581dc7481b367ff1760 100644
--- a/lld/ELF/CallGraphSort.h
+++ b/lld/ELF/CallGraphSort.h
@@ -14,6 +14,8 @@ namespace lld::elf {
 class InputSectionBase;

+llvm::DenseMap<const InputSectionBase *, int> computeCacheDirectedSortOrder();
+
 llvm::DenseMap<const InputSectionBase *, int> computeCallGraphProfileOrder();

 } // namespace lld::elf
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 6d0bdeb7bf938a83eb5c05beff2e0a5d126a8012..aa9f5456a7544f05503118857ae88342f84fae61 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -59,6 +59,9 @@ enum class BsymbolicKind { None, NonWeakFunctions, Functions, All };
 // For --build-id.
 enum class BuildIdKind { None, Fast, Md5, Sha1, Hexstring, Uuid };

+// For --call-graph-profile-sort={none,hfsort,cdsort}.
+enum class CGProfileSortKind { None, Hfsort, Cdsort };
+
 // For --discard-{all,locals,none}.
 enum class DiscardPolicy { Default, All, Locals, None };

@@ -214,7 +217,7 @@ struct Config {
   bool asNeeded = false;
   bool armBe8 = false;
   BsymbolicKind bsymbolic = BsymbolicKind::None;
-  bool callGraphProfileSort;
+  CGProfileSortKind callGraphProfileSort;
   bool checkSections;
   bool checkDynamicRelocs;
   llvm::DebugCompressionType compressDebugSections;
@@ -246,6 +249,7 @@ struct Config {
   bool ltoDebugPassManager;
   bool ltoEmitAsm;
   bool ltoUniqueBasicBlockSectionNames;
+  bool ltoValidateAllVtablesHaveTypeInfos;
   bool ltoWholeProgramVisibility;
   bool mergeArmExidx;
   bool mipsN32Abi = false;
@@ -479,6 +483,9 @@ struct Ctx {
   std::atomic<bool> hasTlsIe{false};
   // True if we need to reserve two .got entries for local-dynamic TLS model.
   std::atomic<bool> needsTlsLd{false};
+  // True if all native vtable symbols have corresponding type info symbols
+  // during LTO.
+ bool ltoAllVtablesHaveTypeInfos; void reset(); diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index da9ca44b3f209cee77f1b6060ea4b5ae9a20c868..4b563a0fdf2f335640784e4490068ce3d32e370a 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -105,6 +105,7 @@ void Ctx::reset() { backwardReferences.clear(); hasSympart.store(false, std::memory_order_relaxed); needsTlsLd.store(false, std::memory_order_relaxed); + ltoAllVtablesHaveTypeInfos = false; } llvm::raw_fd_ostream Ctx::openAuxiliaryFile(llvm::StringRef filename, @@ -1037,6 +1038,74 @@ template static void readCallGraphsFromObjectFiles() { } } +template +static void ltoValidateAllVtablesHaveTypeInfos(opt::InputArgList &args) { + DenseSet typeInfoSymbols; + SmallSetVector vtableSymbols; + auto processVtableAndTypeInfoSymbols = [&](StringRef name) { + if (name.consume_front("_ZTI")) + typeInfoSymbols.insert(name); + else if (name.consume_front("_ZTV")) + vtableSymbols.insert(name); + }; + + // Examine all native symbol tables. + for (ELFFileBase *f : ctx.objectFiles) { + using Elf_Sym = typename ELFT::Sym; + for (const Elf_Sym &s : f->template getGlobalELFSyms()) { + if (s.st_shndx != SHN_UNDEF) { + StringRef name = check(s.getName(f->getStringTable())); + processVtableAndTypeInfoSymbols(name); + } + } + } + + for (SharedFile *f : ctx.sharedFiles) { + using Elf_Sym = typename ELFT::Sym; + for (const Elf_Sym &s : f->template getELFSyms()) { + if (s.st_shndx != SHN_UNDEF) { + StringRef name = check(s.getName(f->getStringTable())); + processVtableAndTypeInfoSymbols(name); + } + } + } + + SmallSetVector vtableSymbolsWithNoRTTI; + for (StringRef s : vtableSymbols) + if (!typeInfoSymbols.count(s)) + vtableSymbolsWithNoRTTI.insert(s); + + // Remove known safe symbols. + for (auto *arg : args.filtered(OPT_lto_known_safe_vtables)) { + StringRef knownSafeName = arg->getValue(); + if (!knownSafeName.consume_front("_ZTV")) + error("--lto-known-safe-vtables=: expected symbol to start with _ZTV, " + "but got " + + knownSafeName); + vtableSymbolsWithNoRTTI.remove(knownSafeName); + } + + ctx.ltoAllVtablesHaveTypeInfos = vtableSymbolsWithNoRTTI.empty(); + // Check for unmatched RTTI symbols + for (StringRef s : vtableSymbolsWithNoRTTI) { + message( + "--lto-validate-all-vtables-have-type-infos: RTTI missing for vtable " + "_ZTV" + + s + ", --lto-whole-program-visibility disabled"); + } +} + +static CGProfileSortKind getCGProfileSortKind(opt::InputArgList &args) { + StringRef s = args.getLastArgValue(OPT_call_graph_profile_sort, "hfsort"); + if (s == "hfsort") + return CGProfileSortKind::Hfsort; + if (s == "cdsort") + return CGProfileSortKind::Cdsort; + if (s != "none") + error("unknown --call-graph-profile-sort= value: " + s); + return CGProfileSortKind::None; +} + static DebugCompressionType getCompressionType(StringRef s, StringRef option) { DebugCompressionType type = StringSwitch(s) .Case("zlib", DebugCompressionType::Zlib) @@ -1168,6 +1237,7 @@ static void readConfigs(opt::InputArgList &args) { else if (arg->getOption().matches(OPT_Bsymbolic)) config->bsymbolic = BsymbolicKind::All; } + config->callGraphProfileSort = getCGProfileSortKind(args); config->checkSections = args.hasFlag(OPT_check_sections, OPT_no_check_sections, true); config->chroot = args.getLastArgValue(OPT_chroot); @@ -1188,8 +1258,6 @@ static void readConfigs(opt::InputArgList &args) { args.hasFlag(OPT_eh_frame_hdr, OPT_no_eh_frame_hdr, false); config->emitLLVM = args.hasArg(OPT_plugin_opt_emit_llvm, false); config->emitRelocs = args.hasArg(OPT_emit_relocs); - 
config->callGraphProfileSort = args.hasFlag(
-      OPT_call_graph_profile_sort, OPT_no_call_graph_profile_sort, true);
   config->enableNewDtags =
       args.hasFlag(OPT_enable_new_dtags, OPT_disable_new_dtags, true);
   config->entry = args.getLastArgValue(OPT_entry);
@@ -1233,6 +1301,9 @@ static void readConfigs(opt::InputArgList &args) {
   config->ltoWholeProgramVisibility =
       args.hasFlag(OPT_lto_whole_program_visibility,
                    OPT_no_lto_whole_program_visibility, false);
+  config->ltoValidateAllVtablesHaveTypeInfos =
+      args.hasFlag(OPT_lto_validate_all_vtables_have_type_infos,
+                   OPT_no_lto_validate_all_vtables_have_type_infos, false);
   config->ltoo = args::getInteger(args, OPT_lto_O, 2);
   if (config->ltoo > 3)
     error("invalid optimization level for LTO: " + Twine(config->ltoo));
@@ -1619,7 +1690,7 @@ static void readConfigs(opt::InputArgList &args) {
       config->symbolOrderingFile = getSymbolOrderingFile(*buffer);
       // Also need to disable CallGraphProfileSort to prevent
       // LLD order symbols with CGProfile
-      config->callGraphProfileSort = false;
+      config->callGraphProfileSort = CGProfileSortKind::None;
     }
   }

@@ -2829,6 +2900,10 @@ void LinkerDriver::link(opt::InputArgList &args) {
       config->ltoEmitAsm ||
       !config->thinLTOModulesToCompile.empty();

+  // Handle --lto-validate-all-vtables-have-type-infos.
+  if (config->ltoValidateAllVtablesHaveTypeInfos)
+    invokeELFT(ltoValidateAllVtablesHaveTypeInfos, args);
+
   // Do link-time optimization if given files are LLVM bitcode files.
   // This compiles bitcode files into real object files.
   //
@@ -3021,7 +3096,7 @@ void LinkerDriver::link(opt::InputArgList &args) {
   }

   // Read the callgraph now that we know what was gced or icfed
-  if (config->callGraphProfileSort) {
+  if (config->callGraphProfileSort != CGProfileSortKind::None) {
     if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file))
       if (std::optional<MemoryBufferRef> buffer = readFile(arg->getValue()))
         readCallGraph(*buffer);
diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp
index a7df5f072f6f407dd53109b8a02931b8f815059c..ebc6ccdbea7861e9f1d59dc045ce500255ebb867 100644
--- a/lld/ELF/LTO.cpp
+++ b/lld/ELF/LTO.cpp
@@ -154,6 +154,9 @@ static lto::Config createConfig() {
   c.DwoDir = std::string(config->dwoDir);

   c.HasWholeProgramVisibility = config->ltoWholeProgramVisibility;
+  c.ValidateAllVtablesHaveTypeInfos =
+      config->ltoValidateAllVtablesHaveTypeInfos;
+  c.AllVtablesHaveTypeInfos = ctx.ltoAllVtablesHaveTypeInfos;
   c.AlwaysEmitRegularLTOObj = !config->ltoObjPath.empty();

   for (const llvm::StringRef &name : config->thinLTOModulesToCompile)
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index 4f8ea4fd4d2bb6cd21ee476f02815345c0c532f7..dea6c16949ee28698f35e5645ff31c80d5bbd1e4 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -125,9 +125,12 @@ defm as_needed: B<"as-needed",
 defm call_graph_ordering_file: Eq<"call-graph-ordering-file",
   "Layout sections to optimize the given callgraph">;

-defm call_graph_profile_sort: BB<"call-graph-profile-sort",
-  "Reorder sections with call graph profile (default)",
-  "Do not reorder sections with call graph profile">;
+def call_graph_profile_sort: JJ<"call-graph-profile-sort=">,
+  HelpText<"Reorder input sections with call graph profile using the specified algorithm (default: hfsort)">,
+  MetaVarName<"[none,hfsort,cdsort]">,
+  Values<"none,hfsort,cdsort">;
+def : FF<"no-call-graph-profile-sort">, Alias<call_graph_profile_sort>, AliasArgs<["none"]>,
+  Flags<[HelpHidden]>;

 // --chroot doesn't have a help text because it is an internal option.
def chroot: Separate<["--"], "chroot">; @@ -618,9 +621,14 @@ def lto_cs_profile_file: JJ<"lto-cs-profile-file=">, defm lto_pgo_warn_mismatch: BB<"lto-pgo-warn-mismatch", "turn on warnings about profile cfg mismatch (default)", "turn off warnings about profile cfg mismatch">; +defm lto_known_safe_vtables : EEq<"lto-known-safe-vtables", + "When --lto-validate-all-vtables-have-type-infos is enabled, skip validation on these vtables (_ZTV symbols)">; def lto_obj_path_eq: JJ<"lto-obj-path=">; def lto_sample_profile: JJ<"lto-sample-profile=">, HelpText<"Sample profile file path">; +defm lto_validate_all_vtables_have_type_infos: BB<"lto-validate-all-vtables-have-type-infos", + "Validate that all vtables have type infos for LTO link", + "Do not validate that all vtables have type infos for LTO link">; defm lto_whole_program_visibility: BB<"lto-whole-program-visibility", "Asserts that the LTO link has whole program visibility", "Asserts that the LTO link does not have whole program visibility">; diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1 index 0a5e4293dedaf5374946d5c7eb276d9f6f5da564..72b90094eec28241d22673d9942c10b9af6ffd19 100644 --- a/lld/docs/ld.lld.1 +++ b/lld/docs/ld.lld.1 @@ -117,6 +117,19 @@ is not intended to be cryptographically secure. .It Fl -build-id Synonym for .Fl -build-id Ns = Ns Cm fast . +.It Fl -call-graph-profile-sort Ns = Ns Ar algorithm +.Ar algorithm +may be: +.Pp +.Bl -tag -width 2n -compact +.It Cm none +Ignore call graph profile. +.It Cm hfsort +Use hfsort (default). +.It Cm cdsort +Use cdsort. +.El +.Pp .It Fl -color-diagnostics Ns = Ns Ar value Use colors in diagnostics. .Ar value diff --git a/lld/test/ELF/cgprofile-obj.s b/lld/test/ELF/cgprofile-obj.s index f56f3bcbf0c3c5e92c11e83b8c692f0cb17450e9..0848adc5e4279a7edbb8fdc3730f104ed711819b 100644 --- a/lld/test/ELF/cgprofile-obj.s +++ b/lld/test/ELF/cgprofile-obj.s @@ -3,8 +3,11 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o # RUN: ld.lld -e A %t.o -o %t # RUN: llvm-nm --no-sort %t | FileCheck %s -# RUN: ld.lld --no-call-graph-profile-sort -e A %t.o -o %t +# RUN: ld.lld --call-graph-profile-sort=none -e A %t.o -o %t # RUN: llvm-nm --no-sort %t | FileCheck %s --check-prefix=NO-CG +## --no-call-graph-profile-sort is an alias for --call-graph-profile-sort=none. +# RUN: ld.lld --no-call-graph-profile-sort -e A %t.o -o %t1 +# RUN: cmp %t %t1 .section .text.D,"ax",@progbits D: diff --git a/lld/test/ELF/cgprofile-txt.s b/lld/test/ELF/cgprofile-txt.s index 99cbfa574532523a842e8fa539598c08b2e61ae1..c9194bbbc43cbe0284091ef63b66ebdedc4e5813 100644 --- a/lld/test/ELF/cgprofile-txt.s +++ b/lld/test/ELF/cgprofile-txt.s @@ -24,8 +24,19 @@ # RUN: echo "TooManyPreds8 TooManyPreds 10" >> %t.call_graph # RUN: echo "TooManyPreds9 TooManyPreds 10" >> %t.call_graph # RUN: echo "TooManyPreds10 TooManyPreds 11" >> %t.call_graph -# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=hfsort -o %t2 # RUN: llvm-readobj --symbols %t2 | FileCheck %s +## --call-graph-profile-sort=hfsort is the default. 
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2b +# RUN: cmp %t2 %t2b + +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=cdsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CDSORT + +# RUN: not ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=sort \ +# RUN: -o /dev/null 2>&1 | FileCheck %s --check-prefix=UNKNOWN + +# UNKNOWN: error: unknown --call-graph-profile-sort= value: sort .section .text.D,"ax",@progbits D: @@ -159,6 +170,31 @@ TooManyPreds10: # CHECK: Name: _init2 # CHECK-NEXT: Value: 0x201141 +# CDSORT: Name: D +# CDSORT-NEXT: Value: 0x201123 +# CDSORT: Name: TooManyPreds +# CDSORT-NEXT: Value: 0x20112F +# CDSORT: Name: TooManyPreds10 +# CDSORT-NEXT: Value: 0x20112E +# CDSORT: Name: C +# CDSORT-NEXT: Value: 0x201122 +# CDSORT: Name: B +# CDSORT-NEXT: Value: 0x201121 +# CDSORT: Name: A +# CDSORT-NEXT: Value: 0x201120 +# CDSORT: Name: TS +# CDSORT-NEXT: Value: 0x20113D +# CDSORT: Name: PP +# CDSORT-NEXT: Value: 0x20113C +# CDSORT: Name: QC +# CDSORT-NEXT: Value: 0x20113E +# CDSORT: Name: GB +# CDSORT-NEXT: Value: 0x20113F +# CDSORT: Name: _init +# CDSORT-NEXT: Value: 0x201140 +# CDSORT: Name: _init2 +# CDSORT-NEXT: Value: 0x201141 + # NOSORT: Name: D # NOSORT-NEXT: Value: 0x201120 # NOSORT: Name: TooManyPreds diff --git a/lld/test/ELF/cgprofile-txt2.s b/lld/test/ELF/cgprofile-txt2.s index 91961db39c3a883fc948c3b609e2e8b95a1f4e4c..b59b6eeb292fabff00e32148498208b799d3cf46 100644 --- a/lld/test/ELF/cgprofile-txt2.s +++ b/lld/test/ELF/cgprofile-txt2.s @@ -5,17 +5,28 @@ # RUN: echo "B C 50" >> %t.call_graph # RUN: echo "C D 40" >> %t.call_graph # RUN: echo "D B 10" >> %t.call_graph -# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 -# RUN: llvm-readobj --symbols %t2 | FileCheck %s +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=hfsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKC3 +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=cdsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKCDS -# CHECK: Name: A -# CHECK-NEXT: Value: 0x201123 -# CHECK: Name: B -# CHECK-NEXT: Value: 0x201120 -# CHECK: Name: C -# CHECK-NEXT: Value: 0x201121 -# CHECK: Name: D -# CHECK-NEXT: Value: 0x201122 +# CHECKC3: Name: A +# CHECKC3-NEXT: Value: 0x201123 +# CHECKC3: Name: B +# CHECKC3-NEXT: Value: 0x201120 +# CHECKC3: Name: C +# CHECKC3-NEXT: Value: 0x201121 +# CHECKC3: Name: D +# CHECKC3-NEXT: Value: 0x201122 + +# CHECKCDS: Name: A +# CHECKCDS-NEXT: Value: 0x201120 +# CHECKCDS: Name: B +# CHECKCDS-NEXT: Value: 0x201121 +# CHECKCDS: Name: C +# CHECKCDS-NEXT: Value: 0x201122 +# CHECKCDS: Name: D +# CHECKCDS-NEXT: Value: 0x201123 .section .text.A,"ax",@progbits .globl A diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll new file mode 100644 index 0000000000000000000000000000000000000000..fb357831d6f21a97f34d9a4bf09e70818669bbc4 --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll @@ -0,0 +1,26 @@ +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.Native = type { %struct.A } + +@_ZTV6Native = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI6Native, 
ptr @_ZN1A1nEi, ptr @_ZN6Native1fEi] } +@_ZTS6Native = linkonce_odr constant [8 x i8] c"6Native\00" +@_ZTI6Native = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS6Native, ptr @_ZTI1A } + +; Base type A does not need to emit a vtable if it's never instantiated. However, RTTI still gets generated +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + + +define linkonce_odr i32 @_ZN6Native1fEi(ptr %this, i32 %a) #0 { + ret i32 1; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +attributes #0 = { noinline optnone } diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll new file mode 100644 index 0000000000000000000000000000000000000000..4533504c601803158a2ecbc550163c32fc21620a --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll @@ -0,0 +1,19 @@ +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.Native = type { %struct.A } + +@_ZTV6Native = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1A1nEi, ptr @_ZN6Native1fEi] } + +define linkonce_odr i32 @_ZN6Native1fEi(ptr %this, i32 %a) #0 { + ret i32 1; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +attributes #0 = { noinline optnone } diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll new file mode 100644 index 0000000000000000000000000000000000000000..43df8366aa2ae0c68e9a5531a0661f90e897ae2e --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll @@ -0,0 +1,68 @@ +;; Source code: +;; cat > a.h <<'eof' +;; struct A { virtual int foo(); }; +;; int bar(A *a); +;; eof +;; cat > b.cc <<'eof' +;; #include "a.h" +;; struct B : A { int foo() { return 2; } }; +;; int baz() { B b; return bar(&b); } +;; eof +;; clang++ -flto=thin b.cc -c + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.B = type { %struct.A } +%struct.A = type { ptr } + +@_ZTV1B = linkonce_odr dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B3fooEv] }, !type !0, !type !1, !type !2, !type !3 +@_ZTS1B = linkonce_odr dso_local constant [3 x i8] c"1B\00" +@_ZTI1A = external constant ptr +@_ZTI1B = linkonce_odr dso_local constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A } +@_ZTV1A = external unnamed_addr constant { [3 x ptr] } + +define dso_local noundef i32 @_Z3bazv() #0 { +entry: + %b = alloca %struct.B + call void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %b) + %call = call noundef i32 @_Z3barP1A(ptr noundef %b) + ret i32 %call +} + +define linkonce_odr dso_local void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this1) + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1 + ret void +} + +declare i32 @_Z3barP1A(ptr noundef) + +define linkonce_odr dso_local void @_ZN1AC2Ev(ptr noundef nonnull 
align 8 dereferenceable(8) %this) #0 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1 + ret void +} + +define linkonce_odr i32 @_ZN1B3fooEv(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + ret i32 2 +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFivE.virtual"} +!2 = !{i64 16, !"_ZTS1B"} +!3 = !{i64 16, !"_ZTSM1BFivE.virtual"} diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll new file mode 100644 index 0000000000000000000000000000000000000000..6cc55df82e2f2814b1717a0ad09c55a81030ed95 --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll @@ -0,0 +1,16 @@ +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@_ZTV1B = external unnamed_addr constant { [4 x ptr] } + +define linkonce_odr void @_ZN1BC2Ev(ptr %this) #0 { + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +attributes #0 = { noinline optnone } diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll new file mode 100644 index 0000000000000000000000000000000000000000..d6ac53f9fb936b0d1eb4f86549242288613dcf26 --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll @@ -0,0 +1,263 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cp %s %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!12, !13}' >> %t1_regular.ll +; RUN: echo '!12 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!13 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos.ll -o %t2.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2.bc -o %t2.o +; RUN: ld.lld %t2.o -o %t2.so -shared + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll -o %t2_nortti.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2_nortti.bc -o %t2_nortti.o +; RUN: ld.lld %t2_nortti.o -o %t2_nortti.so -shared + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_undef.ll -o %t2_undef.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2_undef.bc -o %t2_undef.o +; RUN: ld.lld %t2_undef.o -o %t2_undef.so -shared + +;; With --lto-whole-program-visibility, we assume no native types can interfere +;; and thus proceed with devirtualization even in the presence of native types + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t3_index -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t3_hybrid -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t3_regular -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi +; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +;; With --lto-validate-all-vtables-have-type-infos, the linker checks for the presence of vtables +;; and RTTI in native files and blocks devirtualization to be conservative on correctness +;; for these types. + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t4_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t4_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t4_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t4_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; DSOs behave similarly + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.so -o %t5_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.so -o %t5_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.so -o %t5_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t5_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +; VALIDATE-NOT: single-impl: +; VALIDATE: single-impl: devirtualized a call to _ZN1D1mEi +; VALIDATE-NOT: single-impl: + +;; When vtables without type infos are detected in native files, we have a hole in our knowledge so +;; --lto-validate-all-vtables-have-type-infos conservatively disables --lto-whole-program-visibility + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_nortti.o -o %t6_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_nortti.o -o %t6_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_nortti.o -o %t6_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t6_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; DSOs behave similarly + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_nortti.so -o %t7_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_nortti.so -o %t7_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_nortti.so -o %t7_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t7_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +; NO-RTTI-DAG: --lto-validate-all-vtables-have-type-infos: RTTI missing for vtable _ZTV6Native, --lto-whole-program-visibility disabled +; NO-RTTI-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +;; --lto-known-safe-vtables=* can be used to specifically allow types to participate in WPD +;; even if they don't have corresponding RTTI + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_nortti.o -o %t8_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_nortti.o -o %t8_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 
2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_nortti.o -o %t8_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t8_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Only check for definitions of vtables symbols, just having a reference does not allow a type to +;; be derived from + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_undef.o -o %t9_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_undef.o -o %t9_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_undef.o -o %t9_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t9_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !3, !type !4, !type !5 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1C, ptr @_ZN1C1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !6, !type !7, !type !8 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1D, ptr @_ZN1D1mEi] }, !type !9, !vcall_visibility !11 + +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + +@_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00" +@_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A } + +@_ZTS1C = linkonce_odr constant [3 x i8] c"1C\00" +@_ZTI1C = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1C, ptr @_ZTI1A } + +@_ZTS1D = internal constant [3 x i8] c"1D\00" +@_ZTI1D = internal constant { ptr, ptr } { ptr null, ptr @_ZTS1D } + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ] + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. 
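+  ;; (i.e. the indirect call through the vtable slot was replaced by a
+  ;; direct call to its only implementation, _ZN1A1nEi.)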
+ ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + ;; --lto-whole-program-visibility disabled so no devirtualization + ; CHECK-VALIDATE-IR: %call = tail call i32 %fptr1 + ; CHECK-NO-RTTI-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; We still have to call it as virtual. + ; CHECK-IR: %call2 = tail call i32 %fptr22 + ; CHECK-VALIDATE-IR: %call2 = tail call i32 %fptr22 + ; CHECK-NO-RTTI-IR: %call2 = tail call i32 %fptr22 + %call2 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !10) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call3 = tail call i32 @_ZN1D1mEi + ;; Types not present in native files can still be devirtualized + ; CHECK-VALIDATE-IR: %call3 = tail call i32 @_ZN1D1mEi + ;; --lto-whole-program-visibility disabled but being local this + ;; has VCallVisibilityTranslationUnit visibility so it's still devirtualized + ; CHECK-NO-RTTI-IR: %call3 = tail call i32 @_ZN1D1mEi + %call3 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call2) + + ret i32 %call3 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1B1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1C1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFviE.virtual"} +!2 = !{i64 24, !"_ZTSM1AFviE.virtual"} +!3 = !{i64 16, !"_ZTS1B"} +!4 = !{i64 16, !"_ZTSM1BFviE.virtual"} +!5 = !{i64 24, !"_ZTSM1BFviE.virtual"} +!6 = !{i64 16, !"_ZTS1C"} +!7 = !{i64 16, !"_ZTSM1CFviE.virtual"} +!8 = !{i64 24, !"_ZTSM1CFviE.virtual"} +!9 = !{i64 16, !10} +!10 = distinct !{} +!11 = !{i64 2} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll new file mode 100644 index 0000000000000000000000000000000000000000..15040b8707aede995aea588638eb7c7c3eafafaf --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll @@ -0,0 +1,183 @@ +; REQUIRES: x86 + +; RUN: rm -rf %t.dir +; RUN: split-file %s %t.dir +; RUN: cd %t.dir + +;; Common artifacts +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1.o ThinLTO.ll +; RUN: opt -module-summary -o %t2.o RegularLTO.ll + +;; --lto-whole-program-visibility when there's split ThinLTO and a RegularLTO with summary optimizes +;; using the combined index. +; RUN: ld.lld %t1.o %t2.o -o %t3 -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR,CHECK-COMMON-IR +; RUN: llvm-dis %t3.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-REGULAR-IR,CHECK-COMMON-REGULAR-IR + +;; --lto-validate-all-vtables-have-type-infos when there's split ThinLTO and a RegularLTO with summary behaves the same +;; as everything is present in the combined index. 
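+;; (Validation only blocks WPD for vtables defined outside the summaries;
+;; here both modules carry summaries, so nothing is treated as native.)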
+; RUN: ld.lld %t1.o %t2.o -o %t3 -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR,CHECK-COMMON-IR +; RUN: llvm-dis %t3.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-REGULAR-IR,CHECK-COMMON-REGULAR-IR + +; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi +; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi + +;--- ThinLTO.ll +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1A1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !3, !type !4, !type !5 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1C, ptr @_ZN1A1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !6, !type !7, !type !8 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1D, ptr @_ZN1D1mEi] }, !type !9, !vcall_visibility !11 + +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + +@_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00" +@_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A } + +@_ZTS1C = linkonce_odr constant [3 x i8] c"1C\00" +@_ZTI1C = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1C, ptr @_ZTI1A } + +@_ZTS1D = internal constant [3 x i8] c"1D\00" +@_ZTI1D = internal constant { ptr, ptr } { ptr null, ptr @_ZTS1D } + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ], section "llvm.metadata" + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { + ;; Call function built with RegularLTO + %RegularLTOResult = call i32 @RegularLTO(ptr %obj, i32 %a) + + ;; ThinLTO code starts here + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; Check that the call was not devirtualized. + ; CHECK-IR: %call2 = tail call i32 %fptr22 + %call2 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !10) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. 
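+  ;; (_ZTV1D is internal with translation-unit vcall visibility, so it is
+  ;; eligible for WPD regardless of what the native objects contain.)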
+ ; CHECK-IR: %call3 = tail call i32 @_ZN1D1mEi + %call3 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call2) + + ret i32 %call3 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i32 @RegularLTO(ptr) +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1A1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFviE.virtual"} +!2 = !{i64 24, !"_ZTSM1AFviE.virtual"} +!3 = !{i64 16, !"_ZTS1B"} +!4 = !{i64 16, !"_ZTSM1BFviE.virtual"} +!5 = !{i64 24, !"_ZTSM1BFviE.virtual"} +!6 = !{i64 16, !"_ZTS1C"} +!7 = !{i64 16, !"_ZTSM1CFviE.virtual"} +!8 = !{i64 24, !"_ZTSM1CFviE.virtual"} +!9 = !{i64 16, !10} +!10 = distinct !{} +!11 = !{i64 2} + +;--- RegularLTO.ll +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.Native = type { %struct.A } + +@_ZTV7Regular = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI7Regular, ptr @_ZN7Regular1fEi, ptr @_ZN1A1nEi] } , !type !0, !type !1, !type !2, !type !3, !type !4, !type !5 +@_ZTS7Regular = linkonce_odr constant [9 x i8] c"7Regular\00" +@_ZTI7Regular = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS7Regular, ptr @_ZTI1A } + +; Base type A does not need to emit a vtable if it's never instantiated. However, RTTI still gets generated +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [1 x ptr] [ ptr @_ZTV7Regular ], section "llvm.metadata" + +; CHECK-COMMON-REGULAR-IR-LABEL: define dso_local i32 @RegularLTO +define i32 @RegularLTO(ptr %obj, i32 %a) #0 { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptr1 = load ptr, ptr %vtable, align 8 + + ;; Check that the call was not devirtualized. 
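+  ;; (Both _ZN7Regular1fEi and the ThinLTO module's _ZN1A1fEi implement the
+  ;; first slot for type-id _ZTS1A, so there is no single implementation.)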
+ ; CHECK-REGULAR-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + ret i32 %call +} +; CHECK-COMMON-REGULAR-IR-LABEL: ret i32 +; CHECK-COMMON-REGULAR-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN7Regular1fEi(ptr %this, i32 %a) #0 { + ret i32 1; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +attributes #0 = { noinline optnone } +!llvm.module.flags = !{!6, !7} + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFviE.virtual"} +!2 = !{i64 24, !"_ZTSM1AFviE.virtual"} +!3 = !{i64 16, !"_ZTS7Regular"} +!4 = !{i64 16, !"_ZTSM7RegularFviE.virtual"} +!5 = !{i64 24, !"_ZTSM7RegularFviE.virtual"} +!6 = !{i32 1, !"ThinLTO", i32 0} +!7 = !{i32 1, !"EnableSplitLTOUnit", i32 1} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll new file mode 100644 index 0000000000000000000000000000000000000000..30bd75606f7d2d0aeb4bfeb2e82f289941101d0a --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll @@ -0,0 +1,136 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cp %s %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!6, !7}' >> %t1_regular.ll +; RUN: echo '!6 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!7 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +;; With --lto-whole-program-visibility, we assume no native types can interfere +;; and thus proceed with devirtualization even in the presence of native types + +;; Index based WPD +; RUN: ld.lld %t1.o -o %t3_index -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o -o %t3_hybrid -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o -o %t3_regular -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi +; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +;; With --lto-whole-program-visibility and --lto-validate-all-vtables-have-type-infos +;; we rely on resolutions on the typename symbol to inform us of what's outside the summary. +;; Without the typename symbol in the LTO unit (e.g. RTTI disabled) this causes +;; conservative disablement of WPD on these types unless it's local + +;; Index based WPD +; RUN: ld.lld %t1.o -o %t3_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o -o %t3_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o -o %t3_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +; VALIDATE-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1B1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1C1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !2 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN1D1mEi] }, !type !3, !vcall_visibility !5 + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ] + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + ;; No resolution for _ZTS1A means we don't devirtualize + ; CHECK-VALIDATE-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; We still have to call it as virtual. + ; CHECK-IR: %call3 = tail call i32 %fptr22 + ; CHECK-VALIDATE-IR: %call3 = tail call i32 %fptr22 + %call3 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !4) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. 
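+  ;; (the type test here uses the private type-id !4 rather than a _ZTS
+  ;; string, so no native object could ever define or reference it.)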
+ ; CHECK-IR: %call4 = tail call i32 @_ZN1D1mEi + ;; Being local this has VCallVisibilityTranslationUnit + ;; visibility so it's still devirtualized + ; CHECK-VALIDATE-IR: %call4 = tail call i32 @_ZN1D1mEi + %call4 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call3) + ret i32 %call4 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1B1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1C1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTS1B"} +!2 = !{i64 16, !"_ZTS1C"} +!3 = !{i64 16, !4} +!4 = distinct !{} +!5 = !{i64 2} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll new file mode 100644 index 0000000000000000000000000000000000000000..4ef048d6b6c601b9bf174c24f3c8f4372814d0bc --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll @@ -0,0 +1,130 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cp %s %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!2, !3}' >> %t1_regular.ll +; RUN: echo '!2 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!3 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_ref.ll -o %t2.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2.bc -o %t2.o + +;; Native objects can contain only a reference to the base type infos if the base declaration has no key functions. +;; Because of that, --lto-validate-all-vtables-have-type-infos needs to query for the type info symbol inside native files rather than the +;; type name symbol that's used as the key in !type metadata to correctly stop devirtualization on the native type. + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t3_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t3_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t3_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +; CHECK-NOT: single-impl: devirtualized a call to _ZN1A3fooEv + +;; Source code: +;; cat > a.h <<'eof' +;; struct A { virtual int foo(); }; +;; int bar(A *a); +;; eof +;; cat > main.cc <<'eof' +;; #include "a.h" +;; +;; int A::foo() { return 1; } +;; int bar(A *a) { return a->foo(); } +;; +;; extern int baz(); +;; int main() { +;; A a; +;; int i = bar(&a); +;; int j = baz(); +;; return i + j; +;; } +;; eof +;; clang++ -fwhole-program-vtables -fno-split-lto-unit -flto=thin main.cc -c + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { %struct.Abase } +%struct.Abase = type { ptr } + +@_ZTV1A = dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1A, ptr @_ZN1A3fooEv] }, align 8, !type !0, !type !1 +@_ZTS1A = dso_local constant [3 x i8] c"1A\00", align 1 +@_ZTI1A = dso_local constant { ptr, ptr } { ptr null, ptr @_ZTS1A }, align 8 + +define dso_local noundef i32 @_ZN1A3fooEv(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 align 2 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + ret i32 1 +} + +; CHECK-IR: define dso_local noundef i32 @_Z3barP1A +define dso_local noundef i32 @_Z3barP1A(ptr noundef %a) #0 { +entry: + %a.addr = alloca ptr + store ptr %a, ptr %a.addr + %0 = load ptr, ptr %a.addr + %vtable = load ptr, ptr %0 + %1 = call i1 @llvm.public.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %1) + %vfn = getelementptr inbounds ptr, ptr %vtable, i64 0 + %fptr = load ptr, ptr %vfn + ;; Check that the call was not devirtualized. + ; CHECK-IR: %call = call noundef i32 %fptr + %call = call noundef i32 %fptr(ptr noundef nonnull align 8 dereferenceable(8) %0) + ret i32 %call +} +; CHECK-IR: ret i32 +; CHECK-IR: } + +declare i1 @llvm.public.type.test(ptr, metadata) +declare void @llvm.assume(i1 noundef) + +define dso_local noundef i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %a = alloca %struct.A, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + store i32 0, ptr %retval, align 4 + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %a) + %call = call noundef i32 @_Z3barP1A(ptr noundef %a) + store i32 %call, ptr %i, align 4 + %call1 = call noundef i32 @_Z3bazv() + store i32 %call1, ptr %j, align 4 + %0 = load i32, ptr %i, align 4 + %1 = load i32, ptr %j, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +define linkonce_odr dso_local void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +declare noundef i32 @_Z3bazv() + +;; Make sure we don't inline or otherwise optimize out the direct calls. 
+attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFivE.virtual"} diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h index 568c9cf87f80e4648d220d1d955e3894c2825a4a..50b78095d015486c611cf136e1dfeb8bf70f619f 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -723,10 +723,10 @@ public: void setMemProt(orc::MemProt Prot) { this->Prot = Prot; } /// Get the memory lifetime policy for this section. - orc::MemLifetimePolicy getMemLifetimePolicy() const { return MLP; } + orc::MemLifetime getMemLifetime() const { return ML; } /// Set the memory lifetime policy for this section. - void setMemLifetimePolicy(orc::MemLifetimePolicy MLP) { this->MLP = MLP; } + void setMemLifetime(orc::MemLifetime ML) { this->ML = ML; } /// Returns the ordinal for this section. SectionOrdinal getOrdinal() const { return SecOrdinal; } @@ -794,7 +794,7 @@ private: StringRef Name; orc::MemProt Prot; - orc::MemLifetimePolicy MLP = orc::MemLifetimePolicy::Standard; + orc::MemLifetime ML = orc::MemLifetime::Standard; SectionOrdinal SecOrdinal = 0; BlockSet Blocks; SymbolSet Symbols; diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h index 09e0d71cf0bd29cdda2f7d49bc74cd075eb609e3..1b8c4d4e181cdcc16aaf2aa5a234aa2980e1f46c 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h @@ -292,8 +292,8 @@ private: /// address of that block using the Segment's AllocGroup. Once memory has been /// populated, clients can call finalize to finalize the memory. /// -/// Note: Segments with MemLifetimePolicy::NoAlloc are not permitted, since -/// they would not be useful, and their presence is likely to indicate a bug. +/// Note: Segments with MemLifetime::NoAlloc are not permitted, since they would +/// not be useful, and their presence is likely to indicate a bug. class SimpleSegmentAlloc { public: /// Describes a segment to be allocated. diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h index c20366cfbb38883796b0c7f690421b1ada84134e..b8b5f90b6b0fbc88a1d91fd288c52f650cf71c5b 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h @@ -72,7 +72,7 @@ inline MemProt fromSysMemoryProtectionFlags(sys::Memory::ProtectionFlags PF) { /// deallocated if a call is made to /// JITLinkMemoryManager::InFlightAllocation::abandon. The policies below apply /// to finalized allocations. -enum class MemLifetimePolicy { +enum class MemLifetime { /// Standard memory should be allocated by the allocator and then deallocated /// when the deallocate method is called for the finalized allocation. Standard, @@ -89,15 +89,15 @@ enum class MemLifetimePolicy { }; /// Print a MemDeallocPolicy. 
-inline raw_ostream &operator<<(raw_ostream &OS, MemLifetimePolicy MLP) {
+inline raw_ostream &operator<<(raw_ostream &OS, MemLifetime MLP) {
   switch (MLP) {
-  case MemLifetimePolicy::Standard:
+  case MemLifetime::Standard:
     OS << "standard";
     break;
-  case MemLifetimePolicy::Finalize:
+  case MemLifetime::Finalize:
     OS << "finalize";
     break;
-  case MemLifetimePolicy::NoAlloc:
+  case MemLifetime::NoAlloc:
     OS << "noalloc";
     break;
   }
@@ -124,11 +124,11 @@ public:
   AllocGroup() = default;
 
   /// Create an AllocGroup from a MemProt only -- uses
-  /// MemLifetimePolicy::Standard.
+  /// MemLifetime::Standard.
   AllocGroup(MemProt MP) : Id(static_cast<underlying_type>(MP)) {}
 
-  /// Create an AllocGroup from a MemProt and a MemLifetimePolicy.
-  AllocGroup(MemProt MP, MemLifetimePolicy MLP)
+  /// Create an AllocGroup from a MemProt and a MemLifetime.
+  AllocGroup(MemProt MP, MemLifetime MLP)
       : Id(static_cast<underlying_type>(MP) |
            (static_cast<underlying_type>(MLP) << BitsForProt)) {}
 
@@ -137,9 +137,9 @@ public:
     return static_cast<MemProt>(Id & ((1U << BitsForProt) - 1));
   }
 
-  /// Returns the MemLifetimePolicy for this group.
-  MemLifetimePolicy getMemLifetimePolicy() const {
-    return static_cast<MemLifetimePolicy>(Id >> BitsForProt);
+  /// Returns the MemLifetime for this group.
+  MemLifetime getMemLifetime() const {
+    return static_cast<MemLifetime>(Id >> BitsForProt);
   }
 
   friend bool operator==(const AllocGroup &LHS, const AllocGroup &RHS) {
@@ -203,8 +203,7 @@ private:
 /// Print an AllocGroup.
 inline raw_ostream &operator<<(raw_ostream &OS, AllocGroup AG) {
-  return OS << '(' << AG.getMemProt() << ", " << AG.getMemLifetimePolicy()
-            << ')';
+  return OS << '(' << AG.getMemProt() << ", " << AG.getMemLifetime() << ')';
 }
 
 } // end namespace orc
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h
index 09c73db44a947b3fc8ef1747f9d032d1332f3b10..1285867565e22b80cdbf8223c729572d1b5adc5b 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h
@@ -36,10 +36,9 @@ struct RemoteAllocGroup {
   RemoteAllocGroup(MemProt Prot, bool FinalizeLifetime)
       : Prot(Prot), FinalizeLifetime(FinalizeLifetime) {}
   RemoteAllocGroup(const AllocGroup &AG) : Prot(AG.getMemProt()) {
-    assert(AG.getMemLifetimePolicy() != orc::MemLifetimePolicy::NoAlloc &&
+    assert(AG.getMemLifetime() != orc::MemLifetime::NoAlloc &&
            "Cannot use no-alloc memory in a remote alloc request");
-    FinalizeLifetime =
-        AG.getMemLifetimePolicy() == orc::MemLifetimePolicy::Finalize;
+    FinalizeLifetime = AG.getMemLifetime() == orc::MemLifetime::Finalize;
   }
 
   MemProt Prot;
diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h
index 5c23ba4f7ac498f2679af3c638ab6b62fdf39b40..76e19dd007912d3ad5cfb117ebfa3cb880721124 100644
--- a/llvm/include/llvm/LTO/Config.h
+++ b/llvm/include/llvm/LTO/Config.h
@@ -80,6 +80,12 @@ struct Config {
   /// link.
   bool HasWholeProgramVisibility = false;
 
+  /// We're validating that all native vtables have corresponding type infos.
+  bool ValidateAllVtablesHaveTypeInfos = false;
+  /// If all native vtables have corresponding type infos, allow
+  /// usage of RTTI to block devirtualization on types used in native files.
+  bool AllVtablesHaveTypeInfos = false;
+
   /// Always emit a Regular LTO object even when it is empty because no Regular
   /// LTO modules were linked. This option is useful for build systems that
   /// want to know a priori all possible output files.
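A note on the AllocGroup encoding that the rename above touches. The sketch below is not the ORC header itself -- the class name and the enumerator values are assumptions made for illustration -- but it mirrors the packing scheme visible in the hunks: the MemProt flags occupy the low three bits of a single id byte, and the MemLifetime rides in the bits above them.

  #include <cassert>
  #include <cstdint>

  enum class MemProt : uint8_t { None = 0, Read = 1, Write = 2, Exec = 4 };
  enum class MemLifetime : uint8_t { Standard = 0, Finalize = 1, NoAlloc = 2 };

  class PackedAllocGroup {
    using underlying_type = uint8_t;
    static constexpr unsigned BitsForProt = 3; // One bit each for R, W and X.
    underlying_type Id = 0;

  public:
    PackedAllocGroup(MemProt MP, MemLifetime ML)
        : Id(static_cast<underlying_type>(MP) |
             (static_cast<underlying_type>(ML) << BitsForProt)) {}

    // Mask off the lifetime bits to recover the protection flags.
    MemProt getMemProt() const {
      return static_cast<MemProt>(Id & ((1U << BitsForProt) - 1));
    }

    // Everything above the protection bits is the lifetime.
    MemLifetime getMemLifetime() const {
      return static_cast<MemLifetime>(Id >> BitsForProt);
    }
  };

  int main() {
    PackedAllocGroup AG(MemProt::Read, MemLifetime::Finalize);
    assert(AG.getMemProt() == MemProt::Read);
    assert(AG.getMemLifetime() == MemLifetime::Finalize);
    return 0;
  }

Keeping the whole group in one small integer is what lets JITLink key per-segment maps directly on a (MemProt, MemLifetime) pair, as in Segments[{Sec.getMemProt(), Sec.getMemLifetime()}] in JITLinkMemoryManager.cpp further down.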
diff --git a/llvm/include/llvm/Support/raw_ostream.h b/llvm/include/llvm/Support/raw_ostream.h
index 1e01eb9ea19c4187302a91457b6d34fbe5b67584..a1ad1f8f5333244b65c4461a1225e8cd26b2895a 100644
--- a/llvm/include/llvm/Support/raw_ostream.h
+++ b/llvm/include/llvm/Support/raw_ostream.h
@@ -437,8 +437,8 @@ public:
 #ifndef NDEBUG
     uint64_t Pos = tell();
     // /dev/null always reports a pos of 0, so we cannot perform this check
-    // in that case.
-    if (Pos)
+    // in that case. Likewise, when Size is 0, no extending will occur.
+    if (Pos && Size)
       assert(Size + Offset <= Pos && "We don't support extending the stream");
 #endif
     pwrite_impl(Ptr, Size, Offset);
diff --git a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h
index 9e121d9c6f4ed1000f8ddcfd3c5e3658e4729572..0be3146f695a67607435c28535f6a5ec79f0db41 100644
--- a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h
+++ b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h
@@ -243,10 +243,18 @@ void updatePublicTypeTestCalls(Module &M,
                                bool WholeProgramVisibilityEnabledInLTO);
 void updateVCallVisibilityInModule(
     Module &M, bool WholeProgramVisibilityEnabledInLTO,
-    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols);
+    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols,
+    bool ValidateAllVtablesHaveTypeInfos,
+    function_ref<bool(StringRef)> IsVisibleToRegularObj);
 void updateVCallVisibilityInIndex(
     ModuleSummaryIndex &Index, bool WholeProgramVisibilityEnabledInLTO,
-    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols);
+    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols,
+    const DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols);
+
+void getVisibleToRegularObjVtableGUIDs(
+    ModuleSummaryIndex &Index,
+    DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols,
+    function_ref<bool(StringRef)> IsVisibleToRegularObj);
 
 /// Perform index-based whole program devirtualization on the \p Summary
 /// index. Any devirtualized targets used by a type test in another module
diff --git a/llvm/include/llvm/Transforms/Utils/CodeLayout.h b/llvm/include/llvm/Transforms/Utils/CodeLayout.h
index e8106e474332199a9e49a19b04fe0d91725a90ca..f5127cff24af0dfd3901d19706db6f36656adcb8 100644
--- a/llvm/include/llvm/Transforms/Utils/CodeLayout.h
+++ b/llvm/include/llvm/Transforms/Utils/CodeLayout.h
@@ -14,14 +14,21 @@
 #ifndef LLVM_TRANSFORMS_UTILS_CODELAYOUT_H
 #define LLVM_TRANSFORMS_UTILS_CODELAYOUT_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include <utility>
 #include <vector>
 
-namespace llvm {
+namespace llvm::codelayout {
 
 using EdgeT = std::pair<uint64_t, uint64_t>;
-using EdgeCountT = std::pair<EdgeT, uint64_t>;
+
+struct EdgeCount {
+  uint64_t src;
+  uint64_t dst;
+  uint64_t count;
+};
 
 /// Find a layout of nodes (basic blocks) of a given CFG optimizing jump
 /// locality and thus processor I-cache utilization. This is achieved via
@@ -34,25 +41,55 @@ using EdgeCountT = std::pair<EdgeT, uint64_t>;
 /// \p EdgeCounts: The execution counts of every edge (jump) in the profile. The
 /// map also defines the edges in CFG and should include 0-count edges.
 /// \returns The best block order found.
-std::vector<uint64_t>
-applyExtTspLayout(const std::vector<uint64_t> &NodeSizes,
-                  const std::vector<uint64_t> &NodeCounts,
-                  const std::vector<EdgeCountT> &EdgeCounts);
+std::vector<uint64_t> computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,
+                                          ArrayRef<uint64_t> NodeCounts,
+                                          ArrayRef<EdgeCount> EdgeCounts);
 
 /// Estimate the "quality" of a given node order in CFG. The higher the score,
 /// the better the order is. The score is designed to reflect the locality of
 /// the given order, which is anti-correlated with the number of I-cache misses
 /// in a typical execution of the function.
-double calcExtTspScore(const std::vector<uint64_t> &Order,
-                       const std::vector<uint64_t> &NodeSizes,
-                       const std::vector<uint64_t> &NodeCounts,
-                       const std::vector<EdgeCountT> &EdgeCounts);
+double calcExtTspScore(ArrayRef<uint64_t> Order, ArrayRef<uint64_t> NodeSizes,
+                       ArrayRef<uint64_t> NodeCounts,
+                       ArrayRef<EdgeCount> EdgeCounts);
 
 /// Estimate the "quality" of the current node order in CFG.
-double calcExtTspScore(const std::vector<uint64_t> &NodeSizes,
-                       const std::vector<uint64_t> &NodeCounts,
-                       const std::vector<EdgeCountT> &EdgeCounts);
+double calcExtTspScore(ArrayRef<uint64_t> NodeSizes,
+                       ArrayRef<uint64_t> NodeCounts,
+                       ArrayRef<EdgeCount> EdgeCounts);
+
+/// Algorithm-specific params for Cache-Directed Sort. The values are tuned for
+/// the best performance of large-scale front-end bound binaries.
+struct CDSortConfig {
+  /// The size of the cache.
+  unsigned CacheEntries = 16;
+  /// The size of a line in the cache.
+  unsigned CacheSize = 2048;
+  /// The power exponent for the distance-based locality.
+  double DistancePower = 0.25;
+  /// The scale factor for the frequency-based locality.
+  double FrequencyScale = 0.25;
+};
+
+/// Apply a Cache-Directed Sort for functions represented by a call graph.
+/// The placement is done by optimizing the call locality by co-locating
+/// frequently executed functions.
+/// \p FuncSizes: The sizes of the nodes (in bytes).
+/// \p FuncCounts: The execution counts of the nodes in the profile.
+/// \p CallCounts: The execution counts of every edge (jump) in the profile. The
+/// map also defines the edges in CFG and should include 0-count edges.
+/// \p CallOffsets: The offsets of the calls from their source nodes.
+/// \returns The best function order found.
+std::vector<uint64_t> computeCacheDirectedLayout(
+    ArrayRef<uint64_t> FuncSizes, ArrayRef<uint64_t> FuncCounts,
+    ArrayRef<EdgeCount> CallCounts, ArrayRef<uint64_t> CallOffsets);
+
+/// Apply a Cache-Directed Sort with a custom config.
+std::vector<uint64_t> computeCacheDirectedLayout(
+    const CDSortConfig &Config, ArrayRef<uint64_t> FuncSizes,
+    ArrayRef<uint64_t> FuncCounts, ArrayRef<EdgeCount> CallCounts,
+    ArrayRef<uint64_t> CallOffsets);
 
-} // end namespace llvm
+} // namespace llvm::codelayout
 
 #endif // LLVM_TRANSFORMS_UTILS_CODELAYOUT_H
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 6913165add258561237efa246dd0edc37a8c447a..b69045b4d61f28b47ad70377002a3d938b9bcbeb 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -3502,7 +3502,7 @@ void MachineBlockPlacement::applyExtTsp() {
 
   auto BlockSizes = std::vector<uint64_t>(F->size());
   auto BlockCounts = std::vector<uint64_t>(F->size());
-  std::vector<EdgeCountT> JumpCounts;
+  std::vector<codelayout::EdgeCount> JumpCounts;
   for (MachineBasicBlock &MBB : *F) {
     // Getting the block frequency.
     BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
@@ -3521,8 +3521,8 @@ void MachineBlockPlacement::applyExtTsp() {
     for (MachineBasicBlock *Succ : MBB.successors()) {
       auto EP = MBPI->getEdgeProbability(&MBB, Succ);
       BlockFrequency JumpFreq = BlockFreq * EP;
-      auto Jump = std::make_pair(BlockIndex[&MBB], BlockIndex[Succ]);
-      JumpCounts.push_back(std::make_pair(Jump, JumpFreq.getFrequency()));
+      JumpCounts.push_back(
+          {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
     }
   }
 
@@ -3535,7 +3535,7 @@ void MachineBlockPlacement::applyExtTsp() {
                     calcExtTspScore(BlockSizes, BlockCounts, JumpCounts)));
 
   // Run the layout algorithm.
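   // (computeExtTspLayout returns a permutation of the dense block indices
   // built above; the loop that follows maps it back to MachineBasicBlocks.)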
-  auto NewOrder = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
+  auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
   std::vector<const MachineBasicBlock *> NewBlockOrder;
   NewBlockOrder.reserve(F->size());
   for (uint64_t Node : NewOrder) {
diff --git a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp
index 6668854e1a6a6854b47c94f8726ff73b5573f27d..3bf7c9edb8bc596b05d86a1326d80ee4c637e4fc 100644
--- a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp
@@ -161,7 +161,7 @@ Error COFFLinkGraphBuilder::graphifySections() {
     if (!GraphSec) {
       GraphSec = &G->createSection(SectionName, Prot);
       if ((*Sec)->Characteristics & COFF::IMAGE_SCN_LNK_REMOVE)
-        GraphSec->setMemLifetimePolicy(orc::MemLifetimePolicy::NoAlloc);
+        GraphSec->setMemLifetime(orc::MemLifetime::NoAlloc);
     }
     if (GraphSec->getMemProt() != Prot)
       return make_error<JITLinkError>("MemProt should match");
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
index e726457983490df0557bb63b0a8eb5e8d6bcb2eb..127f33aad2eada4362a1ce31322811c0f3bd54a7 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
+++ b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
@@ -366,7 +366,7 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::graphifySections() {
       GraphSec = &G->createSection(*Name, Prot);
       // Non-SHF_ALLOC sections get NoAlloc memory lifetimes.
       if (!(Sec.sh_flags & ELF::SHF_ALLOC)) {
-        GraphSec->setMemLifetimePolicy(orc::MemLifetimePolicy::NoAlloc);
+        GraphSec->setMemLifetime(orc::MemLifetime::NoAlloc);
         LLVM_DEBUG({
           dbgs() << "    " << SecIndex << ": \"" << *Name
                  << "\" is not a SHF_ALLOC section. Using NoAlloc lifetime.\n";
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
index e69eddd6e1194479a10bbec118238ab1df5559f3..25569d63daa298a330169254c864d9d189e503ea 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
@@ -124,8 +124,7 @@ private:
     LLVM_DEBUG(dbgs() << "Fixing up blocks:\n");
 
     for (auto &Sec : G.sections()) {
-      bool NoAllocSection =
-          Sec.getMemLifetimePolicy() == orc::MemLifetimePolicy::NoAlloc;
+      bool NoAllocSection = Sec.getMemLifetime() == orc::MemLifetime::NoAlloc;
 
       for (auto *B : Sec.blocks()) {
         LLVM_DEBUG(dbgs() << "  " << *B << ":\n");
@@ -153,12 +152,11 @@ private:
 
         // If B is a block in a Standard or Finalize section then make sure
         // that no edges point to symbols in NoAlloc sections.
-        assert(
-            (NoAllocSection || !E.getTarget().isDefined() ||
-             E.getTarget().getBlock().getSection().getMemLifetimePolicy() !=
-                 orc::MemLifetimePolicy::NoAlloc) &&
-            "Block in allocated section has edge pointing to no-alloc "
-            "section");
+        assert((NoAllocSection || !E.getTarget().isDefined() ||
+                E.getTarget().getBlock().getSection().getMemLifetime() !=
+                    orc::MemLifetime::NoAlloc) &&
+               "Block in allocated section has edge pointing to no-alloc "
+               "section");
 
         // Dispatch to LinkerImpl for fixup.
if (auto Err = impl().applyFixup(G, *B, E)) diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp index f481504135a5fba88f2e8354638a94ca6c497e0b..57e17aa78fed919f4cf7bb4fb422619dd4084bf7 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp @@ -26,10 +26,10 @@ BasicLayout::BasicLayout(LinkGraph &G) : G(G) { for (auto &Sec : G.sections()) { // Skip empty sections, and sections with NoAlloc lifetime policies. if (Sec.blocks().empty() || - Sec.getMemLifetimePolicy() == orc::MemLifetimePolicy::NoAlloc) + Sec.getMemLifetime() == orc::MemLifetime::NoAlloc) continue; - auto &Seg = Segments[{Sec.getMemProt(), Sec.getMemLifetimePolicy()}]; + auto &Seg = Segments[{Sec.getMemProt(), Sec.getMemLifetime()}]; for (auto *B : Sec.blocks()) if (LLVM_LIKELY(!B->isZeroFill())) Seg.ContentBlocks.push_back(B); @@ -90,7 +90,7 @@ BasicLayout::getContiguousPageBasedLayoutSizes(uint64_t PageSize) { inconvertibleErrorCode()); uint64_t SegSize = alignTo(Seg.ContentSize + Seg.ZeroFillSize, PageSize); - if (AG.getMemLifetimePolicy() == orc::MemLifetimePolicy::Standard) + if (AG.getMemLifetime() == orc::MemLifetime::Standard) SegsSizes.StandardSegs += SegSize; else SegsSizes.FinalizeSegs += SegSize; @@ -164,15 +164,15 @@ void SimpleSegmentAlloc::Create(JITLinkMemoryManager &MemMgr, auto &AG = KV.first; auto &Seg = KV.second; - assert(AG.getMemLifetimePolicy() != orc::MemLifetimePolicy::NoAlloc && + assert(AG.getMemLifetime() != orc::MemLifetime::NoAlloc && "NoAlloc segments are not supported by SimpleSegmentAlloc"); auto AGSectionName = AGSectionNames[static_cast(AG.getMemProt()) | - static_cast(AG.getMemLifetimePolicy()) << 3]; + static_cast(AG.getMemLifetime()) << 3]; auto &Sec = G->createSection(AGSectionName, AG.getMemProt()); - Sec.setMemLifetimePolicy(AG.getMemLifetimePolicy()); + Sec.setMemLifetime(AG.getMemLifetime()); if (Seg.ContentSize != 0) { NextAddr = @@ -419,10 +419,9 @@ void InProcessMemoryManager::allocate(const JITLinkDylib *JD, LinkGraph &G, auto &AG = KV.first; auto &Seg = KV.second; - auto &SegAddr = - (AG.getMemLifetimePolicy() == orc::MemLifetimePolicy::Standard) - ? NextStandardSegAddr - : NextFinalizeSegAddr; + auto &SegAddr = (AG.getMemLifetime() == orc::MemLifetime::Standard) + ? NextStandardSegAddr + : NextFinalizeSegAddr; Seg.WorkingMem = SegAddr.toPtr(); Seg.Addr = SegAddr; diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp index c40e0f9ffc8d4740accf7cb93e7e5dc986fff469..45385eb6f76dc25c848b0f5a289685b27ecac18c 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp @@ -192,7 +192,7 @@ Error MachOLinkGraphBuilder::createNormalizedSections() { // TODO: Are there any other criteria for NoAlloc lifetime? 
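     // For reference, the renamed orc::MemLifetime distinguishes three
     // policies: Standard memory stays resident after finalization, Finalize
     // memory is released once finalization completes, and NoAlloc sections
     // (like the debug sections below) are never allocated in the executing
     // process and are only materialized on the linker side.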
if (NSec.Flags & MachO::S_ATTR_DEBUG) - NSec.GraphSection->setMemLifetimePolicy(orc::MemLifetimePolicy::NoAlloc); + NSec.GraphSection->setMemLifetime(orc::MemLifetime::NoAlloc); IndexToSection.insert(std::make_pair(SecIndex, std::move(NSec))); } diff --git a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp index ca4950077ffe92a4b46bb79a6ebc8ba11644a542..9cfe547c84c310b75c85c6204500cdd30342eb7a 100644 --- a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp @@ -322,8 +322,8 @@ void SharedMemoryMapper::initialize(MemoryMapper::AllocInfo &AI, std::memset(Base + Segment.ContentSize, 0, Segment.ZeroFillSize); tpctypes::SharedMemorySegFinalizeRequest SegReq; - SegReq.RAG = {Segment.AG.getMemProt(), Segment.AG.getMemLifetimePolicy() == - MemLifetimePolicy::Finalize}; + SegReq.RAG = {Segment.AG.getMemProt(), + Segment.AG.getMemLifetime() == MemLifetime::Finalize}; SegReq.Addr = AI.MappingBase + Segment.Offset; SegReq.Size = Segment.ContentSize + Segment.ZeroFillSize; diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index bc8abb751221ceda41c7eba05460c50d6719ebd6..6efdf6a7c3c926a57bf3e55ed3f325677309fecf 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1285,13 +1285,27 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { updateMemProfAttributes(*RegularLTO.CombinedModule, ThinLTO.CombinedIndex); + bool WholeProgramVisibilityEnabledInLTO = + Conf.HasWholeProgramVisibility && + // If validation is enabled, upgrade visibility only when all vtables + // have typeinfos. + (!Conf.ValidateAllVtablesHaveTypeInfos || Conf.AllVtablesHaveTypeInfos); + + // This returns true when the name is local or not defined. Locals are + // expected to be handled separately. + auto IsVisibleToRegularObj = [&](StringRef name) { + auto It = GlobalResolutions.find(name); + return (It == GlobalResolutions.end() || It->second.VisibleOutsideSummary); + }; + // If allowed, upgrade public vcall visibility metadata to linkage unit // visibility before whole program devirtualization in the optimizer. - updateVCallVisibilityInModule(*RegularLTO.CombinedModule, - Conf.HasWholeProgramVisibility, - DynamicExportSymbols); + updateVCallVisibilityInModule( + *RegularLTO.CombinedModule, WholeProgramVisibilityEnabledInLTO, + DynamicExportSymbols, Conf.ValidateAllVtablesHaveTypeInfos, + IsVisibleToRegularObj); updatePublicTypeTestCalls(*RegularLTO.CombinedModule, - Conf.HasWholeProgramVisibility); + WholeProgramVisibilityEnabledInLTO); if (Conf.PreOptModuleHook && !Conf.PreOptModuleHook(0, *RegularLTO.CombinedModule)) @@ -1693,13 +1707,38 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, std::set ExportedGUIDs; - if (hasWholeProgramVisibility(Conf.HasWholeProgramVisibility)) + bool WholeProgramVisibilityEnabledInLTO = + Conf.HasWholeProgramVisibility && + // If validation is enabled, upgrade visibility only when all vtables + // have typeinfos. + (!Conf.ValidateAllVtablesHaveTypeInfos || Conf.AllVtablesHaveTypeInfos); + if (hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) ThinLTO.CombinedIndex.setWithWholeProgramVisibility(); + + // If we're validating, get the vtable symbols that should not be + // upgraded because they correspond to typeIDs outside of index-based + // WPD info. + DenseSet VisibleToRegularObjSymbols; + if (WholeProgramVisibilityEnabledInLTO && + Conf.ValidateAllVtablesHaveTypeInfos) { + // This returns true when the name is local or not defined. 
Locals are + // expected to be handled separately. + auto IsVisibleToRegularObj = [&](StringRef name) { + auto It = GlobalResolutions.find(name); + return (It == GlobalResolutions.end() || + It->second.VisibleOutsideSummary); + }; + + getVisibleToRegularObjVtableGUIDs(ThinLTO.CombinedIndex, + VisibleToRegularObjSymbols, + IsVisibleToRegularObj); + } + // If allowed, upgrade public vcall visibility to linkage unit visibility in // the summaries before whole program devirtualization below. - updateVCallVisibilityInIndex(ThinLTO.CombinedIndex, - Conf.HasWholeProgramVisibility, - DynamicExportSymbols); + updateVCallVisibilityInIndex( + ThinLTO.CombinedIndex, WholeProgramVisibilityEnabledInLTO, + DynamicExportSymbols, VisibleToRegularObjSymbols); // Perform index-based WPD. This will return immediately if there are // no index entries in the typeIdMetadata map (e.g. if we are instead diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp index 1402da7fbbd2774a52dae95776f0a2a4318994d4..3e2216ca61a2c7281bd8143ff0d495bf2f57751c 100644 --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -604,11 +604,14 @@ bool LTOCodeGenerator::optimize() { // pipeline run below. updatePublicTypeTestCalls(*MergedModule, /* WholeProgramVisibilityEnabledInLTO */ false); - updateVCallVisibilityInModule(*MergedModule, - /* WholeProgramVisibilityEnabledInLTO */ false, - // FIXME: This needs linker information via a - // TBD new interface. - /* DynamicExportSymbols */ {}); + updateVCallVisibilityInModule( + *MergedModule, + /* WholeProgramVisibilityEnabledInLTO */ false, + // FIXME: These need linker information via a + // TBD new interface. + /*DynamicExportSymbols=*/{}, + /*ValidateAllVtablesHaveTypeInfos=*/false, + /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); // We always run the verifier once on the merged module, the `DisableVerify` // parameter only applies to subsequent verify. diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index 24cd6e1a0b415a00c03013446d9f81410f324bda..152f708969e13fb5a9f363f9c75b02c417b8818e 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -1058,11 +1058,14 @@ void ThinLTOCodeGenerator::run() { // via the internal option. Must be done before WPD below. if (hasWholeProgramVisibility(/* WholeProgramVisibilityEnabledInLTO */ false)) Index->setWithWholeProgramVisibility(); + + // FIXME: This needs linker information via a TBD new interface updateVCallVisibilityInIndex(*Index, - /* WholeProgramVisibilityEnabledInLTO */ false, - // FIXME: This needs linker information via a + /*WholeProgramVisibilityEnabledInLTO=*/false, + // FIXME: These need linker information via a // TBD new interface. - /* DynamicExportSymbols */ {}); + /*DynamicExportSymbols=*/{}, + /*VisibleToRegularObjSymbols=*/{}); // Perform index-based WPD. This will return immediately if there are // no index entries in the typeIdMetadata map (e.g. 
if we are instead diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index d332586423651c8cb004218914a0fb356cff8cf2..3406595950b58cb477ed7a6f5085ccce7b2461d8 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -784,12 +784,52 @@ bool hasWholeProgramVisibility(bool WholeProgramVisibilityEnabledInLTO) { !DisableWholeProgramVisibility; } +static bool +typeIDVisibleToRegularObj(StringRef TypeID, + function_ref IsVisibleToRegularObj) { + // TypeID for member function pointer type is an internal construct + // and won't exist in IsVisibleToRegularObj. The full TypeID + // will be present and participate in invalidation. + if (TypeID.ends_with(".virtual")) + return false; + + // TypeID that doesn't start with Itanium mangling (_ZTS) will be + // non-externally visible types which cannot interact with + // external native files. See CodeGenModule::CreateMetadataIdentifierImpl. + if (!TypeID.consume_front("_ZTS")) + return false; + + // TypeID is keyed off the type name symbol (_ZTS). However, the native + // object may not contain this symbol if it does not contain a key + // function for the base type and thus only contains a reference to the + // type info (_ZTI). To catch this case we query using the type info + // symbol corresponding to the TypeID. + std::string typeInfo = ("_ZTI" + TypeID).str(); + return IsVisibleToRegularObj(typeInfo); +} + +static bool +skipUpdateDueToValidation(GlobalVariable &GV, + function_ref IsVisibleToRegularObj) { + SmallVector Types; + GV.getMetadata(LLVMContext::MD_type, Types); + + for (auto Type : Types) + if (auto *TypeID = dyn_cast(Type->getOperand(1).get())) + return typeIDVisibleToRegularObj(TypeID->getString(), + IsVisibleToRegularObj); + + return false; +} + /// If whole program visibility asserted, then upgrade all public vcall /// visibility metadata on vtable definitions to linkage unit visibility in /// Module IR (for regular or hybrid LTO). void updateVCallVisibilityInModule( Module &M, bool WholeProgramVisibilityEnabledInLTO, - const DenseSet &DynamicExportSymbols) { + const DenseSet &DynamicExportSymbols, + bool ValidateAllVtablesHaveTypeInfos, + function_ref IsVisibleToRegularObj) { if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) return; for (GlobalVariable &GV : M.globals()) { @@ -800,7 +840,13 @@ void updateVCallVisibilityInModule( GV.getVCallVisibility() == GlobalObject::VCallVisibilityPublic && // Don't upgrade the visibility for symbols exported to the dynamic // linker, as we have no information on their eventual use. - !DynamicExportSymbols.count(GV.getGUID())) + !DynamicExportSymbols.count(GV.getGUID()) && + // With validation enabled, we want to exclude symbols visible to + // regular objects. Local symbols will be in this group due to the + // current implementation but those with VCallVisibilityTranslationUnit + // will have already been marked in clang so are unaffected. + !(ValidateAllVtablesHaveTypeInfos && + skipUpdateDueToValidation(GV, IsVisibleToRegularObj))) GV.setVCallVisibilityMetadata(GlobalObject::VCallVisibilityLinkageUnit); } } @@ -832,12 +878,26 @@ void updatePublicTypeTestCalls(Module &M, } } +/// Based on typeID string, get all associated vtable GUIDS that are +/// visible to regular objects. 
+void getVisibleToRegularObjVtableGUIDs( + ModuleSummaryIndex &Index, + DenseSet &VisibleToRegularObjSymbols, + function_ref IsVisibleToRegularObj) { + for (const auto &typeID : Index.typeIdCompatibleVtableMap()) { + if (typeIDVisibleToRegularObj(typeID.first, IsVisibleToRegularObj)) + for (const TypeIdOffsetVtableInfo &P : typeID.second) + VisibleToRegularObjSymbols.insert(P.VTableVI.getGUID()); + } +} + /// If whole program visibility asserted, then upgrade all public vcall /// visibility metadata on vtable definition summaries to linkage unit /// visibility in Module summary index (for ThinLTO). void updateVCallVisibilityInIndex( ModuleSummaryIndex &Index, bool WholeProgramVisibilityEnabledInLTO, - const DenseSet &DynamicExportSymbols) { + const DenseSet &DynamicExportSymbols, + const DenseSet &VisibleToRegularObjSymbols) { if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) return; for (auto &P : Index) { @@ -850,6 +910,12 @@ void updateVCallVisibilityInIndex( if (!GVar || GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic) continue; + // With validation enabled, we want to exclude symbols visible to regular + // objects. Local symbols will be in this group due to the current + // implementation but those with VCallVisibilityTranslationUnit will have + // already been marked in clang so are unaffected. + if (VisibleToRegularObjSymbols.count(P.first)) + continue; GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit); } } @@ -1045,8 +1111,8 @@ bool DevirtModule::tryFindVirtualCallTargets( } bool DevirtIndex::tryFindVirtualCallTargets( - std::vector &TargetsForSlot, const TypeIdCompatibleVtableInfo TIdInfo, - uint64_t ByteOffset) { + std::vector &TargetsForSlot, + const TypeIdCompatibleVtableInfo TIdInfo, uint64_t ByteOffset) { for (const TypeIdOffsetVtableInfo &P : TIdInfo) { // Find a representative copy of the vtable initializer. // We can have multiple available_externally, linkonce_odr and weak_odr diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp index ac74a1c116cce05ccb04133a6eeff668ba10910c..58b5afbc869e907de1fd33a838dc9b4ccedc60c7 100644 --- a/llvm/lib/Transforms/Utils/CodeLayout.cpp +++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp @@ -45,8 +45,11 @@ #include "llvm/Support/Debug.h" #include +#include using namespace llvm; +using namespace llvm::codelayout; + #define DEBUG_TYPE "code-layout" namespace llvm { @@ -61,8 +64,8 @@ cl::opt ApplyExtTspWithoutProfile( cl::init(true), cl::Hidden); } // namespace llvm -// Algorithm-specific params. The values are tuned for the best performance -// of large-scale front-end bound binaries. +// Algorithm-specific params for Ext-TSP. The values are tuned for the best +// performance of large-scale front-end bound binaries. static cl::opt ForwardWeightCond( "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1), cl::desc("The weight of conditional forward jumps for ExtTSP value")); @@ -113,6 +116,21 @@ static cl::opt EnableChainSplitAlongJumps( "ext-tsp-enable-chain-split-along-jumps", cl::ReallyHidden, cl::init(true), cl::desc("The maximum size of a chain to apply splitting")); +// Algorithm-specific options for CDS. 
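// The four flags below mirror the fields of CDSortConfig; when a flag is set
// explicitly, computeCacheDirectedLayout (at the bottom of this file) copies
// its value over the config's default. An illustrative invocation, assuming a
// driver that forwards LLVM options:
//
//   -mllvm -cds-cache-entries=32 -mllvm -cds-distance-power=0.5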
+static cl::opt CacheEntries("cds-cache-entries", cl::ReallyHidden, + cl::desc("The size of the cache")); + +static cl::opt CacheSize("cds-cache-size", cl::ReallyHidden, + cl::desc("The size of a line in the cache")); + +static cl::opt DistancePower( + "cds-distance-power", cl::ReallyHidden, + cl::desc("The power exponent for the distance-based locality")); + +static cl::opt FrequencyScale( + "cds-frequency-scale", cl::ReallyHidden, + cl::desc("The scale factor for the frequency-based locality")); + namespace { // Epsilon for comparison of doubles. @@ -280,9 +298,9 @@ struct ChainT { } ChainEdge *getEdge(ChainT *Other) const { - for (auto It : Edges) { - if (It.first == Other) - return It.second; + for (const auto &[Chain, ChainEdge] : Edges) { + if (Chain == Other) + return ChainEdge; } return nullptr; } @@ -302,13 +320,13 @@ struct ChainT { Edges.push_back(std::make_pair(Other, Edge)); } - void merge(ChainT *Other, const std::vector &MergedBlocks) { - Nodes = MergedBlocks; - // Update the chain's data + void merge(ChainT *Other, std::vector MergedBlocks) { + Nodes = std::move(MergedBlocks); + // Update the chain's data. ExecutionCount += Other->ExecutionCount; Size += Other->Size; Id = Nodes[0]->Index; - // Update the node's data + // Update the node's data. for (size_t Idx = 0; Idx < Nodes.size(); Idx++) { Nodes[Idx]->CurChain = this; Nodes[Idx]->CurIndex = Idx; @@ -340,7 +358,7 @@ struct ChainT { /// An edge in the graph representing jumps between two chains. /// When nodes are merged into chains, the edges are combined too so that -/// there is always at most one edge between a pair of chains +/// there is always at most one edge between a pair of chains. struct ChainEdge { ChainEdge(const ChainEdge &) = delete; ChainEdge(ChainEdge &&) = default; @@ -426,40 +444,34 @@ private: uint64_t NodeT::outCount() const { uint64_t Count = 0; - for (JumpT *Jump : OutJumps) { + for (JumpT *Jump : OutJumps) Count += Jump->ExecutionCount; - } return Count; } uint64_t NodeT::inCount() const { uint64_t Count = 0; - for (JumpT *Jump : InJumps) { + for (JumpT *Jump : InJumps) Count += Jump->ExecutionCount; - } return Count; } void ChainT::mergeEdges(ChainT *Other) { - // Update edges adjacent to chain Other - for (auto EdgeIt : Other->Edges) { - ChainT *DstChain = EdgeIt.first; - ChainEdge *DstEdge = EdgeIt.second; + // Update edges adjacent to chain Other. + for (const auto &[DstChain, DstEdge] : Other->Edges) { ChainT *TargetChain = DstChain == Other ? this : DstChain; ChainEdge *CurEdge = getEdge(TargetChain); if (CurEdge == nullptr) { DstEdge->changeEndpoint(Other, this); this->addEdge(TargetChain, DstEdge); - if (DstChain != this && DstChain != Other) { + if (DstChain != this && DstChain != Other) DstChain->addEdge(this, DstEdge); - } } else { CurEdge->moveJumps(DstEdge); } - // Cleanup leftover edge - if (DstChain != Other) { + // Cleanup leftover edge. + if (DstChain != Other) DstChain->removeEdge(Other); - } } } @@ -512,7 +524,7 @@ private: MergedChain mergeNodes(const std::vector &X, const std::vector &Y, size_t MergeOffset, MergeTypeT MergeType) { - // Split the first chain, X, into X1 and X2 + // Split the first chain, X, into X1 and X2. NodeIter BeginX1 = X.begin(); NodeIter EndX1 = X.begin() + MergeOffset; NodeIter BeginX2 = X.begin() + MergeOffset; @@ -520,7 +532,7 @@ MergedChain mergeNodes(const std::vector &X, NodeIter BeginY = Y.begin(); NodeIter EndY = Y.end(); - // Construct a new chain from the three existing ones + // Construct a new chain from the three existing ones. 
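  // For example, with X = [x1, x2, x3, x4] split at MergeOffset = 2 into
  // X1 = [x1, x2], X2 = [x3, x4], and Y = [y1, y2], the cases below yield:
  //   X_Y:     x1 x2 x3 x4 y1 y2   (plain concatenation)
  //   X1_Y_X2: x1 x2 y1 y2 x3 x4
  //   Y_X2_X1: y1 y2 x3 x4 x1 x2
  //   X2_X1_Y: x3 x4 x1 x2 y1 y2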
switch (MergeType) { case MergeTypeT::X_Y: return MergedChain(BeginX1, EndX2, BeginY, EndY); @@ -539,15 +551,14 @@ MergedChain mergeNodes(const std::vector &X, /// The implementation of the ExtTSP algorithm. class ExtTSPImpl { public: - ExtTSPImpl(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) + ExtTSPImpl(ArrayRef NodeSizes, ArrayRef NodeCounts, + ArrayRef EdgeCounts) : NumNodes(NodeSizes.size()) { initialize(NodeSizes, NodeCounts, EdgeCounts); } /// Run the algorithm and return an optimized ordering of nodes. - void run(std::vector &Result) { + std::vector run() { // Pass 1: Merge nodes with their mutually forced successors mergeForcedPairs(); @@ -558,20 +569,20 @@ public: mergeColdChains(); // Collect nodes from all chains - concatChains(Result); + return concatChains(); } private: /// Initialize the algorithm's data structures. - void initialize(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) { + void initialize(const ArrayRef &NodeSizes, + const ArrayRef &NodeCounts, + const ArrayRef &EdgeCounts) { // Initialize nodes AllNodes.reserve(NumNodes); for (uint64_t Idx = 0; Idx < NumNodes; Idx++) { uint64_t Size = std::max(NodeSizes[Idx], 1ULL); uint64_t ExecutionCount = NodeCounts[Idx]; - // The execution count of the entry node is set to at least one + // The execution count of the entry node is set to at least one. if (Idx == 0 && ExecutionCount == 0) ExecutionCount = 1; AllNodes.emplace_back(Idx, Size, ExecutionCount); @@ -582,21 +593,18 @@ private: PredNodes.resize(NumNodes); std::vector OutDegree(NumNodes, 0); AllJumps.reserve(EdgeCounts.size()); - for (auto It : EdgeCounts) { - uint64_t Pred = It.first.first; - uint64_t Succ = It.first.second; - OutDegree[Pred]++; - // Ignore self-edges - if (Pred == Succ) + for (auto Edge : EdgeCounts) { + ++OutDegree[Edge.src]; + // Ignore self-edges. + if (Edge.src == Edge.dst) continue; - SuccNodes[Pred].push_back(Succ); - PredNodes[Succ].push_back(Pred); - uint64_t ExecutionCount = It.second; - if (ExecutionCount > 0) { - NodeT &PredNode = AllNodes[Pred]; - NodeT &SuccNode = AllNodes[Succ]; - AllJumps.emplace_back(&PredNode, &SuccNode, ExecutionCount); + SuccNodes[Edge.src].push_back(Edge.dst); + PredNodes[Edge.dst].push_back(Edge.src); + if (Edge.count > 0) { + NodeT &PredNode = AllNodes[Edge.src]; + NodeT &SuccNode = AllNodes[Edge.dst]; + AllJumps.emplace_back(&PredNode, &SuccNode, Edge.count); SuccNode.InJumps.push_back(&AllJumps.back()); PredNode.OutJumps.push_back(&AllJumps.back()); } @@ -606,30 +614,29 @@ private: Jump.IsConditional = OutDegree[Jump.Source->Index] > 1; } - // Initialize chains + // Initialize chains. AllChains.reserve(NumNodes); HotChains.reserve(NumNodes); for (NodeT &Node : AllNodes) { AllChains.emplace_back(Node.Index, &Node); Node.CurChain = &AllChains.back(); - if (Node.ExecutionCount > 0) { + if (Node.ExecutionCount > 0) HotChains.push_back(&AllChains.back()); - } } - // Initialize chain edges + // Initialize chain edges. AllEdges.reserve(AllJumps.size()); for (NodeT &PredNode : AllNodes) { for (JumpT *Jump : PredNode.OutJumps) { NodeT *SuccNode = Jump->Target; ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain); - // this edge is already present in the graph + // this edge is already present in the graph. if (CurEdge != nullptr) { assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr); CurEdge->appendJump(Jump); continue; } - // this is a new edge + // this is a new edge. 
AllEdges.emplace_back(Jump); PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back()); SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back()); @@ -642,7 +649,7 @@ private: /// to B are from A. Such nodes should be adjacent in the optimal ordering; /// the method finds and merges such pairs of nodes. void mergeForcedPairs() { - // Find fallthroughs based on edge weights + // Find fallthroughs based on edge weights. for (NodeT &Node : AllNodes) { if (SuccNodes[Node.Index].size() == 1 && PredNodes[SuccNodes[Node.Index][0]].size() == 1 && @@ -669,12 +676,12 @@ private: } if (SuccNode == nullptr) continue; - // Break the cycle + // Break the cycle. AllNodes[Node.ForcedPred->Index].ForcedSucc = nullptr; Node.ForcedPred = nullptr; } - // Merge nodes with their fallthrough successors + // Merge nodes with their fallthrough successors. for (NodeT &Node : AllNodes) { if (Node.ForcedPred == nullptr && Node.ForcedSucc != nullptr) { const NodeT *CurBlock = &Node; @@ -689,7 +696,7 @@ private: /// Merge pairs of chains while improving the ExtTSP objective. void mergeChainPairs() { - /// Deterministically compare pairs of chains + /// Deterministically compare pairs of chains. auto compareChainPairs = [](const ChainT *A1, const ChainT *B1, const ChainT *A2, const ChainT *B2) { if (A1 != A2) @@ -701,21 +708,19 @@ private: ChainT *BestChainPred = nullptr; ChainT *BestChainSucc = nullptr; MergeGainT BestGain; - // Iterate over all pairs of chains + // Iterate over all pairs of chains. for (ChainT *ChainPred : HotChains) { - // Get candidates for merging with the current chain - for (auto EdgeIt : ChainPred->Edges) { - ChainT *ChainSucc = EdgeIt.first; - ChainEdge *Edge = EdgeIt.second; - // Ignore loop edges + // Get candidates for merging with the current chain. + for (const auto &[ChainSucc, Edge] : ChainPred->Edges) { + // Ignore loop edges. if (ChainPred == ChainSucc) continue; - // Stop early if the combined chain violates the maximum allowed size + // Stop early if the combined chain violates the maximum allowed size. if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize) continue; - // Compute the gain of merging the two chains + // Compute the gain of merging the two chains. MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge); if (CurGain.score() <= EPS) continue; @@ -731,11 +736,11 @@ private: } } - // Stop merging when there is no improvement + // Stop merging when there is no improvement. if (BestGain.score() <= EPS) break; - // Merge the best pair of chains + // Merge the best pair of chains. mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(), BestGain.mergeType()); } @@ -743,7 +748,7 @@ private: /// Merge remaining nodes into chains w/o taking jump counts into /// consideration. This allows to maintain the original node order in the - /// absence of profile data + /// absence of profile data. void mergeColdChains() { for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) { // Iterating in reverse order to make sure original fallthrough jumps are @@ -797,7 +802,7 @@ private: return Edge->getCachedMergeGain(ChainPred, ChainSucc); } - // Precompute jumps between ChainPred and ChainSucc + // Precompute jumps between ChainPred and ChainSucc. 
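    // Jumps internal to ChainPred (its self-edge, queried via
    // getEdge(ChainPred) below) are also collected: if ChainPred gets split
    // by the merge, the distance between their endpoints changes, so they
    // contribute to the gain as well.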
auto Jumps = Edge->jumps(); ChainEdge *EdgePP = ChainPred->getEdge(ChainPred); if (EdgePP != nullptr) { @@ -805,34 +810,34 @@ private: } assert(!Jumps.empty() && "trying to merge chains w/o jumps"); - // The object holds the best currently chosen gain of merging the two chains + // This object holds the best chosen gain of merging two chains. MergeGainT Gain = MergeGainT(); /// Given a merge offset and a list of merge types, try to merge two chains - /// and update Gain with a better alternative + /// and update Gain with a better alternative. auto tryChainMerging = [&](size_t Offset, const std::vector &MergeTypes) { - // Skip merging corresponding to concatenation w/o splitting + // Skip merging corresponding to concatenation w/o splitting. if (Offset == 0 || Offset == ChainPred->Nodes.size()) return; - // Skip merging if it breaks Forced successors + // Skip merging if it breaks Forced successors. NodeT *Node = ChainPred->Nodes[Offset - 1]; if (Node->ForcedSucc != nullptr) return; // Apply the merge, compute the corresponding gain, and update the best - // value, if the merge is beneficial + // value, if the merge is beneficial. for (const MergeTypeT &MergeType : MergeTypes) { Gain.updateIfLessThan( computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType)); } }; - // Try to concatenate two chains w/o splitting + // Try to concatenate two chains w/o splitting. Gain.updateIfLessThan( computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y)); if (EnableChainSplitAlongJumps) { - // Attach (a part of) ChainPred before the first node of ChainSucc + // Attach (a part of) ChainPred before the first node of ChainSucc. for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) { const NodeT *SrcBlock = Jump->Source; if (SrcBlock->CurChain != ChainPred) @@ -841,7 +846,7 @@ private: tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y}); } - // Attach (a part of) ChainPred after the last node of ChainSucc + // Attach (a part of) ChainPred after the last node of ChainSucc. for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) { const NodeT *DstBlock = Jump->Source; if (DstBlock->CurChain != ChainPred) @@ -851,12 +856,12 @@ private: } } - // Try to break ChainPred in various ways and concatenate with ChainSucc + // Try to break ChainPred in various ways and concatenate with ChainSucc. if (ChainPred->Nodes.size() <= ChainSplitThreshold) { for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) { // Try to split the chain in different ways. In practice, applying // X2_Y_X1 merging is almost never provides benefits; thus, we exclude - // it from consideration to reduce the search space + // it from consideration to reduce the search space. tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1, MergeTypeT::X2_X1_Y}); } @@ -875,12 +880,12 @@ private: auto MergedBlocks = mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType); - // Do not allow a merge that does not preserve the original entry point + // Do not allow a merge that does not preserve the original entry point. if ((ChainPred->isEntry() || ChainSucc->isEntry()) && !MergedBlocks.getFirstNode()->isEntry()) return MergeGainT(); - // The gain for the new chain + // The gain for the new chain. auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->Score; return MergeGainT(NewGainScore, MergeOffset, MergeType); } @@ -891,39 +896,39 @@ private: MergeTypeT MergeType) { assert(Into != From && "a chain cannot be merged with itself"); - // Merge the nodes + // Merge the nodes. 
    MergedChain MergedNodes =
        mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
    Into->merge(From, MergedNodes.getNodes());
 
-    // Merge the edges
+    // Merge the edges.
    Into->mergeEdges(From);
    From->clear();
 
-    // Update cached ext-tsp score for the new chain
+    // Update cached ext-tsp score for the new chain.
    ChainEdge *SelfEdge = Into->getEdge(Into);
    if (SelfEdge != nullptr) {
      MergedNodes = MergedChain(Into->Nodes.begin(), Into->Nodes.end());
      Into->Score = extTSPScore(MergedNodes, SelfEdge->jumps());
    }
 
-    // Remove the chain from the list of active chains
+    // Remove the chain from the list of active chains.
    llvm::erase_value(HotChains, From);
 
-    // Invalidate caches
+    // Invalidate caches.
    for (auto EdgeIt : Into->Edges)
      EdgeIt.second->invalidateCache();
  }
 
  /// Concatenate all chains into the final order.
-  void concatChains(std::vector<uint64_t> &Order) {
-    // Collect chains and calculate density stats for their sorting
+  std::vector<uint64_t> concatChains() {
+    // Collect chains and calculate density stats for their sorting.
    std::vector<ChainT *> SortedChains;
    DenseMap<const ChainT *, double> ChainDensity;
    for (ChainT &Chain : AllChains) {
      if (!Chain.Nodes.empty()) {
        SortedChains.push_back(&Chain);
-        // Using doubles to avoid overflow of ExecutionCounts
+        // Using doubles to avoid overflow of ExecutionCounts.
        double Size = 0;
        double ExecutionCount = 0;
        for (NodeT *Node : Chain.Nodes) {
@@ -935,27 +940,28 @@ private:
      }
    }
 
-    // Sorting chains by density in the decreasing order
-    std::stable_sort(SortedChains.begin(), SortedChains.end(),
-                     [&](const ChainT *L, const ChainT *R) {
-                       // Make sure the original entry point is at the
-                       // beginning of the order
-                       if (L->isEntry() != R->isEntry())
-                         return L->isEntry();
-
-                       const double DL = ChainDensity[L];
-                       const double DR = ChainDensity[R];
-                       // Compare by density and break ties by chain identifiers
-                       return (DL != DR) ? (DL > DR) : (L->Id < R->Id);
-                     });
-
-    // Collect the nodes in the order specified by their chains
+    // Sorting chains by density in the decreasing order.
+    std::sort(SortedChains.begin(), SortedChains.end(),
+              [&](const ChainT *L, const ChainT *R) {
+                // Place the entry point at the beginning of the order.
+                if (L->isEntry() != R->isEntry())
+                  return L->isEntry();
+
+                const double DL = ChainDensity[L];
+                const double DR = ChainDensity[R];
+                // Compare by density and break ties by chain identifiers.
+                return std::make_tuple(-DL, L->Id) <
+                       std::make_tuple(-DR, R->Id);
+              });
+
+    // Collect the nodes in the order specified by their chains.
+    std::vector<uint64_t> Order;
    Order.reserve(NumNodes);
-    for (const ChainT *Chain : SortedChains) {
-      for (NodeT *Node : Chain->Nodes) {
+    for (const ChainT *Chain : SortedChains)
+      for (NodeT *Node : Chain->Nodes)
        Order.push_back(Node->Index);
-      }
-    }
+    return Order;
  }
 
private:
@@ -984,61 +990,466 @@ private:
  std::vector<ChainT *> HotChains;
};
 
+/// The implementation of the Cache-Directed Sort (CDS) algorithm for ordering
+/// functions represented by a call graph.
+class CDSortImpl {
+public:
+  CDSortImpl(const CDSortConfig &Config, ArrayRef<uint64_t> NodeSizes,
+             ArrayRef<uint64_t> NodeCounts, ArrayRef<EdgeCount> EdgeCounts,
+             ArrayRef<uint64_t> EdgeOffsets)
+      : Config(Config), NumNodes(NodeSizes.size()) {
+    initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets);
+  }
+
+  /// Run the algorithm and return an ordered set of function clusters.
+  std::vector<uint64_t> run() {
+    // Merge pairs of chains while improving the objective.
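    // The objective combines the two locality terms computed in
    // computeMergeGain below:
    //   Gain = DistGain + Config.FrequencyScale * FreqGain
    // where FreqGain models cache misses through chain densities and
    // DistGain rewards short caller-callee distances, damped by
    // pow(Distance, -Config.DistancePower).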
+    mergeChainPairs();
+
+    LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number"
+                      << " of chains from " << NumNodes << " to "
+                      << HotChains.size() << "\n");
+
+    // Collect nodes from all the chains.
+    return concatChains();
+  }
+
+private:
+  /// Initialize the algorithm's data structures.
+  void initialize(const ArrayRef<uint64_t> &NodeSizes,
+                  const ArrayRef<uint64_t> &NodeCounts,
+                  const ArrayRef<EdgeCount> &EdgeCounts,
+                  const ArrayRef<uint64_t> &EdgeOffsets) {
+    // Initialize nodes.
+    AllNodes.reserve(NumNodes);
+    for (uint64_t Node = 0; Node < NumNodes; Node++) {
+      uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
+      uint64_t ExecutionCount = NodeCounts[Node];
+      AllNodes.emplace_back(Node, Size, ExecutionCount);
+      TotalSamples += ExecutionCount;
+      if (ExecutionCount > 0)
+        TotalSize += Size;
+    }
+
+    // Initialize jumps between the nodes.
+    SuccNodes.resize(NumNodes);
+    PredNodes.resize(NumNodes);
+    AllJumps.reserve(EdgeCounts.size());
+    for (size_t I = 0; I < EdgeCounts.size(); I++) {
+      auto [Pred, Succ, Count] = EdgeCounts[I];
+      // Ignore recursive calls.
+      if (Pred == Succ)
+        continue;
+
+      SuccNodes[Pred].push_back(Succ);
+      PredNodes[Succ].push_back(Pred);
+      if (Count > 0) {
+        NodeT &PredNode = AllNodes[Pred];
+        NodeT &SuccNode = AllNodes[Succ];
+        AllJumps.emplace_back(&PredNode, &SuccNode, Count);
+        AllJumps.back().Offset = EdgeOffsets[I];
+        SuccNode.InJumps.push_back(&AllJumps.back());
+        PredNode.OutJumps.push_back(&AllJumps.back());
+      }
+    }
+
+    // Initialize chains.
+    AllChains.reserve(NumNodes);
+    HotChains.reserve(NumNodes);
+    for (NodeT &Node : AllNodes) {
+      // Adjust execution counts.
+      Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount());
+      Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount());
+      // Create chain.
+      AllChains.emplace_back(Node.Index, &Node);
+      Node.CurChain = &AllChains.back();
+      if (Node.ExecutionCount > 0)
+        HotChains.push_back(&AllChains.back());
+    }
+
+    // Initialize chain edges.
+    AllEdges.reserve(AllJumps.size());
+    for (NodeT &PredNode : AllNodes) {
+      for (JumpT *Jump : PredNode.OutJumps) {
+        NodeT *SuccNode = Jump->Target;
+        ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
+        // this edge is already present in the graph.
+        if (CurEdge != nullptr) {
+          assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
+          CurEdge->appendJump(Jump);
+          continue;
+        }
+        // this is a new edge.
+        AllEdges.emplace_back(Jump);
+        PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
+        SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
+      }
+    }
+  }
+
+  /// Merge pairs of chains while there is an improvement in the objective.
+  void mergeChainPairs() {
+    // Create a priority queue containing all edges ordered by the merge gain.
+    auto GainComparator = [](ChainEdge *L, ChainEdge *R) {
+      return std::make_tuple(-L->gain(), L->srcChain()->Id, L->dstChain()->Id) <
+             std::make_tuple(-R->gain(), R->srcChain()->Id, R->dstChain()->Id);
+    };
+    std::set<ChainEdge *, decltype(GainComparator)> Queue(GainComparator);
+
+    // Insert the edges into the queue.
+    for (ChainT *ChainPred : HotChains) {
+      for (const auto &[Chain, Edge] : ChainPred->Edges) {
+        // Ignore self-edges.
+        if (Edge->isSelfEdge())
+          continue;
+        // Ignore already processed edges.
+        if (Edge->gain() != -1.0)
+          continue;
+
+        // Compute the gain of merging the two chains.
+        MergeGainT Gain = getBestMergeGain(Edge);
+        Edge->setMergeGain(Gain);
+
+        if (Edge->gain() > EPS)
+          Queue.insert(Edge);
+      }
+    }
+
+    // Merge the chains while the gain of merging is positive.
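    // A merge invalidates the cached gains of every edge incident to the two
    // chains involved, so the loop below erases those edges from the queue
    // before merging and re-inserts them with recomputed gains afterwards;
    // the self-edge and gain checks at the top of the loop drop entries that
    // went stale while queued.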
+ while (!Queue.empty()) { + // Extract the best (top) edge for merging. + ChainEdge *BestEdge = *Queue.begin(); + Queue.erase(Queue.begin()); + // Ignore self-edges. + if (BestEdge->isSelfEdge()) + continue; + // Ignore edges with non-positive gains. + if (BestEdge->gain() <= EPS) + continue; + + ChainT *BestSrcChain = BestEdge->srcChain(); + ChainT *BestDstChain = BestEdge->dstChain(); + + // Remove outdated edges from the queue. + for (const auto &[Chain, ChainEdge] : BestSrcChain->Edges) + Queue.erase(ChainEdge); + for (const auto &[Chain, ChainEdge] : BestDstChain->Edges) + Queue.erase(ChainEdge); + + // Merge the best pair of chains. + MergeGainT BestGain = BestEdge->getMergeGain(); + mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(), + BestGain.mergeType()); + + // Insert newly created edges into the queue. + for (const auto &[Chain, Edge] : BestSrcChain->Edges) { + // Ignore loop edges. + if (Edge->isSelfEdge()) + continue; + + // Compute the gain of merging the two chains. + MergeGainT Gain = getBestMergeGain(Edge); + Edge->setMergeGain(Gain); + + if (Edge->gain() > EPS) + Queue.insert(Edge); + } + } + } + + /// Compute the gain of merging two chains. + /// + /// The function considers all possible ways of merging two chains and + /// computes the one having the largest increase in ExtTSP objective. The + /// result is a pair with the first element being the gain and the second + /// element being the corresponding merging type. + MergeGainT getBestMergeGain(ChainEdge *Edge) const { + // Precompute jumps between ChainPred and ChainSucc. + auto Jumps = Edge->jumps(); + assert(!Jumps.empty() && "trying to merge chains w/o jumps"); + ChainT *SrcChain = Edge->srcChain(); + ChainT *DstChain = Edge->dstChain(); + + // This object holds the best currently chosen gain of merging two chains. + MergeGainT Gain = MergeGainT(); + + /// Given a list of merge types, try to merge two chains and update Gain + /// with a better alternative. + auto tryChainMerging = [&](const std::vector &MergeTypes) { + // Apply the merge, compute the corresponding gain, and update the best + // value, if the merge is beneficial. + for (const MergeTypeT &MergeType : MergeTypes) { + MergeGainT NewGain = + computeMergeGain(SrcChain, DstChain, Jumps, MergeType); + + // When forward and backward gains are the same, prioritize merging that + // preserves the original order of the functions in the binary. + if (std::abs(Gain.score() - NewGain.score()) < EPS) { + if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) || + (MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) { + Gain = NewGain; + } + } else if (NewGain.score() > Gain.score() + EPS) { + Gain = NewGain; + } + } + }; + + // Try to concatenate two chains w/o splitting. + tryChainMerging({MergeTypeT::X_Y, MergeTypeT::Y_X}); + + return Gain; + } + + /// Compute the score gain of merging two chains, respecting a given type. + /// + /// The two chains are not modified in the method. + MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc, + const std::vector &Jumps, + MergeTypeT MergeType) const { + // This doesn't depend on the ordering of the nodes + double FreqGain = freqBasedLocalityGain(ChainPred, ChainSucc); + + // Merge offset is always 0, as the chains are not split. 
+ size_t MergeOffset = 0; + auto MergedBlocks = + mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType); + double DistGain = distBasedLocalityGain(MergedBlocks, Jumps); + + double GainScore = DistGain + Config.FrequencyScale * FreqGain; + // Scale the result to increase the importance of merging short chains. + if (GainScore >= 0.0) + GainScore /= std::min(ChainPred->Size, ChainSucc->Size); + + return MergeGainT(GainScore, MergeOffset, MergeType); + } + + /// Compute the change of the frequency locality after merging the chains. + double freqBasedLocalityGain(ChainT *ChainPred, ChainT *ChainSucc) const { + auto missProbability = [&](double ChainDensity) { + double PageSamples = ChainDensity * Config.CacheSize; + if (PageSamples >= TotalSamples) + return 0.0; + double P = PageSamples / TotalSamples; + return pow(1.0 - P, static_cast(Config.CacheEntries)); + }; + + // Cache misses on the chains before merging. + double CurScore = + ChainPred->ExecutionCount * missProbability(ChainPred->density()) + + ChainSucc->ExecutionCount * missProbability(ChainSucc->density()); + + // Cache misses on the merged chain + double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount; + double MergedSize = ChainPred->Size + ChainSucc->Size; + double MergedDensity = static_cast(MergedCounts) / MergedSize; + double NewScore = MergedCounts * missProbability(MergedDensity); + + return CurScore - NewScore; + } + + /// Compute the distance locality for a jump / call. + double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const { + uint64_t Dist = SrcAddr <= DstAddr ? DstAddr - SrcAddr : SrcAddr - DstAddr; + double D = Dist == 0 ? 0.1 : static_cast(Dist); + return static_cast(Count) * std::pow(D, -Config.DistancePower); + } + + /// Compute the change of the distance locality after merging the chains. + double distBasedLocalityGain(const MergedChain &MergedBlocks, + const std::vector &Jumps) const { + if (Jumps.empty()) + return 0.0; + uint64_t CurAddr = 0; + MergedBlocks.forEach([&](const NodeT *Node) { + Node->EstimatedAddr = CurAddr; + CurAddr += Node->Size; + }); + + double CurScore = 0; + double NewScore = 0; + for (const JumpT *Arc : Jumps) { + uint64_t SrcAddr = Arc->Source->EstimatedAddr + Arc->Offset; + uint64_t DstAddr = Arc->Target->EstimatedAddr; + NewScore += distScore(SrcAddr, DstAddr, Arc->ExecutionCount); + CurScore += distScore(0, TotalSize, Arc->ExecutionCount); + } + return NewScore - CurScore; + } + + /// Merge chain From into chain Into, update the list of active chains, + /// adjacency information, and the corresponding cached values. + void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset, + MergeTypeT MergeType) { + assert(Into != From && "a chain cannot be merged with itself"); + + // Merge the nodes. + MergedChain MergedNodes = + mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType); + Into->merge(From, MergedNodes.getNodes()); + + // Merge the edges. + Into->mergeEdges(From); + From->clear(); + + // Remove the chain from the list of active chains. + llvm::erase_value(HotChains, From); + } + + /// Concatenate all chains into the final order. + std::vector concatChains() { + // Collect chains and calculate density stats for their sorting. + std::vector SortedChains; + DenseMap ChainDensity; + for (ChainT &Chain : AllChains) { + if (!Chain.Nodes.empty()) { + SortedChains.push_back(&Chain); + // Using doubles to avoid overflow of ExecutionCounts. 
+        double Size = 0;
+        double ExecutionCount = 0;
+        for (NodeT *Node : Chain.Nodes) {
+          Size += static_cast<double>(Node->Size);
+          ExecutionCount += static_cast<double>(Node->ExecutionCount);
+        }
+        assert(Size > 0 && "a chain of zero size");
+        ChainDensity[&Chain] = ExecutionCount / Size;
+      }
+    }
+
+    // Sort chains by density in the decreasing order.
+    std::sort(SortedChains.begin(), SortedChains.end(),
+              [&](const ChainT *L, const ChainT *R) {
+                const double DL = ChainDensity[L];
+                const double DR = ChainDensity[R];
+                // Compare by density and break ties by chain identifiers.
+                return std::make_tuple(-DL, L->Id) <
+                       std::make_tuple(-DR, R->Id);
+              });
+
+    // Collect the nodes in the order specified by their chains.
+    std::vector<uint64_t> Order;
+    Order.reserve(NumNodes);
+    for (const ChainT *Chain : SortedChains)
+      for (NodeT *Node : Chain->Nodes)
+        Order.push_back(Node->Index);
+    return Order;
+  }
+
+private:
+  /// Config for the algorithm.
+  const CDSortConfig Config;
+
+  /// The number of nodes in the graph.
+  const size_t NumNodes;
+
+  /// Successors of each node.
+  std::vector<std::vector<uint64_t>> SuccNodes;
+
+  /// Predecessors of each node.
+  std::vector<std::vector<uint64_t>> PredNodes;
+
+  /// All nodes (functions) in the graph.
+  std::vector<NodeT> AllNodes;
+
+  /// All jumps (function calls) between the nodes.
+  std::vector<JumpT> AllJumps;
+
+  /// All chains of nodes.
+  std::vector<ChainT> AllChains;
+
+  /// All edges between the chains.
+  std::vector<ChainEdge> AllEdges;
+
+  /// Active chains. The vector gets updated at runtime when chains are merged.
+  std::vector<ChainT *> HotChains;
+
+  /// The total number of samples in the graph.
+  uint64_t TotalSamples{0};
+
+  /// The total size of the nodes in the graph.
+  uint64_t TotalSize{0};
+};
+
 } // end of anonymous namespace
 
 std::vector<uint64_t>
-llvm::applyExtTspLayout(const std::vector<uint64_t> &NodeSizes,
-                        const std::vector<uint64_t> &NodeCounts,
-                        const std::vector<EdgeCountT> &EdgeCounts) {
-  // Verify correctness of the input data
+codelayout::computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,
+                                ArrayRef<uint64_t> NodeCounts,
+                                ArrayRef<EdgeCount> EdgeCounts) {
+  // Verify correctness of the input data.
   assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
   assert(NodeSizes.size() > 2 && "Incorrect input");
 
-  // Apply the reordering algorithm
+  // Apply the reordering algorithm.
   ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts);
-  std::vector<uint64_t> Result;
-  Alg.run(Result);
+  std::vector<uint64_t> Result = Alg.run();
 
-  // Verify correctness of the output
+  // Verify correctness of the output.
   assert(Result.front() == 0 && "Original entry point is not preserved");
   assert(Result.size() == NodeSizes.size() && "Incorrect size of layout");
   return Result;
 }
 
-double llvm::calcExtTspScore(const std::vector<uint64_t> &Order,
-                             const std::vector<uint64_t> &NodeSizes,
-                             const std::vector<uint64_t> &NodeCounts,
-                             const std::vector<EdgeCountT> &EdgeCounts) {
-  // Estimate addresses of the blocks in memory
+double codelayout::calcExtTspScore(ArrayRef<uint64_t> Order,
+                                   ArrayRef<uint64_t> NodeSizes,
+                                   ArrayRef<uint64_t> NodeCounts,
+                                   ArrayRef<EdgeCount> EdgeCounts) {
+  // Estimate addresses of the blocks in memory.
   std::vector<uint64_t> Addr(NodeSizes.size(), 0);
   for (size_t Idx = 1; Idx < Order.size(); Idx++) {
     Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
   }
   std::vector<uint64_t> OutDegree(NodeSizes.size(), 0);
-  for (auto It : EdgeCounts) {
-    uint64_t Pred = It.first.first;
-    OutDegree[Pred]++;
-  }
+  for (auto Edge : EdgeCounts)
+    ++OutDegree[Edge.src];
 
-  // Increase the score for each jump
+  // Increase the score for each jump.
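  // Each jump contributes extTSPScore(SrcAddr, SrcSize, DstAddr, Count,
  // IsConditional): a fallthrough earns the count times a fallthrough weight,
  // a short forward or backward jump earns the count times a distance-damped
  // weight, and jumps beyond the distance thresholds earn nothing.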
double Score = 0; - for (auto It : EdgeCounts) { - uint64_t Pred = It.first.first; - uint64_t Succ = It.first.second; - uint64_t Count = It.second; - bool IsConditional = OutDegree[Pred] > 1; - Score += ::extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count, - IsConditional); + for (auto Edge : EdgeCounts) { + bool IsConditional = OutDegree[Edge.src] > 1; + Score += ::extTSPScore(Addr[Edge.src], NodeSizes[Edge.src], Addr[Edge.dst], + Edge.count, IsConditional); } return Score; } -double llvm::calcExtTspScore(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) { +double codelayout::calcExtTspScore(ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts) { std::vector Order(NodeSizes.size()); for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) { Order[Idx] = Idx; } return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts); } + +std::vector codelayout::computeCacheDirectedLayout( + const CDSortConfig &Config, ArrayRef FuncSizes, + ArrayRef FuncCounts, ArrayRef CallCounts, + ArrayRef CallOffsets) { + // Verify correctness of the input data. + assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input"); + + // Apply the reordering algorithm. + CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets); + std::vector Result = Alg.run(); + assert(Result.size() == FuncSizes.size() && "Incorrect size of layout"); + return Result; +} + +std::vector codelayout::computeCacheDirectedLayout( + ArrayRef FuncSizes, ArrayRef FuncCounts, + ArrayRef CallCounts, ArrayRef CallOffsets) { + CDSortConfig Config; + // Populate the config from the command-line options. + if (CacheEntries.getNumOccurrences() > 0) + Config.CacheEntries = CacheEntries; + if (CacheSize.getNumOccurrences() > 0) + Config.CacheSize = CacheSize; + if (DistancePower.getNumOccurrences() > 0) + Config.DistancePower = DistancePower; + if (FrequencyScale.getNumOccurrences() > 0) + Config.FrequencyScale = FrequencyScale; + return computeCacheDirectedLayout(Config, FuncSizes, FuncCounts, CallCounts, + CallOffsets); +} diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 00dd5206d75a756005e663bc2400faf3811e4793..8e17f53c62c6ef9f83061f4246eb00396ed4d0dd 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -501,7 +501,7 @@ public: auto FixedAI = std::move(AI); FixedAI.MappingBase -= DeltaAddr; for (auto &Seg : FixedAI.Segments) - Seg.AG = {MemProt::Read | MemProt::Write, Seg.AG.getMemLifetimePolicy()}; + Seg.AG = {MemProt::Read | MemProt::Write, Seg.AG.getMemLifetime()}; FixedAI.Actions.clear(); InProcessMemoryMapper::initialize( FixedAI, [this, OnInitialized = std::move(OnInitialized)]( diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp index 28ecf1af321f1eda338155de5c0f982c6ca77efc..abaf1e123145cd25ab051c07b2054065f2499645 100644 --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -581,9 +581,14 @@ int main(int argc, char **argv) { // the facility for updating public visibility to linkage unit visibility when // specified by an internal option. This is normally done during LTO which is // not performed via opt. - updateVCallVisibilityInModule(*M, - /* WholeProgramVisibilityEnabledInLTO */ false, - /* DynamicExportSymbols */ {}); + updateVCallVisibilityInModule( + *M, + /*WholeProgramVisibilityEnabledInLTO=*/false, + // FIXME: These need linker information via a + // TBD new interface. 
+ /*DynamicExportSymbols=*/{}, + /*ValidateAllVtablesHaveTypeInfos=*/false, + /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); // Figure out what stream we are supposed to write to... std::unique_ptr Out; diff --git a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp index ff153f6d4b32766c0924604ac1fbcbb8934db08f..711f35fc7683c3190e785d0cf5ea5f0d0f65a9c8 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp @@ -798,7 +798,7 @@ TEST(LinkGraphTest, BasicLayoutHonorsNoAlloc) { // Create a NoAlloc section and block. auto &Sec2 = G.createSection("__metadata", orc::MemProt::Read | orc::MemProt::Write); - Sec2.setMemLifetimePolicy(orc::MemLifetimePolicy::NoAlloc); + Sec2.setMemLifetime(orc::MemLifetime::NoAlloc); G.createContentBlock(Sec2, BlockContent.slice(0, 8), orc::ExecutorAddr(), 8, 0);
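For reference, a minimal sketch of driving the new CDS entry point directly;
the call-graph data is hypothetical and only the declarations this patch adds
to llvm/include/llvm/Transforms/Utils/CodeLayout.h are assumed:

  #include "llvm/Transforms/Utils/CodeLayout.h"
  using namespace llvm;

  std::vector<uint64_t> orderThreeFunctions() {
    // Functions 0..2; calls: 0->1 (hot, offset 4) and 0->2 (cold, offset 8).
    std::vector<uint64_t> FuncSizes = {32, 16, 64};
    std::vector<uint64_t> FuncCounts = {100, 95, 5};
    std::vector<codelayout::EdgeCount> CallCounts = {{0, 1, 95}, {0, 2, 5}};
    std::vector<uint64_t> CallOffsets = {4, 8};
    // Returns a permutation of {0, 1, 2} co-locating hot caller-callee pairs.
    return codelayout::computeCacheDirectedLayout(FuncSizes, FuncCounts,
                                                  CallCounts, CallOffsets);
  }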