diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt
index 89462f8a14c1441a4150907279163196eee607ea..f163d45342874d0e8dd2a64a292e735ddf3cd626 100644
--- a/bolt/CMakeLists.txt
+++ b/bolt/CMakeLists.txt
@@ -35,7 +35,8 @@ set(BOLT_ENABLE_RUNTIME_default OFF)
 if ((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64"
      OR CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
     AND (CMAKE_SYSTEM_NAME STREQUAL "Linux"
-         OR CMAKE_SYSTEM_NAME STREQUAL "Darwin"))
+         OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+    AND (NOT CMAKE_CROSSCOMPILING))
   set(BOLT_ENABLE_RUNTIME_default ON)
 endif()
 option(BOLT_ENABLE_RUNTIME "Enable BOLT runtime" ${BOLT_ENABLE_RUNTIME_default})
diff --git a/bolt/include/bolt/Core/AddressMap.h b/bolt/include/bolt/Core/AddressMap.h
new file mode 100644
index 0000000000000000000000000000000000000000..85a9ab4473aafedd192145acaa334208e22ac6e6
--- /dev/null
+++ b/bolt/include/bolt/Core/AddressMap.h
@@ -0,0 +1,79 @@
+//===- bolt/Core/AddressMap.h - Input-output address map --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the AddressMap class used for looking
+// up addresses in the output object.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BOLT_CORE_ADDRESS_MAP_H
+#define BOLT_CORE_ADDRESS_MAP_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCSymbol.h"
+
+#include <optional>
+#include <unordered_map>
+
+namespace llvm {
+
+class MCStreamer;
+
+namespace bolt {
+
+class BinaryContext;
+
+/// Helper class to create a mapping from input entities to output addresses
+/// needed for updating debugging symbols and BAT. We emit a section containing
+/// <input entity, output MCSymbol> pairs to the object file, and JITLink will
+/// transform this into <input entity, output address> pairs. The linker output
+/// can then be parsed and used to establish the mapping.
+///
+/// The entities that can be mapped to output addresses are input addresses and
+/// labels (MCSymbol). Input addresses support one-to-many mapping.
+class AddressMap {
+  static const char *const AddressSectionName;
+  static const char *const LabelSectionName;
+
+  /// Map multiple <input address> to <output address>.
+  using Addr2AddrMapTy = std::unordered_multimap<uint64_t, uint64_t>;
+  Addr2AddrMapTy Address2AddressMap;
+
+  /// Map MCSymbol to its output address. Normally used for temp symbols that
+  /// are not updated by the linker.
+  using Label2AddrMapTy = DenseMap<const MCSymbol *, uint64_t>;
+  Label2AddrMapTy Label2AddrMap;
+
+public:
+  static void emit(MCStreamer &Streamer, BinaryContext &BC);
+  static std::optional<AddressMap> parse(BinaryContext &BC);
+
+  std::optional<uint64_t> lookup(uint64_t InputAddress) const {
+    auto It = Address2AddressMap.find(InputAddress);
+    if (It != Address2AddressMap.end())
+      return It->second;
+    return std::nullopt;
+  }
+
+  std::optional<uint64_t> lookup(const MCSymbol *Symbol) const {
+    auto It = Label2AddrMap.find(Symbol);
+    if (It != Label2AddrMap.end())
+      return It->second;
+    return std::nullopt;
+  }
+
+  std::pair<Addr2AddrMapTy::const_iterator, Addr2AddrMapTy::const_iterator>
+  lookupAll(uint64_t InputAddress) const {
+    return Address2AddressMap.equal_range(InputAddress);
+  }
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/include/bolt/Core/BinaryBasicBlock.h b/bolt/include/bolt/Core/BinaryBasicBlock.h
index 02be9c1d4f118d9edc65202395bb879177cb5412..bc95e2c4de3a11ec2b94e9faf7575ce696856611 100644
--- a/bolt/include/bolt/Core/BinaryBasicBlock.h
+++ b/bolt/include/bolt/Core/BinaryBasicBlock.h
@@ -100,16 +100,6 @@ private:
   using LocSymsTy = std::vector<std::pair<uint32_t, const MCSymbol *>>;
   std::unique_ptr<LocSymsTy> LocSyms;
 
-  /// After output/codegen, map output offsets of instructions in this basic
-  /// block to instruction offsets in the original function. Note that the
-  /// output basic block could be different from the input basic block.
-  /// We only map instructions of interest, such as calls and markers.
-  ///
-  /// We store the offset array in a basic block to facilitate BAT tables
-  /// generation. Otherwise, the mapping could be done at function level.
-  using OffsetTranslationTableTy = std::vector<std::pair<uint32_t, uint32_t>>;
-  std::unique_ptr<OffsetTranslationTableTy> OffsetTranslationTable;
-
   /// Alignment requirements for the block.
   uint32_t Alignment{1};
@@ -828,8 +818,7 @@ public:
     return OutputAddressRange;
   }
 
-  /// Update addresses of special instructions inside this basic block.
-  void updateOutputValues(const MCAsmLayout &Layout);
+  bool hasLocSyms() const { return LocSyms != nullptr; }
 
   /// Return mapping of input offsets to symbols in the output.
   LocSymsTy &getLocSyms() {
@@ -841,19 +830,6 @@
     return const_cast<BinaryBasicBlock *>(this)->getLocSyms();
   }
 
-  /// Return offset translation table for the basic block.
-  OffsetTranslationTableTy &getOffsetTranslationTable() {
-    return OffsetTranslationTable
-               ? *OffsetTranslationTable
-               : *(OffsetTranslationTable =
-                       std::make_unique<OffsetTranslationTableTy>());
-  }
-
-  /// Return offset translation table for the basic block.
-  const OffsetTranslationTableTy &getOffsetTranslationTable() const {
-    return const_cast<BinaryBasicBlock *>(this)->getOffsetTranslationTable();
-  }
-
   /// Return size of the basic block in the output binary.
   uint64_t getOutputSize() const {
     return OutputAddressRange.second - OutputAddressRange.first;
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index 79f91985c4920c521540223141e908644d58701f..ef0a1c6f68320bc74bb43b179d21db09ad52c13a 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -13,6 +13,7 @@
 #ifndef BOLT_CORE_BINARY_CONTEXT_H
 #define BOLT_CORE_BINARY_CONTEXT_H
 
+#include "bolt/Core/AddressMap.h"
 #include "bolt/Core/BinaryData.h"
 #include "bolt/Core/BinarySection.h"
 #include "bolt/Core/DebugData.h"
@@ -221,6 +222,9 @@ class BinaryContext {
   bool ContainsDwarf5{false};
   bool ContainsDwarfLegacy{false};
 
+  /// Mapping from input to output addresses.
+  std::optional<AddressMap> IOAddressMap;
+
   /// Preprocess DWO debug information.
   void preprocessDWODebugInfo();
 
@@ -638,9 +642,22 @@ public:
   /// Total hotness score according to profiling data for this binary.
   uint64_t TotalScore{0};
 
-  /// Binary-wide stats for macro-fusion.
-  uint64_t MissedMacroFusionPairs{0};
-  uint64_t MissedMacroFusionExecCount{0};
+  /// Binary-wide aggregated stats.
+  struct BinaryStats {
+    /// Stats for macro-fusion.
+    uint64_t MissedMacroFusionPairs{0};
+    uint64_t MissedMacroFusionExecCount{0};
+
+    /// Stats for stale profile matching:
+    ///   the total number of basic blocks in the profile
+    uint32_t NumStaleBlocks{0};
+    ///   the number of matched basic blocks
+    uint32_t NumMatchedBlocks{0};
+    ///   the total count of samples in the profile
+    uint64_t StaleSampleCount{0};
+    ///   the count of matched samples
+    uint64_t MatchedSampleCount{0};
+  } Stats;
 
   // Address of the first allocated segment.
   uint64_t FirstAllocAddress{std::numeric_limits<uint64_t>::max()};
@@ -663,6 +680,15 @@ public:
   /// the execution of the binary is completed.
   std::optional<uint64_t> FiniFunctionAddress;
 
+  /// DT_FINI.
+  std::optional<uint64_t> FiniAddress;
+
+  /// DT_FINI_ARRAY. Only used when DT_FINI is not set.
+  std::optional<uint64_t> FiniArrayAddress;
+
+  /// DT_FINI_ARRAYSZ. Only used when DT_FINI is not set.
+  std::optional<uint64_t> FiniArraySize;
+
   /// Page alignment used for code layout.
   uint64_t PageAlign{HugePageSize};
@@ -1195,6 +1221,9 @@ public:
   ///
   /// Return the pair where the first size is for the main part, and the second
   /// size is for the cold one.
+  /// Modify BinaryBasicBlock::OutputAddressRange for each basic block in the
+  /// function in place so that BinaryBasicBlock::getOutputSize() gives the
+  /// emitted size of the basic block.
   std::pair<size_t, size_t> calculateEmittedSize(BinaryFunction &BF,
                                                  bool FixBranches = true);
@@ -1204,8 +1233,8 @@ public:
   uint64_t computeInstructionSize(const MCInst &Inst,
                                   const MCCodeEmitter *Emitter = nullptr) const {
-    if (auto Size = MIB->getAnnotationWithDefault<uint32_t>(Inst, "Size"))
-      return Size;
+    if (std::optional<uint32_t> Size = MIB->getSize(Inst))
+      return *Size;
 
     if (!Emitter)
       Emitter = this->MCE.get();
@@ -1255,6 +1284,9 @@ public:
   /// Return true if the function should be emitted to the output file.
   bool shouldEmit(const BinaryFunction &Function) const;
 
+  /// Dump the assembly representation of MCInst to debug output.
+  void dump(const MCInst &Inst) const;
+
   /// Print the string name for a CFI operation.
   static void printCFI(raw_ostream &OS, const MCCFIInstruction &Inst);
@@ -1330,6 +1362,12 @@ public:
                                    /* DWARFMustBeAtTheEnd */ false));
     return Streamer;
   }
+
+  void setIOAddressMap(AddressMap Map) { IOAddressMap = std::move(Map); }
+  const AddressMap &getIOAddressMap() const {
+    assert(IOAddressMap && "Address map not set yet");
+    return *IOAddressMap;
+  }
 };
 
 template >
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index c393b5b851d99bcce6962c9c50be14a153b1b597..0c62df34fa6ac94f43fff4fa88776c11329ccb18 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -192,9 +192,6 @@ public:
   static constexpr uint64_t COUNT_NO_PROFILE =
       BinaryBasicBlock::COUNT_NO_PROFILE;
 
-  /// We have to use at least 2-byte alignment for functions because of C++ ABI.
-  static constexpr unsigned MinAlign = 2;
-
   static const char TimerGroupName[];
   static const char TimerGroupDesc[];
@@ -322,10 +319,6 @@ private:
   /// Execution halts whenever this function is entered.
   bool TrapsOnEntry{false};
 
-  /// True if the function had an indirect branch with a fixed internal
-  /// destination.
-  bool HasFixedIndirectBranch{false};
-
   /// True if the function is a fragment of another function. This means that
   /// this function could only be entered via its parent or one of its sibling
   /// fragments. It could be entered at any basic block. It can also return
@@ -366,14 +359,15 @@ private:
   std::string ColdCodeSectionName;
 
   /// Parent function fragment for split function fragments.
-  SmallPtrSet<BinaryFunction *, 1> ParentFragments;
+  using FragmentsSetTy = SmallPtrSet<BinaryFunction *, 1>;
+  FragmentsSetTy ParentFragments;
 
   /// Indicate if the function body was folded into another function.
   /// Used by ICF optimization.
   BinaryFunction *FoldedIntoFunction{nullptr};
 
   /// All fragments for a parent function.
-  SmallPtrSet<BinaryFunction *, 1> Fragments;
+  FragmentsSetTy Fragments;
 
   /// The profile data for the number of times the function was executed.
   uint64_t ExecutionCount{COUNT_NO_PROFILE};
@@ -381,7 +375,7 @@ private:
   /// Profile match ratio.
   float ProfileMatchRatio{0.0f};
 
-  /// Raw branch count for this function in the profile
+  /// Raw branch count for this function in the profile.
   uint64_t RawBranchCount{0};
 
   /// Indicates the type of profile the function is using.
@@ -576,9 +570,6 @@ private:
   /// Count the number of functions created.
   static uint64_t Count;
 
-  /// Map offsets of special instructions to addresses in the output.
-  InputOffsetToAddressMapTy InputOffsetToAddressMap;
-
   /// Register alternative function name.
   void addAlternativeName(std::string NewName) {
     Aliases.push_back(std::move(NewName));
@@ -1193,7 +1184,7 @@ public:
 
     if (!Islands->FunctionConstantIslandLabel) {
       Islands->FunctionConstantIslandLabel =
-          BC.Ctx->createNamedTempSymbol("func_const_island");
+          BC.Ctx->getOrCreateSymbol("func_const_island@" + getOneName());
     }
     return Islands->FunctionConstantIslandLabel;
   }
@@ -1203,7 +1194,7 @@ public:
 
     if (!Islands->FunctionColdConstantIslandLabel) {
       Islands->FunctionColdConstantIslandLabel =
-          BC.Ctx->createNamedTempSymbol("func_cold_const_island");
+          BC.Ctx->getOrCreateSymbol("func_cold_const_island@" + getOneName());
     }
     return Islands->FunctionColdConstantIslandLabel;
   }
@@ -1223,14 +1214,7 @@ public:
   }
 
   /// Update output values of the function based on the final \p Layout.
-  void updateOutputValues(const MCAsmLayout &Layout);
-
-  /// Return mapping of input to output addresses. Most users should call
-  /// translateInputToOutputAddress() for address translation.
-  InputOffsetToAddressMapTy &getInputOffsetToAddressMap() {
-    assert(isEmitted() && "cannot use address mapping before code emission");
-    return InputOffsetToAddressMap;
-  }
+  void updateOutputValues(const BOLTLinker &Linker);
 
   /// Register relocation type \p RelType at a given \p Address in the function
   /// against \p Symbol.
@@ -1457,7 +1441,8 @@ public:
   /// Rebuilds BBs layout, ignoring dead BBs. Returns the number of removed
   /// BBs and the removed number of bytes of code.
-  std::pair<unsigned, uint64_t> eraseInvalidBBs();
+  std::pair<unsigned, uint64_t>
+  eraseInvalidBBs(const MCCodeEmitter *Emitter = nullptr);
 
   /// Get the relative order between two basic blocks in the original
   /// layout. The result is > 0 if B occurs before A and < 0 if B
@@ -1729,8 +1714,24 @@ public:
     return *this;
   }
 
-  Align getAlign() const { return Align(Alignment); }
+  uint16_t getMinAlignment() const {
+    // Align data in code BFs minimum to CI alignment
+    if (!size() && hasIslandsInfo())
+      return getConstantIslandAlignment();
+
+    // Minimal code alignment on AArch64 and RISC-V is 4.
+    if (BC.isAArch64() || BC.isRISCV())
+      return 4;
+
+    // We have to use at least 2-byte alignment for functions because
+    // of the C++ ABI.
+    return 2;
+  }
+
+  Align getMinAlign() const { return Align(getMinAlignment()); }
+
   uint16_t getAlignment() const { return Alignment; }
+  Align getAlign() const { return Align(getAlignment()); }
 
   BinaryFunction &setMaxAlignmentBytes(uint16_t MaxAlignBytes) {
     MaxAlignmentBytes = MaxAlignBytes;
@@ -1779,6 +1780,15 @@ public:
     return llvm::is_contained(Fragments, &Other);
   }
 
+  /// Return the child fragments from the parent function.
+  iterator_range<FragmentsSetTy::const_iterator> getFragments() const {
+    return iterator_range<FragmentsSetTy::const_iterator>(Fragments.begin(),
+                                                          Fragments.end());
+  }
+
+  /// Return the parent fragments for split function fragments.
+  FragmentsSetTy *getParentFragments() { return &ParentFragments; }
+
   /// Returns if this function is a parent or child of \p Other function.
   bool isParentOrChildOf(const BinaryFunction &Other) const {
     return isChildOf(Other) || isParentOf(Other);
@@ -2170,6 +2180,11 @@ public:
   /// its code emission.
   bool requiresAddressTranslation() const;
 
+  /// Return true if the linker needs to generate an address map for this
+  /// function. Used for keeping track of the mapping from input to output
+  /// addresses of basic blocks.
+  bool requiresAddressMap() const;
+
   /// Adjust branch instructions to match the CFG.
   ///
   /// As it comes to internal branches, the CFG represents "the ultimate source
diff --git a/bolt/include/bolt/Core/BinarySection.h b/bolt/include/bolt/Core/BinarySection.h
index f1041777926fde1ee06fb3f0bdd4c8e33f1f72c0..92ab6ea0d38e14eabd0f47cc7bb82fa11257dae1 100644
--- a/bolt/include/bolt/Core/BinarySection.h
+++ b/bolt/include/bolt/Core/BinarySection.h
@@ -97,6 +97,8 @@ class BinarySection {
   mutable bool IsReordered{false}; // Have the contents been reordered?
   bool IsAnonymous{false};         // True if the name should not be included
                                    // in the output file.
+  bool IsLinkOnly{false};          // True if the section should not be included
+                                   // in the output file.
 
   uint64_t hash(const BinaryData &BD,
                 std::map<const BinaryData *, uint64_t> &Cache) const;
@@ -373,8 +375,12 @@ public:
   /// Add a dynamic relocation at the given \p Offset.
   void addDynamicRelocation(uint64_t Offset, MCSymbol *Symbol, uint64_t Type,
                             uint64_t Addend, uint64_t Value = 0) {
-    assert(Offset < getSize() && "offset not within section bounds");
-    DynamicRelocations.emplace(Relocation{Offset, Symbol, Type, Addend, Value});
+    addDynamicRelocation(Relocation{Offset, Symbol, Type, Addend, Value});
+  }
+
+  void addDynamicRelocation(const Relocation &Reloc) {
+    assert(Reloc.Offset < getSize() && "offset not within section bounds");
+    DynamicRelocations.emplace(Reloc);
   }
 
   /// Add relocation against the original contents of this section.
@@ -408,6 +414,18 @@ public:
     return Itr != DynamicRelocations.end() ? &*Itr : nullptr;
   }
 
+  std::optional<Relocation> takeDynamicRelocationAt(uint64_t Offset) {
+    Relocation Key{Offset, 0, 0, 0, 0};
+    auto Itr = DynamicRelocations.find(Key);
+
+    if (Itr == DynamicRelocations.end())
+      return std::nullopt;
+
+    Relocation Reloc = *Itr;
+    DynamicRelocations.erase(Itr);
+    return Reloc;
+  }
+
   uint64_t hash(const BinaryData &BD) const {
     std::map<const BinaryData *, uint64_t> Cache;
     return hash(BD, Cache);
@@ -452,6 +470,8 @@ public:
   void setIndex(uint32_t I) { Index = I; }
   void setOutputName(const Twine &Name) { OutputName = Name.str(); }
   void setAnonymous(bool Flag) { IsAnonymous = Flag; }
+  bool isLinkOnly() const { return IsLinkOnly; }
+  void setLinkOnly() { IsLinkOnly = true; }
 
   /// Emit the section as data, possibly with relocations.
   /// Use name \p SectionName for the section during the emission.
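For orientation, a minimal sketch of how the new take-and-re-add interface above might be used to retarget a dynamic relocation; the names Sec, FiniArrayOffset, and NewSym are hypothetical, not part of the patch, and this is presumably the pattern behind updateRtFiniReloc() declared later in RewriteInstance.h:

  // Remove the relocation at FiniArrayOffset and re-add it against NewSym,
  // keeping the original offset, type, and addend.
  if (std::optional<Relocation> Reloc =
          Sec.takeDynamicRelocationAt(FiniArrayOffset)) {
    Reloc->Symbol = NewSym;
    Sec.addDynamicRelocation(*Reloc);
  }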
diff --git a/bolt/include/bolt/Core/HashUtilities.h b/bolt/include/bolt/Core/HashUtilities.h
index 8d445ff837564a3431c8737fdede1f4bae6f75ce..53ea110aa683b9f8501b4287d1e7be5519db076c 100644
--- a/bolt/include/bolt/Core/HashUtilities.h
+++ b/bolt/include/bolt/Core/HashUtilities.h
@@ -20,8 +20,6 @@
 namespace llvm {
 namespace bolt {
 
-uint16_t hash_64_to_16(const uint64_t Hash);
-
 std::string hashInteger(uint64_t Value);
 
 std::string hashSymbol(BinaryContext &BC, const MCSymbol &Symbol);
@@ -35,6 +33,8 @@ using OperandHashFuncTy = function_ref<std::string(const MCOperand &)>;
 std::string hashBlock(BinaryContext &BC, const BinaryBasicBlock &BB,
                       OperandHashFuncTy OperandHashFunc);
 
+std::string hashBlockLoose(BinaryContext &BC, const BinaryBasicBlock &BB);
+
 } // namespace bolt
 } // namespace llvm
 
diff --git a/bolt/include/bolt/Core/Linker.h b/bolt/include/bolt/Core/Linker.h
index 69e1fe431c0b8f6223a01f3a1e26f41ae226449f..1e0876a0e13d9d419b55be8d9ff3978254d490b8 100644
--- a/bolt/include/bolt/Core/Linker.h
+++ b/bolt/include/bolt/Core/Linker.h
@@ -31,6 +31,11 @@ public:
       std::function<void(const BinarySection &Section, uint64_t Address)>;
   using SectionsMapper = std::function<void(SectionMapper)>;
 
+  struct SymbolInfo {
+    uint64_t Address;
+    uint64_t Size;
+  };
+
   virtual ~BOLTLinker() = default;
 
   /// Load and link \p Obj. \p MapSections will be called before the object is
@@ -38,8 +43,16 @@ public:
   /// of a section can be changed by calling the passed SectionMapper.
   virtual void loadObject(MemoryBufferRef Obj, SectionsMapper MapSections) = 0;
 
+  /// Return the address and size of a symbol, or std::nullopt if it cannot be
+  /// found.
+  virtual std::optional<SymbolInfo> lookupSymbolInfo(StringRef Name) const = 0;
+
   /// Return the address of a symbol or std::nullopt if it cannot be found.
-  virtual std::optional<uint64_t> lookupSymbol(StringRef Name) const = 0;
+  std::optional<uint64_t> lookupSymbol(StringRef Name) const {
+    if (const auto Info = lookupSymbolInfo(Name))
+      return Info->Address;
+    return std::nullopt;
+  }
 };
 
 } // namespace bolt
diff --git a/bolt/include/bolt/Core/MCPlus.h b/bolt/include/bolt/Core/MCPlus.h
index b4a72ac274fade22ad9036b47206a10f937b2692..b6a9e73f2347e71faa09beb4ae2aad9ff7fc5747 100644
--- a/bolt/include/bolt/Core/MCPlus.h
+++ b/bolt/include/bolt/Core/MCPlus.h
@@ -32,11 +32,16 @@ namespace MCPlus {
 /// pad and the uint64_t represents the action.
 using MCLandingPad = std::pair<const MCSymbol *, uint64_t>;
 
-/// An extension to MCInst is provided via an extra operand of type MCInst with
-/// ANNOTATION_LABEL opcode (i.e. we are tying an annotation instruction to an
-/// existing one). The annotation instruction contains a list of Immediate
-/// operands. Each operand either contains a value, or is a pointer to
-/// an instance of class MCAnnotation.
+/// An extension to MCInst is provided via extra operands, i.e. operands that
+/// are not used in the instruction assembly. Any kind of metadata can be
+/// attached to MCInst with this "annotation" extension using the MCPlusBuilder
+/// interface.
+///
+/// The first extra operand must be of type kInst with an empty (nullptr)
+/// value. The kInst operand type is unused on most non-VLIW architectures.
+/// We use it to mark the beginning of annotation operands. The rest of the
+/// operands are of Immediate type with annotation info encoded into the value
+/// of the immediate.
 ///
 /// There are 2 distinct groups of annotations. The first group is a first-class
 /// annotation that affects semantics of the instruction, such as an
@@ -55,7 +60,7 @@ using MCLandingPad = std::pair<const MCSymbol *, uint64_t>;
 /// of their corresponding operand.
 ///
 /// Annotations in the second group could be addressed either by name, or by
-/// by and index which could be queried by providing a name.
+/// index which could be queried by providing the name.
 class MCAnnotation {
 public:
   enum Kind {
@@ -66,6 +71,8 @@ public:
     kTailCall,            /// Tail call.
     kConditionalTailCall, /// CTC.
     kOffset,              /// Offset in the function.
+    kLabel,               /// MCSymbol pointing to this instruction.
+    kSize,                /// Size of the instruction.
    kGeneric              /// First generic annotation.
   };
@@ -105,10 +112,11 @@ private:
 /// Return a number of operands in \Inst excluding operands representing
 /// annotations.
 inline unsigned getNumPrimeOperands(const MCInst &Inst) {
-  if (Inst.getNumOperands() > 0 && std::prev(Inst.end())->isInst()) {
-    assert(std::prev(Inst.end())->getInst()->getOpcode() ==
-           TargetOpcode::ANNOTATION_LABEL);
-    return Inst.getNumOperands() - 1;
+  for (signed I = Inst.getNumOperands() - 1; I >= 0; --I) {
+    if (Inst.getOperand(I).isInst())
+      return I;
+    if (!Inst.getOperand(I).isImm())
+      return Inst.getNumOperands();
   }
   return Inst.getNumOperands();
 }
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index cd4676f370e64e27dbfad76e21e2d839f8a8b1b4..1a7f544c1b6a6acf0aa5569111e62e9e5936f208 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -29,6 +29,7 @@
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/RWMutex.h"
 #include 
 #include 
 #include 
@@ -64,7 +65,6 @@ public:
 private:
   /// A struct that represents a single annotation allocator
   struct AnnotationAllocator {
-    SpecificBumpPtrAllocator<MCInst> MCInstAllocator;
    BumpPtrAllocator ValueAllocator;
     std::unordered_set<MCPlus::MCAnnotation *> AnnotationPool;
   };
@@ -96,60 +96,62 @@ private:
     return SignExtend64<56>(ImmValue & 0xff'ffff'ffff'ffffULL);
   }
 
-  MCInst *getAnnotationInst(const MCInst &Inst) const {
-    if (Inst.getNumOperands() == 0)
-      return nullptr;
+  std::optional<unsigned> getFirstAnnotationOpIndex(const MCInst &Inst) const {
+    const unsigned NumPrimeOperands = MCPlus::getNumPrimeOperands(Inst);
+    if (Inst.getNumOperands() == NumPrimeOperands)
+      return std::nullopt;
 
-    const MCOperand &LastOp = Inst.getOperand(Inst.getNumOperands() - 1);
-    if (!LastOp.isInst())
-      return nullptr;
+    assert(Inst.getOperand(NumPrimeOperands).getInst() == nullptr &&
+           "Empty instruction expected.");
 
-    MCInst *AnnotationInst = const_cast<MCInst *>(LastOp.getInst());
-    assert(AnnotationInst->getOpcode() == TargetOpcode::ANNOTATION_LABEL);
+    return NumPrimeOperands + 1;
+  }
 
-    return AnnotationInst;
+  MCInst::iterator getAnnotationInstOp(MCInst &Inst) const {
+    for (MCInst::iterator Iter = Inst.begin(); Iter != Inst.end(); ++Iter) {
+      if (Iter->isInst()) {
+        assert(Iter->getInst() == nullptr && "Empty instruction expected.");
+        return Iter;
+      }
+    }
+    return Inst.end();
   }
 
-  void removeAnnotationInst(MCInst &Inst) const {
-    assert(getAnnotationInst(Inst) && "Expected annotation instruction.");
-    Inst.erase(std::prev(Inst.end()));
-    assert(!getAnnotationInst(Inst) &&
-           "More than one annotation instruction detected.");
+  void removeAnnotations(MCInst &Inst) const {
+    Inst.erase(getAnnotationInstOp(Inst), Inst.end());
   }
 
-  void setAnnotationOpValue(MCInst &Inst, unsigned Index, int64_t Value,
-                            AllocatorIdTy AllocatorId = 0) {
-    MCInst *AnnotationInst = getAnnotationInst(Inst);
-    if (!AnnotationInst) {
-      AnnotationAllocator &Allocator = getAnnotationAllocator(AllocatorId);
-      AnnotationInst = new (Allocator.MCInstAllocator.Allocate()) MCInst();
-      AnnotationInst->setOpcode(TargetOpcode::ANNOTATION_LABEL);
-      Inst.addOperand(MCOperand::createInst(AnnotationInst));
+  void setAnnotationOpValue(MCInst &Inst, unsigned Index, int64_t Value) const {
+    const int64_t AnnotationValue = encodeAnnotationImm(Index, Value);
+    const std::optional<unsigned> FirstAnnotationOp =
+        getFirstAnnotationOpIndex(Inst);
+    if (!FirstAnnotationOp) {
+      Inst.addOperand(MCOperand::createInst(nullptr));
+      Inst.addOperand(MCOperand::createImm(AnnotationValue));
+      return;
     }
 
-    const int64_t AnnotationValue = encodeAnnotationImm(Index, Value);
-    for (int I = AnnotationInst->getNumOperands() - 1; I >= 0; --I) {
-      int64_t ImmValue = AnnotationInst->getOperand(I).getImm();
+    for (unsigned I = *FirstAnnotationOp; I < Inst.getNumOperands(); ++I) {
+      const int64_t ImmValue = Inst.getOperand(I).getImm();
       if (extractAnnotationIndex(ImmValue) == Index) {
-        AnnotationInst->getOperand(I).setImm(AnnotationValue);
+        Inst.getOperand(I).setImm(AnnotationValue);
         return;
       }
     }
 
-    AnnotationInst->addOperand(MCOperand::createImm(AnnotationValue));
+    Inst.addOperand(MCOperand::createImm(AnnotationValue));
   }
 
   std::optional<int64_t> getAnnotationOpValue(const MCInst &Inst,
                                               unsigned Index) const {
-    const MCInst *AnnotationInst = getAnnotationInst(Inst);
-    if (!AnnotationInst)
+    std::optional<unsigned> FirstAnnotationOp = getFirstAnnotationOpIndex(Inst);
+    if (!FirstAnnotationOp)
       return std::nullopt;
 
-    for (int I = AnnotationInst->getNumOperands() - 1; I >= 0; --I) {
-      int64_t ImmValue = AnnotationInst->getOperand(I).getImm();
-      if (extractAnnotationIndex(ImmValue) == Index) {
+    for (unsigned I = *FirstAnnotationOp; I < Inst.getNumOperands(); ++I) {
+      const int64_t ImmValue = Inst.getOperand(I).getImm();
+      if (extractAnnotationIndex(ImmValue) == Index)
        return extractAnnotationValue(ImmValue);
-      }
     }
 
     return std::nullopt;
@@ -166,20 +168,21 @@ protected:
   /// Names of non-standard annotations.
   SmallVector AnnotationNames;
 
-  /// Allocate the TailCall annotation value. Clients of the target-specific
+  /// A mutex that is used to control parallel accesses to
+  /// AnnotationNameIndexMap and AnnotationNames.
+  mutable llvm::sys::RWMutex AnnotationNameMutex;
+
+  /// Set the TailCall annotation value to true. Clients of the target-specific
   /// MCPlusBuilder classes must use convert/lower/create* interfaces instead.
-  void setTailCall(MCInst &Inst);
+  void setTailCall(MCInst &Inst) const;
 
   /// Transfer annotations from \p SrcInst to \p DstInst.
   void moveAnnotations(MCInst &&SrcInst, MCInst &DstInst) const {
-    assert(!getAnnotationInst(DstInst) &&
-           "Destination instruction should not have annotations.");
-    const MCInst *AnnotationInst = getAnnotationInst(SrcInst);
-    if (!AnnotationInst)
-      return;
+    MCInst::iterator AnnotationOp = getAnnotationInstOp(SrcInst);
+    for (MCInst::iterator Iter = AnnotationOp; Iter != SrcInst.end(); ++Iter)
+      DstInst.addOperand(*Iter);
 
-    DstInst.addOperand(MCOperand::createInst(AnnotationInst));
-    removeAnnotationInst(SrcInst);
+    SrcInst.erase(AnnotationOp, SrcInst.end());
   }
 
 public:
@@ -384,7 +387,6 @@ public:
       Allocator.AnnotationPool.clear();
       Allocator.ValueAllocator.Reset();
-      Allocator.MCInstAllocator.DestroyAll();
     }
   }
 
@@ -613,12 +615,15 @@ public:
   virtual bool isMoveMem2Reg(const MCInst &Inst) const { return false; }
 
-  virtual bool isLoad(const MCInst &Inst) const {
-    llvm_unreachable("not implemented");
-    return false;
+  virtual bool mayLoad(const MCInst &Inst) const {
+    return Info->get(Inst.getOpcode()).mayLoad();
+  }
+
+  virtual bool mayStore(const MCInst &Inst) const {
+    return Info->get(Inst.getOpcode()).mayStore();
   }
 
-  virtual bool isStore(const MCInst &Inst) const {
+  virtual bool isAArch64Exclusive(const MCInst &Inst) const {
     llvm_unreachable("not implemented");
     return false;
   }
@@ -639,9 +644,12 @@ public:
     return false;
   }
 
-  /// If non-zero, this is used to fill the executable space with instructions
-  /// that will trap. Defaults to 0.
-  virtual unsigned getTrapFillValue() const { return 0; }
+  /// Used to fill the executable space with instructions
+  /// that will trap.
+  virtual StringRef getTrapFillValue() const {
+    llvm_unreachable("not implemented");
+    return StringRef();
+  }
 
   /// Interface and basic functionality of a MCInstMatcher. The idea is to make
   /// it easy to match one or more MCInsts against a tree-like pattern and
@@ -1116,20 +1124,19 @@ public:
   std::optional<MCPlus::MCLandingPad> getEHInfo(const MCInst &Inst) const;
 
   /// Add handler and action info for call instruction.
-  void addEHInfo(MCInst &Inst, const MCPlus::MCLandingPad &LP);
+  void addEHInfo(MCInst &Inst, const MCPlus::MCLandingPad &LP) const;
 
   /// Update exception-handling info for the invoke instruction \p Inst.
   /// Return true on success and false otherwise, e.g. if the instruction is
   /// not an invoke.
-  bool updateEHInfo(MCInst &Inst, const MCPlus::MCLandingPad &LP);
+  bool updateEHInfo(MCInst &Inst, const MCPlus::MCLandingPad &LP) const;
 
   /// Return non-negative GNU_args_size associated with the instruction
   /// or -1 if there's no associated info.
   int64_t getGnuArgsSize(const MCInst &Inst) const;
 
   /// Add the value of GNU_args_size to Inst if it already has EH info.
-  void addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize,
-                      AllocatorIdTy AllocId = 0);
+  void addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize) const;
 
   /// Return jump table addressed by this instruction.
   uint64_t getJumpTable(const MCInst &Inst) const;
@@ -1142,7 +1149,7 @@ public:
                     AllocatorIdTy AllocId = 0);
 
   /// Disassociate instruction with a jump table.
-  bool unsetJumpTable(MCInst &Inst);
+  bool unsetJumpTable(MCInst &Inst) const;
 
   /// Return destination of conditional tail call instruction if \p Inst is one.
   std::optional<uint64_t> getConditionalTailCall(const MCInst &Inst) const;
 
   /// Mark the \p Instruction as a conditional tail call, and set its
   /// destination address if it is known. If \p Instruction was already marked,
   /// update its destination with \p Dest.
-  bool setConditionalTailCall(MCInst &Inst, uint64_t Dest = 0);
+  bool setConditionalTailCall(MCInst &Inst, uint64_t Dest = 0) const;
 
   /// If \p Inst was marked as a conditional tail call convert it to a regular
   /// branch. Return true if the instruction was converted.
-  bool unsetConditionalTailCall(MCInst &Inst);
+  bool unsetConditionalTailCall(MCInst &Inst) const;
 
   /// Return offset of \p Inst in the original function, if available.
   std::optional<uint32_t> getOffset(const MCInst &Inst) const;
 
   /// Return offset of \p Inst, or \p Default if offset is not set.
   uint32_t getOffsetWithDefault(const MCInst &Inst, uint32_t Default) const;
 
   /// Set offset of \p Inst in the original function.
-  bool setOffset(MCInst &Inst, uint32_t Offset, AllocatorIdTy AllocatorId = 0);
+  bool setOffset(MCInst &Inst, uint32_t Offset) const;
 
   /// Remove offset annotation.
-  bool clearOffset(MCInst &Inst);
+  bool clearOffset(MCInst &Inst) const;
+
+  /// Return the label of \p Inst, if available.
+  MCSymbol *getLabel(const MCInst &Inst) const;
+
+  /// Set the label of \p Inst. This label will be emitted right before \p Inst
+  /// is emitted to MCStreamer.
+  bool setLabel(MCInst &Inst, MCSymbol *Label);
+
+  /// Get instruction size specified via annotation.
+  std::optional<uint32_t> getSize(const MCInst &Inst) const;
+
+  /// Set instruction size.
+  void setSize(MCInst &Inst, uint32_t Size) const;
 
   /// Return MCSymbol that represents a target of this instruction at a given
   /// operand number \p OpNum. If there's no symbol associated with
@@ -1730,8 +1750,51 @@ public:
     return true;
   }
 
+  /// Extract a symbol and an addend out of the fixup value expression.
+  ///
+  /// Only the following limited expression types are supported:
+  ///   Symbol + Addend
+  ///   Symbol + Constant + Addend
+  ///   Const + Addend
+  ///   Symbol
+  std::pair<MCSymbol *, uint64_t> extractFixupExpr(const MCFixup &Fixup) const {
+    uint64_t Addend = 0;
+    MCSymbol *Symbol = nullptr;
+    const MCExpr *ValueExpr = Fixup.getValue();
+    if (ValueExpr->getKind() == MCExpr::Binary) {
+      const auto *BinaryExpr = cast<MCBinaryExpr>(ValueExpr);
+      assert(BinaryExpr->getOpcode() == MCBinaryExpr::Add &&
+             "unexpected binary expression");
+      const MCExpr *LHS = BinaryExpr->getLHS();
+      if (LHS->getKind() == MCExpr::Constant) {
+        Addend = cast<MCConstantExpr>(LHS)->getValue();
+      } else if (LHS->getKind() == MCExpr::Binary) {
+        const auto *LHSBinaryExpr = cast<MCBinaryExpr>(LHS);
+        assert(LHSBinaryExpr->getOpcode() == MCBinaryExpr::Add &&
+               "unexpected binary expression");
+        const MCExpr *LLHS = LHSBinaryExpr->getLHS();
+        assert(LLHS->getKind() == MCExpr::SymbolRef && "unexpected LLHS");
+        Symbol = const_cast<MCSymbol *>(this->getTargetSymbol(LLHS));
+        const MCExpr *RLHS = LHSBinaryExpr->getRHS();
+        assert(RLHS->getKind() == MCExpr::Constant && "unexpected RLHS");
+        Addend = cast<MCConstantExpr>(RLHS)->getValue();
+      } else {
+        assert(LHS->getKind() == MCExpr::SymbolRef && "unexpected LHS");
+        Symbol = const_cast<MCSymbol *>(this->getTargetSymbol(LHS));
+      }
+      const MCExpr *RHS = BinaryExpr->getRHS();
+      assert(RHS->getKind() == MCExpr::Constant && "unexpected RHS");
+      Addend += cast<MCConstantExpr>(RHS)->getValue();
+    } else {
+      assert(ValueExpr->getKind() == MCExpr::SymbolRef && "unexpected value");
+      Symbol = const_cast<MCSymbol *>(this->getTargetSymbol(ValueExpr));
+    }
+    return std::make_pair(Symbol, Addend);
+  }
+
   /// Return annotation index matching the \p Name.
   std::optional<unsigned> getAnnotationIndex(StringRef Name) const {
+    std::shared_lock<llvm::sys::RWMutex> Lock(AnnotationNameMutex);
     auto AI = AnnotationNameIndexMap.find(Name);
     if (AI != AnnotationNameIndexMap.end())
       return AI->second;
     return std::nullopt;
   }
 
   /// Return annotation index matching the \p Name. Create a new index if the
   /// \p Name wasn't registered previously.
   unsigned getOrCreateAnnotationIndex(StringRef Name) {
-    auto AI = AnnotationNameIndexMap.find(Name);
-    if (AI != AnnotationNameIndexMap.end())
-      return AI->second;
+    if (std::optional<unsigned> Index = getAnnotationIndex(Name))
+      return *Index;
 
+    std::unique_lock<llvm::sys::RWMutex> Lock(AnnotationNameMutex);
     const unsigned Index =
         AnnotationNameIndexMap.size() + MCPlus::MCAnnotation::kGeneric;
     AnnotationNameIndexMap.insert(std::make_pair(Name, Index));
@@ -1765,8 +1828,7 @@ public:
     if (!std::is_trivial<ValueType>::value)
       Allocator.AnnotationPool.insert(A);
-    setAnnotationOpValue(Inst, Index, reinterpret_cast<int64_t>(A),
-                         AllocatorId);
+    setAnnotationOpValue(Inst, Index, reinterpret_cast<int64_t>(A));
     return A->getValue();
   }
@@ -1899,21 +1961,21 @@ public:
   ///
   /// Return true if the annotation was removed, false if the annotation
   /// was not present.
-  bool removeAnnotation(MCInst &Inst, unsigned Index);
+  bool removeAnnotation(MCInst &Inst, unsigned Index) const;
 
   /// Remove annotation associated with \p Name.
   ///
   /// Return true if the annotation was removed, false if the annotation
   /// was not present.
-  bool removeAnnotation(MCInst &Inst, StringRef Name) {
+  bool removeAnnotation(MCInst &Inst, StringRef Name) const {
     const auto Index = getAnnotationIndex(Name);
     if (!Index)
       return false;
     return removeAnnotation(Inst, *Index);
   }
 
-  /// Remove meta-data, but don't destroy it.
-  void stripAnnotations(MCInst &Inst, bool KeepTC = false);
+  /// Remove meta-data from the instruction, but don't destroy it.
+  void stripAnnotations(MCInst &Inst, bool KeepTC = false) const;
 
   virtual InstructionListType
   createInstrumentedIndirectCall(MCInst &&CallInst, MCSymbol *HandlerFuncAddr,
diff --git a/bolt/include/bolt/Core/Relocation.h b/bolt/include/bolt/Core/Relocation.h
index 5ae288a91986e523e93f506dd1bf4f5143925d92..bdea698b9531bb84c16cd0cf82a6e511c44d3843 100644
--- a/bolt/include/bolt/Core/Relocation.h
+++ b/bolt/include/bolt/Core/Relocation.h
@@ -97,6 +97,10 @@ struct Relocation {
   /// Return true if relocation type is for thread local storage.
   static bool isTLS(uint64_t Type);
 
+  /// Return true if relocation type is for referencing a specific instruction
+  /// (as opposed to a function, basic block, etc.).
+  static bool isInstructionReference(uint64_t Type);
+
   /// Return code for a NONE relocation
   static uint64_t getNone();
 
@@ -119,6 +123,10 @@ struct Relocation {
   /// otherwise.
   bool isRelative() const { return isRelative(Type); }
 
+  /// Return true if this relocation is R_*_IRELATIVE type. Return false
+  /// otherwise.
+  bool isIRelative() const { return isIRelative(Type); }
+
   /// Emit relocation at the current \p Streamer's position. The caller is
   /// responsible for setting the position correctly.
   size_t emit(MCStreamer *Streamer) const;
diff --git a/bolt/include/bolt/Passes/ReorderFunctions.h b/bolt/include/bolt/Passes/ReorderFunctions.h
index 52156a600791cb6871bb6ad34cedfa51574b3896..27094bee771ad5293693d553d1d4e59eff31029f 100644
--- a/bolt/include/bolt/Passes/ReorderFunctions.h
+++ b/bolt/include/bolt/Passes/ReorderFunctions.h
@@ -32,6 +32,7 @@ public:
     RT_EXEC_COUNT,
     RT_HFSORT,
     RT_HFSORT_PLUS,
+    RT_CDS,
     RT_PETTIS_HANSEN,
     RT_RANDOM,
     RT_USER
diff --git a/bolt/include/bolt/Rewrite/JITLinkLinker.h b/bolt/include/bolt/Rewrite/JITLinkLinker.h
index 104c75bea0c26934343fcd50283e854ba4fc8cca..1c41a26ac256350c09f9c3a78c0bc7a2bb869298 100644
--- a/bolt/include/bolt/Rewrite/JITLinkLinker.h
+++ b/bolt/include/bolt/Rewrite/JITLinkLinker.h
@@ -17,7 +17,6 @@
 #include "bolt/Rewrite/ExecutableFileMemoryManager.h"
 #include "llvm/ExecutionEngine/JITLink/JITLinkDylib.h"
 
-#include <map>
 #include <memory>
 #include <vector>
 
@@ -35,7 +34,7 @@ private:
   std::unique_ptr<ExecutableFileMemoryManager> MM;
   jitlink::JITLinkDylib Dylib{"main"};
   std::vector<jitlink::JITLinkMemoryManager::FinalizedAlloc> Allocs;
-  std::map<std::string, uint64_t> Symtab;
+  StringMap<SymbolInfo> Symtab;
 
 public:
   JITLinkLinker(BinaryContext &BC,
@@ -43,7 +42,7 @@
   ~JITLinkLinker();
 
   void loadObject(MemoryBufferRef Obj, SectionsMapper MapSections) override;
-  std::optional<uint64_t> lookupSymbol(StringRef Name) const override;
+  std::optional<SymbolInfo> lookupSymbolInfo(StringRef Name) const override;
 
   static SmallVector orderedBlocks(const jitlink::Section &Section);
diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h
index 072c8109241d21d041d1d3db2907f1e6b99ffaee..261a7337535b29b989316ad6a55f85fd4e910a7b 100644
--- a/bolt/include/bolt/Rewrite/RewriteInstance.h
+++ b/bolt/include/bolt/Rewrite/RewriteInstance.h
@@ -95,6 +95,15 @@ private:
   /// from meta data in the file.
   void discoverFileObjects();
 
+  /// Check whether we should use DT_FINI or DT_FINI_ARRAY for instrumentation.
+  /// DT_FINI is preferred; DT_FINI_ARRAY is only used when no DT_FINI entry
+  /// was found.
+  Error discoverRtFiniAddress();
+
+  /// If DT_FINI_ARRAY is used for instrumentation, update the relocation of
+  /// its first entry to point to the instrumentation library's fini address.
+  void updateRtFiniReloc();
+
   /// Create and initialize metadata rewriters for this instance.
   void initializeMetadataManager();
@@ -190,7 +199,7 @@ private:
   void mapAllocatableSections(BOLTLinker::SectionMapper MapSection);
 
   /// Update output object's values based on the final \p Layout.
-  void updateOutputValues(const MCAsmLayout &Layout);
+  void updateOutputValues(const BOLTLinker &Linker);
 
   /// Rewrite back all functions (hopefully optimized) that fit in the original
   /// memory footprint for that function. If the function is now larger and does
@@ -415,6 +424,7 @@ private:
 
   /// Common section names.
   static StringRef getEHFrameSectionName() { return ".eh_frame"; }
+  static StringRef getRelaDynSectionName() { return ".rela.dyn"; }
 
   /// An instance of the input binary we are processing, externally owned.
   llvm::object::ELFObjectFileBase *InputFile;
@@ -503,11 +513,11 @@ private:
   };
 
   /// AArch64 PLT sections.
-  const PLTSectionInfo AArch64_PLTSections[3] = {
-      {".plt"}, {".iplt"}, {nullptr}};
+  const PLTSectionInfo AArch64_PLTSections[4] = {
+      {".plt"}, {".plt.got"}, {".iplt"}, {nullptr}};
 
   /// RISCV PLT sections.
-  const PLTSectionInfo RISCV_PLTSections[3] = {{".plt"}, {nullptr}};
+  const PLTSectionInfo RISCV_PLTSections[2] = {{".plt"}, {nullptr}};
 
   /// Return PLT information for a section with \p SectionName or nullptr
   /// if the section is not PLT.
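Before the implementation below, a minimal sketch of the round trip described in AddressMap.h (Streamer, BC, and InputAddress are assumed to be in scope; emit/parse/lookup are the interfaces declared above): BOLT writes the map sections at code-emission time, JITLink resolves the symbol halves to output addresses, and the rewriter parses the result back:

  // 1. During code emission, write <input, output-symbol> pairs into the
  //    link-only map sections.
  AddressMap::emit(Streamer, BC);
  // 2. After JITLink has laid out the code, recover the resolved pairs.
  if (std::optional<AddressMap> Map = AddressMap::parse(BC))
    BC.setIOAddressMap(std::move(*Map));
  // 3. Translate a single input address to its output address.
  if (std::optional<uint64_t> Out = BC.getIOAddressMap().lookup(InputAddress))
    outs() << "0x" << Twine::utohexstr(*Out) << "\n";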
diff --git a/bolt/lib/Core/AddressMap.cpp b/bolt/lib/Core/AddressMap.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..efa376d408db882b2fb3b937cab287063b5c79fc
--- /dev/null
+++ b/bolt/lib/Core/AddressMap.cpp
@@ -0,0 +1,118 @@
+//===- bolt/Core/AddressMap.cpp - Input-output Address Map ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Core/AddressMap.h"
+#include "bolt/Core/BinaryContext.h"
+#include "bolt/Core/BinaryFunction.h"
+#include "bolt/Core/BinarySection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/DataExtractor.h"
+
+namespace llvm {
+namespace bolt {
+
+const char *const AddressMap::AddressSectionName = ".bolt.addr2addr_map";
+const char *const AddressMap::LabelSectionName = ".bolt.label2addr_map";
+
+static void emitAddress(MCStreamer &Streamer, uint64_t InputAddress,
+                        const MCSymbol *OutputLabel) {
+  Streamer.emitIntValue(InputAddress, 8);
+  Streamer.emitSymbolValue(OutputLabel, 8);
+}
+
+static void emitLabel(MCStreamer &Streamer, const MCSymbol *OutputLabel) {
+  Streamer.emitIntValue(reinterpret_cast<uint64_t>(OutputLabel), 8);
+  Streamer.emitSymbolValue(OutputLabel, 8);
+}
+
+void AddressMap::emit(MCStreamer &Streamer, BinaryContext &BC) {
+  // Mark map sections as link-only to avoid allocation in the output file.
+  const unsigned Flags = BinarySection::getFlags(/*IsReadOnly*/ true,
+                                                 /*IsText*/ false,
+                                                 /*IsAllocatable*/ true);
+  BC.registerOrUpdateSection(AddressSectionName, ELF::SHT_PROGBITS, Flags)
+      .setLinkOnly();
+  BC.registerOrUpdateSection(LabelSectionName, ELF::SHT_PROGBITS, Flags)
+      .setLinkOnly();
+
+  for (const auto &[BFAddress, BF] : BC.getBinaryFunctions()) {
+    if (!BF.requiresAddressMap())
+      continue;
+
+    for (const auto &BB : BF) {
+      if (!BB.getLabel()->isDefined())
+        continue;
+
+      Streamer.switchSection(BC.getDataSection(LabelSectionName));
+      emitLabel(Streamer, BB.getLabel());
+
+      if (!BB.hasLocSyms())
+        continue;
+
+      Streamer.switchSection(BC.getDataSection(AddressSectionName));
+      for (auto [Offset, Symbol] : BB.getLocSyms())
+        emitAddress(Streamer, BFAddress + Offset, Symbol);
+    }
+  }
+}
+
+std::optional<AddressMap> AddressMap::parse(BinaryContext &BC) {
+  auto AddressMapSection = BC.getUniqueSectionByName(AddressSectionName);
+  auto LabelMapSection = BC.getUniqueSectionByName(LabelSectionName);
+
+  if (!AddressMapSection && !LabelMapSection)
+    return std::nullopt;
+
+  AddressMap Parsed;
+
+  const size_t EntrySize = 2 * BC.AsmInfo->getCodePointerSize();
+  auto parseSection =
+      [&](BinarySection &Section,
+          function_ref<void(uint64_t, uint64_t)> InsertCallback) {
+        StringRef Buffer = Section.getOutputContents();
+        assert(Buffer.size() % EntrySize == 0 && "Unexpected address map size");
+
+        DataExtractor DE(Buffer, BC.AsmInfo->isLittleEndian(),
+                         BC.AsmInfo->getCodePointerSize());
+        DataExtractor::Cursor Cursor(0);
+
+        while (Cursor && !DE.eof(Cursor)) {
+          const uint64_t Input = DE.getAddress(Cursor);
+          const uint64_t Output = DE.getAddress(Cursor);
+          InsertCallback(Input, Output);
+        }
+
+        assert(Cursor && "Error reading address map section");
+        BC.deregisterSection(Section);
+      };
+
+  if (AddressMapSection) {
+    Parsed.Address2AddressMap.reserve(AddressMapSection->getOutputSize() /
+                                      EntrySize);
+    parseSection(*AddressMapSection, [&](uint64_t Input, uint64_t Output) {
+      if (!Parsed.Address2AddressMap.count(Input))
+        Parsed.Address2AddressMap.insert({Input, Output});
+    });
+  }
+
+  if (LabelMapSection) {
+    Parsed.Label2AddrMap.reserve(LabelMapSection->getOutputSize() / EntrySize);
+    parseSection(*LabelMapSection, [&](uint64_t Input, uint64_t Output) {
+      assert(!Parsed.Label2AddrMap.count(
+                 reinterpret_cast<const MCSymbol *>(Input)) &&
+             "Duplicate label entry detected.");
+      Parsed.Label2AddrMap.insert(
+          {reinterpret_cast<const MCSymbol *>(Input), Output});
+    });
+  }
+
+  return Parsed;
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/lib/Core/BinaryBasicBlock.cpp b/bolt/lib/Core/BinaryBasicBlock.cpp
index b271b86ec69920456088371dd80efc9f062be813..984bc6dbd220ab05904e3ee909e46f56983c2304 100644
--- a/bolt/lib/Core/BinaryBasicBlock.cpp
+++ b/bolt/lib/Core/BinaryBasicBlock.cpp
@@ -14,7 +14,6 @@
 #include "bolt/Core/BinaryContext.h"
 #include "bolt/Core/BinaryFunction.h"
 #include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/Errc.h"
 
@@ -613,27 +612,5 @@ BinaryBasicBlock *BinaryBasicBlock::splitAt(iterator II) {
   return NewBlock;
 }
 
-void BinaryBasicBlock::updateOutputValues(const MCAsmLayout &Layout) {
-  if (!LocSyms)
-    return;
-
-  const uint64_t BBAddress = getOutputAddressRange().first;
-  const uint64_t BBOffset = Layout.getSymbolOffset(*getLabel());
-  for (const auto &LocSymKV : *LocSyms) {
-    const uint32_t InputFunctionOffset = LocSymKV.first;
-    const uint32_t OutputOffset = static_cast<uint32_t>(
-        Layout.getSymbolOffset(*LocSymKV.second) - BBOffset);
-    getOffsetTranslationTable().emplace_back(
-        std::make_pair(OutputOffset, InputFunctionOffset));
-
-    // Update reverse (relative to BAT) address lookup table for function.
-    if (getFunction()->requiresAddressTranslation()) {
-      getFunction()->getInputOffsetToAddressMap().emplace(
-          std::make_pair(InputFunctionOffset, OutputOffset + BBAddress));
-    }
-  }
-  LocSyms.reset(nullptr);
-}
-
 } // namespace bolt
 } // namespace llvm
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index 2d2b35ee2bd9c919fc0ee6c3c9310932d84aa125..6761771a2ee6f18f224398de5c4b5a3e70d7176f 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -503,6 +503,9 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address,
   // Is one of the targets __builtin_unreachable?
   bool HasUnreachable = false;
 
+  // Does one of the entries match the function start address?
+  bool HasStartAsEntry = false;
+
   // Number of targets other than __builtin_unreachable.
   uint64_t NumRealEntries = 0;
 
@@ -567,14 +570,21 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address,
       continue;
     }
 
+    // Function start is another special case. It is allowed in the jump
+    // table, but we need at least one other regular entry to distinguish the
+    // table from, e.g., a function pointer array.
+    if (Value == BF.getAddress()) {
+      HasStartAsEntry = true;
+      addEntryAddress(Value);
+      continue;
+    }
+
     // Function or one of its fragments.
     const BinaryFunction *TargetBF = getBinaryFunctionContainingAddress(Value);
-
-    bool DoesBelongToFunction = BF.containsAddress(Value) ||
-                                (TargetBF && TargetBF->isParentOrChildOf(BF));
-
-    // We assume that a jump table cannot have function start as an entry.
-    if (!DoesBelongToFunction || Value == BF.getAddress()) {
+    const bool DoesBelongToFunction =
+        BF.containsAddress(Value) ||
+        (TargetBF && TargetBF->isParentOrChildOf(BF));
+    if (!DoesBelongToFunction) {
       LLVM_DEBUG({
         if (!BF.containsAddress(Value)) {
           dbgs() << "FAIL: function doesn't contain this address\n";
@@ -589,8 +599,6 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address,
             }
           }
         }
-        if (Value == BF.getAddress())
-          dbgs() << "FAIL: jump table cannot have function start as an entry\n";
       });
       break;
     }
@@ -611,9 +619,9 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address,
   }
 
   // It's a jump table if the number of real entries is more than 1, or there's
-  // one real entry and "unreachable" targets. If there are only multiple
-  // "unreachable" targets, then it's not a jump table.
-  return NumRealEntries + HasUnreachable >= 2;
+  // one real entry and one or more special targets. If there are only
+  // multiple special targets, then it's not a jump table.
+  return NumRealEntries + (HasUnreachable || HasStartAsEntry) >= 2;
 }
 
 void BinaryContext::populateJumpTables() {
@@ -1696,6 +1704,15 @@ bool BinaryContext::shouldEmit(const BinaryFunction &Function) const {
   return HasRelocations || Function.isSimple();
 }
 
+void BinaryContext::dump(const MCInst &Inst) const {
+  if (LLVM_UNLIKELY(!InstPrinter)) {
+    dbgs() << "Cannot dump: InstPrinter is not initialized.\n";
+    return;
+  }
+  InstPrinter->printInst(&Inst, 0, "", *STI, dbgs());
+  dbgs() << "\n";
+}
+
 void BinaryContext::printCFI(raw_ostream &OS, const MCCFIInstruction &Inst) {
   uint32_t Operation = Inst.getOperation();
   switch (Operation) {
@@ -1752,10 +1769,10 @@ void BinaryContext::printCFI(raw_ostream &OS, const MCCFIInstruction &Inst) {
 }
 
 MarkerSymType BinaryContext::getMarkerType(const SymbolRef &Symbol) const {
-  // For aarch64, the ABI defines mapping symbols so we identify data in the
-  // code section (see IHI0056B). $x identifies a symbol starting code or the
-  // end of a data chunk inside code, $d identifies the start of data.
-  if (!isAArch64() || ELFSymbolRef(Symbol).getSize())
+  // For aarch64 and riscv, the ABI defines mapping symbols so we identify
+  // data in the code section (see IHI0056B). $x identifies a symbol starting
+  // code or the end of a data chunk inside code, $d identifies the start of
+  // data.
+  if ((!isAArch64() && !isRISCV()) || ELFSymbolRef(Symbol).getSize())
     return MarkerSymType::NONE;
 
   Expected<StringRef> NameOrError = Symbol.getName();
@@ -1855,6 +1872,10 @@ void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction,
   }
   if (std::optional<uint32_t> Offset = MIB->getOffset(Instruction))
     OS << " # Offset: " << *Offset;
+  if (std::optional<uint32_t> Size = MIB->getSize(Instruction))
+    OS << " # Size: " << *Size;
+  if (MCSymbol *Label = MIB->getLabel(Instruction))
+    OS << " # Label: " << *Label;
 
   MIB->printAnnotations(Instruction, OS);
 
@@ -2259,14 +2280,36 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) {
   MCAsmLayout Layout(Assembler);
   Assembler.layout(Layout);
 
+  // Obtain fragment sizes.
+  std::vector<uint64_t> FragmentSizes;
+  // Main fragment size.
   const uint64_t HotSize =
       Layout.getSymbolOffset(*EndLabel) - Layout.getSymbolOffset(*StartLabel);
-  const uint64_t ColdSize =
-      std::accumulate(SplitLabels.begin(), SplitLabels.end(), 0ULL,
-                      [&](const uint64_t Accu, const LabelRange &Labels) {
-                        return Accu + Layout.getSymbolOffset(*Labels.second) -
-                               Layout.getSymbolOffset(*Labels.first);
-                      });
+  FragmentSizes.push_back(HotSize);
+  // Split fragment sizes.
+  uint64_t ColdSize = 0;
+  for (const auto &Labels : SplitLabels) {
+    uint64_t Size = Layout.getSymbolOffset(*Labels.second) -
+                    Layout.getSymbolOffset(*Labels.first);
+    FragmentSizes.push_back(Size);
+    ColdSize += Size;
+  }
+
+  // Populate new start and end offsets of each basic block.
+  uint64_t FragmentIndex = 0;
+  for (FunctionFragment &FF : BF.getLayout().fragments()) {
+    BinaryBasicBlock *PrevBB = nullptr;
+    for (BinaryBasicBlock *BB : FF) {
+      const uint64_t BBStartOffset = Layout.getSymbolOffset(*(BB->getLabel()));
+      BB->setOutputStartAddress(BBStartOffset);
+      if (PrevBB)
+        PrevBB->setOutputEndAddress(BBStartOffset);
+      PrevBB = BB;
+    }
+    if (PrevBB)
+      PrevBB->setOutputEndAddress(FragmentSizes[FragmentIndex]);
+    FragmentIndex++;
+  }
 
   // Clean-up the effect of the code emission.
   for (const MCSymbol &Symbol : Assembler.symbols()) {
diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index c4129615ac32de564b095a68a555dbf14cf7cf95..9c7905955835bfbd6bd3f8a114b8f791b5226651 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -161,9 +161,17 @@ private:
   /// \p FirstInstr indicates if \p NewLoc represents the first instruction
   /// in a sequence, such as a function fragment.
   ///
+  /// If the \p NewLoc location matches \p PrevLoc, no new line number entry
+  /// will be created, the function will return \p PrevLoc, and \p InstrLabel
+  /// will be ignored. Otherwise, the caller should use \p InstrLabel to mark
+  /// the corresponding instruction by emitting \p InstrLabel before it.
+  /// If \p InstrLabel is set by the caller, its value will be used with
+  /// \p NewLoc. If it was nullptr on entry, it will be populated with a
+  /// pointer to a new temp symbol used with \p NewLoc.
+  ///
   /// Return new current location which is either \p NewLoc or \p PrevLoc.
   SMLoc emitLineInfo(const BinaryFunction &BF, SMLoc NewLoc, SMLoc PrevLoc,
-                     bool FirstInstr);
+                     bool FirstInstr, MCSymbol *&InstrLabel);
 
   /// Use \p FunctionEndSymbol to mark the end of the line info sequence.
   /// Note that it does not automatically result in the insertion of the EOS
@@ -214,6 +222,10 @@ void BinaryEmitter::emitAll(StringRef OrgSecPrefix) {
   }
 
   emitDataSections(OrgSecPrefix);
+
+  // TODO: Enable for Mach-O once BinaryContext::getDataSection supports it.
+  if (BC.isELF())
+    AddressMap::emit(Streamer, BC);
 }
 
 void BinaryEmitter::emitFunctions() {
@@ -305,7 +317,7 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function,
   // tentative layout.
   Section->ensureMinAlignment(Align(opts::AlignFunctions));
 
-  Streamer.emitCodeAlignment(Align(BinaryFunction::MinAlign), &*BC.STI);
+  Streamer.emitCodeAlignment(Function.getMinAlign(), &*BC.STI);
   uint16_t MaxAlignBytes = FF.isSplitFragment()
                                ? Function.getMaxColdAlignmentBytes()
                                : Function.getMaxAlignmentBytes();
@@ -376,7 +388,7 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function,
   }
 
   if (opts::MarkFuncs)
-    Streamer.emitIntValue(BC.MIB->getTrapFillValue(), 1);
+    Streamer.emitBytes(BC.MIB->getTrapFillValue());
 
   // Emit CFI end
   if (Function.hasCFI())
@@ -420,7 +432,7 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF,
     // case, the call site entries in that LSDA have 0 as offset to the landing
     // pad, which the runtime interprets as "no handler". To prevent this,
     // insert some padding.
-    Streamer.emitIntValue(BC.MIB->getTrapFillValue(), 1);
+    Streamer.emitBytes(BC.MIB->getTrapFillValue());
   }
 
   // Track the first emitted instruction with debug info.
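Note that getTrapFillValue() now returns the byte sequence itself, emitted via emitBytes(), rather than a single-byte integer, so targets with multi-byte trap encodings can be supported. A hypothetical override for illustration only (0xCC is the x86 INT3 encoding):

  // Illustrative sketch of a target override, not part of this patch.
  StringRef getTrapFillValue() const override { return StringRef("\xcc", 1); }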
@@ -479,19 +491,39 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF,
       // are relaxable, we should be safe.
     }
 
-    if (!EmitCodeOnly && opts::UpdateDebugSections && BF.getDWARFUnit()) {
-      LastLocSeen = emitLineInfo(BF, Instr.getLoc(), LastLocSeen, FirstInstr);
-      FirstInstr = false;
+    if (!EmitCodeOnly) {
+      // A symbol to be emitted before the instruction to mark its location.
+      MCSymbol *InstrLabel = BC.MIB->getLabel(Instr);
+
+      if (opts::UpdateDebugSections && BF.getDWARFUnit()) {
+        LastLocSeen = emitLineInfo(BF, Instr.getLoc(), LastLocSeen,
+                                   FirstInstr, InstrLabel);
+        FirstInstr = false;
+      }
+
+      // Prepare to tag this location with a label if we need to keep track of
+      // the location of calls/returns for BOLT address translation maps.
+      if (BF.requiresAddressTranslation() && BC.MIB->getOffset(Instr)) {
+        const uint32_t Offset = *BC.MIB->getOffset(Instr);
+        if (!InstrLabel)
+          InstrLabel = BC.Ctx->createTempSymbol();
+        BB->getLocSyms().emplace_back(Offset, InstrLabel);
+      }
+
+      if (InstrLabel)
+        Streamer.emitLabel(InstrLabel);
     }
 
-    // Prepare to tag this location with a label if we need to keep track of
-    // the location of calls/returns for BOLT address translation maps
-    if (!EmitCodeOnly && BF.requiresAddressTranslation() &&
-        BC.MIB->getOffset(Instr)) {
-      const uint32_t Offset = *BC.MIB->getOffset(Instr);
-      MCSymbol *LocSym = BC.Ctx->createTempSymbol();
-      Streamer.emitLabel(LocSym);
-      BB->getLocSyms().emplace_back(Offset, LocSym);
+    // Emit sized NOPs via the MCAsmBackend::writeNopData() interface on x86.
+    // This is a workaround for invalid NOPs handling by the asm/disasm layer.
+    if (BC.MIB->isNoop(Instr) && BC.isX86()) {
+      if (std::optional<uint32_t> Size = BC.MIB->getSize(Instr)) {
+        SmallString<15> Code;
+        raw_svector_ostream VecOS(Code);
+        BC.MAB->writeNopData(VecOS, *Size, BC.STI.get());
+        Streamer.emitBytes(Code);
+        continue;
+      }
     }
 
     Streamer.emitInstruction(Instr, *BC.STI);
@@ -654,7 +686,8 @@ void BinaryEmitter::emitConstantIslands(BinaryFunction &BF, bool EmitColdPart,
 }
 
 SMLoc BinaryEmitter::emitLineInfo(const BinaryFunction &BF, SMLoc NewLoc,
-                                  SMLoc PrevLoc, bool FirstInstr) {
+                                  SMLoc PrevLoc, bool FirstInstr,
+                                  MCSymbol *&InstrLabel) {
   DWARFUnit *FunctionCU = BF.getDWARFUnit();
   const DWARFDebugLine::LineTable *FunctionLineTable = BF.getDWARFLineTable();
   assert(FunctionCU && "cannot emit line info for function without CU");
@@ -704,12 +737,12 @@ SMLoc BinaryEmitter::emitLineInfo(const BinaryFunction &BF, SMLoc NewLoc,
   const MCDwarfLoc &DwarfLoc = BC.Ctx->getCurrentDwarfLoc();
   BC.Ctx->clearDwarfLocSeen();
 
-  MCSymbol *LineSym = BC.Ctx->createTempSymbol();
-  Streamer.emitLabel(LineSym);
+  if (!InstrLabel)
+    InstrLabel = BC.Ctx->createTempSymbol();
 
   BC.getDwarfLineTable(FunctionUnitIndex)
       .getMCLineSections()
-      .addLineEntry(MCDwarfLineEntry(LineSym, DwarfLoc),
+      .addLineEntry(MCDwarfLineEntry(InstrLabel, DwarfLoc),
                     Streamer.getCurrentSectionOnly());
 
   return NewLoc;
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 5b44a76dc8c385fa3d8f3b492b5008240e554acb..49a9dd902120dc3f5cf22c9980bc1d9c096e517c 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -25,7 +25,6 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Demangle/Demangle.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/MC/MCExpr.h"
@@ -59,6 +58,7 @@
 extern cl::OptionCategory BoltRelocCategory;
 
 extern cl::opt<bool> EnableBAT;
 extern cl::opt<bool> Instrument;
+extern cl::opt<bool> KeepNops;
 extern cl::opt<bool> StrictMode;
 extern cl::opt<bool> UpdateDebugSections;
 extern cl::opt<unsigned> Verbosity;
@@ -110,6 +110,13 @@ cl::opt<bool>
     cl::desc("try to preserve basic block alignment"),
     cl::cat(BoltOptCategory));
 
+static cl::opt<bool> PrintOutputAddressRange(
+    "print-output-address-range",
+    cl::desc(
+        "print output address range for each basic block in the function when "
+        "BinaryFunction::print is called"),
+    cl::Hidden, cl::cat(BoltOptCategory));
+
 cl::opt<bool>
     PrintDynoStats("dyno-stats",
                    cl::desc("print execution info based on profile"),
@@ -165,8 +172,6 @@ bool shouldPrint(const BinaryFunction &Function) {
 namespace llvm {
 namespace bolt {
 
-constexpr unsigned BinaryFunction::MinAlign;
-
 template <typename R> static bool emptyRange(const R &Range) {
   return Range.begin() == Range.end();
 }
@@ -325,7 +330,8 @@ void BinaryFunction::markUnreachableBlocks() {
 // Any unnecessary fallthrough jumps revealed after calling eraseInvalidBBs
 // will be cleaned up by fixBranches().
-std::pair<unsigned, uint64_t> BinaryFunction::eraseInvalidBBs() {
+std::pair<unsigned, uint64_t>
+BinaryFunction::eraseInvalidBBs(const MCCodeEmitter *Emitter) {
   DenseSet<const BinaryBasicBlock *> InvalidBBs;
   unsigned Count = 0;
   uint64_t Bytes = 0;
@@ -334,7 +340,7 @@ std::pair<unsigned, uint64_t> BinaryFunction::eraseInvalidBBs() {
       assert(!isEntryPoint(*BB) && "all entry blocks must be valid");
       InvalidBBs.insert(BB);
       ++Count;
-      Bytes += BC.computeCodeSize(BB->begin(), BB->end());
+      Bytes += BC.computeCodeSize(BB->begin(), BB->end(), Emitter);
     }
   }
 
@@ -433,8 +439,6 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) {
   OS << "\n  IsSplit     : " << isSplit();
   OS << "\n  BB Count    : " << size();
 
-  if (HasFixedIndirectBranch)
-    OS << "\n  HasFixedIndirectBranch : true";
   if (HasUnknownControlFlow)
     OS << "\n  Unknown CF  : true";
   if (getPersonalityFunction())
@@ -515,6 +519,11 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) {
     OS << BB->getName() << " (" << BB->size()
        << " instructions, align : " << BB->getAlignment() << ")\n";
 
+    if (opts::PrintOutputAddressRange)
+      OS << formatv("  Output Address Range: [{0:x}, {1:x}) ({2} bytes)\n",
+                    BB->getOutputAddressRange().first,
+                    BB->getOutputAddressRange().second, BB->getOutputSize());
+
     if (isEntryPoint(*BB)) {
       if (MCSymbol *EntrySymbol = getSecondaryEntryPointSymbol(*BB))
         OS << "  Secondary Entry Point: " << EntrySymbol->getName() << '\n';
@@ -536,7 +545,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) {
     if (BB->getCFIState() >= 0)
       OS << "  CFI State : " << BB->getCFIState() << '\n';
     if (opts::EnableBAT) {
-      OS << "  Input offset: " << Twine::utohexstr(BB->getInputOffset())
+      OS << "  Input offset: 0x" << Twine::utohexstr(BB->getInputOffset())
          << "\n";
     }
     if (!BB->pred_empty()) {
@@ -1119,7 +1128,7 @@ void BinaryFunction::handleIndirectBranch(MCInst &Instruction, uint64_t Size,
     Instruction.clear();
     MIB->createUncondBranch(Instruction, TargetSymbol, BC.Ctx.get());
     TakenBranches.emplace_back(Offset, IndirectTarget - getAddress());
-    HasFixedIndirectBranch = true;
+    addEntryPointAtOffset(IndirectTarget - getAddress());
   } else {
     MIB->convertJmpToTailCall(Instruction);
     BC.addInterproceduralReference(this, IndirectTarget);
@@ -1174,6 +1183,13 @@ bool BinaryFunction::disassemble() {
   // basic block.
   Labels[0] = Ctx->createNamedTempSymbol("BB0");
 
+  // Map offsets in the function to a label that should always point to the
+  // corresponding instruction. This is used for labels that shouldn't point to
+  // the start of a basic block but always to a specific instruction. This is
This is + // used, for example, on RISC-V where %pcrel_lo relocations point to the + // corresponding %pcrel_hi. + LabelsMapType InstructionLabels; + uint64_t Size = 0; // instruction size for (uint64_t Offset = 0; Offset < getSize(); Offset += Size) { MCInst Instruction; @@ -1330,9 +1346,23 @@ bool BinaryFunction::disassemble() { ItrE = Relocations.lower_bound(Offset + Size); Itr != ItrE; ++Itr) { const Relocation &Relocation = Itr->second; + MCSymbol *Symbol = Relocation.Symbol; + + if (Relocation::isInstructionReference(Relocation.Type)) { + uint64_t RefOffset = Relocation.Value - getAddress(); + LabelsMapType::iterator LI = InstructionLabels.find(RefOffset); + + if (LI == InstructionLabels.end()) { + Symbol = BC.Ctx->createNamedTempSymbol(); + InstructionLabels.emplace(RefOffset, Symbol); + } else { + Symbol = LI->second; + } + } + int64_t Value = Relocation.Value; const bool Result = BC.MIB->replaceImmWithSymbolRef( - Instruction, Relocation.Symbol, Relocation.Addend, Ctx.get(), Value, + Instruction, Symbol, Relocation.Addend, Ctx.get(), Value, Relocation.Type); (void)Result; assert(Result && "cannot replace immediate with relocation"); @@ -1361,12 +1391,19 @@ add_instruction: // NOTE: disassembly loses the correct size information for noops. // E.g. nopw 0x0(%rax,%rax,1) is 9 bytes, but re-encoded it's only // 5 bytes. Preserve the size info using annotations. - MIB->addAnnotation(Instruction, "Size", static_cast<uint32_t>(Size)); + MIB->setSize(Instruction, Size); } addInstruction(Offset, std::move(Instruction)); } + for (auto [Offset, Label] : InstructionLabels) { + InstrMapType::iterator II = Instructions.find(Offset); + assert(II != Instructions.end() && "reference to non-existing instruction"); + + BC.MIB->setLabel(II->second, Label); + } + // Reset symbolizer for the disassembler. BC.SymbolicDisAsm->setSymbolizer(nullptr); @@ -1866,9 +1903,6 @@ bool BinaryFunction::postProcessIndirectBranches( LastIndirectJumpBB->updateJumpTableSuccessors(); } - if (HasFixedIndirectBranch) - return false; - // Validate that all data references to function offsets are claimed by // recognized jump tables. Register externally referenced blocks as entry // points. @@ -1973,7 +2007,7 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) { } } if (LastNonNop && !MIB->getOffset(*LastNonNop)) - MIB->setOffset(*LastNonNop, static_cast<uint32_t>(Offset), AllocatorId); + MIB->setOffset(*LastNonNop, static_cast<uint32_t>(Offset)); }; for (auto I = Instructions.begin(), E = Instructions.end(); I != E; ++I) { @@ -1996,7 +2030,7 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) { if (MIB->isNoop(Instr) && !MIB->getOffset(Instr)) { // If "Offset" annotation is not present, set it and mark the nop for // deletion. - MIB->setOffset(Instr, static_cast<uint32_t>(Offset), AllocatorId); + MIB->setOffset(Instr, static_cast<uint32_t>(Offset)); // Annotate ordinary nops, so we can safely delete them if required. 
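The InstructionLabels map above implements a two-pass scheme: during disassembly, at most one temporary symbol is created per referenced offset, and after the loop every symbol is bound to the instruction at that offset via setLabel. A minimal standalone sketch of the same idea, with a hypothetical Insn struct and string labels standing in for MCInst and MCSymbol:

#include <cassert>
#include <cstdint>
#include <map>
#include <string>

struct Insn {
  std::string Mnemonic;
  std::string Label; // empty until a label is bound
};

int main() {
  std::map<uint64_t, Insn> Instructions = {{0x0, {"auipc", ""}},
                                           {0x4, {"lw", ""}}};
  // Pass 1: a %pcrel_lo at offset 0x4 references the %pcrel_hi instruction
  // at offset 0x0; create at most one label per referenced offset.
  std::map<uint64_t, std::string> InstructionLabels;
  const uint64_t RefOffset = 0x0;
  auto LI = InstructionLabels.find(RefOffset);
  if (LI == InstructionLabels.end())
    LI = InstructionLabels.emplace(RefOffset, ".Ltmp0").first;
  // Pass 2: bind every collected label to its instruction.
  for (const auto &[Offset, Label] : InstructionLabels) {
    auto II = Instructions.find(Offset);
    assert(II != Instructions.end() && "reference to non-existing instruction");
    II->second.Label = Label;
  }
  assert(Instructions.at(0x0).Label == ".Ltmp0");
  return 0;
}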
MIB->addAnnotation(Instr, "NOP", static_cast<uint64_t>(1), AllocatorId); } @@ -2221,8 +2255,8 @@ void BinaryFunction::calculateMacroOpFusionStats() { << Twine::utohexstr(getAddress() + Offset) << " in function " << *this << "; executed " << BB.getKnownExecutionCount() << " times.\n"); - ++BC.MissedMacroFusionPairs; - BC.MissedMacroFusionExecCount += BB.getKnownExecutionCount(); + ++BC.Stats.MissedMacroFusionPairs; + BC.Stats.MissedMacroFusionExecCount += BB.getKnownExecutionCount(); } } @@ -2277,6 +2311,13 @@ void BinaryFunction::removeConditionalTailCalls() { assert(CTCTargetLabel && "symbol expected for conditional tail call"); MCInst TailCallInstr; BC.MIB->createTailCall(TailCallInstr, CTCTargetLabel, BC.Ctx.get()); + + // Move offset from CTCInstr to TailCallInstr. + if (const std::optional<uint32_t> Offset = BC.MIB->getOffset(*CTCInstr)) { + BC.MIB->setOffset(TailCallInstr, *Offset); + BC.MIB->clearOffset(*CTCInstr); + } + // Link new BBs to the original input offset of the BB where the CTC // is, so we can map samples recorded in new BBs back to the original BB // seen in the input binary (if using BAT) @@ -2849,6 +2890,14 @@ bool BinaryFunction::requiresAddressTranslation() const { return opts::EnableBAT || hasSDTMarker() || hasPseudoProbe(); } +bool BinaryFunction::requiresAddressMap() const { + if (isInjected()) + return false; + + return opts::UpdateDebugSections || isMultiEntry() || + requiresAddressTranslation(); +} + uint64_t BinaryFunction::getInstructionCount() const { uint64_t Count = 0; for (const BinaryBasicBlock &BB : blocks()) @@ -3143,6 +3192,10 @@ void BinaryFunction::dumpGraphToFile(std::string Filename) const { } bool BinaryFunction::validateCFG() const { + // Skip the validation of CFG after it is finalized. + if (CurrentState == State::CFG_Finalized) + return true; + bool Valid = true; for (BinaryBasicBlock *BB : BasicBlocks) Valid &= BB->validateSuccessorInvariants(); @@ -3322,7 +3375,7 @@ void BinaryFunction::propagateGnuArgsSizeInfo( } } else if (BC.MIB->isInvoke(Instr)) { // Add the value of GNU_args_size as an extra operand to invokes. 
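The GNU_args_size propagation that follows keeps the value as an annotation on each invoke and only materializes a CFI directive where the value actually changes within a fragment. A standalone sketch of that run-length style of emission (the values are purely illustrative):

#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

int main() {
  // One entry per invoke, in layout order; nullopt = no annotation present.
  std::vector<std::optional<int64_t>> GnuArgsSize = {16, 16, std::nullopt, 0};
  int64_t CurrentGnuArgsSize = 0; // reset at the start of each fragment
  for (const std::optional<int64_t> &S : GnuArgsSize) {
    if (!S || *S == CurrentGnuArgsSize)
      continue; // no CFI needed: value unchanged at this call site
    std::cout << "emit DW_CFA_GNU_args_size " << *S << '\n';
    CurrentGnuArgsSize = *S;
  }
  return 0;
}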
- BC.MIB->addGnuArgsSize(Instr, CurrentGnuArgsSize, AllocId); + BC.MIB->addGnuArgsSize(Instr, CurrentGnuArgsSize); } ++II; } @@ -4016,7 +4069,7 @@ void BinaryFunction::calculateLoopInfo() { } } -void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { +void BinaryFunction::updateOutputValues(const BOLTLinker &Linker) { if (!isEmitted()) { assert(!isInjected() && "injected function should be emitted"); setOutputAddress(getAddress()); @@ -4024,16 +4077,17 @@ void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { return; } - const uint64_t BaseAddress = getCodeSection()->getOutputAddress(); + const auto SymbolInfo = Linker.lookupSymbolInfo(getSymbol()->getName()); + assert(SymbolInfo && "Cannot find function entry symbol"); + setOutputAddress(SymbolInfo->Address); + setOutputSize(SymbolInfo->Size); + if (BC.HasRelocations || isInjected()) { - const uint64_t StartOffset = Layout.getSymbolOffset(*getSymbol()); - const uint64_t EndOffset = Layout.getSymbolOffset(*getFunctionEndLabel()); - setOutputAddress(BaseAddress + StartOffset); - setOutputSize(EndOffset - StartOffset); if (hasConstantIsland()) { - const uint64_t DataOffset = - Layout.getSymbolOffset(*getFunctionConstantIslandLabel()); - setOutputDataAddress(BaseAddress + DataOffset); + const auto DataAddress = + Linker.lookupSymbol(getFunctionConstantIslandLabel()->getName()); + assert(DataAddress && "Cannot find function CI symbol"); + setOutputDataAddress(*DataAddress); for (auto It : Islands->Offsets) { const uint64_t OldOffset = It.first; BinaryData *BD = BC.getBinaryDataAtAddress(getAddress() + OldOffset); @@ -4041,8 +4095,11 @@ void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { continue; MCSymbol *Symbol = It.second; - const uint64_t NewOffset = Layout.getSymbolOffset(*Symbol); - BD->setOutputLocation(*getCodeSection(), NewOffset); + const auto NewAddress = Linker.lookupSymbol(Symbol->getName()); + assert(NewAddress && "Cannot find CI symbol"); + auto &Section = *getCodeSection(); + const auto NewOffset = *NewAddress - Section.getOutputAddress(); + BD->setOutputLocation(Section, NewOffset); } } if (isSplit()) { @@ -4052,7 +4109,6 @@ void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { // If fragment is empty, cold section might not exist if (FF.empty() && ColdSection.getError()) continue; - const uint64_t ColdBaseAddress = ColdSection->getOutputAddress(); const MCSymbol *ColdStartSymbol = getSymbol(FF.getFragmentNum()); // If fragment is empty, symbol might have not been emitted @@ -4061,31 +4117,24 @@ void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { continue; assert(ColdStartSymbol && ColdStartSymbol->isDefined() && "split function should have defined cold symbol"); - const MCSymbol *ColdEndSymbol = - getFunctionEndLabel(FF.getFragmentNum()); - assert(ColdEndSymbol && ColdEndSymbol->isDefined() && - "split function should have defined cold end symbol"); - const uint64_t ColdStartOffset = - Layout.getSymbolOffset(*ColdStartSymbol); - const uint64_t ColdEndOffset = Layout.getSymbolOffset(*ColdEndSymbol); - FF.setAddress(ColdBaseAddress + ColdStartOffset); - FF.setImageSize(ColdEndOffset - ColdStartOffset); + const auto ColdStartSymbolInfo = + Linker.lookupSymbolInfo(ColdStartSymbol->getName()); + assert(ColdStartSymbolInfo && "Cannot find cold start symbol"); + FF.setAddress(ColdStartSymbolInfo->Address); + FF.setImageSize(ColdStartSymbolInfo->Size); if (hasConstantIsland()) { - const uint64_t DataOffset = - 
Layout.getSymbolOffset(*getFunctionColdConstantIslandLabel()); - setOutputColdDataAddress(ColdBaseAddress + DataOffset); + const auto DataAddress = Linker.lookupSymbol( + getFunctionColdConstantIslandLabel()->getName()); + assert(DataAddress && "Cannot find cold CI symbol"); + setOutputColdDataAddress(*DataAddress); } } } - } else { - setOutputAddress(getAddress()); - setOutputSize(Layout.getSymbolOffset(*getFunctionEndLabel())); } // Update basic block output ranges for the debug info, if we have // secondary entry points in the symbol table to update or if writing BAT. - if (!opts::UpdateDebugSections && !isMultiEntry() && - !requiresAddressTranslation()) + if (!requiresAddressMap()) return; // Output ranges should match the input if the body hasn't changed. @@ -4114,15 +4163,24 @@ void BinaryFunction::updateOutputValues(const MCAsmLayout &Layout) { assert(FragmentBaseAddress == getOutputAddress()); } - const uint64_t BBOffset = Layout.getSymbolOffset(*BB->getLabel()); - const uint64_t BBAddress = FragmentBaseAddress + BBOffset; + // Injected functions likely will fail lookup, as they have no + // input range. Just assign the BB the output address of the + // function. + auto MaybeBBAddress = BC.getIOAddressMap().lookup(BB->getLabel()); + const uint64_t BBAddress = MaybeBBAddress ? *MaybeBBAddress + : BB->isSplit() ? FF.getAddress() + : getOutputAddress(); BB->setOutputStartAddress(BBAddress); - if (PrevBB) + if (PrevBB) { + assert(PrevBB->getOutputAddressRange().first <= BBAddress && + "Bad output address for basic block."); + assert((PrevBB->getOutputAddressRange().first != BBAddress || + !hasInstructions() || !PrevBB->getNumNonPseudos()) && + "Bad output address for basic block."); PrevBB->setOutputEndAddress(BBAddress); + } PrevBB = BB; - - BB->updateOutputValues(Layout); } PrevBB->setOutputEndAddress(PrevBB->isSplit() @@ -4175,9 +4233,8 @@ uint64_t BinaryFunction::translateInputToOutputAddress(uint64_t Address) const { // Check if the address is associated with an instruction that is tracked // by address translation. - auto KV = InputOffsetToAddressMap.find(Address - getAddress()); - if (KV != InputOffsetToAddressMap.end()) - return KV->second; + if (auto OutputAddress = BC.getIOAddressMap().lookup(Address)) + return *OutputAddress; // FIXME: #18950828 - we rely on relative offsets inside basic blocks to stay // intact. Instead we can use pseudo instructions and/or annotations. @@ -4299,10 +4356,11 @@ MCInst *BinaryFunction::getInstructionAtOffset(uint64_t Offset) { } if (MCInst *LastInstr = BB->getLastNonPseudoInstr()) { - const uint32_t Size = - BC.MIB->getAnnotationWithDefault(*LastInstr, "Size"); - if (BB->getEndOffset() - Offset == Size) - return LastInstr; + if (std::optional Size = BC.MIB->getSize(*LastInstr)) { + if (BB->getEndOffset() - Offset == Size) { + return LastInstr; + } + } } return nullptr; @@ -4471,7 +4529,7 @@ void BinaryFunction::addRelocation(uint64_t Address, MCSymbol *Symbol, uint64_t Offset = Address - getAddress(); LLVM_DEBUG(dbgs() << "BOLT-DEBUG: addRelocation in " << formatv("{0}@{1:x} against {2}\n", *this, Offset, - Symbol->getName())); + (Symbol ? Symbol->getName() : ""))); bool IsCI = BC.isAArch64() && isInConstantIsland(Address); std::map &Rels = IsCI ? 
Islands->Relocations : Relocations; diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt index a4612fb93f8c349e3d1f9c39fbf1373b95042f93..c913179ebcc517ef17650ea5de30a48a12c3185f 100644 --- a/bolt/lib/Core/CMakeLists.txt +++ b/bolt/lib/Core/CMakeLists.txt @@ -11,6 +11,7 @@ set(LLVM_LINK_COMPONENTS ) add_llvm_library(LLVMBOLTCore + AddressMap.cpp BinaryBasicBlock.cpp BinaryContext.cpp BinaryData.cpp diff --git a/bolt/lib/Core/DynoStats.cpp b/bolt/lib/Core/DynoStats.cpp index ee40eefd6f7c24e1691ae8b7156d7e5832f9985f..5dd55e13e5b31fdea55eb3071d8e62f8c5368b54 100644 --- a/bolt/lib/Core/DynoStats.cpp +++ b/bolt/lib/Core/DynoStats.cpp @@ -215,10 +215,10 @@ DynoStats getDynoStats(BinaryFunction &BF) { } } - if (BC.MIB->isStore(Instr)) { + if (BC.MIB->mayStore(Instr)) { Stats[DynoStats::STORES] += BBExecutionCount; } - if (BC.MIB->isLoad(Instr)) { + if (BC.MIB->mayLoad(Instr)) { Stats[DynoStats::LOADS] += BBExecutionCount; } if (!BC.MIB->isCall(Instr)) diff --git a/bolt/lib/Core/Exceptions.cpp b/bolt/lib/Core/Exceptions.cpp index 667f1757e13d7101859025ec9b67bd25a865b099..b0bfa7fc052085acf540b0692b492ee7abc735e1 100644 --- a/bolt/lib/Core/Exceptions.cpp +++ b/bolt/lib/Core/Exceptions.cpp @@ -112,13 +112,18 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, uint64_t Offset = getLSDAAddress() - LSDASectionAddress; assert(Data.isValidOffset(Offset) && "wrong LSDA address"); - uint8_t LPStartEncoding = Data.getU8(&Offset); - uint64_t LPStart = 0; - // Convert to offset if LPStartEncoding is typed absptr DW_EH_PE_absptr - if (std::optional MaybeLPStart = Data.getEncodedPointer( - &Offset, LPStartEncoding, Offset + LSDASectionAddress)) - LPStart = (LPStartEncoding && 0xFF == 0) ? *MaybeLPStart - : *MaybeLPStart - Address; + const uint8_t LPStartEncoding = Data.getU8(&Offset); + uint64_t LPStart = Address; + if (LPStartEncoding != dwarf::DW_EH_PE_omit) { + std::optional MaybeLPStart = Data.getEncodedPointer( + &Offset, LPStartEncoding, Offset + LSDASectionAddress); + if (!MaybeLPStart) { + errs() << "BOLT-ERROR: unsupported LPStartEncoding: " + << (unsigned)LPStartEncoding << '\n'; + exit(1); + } + LPStart = *MaybeLPStart; + } const uint8_t TTypeEncoding = Data.getU8(&Offset); LSDATypeEncoding = TTypeEncoding; @@ -175,30 +180,13 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, uint64_t LandingPad = *Data.getEncodedPointer( &CallSitePtr, CallSiteEncoding, CallSitePtr + LSDASectionAddress); uint64_t ActionEntry = Data.getULEB128(&CallSitePtr); - - uint64_t LPOffset = LPStart + LandingPad; - uint64_t LPAddress = Address + LPOffset; - - // Verify if landing pad code is located outside current function - // Support landing pad to builtin_unreachable - if (LPAddress < Address || LPAddress > Address + getSize()) { - BinaryFunction *Fragment = - BC.getBinaryFunctionContainingAddress(LPAddress); - assert(Fragment != nullptr && - "BOLT-ERROR: cannot find landing pad fragment"); - BC.addInterproceduralReference(this, Fragment->getAddress()); - BC.processInterproceduralReferences(); - assert(isParentOrChildOf(*Fragment) && - "BOLT-ERROR: cannot have landing pads in different functions"); - setHasIndirectTargetToSplitFragment(true); - BC.addFragmentsToSkip(this); - return; - } + if (LandingPad) + LandingPad += LPStart; if (opts::PrintExceptions) { outs() << "Call Site: [0x" << Twine::utohexstr(RangeBase + Start) << ", 0x" << Twine::utohexstr(RangeBase + Start + Length) - << "); landing pad: 0x" << Twine::utohexstr(LPOffset) + << "); landing pad: 0x" << 
Twine::utohexstr(LandingPad) << "; action entry: 0x" << Twine::utohexstr(ActionEntry) << "\n"; outs() << " current offset is " << (CallSitePtr - CallSiteTableStart) << '\n'; @@ -206,7 +194,24 @@ void BinaryFunction::parseLSDA(ArrayRef LSDASectionData, // Create a handler entry if necessary. MCSymbol *LPSymbol = nullptr; - if (LPOffset) { + if (LandingPad) { + // Verify if landing pad code is located outside current function + // Support landing pad to builtin_unreachable + if (LandingPad < Address || LandingPad > Address + getSize()) { + BinaryFunction *Fragment = + BC.getBinaryFunctionContainingAddress(LandingPad); + assert(Fragment != nullptr && + "BOLT-ERROR: cannot find landing pad fragment"); + BC.addInterproceduralReference(this, Fragment->getAddress()); + BC.processInterproceduralReferences(); + assert(isParentOrChildOf(*Fragment) && + "BOLT-ERROR: cannot have landing pads in different functions"); + setHasIndirectTargetToSplitFragment(true); + BC.addFragmentsToSkip(this); + return; + } + + const uint64_t LPOffset = LandingPad - getAddress(); if (!getInstructionAtOffset(LPOffset)) { if (opts::Verbosity >= 1) errs() << "BOLT-WARNING: landing pad " << Twine::utohexstr(LPOffset) diff --git a/bolt/lib/Core/HashUtilities.cpp b/bolt/lib/Core/HashUtilities.cpp index 0752eaeabef85069cfa3a6dbebe7326a49446630..88f01e4f936d30c2a34b415b911a91c12bb8393f 100644 --- a/bolt/lib/Core/HashUtilities.cpp +++ b/bolt/lib/Core/HashUtilities.cpp @@ -130,5 +130,43 @@ std::string hashBlock(BinaryContext &BC, const BinaryBasicBlock &BB, return HashString; } +/// A "loose" hash of a basic block to use with the stale profile matching. The +/// computed value will be the same for blocks with minor changes (such as +/// reordering of instructions or using different operands) but may result in +/// collisions that need to be resolved by a stronger hashing. +std::string hashBlockLoose(BinaryContext &BC, const BinaryBasicBlock &BB) { + // The hash is computed by creating a string of all lexicographically ordered + // instruction opcodes, which is then hashed with std::hash. + std::set Opcodes; + for (const MCInst &Inst : BB) { + // Skip pseudo instructions and nops. + if (BC.MIB->isPseudo(Inst) || BC.MIB->isNoop(Inst)) + continue; + + // Ignore unconditional jumps, as they can be added / removed as a result + // of basic block reordering. + if (BC.MIB->isUnconditionalBranch(Inst)) + continue; + + // Do not distinguish different types of conditional jumps. 
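hashBlockLoose, as its comment says, is deliberately order-insensitive: mnemonics are collected into an ordered set, nops and unconditional jumps are dropped, and every conditional jump (handled just below) collapses to the same token. A self-contained sketch of the whole function, with a toy Insn/Kind pair standing in for BOLT's MCInst queries:

#include <algorithm>
#include <cctype>
#include <iostream>
#include <set>
#include <string>
#include <vector>

enum class Kind { Regular, CondBranch, UncondBranch, Nop };
struct Insn { std::string Mnemonic; Kind K; };

// "Loose" hash: stable under instruction reordering and branch inversion,
// at the cost of collisions a stronger hash must later resolve.
std::string hashBlockLoose(const std::vector<Insn> &Block) {
  std::set<std::string> Opcodes;
  for (const Insn &I : Block) {
    if (I.K == Kind::Nop || I.K == Kind::UncondBranch)
      continue;
    if (I.K == Kind::CondBranch) {
      Opcodes.insert("JMP");
      continue;
    }
    std::string M = I.Mnemonic;
    M.erase(std::remove_if(M.begin(), M.end(),
                           [](unsigned char C) { return std::isspace(C); }),
            M.end());
    Opcodes.insert(M);
  }
  std::string Hash;
  for (const std::string &Op : Opcodes)
    Hash += Op;
  return Hash;
}

int main() {
  std::vector<Insn> A = {{"mov", Kind::Regular}, {"add", Kind::Regular},
                         {"je", Kind::CondBranch}};
  std::vector<Insn> B = {{"add", Kind::Regular}, {"jne", Kind::CondBranch},
                         {"mov", Kind::Regular}};
  std::cout << (hashBlockLoose(A) == hashBlockLoose(B)) << '\n'; // prints 1
}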
+ if (BC.MIB->isConditionalBranch(Inst)) { + Opcodes.insert("JMP"); + continue; + } + + std::string Mnemonic = BC.InstPrinter->getMnemonic(&Inst).first; + Mnemonic.erase( + std::remove_if(Mnemonic.begin(), Mnemonic.end(), + [](unsigned char ch) { return std::isspace(ch); }), + Mnemonic.end()); + Opcodes.insert(Mnemonic); + } + + std::string HashString; + for (const std::string &Opcode : Opcodes) + HashString.append(Opcode); + return HashString; +} + } // namespace bolt } // namespace llvm diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp index 027cef1063ee3ebe9181423156e804739f52305e..0cafd3d20ffb95a33c750459ee4166e2b372377e 100644 --- a/bolt/lib/Core/MCPlusBuilder.cpp +++ b/bolt/lib/Core/MCPlusBuilder.cpp @@ -120,7 +120,7 @@ bool MCPlusBuilder::equals(const MCTargetExpr &A, const MCTargetExpr &B, llvm_unreachable("target-specific expressions are unsupported"); } -void MCPlusBuilder::setTailCall(MCInst &Inst) { +void MCPlusBuilder::setTailCall(MCInst &Inst) const { assert(!hasAnnotation(Inst, MCAnnotation::kTailCall)); setAnnotationOpValue(Inst, MCAnnotation::kTailCall, true); } @@ -149,7 +149,7 @@ std::optional MCPlusBuilder::getEHInfo(const MCInst &Inst) const { static_cast(*Action)); } -void MCPlusBuilder::addEHInfo(MCInst &Inst, const MCLandingPad &LP) { +void MCPlusBuilder::addEHInfo(MCInst &Inst, const MCLandingPad &LP) const { if (isCall(Inst)) { assert(!getEHInfo(Inst)); setAnnotationOpValue(Inst, MCAnnotation::kEHLandingPad, @@ -159,7 +159,7 @@ void MCPlusBuilder::addEHInfo(MCInst &Inst, const MCLandingPad &LP) { } } -bool MCPlusBuilder::updateEHInfo(MCInst &Inst, const MCLandingPad &LP) { +bool MCPlusBuilder::updateEHInfo(MCInst &Inst, const MCLandingPad &LP) const { if (!isInvoke(Inst)) return false; @@ -178,13 +178,12 @@ int64_t MCPlusBuilder::getGnuArgsSize(const MCInst &Inst) const { return *Value; } -void MCPlusBuilder::addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize, - AllocatorIdTy AllocId) { +void MCPlusBuilder::addGnuArgsSize(MCInst &Inst, int64_t GnuArgsSize) const { assert(GnuArgsSize >= 0 && "cannot set GNU_args_size to negative value"); assert(getGnuArgsSize(Inst) == -1LL && "GNU_args_size already set"); assert(isInvoke(Inst) && "GNU_args_size can only be set for invoke"); - setAnnotationOpValue(Inst, MCAnnotation::kGnuArgsSize, GnuArgsSize, AllocId); + setAnnotationOpValue(Inst, MCAnnotation::kGnuArgsSize, GnuArgsSize); } uint64_t MCPlusBuilder::getJumpTable(const MCInst &Inst) const { @@ -203,12 +202,12 @@ bool MCPlusBuilder::setJumpTable(MCInst &Inst, uint64_t Value, uint16_t IndexReg, AllocatorIdTy AllocId) { if (!isIndirectBranch(Inst)) return false; - setAnnotationOpValue(Inst, MCAnnotation::kJumpTable, Value, AllocId); + setAnnotationOpValue(Inst, MCAnnotation::kJumpTable, Value); getOrCreateAnnotationAs(Inst, "JTIndexReg", AllocId) = IndexReg; return true; } -bool MCPlusBuilder::unsetJumpTable(MCInst &Inst) { +bool MCPlusBuilder::unsetJumpTable(MCInst &Inst) const { if (!getJumpTable(Inst)) return false; removeAnnotation(Inst, MCAnnotation::kJumpTable); @@ -225,7 +224,7 @@ MCPlusBuilder::getConditionalTailCall(const MCInst &Inst) const { return static_cast(*Value); } -bool MCPlusBuilder::setConditionalTailCall(MCInst &Inst, uint64_t Dest) { +bool MCPlusBuilder::setConditionalTailCall(MCInst &Inst, uint64_t Dest) const { if (!isConditionalBranch(Inst)) return false; @@ -233,7 +232,7 @@ bool MCPlusBuilder::setConditionalTailCall(MCInst &Inst, uint64_t Dest) { return true; } -bool MCPlusBuilder::unsetConditionalTailCall(MCInst 
&Inst) { +bool MCPlusBuilder::unsetConditionalTailCall(MCInst &Inst) const { if (!getConditionalTailCall(Inst)) return false; removeAnnotation(Inst, MCAnnotation::kConditionalTailCall); @@ -255,63 +254,76 @@ uint32_t MCPlusBuilder::getOffsetWithDefault(const MCInst &Inst, return Default; } -bool MCPlusBuilder::setOffset(MCInst &Inst, uint32_t Offset, - AllocatorIdTy AllocatorId) { - setAnnotationOpValue(Inst, MCAnnotation::kOffset, Offset, AllocatorId); +bool MCPlusBuilder::setOffset(MCInst &Inst, uint32_t Offset) const { + setAnnotationOpValue(Inst, MCAnnotation::kOffset, Offset); return true; } -bool MCPlusBuilder::clearOffset(MCInst &Inst) { +bool MCPlusBuilder::clearOffset(MCInst &Inst) const { if (!hasAnnotation(Inst, MCAnnotation::kOffset)) return false; removeAnnotation(Inst, MCAnnotation::kOffset); return true; } -bool MCPlusBuilder::hasAnnotation(const MCInst &Inst, unsigned Index) const { - const MCInst *AnnotationInst = getAnnotationInst(Inst); - if (!AnnotationInst) - return false; +MCSymbol *MCPlusBuilder::getLabel(const MCInst &Inst) const { + if (auto Label = tryGetAnnotationAs(Inst, MCAnnotation::kLabel)) + return *Label; + return nullptr; +} + +bool MCPlusBuilder::setLabel(MCInst &Inst, MCSymbol *Label) { + getOrCreateAnnotationAs(Inst, MCAnnotation::kLabel) = Label; + return true; +} +std::optional MCPlusBuilder::getSize(const MCInst &Inst) const { + if (std::optional Value = + getAnnotationOpValue(Inst, MCAnnotation::kSize)) + return static_cast(*Value); + return std::nullopt; +} + +void MCPlusBuilder::setSize(MCInst &Inst, uint32_t Size) const { + setAnnotationOpValue(Inst, MCAnnotation::kSize, Size); +} + +bool MCPlusBuilder::hasAnnotation(const MCInst &Inst, unsigned Index) const { return (bool)getAnnotationOpValue(Inst, Index); } -bool MCPlusBuilder::removeAnnotation(MCInst &Inst, unsigned Index) { - MCInst *AnnotationInst = getAnnotationInst(Inst); - if (!AnnotationInst) +bool MCPlusBuilder::removeAnnotation(MCInst &Inst, unsigned Index) const { + std::optional FirstAnnotationOp = getFirstAnnotationOpIndex(Inst); + if (!FirstAnnotationOp) return false; - for (int I = AnnotationInst->getNumOperands() - 1; I >= 0; --I) { - int64_t ImmValue = AnnotationInst->getOperand(I).getImm(); + for (unsigned I = Inst.getNumOperands() - 1; I >= *FirstAnnotationOp; --I) { + const int64_t ImmValue = Inst.getOperand(I).getImm(); if (extractAnnotationIndex(ImmValue) == Index) { - AnnotationInst->erase(AnnotationInst->begin() + I); + Inst.erase(Inst.begin() + I); return true; } } return false; } -void MCPlusBuilder::stripAnnotations(MCInst &Inst, bool KeepTC) { - MCInst *AnnotationInst = getAnnotationInst(Inst); - if (!AnnotationInst) - return; - // Preserve TailCall annotation. 
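All of these accessors reduce to reading and writing packed annotation operands carried on the instruction itself. A rough standalone sketch, assuming an 8-bit annotation index packed into the top byte of a 64-bit immediate with the value below it (an assumption about the spirit of the encoding; the real layout lives in MCPlus.h), including the back-to-front scan that removeAnnotation uses:

#include <cassert>
#include <cstdint>
#include <vector>

// Assumed encoding: 8-bit annotation index in the top byte, 56-bit value.
constexpr int64_t pack(unsigned Index, int64_t Value) {
  return int64_t((uint64_t(Index) << 56) |
                 (uint64_t(Value) & 0xffffffffffffffULL));
}
constexpr unsigned extractIndex(int64_t Imm) {
  return unsigned(uint64_t(Imm) >> 56);
}
constexpr int64_t extractValue(int64_t Imm) {
  return int64_t(uint64_t(Imm) & 0xffffffffffffffULL);
}

int main() {
  // Trailing "operands" of an instruction: two annotations.
  std::vector<int64_t> Ops = {pack(3, 42), pack(7, 1)};
  // removeAnnotation-style scan, back to front, for index 3.
  for (int I = int(Ops.size()) - 1; I >= 0; --I)
    if (extractIndex(Ops[I]) == 3)
      Ops.erase(Ops.begin() + I);
  assert(Ops.size() == 1 && extractIndex(Ops[0]) == 7 &&
         extractValue(Ops[0]) == 1);
  return 0;
}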
- auto IsTC = hasAnnotation(Inst, MCAnnotation::kTailCall); +void MCPlusBuilder::stripAnnotations(MCInst &Inst, bool KeepTC) const { + KeepTC &= hasAnnotation(Inst, MCAnnotation::kTailCall); - removeAnnotationInst(Inst); + removeAnnotations(Inst); - if (KeepTC && IsTC) + if (KeepTC) setTailCall(Inst); } void MCPlusBuilder::printAnnotations(const MCInst &Inst, raw_ostream &OS) const { - const MCInst *AnnotationInst = getAnnotationInst(Inst); - if (!AnnotationInst) + std::optional FirstAnnotationOp = getFirstAnnotationOpIndex(Inst); + if (!FirstAnnotationOp) return; - for (unsigned I = 0; I < AnnotationInst->getNumOperands(); ++I) { - const int64_t Imm = AnnotationInst->getOperand(I).getImm(); + for (unsigned I = *FirstAnnotationOp; I < Inst.getNumOperands(); ++I) { + const int64_t Imm = Inst.getOperand(I).getImm(); const unsigned Index = extractAnnotationIndex(Imm); const int64_t Value = extractAnnotationValue(Imm); const auto *Annotation = reinterpret_cast(Value); diff --git a/bolt/lib/Core/Relocation.cpp b/bolt/lib/Core/Relocation.cpp index e985d6da82c197781d4fba7707f7766caaede793..70fcc6953ed71bc0fe0f8f4bf899c72844cf7200 100644 --- a/bolt/lib/Core/Relocation.cpp +++ b/bolt/lib/Core/Relocation.cpp @@ -101,6 +101,7 @@ static bool isSupportedRISCV(uint64_t Type) { case ELF::R_RISCV_GOT_HI20: case ELF::R_RISCV_PCREL_HI20: case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: case ELF::R_RISCV_RVC_JUMP: case ELF::R_RISCV_RVC_BRANCH: case ELF::R_RISCV_ADD32: @@ -195,6 +196,7 @@ static size_t getSizeForTypeRISCV(uint64_t Type) { case ELF::R_RISCV_BRANCH: case ELF::R_RISCV_PCREL_HI20: case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: case ELF::R_RISCV_32_PCREL: case ELF::R_RISCV_CALL: case ELF::R_RISCV_CALL_PLT: @@ -338,13 +340,22 @@ static uint64_t encodeValueAArch64(uint64_t Type, uint64_t Value, uint64_t PC) { switch (Type) { default: llvm_unreachable("unsupported relocation"); + case ELF::R_AARCH64_ABS16: case ELF::R_AARCH64_ABS32: + case ELF::R_AARCH64_ABS64: break; case ELF::R_AARCH64_PREL16: case ELF::R_AARCH64_PREL32: case ELF::R_AARCH64_PREL64: Value -= PC; break; + case ELF::R_AARCH64_CALL26: + Value -= PC; + assert(isInt<28>(Value) && "only PC +/- 128MB is allowed for direct call"); + // Immediate goes in bits 25:0 of BL. + // OP 1001_01 goes in bits 31:26 of BL. 
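The R_AARCH64_CALL26 encoding this comment describes (the Value computation follows just below) can be checked in isolation: the PC-relative offset must fit a signed 28-bit, word-aligned range, and the low 26 bits of the word offset are merged with the BL opcode. A self-contained encode/decode round-trip using the constants from the patch:

#include <cassert>
#include <cstdint>

uint32_t encodeBL(uint64_t Target, uint64_t PC) {
  const int64_t Delta = int64_t(Target - PC);
  // Mirrors isInt<28>: direct calls reach PC +/- 128MB only.
  assert(Delta >= -(1LL << 27) && Delta < (1LL << 27) && "out of BL range");
  // Immediate goes in bits 25:0; opcode 1001_01 in bits 31:26.
  return uint32_t(((Delta >> 2) & 0x3ffffff) | 0x94000000u);
}

int64_t decodeBLOffset(uint32_t Insn) {
  int64_t Imm = Insn & 0x3ffffff;
  if (Imm & (1 << 25)) // sign-extend imm26
    Imm -= (1LL << 26);
  return Imm << 2;
}

int main() {
  const uint64_t PC = 0x400000, Target = 0x401234;
  assert(decodeBLOffset(encodeBL(Target, PC)) == int64_t(Target - PC));
  return 0;
}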
+ Value = ((Value >> 2) & 0x3ffffff) | 0x94000000ULL; + break; } return Value; } @@ -473,6 +484,10 @@ static uint64_t extractIImmRISCV(uint32_t Contents) { return SignExtend64<12>(Contents >> 20); } +static uint64_t extractSImmRISCV(uint32_t Contents) { + return SignExtend64<12>(((Contents >> 7) & 0x1f) | ((Contents >> 25) << 5)); +} + static uint64_t extractJImmRISCV(uint32_t Contents) { return SignExtend64<21>( (((Contents >> 21) & 0x3ff) << 1) | (((Contents >> 20) & 0x1) << 11) | @@ -509,6 +524,8 @@ static uint64_t extractValueRISCV(uint64_t Type, uint64_t Contents, return extractUImmRISCV(Contents); case ELF::R_RISCV_PCREL_LO12_I: return extractIImmRISCV(Contents); + case ELF::R_RISCV_PCREL_LO12_S: + return extractSImmRISCV(Contents); case ELF::R_RISCV_RVC_JUMP: return SignExtend64<11>(Contents >> 2); case ELF::R_RISCV_RVC_BRANCH: @@ -685,6 +702,7 @@ static bool isPCRelativeRISCV(uint64_t Type) { case ELF::R_RISCV_GOT_HI20: case ELF::R_RISCV_PCREL_HI20: case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: case ELF::R_RISCV_RVC_JUMP: case ELF::R_RISCV_RVC_BRANCH: case ELF::R_RISCV_32_PCREL: @@ -781,6 +799,19 @@ bool Relocation::isTLS(uint64_t Type) { return isTLSX86(Type); } +bool Relocation::isInstructionReference(uint64_t Type) { + if (Arch != Triple::riscv64) + return false; + + switch (Type) { + default: + return false; + case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: + return true; + } +} + uint64_t Relocation::getNone() { if (Arch == Triple::aarch64) return ELF::R_AARCH64_NONE; diff --git a/bolt/lib/Passes/ADRRelaxationPass.cpp b/bolt/lib/Passes/ADRRelaxationPass.cpp index 76924d96fcf9bc5795cef91326a8b904f5a8545f..27a1377adef1641848188f31b55b47ac5f143764 100644 --- a/bolt/lib/Passes/ADRRelaxationPass.cpp +++ b/bolt/lib/Passes/ADRRelaxationPass.cpp @@ -29,7 +29,16 @@ static cl::opt namespace llvm { namespace bolt { +// We don't exit directly from runOnFunction since it would call ThreadPool +// destructor which might result in internal assert if we're not finished +// creating async jobs on the moment of exit. So we're finishing all parallel +// jobs and checking the exit flag after it. +static bool PassFailed = false; + void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { + if (PassFailed) + return; + BinaryContext &BC = BF.getBinaryContext(); for (BinaryBasicBlock &BB : BF) { for (auto It = BB.begin(); It != BB.end(); ++It) { @@ -47,28 +56,41 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { continue; } - BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol); - if (TargetBF && TargetBF == &BF) - continue; + // Don't relax adr if it points to the same function and it is not split + // and BF initial size is < 1MB. 
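The 1MB threshold in this comment comes from the reach of ADR itself: its 21-bit signed immediate spans PC +/- 1MB, so an adr targeting its own unsplit, sub-1MB function can never go out of range and needs no ADRP+ADD expansion. A sketch of the underlying reachability test (a simplification of what the pass decides per instruction):

#include <cassert>
#include <cstdint>

// ADR materializes PC +/- 1MB directly; beyond that, relaxation rewrites
// it as ADRP + ADD, which covers a +/- 4GB page-relative range.
bool adrReaches(uint64_t PC, uint64_t Target) {
  const int64_t Delta = int64_t(Target) - int64_t(PC);
  return Delta >= -(1LL << 20) && Delta < (1LL << 20);
}

int main() {
  assert(adrReaches(0x400000, 0x4FFFFF));  // under 1MB away: keep adr
  assert(!adrReaches(0x400000, 0x600000)); // 2MB away: needs ADRP+ADD
  return 0;
}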
+ const unsigned OneMB = 0x100000; + if (!BF.isSplit() && BF.getSize() < OneMB) { + BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol); + if (TargetBF && TargetBF == &BF) + continue; + } MCPhysReg Reg; BC.MIB->getADRReg(Inst, Reg); int64_t Addend = BC.MIB->getTargetAddend(Inst); - InstructionListType Addr = - BC.MIB->materializeAddress(Symbol, BC.Ctx.get(), Reg, Addend); + InstructionListType Addr; + + { + auto L = BC.scopeLock(); + Addr = BC.MIB->materializeAddress(Symbol, BC.Ctx.get(), Reg, Addend); + } if (It != BB.begin() && BC.MIB->isNoop(*std::prev(It))) { It = BB.eraseInstruction(std::prev(It)); - } else if (opts::StrictMode && !BF.isSimple()) { + } else if (std::next(It) != BB.end() && BC.MIB->isNoop(*std::next(It))) { + BB.eraseInstruction(std::next(It)); + } else if (!opts::StrictMode && !BF.isSimple()) { // If the function is not simple, it may contain a jump table undetected // by us. This jump table may use an offset from the branch instruction // to land in the desired place. If we add new instructions, we // invalidate this offset, so we have to rely on linker-inserted NOP to // replace it with ADRP, and abort if it is not present. + auto L = BC.scopeLock(); errs() << formatv("BOLT-ERROR: Cannot relax adr in non-simple function " - "{0}. Can't proceed in current mode.\n", + "{0}. Use --strict option to override\n", BF.getOneName()); - exit(1); + PassFailed = true; + return; } It = BB.replaceInstruction(It, Addr); } @@ -85,7 +107,10 @@ void ADRRelaxationPass::runOnFunctions(BinaryContext &BC) { ParallelUtilities::runOnEachFunction( BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, nullptr, - "ADRRelaxationPass", /* ForceSequential */ true); + "ADRRelaxationPass"); + + if (PassFailed) + exit(1); } } // end namespace bolt diff --git a/bolt/lib/Passes/Aligner.cpp b/bolt/lib/Passes/Aligner.cpp index ef419bb6baaa21068ab2aedd4f0537df010e07df..7c387525434bd39bc847ebe15349784435a6ea4f 100644 --- a/bolt/lib/Passes/Aligner.cpp +++ b/bolt/lib/Passes/Aligner.cpp @@ -163,20 +163,6 @@ void AlignerPass::runOnFunctions(BinaryContext &BC) { else alignMaxBytes(BF); - // Align objects that contains constant islands and no code - // to at least 8 bytes. 
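The PassFailed flag introduced for ADRRelaxationPass above is the deferred-failure pattern for thread pools: a worker must not call exit() while sibling jobs may still be scheduled, so it records the failure and the driver acts on it once the pool drains. A standalone sketch (using std::atomic and plain std::thread for safety; the pass itself relies on the pool's own sequencing):

#include <atomic>
#include <iostream>
#include <thread>
#include <vector>

std::atomic<bool> PassFailed{false};

void runOnFunction(int Id) {
  if (PassFailed)
    return; // stop doing useful work once any worker has failed
  if (Id == 3)
    PassFailed = true; // report the error; never exit() mid-pool
}

int main() {
  std::vector<std::thread> Workers;
  for (int Id = 0; Id < 8; ++Id)
    Workers.emplace_back(runOnFunction, Id);
  for (std::thread &T : Workers)
    T.join(); // all parallel jobs finished; now it is safe to bail
  if (PassFailed)
    return 1;
  std::cout << "pass succeeded\n";
  return 0;
}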
- if (!BF.size() && BF.hasIslandsInfo()) { - const uint16_t Alignment = BF.getConstantIslandAlignment(); - if (BF.getAlignment() < Alignment) - BF.setAlignment(Alignment); - - if (BF.getMaxAlignmentBytes() < Alignment) - BF.setMaxAlignmentBytes(Alignment); - - if (BF.getMaxColdAlignmentBytes() < Alignment) - BF.setMaxColdAlignmentBytes(Alignment); - } - if (opts::AlignBlocks && !opts::PreserveBlocksAlignment) alignBlocks(BF, Emitter.MCE.get()); }; diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index a674fb4fef76a352c386ceace5733446dad9f2b7..4e1343e2c30be56dc2d9c719790a276b3b38469b 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -317,38 +317,46 @@ void NormalizeCFG::runOnFunctions(BinaryContext &BC) { } void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) { - if (!Function.getLayout().block_empty()) { - unsigned Count; - uint64_t Bytes; - Function.markUnreachableBlocks(); - LLVM_DEBUG({ - for (BinaryBasicBlock &BB : Function) { - if (!BB.isValid()) { - dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName() - << " in function " << Function << "\n"; - Function.dump(); - } + BinaryContext &BC = Function.getBinaryContext(); + unsigned Count; + uint64_t Bytes; + Function.markUnreachableBlocks(); + LLVM_DEBUG({ + for (BinaryBasicBlock &BB : Function) { + if (!BB.isValid()) { + dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName() + << " in function " << Function << "\n"; + Function.dump(); } - }); - std::tie(Count, Bytes) = Function.eraseInvalidBBs(); - DeletedBlocks += Count; - DeletedBytes += Bytes; - if (Count) { - Modified.insert(&Function); - if (opts::Verbosity > 0) - outs() << "BOLT-INFO: removed " << Count - << " dead basic block(s) accounting for " << Bytes - << " bytes in function " << Function << '\n'; } + }); + BinaryContext::IndependentCodeEmitter Emitter = + BC.createIndependentMCCodeEmitter(); + std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get()); + DeletedBlocks += Count; + DeletedBytes += Bytes; + if (Count) { + auto L = BC.scopeLock(); + Modified.insert(&Function); + if (opts::Verbosity > 0) + outs() << "BOLT-INFO: removed " << Count + << " dead basic block(s) accounting for " << Bytes + << " bytes in function " << Function << '\n'; } } void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) { - for (auto &It : BC.getBinaryFunctions()) { - BinaryFunction &Function = It.second; - if (shouldOptimize(Function)) - runOnFunction(Function); - } + ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { + runOnFunction(BF); + }; + + ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { + return !shouldOptimize(BF) || BF.getLayout().block_empty(); + }; + + ParallelUtilities::runOnEachFunction( + BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun, + SkipPredicate, "eliminate-unreachable"); if (DeletedBlocks) outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and " @@ -574,57 +582,50 @@ bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const { } void LowerAnnotations::runOnFunctions(BinaryContext &BC) { - std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations; - - for (auto &It : BC.getBinaryFunctions()) { - BinaryFunction &BF = It.second; - - for (FunctionFragment &FF : BF.getLayout().fragments()) { + for (BinaryFunction *BF : BC.getAllBinaryFunctions()) { + for (FunctionFragment &FF : BF->getLayout().fragments()) { + // Reset at the start of the new fragment. 
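EliminateUnreachableBlocks above is a classic mark-and-sweep over the CFG: markUnreachableBlocks() walks forward from the entry points, and eraseInvalidBBs() sweeps whatever the walk never reached. A minimal standalone sketch with an adjacency-list CFG:

#include <cassert>
#include <vector>

int main() {
  // Successor lists: block 0 -> 1, 1 -> 3; block 2 is unreachable.
  std::vector<std::vector<unsigned>> Succs = {{1}, {3}, {3}, {}};
  std::vector<bool> Reachable(Succs.size(), false);
  std::vector<unsigned> Stack = {0}; // entry blocks seed the walk
  while (!Stack.empty()) {
    const unsigned BB = Stack.back();
    Stack.pop_back();
    if (Reachable[BB])
      continue;
    Reachable[BB] = true;
    for (unsigned Succ : Succs[BB])
      Stack.push_back(Succ);
  }
  unsigned Erased = 0;
  for (bool R : Reachable)
    Erased += !R;
  assert(Erased == 1); // only block 2 would be erased
  return 0;
}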
int64_t CurrentGnuArgsSize = 0; for (BinaryBasicBlock *const BB : FF) { - // First convert GnuArgsSize annotations into CFIs. This may change - // instr pointers, so do it before recording ptrs for preserved - // annotations - if (BF.usesGnuArgsSize()) { - for (auto II = BB->begin(); II != BB->end(); ++II) { - if (!BC.MIB->isInvoke(*II)) - continue; + for (auto II = BB->begin(); II != BB->end(); ++II) { + + // Convert GnuArgsSize annotations into CFIs. + if (BF->usesGnuArgsSize() && BC.MIB->isInvoke(*II)) { const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II); assert(NewGnuArgsSize >= 0 && - "expected non-negative GNU_args_size"); + "Expected non-negative GNU_args_size."); if (NewGnuArgsSize != CurrentGnuArgsSize) { - auto InsertII = BF.addCFIInstruction( + auto InsertII = BF->addCFIInstruction( BB, II, MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize)); CurrentGnuArgsSize = NewGnuArgsSize; II = std::next(InsertII); } } - } - // Now record preserved annotations separately and then strip - // annotations. - for (auto II = BB->begin(); II != BB->end(); ++II) { - if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II)) - PreservedOffsetAnnotations.emplace_back(&(*II), - *BC.MIB->getOffset(*II)); + // Preserve selected annotations and strip the rest. + std::optional Offset = BF->requiresAddressTranslation() + ? BC.MIB->getOffset(*II) + : std::nullopt; + std::optional Size = BC.MIB->getSize(*II); + MCSymbol *Label = BC.MIB->getLabel(*II); + BC.MIB->stripAnnotations(*II); + + if (Offset) + BC.MIB->setOffset(*II, *Offset); + if (Size) + BC.MIB->setSize(*II, *Size); + if (Label) + BC.MIB->setLabel(*II, Label); } } } } - for (BinaryFunction *BF : BC.getInjectedBinaryFunctions()) - for (BinaryBasicBlock &BB : *BF) - for (MCInst &Instruction : BB) - BC.MIB->stripAnnotations(Instruction); // Release all memory taken by annotations BC.MIB->freeAnnotations(); - - // Reinsert preserved annotations we need during code emission. - for (const std::pair &Item : PreservedOffsetAnnotations) - BC.MIB->setOffset(*Item.first, Item.second); } // Check for dirty state in MCSymbol objects that might be a consequence @@ -1454,6 +1455,14 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) { 100.0 * NumInferredFunctions / NumAllStaleFunctions, 100.0 * InferredSampleCount / TotalSampleCount, InferredSampleCount, TotalSampleCount); + outs() << format( + "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks" + " (%zu out of %zu stale) responsible for %.2f%% samples" + " (%zu out of %zu stale)\n", + 100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks, + BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks, + 100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount, + BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount); } if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { @@ -1562,10 +1571,11 @@ void PrintProgramStats::runOnFunctions(BinaryContext &BC) { } // Print information on missed macro-fusion opportunities seen on input. 
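The per-instruction loop above replaces the old PreservedOffsetAnnotations side table: the annotations worth keeping (Offset, Size, Label) are read off the instruction, everything is stripped, and the survivors are written straight back. A toy version of that save/strip/restore shape, with a string-keyed map standing in for MCPlus annotations:

#include <cassert>
#include <cstdint>
#include <map>
#include <optional>
#include <string>

struct Inst {
  std::map<std::string, uint32_t> Annotations;
};

int main() {
  Inst I;
  I.Annotations = {{"Offset", 0x40}, {"Size", 5}, {"NOP", 1}};
  // Save the annotations that must survive lowering.
  std::optional<uint32_t> Offset, Size;
  if (auto It = I.Annotations.find("Offset"); It != I.Annotations.end())
    Offset = It->second;
  if (auto It = I.Annotations.find("Size"); It != I.Annotations.end())
    Size = It->second;
  I.Annotations.clear(); // stripAnnotations
  // Reinsert the preserved ones in place.
  if (Offset)
    I.Annotations["Offset"] = *Offset;
  if (Size)
    I.Annotations["Size"] = *Size;
  assert(I.Annotations.size() == 2 && !I.Annotations.count("NOP"));
  return 0;
}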
- if (BC.MissedMacroFusionPairs) { - outs() << "BOLT-INFO: the input contains " << BC.MissedMacroFusionPairs - << " (dynamic count : " << BC.MissedMacroFusionExecCount - << ") opportunities for macro-fusion optimization"; + if (BC.Stats.MissedMacroFusionPairs) { + outs() << format("BOLT-INFO: the input contains %zu (dynamic count : %zu)" + " opportunities for macro-fusion optimization", + BC.Stats.MissedMacroFusionPairs, + BC.Stats.MissedMacroFusionExecCount); switch (opts::AlignMacroOpFusion) { case MFT_NONE: outs() << ". Use -align-macro-fusion to fix.\n"; diff --git a/bolt/lib/Passes/IndirectCallPromotion.cpp b/bolt/lib/Passes/IndirectCallPromotion.cpp index ea8019431cf52f96b6febe92c24488d474b0c94e..89727233ec78b05f35bb79a4361b64507d232262 100644 --- a/bolt/lib/Passes/IndirectCallPromotion.cpp +++ b/bolt/lib/Passes/IndirectCallPromotion.cpp @@ -754,6 +754,15 @@ IndirectCallPromotion::rewriteCall( const bool IsTailCallOrJT = (MIB->isTailCall(CallInst) || Function.getJumpTable(CallInst)); + // If we are tracking the indirect call/jump address, propagate the address to + // the ICP code. + const std::optional IndirectInstrOffset = MIB->getOffset(CallInst); + if (IndirectInstrOffset) { + for (auto &[Symbol, Instructions] : ICPcode) + for (MCInst &Inst : Instructions) + MIB->setOffset(Inst, *IndirectInstrOffset); + } + // Move instructions from the tail of the original call block // to the merge block. @@ -767,10 +776,12 @@ IndirectCallPromotion::rewriteCall( TailInsts.push_back(*++TailInst); InstructionListType MovedInst = IndCallBlock.splitInstructions(&CallInst); - // Link new BBs to the original input offset of the BB where the indirect - // call site is, so we can map samples recorded in new BBs back to the - // original BB seen in the input binary (if using BAT) - const uint32_t OrigOffset = IndCallBlock.getInputOffset(); + // Link new BBs to the original input offset of the indirect call site or its + // containing BB, so we can map samples recorded in new BBs back to the + // original BB seen in the input binary (if using BAT). + const uint32_t OrigOffset = IndirectInstrOffset + ? *IndirectInstrOffset + : IndCallBlock.getInputOffset(); IndCallBlock.eraseInstructions(MethodFetchInsns.begin(), MethodFetchInsns.end()); diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp index 98044599d497e71bf35dbf9ed1ce784a3b5271b8..72adb319d71dc0e437981a31c9ba682523f9b065 100644 --- a/bolt/lib/Passes/Instrumentation.cpp +++ b/bolt/lib/Passes/Instrumentation.cpp @@ -13,6 +13,7 @@ #include "bolt/Passes/Instrumentation.h" #include "bolt/Core/ParallelUtilities.h" #include "bolt/RuntimeLibs/InstrumentationRuntimeLibrary.h" +#include "bolt/Utils/CommandLineOpts.h" #include "bolt/Utils/Utils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/RWMutex.h" @@ -85,6 +86,24 @@ cl::opt InstrumentCalls("instrument-calls", namespace llvm { namespace bolt { +static bool hasAArch64ExclusiveMemop(BinaryFunction &Function) { + // FIXME ARMv8-a architecture reference manual says that software must avoid + // having any explicit memory accesses between exclusive load and associated + // store instruction. So for now skip instrumentation for functions that have + // these instructions, since it might lead to runtime deadlock. 
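A standalone sketch of the exclusive-memop guard described in the FIXME above, with an illustrative (not exhaustive) mnemonic set standing in for MIB->isAArch64Exclusive():

#include <iostream>
#include <set>
#include <string>
#include <vector>

// Refuse to instrument a function containing exclusive loads/stores:
// counter updates inserted between LDXR and STXR are explicit memory
// accesses that can make the store-exclusive fail forever.
bool hasExclusiveMemop(const std::vector<std::string> &Mnemonics) {
  static const std::set<std::string> Exclusive = {"ldxr", "ldaxr", "stxr",
                                                  "stlxr"};
  for (const std::string &M : Mnemonics)
    if (Exclusive.count(M))
      return true;
  return false;
}

int main() {
  const std::vector<std::string> Fn = {"ldaxr", "cmp", "stlxr"};
  if (hasExclusiveMemop(Fn))
    std::cout << "skip instrumentation\n";
  return 0;
}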
+ BinaryContext &BC = Function.getBinaryContext(); + for (const BinaryBasicBlock &BB : Function) + for (const MCInst &Inst : BB) + if (BC.MIB->isAArch64Exclusive(Inst)) { + if (opts::Verbosity >= 1) + outs() << "BOLT-INSTRUMENTER: Function " << Function + << " has exclusive instructions, skip instrumentation\n"; + return true; + } + + return false; +} + uint32_t Instrumentation::getFunctionNameIndex(const BinaryFunction &Function) { auto Iter = FuncToStringIdx.find(&Function); if (Iter != FuncToStringIdx.end()) @@ -288,6 +307,9 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function, if (BC.isMachO() && Function.hasName("___GLOBAL_init_65535/1")) return; + if (BC.isAArch64() && hasAArch64ExclusiveMemop(Function)) + return; + SplitWorklistTy SplitWorklist; SplitInstrsTy SplitInstrs; diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp index 6f4d1170dbe2a4aa52bed58f14062f5f998943f2..a81689bc37469a43877afe948a07ed6049fc2b12 100644 --- a/bolt/lib/Passes/LongJmp.cpp +++ b/bolt/lib/Passes/LongJmp.cpp @@ -293,7 +293,7 @@ uint64_t LongJmpPass::tentativeLayoutRelocColdPart( for (BinaryFunction *Func : SortedFunctions) { if (!Func->isSplit()) continue; - DotAddress = alignTo(DotAddress, BinaryFunction::MinAlign); + DotAddress = alignTo(DotAddress, Func->getMinAlignment()); uint64_t Pad = offsetToAlignment(DotAddress, llvm::Align(Func->getAlignment())); if (Pad <= Func->getMaxColdAlignmentBytes()) @@ -352,7 +352,7 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode( DotAddress = alignTo(DotAddress, opts::AlignText); } - DotAddress = alignTo(DotAddress, BinaryFunction::MinAlign); + DotAddress = alignTo(DotAddress, Func->getMinAlignment()); uint64_t Pad = offsetToAlignment(DotAddress, llvm::Align(Func->getAlignment())); if (Pad <= Func->getMaxAlignmentBytes()) diff --git a/bolt/lib/Passes/MCF.cpp b/bolt/lib/Passes/MCF.cpp index ec040120a919fb3658c14a0179d967ca4d4f7d6e..c3898d2dce989efdd7f3f77149b5417578ad678a 100644 --- a/bolt/lib/Passes/MCF.cpp +++ b/bolt/lib/Passes/MCF.cpp @@ -262,6 +262,7 @@ bool guessPredEdgeCounts(BinaryBasicBlock *BB, ArcSet &GuessedArcs) { continue; Pred->getBranchInfo(*BB).Count = Guessed; + GuessedArcs.insert(std::make_pair(Pred, BB)); return true; } llvm_unreachable("Expected unguessed arc"); diff --git a/bolt/lib/Passes/RegReAssign.cpp b/bolt/lib/Passes/RegReAssign.cpp index 19e1a84c48d1b5f16875f1968e17b5247f430b49..8b9dc9c1fdd506c89e9cb97b2ca1612687d6e407 100644 --- a/bolt/lib/Passes/RegReAssign.cpp +++ b/bolt/lib/Passes/RegReAssign.cpp @@ -140,7 +140,7 @@ void RegReAssign::rankRegisters(BinaryFunction &Function) { std::fill(RegScore.begin(), RegScore.end(), 0); std::fill(RankedRegs.begin(), RankedRegs.end(), 0); - for (BinaryBasicBlock &BB : Function) { + auto countRegScore = [&](BinaryBasicBlock &BB) { for (MCInst &Inst : BB) { const bool CannotUseREX = BC.MIB->cannotUseREX(Inst); const MCInstrDesc &Desc = BC.MII->get(Inst.getOpcode()); @@ -175,9 +175,25 @@ void RegReAssign::rankRegisters(BinaryFunction &Function) { continue; // Disallow substituitions involving regs in instrs that cannot use REX + // The relationship of X86 registers is shown in the diagram. BL and BH + // do not have a direct alias relationship. However, if the BH register + // cannot be swapped, then the BX/EBX/RBX registers cannot be swapped as + // well, which means that BL register also cannot be swapped. Therefore, + // in the presence of BX/EBX/RBX registers, BL and BH have an alias + // relationship. 
+ // ┌─────────────────┐ + // │ RBX │ + // ├─────┬───────────┤ + // │ │ EBX │ + // ├─────┴──┬────────┤ + // │ │ BX │ + // ├────────┼───┬────┤ + // │ │BH │BL │ + // └────────┴───┴────┘ if (CannotUseREX) { RegScore[RegEC] = std::numeric_limits::min(); + RegScore[BC.MIB->getAliasSized(Reg, 1)] = RegScore[RegEC]; continue; } @@ -185,13 +201,22 @@ void RegReAssign::rankRegisters(BinaryFunction &Function) { if (BC.MIB->isUpper8BitReg(Reg) && ClassicCSR.test(Reg)) { RegScore[RegEC] = std::numeric_limits::min(); + RegScore[BC.MIB->getAliasSized(Reg, 1)] = RegScore[RegEC]; continue; } RegScore[RegEC] += BB.getKnownExecutionCount(); } } + }; + for (BinaryBasicBlock &BB : Function) + countRegScore(BB); + + for (BinaryFunction *ChildFrag : Function.getFragments()) { + for (BinaryBasicBlock &BB : *ChildFrag) + countRegScore(BB); } + std::iota(RankedRegs.begin(), RankedRegs.end(), 0); // 0, 1, 2, 3... llvm::sort(RankedRegs, [&](size_t A, size_t B) { return RegScore[A] > RegScore[B]; }); @@ -213,6 +238,17 @@ void RegReAssign::aggressivePassOverFunction(BinaryFunction &Function) { BinaryContext &BC = Function.getBinaryContext(); rankRegisters(Function); + // If there is a situation where function: + // A() -> A.cold() + // A.localalias() -> A.cold() + // simply swapping these two calls can cause issues. + for (BinaryFunction *ChildFrag : Function.getFragments()) { + if (ChildFrag->getParentFragments()->size() > 1) + return; + if (ChildFrag->empty()) + return; + } + // Bail early if our registers are all black listed, before running expensive // analysis passes bool Bail = true; @@ -304,6 +340,10 @@ void RegReAssign::aggressivePassOverFunction(BinaryFunction &Function) { << " with " << BC.MRI->getName(ExtReg) << "\n\n"); swap(Function, ClassicReg, ExtReg); FuncsChanged.insert(&Function); + for (BinaryFunction *ChildFrag : Function.getFragments()) { + swap(*ChildFrag, ClassicReg, ExtReg); + FuncsChanged.insert(ChildFrag); + } ++Begin; if (Begin == End) break; @@ -315,6 +355,13 @@ bool RegReAssign::conservativePassOverFunction(BinaryFunction &Function) { BinaryContext &BC = Function.getBinaryContext(); rankRegisters(Function); + for (BinaryFunction *ChildFrag : Function.getFragments()) { + if (ChildFrag->getParentFragments()->size() > 1) + return false; + if (ChildFrag->empty()) + return false; + } + // Try swapping R12, R13, R14 or R15 with RBX (we work with all callee-saved // regs except RBP) MCPhysReg Candidate = 0; @@ -340,11 +387,24 @@ bool RegReAssign::conservativePassOverFunction(BinaryFunction &Function) { if (!RBX) return false; + // The high 8 bits of the register will never be swapped. To prevent the high + // 8 bits from being swapped incorrectly, we should switched to swapping the + // low 8 bits of the register instead. 
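The alias propagation shown in the diagram reduces to a paired score update: poisoning one 8-bit sub-register also poisons its one-byte sibling, because any swap of the containing BX/EBX/RBX moves both halves at once. A toy model with hypothetical register names and score tables:

#include <cassert>
#include <cstdint>
#include <limits>
#include <map>
#include <string>

int main() {
  std::map<std::string, int64_t> RegScore = {{"BH", 100}, {"BL", 80}};
  const std::map<std::string, std::string> AliasSized1 = {{"BH", "BL"},
                                                          {"BL", "BH"}};
  // E.g. BH is used where REX prefixes are forbidden, so it cannot move.
  const std::string Blocked = "BH";
  RegScore[Blocked] = std::numeric_limits<int64_t>::min();
  // Propagate the blacklist to the 1-byte sibling, as getAliasSized(Reg, 1)
  // does above: if BH is pinned, BL is pinned too.
  RegScore[AliasSized1.at(Blocked)] = RegScore[Blocked];
  assert(RegScore["BL"] == std::numeric_limits<int64_t>::min());
  return 0;
}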
+ if (BC.MIB->isUpper8BitReg(RBX)) { + RBX = BC.MIB->getAliasSized(RBX, 1); + if (RegScore[RBX] < 0 || RegScore[RBX] > RegScore[Candidate]) + return false; + } + LLVM_DEBUG(dbgs() << "\n ** Swapping " << BC.MRI->getName(RBX) << " with " << BC.MRI->getName(Candidate) << "\n\n"); (void)BC; swap(Function, RBX, Candidate); FuncsChanged.insert(&Function); + for (BinaryFunction *ChildFrag : Function.getFragments()) { + swap(*ChildFrag, RBX, Candidate); + FuncsChanged.insert(ChildFrag); + } return true; } @@ -404,7 +464,7 @@ void RegReAssign::runOnFunctions(BinaryContext &BC) { for (auto &I : BC.getBinaryFunctions()) { BinaryFunction &Function = I.second; - if (!Function.isSimple() || Function.isIgnored()) + if (!Function.isSimple() || Function.isIgnored() || Function.isFragment()) continue; LLVM_DEBUG(dbgs() << "====================================\n"); diff --git a/bolt/lib/Passes/ReorderAlgorithm.cpp b/bolt/lib/Passes/ReorderAlgorithm.cpp index b5052cdaddb13e38fd8b8d7a3f3d5b999ad90ad9..3c3365e1d3d711321c3eda520012d5cbb64e0507 100644 --- a/bolt/lib/Passes/ReorderAlgorithm.cpp +++ b/bolt/lib/Passes/ReorderAlgorithm.cpp @@ -531,21 +531,21 @@ void ExtTSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF, } // Initialize CFG edges - using JumpT = std::pair; - std::vector> JumpCounts; + std::vector JumpCounts; for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { auto BI = BB->branch_info_begin(); for (BinaryBasicBlock *SuccBB : BB->successors()) { assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && "missing profile for a jump"); - auto It = std::make_pair(BB->getLayoutIndex(), SuccBB->getLayoutIndex()); - JumpCounts.push_back(std::make_pair(It, BI->Count)); + JumpCounts.push_back( + {BB->getLayoutIndex(), SuccBB->getLayoutIndex(), BI->Count}); ++BI; } } // Run the layout algorithm - auto Result = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts); + auto Result = + codelayout::computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts); Order.reserve(BF.getLayout().block_size()); for (uint64_t R : Result) Order.push_back(OrigOrder[R]); diff --git a/bolt/lib/Passes/ReorderData.cpp b/bolt/lib/Passes/ReorderData.cpp index 4df6ce37596d71a3132d259873745690567411a3..6e1f9b6d77512e12572ad2a6198bb261de95ad81 100644 --- a/bolt/lib/Passes/ReorderData.cpp +++ b/bolt/lib/Passes/ReorderData.cpp @@ -413,17 +413,17 @@ bool ReorderData::markUnmoveableSymbols(BinaryContext &BC, auto Range = BC.getBinaryDataForSection(Section); bool FoundUnmoveable = false; for (auto Itr = Range.begin(); Itr != Range.end(); ++Itr) { + BinaryData *Next = + std::next(Itr) != Range.end() ? std::next(Itr)->second : nullptr; if (Itr->second->getName().startswith("PG.")) { BinaryData *Prev = Itr != Range.begin() ? std::prev(Itr)->second : nullptr; - BinaryData *Next = Itr != Range.end() ? std::next(Itr)->second : nullptr; bool PrevIsPrivate = Prev && isPrivate(Prev); bool NextIsPrivate = Next && isPrivate(Next); if (isPrivate(Itr->second) && (PrevIsPrivate || NextIsPrivate)) Itr->second->setIsMoveable(false); } else { // check for overlapping symbols. - BinaryData *Next = Itr != Range.end() ? 
std::next(Itr)->second : nullptr; if (Next && Itr->second->getEndAddress() != Next->getAddress() && Next->containsAddress(Itr->second->getEndAddress())) { Itr->second->setIsMoveable(false); diff --git a/bolt/lib/Passes/ReorderFunctions.cpp b/bolt/lib/Passes/ReorderFunctions.cpp index 2fc99f652bf1c375abaa73ff5712720daa0ea10c..70f87ac40c3c149b0079b2038d077a41bbf382d5 100644 --- a/bolt/lib/Passes/ReorderFunctions.cpp +++ b/bolt/lib/Passes/ReorderFunctions.cpp @@ -15,6 +15,7 @@ #include "bolt/Utils/Utils.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/CodeLayout.h" #include #define DEBUG_TYPE "hfsort" @@ -29,82 +30,72 @@ extern cl::opt RandomSeed; extern size_t padFunction(const bolt::BinaryFunction &Function); -cl::opt -ReorderFunctions("reorder-functions", - cl::desc("reorder and cluster functions (works only with relocations)"), - cl::init(bolt::ReorderFunctions::RT_NONE), - cl::values(clEnumValN(bolt::ReorderFunctions::RT_NONE, - "none", - "do not reorder functions"), - clEnumValN(bolt::ReorderFunctions::RT_EXEC_COUNT, - "exec-count", - "order by execution count"), - clEnumValN(bolt::ReorderFunctions::RT_HFSORT, - "hfsort", - "use hfsort algorithm"), - clEnumValN(bolt::ReorderFunctions::RT_HFSORT_PLUS, - "hfsort+", - "use hfsort+ algorithm"), - clEnumValN(bolt::ReorderFunctions::RT_PETTIS_HANSEN, - "pettis-hansen", - "use Pettis-Hansen algorithm"), - clEnumValN(bolt::ReorderFunctions::RT_RANDOM, - "random", - "reorder functions randomly"), - clEnumValN(bolt::ReorderFunctions::RT_USER, - "user", - "use function order specified by -function-order")), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); +cl::opt ReorderFunctions( + "reorder-functions", + cl::desc("reorder and cluster functions (works only with relocations)"), + cl::init(bolt::ReorderFunctions::RT_NONE), + cl::values(clEnumValN(bolt::ReorderFunctions::RT_NONE, "none", + "do not reorder functions"), + clEnumValN(bolt::ReorderFunctions::RT_EXEC_COUNT, "exec-count", + "order by execution count"), + clEnumValN(bolt::ReorderFunctions::RT_HFSORT, "hfsort", + "use hfsort algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_HFSORT_PLUS, "hfsort+", + "use hfsort+ algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_CDS, "cds", + "use cache-directed sort"), + clEnumValN(bolt::ReorderFunctions::RT_PETTIS_HANSEN, + "pettis-hansen", "use Pettis-Hansen algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_RANDOM, "random", + "reorder functions randomly"), + clEnumValN(bolt::ReorderFunctions::RT_USER, "user", + "use function order specified by -function-order")), + cl::ZeroOrMore, cl::cat(BoltOptCategory)); static cl::opt ReorderFunctionsUseHotSize( "reorder-functions-use-hot-size", cl::desc("use a function's hot size when doing clustering"), cl::init(true), cl::cat(BoltOptCategory)); -static cl::opt -FunctionOrderFile("function-order", - cl::desc("file containing an ordered list of functions to use for function " - "reordering"), - cl::cat(BoltOptCategory)); +static cl::opt FunctionOrderFile( + "function-order", + cl::desc("file containing an ordered list of functions to use for function " + "reordering"), + cl::cat(BoltOptCategory)); -static cl::opt -GenerateFunctionOrderFile("generate-function-order", - cl::desc("file to dump the ordered list of functions to use for function " - "reordering"), - cl::cat(BoltOptCategory)); +static cl::opt GenerateFunctionOrderFile( + "generate-function-order", + cl::desc("file to dump the ordered list of functions to use for function " + "reordering"), + 
cl::cat(BoltOptCategory)); -static cl::opt -LinkSectionsFile("generate-link-sections", - cl::desc("generate a list of function sections in a format suitable for " - "inclusion in a linker script"), - cl::cat(BoltOptCategory)); +static cl::opt LinkSectionsFile( + "generate-link-sections", + cl::desc("generate a list of function sections in a format suitable for " + "inclusion in a linker script"), + cl::cat(BoltOptCategory)); static cl::opt UseEdgeCounts("use-edge-counts", cl::desc("use edge count data when doing clustering"), cl::init(true), cl::cat(BoltOptCategory)); -static cl::opt -CgFromPerfData("cg-from-perf-data", - cl::desc("use perf data directly when constructing the call graph" - " for stale functions"), - cl::init(true), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); +static cl::opt CgFromPerfData( + "cg-from-perf-data", + cl::desc("use perf data directly when constructing the call graph" + " for stale functions"), + cl::init(true), cl::ZeroOrMore, cl::cat(BoltOptCategory)); static cl::opt CgIgnoreRecursiveCalls( "cg-ignore-recursive-calls", cl::desc("ignore recursive calls when constructing the call graph"), cl::init(true), cl::cat(BoltOptCategory)); -static cl::opt -CgUseSplitHotSize("cg-use-split-hot-size", - cl::desc("use hot/cold data on basic blocks to determine hot sizes for " - "call graph functions"), - cl::init(false), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); +static cl::opt CgUseSplitHotSize( + "cg-use-split-hot-size", + cl::desc("use hot/cold data on basic blocks to determine hot sizes for " + "call graph functions"), + cl::init(false), cl::ZeroOrMore, cl::cat(BoltOptCategory)); } // namespace opts @@ -157,13 +148,13 @@ void ReorderFunctions::printStats(const std::vector &Clusters, bool PrintDetailed = opts::Verbosity > 1; #ifndef NDEBUG PrintDetailed |= - (DebugFlag && isCurrentDebugType("hfsort") && opts::Verbosity > 0); + (DebugFlag && isCurrentDebugType("hfsort") && opts::Verbosity > 0); #endif - uint64_t TotalSize = 0; - uint64_t CurPage = 0; - uint64_t Hotfuncs = 0; + uint64_t TotalSize = 0; + uint64_t CurPage = 0; + uint64_t Hotfuncs = 0; double TotalDistance = 0; - double TotalCalls = 0; + double TotalCalls = 0; double TotalCalls64B = 0; double TotalCalls4KB = 0; double TotalCalls2MB = 0; @@ -198,21 +189,22 @@ void ReorderFunctions::printStats(const std::vector &Clusters, << "BOLT-INFO: Src: " << *Cg.nodeIdToFunc(FuncId) << "\n" << "BOLT-INFO: Dst: " << *Cg.nodeIdToFunc(Dst) << "\n" << "BOLT-INFO: Weight = " << W << "\n" - << "BOLT-INFO: AvgOffset = " << Arc.avgCallOffset() << "\n"; + << "BOLT-INFO: AvgOffset = " << Arc.avgCallOffset() + << "\n"; Calls += W; - if (D < 64) TotalCalls64B += W; - if (D < 4096) TotalCalls4KB += W; - if (D < (2 << 20)) TotalCalls2MB += W; + if (D < 64) + TotalCalls64B += W; + if (D < 4096) + TotalCalls4KB += W; + if (D < (2 << 20)) + TotalCalls2MB += W; Dist += Arc.weight() * D; if (PrintDetailed) outs() << format("BOLT-INFO: arc: %u [@%lu+%.1lf] -> %u [@%lu]: " "weight = %.0lf, callDist = %f\n", - Arc.src(), - FuncAddr[Arc.src()], - Arc.avgCallOffset(), - Arc.dst(), - FuncAddr[Arc.dst()], - Arc.weight(), D); + Arc.src(), FuncAddr[Arc.src()], + Arc.avgCallOffset(), Arc.dst(), + FuncAddr[Arc.dst()], Arc.weight(), D); } TotalCalls += Calls; TotalDistance += Dist; @@ -290,39 +282,74 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) { switch (opts::ReorderFunctions) { case RT_NONE: break; - case RT_EXEC_COUNT: - { - std::vector SortedFunctions(BFs.size()); - uint32_t Index = 0; - 
llvm::transform(llvm::make_second_range(BFs), SortedFunctions.begin(),
-                      [](BinaryFunction &BF) { return &BF; });
-      llvm::stable_sort(SortedFunctions, [&](const BinaryFunction *A,
-                                             const BinaryFunction *B) {
-        if (A->isIgnored())
-          return false;
-        const size_t PadA = opts::padFunction(*A);
-        const size_t PadB = opts::padFunction(*B);
-        if (!PadA || !PadB) {
-          if (PadA)
-            return true;
-          if (PadB)
-            return false;
-        }
-        return !A->hasProfile() &&
-               (B->hasProfile() ||
-                (A->getExecutionCount() > B->getExecutionCount()));
-      });
-      for (BinaryFunction *BF : SortedFunctions)
-        if (BF->hasProfile())
-          BF->setIndex(Index++);
-    }
-    break;
+  case RT_EXEC_COUNT: {
+    std::vector<BinaryFunction *> SortedFunctions(BFs.size());
+    llvm::transform(llvm::make_second_range(BFs), SortedFunctions.begin(),
+                    [](BinaryFunction &BF) { return &BF; });
+    llvm::stable_sort(SortedFunctions,
+                      [&](const BinaryFunction *A, const BinaryFunction *B) {
+                        if (A->isIgnored())
+                          return false;
+                        if (B->isIgnored())
+                          return true;
+                        const size_t PadA = opts::padFunction(*A);
+                        const size_t PadB = opts::padFunction(*B);
+                        if (!PadA || !PadB) {
+                          if (PadA)
+                            return true;
+                          if (PadB)
+                            return false;
+                        }
+                        if (!A->hasProfile())
+                          return false;
+                        if (!B->hasProfile())
+                          return true;
+                        return A->getExecutionCount() > B->getExecutionCount();
+                      });
+    uint32_t Index = 0;
+    for (BinaryFunction *BF : SortedFunctions)
+      if (BF->hasProfile()) {
+        BF->setIndex(Index++);
+        LLVM_DEBUG(if (opts::Verbosity > 1) {
+          dbgs() << "BOLT-INFO: hot func " << BF->getPrintName() << " ("
+                 << BF->getExecutionCount() << ")\n";
+        });
+      }
+  } break;
   case RT_HFSORT:
     Clusters = clusterize(Cg);
     break;
   case RT_HFSORT_PLUS:
     Clusters = hfsortPlus(Cg);
     break;
+  case RT_CDS: {
+    // It is required that the sum of incoming arc weights is not greater
+    // than the number of samples for every function. Ensure the call graph
+    // obeys this property before running the algorithm.
+    Cg.adjustArcWeights();
+
+    // Initialize call graph nodes and their data.
+    std::vector<uint64_t> FuncSizes;
+    std::vector<uint64_t> FuncCounts;
+    std::vector<codelayout::EdgeCount> CallCounts;
+    std::vector<uint64_t> CallOffsets;
+    for (NodeId F = 0; F < Cg.numNodes(); ++F) {
+      FuncSizes.push_back(Cg.size(F));
+      FuncCounts.push_back(Cg.samples(F));
+      for (NodeId Succ : Cg.successors(F)) {
+        const Arc &Arc = *Cg.findArc(F, Succ);
+        CallCounts.push_back({F, Succ, uint64_t(Arc.weight())});
+        CallOffsets.push_back(uint64_t(Arc.avgCallOffset()));
+      }
+    }
+
+    // Run the layout algorithm.
+    std::vector<uint64_t> Result = codelayout::computeCacheDirectedLayout(
+        FuncSizes, FuncCounts, CallCounts, CallOffsets);
+
+    // Create a single cluster from the computed order of hot functions.
+    Clusters.emplace_back(Cluster(Result, Cg));
+  } break;
   case RT_PETTIS_HANSEN:
     Clusters = pettisAndHansen(Cg);
     break;
@@ -330,74 +357,71 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) {
     std::srand(opts::RandomSeed);
     Clusters = randomClusters(Cg);
     break;
-  case RT_USER:
-    {
-      // Build LTOCommonNameMap
-      StringMap<std::vector<uint64_t>> LTOCommonNameMap;
-      for (const BinaryFunction &BF : llvm::make_second_range(BFs))
-        for (StringRef Name : BF.getNames())
-          if (std::optional<StringRef> LTOCommonName = getLTOCommonName(Name))
-            LTOCommonNameMap[*LTOCommonName].push_back(BF.getAddress());
-
-      uint32_t Index = 0;
-      uint32_t InvalidEntries = 0;
-      for (const std::string &Function : readFunctionOrderFile()) {
-        std::vector<uint64_t> FuncAddrs;
-
-        BinaryData *BD = BC.getBinaryDataByName(Function);
-        if (!BD) {
-          // If we can't find the main symbol name, look for alternates.
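// The rewritten exec-count comparator above must be a strict weak ordering:
// ignored functions sink to the back, padded functions float to the front,
// and profiled functions come before unprofiled ones, hottest first. A
// minimal standalone sketch of the same ordering over a toy record (names
// and fields are illustrative, not BOLT's):

#include <algorithm>
#include <cstdint>
#include <vector>

struct ToyFunc {
  bool Ignored;
  bool Padded; // stand-in for opts::padFunction(BF) != 0
  bool HasProfile;
  uint64_t ExecCount;
};

static bool toyBefore(const ToyFunc &A, const ToyFunc &B) {
  if (A.Ignored)
    return false; // ignored functions never precede anything
  if (B.Ignored)
    return true;
  if (A.Padded != B.Padded)
    return A.Padded; // padded functions first
  if (!A.HasProfile)
    return false; // profiled functions precede unprofiled ones
  if (!B.HasProfile)
    return true;
  return A.ExecCount > B.ExecCount; // hottest first
}

int main() {
  std::vector<ToyFunc> Funcs = {{false, false, true, 10},
                                {true, false, true, 99},
                                {false, true, false, 0},
                                {false, false, true, 42}};
  std::stable_sort(Funcs.begin(), Funcs.end(), toyBefore);
  // Resulting order: padded, exec=42, exec=10, ignored.
  return 0;
}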
- uint32_t LocalID = 1; - while (true) { - const std::string FuncName = - Function + "/" + std::to_string(LocalID); - BD = BC.getBinaryDataByName(FuncName); - if (BD) - FuncAddrs.push_back(BD->getAddress()); - else - break; - LocalID++; - } - // Strip LTO suffixes - if (std::optional CommonName = getLTOCommonName(Function)) - if (LTOCommonNameMap.contains(*CommonName)) - llvm::append_range(FuncAddrs, LTOCommonNameMap[*CommonName]); - } else { - FuncAddrs.push_back(BD->getAddress()); + case RT_USER: { + // Build LTOCommonNameMap + StringMap> LTOCommonNameMap; + for (const BinaryFunction &BF : llvm::make_second_range(BFs)) + for (StringRef Name : BF.getNames()) + if (std::optional LTOCommonName = getLTOCommonName(Name)) + LTOCommonNameMap[*LTOCommonName].push_back(BF.getAddress()); + + uint32_t Index = 0; + uint32_t InvalidEntries = 0; + for (const std::string &Function : readFunctionOrderFile()) { + std::vector FuncAddrs; + + BinaryData *BD = BC.getBinaryDataByName(Function); + if (!BD) { + // If we can't find the main symbol name, look for alternates. + uint32_t LocalID = 1; + while (true) { + const std::string FuncName = Function + "/" + std::to_string(LocalID); + BD = BC.getBinaryDataByName(FuncName); + if (BD) + FuncAddrs.push_back(BD->getAddress()); + else + break; + LocalID++; } + // Strip LTO suffixes + if (std::optional CommonName = getLTOCommonName(Function)) + if (LTOCommonNameMap.contains(*CommonName)) + llvm::append_range(FuncAddrs, LTOCommonNameMap[*CommonName]); + } else { + FuncAddrs.push_back(BD->getAddress()); + } - if (FuncAddrs.empty()) { + if (FuncAddrs.empty()) { + if (opts::Verbosity >= 1) + errs() << "BOLT-WARNING: Reorder functions: can't find function " + << "for " << Function << "\n"; + ++InvalidEntries; + continue; + } + + for (const uint64_t FuncAddr : FuncAddrs) { + const BinaryData *FuncBD = BC.getBinaryDataAtAddress(FuncAddr); + assert(FuncBD); + + BinaryFunction *BF = BC.getFunctionForSymbol(FuncBD->getSymbol()); + if (!BF) { if (opts::Verbosity >= 1) errs() << "BOLT-WARNING: Reorder functions: can't find function " << "for " << Function << "\n"; ++InvalidEntries; - continue; - } - - for (const uint64_t FuncAddr : FuncAddrs) { - const BinaryData *FuncBD = BC.getBinaryDataAtAddress(FuncAddr); - assert(FuncBD); - - BinaryFunction *BF = BC.getFunctionForSymbol(FuncBD->getSymbol()); - if (!BF) { - if (opts::Verbosity >= 1) - errs() << "BOLT-WARNING: Reorder functions: can't find function " - << "for " << Function << "\n"; - ++InvalidEntries; - break; - } - if (!BF->hasValidIndex()) - BF->setIndex(Index++); - else if (opts::Verbosity > 0) - errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function - << "\n"; + break; } + if (!BF->hasValidIndex()) + BF->setIndex(Index++); + else if (opts::Verbosity > 0) + errs() << "BOLT-WARNING: Duplicate reorder entry for " << Function + << "\n"; } - if (InvalidEntries) - errs() << "BOLT-WARNING: Reorder functions: can't find functions for " - << InvalidEntries << " entries in -function-order list\n"; } - break; + if (InvalidEntries) + errs() << "BOLT-WARNING: Reorder functions: can't find functions for " + << InvalidEntries << " entries in -function-order list\n"; + } break; } reorder(std::move(Clusters), BFs); diff --git a/bolt/lib/Passes/ShrinkWrapping.cpp b/bolt/lib/Passes/ShrinkWrapping.cpp index cdf38e35ee87da5b3c222707e69dbdc8cf5d3406..17f169cc332b644e3a4ee99fb00c33fb873dab8e 100644 --- a/bolt/lib/Passes/ShrinkWrapping.cpp +++ b/bolt/lib/Passes/ShrinkWrapping.cpp @@ -1960,7 +1960,7 @@ bool 
ShrinkWrapping::perform(bool HotOnly) { for (const auto &Instr : *BB) { if (BC.MIB->isPseudo(Instr)) continue; - if (BC.MIB->isStore(Instr)) + if (BC.MIB->mayStore(Instr)) TotalStoreInstrs += BBExecCount; TotalInstrs += BBExecCount; } diff --git a/bolt/lib/Passes/StokeInfo.cpp b/bolt/lib/Passes/StokeInfo.cpp index cbd2c3c7a1a1255c818de812e6eff78ee0d3d1ca..57e5a08113dd0f11bee6f5d2391e50c3d2a80a2b 100644 --- a/bolt/lib/Passes/StokeInfo.cpp +++ b/bolt/lib/Passes/StokeInfo.cpp @@ -75,7 +75,7 @@ void StokeInfo::checkInstr(const BinaryFunction &BF, StokeFuncInfo &FuncInfo) { if (IsPush) FuncInfo.StackOut = true; - if (MIB->isStore(It) && !IsPush && !IsRipAddr) + if (MIB->mayStore(It) && !IsPush && !IsRipAddr) FuncInfo.HeapOut = true; if (IsRipAddr) diff --git a/bolt/lib/Passes/TailDuplication.cpp b/bolt/lib/Passes/TailDuplication.cpp index c04efd759bf3030f0b818b1d38ad42f49355d3f6..7141d5d99aa65e188ddffa72a6065b9550ee658a 100644 --- a/bolt/lib/Passes/TailDuplication.cpp +++ b/bolt/lib/Passes/TailDuplication.cpp @@ -303,7 +303,7 @@ TailDuplication::aggressiveDuplicate(BinaryBasicBlock &BB, if (isInCacheLine(BB, Tail)) return BlocksToDuplicate; - BinaryBasicBlock *CurrBB = &BB; + BinaryBasicBlock *CurrBB = &Tail; while (CurrBB) { LLVM_DEBUG(dbgs() << "Aggressive tail duplication: adding " << CurrBB->getName() << " to duplication list\n";); diff --git a/bolt/lib/Passes/ValidateInternalCalls.cpp b/bolt/lib/Passes/ValidateInternalCalls.cpp index 22dadf4f6403be3699cf0a0a2dc7521ced10e49d..516f91acb5084417e4844cd4948e525f79cefaa0 100644 --- a/bolt/lib/Passes/ValidateInternalCalls.cpp +++ b/bolt/lib/Passes/ValidateInternalCalls.cpp @@ -281,18 +281,16 @@ bool ValidateInternalCalls::analyzeFunction(BinaryFunction &Function) const { LLVM_DEBUG({ dbgs() << "Detected out-of-range PIC reference in " << Function << "\nReturn address load: "; - BC.InstPrinter->printInst(TargetInst, 0, "", *BC.STI, dbgs()); - dbgs() << "\nUse: "; - BC.InstPrinter->printInst(&Use, 0, "", *BC.STI, dbgs()); - dbgs() << "\n"; + BC.dump(*TargetInst); + dbgs() << "Use: "; + BC.dump(Use); Function.dump(); }); return false; } LLVM_DEBUG({ dbgs() << "Validated access: "; - BC.InstPrinter->printInst(&Use, 0, "", *BC.STI, dbgs()); - dbgs() << "\n"; + BC.dump(Use); }); } if (!UseDetected) { diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp index 57a850eb17234c98a86a85d26f75ff5e31c1de7b..e004309e0e21365008774d033f25b65588f3ff24 100644 --- a/bolt/lib/Profile/BoltAddressTranslation.cpp +++ b/bolt/lib/Profile/BoltAddressTranslation.cpp @@ -46,9 +46,14 @@ void BoltAddressTranslation::writeEntriesForBB(MapTy &Map, // allowing it to overwrite the previously inserted key in the map. Map[BBOutputOffset] = BBInputOffset; - for (const auto &IOPair : BB.getOffsetTranslationTable()) { - const uint64_t OutputOffset = IOPair.first + BBOutputOffset; - const uint32_t InputOffset = IOPair.second; + const auto &IOAddressMap = + BB.getFunction()->getBinaryContext().getIOAddressMap(); + + for (const auto &[InputOffset, Sym] : BB.getLocSyms()) { + const auto InputAddress = BB.getFunction()->getAddress() + InputOffset; + const auto OutputAddress = IOAddressMap.lookup(InputAddress); + assert(OutputAddress && "Unknown instruction address"); + const auto OutputOffset = *OutputAddress - FuncAddress; // Is this the first instruction in the BB? No need to duplicate the entry. 
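// With the per-block offset translation tables gone, the BAT entries above
// are derived from LocSyms plus the global input->output address map. A
// minimal sketch of that arithmetic, with std::unordered_multimap standing
// in for AddressMap and made-up addresses:

#include <cassert>
#include <cstdint>
#include <optional>
#include <unordered_map>

using Addr2AddrMapTy = std::unordered_multimap<uint64_t, uint64_t>;

std::optional<uint64_t> lookup(const Addr2AddrMapTy &Map, uint64_t InputAddr) {
  auto It = Map.find(InputAddr);
  if (It != Map.end())
    return It->second;
  return std::nullopt;
}

int main() {
  const uint64_t FuncInputAddress = 0x1000;  // function address in the input
  const uint64_t FuncOutputAddress = 0x5000; // function address after rewriting
  Addr2AddrMapTy Map;
  Map.insert({0x1010, 0x5020}); // an instruction moved by the rewriter

  const uint64_t InputOffset = 0x10; // offset recorded in LocSyms
  std::optional<uint64_t> OutputAddress =
      lookup(Map, FuncInputAddress + InputOffset);
  assert(OutputAddress && "unknown instruction address");
  const uint64_t OutputOffset = *OutputAddress - FuncOutputAddress; // 0x20
  (void)OutputOffset;
  return 0;
}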
if (OutputOffset == BBOutputOffset) diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 24dbe34b2f6a0da974392788eb8e3c34664e6038..cbc079afbb7e4f00afcde7680fd80a9acd825893 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -46,6 +46,11 @@ static cl::opt cl::desc("aggregate basic samples (without LBR info)"), cl::cat(AggregatorCategory)); +static cl::opt + ITraceAggregation("itrace", + cl::desc("Generate LBR info with perf itrace argument"), + cl::cat(AggregatorCategory)); + static cl::opt FilterMemProfile("filter-mem-profile", cl::desc("if processing a memory profile, filter out stack or heap accesses " @@ -163,16 +168,23 @@ void DataAggregator::start() { findPerfExecutable(); - if (opts::BasicAggregation) + if (opts::BasicAggregation) { launchPerfProcess("events without LBR", MainEventsPPI, "script -F pid,event,ip", /*Wait = */false); - else + } else if (!opts::ITraceAggregation.empty()) { + std::string ItracePerfScriptArgs = llvm::formatv( + "script -F pid,ip,brstack --itrace={0}", opts::ITraceAggregation); + launchPerfProcess("branch events with itrace", MainEventsPPI, + ItracePerfScriptArgs.c_str(), + /*Wait = */ false); + } else { launchPerfProcess("branch events", MainEventsPPI, "script -F pid,ip,brstack", /*Wait = */false); + } // Note: we launch script for mem events regardless of the option, as the // command fails fairly fast if mem events were not collected. @@ -1479,13 +1491,10 @@ std::error_code DataAggregator::parseBranchEvents() { NumTraces += parseLBRSample(Sample, NeedsSkylakeFix); } - for (const auto &LBR : BranchLBRs) { - const Trace &Trace = LBR.first; - if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Trace.From)) - BF->setHasProfileAvailable(); - if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Trace.To)) - BF->setHasProfileAvailable(); - } + for (const Trace &Trace : llvm::make_first_range(BranchLBRs)) + for (const uint64_t Addr : {Trace.From, Trace.To}) + if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Addr)) + BF->setHasProfileAvailable(); auto printColored = [](raw_ostream &OS, float Percent, float T1, float T2) { OS << " ("; @@ -1721,12 +1730,9 @@ std::error_code DataAggregator::parsePreAggregatedLBRSamples() { if (std::error_code EC = AggrEntry.getError()) return EC; - if (BinaryFunction *BF = - getBinaryFunctionContainingAddress(AggrEntry->From.Offset)) - BF->setHasProfileAvailable(); - if (BinaryFunction *BF = - getBinaryFunctionContainingAddress(AggrEntry->To.Offset)) - BF->setHasProfileAvailable(); + for (const uint64_t Addr : {AggrEntry->From.Offset, AggrEntry->To.Offset}) + if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Addr)) + BF->setHasProfileAvailable(); AggregatedLBRs.emplace_back(std::move(AggrEntry.get())); } diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp index 0e12e8cb307002d37d75751c59d3d40f199d2bd9..dcc7578041fae6bfff826f98ecfa7a8ec01f98bf 100644 --- a/bolt/lib/Profile/DataReader.cpp +++ b/bolt/lib/Profile/DataReader.cpp @@ -698,7 +698,8 @@ bool DataReader::recordBranch(BinaryFunction &BF, uint64_t From, uint64_t To, if (!BC.MIB->isNoop(Instr)) break; - Offset += BC.MIB->getAnnotationWithDefault(Instr, "Size"); + if (std::optional Size = BC.MIB->getSize(Instr)) + Offset += *Size; } if (To == Offset) diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index b009d57a0e6e43a3362b454acad0e8e62b245d9c..d00bf87ffc8ad871ecba876be40dc8b822a5503b 
100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -27,17 +27,18 @@ #include "bolt/Core/HashUtilities.h" #include "bolt/Profile/YAMLProfileReader.h" +#include "llvm/ADT/Bitfields.h" #include "llvm/ADT/Hashing.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/SampleProfileInference.h" #include +using namespace llvm; + #undef DEBUG_TYPE #define DEBUG_TYPE "bolt-prof" -using namespace llvm; - namespace opts { extern cl::OptionCategory BoltOptCategory; @@ -72,64 +73,39 @@ cl::opt StaleMatchingJoinIslands( cl::opt StaleMatchingCostBlockInc( "stale-matching-cost-block-inc", - cl::desc("The cost of increasing a block's count by one."), cl::init(110), + cl::desc("The cost of increasing a block count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); cl::opt StaleMatchingCostBlockDec( "stale-matching-cost-block-dec", - cl::desc("The cost of decreasing a block's count by one."), cl::init(100), + cl::desc("The cost of decreasing a block count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); -cl::opt StaleMatchingCostBlockEntryInc( - "stale-matching-cost-block-entry-inc", - cl::desc("The cost of increasing the entry block's count by one."), - cl::init(110), cl::ReallyHidden, cl::cat(BoltOptCategory)); - -cl::opt StaleMatchingCostBlockEntryDec( - "stale-matching-cost-block-entry-dec", - cl::desc("The cost of decreasing the entry block's count by one."), - cl::init(100), cl::ReallyHidden, cl::cat(BoltOptCategory)); - -cl::opt StaleMatchingCostBlockZeroInc( - "stale-matching-cost-block-zero-inc", - cl::desc("The cost of increasing a count of zero-weight block by one."), - cl::init(10), cl::Hidden, cl::cat(BoltOptCategory)); - -cl::opt StaleMatchingCostBlockUnknownInc( - "stale-matching-cost-block-unknown-inc", - cl::desc("The cost of increasing an unknown block's count by one."), - cl::init(10), cl::ReallyHidden, cl::cat(BoltOptCategory)); - cl::opt StaleMatchingCostJumpInc( "stale-matching-cost-jump-inc", - cl::desc("The cost of increasing a jump's count by one."), cl::init(100), + cl::desc("The cost of increasing a jump count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); -cl::opt StaleMatchingCostJumpFTInc( - "stale-matching-cost-jump-ft-inc", - cl::desc("The cost of increasing a fall-through jump's count by one."), - cl::init(100), cl::ReallyHidden, cl::cat(BoltOptCategory)); - cl::opt StaleMatchingCostJumpDec( "stale-matching-cost-jump-dec", - cl::desc("The cost of decreasing a jump's count by one."), cl::init(110), + cl::desc("The cost of decreasing a jump count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); -cl::opt StaleMatchingCostJumpFTDec( - "stale-matching-cost-jump-ft-dec", - cl::desc("The cost of decreasing a fall-through jump's count by one."), - cl::init(110), cl::ReallyHidden, cl::cat(BoltOptCategory)); +cl::opt StaleMatchingCostBlockUnknownInc( + "stale-matching-cost-block-unknown-inc", + cl::desc("The cost of increasing an unknown block count by one."), + cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory)); cl::opt StaleMatchingCostJumpUnknownInc( "stale-matching-cost-jump-unknown-inc", - cl::desc("The cost of increasing an unknown jump's count by one."), - cl::init(50), cl::ReallyHidden, cl::cat(BoltOptCategory)); + cl::desc("The cost of increasing an unknown jump count by one."), + cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory)); cl::opt StaleMatchingCostJumpUnknownFTInc( 
"stale-matching-cost-jump-unknown-ft-inc", cl::desc( - "The cost of increasing an unknown fall-through jump's count by one."), - cl::init(5), cl::ReallyHidden, cl::cat(BoltOptCategory)); + "The cost of increasing an unknown fall-through jump count by one."), + cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory)); } // namespace opts @@ -141,49 +117,32 @@ namespace bolt { /// components are of smaller size (e.g., uint16_t or uint8_t). struct BlendedBlockHash { private: - static uint64_t combineHashes(uint16_t Hash1, uint16_t Hash2, uint16_t Hash3, - uint16_t Hash4) { - uint64_t Hash = 0; - - Hash |= uint64_t(Hash4); - Hash <<= 16; - - Hash |= uint64_t(Hash3); - Hash <<= 16; - - Hash |= uint64_t(Hash2); - Hash <<= 16; - - Hash |= uint64_t(Hash1); - - return Hash; - } - - static void parseHashes(uint64_t Hash, uint16_t &Hash1, uint16_t &Hash2, - uint16_t &Hash3, uint16_t &Hash4) { - Hash1 = Hash & 0xffff; - Hash >>= 16; - - Hash2 = Hash & 0xffff; - Hash >>= 16; - - Hash3 = Hash & 0xffff; - Hash >>= 16; - - Hash4 = Hash & 0xffff; - Hash >>= 16; - } + using ValueOffset = Bitfield::Element; + using ValueOpcode = Bitfield::Element; + using ValueInstr = Bitfield::Element; + using ValuePred = Bitfield::Element; + using ValueSucc = Bitfield::Element; public: explicit BlendedBlockHash() {} - explicit BlendedBlockHash(uint64_t CombinedHash) { - parseHashes(CombinedHash, Offset, OpcodeHash, InstrHash, NeighborHash); + explicit BlendedBlockHash(uint64_t Hash) { + Offset = Bitfield::get(Hash); + OpcodeHash = Bitfield::get(Hash); + InstrHash = Bitfield::get(Hash); + PredHash = Bitfield::get(Hash); + SuccHash = Bitfield::get(Hash); } /// Combine the blended hash into uint64_t. uint64_t combine() const { - return combineHashes(Offset, OpcodeHash, InstrHash, NeighborHash); + uint64_t Hash = 0; + Bitfield::set(Hash, Offset); + Bitfield::set(Hash, OpcodeHash); + Bitfield::set(Hash, InstrHash); + Bitfield::set(Hash, PredHash); + Bitfield::set(Hash, SuccHash); + return Hash; } /// Compute a distance between two given blended hashes. The smaller the @@ -194,7 +153,8 @@ public: "incorrect blended hash distance computation"); uint64_t Dist = 0; // Account for NeighborHash - Dist += NeighborHash == BBH.NeighborHash ? 0 : 1; + Dist += SuccHash == BBH.SuccHash ? 0 : 1; + Dist += PredHash == BBH.PredHash ? 0 : 1; Dist <<= 16; // Account for InstrHash Dist += InstrHash == BBH.InstrHash ? 0 : 1; @@ -211,9 +171,10 @@ public: /// (Strong) Hash of the basic block instructions, including opcodes and /// operands. uint16_t InstrHash{0}; - /// Hash of the (loose) basic block together with (loose) hashes of its - /// successors and predecessors. - uint16_t NeighborHash{0}; + /// (Loose) Hashes of the predecessors of the basic block. + uint8_t PredHash{0}; + /// (Loose) Hashes of the successors of the basic block. + uint8_t SuccHash{0}; }; /// The object is used to identify and match basic blocks in a BinaryFunction @@ -236,14 +197,11 @@ public: /// Find the most similar block for a given hash. 
  const FlowBlock *matchBlock(BlendedBlockHash BlendedHash) const {
     auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
-    if (BlockIt == OpHashToBlocks.end()) {
+    if (BlockIt == OpHashToBlocks.end())
       return nullptr;
-    }
     FlowBlock *BestBlock = nullptr;
     uint64_t BestDist = std::numeric_limits<uint64_t>::max();
-    for (auto It : BlockIt->second) {
-      FlowBlock *Block = It.second;
-      BlendedBlockHash Hash = It.first;
+    for (const auto &[Hash, Block] : BlockIt->second) {
       uint64_t Dist = Hash.distance(BlendedHash);
       if (BestBlock == nullptr || Dist < BestDist) {
         BestDist = Dist;
@@ -253,6 +211,14 @@ public:
     return BestBlock;
   }
 
+  /// Returns true if the two basic blocks (in the binary and in the profile)
+  /// corresponding to the given hashes are matched to each other with a high
+  /// confidence.
+  static bool isHighConfidenceMatch(BlendedBlockHash Hash1,
+                                    BlendedBlockHash Hash2) {
+    return Hash1.InstrHash == Hash2.InstrHash;
+  }
+
 private:
   using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
   std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
@@ -266,46 +232,49 @@ void BinaryFunction::computeBlockHashes() const {
   std::vector<BlendedBlockHash> BlendedHashes(BasicBlocks.size());
   std::vector<uint64_t> OpcodeHashes(BasicBlocks.size());
-  // Initialize hash components
+  // Initialize hash components.
   for (size_t I = 0; I < BasicBlocks.size(); I++) {
     const BinaryBasicBlock *BB = BasicBlocks[I];
     assert(BB->getIndex() == I && "incorrect block index");
     BlendedHashes[I].Offset = BB->getOffset();
-    // Hashing complete instructions
+    // Hashing complete instructions.
     std::string InstrHashStr = hashBlock(
         BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); });
     uint64_t InstrHash = std::hash<std::string>{}(InstrHashStr);
-    BlendedHashes[I].InstrHash = hash_64_to_16(InstrHash);
-    // Hashing opcodes
-    std::string OpcodeHashStr =
-        hashBlock(BC, *BB, [](const MCOperand &Op) { return std::string(); });
+    BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash);
+    // Hashing opcodes.
+    std::string OpcodeHashStr = hashBlockLoose(BC, *BB);
     OpcodeHashes[I] = std::hash<std::string>{}(OpcodeHashStr);
-    BlendedHashes[I].OpcodeHash = hash_64_to_16(OpcodeHashes[I]);
+    BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]);
   }
 
-  // Initialize neighbor hash
+  // Initialize neighbor hash.
   for (size_t I = 0; I < BasicBlocks.size(); I++) {
     const BinaryBasicBlock *BB = BasicBlocks[I];
-    uint64_t Hash = OpcodeHashes[I];
-    // Append hashes of successors
+    // Append hashes of successors.
+    uint64_t Hash = 0;
     for (BinaryBasicBlock *SuccBB : BB->successors()) {
       uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()];
       Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
     }
-    // Append hashes of predecessors
+    BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash);
+
+    // Append hashes of predecessors.
+    Hash = 0;
     for (BinaryBasicBlock *PredBB : BB->predecessors()) {
       uint64_t PredHash = OpcodeHashes[PredBB->getIndex()];
       Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
     }
-    BlendedHashes[I].NeighborHash = hash_64_to_16(Hash);
+    BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash);
  }
 
-  // Assign hashes
+  // Assign hashes.
   for (size_t I = 0; I < BasicBlocks.size(); I++) {
     const BinaryBasicBlock *BB = BasicBlocks[I];
     BB->setHash(BlendedHashes[I].combine());
   }
 }
+
 /// Create a wrapper flow function to use with the profile inference algorithm,
 /// and initialize its jumps and metadata.
 FlowFunction
@@ -314,7 +283,7 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
   // Add a special "dummy" source so that there is always a unique entry point.
// Because of the extra source, for all other blocks in FlowFunction it holds - // that Block.Index == BB->getLayoutIndex() + 1 + // that Block.Index == BB->getIndex() + 1 FlowBlock EntryBlock; EntryBlock.Index = 0; Func.Blocks.push_back(EntryBlock); @@ -325,7 +294,7 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) { FlowBlock &Block = Func.Blocks.back(); Block.Index = Func.Blocks.size() - 1; (void)BB; - assert(Block.Index == BB->getLayoutIndex() + 1 && + assert(Block.Index == BB->getIndex() + 1 && "incorrectly assigned basic block index"); } @@ -341,8 +310,8 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) { Func.Jumps.emplace_back(); FlowJump &Jump = Func.Jumps.back(); - Jump.Source = SrcBB->getLayoutIndex() + 1; - Jump.Target = DstBB->getLayoutIndex() + 1; + Jump.Source = SrcBB->getIndex() + 1; + Jump.Target = DstBB->getIndex() + 1; InDegree[Jump.Target]++; UniqueSuccs.insert(DstBB); } @@ -354,8 +323,8 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) { Func.Jumps.emplace_back(); FlowJump &Jump = Func.Jumps.back(); - Jump.Source = SrcBB->getLayoutIndex() + 1; - Jump.Target = DstBB->getLayoutIndex() + 1; + Jump.Source = SrcBB->getIndex() + 1; + Jump.Target = DstBB->getIndex() + 1; InDegree[Jump.Target]++; UniqueSuccs.insert(DstBB); } @@ -393,7 +362,8 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) { /// of the basic blocks in the binary, the count is "matched" to the block. /// Similarly, if both the source and the target of a count in the profile are /// matched to a jump in the binary, the count is recorded in CFG. -void matchWeightsByHashes(const BinaryFunction::BasicBlockOrderType &BlockOrder, +void matchWeightsByHashes(BinaryContext &BC, + const BinaryFunction::BasicBlockOrderType &BlockOrder, const yaml::bolt::BinaryFunctionProfile &YamlBF, FlowFunction &Func) { assert(Func.Blocks.size() == BlockOrder.size() + 1); @@ -417,19 +387,34 @@ void matchWeightsByHashes(const BinaryFunction::BasicBlockOrderType &BlockOrder, // Match blocks from the profile to the blocks in CFG for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) { assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile"); - BlendedBlockHash BlendedHash(YamlBB.Hash); - const FlowBlock *MatchedBlock = Matcher.matchBlock(BlendedHash); + BlendedBlockHash YamlHash(YamlBB.Hash); + const FlowBlock *MatchedBlock = Matcher.matchBlock(YamlHash); + // Always match the entry block. + if (MatchedBlock == nullptr && YamlBB.Index == 0) + MatchedBlock = Blocks[0]; if (MatchedBlock != nullptr) { MatchedBlocks[YamlBB.Index] = MatchedBlock; - LLVM_DEBUG(dbgs() << "Matched yaml block with bid = " << YamlBB.Index - << " and hash = " << Twine::utohexstr(YamlBB.Hash) - << " to BB with index = " << MatchedBlock->Index - 1 + BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1]; + LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBB.Index << ")" + << " with hash " << Twine::utohexstr(YamlBB.Hash) + << " to BB (index = " << MatchedBlock->Index - 1 << ")" + << " with hash " << Twine::utohexstr(BinHash.combine()) << "\n"); + // Update matching stats accounting for the matched block. 
+ if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) { + ++BC.Stats.NumMatchedBlocks; + BC.Stats.MatchedSampleCount += YamlBB.ExecCount; + LLVM_DEBUG(dbgs() << " exact match\n"); + } } else { LLVM_DEBUG( - dbgs() << "Couldn't match yaml block with bid = " << YamlBB.Index - << " and hash = " << Twine::utohexstr(YamlBB.Hash) << "\n"); + dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index << ")" + << " with hash " << Twine::utohexstr(YamlBB.Hash) << "\n"); } + + // Update matching stats. + ++BC.Stats.NumStaleBlocks; + BC.Stats.StaleSampleCount += YamlBB.ExecCount; } // Match jumps from the profile to the jumps from CFG @@ -475,7 +460,7 @@ void matchWeightsByHashes(const BinaryFunction::BasicBlockOrderType &BlockOrder, // Assign block counts based on in-/out- jumps for (FlowBlock &Block : Func.Blocks) { if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) { - assert(Block.HasUnknownWeight && "unmatched block with positive count"); + assert(Block.HasUnknownWeight && "unmatched block with a positive count"); continue; } Block.HasUnknownWeight = false; @@ -577,16 +562,15 @@ void applyInference(FlowFunction &Func) { Params.JoinIslands = opts::StaleMatchingJoinIslands; Params.CostBlockInc = opts::StaleMatchingCostBlockInc; + Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc; Params.CostBlockDec = opts::StaleMatchingCostBlockDec; - Params.CostBlockEntryInc = opts::StaleMatchingCostBlockEntryInc; - Params.CostBlockEntryDec = opts::StaleMatchingCostBlockEntryDec; - Params.CostBlockZeroInc = opts::StaleMatchingCostBlockZeroInc; + Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec; Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc; Params.CostJumpInc = opts::StaleMatchingCostJumpInc; - Params.CostJumpFTInc = opts::StaleMatchingCostJumpFTInc; + Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc; Params.CostJumpDec = opts::StaleMatchingCostJumpDec; - Params.CostJumpFTDec = opts::StaleMatchingCostJumpFTDec; + Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec; Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc; Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc; @@ -691,31 +675,33 @@ void assignProfile(BinaryFunction &BF, bool YAMLProfileReader::inferStaleProfile( BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) { - // Make sure that block indices and hashes are up to date - BF.getLayout().updateLayoutIndices(); + LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for " + << "\"" << BF.getPrintName() << "\"\n"); + + // Make sure that block hashes are up to date. BF.computeBlockHashes(); const BinaryFunction::BasicBlockOrderType BlockOrder( BF.getLayout().block_begin(), BF.getLayout().block_end()); - // Create a wrapper flow function to use with the profile inference algorithm + // Create a wrapper flow function to use with the profile inference algorithm. FlowFunction Func = createFlowFunction(BlockOrder); // Match as many block/jump counts from the stale profile as possible - matchWeightsByHashes(BlockOrder, YamlBF, Func); + matchWeightsByHashes(BF.getBinaryContext(), BlockOrder, YamlBF, Func); // Adjust the flow function by marking unreachable blocks Unlikely so that - // they don't get any counts assigned + // they don't get any counts assigned. preprocessUnreachableBlocks(Func); - // Check if profile inference can be applied for the instance + // Check if profile inference can be applied for the instance. 
if (!canApplyInference(Func)) return false; - // Apply the profile inference algorithm + // Apply the profile inference algorithm. applyInference(Func); - // Collect inferred counts and update function annotations + // Collect inferred counts and update function annotations. assignProfile(BF, BlockOrder, Func); // As of now, we always mark the binary function having "correct" profile. diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index 90e43b402750d89617774ebccac2b1bce7ecec74..3fd489b570d6c17a3591d97dc7bdd032c6348960 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -250,9 +250,6 @@ bool YAMLProfileReader::parseFunctionProfile( << " edges in profile did not match function " << BF << '\n'; if (!ProfileMatched && opts::InferStaleProfile) { - if (opts::Verbosity >= 1) - outs() << "BOLT-INFO: applying profile inference for " - << "\"" << BF.getPrintName() << "\"\n"; if (inferStaleProfile(BF, YamlBF)) { ProfileMatched = true; BF.markProfiled(YamlBP.Header.Flags); @@ -355,8 +352,10 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) { continue; yaml::bolt::BinaryFunctionProfile &YamlBF = *PI->getValue(); - if (profileMatches(YamlBF, Function)) + if (profileMatches(YamlBF, Function)) { matchProfileToFunction(YamlBF, Function); + break; + } } } diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp index 517984d990fc5a3ecfe8a2d84b910e69d00c3311..5aab26322537292216ce4abb640bc30b337165b6 100644 --- a/bolt/lib/Rewrite/BinaryPassManager.cpp +++ b/bolt/lib/Rewrite/BinaryPassManager.cpp @@ -72,6 +72,11 @@ static cl::opt JTFootprintReductionFlag( "instructions at jump sites"), cl::cat(BoltOptCategory)); +static cl::opt + KeepNops("keep-nops", + cl::desc("keep no-op instructions. By default they are removed."), + cl::Hidden, cl::cat(BoltOptCategory)); + cl::opt NeverPrint("never-print", cl::desc("never print"), cl::ReallyHidden, cl::cat(BoltOptCategory)); @@ -359,7 +364,8 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { Manager.registerPass(std::make_unique(NeverPrint)); - Manager.registerPass(std::make_unique(NeverPrint)); + Manager.registerPass(std::make_unique(NeverPrint), + !opts::KeepNops); Manager.registerPass(std::make_unique(PrintNormalized)); diff --git a/bolt/lib/Rewrite/JITLinkLinker.cpp b/bolt/lib/Rewrite/JITLinkLinker.cpp index 3c74fd5932bf001db04b6b93adceab82c039eaab..994450c75fcfb311037fa9ba08b582e85e72f1cd 100644 --- a/bolt/lib/Rewrite/JITLinkLinker.cpp +++ b/bolt/lib/Rewrite/JITLinkLinker.cpp @@ -31,7 +31,7 @@ bool hasSymbols(const jitlink::Block &B) { Error markSectionsLive(jitlink::LinkGraph &G) { for (auto &Section : G.sections()) { // We only need allocatable sections. - if (Section.getMemLifetimePolicy() == orc::MemLifetimePolicy::NoAlloc) + if (Section.getMemLifetime() == orc::MemLifetime::NoAlloc) continue; // Skip empty sections. 
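// In the JITLinkLinker hunk below, the linker's symbol table starts keeping
// a size next to each address, which is why lookupSymbol is renamed to
// lookupSymbolInfo. A minimal sketch of that table shape (SymbolInfo fields
// inferred from the usage in the patch; toy types, not BOLT's linker):

#include <cstdint>
#include <optional>
#include <string>
#include <unordered_map>

struct SymbolInfo {
  uint64_t Address;
  uint64_t Size;
};

class ToyLinker {
  std::unordered_map<std::string, SymbolInfo> Symtab;

public:
  void define(std::string Name, uint64_t Address, uint64_t Size) {
    Symtab.insert({std::move(Name), SymbolInfo{Address, Size}});
  }

  std::optional<SymbolInfo> lookupSymbolInfo(const std::string &Name) const {
    auto It = Symtab.find(Name);
    if (It == Symtab.end())
      return std::nullopt;
    return It->second;
  }
};

int main() {
  ToyLinker L;
  L.define("__bolt_runtime_start", 0x400000, 16);
  return L.lookupSymbolInfo("__bolt_runtime_start") ? 0 : 1;
}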
@@ -142,8 +142,8 @@ struct JITLinkLinker::Context : jitlink::JITLinkContext { }); for (auto *Symbol : G.defined_symbols()) { - Linker.Symtab.insert( - {Symbol->getName().str(), Symbol->getAddress().getValue()}); + SymbolInfo Info{Symbol->getAddress().getValue(), Symbol->getSize()}; + Linker.Symtab.insert({Symbol->getName().str(), Info}); } return Error::success(); @@ -174,7 +174,8 @@ void JITLinkLinker::loadObject(MemoryBufferRef Obj, jitlink::link(std::move(*LG), std::move(Ctx)); } -std::optional JITLinkLinker::lookupSymbol(StringRef Name) const { +std::optional +JITLinkLinker::lookupSymbolInfo(StringRef Name) const { auto It = Symtab.find(Name.data()); if (It == Symtab.end()) return std::nullopt; diff --git a/bolt/lib/Rewrite/MachORewriteInstance.cpp b/bolt/lib/Rewrite/MachORewriteInstance.cpp index fc7500a6deb08b399e1c14536fe1475cc997a6e7..b827a196c82653aaa79bf0b065314d9befc571f2 100644 --- a/bolt/lib/Rewrite/MachORewriteInstance.cpp +++ b/bolt/lib/Rewrite/MachORewriteInstance.cpp @@ -20,7 +20,6 @@ #include "bolt/Rewrite/JITLinkLinker.h" #include "bolt/RuntimeLibs/InstrumentationRuntimeLibrary.h" #include "bolt/Utils/Utils.h" -#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCObjectStreamer.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" @@ -476,9 +475,6 @@ void MachORewriteInstance::emitAndLink() { "error creating in-memory object"); assert(Obj && "createObjectFile cannot return nullptr"); - MCAsmLayout FinalLayout( - static_cast(Streamer.get())->getAssembler()); - auto EFMM = std::make_unique(*BC); EFMM->setNewSecPrefix(getNewSecPrefix()); EFMM->setOrgSecPrefix(getOrgSecPrefix()); @@ -568,8 +564,10 @@ void MachORewriteInstance::rewriteFile() { writeInstrumentationSection("I__literal16", OS); Out->keep(); - EC = sys::fs::setPermissions(opts::OutputFilename, - sys::fs::perms::all_all); + EC = sys::fs::setPermissions( + opts::OutputFilename, + static_cast(sys::fs::perms::all_all & + ~sys::fs::getUmask())); check_error(EC, "cannot set permissions of output file"); } diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index 64b8a8b6d400fe85efd2d3e7cc507347461ea7c3..316b83cfbd38a52c822ca1eaa6840199b9c12296 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -183,9 +183,7 @@ void PseudoProbeRewriter::updatePseudoProbes() { // A call probe may be duplicated due to ICP // Go through output of InputOffsetToAddressMap to collect all related // probes - const InputOffsetToAddressMapTy &Offset2Addr = - F->getInputOffsetToAddressMap(); - auto CallOutputAddresses = Offset2Addr.equal_range(Offset); + auto CallOutputAddresses = BC.getIOAddressMap().lookupAll(AP.first); auto CallOutputAddress = CallOutputAddresses.first; if (CallOutputAddress == CallOutputAddresses.second) { Probe->setAddress(INT64_MAX); diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 1ade842c4ee0539513d85a20385ddbc5f1793ace..7063b243b52dcc7f5fb5dc06a6c3773efd2bbb76 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "bolt/Rewrite/RewriteInstance.h" +#include "bolt/Core/AddressMap.h" #include "bolt/Core/BinaryContext.h" #include "bolt/Core/BinaryEmitter.h" #include "bolt/Core/BinaryFunction.h" @@ -407,8 +408,9 @@ static bool checkOffsets(const typename ELFT::Phdr &Phdr, return true; // Only non-empty sections can be at the 
end of a segment. - uint64_t SectionSize = Sec.sh_size ? Sec.sh_size : 1; - AddressRange SectionAddressRange(Sec.sh_offset, Sec.sh_offset + SectionSize); + uint64_t SectionSize = Sec.sh_size ? Sec.sh_size : 1ull; + AddressRange SectionAddressRange((uint64_t)Sec.sh_offset, + Sec.sh_offset + SectionSize); AddressRange SegmentAddressRange(Phdr.p_offset, Phdr.p_offset + Phdr.p_filesz); if (SegmentAddressRange.contains(SectionAddressRange)) @@ -424,8 +426,9 @@ template static bool checkVMA(const typename ELFT::Phdr &Phdr, const typename ELFT::Shdr &Sec, bool &Overlap) { // Only non-empty sections can be at the end of a segment. - uint64_t SectionSize = Sec.sh_size ? Sec.sh_size : 1; - AddressRange SectionAddressRange(Sec.sh_addr, Sec.sh_addr + SectionSize); + uint64_t SectionSize = Sec.sh_size ? Sec.sh_size : 1ull; + AddressRange SectionAddressRange((uint64_t)Sec.sh_addr, + Sec.sh_addr + SectionSize); AddressRange SegmentAddressRange(Phdr.p_vaddr, Phdr.p_vaddr + Phdr.p_memsz); if (SegmentAddressRange.contains(SectionAddressRange)) @@ -699,6 +702,10 @@ Error RewriteInstance::run() { adjustCommandLineOptions(); discoverFileObjects(); + if (opts::Instrument && !BC->IsStaticExecutable) + if (Error E = discoverRtFiniAddress()) + return E; + preprocessProfileData(); // Skip disassembling if we have a translation table and we are running an @@ -735,6 +742,9 @@ Error RewriteInstance::run() { updateMetadata(); + if (opts::Instrument && !BC->IsStaticExecutable) + updateRtFiniReloc(); + if (opts::LinuxKernelMode) { errs() << "BOLT-WARNING: not writing the output file for Linux Kernel\n"; return Error::success(); @@ -751,9 +761,6 @@ Error RewriteInstance::run() { void RewriteInstance::discoverFileObjects() { NamedRegionTimer T("discoverFileObjects", "discover file objects", TimerGroupName, TimerGroupDesc, opts::TimeRewrite); - FileSymRefs.clear(); - BC->getBinaryFunctions().clear(); - BC->clearBinaryData(); // For local symbols we want to keep track of associated FILE symbol name for // disambiguation by combined name. @@ -798,7 +805,12 @@ void RewriteInstance::discoverFileObjects() { } // Sort symbols in the file by value. Ignore symbols from non-allocatable - // sections. + // sections. We memoize getAddress(), as it has rather high overhead. + struct SymbolInfo { + uint64_t Address; + SymbolRef Symbol; + }; + std::vector SortedSymbols; auto isSymbolInMemory = [this](const SymbolRef &Sym) { if (cantFail(Sym.getType()) == SymbolRef::ST_File) return false; @@ -809,25 +821,22 @@ void RewriteInstance::discoverFileObjects() { BinarySection Section(*BC, *cantFail(Sym.getSection())); return Section.isAllocatable(); }; - std::vector SortedFileSymbols; - llvm::copy_if(InputFile->symbols(), std::back_inserter(SortedFileSymbols), - isSymbolInMemory); - auto CompareSymbols = [this](const SymbolRef &A, const SymbolRef &B) { - // Marker symbols have the highest precedence, while - // SECTIONs have the lowest. 
- auto AddressA = cantFail(A.getAddress()); - auto AddressB = cantFail(B.getAddress()); - if (AddressA != AddressB) - return AddressA < AddressB; - - bool AMarker = BC->isMarker(A); - bool BMarker = BC->isMarker(B); + for (const SymbolRef &Symbol : InputFile->symbols()) + if (isSymbolInMemory(Symbol)) + SortedSymbols.push_back({cantFail(Symbol.getAddress()), Symbol}); + + auto CompareSymbols = [this](const SymbolInfo &A, const SymbolInfo &B) { + if (A.Address != B.Address) + return A.Address < B.Address; + + const bool AMarker = BC->isMarker(A.Symbol); + const bool BMarker = BC->isMarker(B.Symbol); if (AMarker || BMarker) { return AMarker && !BMarker; } - auto AType = cantFail(A.getType()); - auto BType = cantFail(B.getType()); + const auto AType = cantFail(A.Symbol.getType()); + const auto BType = cantFail(B.Symbol.getType()); if (AType == SymbolRef::ST_Function && BType != SymbolRef::ST_Function) return true; if (BType == SymbolRef::ST_Debug && AType != SymbolRef::ST_Debug) @@ -835,11 +844,10 @@ void RewriteInstance::discoverFileObjects() { return false; }; + llvm::stable_sort(SortedSymbols, CompareSymbols); - llvm::stable_sort(SortedFileSymbols, CompareSymbols); - - auto LastSymbol = SortedFileSymbols.end(); - if (!SortedFileSymbols.empty()) + auto LastSymbol = SortedSymbols.end(); + if (!SortedSymbols.empty()) --LastSymbol; // For aarch64, the ABI defines mapping symbols so we identify data in the @@ -854,39 +862,34 @@ void RewriteInstance::discoverFileObjects() { }; std::vector SortedMarkerSymbols; - auto addExtraDataMarkerPerSymbol = - [this](const std::vector &SortedFileSymbols, - std::vector &SortedMarkerSymbols) { - bool IsData = false; - uint64_t LastAddr = 0; - for (auto Sym = SortedFileSymbols.begin(); - Sym < SortedFileSymbols.end(); ++Sym) { - uint64_t Address = cantFail(Sym->getAddress()); - if (LastAddr == Address) // don't repeat markers - continue; + auto addExtraDataMarkerPerSymbol = [&]() { + bool IsData = false; + uint64_t LastAddr = 0; + for (const auto &SymInfo : SortedSymbols) { + if (LastAddr == SymInfo.Address) // don't repeat markers + continue; - MarkerSymType MarkerType = BC->getMarkerType(*Sym); - if (MarkerType != MarkerSymType::NONE) { - SortedMarkerSymbols.push_back(MarkerSym{Address, MarkerType}); - LastAddr = Address; - IsData = MarkerType == MarkerSymType::DATA; - continue; - } + MarkerSymType MarkerType = BC->getMarkerType(SymInfo.Symbol); + if (MarkerType != MarkerSymType::NONE) { + SortedMarkerSymbols.push_back(MarkerSym{SymInfo.Address, MarkerType}); + LastAddr = SymInfo.Address; + IsData = MarkerType == MarkerSymType::DATA; + continue; + } - if (IsData) { - SortedMarkerSymbols.push_back( - MarkerSym{cantFail(Sym->getAddress()), MarkerSymType::DATA}); - LastAddr = Address; - } - } - }; + if (IsData) { + SortedMarkerSymbols.push_back({SymInfo.Address, MarkerSymType::DATA}); + LastAddr = SymInfo.Address; + } + } + }; - if (BC->isAArch64()) { - addExtraDataMarkerPerSymbol(SortedFileSymbols, SortedMarkerSymbols); + if (BC->isAArch64() || BC->isRISCV()) { + addExtraDataMarkerPerSymbol(); LastSymbol = std::stable_partition( - SortedFileSymbols.begin(), SortedFileSymbols.end(), - [this](const SymbolRef &Symbol) { return !BC->isMarker(Symbol); }); - if (!SortedFileSymbols.empty()) + SortedSymbols.begin(), SortedSymbols.end(), + [this](const SymbolInfo &S) { return !BC->isMarker(S.Symbol); }); + if (!SortedSymbols.empty()) --LastSymbol; } @@ -894,27 +897,21 @@ void RewriteInstance::discoverFileObjects() { unsigned AnonymousId = 0; // Regex object for matching 
cold fragments. - Regex ColdFragment(".*\\.cold(\\.[0-9]+)?"); - - const auto SortedSymbolsEnd = LastSymbol == SortedFileSymbols.end() - ? LastSymbol - : std::next(LastSymbol); - for (auto ISym = SortedFileSymbols.begin(); ISym != SortedSymbolsEnd; - ++ISym) { - const SymbolRef &Symbol = *ISym; - // Keep undefined symbols for pretty printing? - if (cantFail(Symbol.getFlags()) & SymbolRef::SF_Undefined) - continue; - + const Regex ColdFragment(".*\\.cold(\\.[0-9]+)?"); + + const auto SortedSymbolsEnd = + LastSymbol == SortedSymbols.end() ? LastSymbol : std::next(LastSymbol); + for (auto Iter = SortedSymbols.begin(); Iter != SortedSymbolsEnd; ++Iter) { + const SymbolRef &Symbol = Iter->Symbol; + const uint64_t SymbolAddress = Iter->Address; + const auto SymbolFlags = cantFail(Symbol.getFlags()); const SymbolRef::Type SymbolType = cantFail(Symbol.getType()); if (SymbolType == SymbolRef::ST_File) continue; StringRef SymName = cantFail(Symbol.getName(), "cannot get symbol name"); - uint64_t Address = - cantFail(Symbol.getAddress(), "cannot get symbol address"); - if (Address == 0) { + if (SymbolAddress == 0) { if (opts::Verbosity >= 1 && SymbolType == SymbolRef::ST_Function) errs() << "BOLT-WARNING: function with 0 address seen\n"; continue; @@ -924,11 +921,12 @@ void RewriteInstance::discoverFileObjects() { if (SymName == "__hot_start" || SymName == "__hot_end") continue; - FileSymRefs[Address] = Symbol; + FileSymRefs[SymbolAddress] = Symbol; // Skip section symbols that will be registered by disassemblePLT(). - if ((cantFail(Symbol.getType()) == SymbolRef::ST_Debug)) { - ErrorOr BSection = BC->getSectionForAddress(Address); + if (SymbolType == SymbolRef::ST_Debug) { + ErrorOr BSection = + BC->getSectionForAddress(SymbolAddress); if (BSection && getPLTSectionInfo(BSection->getName())) continue; } @@ -950,10 +948,10 @@ void RewriteInstance::discoverFileObjects() { std::string AlternativeName; if (Name.empty()) { UniqueName = "ANONYMOUS." + std::to_string(AnonymousId++); - } else if (cantFail(Symbol.getFlags()) & SymbolRef::SF_Global) { + } else if (SymbolFlags & SymbolRef::SF_Global) { if (const BinaryData *BD = BC->getBinaryDataByName(Name)) { if (BD->getSize() == ELFSymbolRef(Symbol).getSize() && - BD->getAddress() == Address) { + BD->getAddress() == SymbolAddress) { if (opts::Verbosity > 1) errs() << "BOLT-WARNING: ignoring duplicate global symbol " << Name << "\n"; @@ -989,14 +987,13 @@ void RewriteInstance::discoverFileObjects() { uint64_t SymbolSize = ELFSymbolRef(Symbol).getSize(); uint64_t SymbolAlignment = Symbol.getAlignment(); - unsigned SymbolFlags = cantFail(Symbol.getFlags()); auto registerName = [&](uint64_t FinalSize) { // Register names even if it's not a function, e.g. for an entry point. 
- BC->registerNameAtAddress(UniqueName, Address, FinalSize, SymbolAlignment, - SymbolFlags); + BC->registerNameAtAddress(UniqueName, SymbolAddress, FinalSize, + SymbolAlignment, SymbolFlags); if (!AlternativeName.empty()) - BC->registerNameAtAddress(AlternativeName, Address, FinalSize, + BC->registerNameAtAddress(AlternativeName, SymbolAddress, FinalSize, SymbolAlignment, SymbolFlags); }; @@ -1016,7 +1013,7 @@ void RewriteInstance::discoverFileObjects() { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: considering symbol " << UniqueName << " for function\n"); - if (Address == Section->getAddress() + Section->getSize()) { + if (SymbolAddress == Section->getAddress() + Section->getSize()) { assert(SymbolSize == 0 && "unexpect non-zero sized symbol at end of section"); LLVM_DEBUG( @@ -1042,11 +1039,12 @@ void RewriteInstance::discoverFileObjects() { // their local labels. The only way to tell them apart is to look at // symbol scope - global vs local. if (PreviousFunction && SymbolType != SymbolRef::ST_Function) { - if (PreviousFunction->containsAddress(Address)) { + if (PreviousFunction->containsAddress(SymbolAddress)) { if (PreviousFunction->isSymbolValidInScope(Symbol, SymbolSize)) { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: symbol is a function local symbol\n"); - } else if (Address == PreviousFunction->getAddress() && !SymbolSize) { + } else if (SymbolAddress == PreviousFunction->getAddress() && + !SymbolSize) { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: ignoring symbol as a marker\n"); } else if (opts::Verbosity > 1) { errs() << "BOLT-WARNING: symbol " << UniqueName @@ -1063,8 +1061,8 @@ void RewriteInstance::discoverFileObjects() { } } - if (PreviousFunction && PreviousFunction->containsAddress(Address) && - PreviousFunction->getAddress() != Address) { + if (PreviousFunction && PreviousFunction->containsAddress(SymbolAddress) && + PreviousFunction->getAddress() != SymbolAddress) { if (PreviousFunction->isSymbolValidInScope(Symbol, SymbolSize)) { if (opts::Verbosity >= 1) outs() << "BOLT-INFO: skipping possibly another entry for function " @@ -1076,12 +1074,12 @@ void RewriteInstance::discoverFileObjects() { registerName(0); - PreviousFunction->addEntryPointAtOffset(Address - + PreviousFunction->addEntryPointAtOffset(SymbolAddress - PreviousFunction->getAddress()); // Remove the symbol from FileSymRefs so that we can skip it from // in the future. - auto SI = FileSymRefs.find(Address); + auto SI = FileSymRefs.find(SymbolAddress); assert(SI != FileSymRefs.end() && "symbol expected to be present"); assert(SI->second == Symbol && "wrong symbol found"); FileSymRefs.erase(SI); @@ -1091,10 +1089,10 @@ void RewriteInstance::discoverFileObjects() { // Checkout for conflicts with function data from FDEs. bool IsSimple = true; - auto FDEI = CFIRdWrt->getFDEs().lower_bound(Address); + auto FDEI = CFIRdWrt->getFDEs().lower_bound(SymbolAddress); if (FDEI != CFIRdWrt->getFDEs().end()) { const dwarf::FDE &FDE = *FDEI->second; - if (FDEI->first != Address) { + if (FDEI->first != SymbolAddress) { // There's no matching starting address in FDE. Make sure the previous // FDE does not contain this address. 
if (FDEI != CFIRdWrt->getFDEs().begin()) { @@ -1102,7 +1100,8 @@ void RewriteInstance::discoverFileObjects() { const dwarf::FDE &PrevFDE = *FDEI->second; uint64_t PrevStart = PrevFDE.getInitialLocation(); uint64_t PrevLength = PrevFDE.getAddressRange(); - if (Address > PrevStart && Address < PrevStart + PrevLength) { + if (SymbolAddress > PrevStart && + SymbolAddress < PrevStart + PrevLength) { errs() << "BOLT-ERROR: function " << UniqueName << " is in conflict with FDE [" << Twine::utohexstr(PrevStart) << ", " @@ -1119,11 +1118,11 @@ void RewriteInstance::discoverFileObjects() { << "; symbol table : " << SymbolSize << ". Using max size.\n"; } SymbolSize = std::max(SymbolSize, FDE.getAddressRange()); - if (BC->getBinaryDataAtAddress(Address)) { - BC->setBinaryDataSize(Address, SymbolSize); + if (BC->getBinaryDataAtAddress(SymbolAddress)) { + BC->setBinaryDataSize(SymbolAddress, SymbolSize); } else { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: No BD @ 0x" - << Twine::utohexstr(Address) << "\n"); + << Twine::utohexstr(SymbolAddress) << "\n"); } } } @@ -1132,7 +1131,7 @@ void RewriteInstance::discoverFileObjects() { // Since function may not have yet obtained its real size, do a search // using the list of registered functions instead of calling // getBinaryFunctionAtAddress(). - auto BFI = BC->getBinaryFunctions().find(Address); + auto BFI = BC->getBinaryFunctions().find(SymbolAddress); if (BFI != BC->getBinaryFunctions().end()) { BF = &BFI->second; // Duplicate the function name. Make sure everything matches before we add @@ -1146,15 +1145,17 @@ void RewriteInstance::discoverFileObjects() { << BF->getSize() << " new " << SymbolSize << "\n"; } BF->setSize(std::max(SymbolSize, BF->getSize())); - BC->setBinaryDataSize(Address, BF->getSize()); + BC->setBinaryDataSize(SymbolAddress, BF->getSize()); } BF->addAlternativeName(UniqueName); } else { - ErrorOr Section = BC->getSectionForAddress(Address); + ErrorOr Section = + BC->getSectionForAddress(SymbolAddress); // Skip symbols from invalid sections if (!Section) { errs() << "BOLT-WARNING: " << UniqueName << " (0x" - << Twine::utohexstr(Address) << ") does not have any section\n"; + << Twine::utohexstr(SymbolAddress) + << ") does not have any section\n"; continue; } @@ -1162,7 +1163,8 @@ void RewriteInstance::discoverFileObjects() { if (!Section->getSize()) continue; - BF = BC->createBinaryFunction(UniqueName, *Section, Address, SymbolSize); + BF = BC->createBinaryFunction(UniqueName, *Section, SymbolAddress, + SymbolSize); if (!IsSimple) BF->setSimple(false); } @@ -1283,6 +1285,77 @@ void RewriteInstance::discoverFileObjects() { registerFragments(); } +Error RewriteInstance::discoverRtFiniAddress() { + // Use DT_FINI if it's available. 
+ if (BC->FiniAddress) { + BC->FiniFunctionAddress = BC->FiniAddress; + return Error::success(); + } + + if (!BC->FiniArrayAddress || !BC->FiniArraySize) { + return createStringError( + std::errc::not_supported, + "Instrumentation needs either DT_FINI or DT_FINI_ARRAY"); + } + + if (*BC->FiniArraySize < BC->AsmInfo->getCodePointerSize()) { + return createStringError(std::errc::not_supported, + "Need at least 1 DT_FINI_ARRAY slot"); + } + + ErrorOr FiniArraySection = + BC->getSectionForAddress(*BC->FiniArrayAddress); + if (auto EC = FiniArraySection.getError()) + return errorCodeToError(EC); + + if (const Relocation *Reloc = FiniArraySection->getDynamicRelocationAt(0)) { + BC->FiniFunctionAddress = Reloc->Addend; + return Error::success(); + } + + if (const Relocation *Reloc = FiniArraySection->getRelocationAt(0)) { + BC->FiniFunctionAddress = Reloc->Value; + return Error::success(); + } + + return createStringError(std::errc::not_supported, + "No relocation for first DT_FINI_ARRAY slot"); +} + +void RewriteInstance::updateRtFiniReloc() { + // Updating DT_FINI is handled by patchELFDynamic. + if (BC->FiniAddress) + return; + + const RuntimeLibrary *RT = BC->getRuntimeLibrary(); + if (!RT || !RT->getRuntimeFiniAddress()) + return; + + assert(BC->FiniArrayAddress && BC->FiniArraySize && + "inconsistent .fini_array state"); + + ErrorOr FiniArraySection = + BC->getSectionForAddress(*BC->FiniArrayAddress); + assert(FiniArraySection && ".fini_array removed"); + + if (std::optional Reloc = + FiniArraySection->takeDynamicRelocationAt(0)) { + assert(Reloc->Addend == BC->FiniFunctionAddress && + "inconsistent .fini_array dynamic relocation"); + Reloc->Addend = RT->getRuntimeFiniAddress(); + FiniArraySection->addDynamicRelocation(*Reloc); + } + + // Update the static relocation by adding a pending relocation which will get + // patched when flushPendingRelocations is called in rewriteFile. Note that + // flushPendingRelocations will calculate the value to patch as + // "Symbol + Addend". Since we don't have a symbol, just set the addend to the + // desired value. 
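// The pending relocation emitted below relies on flushPendingRelocations
// patching "Symbol + Addend": with a null symbol the written value is just
// the addend, i.e. the runtime library's fini address. A toy version of that
// value computation (illustrative types, not BOLT's Relocation):

#include <cstdint>

struct ToyRelocation {
  uint64_t Offset;
  const void *Symbol; // null here: no symbol contributes to the value
  uint64_t Addend;
};

// What the flush step conceptually writes at Reloc.Offset.
uint64_t patchValue(const ToyRelocation &Reloc, uint64_t SymbolAddress) {
  const uint64_t Base = Reloc.Symbol ? SymbolAddress : 0;
  return Base + Reloc.Addend; // with Symbol == nullptr this is just the addend
}

int main() {
  ToyRelocation R{/*Offset=*/0, /*Symbol=*/nullptr, /*Addend=*/0x5000};
  return patchValue(R, /*SymbolAddress=*/0) == 0x5000 ? 0 : 1;
}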
+ FiniArraySection->addPendingRelocation(Relocation{ + /*Offset*/ 0, /*Symbol*/ nullptr, /*Type*/ Relocation::getAbs64(), + /*Addend*/ RT->getRuntimeFiniAddress(), /*Value*/ 0}); +} + void RewriteInstance::registerFragments() { if (!BC->HasSplitFunctions) return; @@ -1347,24 +1420,41 @@ void RewriteInstance::createPLTBinaryFunction(uint64_t TargetAddress, BinaryFunction *BF = BC->getBinaryFunctionAtAddress(EntryAddress); if (BF && BC->isAArch64()) { - // Handle IFUNC trampoline + // Handle IFUNC trampoline with symbol setPLTSymbol(BF, BF->getOneName()); return; } const Relocation *Rel = BC->getDynamicRelocationAt(TargetAddress); - if (!Rel || !Rel->Symbol) + if (!Rel) return; + MCSymbol *Symbol = Rel->Symbol; + if (!Symbol) { + if (!BC->isAArch64() || !Rel->Addend || !Rel->isIRelative()) + return; + + // IFUNC trampoline without symbol + BinaryFunction *TargetBF = BC->getBinaryFunctionAtAddress(Rel->Addend); + if (!TargetBF) { + errs() + << "BOLT-WARNING: Expected BF to be presented as IFUNC resolver at " + << Twine::utohexstr(Rel->Addend) << ", skipping\n"; + return; + } + + Symbol = TargetBF->getSymbol(); + } + ErrorOr Section = BC->getSectionForAddress(EntryAddress); assert(Section && "cannot get section for address"); if (!BF) - BF = BC->createBinaryFunction(Rel->Symbol->getName().str() + "@PLT", - *Section, EntryAddress, 0, EntrySize, + BF = BC->createBinaryFunction(Symbol->getName().str() + "@PLT", *Section, + EntryAddress, 0, EntrySize, Section->getAlignment()); else - BF->addAlternativeName(Rel->Symbol->getName().str() + "@PLT"); - setPLTSymbol(BF, Rel->Symbol->getName()); + BF->addAlternativeName(Symbol->getName().str() + "@PLT"); + setPLTSymbol(BF, Symbol->getName()); } void RewriteInstance::disassemblePLTSectionAArch64(BinarySection &Section) { @@ -1585,6 +1675,16 @@ void RewriteInstance::adjustFunctionBoundaries() { if (!Function.isSymbolValidInScope(Symbol, SymbolSize)) break; + // Skip basic block labels. This happens on RISC-V with linker relaxation + // enabled because every branch needs a relocation and corresponding + // symbol. We don't want to add such symbols as entry points. + const auto PrivateLabelPrefix = BC->AsmInfo->getPrivateLabelPrefix(); + if (!PrivateLabelPrefix.empty() && + cantFail(Symbol.getName()).starts_with(PrivateLabelPrefix)) { + ++NextSymRefI; + continue; + } + // This is potentially another entry point into the function. uint64_t EntryOffset = NextSymRefI->first - Function.getAddress(); LLVM_DEBUG(dbgs() << "BOLT-DEBUG: adding entry point to function " @@ -2101,6 +2201,19 @@ void RewriteInstance::processDynamicRelocations() { } // The rest of dynamic relocations - DT_RELA. + // The static executable might have .rela.dyn secion and not have PT_DYNAMIC + if (!DynamicRelocationsSize && BC->IsStaticExecutable) { + ErrorOr DynamicRelSectionOrErr = + BC->getUniqueSectionByName(getRelaDynSectionName()); + if (DynamicRelSectionOrErr) { + DynamicRelocationsAddress = DynamicRelSectionOrErr->getAddress(); + DynamicRelocationsSize = DynamicRelSectionOrErr->getSize(); + const SectionRef &SectionRef = DynamicRelSectionOrErr->getSectionRef(); + DynamicRelativeRelocationsCount = std::distance( + SectionRef.relocation_begin(), SectionRef.relocation_end()); + } + } + if (DynamicRelocationsSize > 0) { ErrorOr DynamicRelSectionOrErr = BC->getSectionForAddress(*DynamicRelocationsAddress); @@ -2526,7 +2639,17 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection, // Adjust the point of reference to a code location inside a function. 
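// For a symbol-less IRELATIVE slot, createPLTBinaryFunction above recovers
// the PLT entry's name from the resolver function that the relocation addend
// points at. A compact sketch of that fallback, with a toy address-to-name
// map standing in for BC->getBinaryFunctionAtAddress():

#include <cstdint>
#include <map>
#include <optional>
#include <string>

struct ToyReloc {
  std::optional<std::string> Symbol; // dynamic relocs may carry no symbol
  uint64_t Addend;                   // for IRELATIVE: address of the resolver
  bool IsIRelative;
};

std::optional<std::string>
pltSymbolName(const ToyReloc &Rel,
              const std::map<uint64_t, std::string> &FuncsByAddress) {
  if (Rel.Symbol)
    return *Rel.Symbol + "@PLT"; // normal case: name from the relocation
  if (!Rel.IsIRelative || !Rel.Addend)
    return std::nullopt; // nothing to name the entry after
  auto It = FuncsByAddress.find(Rel.Addend);
  if (It == FuncsByAddress.end())
    return std::nullopt; // resolver unknown: BOLT warns and skips
  return It->second + "@PLT";
}

int main() {
  std::map<uint64_t, std::string> Funcs{{0x1234, "ifunc_resolver"}};
  ToyReloc R{std::nullopt, 0x1234, /*IsIRelative=*/true};
  return pltSymbolName(R, Funcs) ==
                 std::optional<std::string>("ifunc_resolver@PLT")
             ? 0
             : 1;
}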
if (ReferencedBF->containsAddress(Address, /*UseMaxSize = */ true)) { RefFunctionOffset = Address - ReferencedBF->getAddress(); - if (RefFunctionOffset) { + if (Relocation::isInstructionReference(RType)) { + // Instruction labels are created while disassembling so we just leave + // the symbol empty for now. Since the extracted value is typically + // unrelated to the referenced symbol (e.g., %pcrel_lo in RISC-V + // references an instruction but the patched value references the low + // bits of a data address), we set the extracted value to the symbol + // address in order to be able to correctly reconstruct the reference + // later. + ReferencedSymbol = nullptr; + ExtractedValue = Address; + } else if (RefFunctionOffset) { if (ContainingBF && ContainingBF != ReferencedBF) { ReferencedSymbol = ReferencedBF->addEntryPointAtOffset(RefFunctionOffset); @@ -3077,7 +3200,6 @@ void RewriteInstance::buildFunctionsCFG() { // Create annotation indices to allow lock-free execution BC->MIB->getOrCreateAnnotationIndex("JTIndexReg"); BC->MIB->getOrCreateAnnotationIndex("NOP"); - BC->MIB->getOrCreateAnnotationIndex("Size"); ParallelUtilities::WorkFuncWithAllocTy WorkFun = [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocId) { @@ -3235,15 +3357,15 @@ void RewriteInstance::emitAndLink() { Linker->loadObject(ObjectMemBuffer->getMemBufferRef(), [this](auto MapSection) { mapFileSections(MapSection); }); - MCAsmLayout FinalLayout( - static_cast(Streamer.get())->getAssembler()); - // Update output addresses based on the new section map and // layout. Only do this for the object created by ourselves. - updateOutputValues(FinalLayout); + updateOutputValues(*Linker); - if (opts::UpdateDebugSections) + if (opts::UpdateDebugSections) { + MCAsmLayout FinalLayout( + static_cast(Streamer.get())->getAssembler()); DebugInfoRewriter->updateLineTableOffsets(FinalLayout); + } if (RuntimeLibrary *RtLibrary = BC->getRuntimeLibrary()) RtLibrary->link(*BC, ToolPath, *Linker, [this](auto MapSection) { @@ -3574,6 +3696,9 @@ void RewriteInstance::mapAllocatableSections( } for (BinarySection &Section : BC->allocatableSections()) { + if (Section.isLinkOnly()) + continue; + if (!Section.hasValidSectionID()) continue; @@ -3635,9 +3760,12 @@ void RewriteInstance::mapAllocatableSections( } } -void RewriteInstance::updateOutputValues(const MCAsmLayout &Layout) { +void RewriteInstance::updateOutputValues(const BOLTLinker &Linker) { + if (std::optional Map = AddressMap::parse(*BC)) + BC->setIOAddressMap(std::move(*Map)); + for (BinaryFunction *Function : BC->getAllBinaryFunctions()) - Function->updateOutputValues(Layout); + Function->updateOutputValues(Linker); } void RewriteInstance::patchELFPHDRTable() { @@ -4554,15 +4682,12 @@ void RewriteInstance::updateELFSymbolTable( } } - assert((!NumHotTextSymsUpdated || NumHotTextSymsUpdated == 2) && - "either none or both __hot_start/__hot_end symbols were expected"); - assert((!NumHotDataSymsUpdated || NumHotDataSymsUpdated == 2) && - "either none or both __hot_data_start/__hot_data_end symbols were " - "expected"); + auto AddSymbol = [&](const StringRef &Name, uint64_t Address) { + if (!Address) + return; - auto addSymbol = [&](const std::string &Name) { ELFSymTy Symbol; - Symbol.st_value = getNewValueForSymbol(Name); + Symbol.st_value = Address; Symbol.st_shndx = ELF::SHN_ABS; Symbol.st_name = AddToStrTab(Name); Symbol.st_size = 0; @@ -4575,14 +4700,30 @@ void RewriteInstance::updateELFSymbolTable( Symbols.emplace_back(Symbol); }; + // Add runtime library start and fini address 
symbols
+  if (RuntimeLibrary *RtLibrary = BC->getRuntimeLibrary()) {
+    AddSymbol("__bolt_runtime_start", RtLibrary->getRuntimeStartAddress());
+    AddSymbol("__bolt_runtime_fini", RtLibrary->getRuntimeFiniAddress());
+  }
+
+  assert((!NumHotTextSymsUpdated || NumHotTextSymsUpdated == 2) &&
+         "either none or both __hot_start/__hot_end symbols were expected");
+  assert((!NumHotDataSymsUpdated || NumHotDataSymsUpdated == 2) &&
+         "either none or both __hot_data_start/__hot_data_end symbols were "
+         "expected");
+
+  auto AddEmittedSymbol = [&](const StringRef &Name) {
+    AddSymbol(Name, getNewValueForSymbol(Name));
+  };
+
   if (opts::HotText && !NumHotTextSymsUpdated) {
-    addSymbol("__hot_start");
-    addSymbol("__hot_end");
+    AddEmittedSymbol("__hot_start");
+    AddEmittedSymbol("__hot_end");
   }
 
   if (opts::HotData && !NumHotDataSymsUpdated) {
-    addSymbol("__hot_data_start");
-    addSymbol("__hot_data_end");
+    AddEmittedSymbol("__hot_data_start");
+    AddEmittedSymbol("__hot_data_end");
   }
 
   // Put local symbols at the beginning.
@@ -4706,9 +4847,11 @@ void RewriteInstance::patchELFAllocatableRelrSection(
   const uint8_t PSize = BC->AsmInfo->getCodePointerSize();
   const uint64_t MaxDelta = ((CHAR_BIT * DynamicRelrEntrySize) - 1) * PSize;
 
-  auto FixAddend = [&](const BinarySection &Section, const Relocation &Rel) {
+  auto FixAddend = [&](const BinarySection &Section, const Relocation &Rel,
+                       uint64_t FileOffset) {
     // Fix relocation symbol value in place if no static relocation found
-    // on the same address
+    // on the same address. We won't check the BF relocations here since it
+    // is a rare case and no optimization is required.
     if (Section.getRelocationAt(Rel.Offset))
       return;
 
@@ -4717,11 +4860,6 @@ void RewriteInstance::patchELFAllocatableRelrSection(
     if (!Addend)
      return;
 
-    uint64_t FileOffset = Section.getOutputFileOffset();
-    if (!FileOffset)
-      FileOffset = Section.getInputFileOffset();
-
-    FileOffset += Rel.Offset;
     OS.pwrite(reinterpret_cast<const char *>(&Addend), PSize, FileOffset);
   };
 
@@ -4743,7 +4881,7 @@ void RewriteInstance::patchELFAllocatableRelrSection(
       RelOffset = RelOffset == 0 ?
SectionAddress + Rel.Offset : RelOffset; assert((RelOffset & 1) == 0 && "Wrong relocation offset"); RelOffsets.emplace(RelOffset); - FixAddend(Section, Rel); + FixAddend(Section, Rel, RelOffset); } } @@ -5075,7 +5213,13 @@ Error RewriteInstance::readELFDynamic(ELFObjectFile *File) { } break; case ELF::DT_FINI: - BC->FiniFunctionAddress = Dyn.getPtr(); + BC->FiniAddress = Dyn.getPtr(); + break; + case ELF::DT_FINI_ARRAY: + BC->FiniArrayAddress = Dyn.getPtr(); + break; + case ELF::DT_FINI_ARRAYSZ: + BC->FiniArraySize = Dyn.getPtr(); break; case ELF::DT_RELA: DynamicRelocationsAddress = Dyn.getPtr(); @@ -5271,8 +5415,10 @@ void RewriteInstance::rewriteFile() { if (!BF.getFileOffset() || !BF.isEmitted()) continue; OS.seek(BF.getFileOffset()); - for (unsigned I = 0; I < BF.getMaxSize(); ++I) - OS.write((unsigned char)BC->MIB->getTrapFillValue()); + StringRef TrapInstr = BC->MIB->getTrapFillValue(); + unsigned NInstr = BF.getMaxSize() / TrapInstr.size(); + for (unsigned I = 0; I < NInstr; ++I) + OS.write(TrapInstr.data(), TrapInstr.size()); } OS.seek(SavedPos); } @@ -5281,6 +5427,8 @@ void RewriteInstance::rewriteFile() { for (BinarySection &Section : BC->allocatableSections()) { if (!Section.isFinalized() || !Section.getOutputData()) continue; + if (Section.isLinkOnly()) + continue; if (opts::Verbosity >= 1) outs() << "BOLT: writing new section " << Section.getName() @@ -5340,7 +5488,10 @@ void RewriteInstance::rewriteFile() { } Out->keep(); - EC = sys::fs::setPermissions(opts::OutputFilename, sys::fs::perms::all_all); + EC = sys::fs::setPermissions( + opts::OutputFilename, + static_cast(sys::fs::perms::all_all & + ~sys::fs::getUmask())); check_error(EC, "cannot set permissions of output file"); } diff --git a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp index cc36406543f399543526e552bc6a0d06802625fc..cd1b975be7b90e636ca4d5d6877cb309994e1207 100644 --- a/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp +++ b/bolt/lib/RuntimeLibs/InstrumentationRuntimeLibrary.cpp @@ -57,10 +57,11 @@ void InstrumentationRuntimeLibrary::adjustCommandLineOptions( "the input binary\n"; exit(1); } - if (!BC.FiniFunctionAddress && !BC.IsStaticExecutable) { - errs() << "BOLT-ERROR: input binary lacks DT_FINI entry in the dynamic " - "section but instrumentation currently relies on patching " - "DT_FINI to write the profile\n"; + + if (BC.IsStaticExecutable && !opts::InstrumentationSleepTime) { + errs() << "BOLT-ERROR: instrumentation of static binary currently does not " + "support profile output on binary finalization, so it " + "requires -instrumentation-sleep-time=N (N>0) usage\n"; exit(1); } @@ -89,13 +90,6 @@ void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC, "__BOLT", "__counters", MachO::S_REGULAR, SectionKind::getData())); - if (BC.IsStaticExecutable && !opts::InstrumentationSleepTime) { - errs() << "BOLT-ERROR: instrumentation of static binary currently does not " - "support profile output on binary finalization, so it " - "requires -instrumentation-sleep-time=N (N>0) usage\n"; - exit(1); - } - Section->setAlignment(llvm::Align(BC.RegularPageSize)); Streamer.switchSection(Section); diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 777a1e6cc743ba54fd37e26a14073ad8f1ec6740..642de6c3c618233ae08b799515fc3bae55cfcaeb 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -11,6 +11,7 @@ 
//===----------------------------------------------------------------------===// #include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "Utils/AArch64BaseInfo.h" @@ -267,12 +268,40 @@ public: Inst.getOpcode() == AArch64::LDRXui); } - bool isLoad(const MCInst &Inst) const override { + bool mayLoad(const MCInst &Inst) const override { return isLDRB(Inst) || isLDRH(Inst) || isLDRW(Inst) || isLDRX(Inst); } + bool isAArch64Exclusive(const MCInst &Inst) const override { + return (Inst.getOpcode() == AArch64::LDXPX || + Inst.getOpcode() == AArch64::LDXPW || + Inst.getOpcode() == AArch64::LDXRX || + Inst.getOpcode() == AArch64::LDXRW || + Inst.getOpcode() == AArch64::LDXRH || + Inst.getOpcode() == AArch64::LDXRB || + Inst.getOpcode() == AArch64::STXPX || + Inst.getOpcode() == AArch64::STXPW || + Inst.getOpcode() == AArch64::STXRX || + Inst.getOpcode() == AArch64::STXRW || + Inst.getOpcode() == AArch64::STXRH || + Inst.getOpcode() == AArch64::STXRB || + Inst.getOpcode() == AArch64::LDAXPX || + Inst.getOpcode() == AArch64::LDAXPW || + Inst.getOpcode() == AArch64::LDAXRX || + Inst.getOpcode() == AArch64::LDAXRW || + Inst.getOpcode() == AArch64::LDAXRH || + Inst.getOpcode() == AArch64::LDAXRB || + Inst.getOpcode() == AArch64::STLXPX || + Inst.getOpcode() == AArch64::STLXPW || + Inst.getOpcode() == AArch64::STLXRX || + Inst.getOpcode() == AArch64::STLXRW || + Inst.getOpcode() == AArch64::STLXRH || + Inst.getOpcode() == AArch64::STLXRB || + Inst.getOpcode() == AArch64::CLREX); + } + bool isLoadFromStack(const MCInst &Inst) const { - if (!isLoad(Inst)) + if (!mayLoad(Inst)) return false; for (const MCOperand &Operand : useOperands(Inst)) { if (!Operand.isReg()) @@ -442,6 +471,22 @@ public: return true; } + void getCalleeSavedRegs(BitVector &Regs) const override { + Regs |= getAliases(AArch64::X18); + Regs |= getAliases(AArch64::X19); + Regs |= getAliases(AArch64::X20); + Regs |= getAliases(AArch64::X21); + Regs |= getAliases(AArch64::X22); + Regs |= getAliases(AArch64::X23); + Regs |= getAliases(AArch64::X24); + Regs |= getAliases(AArch64::X25); + Regs |= getAliases(AArch64::X26); + Regs |= getAliases(AArch64::X27); + Regs |= getAliases(AArch64::X28); + Regs |= getAliases(AArch64::LR); + Regs |= getAliases(AArch64::FP); + } + const MCExpr *getTargetExprFor(MCInst &Inst, const MCExpr *Expr, MCContext &Ctx, uint64_t RelType) const override { @@ -679,7 +724,7 @@ public: PCRelBase = DefBaseAddr; // Match LOAD to load the jump table (relative) target const MCInst *DefLoad = UsesAdd[2]; - assert(isLoad(*DefLoad) && + assert(mayLoad(*DefLoad) && "Failed to match indirect branch load pattern! (1)"); assert((ScaleValue != 1LL || isLDRB(*DefLoad)) && "Failed to match indirect branch load pattern! 
(2)"); @@ -819,6 +864,14 @@ public: /// add x16, x16, #0xbe0 /// br x17 /// + /// The other type of trampolines are located in .plt.got, that are used for + /// non-lazy bindings so doesn't use x16 arg to transfer .got entry address: + /// + /// adrp x16, 230000 + /// ldr x17, [x16, #3040] + /// br x17 + /// nop + /// uint64_t analyzePLTEntry(MCInst &Instruction, InstructionIterator Begin, InstructionIterator End, uint64_t BeginPC) const override { @@ -1012,7 +1065,7 @@ public: return true; } - bool isStore(const MCInst &Inst) const override { return false; } + bool mayStore(const MCInst &Inst) const override { return false; } bool createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx, bool IsTailCall) override { @@ -1286,6 +1339,10 @@ public: } } + StringRef getTrapFillValue() const override { + return StringRef("\0\0\0\0", 4); + } + bool createReturn(MCInst &Inst) const override { Inst.setOpcode(AArch64::RET); Inst.clear(); @@ -1550,6 +1607,52 @@ public: ELF::R_AARCH64_ADD_ABS_LO12_NC); return Insts; } + + std::optional + createRelocation(const MCFixup &Fixup, + const MCAsmBackend &MAB) const override { + const MCFixupKindInfo &FKI = MAB.getFixupKindInfo(Fixup.getKind()); + + assert(FKI.TargetOffset == 0 && "0-bit relocation offset expected"); + const uint64_t RelOffset = Fixup.getOffset(); + + uint64_t RelType; + if (Fixup.getKind() == MCFixupKind(AArch64::fixup_aarch64_pcrel_call26)) + RelType = ELF::R_AARCH64_CALL26; + else if (FKI.Flags & MCFixupKindInfo::FKF_IsPCRel) { + switch (FKI.TargetSize) { + default: + return std::nullopt; + case 16: + RelType = ELF::R_AARCH64_PREL16; + break; + case 32: + RelType = ELF::R_AARCH64_PREL32; + break; + case 64: + RelType = ELF::R_AARCH64_PREL64; + break; + } + } else { + switch (FKI.TargetSize) { + default: + return std::nullopt; + case 16: + RelType = ELF::R_AARCH64_ABS16; + break; + case 32: + RelType = ELF::R_AARCH64_ABS32; + break; + case 64: + RelType = ELF::R_AARCH64_ABS64; + break; + } + } + + auto [RelSymbol, RelAddend] = extractFixupExpr(Fixup); + + return Relocation({RelOffset, RelSymbol, RelType, RelAddend, 0}); + } }; } // end anonymous namespace diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp index ec5bca85231c0e0ef5d73fb3c538120950fd39be..d13eb22f95826411a5b3db991a9ab6fc7c7c93d1 100644 --- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp +++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp @@ -42,6 +42,7 @@ public: case ELF::R_RISCV_GOT_HI20: case ELF::R_RISCV_PCREL_HI20: case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: return true; default: llvm_unreachable("Unexpected RISCV relocation type in code"); @@ -171,6 +172,10 @@ public: return true; } + StringRef getTrapFillValue() const override { + return StringRef("\0\0\0\0", 4); + } + bool analyzeBranch(InstructionIterator Begin, InstructionIterator End, const MCSymbol *&TBB, const MCSymbol *&FBB, MCInst *&CondBranch, @@ -348,6 +353,7 @@ public: case ELF::R_RISCV_PCREL_HI20: return RISCVMCExpr::create(Expr, RISCVMCExpr::VK_RISCV_PCREL_HI, Ctx); case ELF::R_RISCV_PCREL_LO12_I: + case ELF::R_RISCV_PCREL_LO12_S: return RISCVMCExpr::create(Expr, RISCVMCExpr::VK_RISCV_PCREL_LO, Ctx); case ELF::R_RISCV_CALL: return RISCVMCExpr::create(Expr, RISCVMCExpr::VK_RISCV_CALL, Ctx); diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 265868fbddd41a604cc42192966b1bb13962cc46..4cb9d61710d1da7d85d7ca56c91a7f1a94500cb9 100644 --- 
a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -350,7 +350,7 @@ public: } } - bool isLoad(const MCInst &Inst) const override { + bool mayLoad(const MCInst &Inst) const override { if (isPop(Inst)) return true; @@ -363,7 +363,7 @@ public: return MCII.mayLoad(); } - bool isStore(const MCInst &Inst) const override { + bool mayStore(const MCInst &Inst) const override { if (isPush(Inst)) return true; @@ -416,7 +416,7 @@ public: } } - unsigned getTrapFillValue() const override { return 0xCC; } + StringRef getTrapFillValue() const override { return StringRef("\314", 1); } struct IndJmpMatcherFrag1 : MCInstMatcher { std::unique_ptr Base; @@ -1755,7 +1755,7 @@ public: // - Non-stack loads are prohibited (generally unsafe) // - Stack loads are OK if AllowStackMemOp is true // - Stack loads with RBP are OK if AllowBasePtrStackMemOp is true - if (isLoad(Inst)) { + if (mayLoad(Inst)) { // If stack memory operands are not allowed, no loads are allowed if (!AllowStackMemOp) return false; @@ -2190,7 +2190,7 @@ public: MCInst &CurInst = *Itr++; const MCInstrDesc &Desc = Info->get(CurInst.getOpcode()); if (Desc.hasDefOfPhysReg(CurInst, MethodRegNum, *RegInfo)) { - if (!isLoad(CurInst)) + if (!mayLoad(CurInst)) return false; if (std::optional MO = evaluateX86MemoryOperand(CurInst)) { @@ -2464,46 +2464,9 @@ public: } } - // Extract a symbol and an addend out of the fixup value expression. - // - // Only the following limited expression types are supported: - // Symbol + Addend - // Symbol + Constant + Addend - // Const + Addend - // Symbol - uint64_t Addend = 0; - MCSymbol *Symbol = nullptr; - const MCExpr *ValueExpr = Fixup.getValue(); - if (ValueExpr->getKind() == MCExpr::Binary) { - const auto *BinaryExpr = cast(ValueExpr); - assert(BinaryExpr->getOpcode() == MCBinaryExpr::Add && - "unexpected binary expression"); - const MCExpr *LHS = BinaryExpr->getLHS(); - if (LHS->getKind() == MCExpr::Constant) { - Addend = cast(LHS)->getValue(); - } else if (LHS->getKind() == MCExpr::Binary) { - const auto *LHSBinaryExpr = cast(LHS); - assert(LHSBinaryExpr->getOpcode() == MCBinaryExpr::Add && - "unexpected binary expression"); - const MCExpr *LLHS = LHSBinaryExpr->getLHS(); - assert(LLHS->getKind() == MCExpr::SymbolRef && "unexpected LLHS"); - Symbol = const_cast(this->getTargetSymbol(LLHS)); - const MCExpr *RLHS = LHSBinaryExpr->getRHS(); - assert(RLHS->getKind() == MCExpr::Constant && "unexpected RLHS"); - Addend = cast(RLHS)->getValue(); - } else { - assert(LHS->getKind() == MCExpr::SymbolRef && "unexpected LHS"); - Symbol = const_cast(this->getTargetSymbol(LHS)); - } - const MCExpr *RHS = BinaryExpr->getRHS(); - assert(RHS->getKind() == MCExpr::Constant && "unexpected RHS"); - Addend += cast(RHS)->getValue(); - } else { - assert(ValueExpr->getKind() == MCExpr::SymbolRef && "unexpected value"); - Symbol = const_cast(this->getTargetSymbol(ValueExpr)); - } + auto [RelSymbol, RelAddend] = extractFixupExpr(Fixup); - return Relocation({RelOffset, Symbol, RelType, Addend, 0}); + return Relocation({RelOffset, RelSymbol, RelType, RelAddend, 0}); } bool replaceImmWithSymbolRef(MCInst &Inst, const MCSymbol *Symbol, diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt index 191d2b895b926d0018bbaaaf3195da452adfbbf1..04fc7fee98ab9c070379cf1259028e81a29fe44b 100644 --- a/bolt/runtime/CMakeLists.txt +++ b/bolt/runtime/CMakeLists.txt @@ -1,4 +1,5 @@ cmake_minimum_required(VERSION 3.20.0) +include(CheckCXXCompilerFlag) include(CheckIncludeFiles) include(GNUInstallDirs) 
@@ -28,10 +29,17 @@ set(BOLT_RT_FLAGS
   -fno-rtti
   -fno-stack-protector
   -fPIC
+  -Wno-unused-function
   -mgeneral-regs-only)
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
   set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-sse")
 endif()
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+  check_cxx_compiler_flag("-mno-outline-atomics" CXX_SUPPORTS_OUTLINE_ATOMICS)
+  if (CXX_SUPPORTS_OUTLINE_ATOMICS)
+    set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-outline-atomics")
+  endif()
+endif()
 
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
   include(CheckCXXCompilerFlag)
diff --git a/bolt/runtime/hugify.cpp b/bolt/runtime/hugify.cpp
index 05c1be4f2d70ca6ffb144838e504085a99925b6c..b1c9835936052f9cd67a39efdaa94de1bee80911 100644
--- a/bolt/runtime/hugify.cpp
+++ b/bolt/runtime/hugify.cpp
@@ -5,8 +5,8 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===---------------------------------------------------------------------===//
-
-#if defined (__x86_64__) && !defined(__APPLE__)
+#if (defined (__x86_64__) && !defined(__APPLE__)) \
+|| (defined(__aarch64__) && !defined(__APPLE__))
 
 #include "common.h"
 
@@ -170,6 +170,14 @@ extern "C" __attribute((naked)) void __bolt_hugify_self() {
   __asm__ __volatile__(SAVE_ALL
                        "call __bolt_hugify_self_impl\n"
                        RESTORE_ALL
                        "jmp __bolt_hugify_start_program\n"
                        :: :);
+#elif defined(__aarch64__)
+  __asm__ __volatile__(SAVE_ALL
+                       "bl __bolt_hugify_self_impl\n"
+                       RESTORE_ALL
+                       "adrp x16, __bolt_hugify_start_program\n"
+                       "add x16, x16, #:lo12:__bolt_hugify_start_program\n"
+                       "br x16\n"
+                       :::);
 #else
   exit(1);
 #endif
diff --git a/bolt/test/AArch64/Inputs/iplt.ld b/bolt/test/AArch64/Inputs/iplt.ld
new file mode 100644
index 0000000000000000000000000000000000000000..1e54a249b2182e0ae72a4716e05bc4351bf71329
--- /dev/null
+++ b/bolt/test/AArch64/Inputs/iplt.ld
@@ -0,0 +1,3 @@
+SECTIONS {
+  .plt : ALIGN(16) { *(.plt) *(.iplt) }
+}
diff --git a/bolt/test/AArch64/Inputs/plt-got.yaml b/bolt/test/AArch64/Inputs/plt-got.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7856719c5df83d60ef5a329b2828667de574f297
--- /dev/null
+++ b/bolt/test/AArch64/Inputs/plt-got.yaml
@@ -0,0 +1,216 @@
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data: ELFDATA2LSB
+  Type: ET_DYN
+  Machine: EM_AARCH64
+  Entry: 0x10360
+ProgramHeaders:
+  - Type: PT_PHDR
+    Flags: [ PF_R ]
+    VAddr: 0x40
+    Align: 0x8
+    Offset: 0x40
+  - Type: PT_INTERP
+    Flags: [ PF_R ]
+    FirstSec: .interp
+    LastSec: .interp
+    VAddr: 0x270
+    Offset: 0x270
+  - Type: PT_LOAD
+    Flags: [ PF_R ]
+    FirstSec: .interp
+    LastSec: .rela.dyn
+    Align: 0x10000
+    Offset: 0x0
+  - Type: PT_LOAD
+    Flags: [ PF_X, PF_R ]
+    FirstSec: .plt.got
+    LastSec: .text
+    VAddr: 0x10350
+    Align: 0x10000
+    Offset: 0x2e0
+  - Type: PT_LOAD
+    Flags: [ PF_W, PF_R ]
+    FirstSec: .interp
+    LastSec: .got
+    VAddr: 0x203B0
+    Align: 0x10000
+    Offset: 0x270
+  - Type: PT_LOAD
+    Flags: [ PF_W, PF_R ]
+    FirstSec: .got.plt
+    LastSec: .got.plt
+    VAddr: 0x304E0
+    Align: 0x10000
+    Offset: 0x420
+  - Type: PT_DYNAMIC
+    Flags: [ PF_W, PF_R ]
+    FirstSec: .dynamic
+    LastSec: .dynamic
+    VAddr: 0x203B0
+    Align: 0x8
+    Offset: 0x340
+  - Type: PT_GNU_STACK
+    Flags: [ PF_W, PF_R ]
+    Offset: 0x0
+Sections:
+  - Name: .interp
+    Type: SHT_PROGBITS
+    Flags: [ SHF_ALLOC ]
+    Address: 0x270
+    AddressAlign: 0x1
+    Offset: 0x270
+    Content: 2F6C69622F6C642D6C696E75782D616172636836342E736F2E3100
+  - Name: .dynsym
+    Type: SHT_DYNSYM
+    Flags: [ SHF_ALLOC ]
+    Address: 0x2B0
+    Link: .dynstr
+    AddressAlign: 0x8
+  - Name: .dynstr
+    Type: SHT_STRTAB
+    Flags: [ SHF_ALLOC ]
+    Address: 0x2E0
+    AddressAlign: 0x1
+  - Name:
.rela.dyn
+    Type: SHT_RELA
+    Flags: [ SHF_ALLOC ]
+    Address: 0x2F0
+    Link: .dynsym
+    AddressAlign: 0x8
+    Relocations:
+      - Offset: 0x204D8
+        Symbol: abort
+        Type: R_AARCH64_GLOB_DAT
+  - Name: .plt.got
+    Type: SHT_PROGBITS
+    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address: 0x10350
+    AddressAlign: 0x10
+    Content: 90000090116E42F920021FD61F2003D5
+  - Name: .text
+    Type: SHT_PROGBITS
+    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+    Address: 0x10360
+    AddressAlign: 0x4
+    Content: FF8300D1FD7B01A9FD43009188000090086D42F9E80700F9E80740F9080100F1E8179F1AA800003701000014E80740F900013FD601000014EEFFFF97007D20D41000009010420D9100021FD61F2003D5
+  - Name: .dynamic
+    Type: SHT_DYNAMIC
+    Flags: [ SHF_WRITE, SHF_ALLOC ]
+    Address: 0x203B0
+    Link: .dynstr
+    AddressAlign: 0x8
+    Entries:
+      - Tag: DT_NEEDED
+        Value: 0x1
+      - Tag: DT_RELA
+        Value: 0x2F0
+      - Tag: DT_RELASZ
+        Value: 0x18
+      - Tag: DT_RELAENT
+        Value: 0x18
+      - Tag: DT_PLTGOT
+        Value: 0x304E0
+      - Tag: DT_SYMTAB
+        Value: 0x2B0
+      - Tag: DT_SYMENT
+        Value: 0x18
+      - Tag: DT_STRTAB
+        Value: 0x2E0
+      - Tag: DT_STRSZ
+        Value: 0x10
+      - Tag: DT_GNU_HASH
+        Value: 0x290
+      - Tag: DT_FLAGS_1
+        Value: 0x8000000
+      - Tag: DT_DEBUG
+        Value: 0x0
+      - Tag: DT_NULL
+        Value: 0x0
+  - Name: .got
+    Type: SHT_PROGBITS
+    Flags: [ SHF_WRITE, SHF_ALLOC ]
+    Address: 0x204D0
+    AddressAlign: 0x8
+    Content: '00000000000000000000000000000000'
+  - Name: .got.plt
+    Type: SHT_PROGBITS
+    Flags: [ SHF_WRITE, SHF_ALLOC ]
+    Address: 0x304E0
+    AddressAlign: 0x8
+    Content: B00302000000000000000000000000000000000000000000
+  - Name: .rela.text
+    Type: SHT_RELA
+    Flags: [ SHF_INFO_LINK ]
+    Link: .symtab
+    AddressAlign: 0x8
+    Offset: 0x1268
+    Info: .text
+    Relocations:
+      - Offset: 0x1036C
+        Symbol: abort
+        Type: R_AARCH64_ADR_GOT_PAGE
+      - Offset: 0x10370
+        Symbol: abort
+        Type: R_AARCH64_LD64_GOT_LO12_NC
+      - Offset: 0x10398
+        Symbol: abort
+        Type: R_AARCH64_CALL26
+  - Type: SectionHeaderTable
+    Sections:
+      - Name: .interp
+      - Name: .dynsym
+      - Name: .dynstr
+      - Name: .rela.dyn
+      - Name: .plt.got
+      - Name: .text
+      - Name: .dynamic
+      - Name: .got
+      - Name: .got.plt
+      - Name: .strtab
+      - Name: .symtab
+      - Name: .shstrtab
+      - Name: .rela.text
+Symbols:
+  - Name: .text
+    Type: STT_SECTION
+    Section: .text
+    Value: 0x10360
+  - Name: .dynamic
+    Type: STT_SECTION
+    Section: .dynamic
+    Value: 0x203B0
+  - Name: .got
+    Type: STT_SECTION
+    Section: .got
+    Value: 0x204D0
+  - Name: .got.plt
+    Type: STT_SECTION
+    Section: .got.plt
+    Value: 0x304E0
+  - Name: 'abort$got'
+    Type: STT_OBJECT
+    Section: .got
+    Value: 0x204D8
+  - Name: _start
+    Type: STT_FUNC
+    Section: .text
+    Value: 0x10360
+    Size: 0x3C
+  - Name: _DYNAMIC
+    Section: .dynamic
+    Value: 0x203B0
+  - Name: _GLOBAL_OFFSET_TABLE_
+    Section: .got
+    Value: 0x204D0
+  - Name: abort
+    Type: STT_FUNC
+    Binding: STB_GLOBAL
+    Size: 0x8
+DynamicSymbols:
+  - Name: abort
+    Type: STT_FUNC
+    Binding: STB_GLOBAL
+    Size: 0x8
+...
diff --git a/bolt/test/AArch64/bf_min_alignment.s b/bolt/test/AArch64/bf_min_alignment.s
new file mode 100644
index 0000000000000000000000000000000000000000..2dd06b373a798010820ed43738134e3c871d62c9
--- /dev/null
+++ b/bolt/test/AArch64/bf_min_alignment.s
@@ -0,0 +1,35 @@
+// This test checks that the minimum alignment of an AArch64 function
+// is 4. Otherwise, JITLink would fail to link the binary, since the
+// size of the first function after reordering would not be a multiple
+// of 4.
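+// (Background: AArch64 instructions are a fixed 4 bytes wide, so any
+// function emitted into the text section has to keep 4-byte alignment.)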
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags -fPIC -pie %t.o -o %t.exe -nostdlib -Wl,-q
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-bolt %t.exe -o %t.bolt --use-old-text=0 --lite=0 \
+# RUN:   --align-functions-max-bytes=1 \
+# RUN:   --data %t.fdata --reorder-functions=exec-count
+# RUN: llvm-nm -n %t.bolt | FileCheck %s
+
+# CHECK: {{0|4|8|c}} T dummy
+# CHECK-NEXT: {{0|4|8|c}} T _start
+
+  .text
+  .align 4
+  .global _start
+  .type _start, %function
+_start:
+# FDATA: 0 [unknown] 0 1 _start 0 0 1
+  bl dummy
+  ret
+  .size _start, .-_start
+
+  .global dummy
+  .type dummy, %function
+dummy:
+# FDATA: 0 [unknown] 0 1 dummy 0 0 42
+  adr x0, .Lci
+  ret
+.Lci:
+  .byte 0
+  .size dummy, .-dummy
diff --git a/bolt/test/AArch64/constant_island_pie_update.s b/bolt/test/AArch64/constant_island_pie_update.s
index c6856988d52f737162103695b9c532c37e74f559..0ab67d07a854ec49da550bfd0aa882c875a672b1 100644
--- a/bolt/test/AArch64/constant_island_pie_update.s
+++ b/bolt/test/AArch64/constant_island_pie_update.s
@@ -8,13 +8,16 @@
 # RUN: %clang %cflags -fPIC -pie %t.o -o %t.rela.exe -nostdlib \
 # RUN:   -Wl,-q -Wl,-z,notext
 # RUN: llvm-bolt %t.rela.exe -o %t.rela.bolt --use-old-text=0 --lite=0
-# RUN: llvm-objdump -j .text -d %t.rela.bolt | FileCheck %s
+# RUN: llvm-objdump -j .text -d --show-all-symbols %t.rela.bolt | FileCheck %s
 # RUN: llvm-readelf -rsW %t.rela.bolt | FileCheck --check-prefix=ELFCHECK %s
 // .relr.dyn
 # RUN: %clang %cflags -fPIC -pie %t.o -o %t.relr.exe -nostdlib \
 # RUN:   -Wl,-q -Wl,-z,notext -Wl,--pack-dyn-relocs=relr
+# RUN: llvm-objcopy --remove-section .rela.mytext %t.relr.exe
 # RUN: llvm-bolt %t.relr.exe -o %t.relr.bolt --use-old-text=0 --lite=0
-# RUN: llvm-objdump -j .text -d %t.relr.bolt | FileCheck %s
+# RUN: llvm-objdump -j .text -d --show-all-symbols %t.relr.bolt | FileCheck %s
+# RUN: llvm-objdump -j .text -d %t.relr.bolt | \
+# RUN:   FileCheck %s --check-prefix=ADDENDCHECK
 # RUN: llvm-readelf -rsW %t.relr.bolt | FileCheck --check-prefix=ELFCHECK %s
 # RUN: llvm-readelf -SW %t.relr.bolt | FileCheck --check-prefix=RELRSZCHECK %s
@@ -30,6 +33,11 @@
 # CHECK-NEXT: {{.*}} .word 0x{{[0]+}}[[#ADDR]]
 # CHECK-NEXT: {{.*}} .word 0x00000000
 
+// Check that the addend was properly patched in mytextP with stripped relocations
+# ADDENDCHECK: [[#%x,ADDR:]] :
+# ADDENDCHECK: {{.*}} :
+# ADDENDCHECK-NEXT: {{.*}} .word 0x{{[0]+}}[[#ADDR]]
+# ADDENDCHECK-NEXT: {{.*}} .word 0x00000000
 
 // Check that we've relaxed adr to adrp + add to refer external CI
 # CHECK: :
@@ -40,9 +48,10 @@
 # ELFCHECK: [[#%x,OFF:]] [[#%x,INFO_DYN:]] R_AARCH64_RELATIVE
 # ELFCHECK-NEXT: [[#OFF + 8]] {{0*}}[[#INFO_DYN]] R_AARCH64_RELATIVE
 # ELFCHECK-NEXT: [[#OFF + 24]] {{0*}}[[#INFO_DYN]] R_AARCH64_RELATIVE
+# ELFCHECK-NEXT: {{.*}} R_AARCH64_RELATIVE
 # ELFCHECK: {{.*}}[[#OFF]] {{.*}} $d
 
-// Check that .relr.dyn size is 2 bytes to ensure that last 2 relocations were
+// encoded as a bitmap so the total section size for 3 relocations is 2 bytes.
# RELRSZCHECK: .relr.dyn RELR [[#%x,ADDR:]] [[#%x,OFF:]] {{0*}}10
@@ -81,3 +90,17 @@ addressDynCi:
   adr x1, .Lci
   bl _start
   .size addressDynCi, .-addressDynCi
+
+  .section ".mytext", "ax"
+  .balign 8
+  .global dummy
+  .type dummy, %function
+dummy:
+  nop
+  .word 0
+  .size dummy, .-dummy
+
+  .global mytextP
+mytextP:
+  .xword exitLocal
+  .size mytextP, .-mytextP
diff --git a/bolt/test/AArch64/exclusive-instrument.s b/bolt/test/AArch64/exclusive-instrument.s
new file mode 100644
index 0000000000000000000000000000000000000000..502dd83b2f2a5b8e8f111309d7fce38ad0fad19c
--- /dev/null
+++ b/bolt/test/AArch64/exclusive-instrument.s
@@ -0,0 +1,39 @@
+// This test checks that the foo function, which contains exclusive memory
+// access instructions, won't be instrumented.
+
+// REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}}
+
+// RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \
+// RUN:   %s -o %t.o
+// RUN: %clang %cflags -fPIC -pie %t.o -o %t.exe -nostdlib -Wl,-q -Wl,-fini=dummy
+// RUN: llvm-bolt %t.exe -o %t.bolt -instrument -v=1 | FileCheck %s
+
+// CHECK: Function foo has exclusive instructions, skip instrumentation
+
+.global foo
+.type foo, %function
+foo:
+  ldaxr w9, [x10]
+  cbnz w9, .Lret
+  stlxr w12, w11, [x9]
+  cbz w12, foo
+  clrex
+.Lret:
+  ret
+.size foo, .-foo
+
+.global _start
+.type _start, %function
+_start:
+  cmp x0, #0
+  b.eq .Lexit
+  bl foo
+.Lexit:
+  ret
+.size _start, .-_start
+
+.global dummy
+.type dummy, %function
+dummy:
+  ret
+.size dummy, .-dummy
diff --git a/bolt/test/AArch64/hook-fini.s b/bolt/test/AArch64/hook-fini.s
new file mode 100644
index 0000000000000000000000000000000000000000..4f321d463ef322b541768f2f839e7045ac3d1a83
--- /dev/null
+++ b/bolt/test/AArch64/hook-fini.s
@@ -0,0 +1,103 @@
+## Test the different ways of hooking the fini function for instrumentation (via
+## DT_FINI and via DT_FINI_ARRAY). We test the latter for both PIE and non-PIE
+## binaries because of the different ways of handling relocations (static or
+## dynamic).
+## All tests perform the following steps:
+## - Compile and link for the case to be tested
+## - Some sanity-checks on the dynamic section and relocations in the binary to
+##   verify it has the shape we want for testing:
+##   - DT_FINI or DT_FINI_ARRAY in dynamic section
+##   - No relative relocations for non-PIE
+## - Instrument
+## - Verify generated binary
# REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}}
+
+# RUN: %clang %cflags -pie %s -Wl,-q -o %t.exe
+# RUN: llvm-readelf -d %t.exe | FileCheck --check-prefix=DYN-FINI %s
+# RUN: llvm-readelf -r %t.exe | FileCheck --check-prefix=RELOC-PIE %s
+# RUN: llvm-bolt %t.exe -o %t --instrument
+# RUN: llvm-readelf -drs %t | FileCheck --check-prefix=CHECK-FINI %s
+
+# RUN: %clang %cflags -pie %s -Wl,-q,-fini=0 -o %t-no-fini.exe
+# RUN: llvm-readelf -d %t-no-fini.exe | FileCheck --check-prefix=DYN-NO-FINI %s
+# RUN: llvm-readelf -r %t-no-fini.exe | FileCheck --check-prefix=RELOC-PIE %s
+# RUN: llvm-bolt %t-no-fini.exe -o %t-no-fini --instrument
+# RUN: llvm-readelf -drs %t-no-fini | FileCheck --check-prefix=CHECK-NO-FINI %s
+# RUN: llvm-readelf -ds -x .fini_array %t-no-fini | FileCheck --check-prefix=CHECK-NO-FINI-RELOC %s
+
+## Create a dummy shared library to link against to force creation of the dynamic section.
+# RUN: %clang %cflags %p/../Inputs/stub.c -fPIC -shared -o %t-stub.so +# RUN: %clang %cflags %s -no-pie -Wl,-q,-fini=0 %t-stub.so -o %t-no-pie-no-fini.exe +# RUN: llvm-readelf -r %t-no-pie-no-fini.exe | FileCheck --check-prefix=RELOC-NO-PIE %s +# RUN: llvm-bolt %t-no-pie-no-fini.exe -o %t-no-pie-no-fini --instrument +# RUN: llvm-readelf -ds -x .fini_array %t-no-pie-no-fini | FileCheck --check-prefix=CHECK-NO-PIE-NO-FINI %s + +## With fini: dynamic section should contain DT_FINI +# DYN-FINI: (FINI) + +## Without fini: dynamic section should only contain DT_FINI_ARRAY +# DYN-NO-FINI-NOT: (FINI) +# DYN-NO-FINI: (FINI_ARRAY) +# DYN-NO-FINI: (FINI_ARRAYSZ) + +## With PIE: binary should have relative relocations +# RELOC-PIE: R_AARCH64_RELATIVE + +## Without PIE: binary should not have relative relocations +# RELOC-NO-PIE-NOT: R_AARCH64_RELATIVE + +## Check that DT_FINI is set to __bolt_runtime_fini +# CHECK-FINI: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-FINI-DAG: (FINI) 0x[[FINI:[[:xdigit:]]+]] +# CHECK-FINI-DAG: (FINI_ARRAY) 0x[[FINI_ARRAY:[[:xdigit:]]+]] +## Check that the dynamic relocation at .fini_array was not patched +# CHECK-FINI: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries +# CHECK-FINI-NOT: {{0+}}[[FINI_ARRAY]] {{.*}} R_AARCH64_RELATIVE [[FINI]] +# CHECK-FINI: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-FINI: {{0+}}[[FINI]] {{.*}} __bolt_runtime_fini + +## Check that DT_FINI_ARRAY has a dynamic relocation for __bolt_runtime_fini +# CHECK-NO-FINI: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-NO-FINI-NOT: (FINI) +# CHECK-NO-FINI: (FINI_ARRAY) 0x[[FINI_ARRAY:[[:xdigit:]]+]] +# CHECK-NO-FINI: Relocation section '.rela.dyn' at offset {{.*}} contains {{.*}} entries +# CHECK-NO-FINI: {{0+}}[[FINI_ARRAY]] {{.*}} R_AARCH64_RELATIVE [[FINI_ADDR:[[:xdigit:]]+]] +# CHECK-NO-FINI: Symbol table '.symtab' contains {{.*}} entries: +# CHECK-NO-FINI: {{0+}}[[FINI_ADDR]] {{.*}} __bolt_runtime_fini + +## Check that the static relocation in .fini_array is patched even for PIE +# CHECK-NO-FINI-RELOC: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-NO-FINI-RELOC: (FINI_ARRAY) 0x[[FINI_ARRAY:[[:xdigit:]]+]] +# CHECK-NO-FINI-RELOC: Symbol table '.symtab' contains {{.*}} entries: +## Read bytes separately so we can reverse them later +# CHECK-NO-FINI-RELOC: {{0+}}[[FINI_ADDR_B0:[[:xdigit:]]{2}]][[FINI_ADDR_B1:[[:xdigit:]]{2}]][[FINI_ADDR_B2:[[:xdigit:]]{2}]][[FINI_ADDR_B3:[[:xdigit:]]{2}]] {{.*}} __bolt_runtime_fini +# CHECK-NO-FINI-RELOC: Hex dump of section '.fini_array': +# CHECK-NO-FINI-RELOC: 0x{{0+}}[[FINI_ARRAY]] [[FINI_ADDR_B3]][[FINI_ADDR_B2]][[FINI_ADDR_B1]][[FINI_ADDR_B0]] 00000000 + +## Check that DT_FINI_ARRAY has static relocation applied for __bolt_runtime_fini +# CHECK-NO-PIE-NO-FINI: Dynamic section at offset {{.*}} contains {{.*}} entries: +# CHECK-NO-PIE-NO-FINI-NOT: (FINI) +# CHECK-NO-PIE-NO-FINI: (FINI_ARRAY) 0x[[FINI_ARRAY:[a-f0-9]+]] +# CHECK-NO-PIE-NO-FINI: Symbol table '.symtab' contains {{.*}} entries: +## Read address bytes separately so we can reverse them later +# CHECK-NO-PIE-NO-FINI: {{0+}}[[FINI_ADDR_B0:[[:xdigit:]]{2}]][[FINI_ADDR_B1:[[:xdigit:]]{2}]][[FINI_ADDR_B2:[[:xdigit:]]{2}]][[FINI_ADDR_B3:[[:xdigit:]]{2}]] {{.*}} __bolt_runtime_fini +# CHECK-NO-PIE-NO-FINI: Hex dump of section '.fini_array': +# CHECK-NO-PIE-NO-FINI: 0x{{0+}}[[FINI_ARRAY]] [[FINI_ADDR_B3]][[FINI_ADDR_B2]][[FINI_ADDR_B1]][[FINI_ADDR_B0]] 00000000 + + .globl _start + .type _start, %function +_start: 
+  # Dummy relocation to force relocation mode.
+  .reloc 0, R_AARCH64_NONE
+  ret
+.size _start, .-_start
+
+  .globl _fini
+  .type _fini, %function
+_fini:
+  ret
+  .size _fini, .-_fini
+
+  .section .fini_array,"aw"
+  .align 3
+  .dword _fini
diff --git a/bolt/test/AArch64/ifunc.c b/bolt/test/AArch64/ifunc.c
new file mode 100644
index 0000000000000000000000000000000000000000..79c035ed45373c70f19e2af0730ca6f5e3e5ef2c
--- /dev/null
+++ b/bolt/test/AArch64/ifunc.c
@@ -0,0 +1,59 @@
+// This test checks that an IFUNC trampoline is properly recognised by BOLT.
+
+// With -O0, the indirect call is performed on an IPLT trampoline, and the
+// IPLT trampoline has an IFUNC symbol.
+// RUN: %clang %cflags -nostdlib -O0 -no-pie %s -fuse-ld=lld \
+// RUN:    -o %t.O0.exe -Wl,-q
+// RUN: llvm-bolt %t.O0.exe -o %t.O0.bolt.exe \
+// RUN:   --print-disasm --print-only=_start | \
+// RUN:   FileCheck --check-prefix=CHECK %s
+// RUN: llvm-readelf -aW %t.O0.bolt.exe | \
+// RUN:   FileCheck --check-prefix=REL_CHECK %s
+
+// A non-PIE static executable doesn't generate PT_DYNAMIC; check that the
+// relocation is read successfully and the IPLT trampoline is identified by
+// BOLT.
+// RUN: %clang %cflags -nostdlib -O3 %s -fuse-ld=lld -no-pie \
+// RUN:   -o %t.O3_nopie.exe -Wl,-q
+// RUN: llvm-readelf -l %t.O3_nopie.exe | \
+// RUN:   FileCheck --check-prefix=NON_DYN_CHECK %s
+// RUN: llvm-bolt %t.O3_nopie.exe -o %t.O3_nopie.bolt.exe \
+// RUN:   --print-disasm --print-only=_start | \
+// RUN:   FileCheck --check-prefix=CHECK %s
+// RUN: llvm-readelf -aW %t.O3_nopie.bolt.exe | \
+// RUN:   FileCheck --check-prefix=REL_CHECK %s
+
+// With -O3, a direct call is performed on the IPLT trampoline. The IPLT
+// trampoline doesn't have an associated symbol; the ifunc symbol has the
+// same address as the IFUNC resolver function.
+// RUN: %clang %cflags -nostdlib -O3 %s -fuse-ld=lld -fPIC -pie \
+// RUN:   -o %t.O3_pie.exe -Wl,-q
+// RUN: llvm-bolt %t.O3_pie.exe -o %t.O3_pie.bolt.exe \
+// RUN:   --print-disasm --print-only=_start | \
+// RUN:   FileCheck --check-prefix=CHECK %s
+// RUN: llvm-readelf -aW %t.O3_pie.bolt.exe | \
+// RUN:   FileCheck --check-prefix=REL_CHECK %s
+
+// Check that IPLT trampolines located in the .plt section are handled
+// normally by BOLT. The GNU ld linker doesn't use a separate .iplt section.
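+// (For reference, an IPLT trampoline here is expected to look much like the
+// PLT entries described in AArch64MCPlusBuilder -- e.g. adrp x16, ...;
+// ldr x17, [x16, ...]; br x17 -- with an R_AARCH64_IRELATIVE relocation
+// naming the resolver.)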
+// RUN: %clang %cflags -nostdlib -O3 %s -fuse-ld=lld -fPIC -pie \
+// RUN:   -T %p/Inputs/iplt.ld -o %t.iplt_O3_pie.exe -Wl,-q
+// RUN: llvm-bolt %t.iplt_O3_pie.exe -o %t.iplt_O3_pie.bolt.exe \
+// RUN:   --print-disasm --print-only=_start | \
+// RUN:   FileCheck --check-prefix=CHECK %s
+// RUN: llvm-readelf -aW %t.iplt_O3_pie.bolt.exe | \
+// RUN:   FileCheck --check-prefix=REL_CHECK %s
+
+// NON_DYN_CHECK-NOT: DYNAMIC
+
+// CHECK: b{{l?}} "{{resolver_foo|ifoo}}{{.*}}@PLT"
+
+// REL_CHECK: R_AARCH64_IRELATIVE [[#%x,REL_SYMB_ADDR:]]
+// REL_CHECK: [[#REL_SYMB_ADDR]] {{.*}} FUNC {{.*}} resolver_foo
+
+static void foo() {}
+
+static void *resolver_foo(void) { return foo; }
+
+__attribute__((ifunc("resolver_foo"))) void ifoo();
+
+void _start() { ifoo(); }
diff --git a/bolt/test/AArch64/plt-got.test b/bolt/test/AArch64/plt-got.test
new file mode 100644
index 0000000000000000000000000000000000000000..be1c095784b7090bdea01fb5492479a4abdb9938
--- /dev/null
+++ b/bolt/test/AArch64/plt-got.test
@@ -0,0 +1,7 @@
+// This test checks .plt.got handling by BOLT.
+
+RUN: yaml2obj %p/Inputs/plt-got.yaml &> %t.exe
+RUN: llvm-bolt %t.exe -o %t.bolt --print-disasm --print-only=_start/1 | \
+RUN:   FileCheck %s
+
+CHECK: bl abort@PLT
diff --git a/bolt/test/AArch64/r_aarch64_prelxx.s b/bolt/test/AArch64/r_aarch64_prelxx.s
index 444dee72b7c04eb0172c85b01979592d954ce840..73bf8387d3634518278b12ee168c0ce44eba8f92 100644
--- a/bolt/test/AArch64/r_aarch64_prelxx.s
+++ b/bolt/test/AArch64/r_aarch64_prelxx.s
@@ -12,7 +12,7 @@
 // CHECKPREL-NEXT: R_AARCH64_PREL32 {{.*}} _start + 4
 // CHECKPREL-NEXT: R_AARCH64_PREL64 {{.*}} _start + 8
 
-// RUN: llvm-bolt %t.exe -o %t.bolt
+// RUN: llvm-bolt %t.exe -o %t.bolt --strict
 
 // RUN: llvm-objdump -D %t.bolt | FileCheck %s --check-prefix=CHECKPREL32
 
 // CHECKPREL32: [[#%x,DATATABLEADDR:]] :
diff --git a/bolt/test/AArch64/reloc-call26.s b/bolt/test/AArch64/reloc-call26.s
new file mode 100644
index 0000000000000000000000000000000000000000..42e4f7f2b43786fcdd72ba4a614e93562b380fe8
--- /dev/null
+++ b/bolt/test/AArch64/reloc-call26.s
@@ -0,0 +1,51 @@
+## This test checks the processing of the R_AARCH64_CALL26 relocation
+## when the `--funcs` option is enabled.
+
+## We want to test relocations against functions with both higher and
+## lower addresses. The '--force-patch' option is used to prevent the
+## functions func1 and func2 from being optimized, so that their
+## addresses remain unchanged. The relocations can therefore be updated
+## via encodeValueAArch64, and the address order in the output binary is
+## func1 < _start < func2.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \
+# RUN:   %s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --funcs=func1,func2 \
+# RUN:   --force-patch 2>&1 | FileCheck %s -check-prefix=CHECK-BOLT
+# RUN: llvm-objdump -d --disassemble-symbols='_start' %t.bolt | \
+# RUN:   FileCheck %s
+# RUN: llvm-nm --numeric-sort --extern-only %t.bolt | FileCheck \
+# RUN:   %s -check-prefix=CHECK-FUNC-ORDER
+
+# CHECK-BOLT: BOLT-WARNING: failed to patch entries in func1. The function will not be optimized.
+# CHECK-BOLT: BOLT-WARNING: failed to patch entries in func2. The function will not be optimized.
+# CHECK: {{.*}} bl {{.*}} +# CHECK: {{.*}} bl {{.*}} + +# CHECK-FUNC-ORDER: {{.*}} func1 +# CHECK-FUNC-ORDER-NEXT: {{.*}} _start +# CHECK-FUNC-ORDER-NEXT: {{.*}} func2 + + .text + .align 4 + .global func1 + .type func1, %function +func1: + ret + .size func1, .-func1 + .global _start + .type _start, %function +_start: + bl func1 + bl func2 + mov w8, #93 + svc #0 + .size _start, .-_start + .global func2 + .type func2, %function +func2: + ret + .size func2, .-func2 diff --git a/bolt/test/RISCV/branch-no-secondary-entry.s b/bolt/test/RISCV/branch-no-secondary-entry.s new file mode 100644 index 0000000000000000000000000000000000000000..bf8191f25744c9049d0dbf19ad8e499dd31fc031 --- /dev/null +++ b/bolt/test/RISCV/branch-no-secondary-entry.s @@ -0,0 +1,18 @@ +/// Test that no secondary entry points are created for basic block labels used +/// by branches. +// RUN: %clang %cflags -o %t %s +// RUN: llvm-bolt -print-cfg -o /dev/null %t 2>&1 | FileCheck %s + +// CHECK: Binary Function "_start" after building cfg { +// CHECK: IsMultiEntry: 0 +// CHECK: beq t0, t1, .Ltmp0 +// CHECK: {{^}}.Ltmp0 +// CHECK: ret + + .globl _start +_start: + beq t0, t1, 1f +1: + ret + .size _start, .-_start + diff --git a/bolt/test/RISCV/load-store.s b/bolt/test/RISCV/load-store.s new file mode 100644 index 0000000000000000000000000000000000000000..5a9785571c808118b87109c87e4facfa88bafaf7 --- /dev/null +++ b/bolt/test/RISCV/load-store.s @@ -0,0 +1,16 @@ +// RUN: %clang %cflags -o %t %s +// RUN: link_fdata --no-lbr %s %t %t.fdata +// RUN: llvm-bolt %t -o /dev/null --data=%t.fdata --dyno-stats | FileCheck %s + +// CHECK: BOLT-INFO: program-wide dynostats after all optimizations before SCTC and FOP (no change): +// CHECK: 3000 : executed instructions +// CHECK: 1000 : executed load instructions +// CHECK: 1000 : executed store instructions + + .globl _start +_start: +# FDATA: 1 _start #_start# 1 + ld t0, (gp) + sd t0, (gp) + ret + .size _start, .-_start diff --git a/bolt/test/RISCV/mapping-syms.s b/bolt/test/RISCV/mapping-syms.s new file mode 100644 index 0000000000000000000000000000000000000000..e8fdeb0c7572dea6c7b0d2b7293121b40237c511 --- /dev/null +++ b/bolt/test/RISCV/mapping-syms.s @@ -0,0 +1,27 @@ +/// FIXME llvm-mc is used instead of clang because we need a recent change in +/// the RISC-V MC layer (D153260). Once that one is released, we can switch to +/// clang. (Note that the pre-merge check buildbots use the system's clang). 
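+/// (The test relies on the assembler emitting $x/$d mapping symbols around
+/// the .word below; those symbols are what let BOLT tell code from data.)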
+// RUN: llvm-mc -triple riscv64 -mattr=+c -filetype obj -o %t.o %s +// RUN: ld.lld -o %t %t.o +// RUN: llvm-bolt --print-cfg --print-only=_start -o %t.bolt %t 2>&1 | FileCheck %s +// RUN: llvm-objdump -d %t.bolt | FileCheck --check-prefix=CHECK-OBJDUMP %s + +// CHECK-NOT: BOLT-WARNING + +/// Check that .word is not disassembled by BOLT +// CHECK: 00000000: nop +// CHECK: 00000002: ret + +/// Check .word is still present in output +// CHECK-OBJDUMP: <_start>: +// CHECK-OBJDUMP-NEXT: nop +// CHECK-OBJDUMP-NEXT: unimp +// CHECK-OBJDUMP-NEXT: unimp +// CHECK-OBJDUMP-NEXT: ret + .text + .globl _start + .p2align 1 +_start: + nop + .word 0x0 + ret diff --git a/bolt/test/RISCV/reloc-abs.s b/bolt/test/RISCV/reloc-abs.s index 3e4b8b1395e1ff8312dadd0d4abbabf4359fa7b5..5b728f092b3c9f587d063db38f85cda7605b0866 100644 --- a/bolt/test/RISCV/reloc-abs.s +++ b/bolt/test/RISCV/reloc-abs.s @@ -17,8 +17,7 @@ _start: .option push .option norelax 1: -// CHECK: .Ltmp0 -// CHECK: auipc gp, %pcrel_hi(__global_pointer$) +// CHECK: auipc gp, %pcrel_hi(__global_pointer$) # Label: .Ltmp0 // CHECK-NEXT: addi gp, gp, %pcrel_lo(.Ltmp0) auipc gp, %pcrel_hi(__global_pointer$) addi gp, gp, %pcrel_lo(1b) diff --git a/bolt/test/RISCV/reloc-bb-split.s b/bolt/test/RISCV/reloc-bb-split.s new file mode 100644 index 0000000000000000000000000000000000000000..5995562cf130b07083c967d4797a141a6e9472e3 --- /dev/null +++ b/bolt/test/RISCV/reloc-bb-split.s @@ -0,0 +1,42 @@ +// RUN: %clang %cflags -o %t %s +// RUN: llvm-bolt --print-cfg --print-only=_start -o /dev/null %t \ +// RUN: | FileCheck %s + + .data + .globl d + .p2align 3 +d: + .dword 0 + + .text + .globl _start + .p2align 1 +// CHECK-LABEL: Binary Function "_start" after building cfg { +_start: +/// The local label is used for %pcrel_lo as well as a jump target so a new +/// basic block should start there. +// CHECK-LABEL: {{^}}.LBB00 +// CHECK: nop +// CHECK-LABEL: {{^}}.Ltmp0 +// CHECK: auipc t0, %pcrel_hi(d) # Label: .Ltmp1 +// CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp1)(t0) +// CHECK-NEXT: j .Ltmp0 + nop +1: + auipc t0, %pcrel_hi(d) + ld t0, %pcrel_lo(1b)(t0) + j 1b + +/// The local label is used only for %pcrel_lo so no new basic block should +/// start there. 
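+/// (Recall the idiom: %pcrel_lo(1b) refers back to the auipc at local label
+/// 1, not to `d` itself, so that label is an instruction reference rather
+/// than a branch target.)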
+// CHECK-LABEL: {{^}}.LFT0 +// CHECK: nop +// CHECK-NEXT: auipc t0, %pcrel_hi(d) # Label: .Ltmp2 +// CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp2)(t0) +// CHECK-NEXT: ret + nop +1: + auipc t0, %pcrel_hi(d) + ld t0, %pcrel_lo(1b)(t0) + ret + .size _start, .-_start diff --git a/bolt/test/RISCV/reloc-got.s b/bolt/test/RISCV/reloc-got.s index b6cd61be723bfa7c7012eee5a4dd484152a1921d..dcf9d0ea3ffbf23b999f1dfb2013a448a42e9739 100644 --- a/bolt/test/RISCV/reloc-got.s +++ b/bolt/test/RISCV/reloc-got.s @@ -14,8 +14,7 @@ d: // CHECK: Binary Function "_start" after building cfg { _start: nop // Here to not make the _start and .Ltmp0 symbols coincide -// CHECK: .Ltmp0 -// CHECK: auipc t0, %pcrel_hi(__BOLT_got_zero+{{[0-9]+}}) +// CHECK: auipc t0, %pcrel_hi(__BOLT_got_zero+{{[0-9]+}}) # Label: .Ltmp0 // CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp0)(t0) 1: auipc t0, %got_pcrel_hi(d) diff --git a/bolt/test/RISCV/reloc-pcrel.s b/bolt/test/RISCV/reloc-pcrel.s index 2d5a349d03e788ee1bba2a6586f9b5ad9cd6f45c..3ad3015a0a57fadaa65fa8e2902b63a76f6e9f8d 100644 --- a/bolt/test/RISCV/reloc-pcrel.s +++ b/bolt/test/RISCV/reloc-pcrel.s @@ -14,9 +14,11 @@ d: // CHECK: Binary Function "_start" after building cfg { _start: nop // Here to not make the _start and .Ltmp0 symbols coincide -// CHECK: .Ltmp0 -// CHECK: auipc t0, %pcrel_hi(d) +// CHECK: auipc t0, %pcrel_hi(d) # Label: .Ltmp0 // CHECK-NEXT: ld t0, %pcrel_lo(.Ltmp0)(t0) ld t0, d +// CHECK-NEXT: auipc t1, %pcrel_hi(d) # Label: .Ltmp1 +// CHECK-NEXT: sd t0, %pcrel_lo(.Ltmp1)(t1) + sd t0, d, t1 ret .size _start, .-_start diff --git a/bolt/test/X86/Inputs/blarge_profile_stale.yaml b/bolt/test/X86/Inputs/blarge_profile_stale.yaml index afe76eda5485005569330f21adbf4289cbfc0423..f5abaed3da39412588473a90779761c9d245a4dd 100644 --- a/bolt/test/X86/Inputs/blarge_profile_stale.yaml +++ b/bolt/test/X86/Inputs/blarge_profile_stale.yaml @@ -6,6 +6,7 @@ header: profile-flags: [ lbr ] profile-origin: branch profile reader profile-events: '' + dfs-order: false functions: - name: SolveCubic fid: 6 @@ -15,20 +16,24 @@ functions: blocks: - bid: 0 insns: 43 - hash: 0xD2411AC186118199 + hash: 0xed4db287e71c0000 exec: 151 - succ: [ { bid: 1, cnt: 4, mis: 2 }, { bid: 11, cnt: 0 } ] + succ: [ { bid: 1, cnt: 151, mis: 2 }, { bid: 7, cnt: 0 } ] - bid: 1 insns: 7 - hash: 0xDF0C9CC1FEAA70C3 - succ: [ { bid: 10, cnt: 0 }, { bid: 2, cnt: 0 } ] + hash: 0x39330000e4560088 + succ: [ { bid: 13, cnt: 151 }, { bid: 2, cnt: 0 } ] - bid: 13 insns: 26 - hash: 0xF05DC5524E99E56F - succ: [ { bid: 15, cnt: 89 }, { bid: 14, cnt: 0 } ] - - bid: 15 + hash: 0xa9700000fe202a7 + succ: [ { bid: 3, cnt: 89 }, { bid: 2, cnt: 10 } ] + - bid: 3 + insns: 9 + hash: 0x62391dad18a700a0 + succ: [ { bid: 5, cnt: 151 } ] + - bid: 5 insns: 9 - hash: 0xB2E8338276A9834E + hash: 0x4d906d19ecec0111 - name: usqrt fid: 7 hash: 0x8B62B1F9AD81EA35 @@ -37,15 +42,15 @@ functions: blocks: - bid: 0 insns: 4 - hash: 0xb1e5b76571270000 + hash: 0x1111111111111111 exec: 20 succ: [ { bid: 1, cnt: 0 } ] - bid: 1 insns: 9 - hash: 0x587e93788b970010 + hash: 0x27e43a5e10cd0010 succ: [ { bid: 3, cnt: 320, mis: 171 }, { bid: 2, cnt: 0 } ] - bid: 3 insns: 2 - hash: 0x20e605d745e50039 + hash: 0x4db935b6471e0039 succ: [ { bid: 1, cnt: 300, mis: 33 }, { bid: 4, cnt: 20 } ] ... 
diff --git a/bolt/test/X86/Inputs/patch-entries.c b/bolt/test/X86/Inputs/patch-entries.c new file mode 100644 index 0000000000000000000000000000000000000000..46a3b41b048e466e6440f24b01c5ce299bc47ac4 --- /dev/null +++ b/bolt/test/X86/Inputs/patch-entries.c @@ -0,0 +1,8 @@ +#include "stub.h" + +static void foo() { printf("foo\n"); } + +int main() { + foo(); + return 0; +} diff --git a/bolt/test/X86/bolt-address-translation-internal-call.test b/bolt/test/X86/bolt-address-translation-internal-call.test index e24a9e6dc1c2272050f2c7a75d89dac48dcf8014..24cb635e13e9830e4a0f103b6aadcb83426cbc97 100644 --- a/bolt/test/X86/bolt-address-translation-internal-call.test +++ b/bolt/test/X86/bolt-address-translation-internal-call.test @@ -9,7 +9,7 @@ # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o # Delete our BB symbols so BOLT doesn't mark them as entry points # RUN: llvm-strip --strip-unneeded %t.o -# RUN: %clang %t.o -o %t.exe -Wl,-q +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q # RUN: llvm-bolt --enable-bat %t.exe --relocs -o %t.out | FileCheck %s # CHECK: BOLT-INFO: Wrote {{.*}} BAT maps @@ -29,6 +29,7 @@ main: push %rbx sub $0x120,%rsp mov $0x3,%rbx + movq rel(%rip), %rdi .J1: cmp $0x0,%rbx je .J2 @@ -49,4 +50,8 @@ main: .J4: pop %rbp retq +end: .size main, .-main + + .data +rel: .quad end diff --git a/bolt/test/X86/bug-function-layout-execount.s b/bolt/test/X86/bug-function-layout-execount.s new file mode 100644 index 0000000000000000000000000000000000000000..540b6790d01e900a77ccd1b2740b13ea24042108 --- /dev/null +++ b/bolt/test/X86/bug-function-layout-execount.s @@ -0,0 +1,73 @@ +# Verifies that llvm-bolt correctly sorts functions by their execution counts. + +# REQUIRES: x86_64-linux, asserts + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe --data %t.fdata --lite --reorder-functions=exec-count \ +# RUN: -v=2 --debug-only=hfsort -o /dev/null 2>&1 | FileCheck %s + +# CHECK: Starting pass: reorder-functions +# CHECK-NEXT: hot func func2 (1500) +# CHECK-NEXT: hot func func1 (500) +# CHECK-NEXT: hot func main (400) +# CHECK-NEXT: hot func func5 (110) +# CHECK-NEXT: hot func func3 (100) +# CHECK-NEXT: hot func func4 (99) + + .text + .globl main + .type main, %function +main: +# FDATA: 0 [unknown] 0 1 main 0 1 400 + .cfi_startproc + call func1 + retq + .size _start, .-_start + .cfi_endproc + + .globl func1 + .type func1,@function +func1: +# FDATA: 0 [unknown] 0 1 func1 0 1 500 + .cfi_startproc + retq + .size func1, .-func1 + .cfi_endproc + + .globl func2 + .type func2,@function +func2: +# FDATA: 0 [unknown] 0 1 func2 0 1 1500 + .cfi_startproc + retq + .size func2, .-func2 + .cfi_endproc + + .globl func3 + .type func3,@function +func3: +# FDATA: 0 [unknown] 0 1 func3 0 1 100 + .cfi_startproc + retq + .size func3, .-func3 + .cfi_endproc + + .globl func4 + .type func4,@function +func4: +# FDATA: 0 [unknown] 0 1 func4 0 1 99 + .cfi_startproc + retq + .size func4, .-func4 + .cfi_endproc + + .globl func5 + .type func5,@function +func5: +# FDATA: 0 [unknown] 0 1 func5 0 1 110 + .cfi_startproc + retq + .size func5, .-func5 + .cfi_endproc diff --git a/bolt/test/X86/calculate-emitted-block-size.s b/bolt/test/X86/calculate-emitted-block-size.s new file mode 100644 index 0000000000000000000000000000000000000000..b1d05b83cb87c74d8794066504331e47899452d0 --- /dev/null +++ b/bolt/test/X86/calculate-emitted-block-size.s @@ -0,0 +1,101 @@ +# Test 
BinaryContext::calculateEmittedSize's functionality to update +# BinaryBasicBlock::OutputAddressRange in place so that the emitted size +# of each basic block is given by BinaryBasicBlock::getOutputSize() + +# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=all \ +# RUN: --print-split --print-only=chain --print-output-address-range \ +# RUN: --data=%t.fdata --reorder-blocks=ext-tsp \ +# RUN: 2>&1 | FileCheck --check-prefix=SPLITALL %s +# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --print-split \ +# RUN: --print-only=chain --print-output-address-range \ +# RUN: --data=%t.fdata --reorder-blocks=ext-tsp \ +# RUN: 2>&1 | FileCheck --check-prefix=SPLITHOTCOLD %s + +# SPLITALL: {{^\.LBB00}} +# SPLITALL: Output Address Range: [0x0, 0x12) (18 bytes) +# SPLITALL: {{^\.LFT0}} +# SPLITALL: Output Address Range: [0x0, 0xa) (10 bytes) +# SPLITALL: {{^\.Ltmp1}} +# SPLITALL: Output Address Range: [0x0, 0x2) (2 bytes) +# SPLITALL: {{^\.Ltmp0}} +# SPLITALL: Output Address Range: [0x0, 0x10) (16 bytes) +# SPLITALL: {{^\.Ltmp2}} +# SPLITALL: Output Address Range: [0x0, 0x8) (8 bytes) +# SPLITALL: {{^\.LFT1}} +# SPLITALL: Output Address Range: [0x0, 0x8) (8 bytes) + +# SPLITHOTCOLD: {{^\.LBB00}} +# SPLITHOTCOLD: Output Address Range: [0x0, 0x9) (9 bytes) +# SPLITHOTCOLD: {{^\.LFT0}} +# SPLITHOTCOLD: Output Address Range: [0x9, 0xe) (5 bytes) +# SPLITHOTCOLD: {{^\.Ltmp1}} +# SPLITHOTCOLD: Output Address Range: [0xe, 0x10) (2 bytes) +# SPLITHOTCOLD: {{^\.Ltmp0}} +# SPLITHOTCOLD: Output Address Range: [0x10, 0x1b) (11 bytes) +# SPLITHOTCOLD: {{^\.Ltmp2}} +# SPLITHOTCOLD: Output Address Range: [0x1b, 0x20) (5 bytes) +# SPLITHOTCOLD: {{^\.LFT1}} +# SPLITHOTCOLD: Output Address Range: [0x0, 0x8) (8 bytes) + + .text + .globl chain + .type chain, @function +chain: + pushq %rbp + movq %rsp, %rbp + cmpl $2, %edi +LLentry_LLchain_start: + jge LLchain_start +# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLchain_start# 0 10 +# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLfast# 0 500 +LLfast: + movl $5, %eax +LLfast_LLexit: + jmp LLexit +# FDATA: 1 chain #LLfast_LLexit# 1 chain #LLexit# 0 500 +LLchain_start: + movl $10, %eax +LLchain_start_LLchain1: + jge LLchain1 +# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLchain1# 0 10 +# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLcold# 0 0 +LLcold: + addl $1, %eax +LLchain1: + addl $1, %eax +LLchain1_LLexit: + jmp LLexit +# FDATA: 1 chain #LLchain1_LLexit# 1 chain #LLexit# 0 10 +LLexit: + popq %rbp + ret +LLchain_end: + .size chain, LLchain_end-chain + + + .globl main + .type main, @function +main: + pushq %rbp + movq %rsp, %rbp + movl $1, %edi +LLmain_chain1: + call chain +# FDATA: 1 main #LLmain_chain1# 1 chain 0 0 500 + movl $4, %edi +LLmain_chain2: + call chain +# FDATA: 1 main #LLmain_chain2# 1 chain 0 0 10 + xorl %eax, %eax + popq %rbp + retq +.Lmain_end: + .size main, .Lmain_end-main diff --git a/bolt/test/X86/checkvma-large-section.test b/bolt/test/X86/checkvma-large-section.test new file mode 100644 index 0000000000000000000000000000000000000000..afa44111ead49e5d7d46f13a8578144cb6311fd9 --- /dev/null +++ 
b/bolt/test/X86/checkvma-large-section.test @@ -0,0 +1,35 @@ +# This test reproduces the issue with a section which ends at >4G address +REQUIRES: asserts +RUN: split-file %s %t +RUN: yaml2obj %t/yaml -o %t.exe --max-size=0 +RUN: llvm-bolt %t.exe -o /dev/null --allow-stripped +#--- yaml +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_LOAD + FirstSec: .a + LastSec: .a + Align: 0x1000 + - Type: PT_LOAD + Flags: [ PF_R, PF_W ] + FirstSec: .large_sec + LastSec: .large_sec + VAddr: 0x80000000 + - Type: PT_GNU_RELRO + Flags: [ PF_R ] +Sections: + - Name: .a + Type: SHT_PROGBITS + Content: 00 + AddressAlign: 0x1 + - Name: .large_sec + Type: SHT_NOBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x80000000 + Size: 0x80000000 +... diff --git a/bolt/test/X86/dwarf4-df-dualcu.test b/bolt/test/X86/dwarf4-df-dualcu.test index 71726136d7ca5fd947dc4418a001be6b6bae6fa8..c8135ac54377f845fe9514bf9dddb44633a1a9b5 100644 --- a/bolt/test/X86/dwarf4-df-dualcu.test +++ b/bolt/test/X86/dwarf4-df-dualcu.test @@ -1,7 +1,7 @@ ; RUN: rm -rf %t ; RUN: mkdir %t ; RUN: cd %t -;; RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-df-dualcu-main.s \ +; RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-df-dualcu-main.s \ ; RUN: -split-dwarf-file=main.dwo -o main.o ; RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-df-dualcu-helper.s \ ; RUN: -split-dwarf-file=helper.dwo -o helper.o @@ -12,7 +12,7 @@ ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe.bolt >> %t/foo.txt ; RUN: cat %t/foo.txt | FileCheck -check-prefix=BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.dwo &> maindwo.txt -; RUN cat maindwo.txt | FileCheck -check-prefix=PRE-BOLT-DWO-MAIN %s +; RUN: cat maindwo.txt | FileCheck -check-prefix=PRE-BOLT-DWO-MAIN %s ; RUN: not llvm-dwarfdump --show-form --verbose --debug-info main.dwo.dwo &> mainddwodwo.txt ; RUN: cat mainddwodwo.txt | FileCheck -check-prefix=BOLT-DWO-MAIN %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-info helper.dwo &> helperdwo.txt diff --git a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp index f6ebd6b76f60acc63ce87f045b791198ba08cb3a..4ed8be42cd0f37fdfa6e02e165059dc33d943cc6 100644 --- a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp +++ b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp @@ -1,7 +1,7 @@ // This test checks that .eh_frame_hdr address is in bounds of the last LOAD // end address i.e. the section address is smaller then the LOAD end address. 
-// REQUIRES: system-linux,bolt-runtime +// REQUIRES: system-linux,bolt-runtime,target=x86_64{{.*}} // RUN: %clangxx %cxxflags -static -Wl,-q %s -o %t.exe -Wl,--entry=_start // RUN: llvm-bolt %t.exe -o %t.instr -instrument \ diff --git a/bolt/test/X86/internal-call-instrument-so.s b/bolt/test/X86/internal-call-instrument-so.s index b8903fc7f8223c5545b41fbaec0d70f6b7e2ffbe..d13c828f605c3e3d5d8d5b4af93f03ebe198b05f 100644 --- a/bolt/test/X86/internal-call-instrument-so.s +++ b/bolt/test/X86/internal-call-instrument-so.s @@ -1,6 +1,6 @@ # This reproduces a bug with instrumentation crashes on internal call -# REQUIRES: system-linux,bolt-runtime +# REQUIRES: system-linux,bolt-runtime,target=x86_64{{.*}} # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o # Delete our BB symbols so BOLT doesn't mark them as entry points @@ -41,7 +41,6 @@ _start: retq .size _start, .-_start - .globl _fini .type _fini, %function .p2align 4 diff --git a/bolt/test/X86/internal-call-instrument.s b/bolt/test/X86/internal-call-instrument.s index 7ddfb4fb812d3528c6f2ca7fe01925ea18dfb664..c393f1dac864718c38dc6080394d495b6d119171 100644 --- a/bolt/test/X86/internal-call-instrument.s +++ b/bolt/test/X86/internal-call-instrument.s @@ -1,15 +1,23 @@ # This reproduces a bug with instrumentation crashes on internal call -# REQUIRES: x86_64-linux,bolt-runtime +# REQUIRES: x86_64-linux,bolt-runtime,target=x86_64{{.*}} # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o # Delete our BB symbols so BOLT doesn't mark them as entry points # RUN: llvm-strip --strip-unneeded %t.o -# RUN: %clang %t.o -o %t.exe -Wl,-q +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q # RUN: llvm-bolt --instrument %t.exe --relocs -o %t.out .text + .globl _start + .type _start, %function + .p2align 4 +_start: + call main + ret + .size _start, .-_start + .globl main .type main, %function .p2align 4 @@ -20,6 +28,7 @@ main: push %rbx sub $0x120,%rsp mov $0x3,%rbx + movq rel(%rip), %rdi .J1: cmp $0x0,%rbx je .J2 @@ -40,4 +49,15 @@ main: .J4: pop %rbp retq +end: .size main, .-main + + .globl _fini + .type _fini, %function + .p2align 4 +_fini: + hlt + .size _fini, .-_fini + + .data +rel: .quad end diff --git a/bolt/test/X86/issue26.s b/bolt/test/X86/issue26.s index a6e38b6e4ceffd9aaddb482aee4c09f9126925a5..6f9bc72d6e10dcff385e6d5e920407b0b02ae2cf 100644 --- a/bolt/test/X86/issue26.s +++ b/bolt/test/X86/issue26.s @@ -7,7 +7,7 @@ # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \ # RUN: %s -o %t.o # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe --relocs --print-cfg -o %t.out \ +# RUN: llvm-bolt %t.exe --relocs --print-cfg -o %t.out 2>&1 \ # RUN: | FileCheck %s # CHECK-NOT: BOLT-WARNING: CFG invalid in XYZ @ .LBB0 diff --git a/bolt/test/X86/issue26.test b/bolt/test/X86/issue26.test index 5bf25e6a59bab40c0fe1ddefa92d1eebf3acb67f..bafd0912cf4a48e7f98e86e0653ab09ec8640c02 100644 --- a/bolt/test/X86/issue26.test +++ b/bolt/test/X86/issue26.test @@ -1,7 +1,7 @@ # This reproduces issue 26 from our github repo # RUN: yaml2obj %p/Inputs/issue26.yaml &> %t.exe -# RUN: llvm-bolt %t.exe --relocs --print-cfg -o %t.out \ +# RUN: llvm-bolt %t.exe --relocs --print-cfg -o %t.out 2>&1 \ # RUN: | FileCheck %s CHECK-NOT: BOLT-WARNING: CFG invalid in XYZ @ .LBB0 diff --git a/bolt/test/X86/jump-table-func-entry.s b/bolt/test/X86/jump-table-func-entry.s new file mode 100644 index 0000000000000000000000000000000000000000..77b444d520a1f105c86c6ea8f5210e29bc3a847e --- /dev/null +++ b/bolt/test/X86/jump-table-func-entry.s @@ 
-0,0 +1,72 @@ +# REQUIRES: system-linux + +## Check that BOLT correctly processes jump table that contains function start +## as one of its entries. + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.exe -no-pie -Wl,-q + +# RUN: llvm-bolt %t.exe --print-normalized --print-only=foo -o %t.out \ +# RUN: |& FileCheck %s + + + + .text + .globl _start + .type _start, %function +_start: + .cfi_startproc + call foo + ret + .cfi_endproc + .size _start, .-_start + + .globl foo + .type foo, %function +foo: + .cfi_startproc +.LBB00: + movq 0x8(%rdi), %rdi + movzbl 0x1(%rdi), %eax +.LBB00_br: + jmpq *"JUMP_TABLE/foo.0"(,%rax,8) +# CHECK: jmpq {{.*}} # JUMPTABLE +# CHECK-NEXT: Successors: {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}} + +.Ltmp87085: + xorl %eax, %eax + retq + +.Ltmp87086: + cmpb $0x0, 0x8(%rdi) + setne %al + retq + +.Ltmp87088: + movb $0x1, %al + retq + +.Ltmp87087: + movzbl 0x14(%rdi), %eax + andb $0x2, %al + shrb %al + retq + + .cfi_endproc +.size foo, .-foo + +# Jump tables +.section .rodata +"JUMP_TABLE/foo.0": + .quad .Ltmp87085 + .quad .Ltmp87086 + .quad .Ltmp87087 + .quad .LBB00 + .quad .Ltmp87088 + +# CHECK: Jump table {{.*}} for function foo +# CHECK-NEXT: 0x{{.*}} : +# CHECK-NEXT: 0x{{.*}} : +# CHECK-NEXT: 0x{{.*}} : +# CHECK-NEXT: 0x{{.*}} : +# CHECK-NEXT: 0x{{.*}} : diff --git a/bolt/test/X86/jump-table-icp.test b/bolt/test/X86/jump-table-icp.test index 708f1273af3f19600fe1ac6cf06b6ca375194cb1..5b989d18018b0505f7293c6d00d238e5b1e101d6 100644 --- a/bolt/test/X86/jump-table-icp.test +++ b/bolt/test/X86/jump-table-icp.test @@ -12,6 +12,7 @@ RUN: (llvm-bolt %t.exe --data %t.fdata -o %t --relocs \ RUN: --reorder-blocks=cache --split-functions --split-all-cold \ RUN: --use-gnu-stack --dyno-stats --indirect-call-promotion=jump-tables \ RUN: --print-icp -v=0 \ +RUN: --enable-bat --print-cache-metrics \ RUN: --icp-jt-remaining-percent-threshold=10 \ RUN: --icp-jt-total-percent-threshold=2 \ RUN: --indirect-call-promotion-topn=1 \ @@ -36,12 +37,14 @@ CHECK: Successors: .Ltmp{{.*}} (mispreds: 189, count: 189), .LFT{{.*}} (mispre CHECK: .LFT{{.*}} (4 instructions, align : 1) CHECK-NEXT: Exec Count : 881 CHECK: Predecessors: .LBB{{.*}} -CHECK: Successors: .Ltmp{{.*}} (mispreds: 138, count: 155), .Ltmp{{.*}} (mispreds: 0, count: 726) +CHECK: je {{.*}} # Offset: 28 +CHECK-NEXT: Successors: .Ltmp{{.*}} (mispreds: 138, count: 155), .Ltmp{{.*}} (mispreds: 0, count: 726) CHECK: .Ltmp{{.*}} (1 instructions, align : 1) CHECK-NEXT: Exec Count : 726 CHECK: Predecessors: .LFT{{.*}} -CHECK: Successors: .L{{.*}} (mispreds: 126, count: 157), .L{{.*}} (mispreds: 140, count: 156), .L{{.*}} (mispreds: 134, count: 152), .L{{.*}} (mispreds: 137, count: 150), .L{{.*}} (mispreds: 129, count: 148), .L{{.*}} (mispreds: 0, count: 0) +CHECK: jmpq {{.*}} # Offset: 28 +CHECK-NEXT: Successors: .L{{.*}} (mispreds: 126, count: 157), .L{{.*}} (mispreds: 140, count: 156), .L{{.*}} (mispreds: 134, count: 152), .L{{.*}} (mispreds: 137, count: 150), .L{{.*}} (mispreds: 129, count: 148), .L{{.*}} (mispreds: 0, count: 0) CHECK: .Ltmp{{.*}} (5 instructions, align : 1) CHECK-NEXT: Exec Count : 167 diff --git a/bolt/test/X86/keep-nops.s b/bolt/test/X86/keep-nops.s new file mode 100644 index 0000000000000000000000000000000000000000..37da2ff07b9b79827a3ff43e43897bc53b792fe3 --- /dev/null +++ b/bolt/test/X86/keep-nops.s @@ -0,0 +1,69 @@ +## Check that BOLT preserves NOP instructions of different sizes correctly. 
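Background for the byte sequences checked below: x86-64 defines one recommended NOP encoding per length from 1 to 15 bytes. The following illustrative C++ table (not part of the test) mirrors exactly the encodings that the llvm-objdump CHECK lines verify:

#include <cstdint>
#include <cstdio>
#include <vector>

// Canonical x86-64 NOP encodings for lengths 1..15, as emitted by LLVM and
// as checked by this test's CHECK lines.
static const std::vector<std::vector<uint8_t>> Nops = {
    {0x90},
    {0x66, 0x90},
    {0x0f, 0x1f, 0x00},
    {0x0f, 0x1f, 0x40, 0x00},
    {0x0f, 0x1f, 0x44, 0x00, 0x00},
    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
    {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
    {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x66, 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x66, 0x66, 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
};

int main() {
  for (size_t i = 0; i < Nops.size(); ++i) {
    printf("nop%zu:", i + 1);
    for (uint8_t b : Nops[i])
      printf(" %02x", b);
    printf("\n");
  }
}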
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-linux %s -o %t.o
+# RUN: ld.lld %t.o -o %t.exe -q
+# RUN: llvm-bolt %t.exe -o %t.bolt.exe --keep-nops --relocs --print-finalized \
+# RUN:   |& FileCheck --check-prefix=CHECK-BOLT %s
+# RUN: llvm-objdump -d %t.bolt.exe | FileCheck %s
+
+  .text
+  .globl _start
+  .type _start,@function
+_start:
+  .cfi_startproc
+  .nops 1
+  .nops 2
+  .nops 3
+  .nops 4
+  .nops 5
+  .nops 6
+  .nops 7
+  .nops 8
+  .nops 9
+  .nops 10
+  .nops 11
+  .nops 12
+  .nops 13
+  .nops 14
+  .nops 15
+
+# CHECK: <_start>:
+# CHECK-NEXT: 90
+# CHECK-NEXT: 66 90
+# CHECK-NEXT: 0f 1f 00
+# CHECK-NEXT: 0f 1f 40 00
+# CHECK-NEXT: 0f 1f 44 00 00
+# CHECK-NEXT: 66 0f 1f 44 00 00
+# CHECK-NEXT: 0f 1f 80 00 00 00 00
+# CHECK-NEXT: 0f 1f 84 00 00 00 00 00
+# CHECK-NEXT: 66 0f 1f 84 00 00 00 00 00
+# CHECK-NEXT: 66 2e 0f 1f 84 00 00 00 00 00
+# CHECK-NEXT: 66 66 2e 0f 1f 84 00 00 00 00 00
+# CHECK-NEXT: 66 66 66 2e 0f 1f 84 00 00 00 00 00
+# CHECK-NEXT: 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
+# CHECK-NEXT: 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
+# CHECK-NEXT: 66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
+
+# CHECK-BOLT: Size: 1
+# CHECK-BOLT-NEXT: Size: 2
+# CHECK-BOLT-NEXT: Size: 3
+# CHECK-BOLT-NEXT: Size: 4
+# CHECK-BOLT-NEXT: Size: 5
+# CHECK-BOLT-NEXT: Size: 6
+# CHECK-BOLT-NEXT: Size: 7
+# CHECK-BOLT-NEXT: Size: 8
+# CHECK-BOLT-NEXT: Size: 9
+# CHECK-BOLT-NEXT: Size: 10
+# CHECK-BOLT-NEXT: Size: 11
+# CHECK-BOLT-NEXT: Size: 12
+# CHECK-BOLT-NEXT: Size: 13
+# CHECK-BOLT-NEXT: Size: 14
+# CHECK-BOLT-NEXT: Size: 15
+
+# Needed for relocation mode.
+  .reloc 0, R_X86_64_NONE
+
+  .size _start, .-_start
+  .cfi_endproc
diff --git a/bolt/test/X86/patch-entries.test b/bolt/test/X86/patch-entries.test
new file mode 100644
index 0000000000000000000000000000000000000000..54f358f273e793c30da84fe3a6134e77d4cb4759
--- /dev/null
+++ b/bolt/test/X86/patch-entries.test
@@ -0,0 +1,10 @@
+# Check that BOLT does not crash on binary functions injected by the
+# patch-entries pass when debug info is turned on. Previously, we were
+# trying to fetch input-to-output maps for the injected functions and
+# crashing.
+
+REQUIRES: system-linux
+
+RUN: %clang %cflags -no-pie -g %p/Inputs/patch-entries.c -fuse-ld=lld -o %t.exe \
+RUN:   -Wl,-q -I%p/../Inputs
+RUN: llvm-bolt -relocs %t.exe -o %t.out --update-debug-sections --force-patch
diff --git a/bolt/test/X86/reader-stale-yaml.test b/bolt/test/X86/reader-stale-yaml.test
index 3f9861d2b7092301518c103db31dabb094e7de0e..5231032f4f4a75da976c093865295d61fd026bdf 100644
--- a/bolt/test/X86/reader-stale-yaml.test
+++ b/bolt/test/X86/reader-stale-yaml.test
@@ -1,39 +1,71 @@
 # This script checks that YamlProfileReader in llvm-bolt is reading data
-# correctly and stale data is corrected.
+# correctly and stale data is corrected by profile inference.
RUN: yaml2obj %p/Inputs/blarge.yaml &> %t.exe +# Testing "usqrt" RUN: llvm-bolt %t.exe -o /dev/null --b %p/Inputs/blarge_profile_stale.yaml \ RUN: --print-cfg --print-only=usqrt --infer-stale-profile=1 \ -RUN: --profile-ignore-hash=1 --profile-use-dfs 2>&1 | FileCheck %s +RUN: --profile-ignore-hash=1 --profile-use-dfs=0 2>&1 | FileCheck %s -check-prefix=CHECK1 +# Testing "SolveCubic" +RUN: llvm-bolt %t.exe -o /dev/null --b %p/Inputs/blarge_profile_stale.yaml \ +RUN: --print-cfg --print-only=SolveCubic --infer-stale-profile=1 \ +RUN: --profile-ignore-hash=1 --profile-use-dfs=0 2>&1 | FileCheck %s -check-prefix=CHECK2 + +# Function "usqrt" has stale profile, since the number of blocks in the profile +# (nblocks=6) does not match the size of the CFG in the binary. The entry +# block (bid=0) has an incorrect (missing) count, which should be inferred by +# the algorithm. # Verify that yaml reader works as expected. -CHECK: pre-processing profile using YAML profile reader +CHECK1: pre-processing profile using YAML profile reader +CHECK1: Binary Function "usqrt" after building cfg { +CHECK1: State : CFG constructed +CHECK1: Address : 0x401170 +CHECK1: Size : 0x43 +CHECK1: Section : .text +CHECK1: IsSimple : 1 +CHECK1: BB Count : 5 +CHECK1: Exec Count : 20 +CHECK1: Branch Count: 640 +CHECK1: } +# Verify block counts. +CHECK1: .LBB01 (4 instructions, align : 1) +CHECK1: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 20) +CHECK1: .Ltmp[[#BB13:]] (9 instructions, align : 1) +CHECK1: Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 320), .LFT[[#BB0:]] (mispreds: 0, count: 0) +CHECK1: .LFT[[#BB0:]] (2 instructions, align : 1) +CHECK1: Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 0) +CHECK1: .Ltmp[[#BB12:]] (2 instructions, align : 1) +CHECK1: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 300), .LFT[[#BB1:]] (mispreds: 0, count: 20) +CHECK1: .LFT[[#BB1:]] (2 instructions, align : 1) +# Check the overall inference stats. +CHECK1: 2 out of 7 functions in the binary (28.6%) have non-empty execution profile +CHECK1: inferred profile for 2 (100.00% of profiled, 100.00% of stale) functions responsible for {{.*}} samples ({{.*}} out of {{.*}}) -# Verify the inferred counts of "usqrt" that has stale profile: -# - the function has nblocks=6 in the profile, which makes it stale -# - block with bid=0 has an incorrect (missing) count, which is inferred -CHECK: Binary Function "usqrt" after building cfg { -CHECK: State : CFG constructed -CHECK: Address : 0x401170 -CHECK: Size : 0x43 -CHECK: Section : .text -CHECK: IsSimple : 1 -CHECK: BB Count : 5 -CHECK: Exec Count : 20 -CHECK: Branch Count: 640 -CHECK: } -# Verify block counts. -CHECK: .LBB01 (4 instructions, align : 1) -CHECK: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 20) -CHECK: .Ltmp[[#BB13:]] (9 instructions, align : 1) -CHECK: Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 320), .LFT[[#BB0:]] (mispreds: 0, count: 0) -CHECK: .LFT[[#BB0:]] (2 instructions, align : 1) -CHECK: Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 0) -CHECK: .Ltmp[[#BB12:]] (2 instructions, align : 1) -CHECK: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 300), .LFT[[#BB1:]] (mispreds: 0, count: 20) -CHECK: .LFT[[#BB1:]] (2 instructions, align : 1) +# Function "SolveCubic" has stale profile, since there is one jump in the +# profile (from bid=13 to bid=2) which is not in the CFG in the binary. The test +# verifies that the inference is able to match two blocks (bid=1 and bid=13) +# using "loose" hashes and then correctly propagate the counts. 
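To clarify the mechanism referenced above, here is a deliberately simplified, illustrative C++ sketch of the matching step; all names are hypothetical and this is not BOLT's actual stale-matching implementation, which additionally runs a flow solver over the CFG. Blocks are matched by an exact hash first, then by a weaker "loose" hash (e.g. opcodes only), and the matched counts seed the inference.

#include <cstdint>
#include <unordered_map>
#include <vector>

struct Block {
  uint64_t strictHash; // hash of opcodes and operands
  uint64_t looseHash;  // hash of opcodes only
  uint64_t count = 0;  // execution count to be filled in
};

struct ProfileBlock {
  uint64_t strictHash;
  uint64_t looseHash;
  uint64_t count;
};

// Match profile blocks to CFG blocks and seed their counts.
void matchAndSeed(std::vector<Block> &cfg,
                  const std::vector<ProfileBlock> &profile) {
  std::unordered_map<uint64_t, Block *> byStrict, byLoose;
  for (Block &b : cfg) {
    byStrict.emplace(b.strictHash, &b);
    byLoose.emplace(b.looseHash, &b);
  }
  for (const ProfileBlock &p : profile) {
    if (auto it = byStrict.find(p.strictHash); it != byStrict.end())
      it->second->count = p.count; // exact match
    else if (auto it2 = byLoose.find(p.looseHash); it2 != byLoose.end())
      it2->second->count = p.count; // "loose" match
    // Anything still unmatched is left to the flow-inference step, which
    // adjusts block/edge counts so flow conservation holds across the CFG.
  }
}

int main() { return 0; } // placeholder driver; real inference runs inside BOLT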
-# Check the overal inference stats. -CHECK: 2 out of 7 functions in the binary (28.6%) have non-empty execution profile -CHECK: inferred profile for 1 (50.00% of profiled, 100.00% of stale) functions responsible for 87.31% samples (640 out of 733) +CHECK2: pre-processing profile using YAML profile reader +CHECK2: Binary Function "SolveCubic" after building cfg { +CHECK2: State : CFG constructed +CHECK2: Address : 0x400e00 +CHECK2: Size : 0x368 +CHECK2: Section : .text +CHECK2: IsSimple : 1 +CHECK2: BB Count : 18 +CHECK2: Exec Count : 151 +CHECK2: Branch Count: 552 +# Verify block counts. +CHECK2: .LBB00 (43 instructions, align : 1) +CHECK2: Successors: .Ltmp[[#BB7:]] (mispreds: 0, count: 0), .LFT[[#BB1:]] (mispreds: 0, count: 151) +CHECK2: .LFT[[#BB1:]] (5 instructions, align : 1) +CHECK2: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 151), .LFT[[#BB2:]] (mispreds: 0, count: 0) +CHECK2: .Ltmp[[#BB3:]] (26 instructions, align : 1) +CHECK2: Successors: .Ltmp[[#BB5:]] (mispreds: 0, count: 151), .LFT[[#BB4:]] (mispreds: 0, count: 0) +CHECK2: .Ltmp[[#BB5:]] (9 instructions, align : 1) +CHECK2: .Ltmp[[#BB13:]] (12 instructions, align : 1) +CHECK2: Successors: .Ltmp[[#BB3:]] (mispreds: 0, count: 151) +CHECK2: 2 out of 7 functions in the binary (28.6%) have non-empty execution profile diff --git a/bolt/test/X86/tail-duplication-pass.s b/bolt/test/X86/tail-duplication-pass.s index 677f4986eb89021ceb23ce81144f0908be22869e..ed50cc5227d8557dc39a71db4a5b481c71f9971a 100644 --- a/bolt/test/X86/tail-duplication-pass.s +++ b/bolt/test/X86/tail-duplication-pass.s @@ -7,12 +7,21 @@ # RUN: llvm-bolt %t.exe --data %t.fdata --reorder-blocks=ext-tsp \ # RUN: --print-finalized --tail-duplication=moderate \ # RUN: --tail-duplication-minimum-offset=1 -o %t.out | FileCheck %s +# RUN: llvm-bolt %t.exe --data %t.fdata --print-finalized \ +# RUN: --tail-duplication=aggressive --tail-duplication-minimum-offset=1 \ +# RUN: -o %t.out | FileCheck %s --check-prefix CHECK-NOLOOP # FDATA: 1 main 2 1 main #.BB2# 0 10 # FDATA: 1 main 4 1 main #.BB2# 0 20 # CHECK: BOLT-INFO: tail duplication modified 1 ({{.*}}%) functions; duplicated 1 blocks (1 bytes) responsible for {{.*}} dynamic executions ({{.*}}% of all block executions) # CHECK: BB Layout : .LBB00, .Ltail-dup0, .Ltmp0, .Ltmp1 +# Check that the successor of Ltail-dup0 is .LBB00, not itself. +# CHECK-NOLOOP: .Ltail-dup0 (1 instructions, align : 1) +# CHECK-NOLOOP: Predecessors: .LBB00 +# CHECK-NOLOOP: retq +# CHECK-NOLOOP: .Ltmp0 (1 instructions, align : 1) + .text .globl main .type main, %function diff --git a/bolt/test/assume-abi.test b/bolt/test/assume-abi.test new file mode 100644 index 0000000000000000000000000000000000000000..688ab011441d3a428c3358d99d10856fbdc9d771 --- /dev/null +++ b/bolt/test/assume-abi.test @@ -0,0 +1,7 @@ +# Validate the usage of the `--assume-abi` option in conjunction with +# options related to the RegAnalysis Pass. + +REQUIRES: system-linux + +RUN: %clang %cflags %p/Inputs/hello.c -o %t -Wl,-q +RUN: llvm-bolt %t -o %t.bolt --assume-abi --indirect-call-promotion=all diff --git a/bolt/test/lsda.cpp b/bolt/test/lsda-section-name.cpp similarity index 89% rename from bolt/test/lsda.cpp rename to bolt/test/lsda-section-name.cpp index b7905a58b532daba5926142bf95413a0fee913fd..41fb17665821911bdf15080730f27b5333d06e0c 100644 --- a/bolt/test/lsda.cpp +++ b/bolt/test/lsda-section-name.cpp @@ -1,8 +1,8 @@ // This test check that LSDA section named by .gcc_except_table.main is // disassembled by BOLT. 
-// RUN: %clang++ %cxxflags -O3 -flto=thin -no-pie -c %s -o %t.o
-// RUN: %clang++ %cxxflags -flto=thin -no-pie -fuse-ld=lld %t.o -o %t.exe \
+// RUN: %clang++ %cxxflags -O3 -no-pie -c %s -o %t.o
+// RUN: %clang++ %cxxflags -no-pie -fuse-ld=lld %t.o -o %t.exe \
 // RUN:   -Wl,-q -Wl,--script=%S/Inputs/lsda.ldscript
 // RUN: llvm-readelf -SW %t.exe | FileCheck %s
 // RUN: llvm-bolt %t.exe -o %t.bolt
diff --git a/bolt/test/permission.test b/bolt/test/permission.test
new file mode 100644
index 0000000000000000000000000000000000000000..a5a98599eb83b40c7dac555e0258cde156641763
--- /dev/null
+++ b/bolt/test/permission.test
@@ -0,0 +1,13 @@
+# Ensure that the permissions of the optimized binary file comply with the
+# system's umask.
+
+# This test performs a logical AND operation on the results of the `stat -c %a
+# %t.bolt` and `umask` commands (both results are displayed in octal), and
+# checks whether the result is equal to 0.
+REQUIRES: system-linux
+
+RUN: %clang %cflags %p/Inputs/hello.c -o %t -Wl,-q
+RUN: llvm-bolt %t -o %t.bolt
+RUN: echo $(( 8#$(stat -c %a %t.bolt) & 8#$(umask) )) | FileCheck %s
+
+CHECK: 0
diff --git a/bolt/test/runtime/AArch64/BiSheng/hugify.c b/bolt/test/runtime/AArch64/BiSheng/hugify.c
new file mode 100644
index 0000000000000000000000000000000000000000..d40c1fe85e5ed26cf84cd50e2ef3a3651fbb101a
--- /dev/null
+++ b/bolt/test/runtime/AArch64/BiSheng/hugify.c
@@ -0,0 +1,27 @@
+// Make sure BOLT correctly processes --hugify option
+
+#include <stdio.h>
+
+int main(int argc, char **argv) {
+  printf("Hello world\n");
+  return 0;
+}
+
+/*
+REQUIRES: system-linux,bolt-runtime,enable_bspub_common
+
+RUN: %clang %cflags -no-pie %s -o %t.nopie.exe -Wl,-q
+RUN: %clang %cflags -fpic -pie %s -o %t.pie.exe -Wl,-q
+
+RUN: llvm-bolt %t.nopie.exe --lite=0 -o %t.nopie --hugify
+RUN: llvm-bolt %t.pie.exe --lite=0 -o %t.pie --hugify
+
+RUN: %t.nopie | FileCheck %s -check-prefix=CHECK-NOPIE
+
+CHECK-NOPIE: Hello world
+
+RUN: %t.pie | FileCheck %s -check-prefix=CHECK-PIE
+
+CHECK-PIE: Hello world
+
+*/
diff --git a/bolt/test/runtime/AArch64/adrrelaxationpass.s b/bolt/test/runtime/AArch64/adrrelaxationpass.s
index 5c50cd6371926cbe23175ded354dd7e719ef6338..fa9fb63c613dc1345fc7a81e14da721e91e42345 100644
--- a/bolt/test/runtime/AArch64/adrrelaxationpass.s
+++ b/bolt/test/runtime/AArch64/adrrelaxationpass.s
@@ -1,33 +1,27 @@
 # The second and third ADR instructions are non-local to functions
 # and must be replaced with ADRP + ADD by BOLT
-# Also since main is non-simple, we can't change it's length so we have to
-# replace NOP with adrp, and if there is no nop before adr in non-simple
+# Also since main and test are non-simple, we can't change their lengths so we
+# have to replace NOP with adrp, and if there is no nop before adr in a non-simple
 # function, we can't guarantee we didn't break possible jump tables, so we
-# fail in strict mode
+# fail in non-strict mode

 # REQUIRES: system-linux

 # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \
 # RUN:   %s -o %t.o
 # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
-# RUN: llvm-bolt %t.exe -o %t.bolt --adr-relaxation=true
+# RUN: llvm-bolt %t.exe -o %t.bolt --adr-relaxation=true --strict
 # RUN: llvm-objdump --no-print-imm-hex -d --disassemble-symbols=main %t.bolt | FileCheck %s
 # RUN: %t.bolt
-# RUN: not llvm-bolt %t.exe -o %t.bolt --adr-relaxation=true --strict \
+# RUN: not llvm-bolt %t.exe -o %t.bolt --adr-relaxation=true \
 # RUN:   2>&1 | FileCheck %s --check-prefix CHECK-ERROR

-  .data
-  .align 8
-  .global Gvar
-Gvar: .xword 0x0
-  .global Gvar2
-Gvar2: .xword 0x42
-
   .text
   .align 4
   .global test
   .type test, %function
 test:
+  adr x2, Gvar
   mov x0, xzr
   ret
   .size test, .-test
@@ -47,6 +41,17 @@ br:
 .CI:
   .word 0xff

+  .data
+  .align 8
+  .global Gvar
+Gvar: .xword 0x0
+  .global Gvar2
+Gvar2: .xword 0x42
+  .balign 4
+jmptable:
+  .word 0
+  .word test - jmptable
+
 # CHECK: <main>:
 # CHECK-NEXT: adr x0, 0x{{[1-8a-f][0-9a-f]*}}
 # CHECK-NEXT: adrp x1, 0x{{[1-8a-f][0-9a-f]*}}
@@ -54,4 +59,4 @@ br:
 # CHECK-NEXT: adrp x2, 0x{{[1-8a-f][0-9a-f]*}}
 # CHECK-NEXT: add x2, x2, #{{[1-8a-f][0-9a-f]*}}
 # CHECK-NEXT: adr x3, 0x{{[1-8a-f][0-9a-f]*}}
-# CHECK-ERROR: BOLT-ERROR: Cannot relax adr in non-simple function main
+# CHECK-ERROR: BOLT-ERROR: Cannot relax adr in non-simple function
diff --git a/bolt/test/runtime/AArch64/controlflow.s b/bolt/test/runtime/AArch64/controlflow.s
index fe9aab88f0c74047633ea5b9ee7e279d85ef8a00..7b0a38779f6e9cfb90027ad9adcd84a76568f4c8 100644
--- a/bolt/test/runtime/AArch64/controlflow.s
+++ b/bolt/test/runtime/AArch64/controlflow.s
@@ -48,6 +48,7 @@ test_cond_branch:
 .global test_branch_reg
 .type test_branch_reg, %function
 test_branch_reg:
+    nop
     adr x0, test_branch_zero
     br x0
     panic
@@ -97,6 +98,7 @@ test_call:
 .global test_call_reg
 .type test_call_reg, %function
 test_call_reg:
+    nop
     adr x0, test_call_foo
     blr x0
     panic
diff --git a/bolt/test/runtime/AArch64/hook-fini.test b/bolt/test/runtime/AArch64/hook-fini.test
new file mode 100644
index 0000000000000000000000000000000000000000..8d23b21b6d612f5556608848237edc2c1af6f4de
--- /dev/null
+++ b/bolt/test/runtime/AArch64/hook-fini.test
@@ -0,0 +1,61 @@
+# Test the different ways of hooking the fini function for instrumentation (via
+# DT_FINI and via DT_FINI_ARRAY). We test the latter for both PIE and non-PIE
+# binaries because of the different ways of handling relocations (static or
+# dynamic).
+# All tests perform the following steps:
+# - Compile and link for the case to be tested
+# - Some sanity-checks on the dynamic section and relocations in the binary to
+#   verify it has the shape we want for testing:
+#   - DT_FINI or DT_FINI_ARRAY in dynamic section
+#   - No relative relocations for non-PIE
+# - Instrument
+# - Run instrumented binary
+# - Verify generated profile
+REQUIRES: system-linux,bolt-runtime
+
+RUN: %clang %cflags -pie %p/Inputs/basic-instrumentation.s -Wl,-q -o %t.exe
+RUN: llvm-readelf -d %t.exe | FileCheck --check-prefix=DYN-FINI %s
+RUN: llvm-readelf -r %t.exe | FileCheck --check-prefix=RELOC-PIE %s
+RUN: llvm-bolt %t.exe -o %t --instrument \
+RUN:   --instrumentation-file=%t \
+RUN:   --instrumentation-file-append-pid
+RUN: rm -f %t.*.fdata
+RUN: %t
+RUN: cat %t.*.fdata | FileCheck %s
+
+RUN: %clang %cflags -pie %p/Inputs/basic-instrumentation.s -Wl,-q,-fini=0 -o %t-no-fini.exe
+RUN: llvm-readelf -d %t-no-fini.exe | FileCheck --check-prefix=DYN-NO-FINI %s
+RUN: llvm-readelf -r %t-no-fini.exe | FileCheck --check-prefix=RELOC-PIE %s
+RUN: llvm-bolt %t-no-fini.exe -o %t-no-fini --instrument \
+RUN:   --instrumentation-file=%t-no-fini \
+RUN:   --instrumentation-file-append-pid
+RUN: rm -f %t-no-fini.*.fdata
+RUN: %t-no-fini
+RUN: cat %t-no-fini.*.fdata | FileCheck %s
+
+RUN: %clang %cflags -no-pie %p/Inputs/basic-instrumentation.s -Wl,-q,-fini=0 -o %t-no-pie-no-fini.exe
+RUN: llvm-readelf -d %t-no-pie-no-fini.exe | FileCheck --check-prefix=DYN-NO-FINI %s
+RUN: llvm-readelf -r %t-no-pie-no-fini.exe | FileCheck --check-prefix=RELOC-NO-PIE %s
+RUN: llvm-bolt %t-no-pie-no-fini.exe -o %t-no-pie-no-fini --instrument \
+RUN:   --instrumentation-file=%t-no-pie-no-fini \
+RUN:   --instrumentation-file-append-pid
+RUN: rm -f %t-no-pie-no-fini.*.fdata
+RUN: %t-no-pie-no-fini
+RUN: cat %t-no-pie-no-fini.*.fdata | FileCheck %s
+
+# With fini: dynamic section should contain DT_FINI
+DYN-FINI: (FINI)
+
+# Without fini: dynamic section should only contain DT_FINI_ARRAY
+DYN-NO-FINI-NOT: (FINI)
+DYN-NO-FINI:
(FINI_ARRAY) +DYN-NO-FINI: (FINI_ARRAYSZ) + +# With PIE: binary should have relative relocations +RELOC-PIE: R_AARCH64_RELATIVE + +# Without PIE: binary should not have relative relocations +RELOC-NO-PIE-NOT: R_AARCH64_RELATIVE + +# The instrumented profile should at least say main was called once +CHECK: main 0 0 1{{$}} diff --git a/bolt/test/runtime/X86/Inputs/exceptions_split.cpp b/bolt/test/runtime/Inputs/exceptions_split.cpp similarity index 85% rename from bolt/test/runtime/X86/Inputs/exceptions_split.cpp rename to bolt/test/runtime/Inputs/exceptions_split.cpp index 2c136b9a1cf5c958d175df17e629504357ca1287..de81adf7583ca3da8b8255e45809f4f7d2ad899f 100644 --- a/bolt/test/runtime/X86/Inputs/exceptions_split.cpp +++ b/bolt/test/runtime/Inputs/exceptions_split.cpp @@ -3,31 +3,25 @@ // // Record performance data with no args. Run test with 2 args. -#include #include +#include -int foo() -{ - return 0; -} +int foo() { return 0; } void bar(int a) { if (a > 2 && a % 2) throw new int(); } -void filter_only(){ - foo(); -} +void filter_only() { foo(); } -int main(int argc, char **argv) -{ +int main(int argc, char **argv) { unsigned r = 0; uint64_t limit = (argc >= 2 ? 10 : 5000); for (uint64_t i = 0; i < limit; ++i) { i += foo(); - try { + try { bar(argc); try { if (argc >= 2) diff --git a/bolt/test/X86/asm-dump.c b/bolt/test/runtime/X86/asm-dump.c similarity index 100% rename from bolt/test/X86/asm-dump.c rename to bolt/test/runtime/X86/asm-dump.c diff --git a/bolt/test/runtime/X86/exceptions-lpstart-zero.s b/bolt/test/runtime/X86/exceptions-lpstart-zero.s new file mode 100644 index 0000000000000000000000000000000000000000..b487ff0fa2f59114705657d5c262d9a78d94faad --- /dev/null +++ b/bolt/test/runtime/X86/exceptions-lpstart-zero.s @@ -0,0 +1,91 @@ +# RUN: %clangxx %cflags -no-pie %s -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.exe.bolt +# RUN: %t.exe.bolt + +# REQUIRES: system-linux + +## Test that BOLT properly handles LPStart when LPStartEncoding is different +## from DW_EH_PE_omit. + +# The test case compiled with -O1 from: +# +# int main() { +# try { +# throw 42; +# } catch (...) { +# return 0; +# } +# return 1; +# } +# +# The exception table was modified with udata4 LPStartEncoding and sdata4 +# CallSiteEncoding. 
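For readers unfamiliar with these encodings, a minimal illustrative decoder of the LSDA prefix follows (a sketch only, using the DWARF EH-pointer encoding values udata4 = 0x03, sdata4 = 0x0b, omit = 0xff that this test exercises; it is not BOLT's parser):

#include <cstdint>
#include <cstring>
#include <optional>

// DWARF exception-handling pointer encodings used by this test.
constexpr uint8_t DW_EH_PE_udata4 = 0x03;
constexpr uint8_t DW_EH_PE_sdata4 = 0x0b;
constexpr uint8_t DW_EH_PE_omit = 0xff;

// Read the start of an LSDA: the LPStart encoding byte, followed by the
// LPStart value itself when the encoding is not DW_EH_PE_omit.
std::optional<uint64_t> readLPStart(const uint8_t *&p) {
  uint8_t encoding = *p++;
  if (encoding == DW_EH_PE_omit)
    return std::nullopt; // landing pads are relative to the function start
  if (encoding == DW_EH_PE_udata4) {
    uint32_t v;
    std::memcpy(&v, p, sizeof(v));
    p += sizeof(v);
    return v; // landing pads are relative to this explicit LPStart
  }
  // Other encodings (uleb128, sdata4, ...) are omitted in this sketch.
  return std::nullopt;
}

int main() {
  // Mirrors the table in this test: LPStart encoding = udata4, LPStart = 0.
  const uint8_t lsda[] = {DW_EH_PE_udata4, 0, 0, 0, 0};
  const uint8_t *p = lsda;
  auto lpStart = readLPStart(p);
  return lpStart && *lpStart == 0 ? 0 : 1;
}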
+
+  .text
+  .globl main                            # -- Begin function main
+  .p2align 4, 0x90
+  .type main,@function
+main:                                   # @main
+.Lfunc_begin0:
+  .cfi_startproc
+  .cfi_personality 3, __gxx_personality_v0
+  .cfi_lsda 3, .Lexception0
+# %bb.0:
+  pushq %rax
+  .cfi_def_cfa_offset 16
+  movl $4, %edi
+  callq __cxa_allocate_exception
+  movl $42, (%rax)
+.Ltmp0:
+  movl $_ZTIi, %esi
+  movq %rax, %rdi
+  xorl %edx, %edx
+  callq __cxa_throw
+.Ltmp1:
+# %bb.1:
+.LBB0_2:
+.Ltmp2:
+  movq %rax, %rdi
+  callq __cxa_begin_catch
+  callq __cxa_end_catch
+  xorl %eax, %eax
+  popq %rcx
+  .cfi_def_cfa_offset 8
+  retq
+.Lfunc_end0:
+  .size main, .Lfunc_end0-main
+  .cfi_endproc
+  .section .gcc_except_table,"a",@progbits
+  .p2align 2
+GCC_except_table0:
+.Lexception0:
+  .byte 3                               # @LPStart Encoding = udata4
+  .long 0
+  .byte 3                               # @TType Encoding = udata4
+  .uleb128 .Lttbase0-.Lttbaseref0
+.Lttbaseref0:
+  .byte 11                              # Call site Encoding = sdata4
+  .uleb128 .Lcst_end0-.Lcst_begin0
+.Lcst_begin0:
+  .long .Lfunc_begin0-.Lfunc_begin0     # >> Call Site 1 <<
+  .long .Ltmp0-.Lfunc_begin0            #   Call between .Lfunc_begin0 and .Ltmp0
+  .long 0                               #     has no landing pad
+  .byte 0                               #   On action: cleanup
+  .long .Ltmp0-.Lfunc_begin0            # >> Call Site 2 <<
+  .long .Ltmp1-.Ltmp0                   #   Call between .Ltmp0 and .Ltmp1
+  .long .Ltmp2                          #     jumps to .Ltmp2
+  .byte 1                               #   On action: 1
+  .long .Ltmp1-.Lfunc_begin0            # >> Call Site 3 <<
+  .long .Lfunc_end0-.Ltmp1              #   Call between .Ltmp1 and .Lfunc_end0
+  .long 0                               #     has no landing pad
+  .byte 0                               #   On action: cleanup
+.Lcst_end0:
+  .byte 1                               # >> Action Record 1 <<
+                                        #   Catch TypeInfo 1
+  .byte 0                               #   No further actions
+  .p2align 2
+                                        # >> Catch TypeInfos <<
+  .long 0                               # TypeInfo 1
+.Lttbase0:
+  .p2align 2
+                                        # -- End function
diff --git a/bolt/test/runtime/X86/instrumentation-tail-call.s b/bolt/test/runtime/X86/instrumentation-tail-call.s
index 792d084e3f3d44b77bdbfce3e6a5260d0d15e4f3..dfb12f03401a3ab27d8bd6a55bc1c1416baa5e1e 100644
--- a/bolt/test/runtime/X86/instrumentation-tail-call.s
+++ b/bolt/test/runtime/X86/instrumentation-tail-call.s
@@ -14,6 +14,9 @@
 # CHECK: leaq 0x80(%rsp), %rsp

+# RUN: FileCheck %s --input-file %t.fdata --check-prefix=CHECK-FDATA
+# CHECK-FDATA: 1 main {{.*}} 1 targetFunc 0 0 1
+
   .text
   .globl main
   .type main, %function
@@ -32,7 +35,8 @@ main:
   movq %rbp, %rsp
   pop %rbp
   mov -0x10(%rsp),%rax
-  jmp targetFunc
+  test %rsp, %rsp
+  jne targetFunc

 .LBBerror:
   addq $0x20, %rsp
diff --git a/bolt/test/runtime/X86/reg-reassign-no-swap-bl.s b/bolt/test/runtime/X86/reg-reassign-no-swap-bl.s
new file mode 100644
index 0000000000000000000000000000000000000000..4e2e70ed6cba9e43cdf23671fe3e53398d2237c7
--- /dev/null
+++ b/bolt/test/runtime/X86/reg-reassign-no-swap-bl.s
@@ -0,0 +1,59 @@
+# This test case is used to reproduce an issue found in the mongod database.
+# In function rankRegisters, if there is a BH Reg in the basic block, then the
+# BL Reg also cannot be swapped.
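The constraint can be summarized with a tiny illustrative check (hypothetical names only; this is not BOLT's rankRegisters code): x86-64 high-byte registers such as BH exist only for the four legacy GPRs, so any use of BH forbids swapping RBX/BL with an extended register like R12, which has no high-byte alias.

#include <initializer_list>

// Hypothetical register ids for the sketch.
enum class Reg { RBX, BL, BH, R12, R12B /* R12 has no high-byte alias */ };

// Returns true when swapping the RBX register class with an extended register
// is legal: any use of the high-byte alias BH makes the swap illegal, because
// extended registers (R8-R15) have no AH/BH/CH/DH-style sub-register.
bool canSwapRbxWith(Reg candidate, std::initializer_list<Reg> usedSubRegs) {
  for (Reg r : usedSubRegs)
    if (r == Reg::BH)
      return false; // BH has no counterpart in the candidate register
  (void)candidate;
  return true;
}

int main() {
  // Mirrors this test: main.cold writes BH, so RBX <-> R12 must be rejected.
  bool ok = canSwapRbxWith(Reg::R12, {Reg::BL, Reg::BH});
  return ok ? 1 : 0; // expect the swap to be rejected (ok == false)
}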
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang -no-pie %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.out -data=%t.fdata --reg-reassign | FileCheck %s
+# RUN: %t.out
+
+# CHECK: Reg Reassignment Pass: no changes were made
+  .text
+  .globl main
+  .globl main.cold
+  .p2align 4, 0x90
+  .type main,@function
+  .type main.cold,@function
+main.cold:
+bb1:
+  mov $0x2, %bh
+bb2:
+  jmp bb5
+main:                                   # @main
+  .cfi_startproc
+# %bb.0:                                # %entry
+  pushq %rax
+  pushq %r12
+  pushq %rbx
+  .cfi_def_cfa_offset 16
+  mov $0x1, %r12
+  shr $0x14, %r12
+  add $0x14, %r12
+  mov $0x11, %rbx
+  mov $0x1, %bh
+  mov $0x1, %bl
+bb3:
+  add $0x1, %r12
+bb4:
+  jmp bb1
+bb5:
+  cmp $0x201, %rbx
+  jne 0x0
+bb6:
+  xorl %eax, %eax
+  popq %rcx
+  popq %rbx
+  popq %r12
+  .cfi_def_cfa_offset 8
+  retq
+# FDATA: 1 main.cold #bb2# 1 main 0 0 100
+# FDATA: 1 main #bb3# 1 main #bb4# 0 100
+# FDATA: 1 main #bb4# 1 main.cold 0 0 100
+
+.Lfunc_end0:
+  .size main, .Lfunc_end0-main
+  .cfi_endproc
diff --git a/bolt/test/runtime/X86/reg-reassign-swap-cold.s b/bolt/test/runtime/X86/reg-reassign-swap-cold.s
new file mode 100644
index 0000000000000000000000000000000000000000..115b5b0eeff8b504f31e2e629dca7eea870ba064
--- /dev/null
+++ b/bolt/test/runtime/X86/reg-reassign-swap-cold.s
@@ -0,0 +1,64 @@
+# This test case reproduces a bug where the code fragments associated with a
+# function (which may be generated during PGO optimization) need to be swapped
+# together during register swapping. If this is not handled properly, running
+# the optimized binary can result in a segmentation fault.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang -no-pie %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.out -data=%t.fdata --reg-reassign | FileCheck %s
+# RUN: %t.out
+
+# CHECK: BOLT-INFO: Reg Reassignment Pass Stats
+# CHECK-NEXT: 2 functions affected.
+  .text
+  .globl main
+  .globl main.cold
+  .p2align 4, 0x90
+  .type main,@function
+  .type main.cold,@function
+main.cold:
+bb1:
+  cmp $0x3, %r12
+  jne bb8
+bb2:
+  jmp bb4
+main:                                   # @main
+  .cfi_startproc
+# %bb.0:                                # %entry
+  pushq %rax
+  pushq %r12
+  pushq %rbx
+  .cfi_def_cfa_offset 16
+  mov $0x1, %r12
+  mov $0x2, %rbx
+  add $0x1, %r12
+  shr $0x14, %r12
+  mov $0x3, %r12
+bb3:
+  jmp bb1
+bb4:
+  cmp $0x3, %r12
+bb5:
+  jne bb8
+bb6:
+  xorl %eax, %eax
+bb7:
+  popq %rcx
+  popq %rbx
+  popq %r12
+  .cfi_def_cfa_offset 8
+  retq
+bb8:
+  mov $0x1, %rax
+  jmp bb7
+# FDATA: 1 main.cold #bb2# 1 main #bb4# 0 100
+# FDATA: 1 main #bb5# 1 main #bb6# 0 100
+# FDATA: 1 main #bb3# 1 main.cold 0 0 100
+
+.Lfunc_end0:
+  .size main, .Lfunc_end0-main
+  .cfi_endproc
diff --git a/bolt/test/runtime/X86/retpoline-synthetic.test b/bolt/test/runtime/X86/retpoline-synthetic.test
index 394d0189207fbd7d2de8fbe937b3b389da674979..3434d8c31869e257228f14c8b85c0c2ad5a336dc 100644
--- a/bolt/test/runtime/X86/retpoline-synthetic.test
+++ b/bolt/test/runtime/X86/retpoline-synthetic.test
@@ -23,8 +23,8 @@ CHECK-JUMP-NOT: jmpq *
 # Check generated retpoline stub names
 RUN: llvm-strings %t | FileCheck %s -check-prefix=CHECK-STRINGS
 CHECK-STRINGS-DAG: __retpoline_%rax_
-CHECK-STRINGS-DAG: __retpoline_mem_%rip+DATAat0x[[#]]
-CHECK-STRINGS-DAG: __retpoline_mem_%rax+0
+CHECK-STRINGS-DAG: __retpoline_mem_%r{{.*}}
+CHECK-STRINGS-DAG: __retpoline_mem_%r{{.*}}

 RUN: %t 1000 3 | FileCheck %s
 CHECK: 30000000
diff --git a/bolt/test/runtime/X86/section-order.test b/bolt/test/runtime/X86/section-order.test
index a1317daba50e8d25265da1f251e68b33285ab626..12d5949fcd0d99366283d60a34aac84e7e39f859 100644
--- a/bolt/test/runtime/X86/section-order.test
+++ b/bolt/test/runtime/X86/section-order.test
@@ -1,4 +1,5 @@
 REQUIRES: system-linux,bolt-runtime
+REQUIRES: issues703

 RUN: %clang %p/Inputs/basic-instrumentation.s -Wl,-q -o %t.exe
 RUN: llvm-bolt %t.exe -o %t --instrument
diff --git a/bolt/test/runtime/X86/exceptions-instrumentation.test b/bolt/test/runtime/exceptions-instrumentation.test
similarity index 95%
rename from bolt/test/runtime/X86/exceptions-instrumentation.test
rename to bolt/test/runtime/exceptions-instrumentation.test
index 7a8f4ee81e4fc5cd8b7270d40b6209253ea831e2..4b8b3bee1fdb62deb80d273a9fd2c7e8cdd882af 100644
--- a/bolt/test/runtime/X86/exceptions-instrumentation.test
+++ b/bolt/test/runtime/exceptions-instrumentation.test
@@ -9,7 +9,7 @@ RUN: %t.exc arg1 arg2 arg3

 RUN: llvm-bolt %t_exc_split -o %t.exc.bolted --data %t.fdata \
 RUN:   --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \
-RUN:   --split-functions --split-eh=1 \
+RUN:   --split-functions --split-eh=1 2>&1 \
 RUN:   | FileCheck --check-prefix=EXCEPTIONS %s

 EXCEPTIONS-NOT: invalid (possibly stale) profile
diff --git a/bolt/test/runtime/iplt.c b/bolt/test/runtime/iplt.c
index b0e2e6d250700c9a90af842b3954a98338844b91..d5b56d901e6227d90bfb210683bd3883145d5210 100644
--- a/bolt/test/runtime/iplt.c
+++ b/bolt/test/runtime/iplt.c
@@ -1,10 +1,16 @@
 // This test checks that the ifuncs works after bolt.
+// Compiling with -O0 results in an indirect call to the IFUNC.
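For context, a generic GNU IFUNC example follows (illustrative only; it assumes a Linux/ELF toolchain with GCC or Clang and is not this test's input source). At -O0 the compiler does not fold the call, so it is dispatched indirectly through the IFUNC's PLT entry, which the dynamic loader patches with the resolver's answer.

#include <cstdio>

// The concrete implementation selected by the resolver.
extern "C" void foo_impl() { puts("foo"); }

// IFUNC resolver: runs during relocation processing and returns the address
// of the implementation to use.
extern "C" void *foo_resolver() { return reinterpret_cast<void *>(foo_impl); }

// 'foo' is a GNU indirect function bound to the resolver above.
extern "C" void foo() __attribute__((ifunc("foo_resolver")));

int main() {
  foo(); // at -O0 this stays an indirect call through the IPLT
  return 0;
}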
-// RUN: %clang %cflags -no-pie %s -fuse-ld=lld \
+// RUN: %clang %cflags -O0 -no-pie %s -fuse-ld=lld \
 // RUN:   -o %t.exe -Wl,-q
 // RUN: llvm-bolt %t.exe -o %t.bolt.exe --use-old-text=0 --lite=0
 // RUN: %t.bolt.exe | FileCheck %s

+// RUN: %clang %cflags -O3 -no-pie %s -fuse-ld=lld \
+// RUN:   -o %t.O3.exe -Wl,-q
+// RUN: llvm-bolt %t.O3.exe -o %t.O3.bolt.exe --use-old-text=0 --lite=0
+// RUN: %t.O3.bolt.exe | FileCheck %s
+
 // CHECK: foo

 #include <stdio.h>
diff --git a/bolt/test/runtime/mark-funcs.c b/bolt/test/runtime/mark-funcs.c
new file mode 100644
index 0000000000000000000000000000000000000000..a8586ca8b6e1dfd32970967d13df660a928cd357
--- /dev/null
+++ b/bolt/test/runtime/mark-funcs.c
@@ -0,0 +1,22 @@
+#include <stdio.h>
+
+int dummy() {
+  printf("Dummy called\n");
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  if (dummy() != 0)
+    return 1;
+  printf("Main called\n");
+  return 0;
+}
+// Check that emitting a trap value works properly and
+// does not break functions.
+// REQUIRES: system-linux
+// RUN: %clangxx -Wl,-q %s -o %t.exe
+// RUN: %t.exe | FileCheck %s
+// CHECK: Dummy called
+// CHECK-NEXT: Main called
+// RUN: llvm-bolt %t.exe -o %t.exe.bolt -lite=false --mark-funcs
+// RUN: %t.exe.bolt | FileCheck %s
diff --git a/bolt/test/runtime/meta-merge-fdata.test b/bolt/test/runtime/meta-merge-fdata.test
index 39f34ba3d8ac06d124b945a7ebf3cea8c26ce27d..6972e75c64de7b2b72957671a3f1a09e92595c84 100644
--- a/bolt/test/runtime/meta-merge-fdata.test
+++ b/bolt/test/runtime/meta-merge-fdata.test
@@ -1,7 +1,7 @@
 # Meta test using merge-fdata binary
 UNSUPPORTED: asan
 # Instrumentation currently only works on X86
-REQUIRES: bolt-runtime
+REQUIRES: x86_64-linux,bolt-runtime

 # Instrumentation, should test:
 # - Direct branches
@@ -22,7 +22,7 @@ CHECK-FDATA: 0 [unknown] 0 1 _start 0 0 1

 # Check that BOLT works with this profile
 RUN: llvm-bolt merge-fdata -o %t.bolt --data %t.fdata1 \
 RUN:   --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \
-RUN:   --split-functions \
+RUN:   --split-functions 2>&1 \
 RUN:   | FileCheck %s --check-prefix=CHECK-BOLT1

 CHECK-BOLT1-NOT: invalid (possibly stale) profile
@@ -44,7 +44,7 @@ RUN: cmp %t.fdata.base %t.fdata.inst
 # Optimize using merged fdata
 RUN: llvm-bolt merge-fdata -o %t.opt --data %t.fdata.base \
 RUN:   --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \
-RUN:   --split-functions \
+RUN:   --split-functions 2>&1 \
 RUN:   | FileCheck %s --check-prefix=CHECK-BOLT2

 CHECK-BOLT2-NOT: invalid (possibly stale) profile
diff --git a/bolt/test/runtime/X86/pie-exceptions-split.test b/bolt/test/runtime/pie-exceptions-split.test
similarity index 95%
rename from bolt/test/runtime/X86/pie-exceptions-split.test
rename to bolt/test/runtime/pie-exceptions-split.test
index 124fef60fd2dadf8d5dd2c23eb0c907de429eff0..30f2d02bc9e10b68c01ec9dd79d1f342683e3822 100644
--- a/bolt/test/runtime/X86/pie-exceptions-split.test
+++ b/bolt/test/runtime/pie-exceptions-split.test
@@ -16,9 +16,9 @@ RUN:   --print-only=main 2>&1 | FileCheck %s

 ## All calls to printf() should be from exception handling code that was
 ## recorded as cold during the profile collection run. Check that the calls
 ## are placed after the split point.
-CHECK-NOT: callq printf
+CHECK-NOT: printf
 CHECK: HOT-COLD SPLIT POINT
-CHECK: callq printf
+CHECK: printf

 ## Verify the output still executes correctly when the exception path is being
 ## taken.
diff --git a/bolt/test/verify-cfg.test b/bolt/test/verify-cfg.test
new file mode 100644
index 0000000000000000000000000000000000000000..4a7de85cd427ab5892abb63607e089023bf7309b
--- /dev/null
+++ b/bolt/test/verify-cfg.test
@@ -0,0 +1,8 @@
+# Verify that the `--verify-cfg` option does not produce spurious alerts.
+
+REQUIRES: system-linux
+
+RUN: %clang %cflags %p/Inputs/hello.c -o %t -Wl,-q
+RUN: llvm-bolt %t -o %t.bolt --verify-cfg 2>&1 | FileCheck %s
+
+CHECK-NOT: BOLT-ERROR: Invalid CFG detected after pass {{.*}}
diff --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt
index 048c3e54ca4482a3c60624e82305a094e0e8ed82..6fde9fe962a304b86bd7e47e20d91cb402445705 100644
--- a/lld/ELF/CMakeLists.txt
+++ b/lld/ELF/CMakeLists.txt
@@ -74,6 +74,7 @@ add_lld_library(lldELF
     Passes
     Support
     TargetParser
+    TransformUtils

   LINK_LIBS
     lldCommon
diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp
index ff72731b1f38d65a6896b109b62e72b20aea94fc..5e36964da94fc52328f66d978a65ee6d18a1e0f8 100644
--- a/lld/ELF/CallGraphSort.cpp
+++ b/lld/ELF/CallGraphSort.cpp
@@ -6,38 +6,21 @@
 //
 //===----------------------------------------------------------------------===//
 ///
-/// Implementation of Call-Chain Clustering from: Optimizing Function Placement
-/// for Large-Scale Data-Center Applications
-/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
-///
-/// The goal of this algorithm is to improve runtime performance of the final
-/// executable by arranging code sections such that page table and i-cache
-/// misses are minimized.
-///
-/// Definitions:
-/// * Cluster
-///   * An ordered list of input sections which are laid out as a unit. At the
-///     beginning of the algorithm each input section has its own cluster and
-///     the weight of the cluster is the sum of the weight of all incoming
-///     edges.
-/// * Call-Chain Clustering (C³) Heuristic
-///   * Defines when and how clusters are combined. Pick the highest weighted
-///     input section then add it to its most likely predecessor if it wouldn't
-///     penalize it too much.
-/// * Density
-///   * The weight of the cluster divided by the size of the cluster. This is a
-///     proxy for the amount of execution time spent per byte of the cluster.
-///
-/// It does so given a call graph profile by the following:
-/// * Build a weighted call graph from the call graph profile
-/// * Sort input sections by weight
-/// * For each input section starting with the highest weight
-///   * Find its most likely predecessor cluster
-///   * Check if the combined cluster would be too large, or would have too low
-///     a density.
-///   * If not, then combine the clusters.
-/// * Sort non-empty clusters by density
+/// This file is responsible for sorting sections using LLVM call graph profile
+/// data by placing frequently executed code sections together. The goal of the
+/// placement is to improve the runtime performance of the final executable by
+/// arranging code sections so that i-TLB misses and i-cache misses are reduced.
 ///
+/// The algorithm first builds a call graph based on the profile data and then
+/// iteratively merges "chains" (ordered lists) of input sections which will be
+/// laid out as a unit.
+/// There are two implementations for deciding how to merge a pair of chains:
+/// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows
+///   "Optimizing Function Placement for Large-Scale Data-Center Applications"
+///   https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
+/// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which
+///   typically produces layouts with higher locality, and hence, yields fewer
+///   instruction cache misses on large binaries.
 //===----------------------------------------------------------------------===//

 #include "CallGraphSort.h"
@@ -45,6 +28,7 @@
 #include "InputSection.h"
 #include "Symbols.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Transforms/Utils/CodeLayout.h"
 #include <numeric>

@@ -75,6 +59,33 @@ struct Cluster {
   Edge bestPred = {-1, 0};
 };

+/// Implementation of the Call-Chain Clustering (C^3). The goal of this
+/// algorithm is to improve runtime performance of the executable by arranging
+/// code sections such that page table and i-cache misses are minimized.
+///
+/// Definitions:
+/// * Cluster
+///   * An ordered list of input sections which are laid out as a unit. At the
+///     beginning of the algorithm each input section has its own cluster and
+///     the weight of the cluster is the sum of the weight of all incoming
+///     edges.
+/// * Call-Chain Clustering (C³) Heuristic
+///   * Defines when and how clusters are combined. Pick the highest weighted
+///     input section then add it to its most likely predecessor if it wouldn't
+///     penalize it too much.
+/// * Density
+///   * The weight of the cluster divided by the size of the cluster. This is a
+///     proxy for the amount of execution time spent per byte of the cluster.
+///
+/// It does so given a call graph profile by the following:
+/// * Build a weighted call graph from the call graph profile
+/// * Sort input sections by weight
+/// * For each input section starting with the highest weight
+///   * Find its most likely predecessor cluster
+///   * Check if the combined cluster would be too large, or would have too low
+///     a density.
+///   * If not, then combine the clusters.
+/// * Sort non-empty clusters by density
 class CallGraphSort {
 public:
   CallGraphSort();
@@ -260,11 +271,74 @@ DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
   return orderMap;
 }

+// Sort sections by the profile data using the Cache-Directed Sort algorithm.
+// The placement is done by optimizing the locality by co-locating frequently
+// executed code sections together.
+DenseMap<const InputSectionBase *, int> elf::computeCacheDirectedSortOrder() {
+  SmallVector<uint64_t, 0> funcSizes;
+  SmallVector<uint64_t, 0> funcCounts;
+  SmallVector<codelayout::EdgeCount, 0> callCounts;
+  SmallVector<uint64_t, 0> callOffsets;
+  SmallVector<const InputSectionBase *, 0> sections;
+  DenseMap<const InputSectionBase *, size_t> secToTargetId;
+
+  auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t {
+    auto res = secToTargetId.try_emplace(inSec, sections.size());
+    if (res.second) {
+      // inSec does not appear before in the graph.
+      sections.push_back(inSec);
+      assert(inSec->getSize() > 0 && "found a function with zero size");
+      funcSizes.push_back(inSec->getSize());
+      funcCounts.push_back(0);
+    }
+    return res.first->second;
+  };
+
+  // Create the graph.
+  for (std::pair<SectionPair, uint64_t> &c : config->callGraphProfile) {
+    const InputSectionBase *fromSB = cast<InputSectionBase>(c.first.first);
+    const InputSectionBase *toSB = cast<InputSectionBase>(c.first.second);
+    // Ignore edges between input sections belonging to different output
+    // sections.
+    if (fromSB->getOutputSection() != toSB->getOutputSection())
+      continue;
+
+    uint64_t weight = c.second;
+    // Ignore edges with zero weight.
+    if (weight == 0)
+      continue;
+
+    size_t from = getOrCreateNode(fromSB);
+    size_t to = getOrCreateNode(toSB);
+    // Ignore self-edges (recursive calls).
+    if (from == to)
+      continue;
+
+    callCounts.push_back({from, to, weight});
+    // Assume that the jump is at the middle of the input section. The profile
+    // data does not contain jump offsets.
+    callOffsets.push_back((funcSizes[from] + 1) / 2);
+    funcCounts[to] += weight;
+  }
+
+  // Run the layout algorithm.
+  std::vector<uint64_t> sortedSections = codelayout::computeCacheDirectedLayout(
+      funcSizes, funcCounts, callCounts, callOffsets);
+
+  // Create the final order.
+  DenseMap<const InputSectionBase *, int> orderMap;
+  int curOrder = 1;
+  for (uint64_t secIdx : sortedSections)
+    orderMap[sections[secIdx]] = curOrder++;
+
+  return orderMap;
+}
+
 // Sort sections by the profile data provided by --callgraph-profile-file.
 //
 // This first builds a call graph based on the profile data then merges sections
-// according to the C³ heuristic. All clusters are then sorted by a density
-// metric to further improve locality.
+// according to either the C³ or the Cache-Directed-Sort ordering algorithm.
 DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
+  if (config->callGraphProfileSort == CGProfileSortKind::Cdsort)
+    return computeCacheDirectedSortOrder();
   return CallGraphSort().run();
 }
diff --git a/lld/ELF/CallGraphSort.h b/lld/ELF/CallGraphSort.h
index 4997cb102c326402480c3c418e0b34a2f652bba0..1b54f2b62482284bb2d02581dc7481b367ff1760 100644
--- a/lld/ELF/CallGraphSort.h
+++ b/lld/ELF/CallGraphSort.h
@@ -14,6 +14,8 @@ namespace lld::elf {
 class InputSectionBase;

+llvm::DenseMap<const InputSectionBase *, int> computeCacheDirectedSortOrder();
+
 llvm::DenseMap<const InputSectionBase *, int> computeCallGraphProfileOrder();

 } // namespace lld::elf
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 6d0bdeb7bf938a83eb5c05beff2e0a5d126a8012..aa9f5456a7544f05503118857ae88342f84fae61 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -59,6 +59,9 @@ enum class BsymbolicKind { None, NonWeakFunctions, Functions, All };
 // For --build-id.
 enum class BuildIdKind { None, Fast, Md5, Sha1, Hexstring, Uuid };

+// For --call-graph-profile-sort={none,hfsort,cdsort}.
+enum class CGProfileSortKind { None, Hfsort, Cdsort };
+
 // For --discard-{all,locals,none}.
 enum class DiscardPolicy { Default, All, Locals, None };

@@ -214,7 +217,7 @@ struct Config {
   bool asNeeded = false;
   bool armBe8 = false;
   BsymbolicKind bsymbolic = BsymbolicKind::None;
-  bool callGraphProfileSort;
+  CGProfileSortKind callGraphProfileSort;
   bool checkSections;
   bool checkDynamicRelocs;
   llvm::DebugCompressionType compressDebugSections;
@@ -246,6 +249,7 @@ struct Config {
   bool ltoDebugPassManager;
   bool ltoEmitAsm;
   bool ltoUniqueBasicBlockSectionNames;
+  bool ltoValidateAllVtablesHaveTypeInfos;
   bool ltoWholeProgramVisibility;
   bool mergeArmExidx;
   bool mipsN32Abi = false;
@@ -479,6 +483,9 @@ struct Ctx {
   std::atomic<bool> hasTlsIe{false};
   // True if we need to reserve two .got entries for local-dynamic TLS model.
   std::atomic<bool> needsTlsLd{false};
+  // True if all native vtable symbols have corresponding type info symbols
+  // during LTO.
+ bool ltoAllVtablesHaveTypeInfos; void reset(); diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index da9ca44b3f209cee77f1b6060ea4b5ae9a20c868..4b563a0fdf2f335640784e4490068ce3d32e370a 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -105,6 +105,7 @@ void Ctx::reset() { backwardReferences.clear(); hasSympart.store(false, std::memory_order_relaxed); needsTlsLd.store(false, std::memory_order_relaxed); + ltoAllVtablesHaveTypeInfos = false; } llvm::raw_fd_ostream Ctx::openAuxiliaryFile(llvm::StringRef filename, @@ -1037,6 +1038,74 @@ template static void readCallGraphsFromObjectFiles() { } } +template +static void ltoValidateAllVtablesHaveTypeInfos(opt::InputArgList &args) { + DenseSet typeInfoSymbols; + SmallSetVector vtableSymbols; + auto processVtableAndTypeInfoSymbols = [&](StringRef name) { + if (name.consume_front("_ZTI")) + typeInfoSymbols.insert(name); + else if (name.consume_front("_ZTV")) + vtableSymbols.insert(name); + }; + + // Examine all native symbol tables. + for (ELFFileBase *f : ctx.objectFiles) { + using Elf_Sym = typename ELFT::Sym; + for (const Elf_Sym &s : f->template getGlobalELFSyms()) { + if (s.st_shndx != SHN_UNDEF) { + StringRef name = check(s.getName(f->getStringTable())); + processVtableAndTypeInfoSymbols(name); + } + } + } + + for (SharedFile *f : ctx.sharedFiles) { + using Elf_Sym = typename ELFT::Sym; + for (const Elf_Sym &s : f->template getELFSyms()) { + if (s.st_shndx != SHN_UNDEF) { + StringRef name = check(s.getName(f->getStringTable())); + processVtableAndTypeInfoSymbols(name); + } + } + } + + SmallSetVector vtableSymbolsWithNoRTTI; + for (StringRef s : vtableSymbols) + if (!typeInfoSymbols.count(s)) + vtableSymbolsWithNoRTTI.insert(s); + + // Remove known safe symbols. + for (auto *arg : args.filtered(OPT_lto_known_safe_vtables)) { + StringRef knownSafeName = arg->getValue(); + if (!knownSafeName.consume_front("_ZTV")) + error("--lto-known-safe-vtables=: expected symbol to start with _ZTV, " + "but got " + + knownSafeName); + vtableSymbolsWithNoRTTI.remove(knownSafeName); + } + + ctx.ltoAllVtablesHaveTypeInfos = vtableSymbolsWithNoRTTI.empty(); + // Check for unmatched RTTI symbols + for (StringRef s : vtableSymbolsWithNoRTTI) { + message( + "--lto-validate-all-vtables-have-type-infos: RTTI missing for vtable " + "_ZTV" + + s + ", --lto-whole-program-visibility disabled"); + } +} + +static CGProfileSortKind getCGProfileSortKind(opt::InputArgList &args) { + StringRef s = args.getLastArgValue(OPT_call_graph_profile_sort, "hfsort"); + if (s == "hfsort") + return CGProfileSortKind::Hfsort; + if (s == "cdsort") + return CGProfileSortKind::Cdsort; + if (s != "none") + error("unknown --call-graph-profile-sort= value: " + s); + return CGProfileSortKind::None; +} + static DebugCompressionType getCompressionType(StringRef s, StringRef option) { DebugCompressionType type = StringSwitch(s) .Case("zlib", DebugCompressionType::Zlib) @@ -1168,6 +1237,7 @@ static void readConfigs(opt::InputArgList &args) { else if (arg->getOption().matches(OPT_Bsymbolic)) config->bsymbolic = BsymbolicKind::All; } + config->callGraphProfileSort = getCGProfileSortKind(args); config->checkSections = args.hasFlag(OPT_check_sections, OPT_no_check_sections, true); config->chroot = args.getLastArgValue(OPT_chroot); @@ -1188,8 +1258,6 @@ static void readConfigs(opt::InputArgList &args) { args.hasFlag(OPT_eh_frame_hdr, OPT_no_eh_frame_hdr, false); config->emitLLVM = args.hasArg(OPT_plugin_opt_emit_llvm, false); config->emitRelocs = args.hasArg(OPT_emit_relocs); - 
config->callGraphProfileSort = args.hasFlag(
-      OPT_call_graph_profile_sort, OPT_no_call_graph_profile_sort, true);
   config->enableNewDtags =
       args.hasFlag(OPT_enable_new_dtags, OPT_disable_new_dtags, true);
   config->entry = args.getLastArgValue(OPT_entry);
@@ -1233,6 +1301,9 @@ static void readConfigs(opt::InputArgList &args) {
   config->ltoWholeProgramVisibility =
       args.hasFlag(OPT_lto_whole_program_visibility,
                    OPT_no_lto_whole_program_visibility, false);
+  config->ltoValidateAllVtablesHaveTypeInfos =
+      args.hasFlag(OPT_lto_validate_all_vtables_have_type_infos,
+                   OPT_no_lto_validate_all_vtables_have_type_infos, false);
   config->ltoo = args::getInteger(args, OPT_lto_O, 2);
   if (config->ltoo > 3)
     error("invalid optimization level for LTO: " + Twine(config->ltoo));
@@ -1619,7 +1690,7 @@ static void readConfigs(opt::InputArgList &args) {
       config->symbolOrderingFile = getSymbolOrderingFile(*buffer);
       // Also need to disable CallGraphProfileSort to prevent
       // LLD order symbols with CGProfile
-      config->callGraphProfileSort = false;
+      config->callGraphProfileSort = CGProfileSortKind::None;
     }
   }

@@ -2829,6 +2900,10 @@ void LinkerDriver::link(opt::InputArgList &args) {
       config->ltoEmitAsm ||
       !config->thinLTOModulesToCompile.empty();

+  // Handle --lto-validate-all-vtables-have-type-infos.
+  if (config->ltoValidateAllVtablesHaveTypeInfos)
+    invokeELFT(ltoValidateAllVtablesHaveTypeInfos, args);
+
   // Do link-time optimization if given files are LLVM bitcode files.
   // This compiles bitcode files into real object files.
   //
@@ -3021,7 +3096,7 @@ void LinkerDriver::link(opt::InputArgList &args) {
   }

   // Read the callgraph now that we know what was gced or icfed
-  if (config->callGraphProfileSort) {
+  if (config->callGraphProfileSort != CGProfileSortKind::None) {
     if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file))
       if (std::optional<MemoryBufferRef> buffer = readFile(arg->getValue()))
         readCallGraph(*buffer);
diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp
index a7df5f072f6f407dd53109b8a02931b8f815059c..ebc6ccdbea7861e9f1d59dc045ce500255ebb867 100644
--- a/lld/ELF/LTO.cpp
+++ b/lld/ELF/LTO.cpp
@@ -154,6 +154,9 @@ static lto::Config createConfig() {
   c.DwoDir = std::string(config->dwoDir);

   c.HasWholeProgramVisibility = config->ltoWholeProgramVisibility;
+  c.ValidateAllVtablesHaveTypeInfos =
+      config->ltoValidateAllVtablesHaveTypeInfos;
+  c.AllVtablesHaveTypeInfos = ctx.ltoAllVtablesHaveTypeInfos;
   c.AlwaysEmitRegularLTOObj = !config->ltoObjPath.empty();

   for (const llvm::StringRef &name : config->thinLTOModulesToCompile)
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index 4f8ea4fd4d2bb6cd21ee476f02815345c0c532f7..dea6c16949ee28698f35e5645ff31c80d5bbd1e4 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -125,9 +125,12 @@ defm as_needed: B<"as-needed",
 defm call_graph_ordering_file: Eq<"call-graph-ordering-file",
   "Layout sections to optimize the given callgraph">;

-defm call_graph_profile_sort: BB<"call-graph-profile-sort",
-  "Reorder sections with call graph profile (default)",
-  "Do not reorder sections with call graph profile">;
+def call_graph_profile_sort: JJ<"call-graph-profile-sort=">,
+  HelpText<"Reorder input sections with call graph profile using the specified algorithm (default: hfsort)">,
+  MetaVarName<"[none,hfsort,cdsort]">,
+  Values<"none,hfsort,cdsort">;
+def : FF<"no-call-graph-profile-sort">, Alias<call_graph_profile_sort>, AliasArgs<["none"]>,
+  Flags<[HelpHidden]>;

 // --chroot doesn't have a help text because it is an internal option.
def chroot: Separate<["--"], "chroot">; @@ -618,9 +621,14 @@ def lto_cs_profile_file: JJ<"lto-cs-profile-file=">, defm lto_pgo_warn_mismatch: BB<"lto-pgo-warn-mismatch", "turn on warnings about profile cfg mismatch (default)", "turn off warnings about profile cfg mismatch">; +defm lto_known_safe_vtables : EEq<"lto-known-safe-vtables", + "When --lto-validate-all-vtables-have-type-infos is enabled, skip validation on these vtables (_ZTV symbols)">; def lto_obj_path_eq: JJ<"lto-obj-path=">; def lto_sample_profile: JJ<"lto-sample-profile=">, HelpText<"Sample profile file path">; +defm lto_validate_all_vtables_have_type_infos: BB<"lto-validate-all-vtables-have-type-infos", + "Validate that all vtables have type infos for LTO link", + "Do not validate that all vtables have type infos for LTO link">; defm lto_whole_program_visibility: BB<"lto-whole-program-visibility", "Asserts that the LTO link has whole program visibility", "Asserts that the LTO link does not have whole program visibility">; diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1 index 0a5e4293dedaf5374946d5c7eb276d9f6f5da564..72b90094eec28241d22673d9942c10b9af6ffd19 100644 --- a/lld/docs/ld.lld.1 +++ b/lld/docs/ld.lld.1 @@ -117,6 +117,19 @@ is not intended to be cryptographically secure. .It Fl -build-id Synonym for .Fl -build-id Ns = Ns Cm fast . +.It Fl -call-graph-profile-sort Ns = Ns Ar algorithm +.Ar algorithm +may be: +.Pp +.Bl -tag -width 2n -compact +.It Cm none +Ignore call graph profile. +.It Cm hfsort +Use hfsort (default). +.It Cm cdsort +Use cdsort. +.El +.Pp .It Fl -color-diagnostics Ns = Ns Ar value Use colors in diagnostics. .Ar value diff --git a/lld/test/ELF/cgprofile-obj.s b/lld/test/ELF/cgprofile-obj.s index f56f3bcbf0c3c5e92c11e83b8c692f0cb17450e9..0848adc5e4279a7edbb8fdc3730f104ed711819b 100644 --- a/lld/test/ELF/cgprofile-obj.s +++ b/lld/test/ELF/cgprofile-obj.s @@ -3,8 +3,11 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o # RUN: ld.lld -e A %t.o -o %t # RUN: llvm-nm --no-sort %t | FileCheck %s -# RUN: ld.lld --no-call-graph-profile-sort -e A %t.o -o %t +# RUN: ld.lld --call-graph-profile-sort=none -e A %t.o -o %t # RUN: llvm-nm --no-sort %t | FileCheck %s --check-prefix=NO-CG +## --no-call-graph-profile-sort is an alias for --call-graph-profile-sort=none. +# RUN: ld.lld --no-call-graph-profile-sort -e A %t.o -o %t1 +# RUN: cmp %t %t1 .section .text.D,"ax",@progbits D: diff --git a/lld/test/ELF/cgprofile-txt.s b/lld/test/ELF/cgprofile-txt.s index 99cbfa574532523a842e8fa539598c08b2e61ae1..c9194bbbc43cbe0284091ef63b66ebdedc4e5813 100644 --- a/lld/test/ELF/cgprofile-txt.s +++ b/lld/test/ELF/cgprofile-txt.s @@ -24,8 +24,19 @@ # RUN: echo "TooManyPreds8 TooManyPreds 10" >> %t.call_graph # RUN: echo "TooManyPreds9 TooManyPreds 10" >> %t.call_graph # RUN: echo "TooManyPreds10 TooManyPreds 11" >> %t.call_graph -# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=hfsort -o %t2 # RUN: llvm-readobj --symbols %t2 | FileCheck %s +## --call-graph-profile-sort=hfsort is the default. 
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2b +# RUN: cmp %t2 %t2b + +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=cdsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CDSORT + +# RUN: not ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=sort \ +# RUN: -o /dev/null 2>&1 | FileCheck %s --check-prefix=UNKNOWN + +# UNKNOWN: error: unknown --call-graph-profile-sort= value: sort .section .text.D,"ax",@progbits D: @@ -159,6 +170,31 @@ TooManyPreds10: # CHECK: Name: _init2 # CHECK-NEXT: Value: 0x201141 +# CDSORT: Name: D +# CDSORT-NEXT: Value: 0x201123 +# CDSORT: Name: TooManyPreds +# CDSORT-NEXT: Value: 0x20112F +# CDSORT: Name: TooManyPreds10 +# CDSORT-NEXT: Value: 0x20112E +# CDSORT: Name: C +# CDSORT-NEXT: Value: 0x201122 +# CDSORT: Name: B +# CDSORT-NEXT: Value: 0x201121 +# CDSORT: Name: A +# CDSORT-NEXT: Value: 0x201120 +# CDSORT: Name: TS +# CDSORT-NEXT: Value: 0x20113D +# CDSORT: Name: PP +# CDSORT-NEXT: Value: 0x20113C +# CDSORT: Name: QC +# CDSORT-NEXT: Value: 0x20113E +# CDSORT: Name: GB +# CDSORT-NEXT: Value: 0x20113F +# CDSORT: Name: _init +# CDSORT-NEXT: Value: 0x201140 +# CDSORT: Name: _init2 +# CDSORT-NEXT: Value: 0x201141 + # NOSORT: Name: D # NOSORT-NEXT: Value: 0x201120 # NOSORT: Name: TooManyPreds diff --git a/lld/test/ELF/cgprofile-txt2.s b/lld/test/ELF/cgprofile-txt2.s index 91961db39c3a883fc948c3b609e2e8b95a1f4e4c..b59b6eeb292fabff00e32148498208b799d3cf46 100644 --- a/lld/test/ELF/cgprofile-txt2.s +++ b/lld/test/ELF/cgprofile-txt2.s @@ -5,17 +5,28 @@ # RUN: echo "B C 50" >> %t.call_graph # RUN: echo "C D 40" >> %t.call_graph # RUN: echo "D B 10" >> %t.call_graph -# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 -# RUN: llvm-readobj --symbols %t2 | FileCheck %s +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=hfsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKC3 +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=cdsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKCDS -# CHECK: Name: A -# CHECK-NEXT: Value: 0x201123 -# CHECK: Name: B -# CHECK-NEXT: Value: 0x201120 -# CHECK: Name: C -# CHECK-NEXT: Value: 0x201121 -# CHECK: Name: D -# CHECK-NEXT: Value: 0x201122 +# CHECKC3: Name: A +# CHECKC3-NEXT: Value: 0x201123 +# CHECKC3: Name: B +# CHECKC3-NEXT: Value: 0x201120 +# CHECKC3: Name: C +# CHECKC3-NEXT: Value: 0x201121 +# CHECKC3: Name: D +# CHECKC3-NEXT: Value: 0x201122 + +# CHECKCDS: Name: A +# CHECKCDS-NEXT: Value: 0x201120 +# CHECKCDS: Name: B +# CHECKCDS-NEXT: Value: 0x201121 +# CHECKCDS: Name: C +# CHECKCDS-NEXT: Value: 0x201122 +# CHECKCDS: Name: D +# CHECKCDS-NEXT: Value: 0x201123 .section .text.A,"ax",@progbits .globl A diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll new file mode 100644 index 0000000000000000000000000000000000000000..fb357831d6f21a97f34d9a4bf09e70818669bbc4 --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll @@ -0,0 +1,26 @@ +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.Native = type { %struct.A } + +@_ZTV6Native = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI6Native, 
ptr @_ZN1A1nEi, ptr @_ZN6Native1fEi] } +@_ZTS6Native = linkonce_odr constant [8 x i8] c"6Native\00" +@_ZTI6Native = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS6Native, ptr @_ZTI1A } + +; Base type A does not need to emit a vtable if it's never instantiated. However, RTTI still gets generated +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + + +define linkonce_odr i32 @_ZN6Native1fEi(ptr %this, i32 %a) #0 { + ret i32 1; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +attributes #0 = { noinline optnone } diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll new file mode 100644 index 0000000000000000000000000000000000000000..4533504c601803158a2ecbc550163c32fc21620a --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll @@ -0,0 +1,19 @@ +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.Native = type { %struct.A } + +@_ZTV6Native = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1A1nEi, ptr @_ZN6Native1fEi] } + +define linkonce_odr i32 @_ZN6Native1fEi(ptr %this, i32 %a) #0 { + ret i32 1; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +attributes #0 = { noinline optnone } diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll new file mode 100644 index 0000000000000000000000000000000000000000..43df8366aa2ae0c68e9a5531a0661f90e897ae2e --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll @@ -0,0 +1,68 @@ +;; Source code: +;; cat > a.h <<'eof' +;; struct A { virtual int foo(); }; +;; int bar(A *a); +;; eof +;; cat > b.cc <<'eof' +;; #include "a.h" +;; struct B : A { int foo() { return 2; } }; +;; int baz() { B b; return bar(&b); } +;; eof +;; clang++ -flto=thin b.cc -c + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.B = type { %struct.A } +%struct.A = type { ptr } + +@_ZTV1B = linkonce_odr dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B3fooEv] }, !type !0, !type !1, !type !2, !type !3 +@_ZTS1B = linkonce_odr dso_local constant [3 x i8] c"1B\00" +@_ZTI1A = external constant ptr +@_ZTI1B = linkonce_odr dso_local constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A } +@_ZTV1A = external unnamed_addr constant { [3 x ptr] } + +define dso_local noundef i32 @_Z3bazv() #0 { +entry: + %b = alloca %struct.B + call void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %b) + %call = call noundef i32 @_Z3barP1A(ptr noundef %b) + ret i32 %call +} + +define linkonce_odr dso_local void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this1) + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1 + ret void +} + +declare i32 @_Z3barP1A(ptr noundef) + +define linkonce_odr dso_local void @_ZN1AC2Ev(ptr noundef nonnull 
align 8 dereferenceable(8) %this) #0 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1 + ret void +} + +define linkonce_odr i32 @_ZN1B3fooEv(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + ret i32 2 +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFivE.virtual"} +!2 = !{i64 16, !"_ZTS1B"} +!3 = !{i64 16, !"_ZTSM1BFivE.virtual"} diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll new file mode 100644 index 0000000000000000000000000000000000000000..6cc55df82e2f2814b1717a0ad09c55a81030ed95 --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll @@ -0,0 +1,16 @@ +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@_ZTV1B = external unnamed_addr constant { [4 x ptr] } + +define linkonce_odr void @_ZN1BC2Ev(ptr %this) #0 { + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +attributes #0 = { noinline optnone } diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll new file mode 100644 index 0000000000000000000000000000000000000000..d6ac53f9fb936b0d1eb4f86549242288613dcf26 --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll @@ -0,0 +1,263 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cp %s %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!12, !13}' >> %t1_regular.ll +; RUN: echo '!12 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!13 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos.ll -o %t2.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2.bc -o %t2.o +; RUN: ld.lld %t2.o -o %t2.so -shared + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll -o %t2_nortti.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2_nortti.bc -o %t2_nortti.o +; RUN: ld.lld %t2_nortti.o -o %t2_nortti.so -shared + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_undef.ll -o %t2_undef.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2_undef.bc -o %t2_undef.o +; RUN: ld.lld %t2_undef.o -o %t2_undef.so -shared + +;; With --lto-whole-program-visibility, we assume no native types can interfere +;; and thus proceed with devirtualization even in the presence of native types + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t3_index -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t3_hybrid -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t3_regular -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi +; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +;; With --lto-validate-all-vtables-have-type-infos, the linker checks for the presence of vtables +;; and RTTI in native files and blocks devirtualization to be conservative on correctness +;; for these types. + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t4_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t4_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t4_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t4_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; DSOs behave similarly + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.so -o %t5_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.so -o %t5_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.so -o %t5_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t5_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +; VALIDATE-NOT: single-impl: +; VALIDATE: single-impl: devirtualized a call to _ZN1D1mEi +; VALIDATE-NOT: single-impl: + +;; When vtables without type infos are detected in native files, we have a hole in our knowledge so +;; --lto-validate-all-vtables-have-type-infos conservatively disables --lto-whole-program-visibility + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_nortti.o -o %t6_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_nortti.o -o %t6_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_nortti.o -o %t6_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t6_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; DSOs behave similarly + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_nortti.so -o %t7_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_nortti.so -o %t7_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_nortti.so -o %t7_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t7_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +; NO-RTTI-DAG: --lto-validate-all-vtables-have-type-infos: RTTI missing for vtable _ZTV6Native, --lto-whole-program-visibility disabled +; NO-RTTI-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +;; --lto-known-safe-vtables=* can be used to specifically allow types to participate in WPD +;; even if they don't have corresponding RTTI + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_nortti.o -o %t8_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_nortti.o -o %t8_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 
2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_nortti.o -o %t8_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t8_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Only check for definitions of vtables symbols, just having a reference does not allow a type to +;; be derived from + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_undef.o -o %t9_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_undef.o -o %t9_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_undef.o -o %t9_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t9_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !3, !type !4, !type !5 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1C, ptr @_ZN1C1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !6, !type !7, !type !8 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1D, ptr @_ZN1D1mEi] }, !type !9, !vcall_visibility !11 + +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + +@_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00" +@_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A } + +@_ZTS1C = linkonce_odr constant [3 x i8] c"1C\00" +@_ZTI1C = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1C, ptr @_ZTI1A } + +@_ZTS1D = internal constant [3 x i8] c"1D\00" +@_ZTI1D = internal constant { ptr, ptr } { ptr null, ptr @_ZTS1D } + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ] + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. 
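+  ;; (i.e. the indirect call through the vtable slot was replaced by a
+  ;; direct call to its only implementation, _ZN1A1nEi.)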
+ ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + ;; --lto-whole-program-visibility disabled so no devirtualization + ; CHECK-VALIDATE-IR: %call = tail call i32 %fptr1 + ; CHECK-NO-RTTI-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; We still have to call it as virtual. + ; CHECK-IR: %call2 = tail call i32 %fptr22 + ; CHECK-VALIDATE-IR: %call2 = tail call i32 %fptr22 + ; CHECK-NO-RTTI-IR: %call2 = tail call i32 %fptr22 + %call2 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !10) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call3 = tail call i32 @_ZN1D1mEi + ;; Types not present in native files can still be devirtualized + ; CHECK-VALIDATE-IR: %call3 = tail call i32 @_ZN1D1mEi + ;; --lto-whole-program-visibility disabled but being local this + ;; has VCallVisibilityTranslationUnit visibility so it's still devirtualized + ; CHECK-NO-RTTI-IR: %call3 = tail call i32 @_ZN1D1mEi + %call3 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call2) + + ret i32 %call3 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1B1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1C1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFviE.virtual"} +!2 = !{i64 24, !"_ZTSM1AFviE.virtual"} +!3 = !{i64 16, !"_ZTS1B"} +!4 = !{i64 16, !"_ZTSM1BFviE.virtual"} +!5 = !{i64 24, !"_ZTSM1BFviE.virtual"} +!6 = !{i64 16, !"_ZTS1C"} +!7 = !{i64 16, !"_ZTSM1CFviE.virtual"} +!8 = !{i64 24, !"_ZTSM1CFviE.virtual"} +!9 = !{i64 16, !10} +!10 = distinct !{} +!11 = !{i64 2} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll new file mode 100644 index 0000000000000000000000000000000000000000..15040b8707aede995aea588638eb7c7c3eafafaf --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll @@ -0,0 +1,183 @@ +; REQUIRES: x86 + +; RUN: rm -rf %t.dir +; RUN: split-file %s %t.dir +; RUN: cd %t.dir + +;; Common artifacts +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1.o ThinLTO.ll +; RUN: opt -module-summary -o %t2.o RegularLTO.ll + +;; --lto-whole-program-visibility when there's split ThinLTO and a RegularLTO with summary optimizes +;; using the combined index. +; RUN: ld.lld %t1.o %t2.o -o %t3 -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR,CHECK-COMMON-IR +; RUN: llvm-dis %t3.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-REGULAR-IR,CHECK-COMMON-REGULAR-IR + +;; --lto-validate-all-vtables-have-type-infos when there's split ThinLTO and a RegularLTO with summary behaves the same +;; as everything is present in the combined index. 
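+;; (Validation only blocks WPD for vtables defined outside the summaries;
+;; here both modules carry summaries, so nothing is treated as native.)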
+; RUN: ld.lld %t1.o %t2.o -o %t3 -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR,CHECK-COMMON-IR +; RUN: llvm-dis %t3.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-REGULAR-IR,CHECK-COMMON-REGULAR-IR + +; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi +; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi + +;--- ThinLTO.ll +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1A1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !3, !type !4, !type !5 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1C, ptr @_ZN1A1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !6, !type !7, !type !8 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1D, ptr @_ZN1D1mEi] }, !type !9, !vcall_visibility !11 + +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + +@_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00" +@_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A } + +@_ZTS1C = linkonce_odr constant [3 x i8] c"1C\00" +@_ZTI1C = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1C, ptr @_ZTI1A } + +@_ZTS1D = internal constant [3 x i8] c"1D\00" +@_ZTI1D = internal constant { ptr, ptr } { ptr null, ptr @_ZTS1D } + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ], section "llvm.metadata" + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { + ;; Call function built with RegularLTO + %RegularLTOResult = call i32 @RegularLTO(ptr %obj, i32 %a) + + ;; ThinLTO code starts here + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; Check that the call was not devirtualized. + ; CHECK-IR: %call2 = tail call i32 %fptr22 + %call2 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !10) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. 
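+  ;; (_ZTV1D is internal with translation-unit vcall visibility, so it is
+  ;; eligible for WPD regardless of what the native objects contain.)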
+ ; CHECK-IR: %call3 = tail call i32 @_ZN1D1mEi + %call3 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call2) + + ret i32 %call3 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i32 @RegularLTO(ptr) +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1A1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFviE.virtual"} +!2 = !{i64 24, !"_ZTSM1AFviE.virtual"} +!3 = !{i64 16, !"_ZTS1B"} +!4 = !{i64 16, !"_ZTSM1BFviE.virtual"} +!5 = !{i64 24, !"_ZTSM1BFviE.virtual"} +!6 = !{i64 16, !"_ZTS1C"} +!7 = !{i64 16, !"_ZTSM1CFviE.virtual"} +!8 = !{i64 24, !"_ZTSM1CFviE.virtual"} +!9 = !{i64 16, !10} +!10 = distinct !{} +!11 = !{i64 2} + +;--- RegularLTO.ll +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.Native = type { %struct.A } + +@_ZTV7Regular = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI7Regular, ptr @_ZN7Regular1fEi, ptr @_ZN1A1nEi] } , !type !0, !type !1, !type !2, !type !3, !type !4, !type !5 +@_ZTS7Regular = linkonce_odr constant [9 x i8] c"7Regular\00" +@_ZTI7Regular = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS7Regular, ptr @_ZTI1A } + +; Base type A does not need to emit a vtable if it's never instantiated. However, RTTI still gets generated +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [1 x ptr] [ ptr @_ZTV7Regular ], section "llvm.metadata" + +; CHECK-COMMON-REGULAR-IR-LABEL: define dso_local i32 @RegularLTO +define i32 @RegularLTO(ptr %obj, i32 %a) #0 { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptr1 = load ptr, ptr %vtable, align 8 + + ;; Check that the call was not devirtualized. 
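+  ;; (Both _ZN7Regular1fEi and the ThinLTO module's _ZN1A1fEi implement the
+  ;; first slot for type-id _ZTS1A, so there is no single implementation.)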
+ ; CHECK-REGULAR-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + ret i32 %call +} +; CHECK-COMMON-REGULAR-IR-LABEL: ret i32 +; CHECK-COMMON-REGULAR-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN7Regular1fEi(ptr %this, i32 %a) #0 { + ret i32 1; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +attributes #0 = { noinline optnone } +!llvm.module.flags = !{!6, !7} + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFviE.virtual"} +!2 = !{i64 24, !"_ZTSM1AFviE.virtual"} +!3 = !{i64 16, !"_ZTS7Regular"} +!4 = !{i64 16, !"_ZTSM7RegularFviE.virtual"} +!5 = !{i64 24, !"_ZTSM7RegularFviE.virtual"} +!6 = !{i32 1, !"ThinLTO", i32 0} +!7 = !{i32 1, !"EnableSplitLTOUnit", i32 1} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll new file mode 100644 index 0000000000000000000000000000000000000000..30bd75606f7d2d0aeb4bfeb2e82f289941101d0a --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll @@ -0,0 +1,136 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cp %s %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!6, !7}' >> %t1_regular.ll +; RUN: echo '!6 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!7 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +;; With --lto-whole-program-visibility, we assume no native types can interfere +;; and thus proceed with devirtualization even in the presence of native types + +;; Index based WPD +; RUN: ld.lld %t1.o -o %t3_index -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o -o %t3_hybrid -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o -o %t3_regular -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi +; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +;; With --lto-whole-program-visibility and --lto-validate-all-vtables-have-type-infos +;; we rely on resolutions on the typename symbol to inform us of what's outside the summary. +;; Without the typename symbol in the LTO unit (e.g. RTTI disabled) this causes +;; conservative disablement of WPD on these types unless it's local + +;; Index based WPD +; RUN: ld.lld %t1.o -o %t3_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o -o %t3_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o -o %t3_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +; VALIDATE-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1B1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1C1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !2 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN1D1mEi] }, !type !3, !vcall_visibility !5 + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ] + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + ;; No resolution for _ZTS1A means we don't devirtualize + ; CHECK-VALIDATE-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; We still have to call it as virtual. + ; CHECK-IR: %call3 = tail call i32 %fptr22 + ; CHECK-VALIDATE-IR: %call3 = tail call i32 %fptr22 + %call3 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !4) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. 
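+  ;; (the type test here uses the private type-id !4 rather than a _ZTS
+  ;; string, so no native object could ever define or reference it.)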
+ ; CHECK-IR: %call4 = tail call i32 @_ZN1D1mEi + ;; Being local this has VCallVisibilityTranslationUnit + ;; visibility so it's still devirtualized + ; CHECK-VALIDATE-IR: %call4 = tail call i32 @_ZN1D1mEi + %call4 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call3) + ret i32 %call4 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1B1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1C1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTS1B"} +!2 = !{i64 16, !"_ZTS1C"} +!3 = !{i64 16, !4} +!4 = distinct !{} +!5 = !{i64 2} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll new file mode 100644 index 0000000000000000000000000000000000000000..4ef048d6b6c601b9bf174c24f3c8f4372814d0bc --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll @@ -0,0 +1,130 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cp %s %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!2, !3}' >> %t1_regular.ll +; RUN: echo '!2 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!3 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_ref.ll -o %t2.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2.bc -o %t2.o + +;; Native objects can contain only a reference to the base type infos if the base declaration has no key functions. +;; Because of that, --lto-validate-all-vtables-have-type-infos needs to query for the type info symbol inside native files rather than the +;; type name symbol that's used as the key in !type metadata to correctly stop devirtualization on the native type. + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t3_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t3_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t3_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +; CHECK-NOT: single-impl: devirtualized a call to _ZN1A3fooEv + +;; Source code: +;; cat > a.h <<'eof' +;; struct A { virtual int foo(); }; +;; int bar(A *a); +;; eof +;; cat > main.cc <<'eof' +;; #include "a.h" +;; +;; int A::foo() { return 1; } +;; int bar(A *a) { return a->foo(); } +;; +;; extern int baz(); +;; int main() { +;; A a; +;; int i = bar(&a); +;; int j = baz(); +;; return i + j; +;; } +;; eof +;; clang++ -fwhole-program-vtables -fno-split-lto-unit -flto=thin main.cc -c + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { %struct.Abase } +%struct.Abase = type { ptr } + +@_ZTV1A = dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1A, ptr @_ZN1A3fooEv] }, align 8, !type !0, !type !1 +@_ZTS1A = dso_local constant [3 x i8] c"1A\00", align 1 +@_ZTI1A = dso_local constant { ptr, ptr } { ptr null, ptr @_ZTS1A }, align 8 + +define dso_local noundef i32 @_ZN1A3fooEv(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 align 2 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + ret i32 1 +} + +; CHECK-IR: define dso_local noundef i32 @_Z3barP1A +define dso_local noundef i32 @_Z3barP1A(ptr noundef %a) #0 { +entry: + %a.addr = alloca ptr + store ptr %a, ptr %a.addr + %0 = load ptr, ptr %a.addr + %vtable = load ptr, ptr %0 + %1 = call i1 @llvm.public.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %1) + %vfn = getelementptr inbounds ptr, ptr %vtable, i64 0 + %fptr = load ptr, ptr %vfn + ;; Check that the call was not devirtualized. + ; CHECK-IR: %call = call noundef i32 %fptr + %call = call noundef i32 %fptr(ptr noundef nonnull align 8 dereferenceable(8) %0) + ret i32 %call +} +; CHECK-IR: ret i32 +; CHECK-IR: } + +declare i1 @llvm.public.type.test(ptr, metadata) +declare void @llvm.assume(i1 noundef) + +define dso_local noundef i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %a = alloca %struct.A, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + store i32 0, ptr %retval, align 4 + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %a) + %call = call noundef i32 @_Z3barP1A(ptr noundef %a) + store i32 %call, ptr %i, align 4 + %call1 = call noundef i32 @_Z3bazv() + store i32 %call1, ptr %j, align 4 + %0 = load i32, ptr %i, align 4 + %1 = load i32, ptr %j, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +define linkonce_odr dso_local void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +declare noundef i32 @_Z3bazv() + +;; Make sure we don't inline or otherwise optimize out the direct calls. 
+attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFivE.virtual"} diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h index 568c9cf87f80e4648d220d1d955e3894c2825a4a..50b78095d015486c611cf136e1dfeb8bf70f619f 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -723,10 +723,10 @@ public: void setMemProt(orc::MemProt Prot) { this->Prot = Prot; } /// Get the memory lifetime policy for this section. - orc::MemLifetimePolicy getMemLifetimePolicy() const { return MLP; } + orc::MemLifetime getMemLifetime() const { return ML; } /// Set the memory lifetime policy for this section. - void setMemLifetimePolicy(orc::MemLifetimePolicy MLP) { this->MLP = MLP; } + void setMemLifetime(orc::MemLifetime ML) { this->ML = ML; } /// Returns the ordinal for this section. SectionOrdinal getOrdinal() const { return SecOrdinal; } @@ -794,7 +794,7 @@ private: StringRef Name; orc::MemProt Prot; - orc::MemLifetimePolicy MLP = orc::MemLifetimePolicy::Standard; + orc::MemLifetime ML = orc::MemLifetime::Standard; SectionOrdinal SecOrdinal = 0; BlockSet Blocks; SymbolSet Symbols; diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h index 09e0d71cf0bd29cdda2f7d49bc74cd075eb609e3..1b8c4d4e181cdcc16aaf2aa5a234aa2980e1f46c 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h @@ -292,8 +292,8 @@ private: /// address of that block using the Segment's AllocGroup. Once memory has been /// populated, clients can call finalize to finalize the memory. /// -/// Note: Segments with MemLifetimePolicy::NoAlloc are not permitted, since -/// they would not be useful, and their presence is likely to indicate a bug. +/// Note: Segments with MemLifetime::NoAlloc are not permitted, since they would +/// not be useful, and their presence is likely to indicate a bug. class SimpleSegmentAlloc { public: /// Describes a segment to be allocated. diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h index c20366cfbb38883796b0c7f690421b1ada84134e..b8b5f90b6b0fbc88a1d91fd288c52f650cf71c5b 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h @@ -72,7 +72,7 @@ inline MemProt fromSysMemoryProtectionFlags(sys::Memory::ProtectionFlags PF) { /// deallocated if a call is made to /// JITLinkMemoryManager::InFlightAllocation::abandon. The policies below apply /// to finalized allocations. -enum class MemLifetimePolicy { +enum class MemLifetime { /// Standard memory should be allocated by the allocator and then deallocated /// when the deallocate method is called for the finalized allocation. Standard, @@ -89,15 +89,15 @@ enum class MemLifetimePolicy { }; /// Print a MemDeallocPolicy. 
-inline raw_ostream &operator<<(raw_ostream &OS, MemLifetimePolicy MLP) {
+inline raw_ostream &operator<<(raw_ostream &OS, MemLifetime MLP) {
   switch (MLP) {
-  case MemLifetimePolicy::Standard:
+  case MemLifetime::Standard:
     OS << "standard";
     break;
-  case MemLifetimePolicy::Finalize:
+  case MemLifetime::Finalize:
     OS << "finalize";
     break;
-  case MemLifetimePolicy::NoAlloc:
+  case MemLifetime::NoAlloc:
     OS << "noalloc";
     break;
   }
@@ -124,11 +124,11 @@ public:
   AllocGroup() = default;
 
   /// Create an AllocGroup from a MemProt only -- uses
-  /// MemLifetimePolicy::Standard.
+  /// MemLifetime::Standard.
   AllocGroup(MemProt MP) : Id(static_cast<underlying_type>(MP)) {}
 
-  /// Create an AllocGroup from a MemProt and a MemLifetimePolicy.
-  AllocGroup(MemProt MP, MemLifetimePolicy MLP)
+  /// Create an AllocGroup from a MemProt and a MemLifetime.
+  AllocGroup(MemProt MP, MemLifetime MLP)
       : Id(static_cast<underlying_type>(MP) |
            (static_cast<underlying_type>(MLP) << BitsForProt)) {}
 
@@ -137,9 +137,9 @@ public:
     return static_cast<MemProt>(Id & ((1U << BitsForProt) - 1));
   }
 
-  /// Returns the MemLifetimePolicy for this group.
-  MemLifetimePolicy getMemLifetimePolicy() const {
-    return static_cast<MemLifetimePolicy>(Id >> BitsForProt);
+  /// Returns the MemLifetime for this group.
+  MemLifetime getMemLifetime() const {
+    return static_cast<MemLifetime>(Id >> BitsForProt);
   }
 
   friend bool operator==(const AllocGroup &LHS, const AllocGroup &RHS) {
@@ -203,8 +203,7 @@ private:
 /// Print an AllocGroup.
 inline raw_ostream &operator<<(raw_ostream &OS, AllocGroup AG) {
-  return OS << '(' << AG.getMemProt() << ", " << AG.getMemLifetimePolicy()
-            << ')';
+  return OS << '(' << AG.getMemProt() << ", " << AG.getMemLifetime() << ')';
 }
 
 } // end namespace orc
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h
index 09c73db44a947b3fc8ef1747f9d032d1332f3b10..1285867565e22b80cdbf8223c729572d1b5adc5b 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/TargetProcessControlTypes.h
@@ -36,10 +36,9 @@ struct RemoteAllocGroup {
   RemoteAllocGroup(MemProt Prot, bool FinalizeLifetime)
       : Prot(Prot), FinalizeLifetime(FinalizeLifetime) {}
   RemoteAllocGroup(const AllocGroup &AG) : Prot(AG.getMemProt()) {
-    assert(AG.getMemLifetimePolicy() != orc::MemLifetimePolicy::NoAlloc &&
+    assert(AG.getMemLifetime() != orc::MemLifetime::NoAlloc &&
            "Cannot use no-alloc memory in a remote alloc request");
-    FinalizeLifetime =
-        AG.getMemLifetimePolicy() == orc::MemLifetimePolicy::Finalize;
+    FinalizeLifetime = AG.getMemLifetime() == orc::MemLifetime::Finalize;
   }
 
   MemProt Prot;
diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h
index 5c23ba4f7ac498f2679af3c638ab6b62fdf39b40..76e19dd007912d3ad5cfb117ebfa3cb880721124 100644
--- a/llvm/include/llvm/LTO/Config.h
+++ b/llvm/include/llvm/LTO/Config.h
@@ -80,6 +80,12 @@ struct Config {
   /// link.
   bool HasWholeProgramVisibility = false;
 
+  /// We're validating that all native vtables have corresponding type infos.
+  bool ValidateAllVtablesHaveTypeInfos = false;
+  /// If all native vtables have corresponding type infos, allow
+  /// usage of RTTI to block devirtualization on types used in native files.
+  bool AllVtablesHaveTypeInfos = false;
+
   /// Always emit a Regular LTO object even when it is empty because no Regular
   /// LTO modules were linked. This option is useful for build systems that
   /// want to know a priori all possible output files.
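A note on the AllocGroup encoding that the rename above touches. The sketch below is not the ORC header itself -- the class name and the enumerator values are assumptions made for illustration -- but it mirrors the packing scheme visible in the hunks: the MemProt flags occupy the low three bits of a single id byte, and the MemLifetime rides in the bits above them.

  #include <cassert>
  #include <cstdint>

  enum class MemProt : uint8_t { None = 0, Read = 1, Write = 2, Exec = 4 };
  enum class MemLifetime : uint8_t { Standard = 0, Finalize = 1, NoAlloc = 2 };

  class PackedAllocGroup {
    using underlying_type = uint8_t;
    static constexpr unsigned BitsForProt = 3; // One bit each for R, W and X.
    underlying_type Id = 0;

  public:
    PackedAllocGroup(MemProt MP, MemLifetime ML)
        : Id(static_cast<underlying_type>(MP) |
             (static_cast<underlying_type>(ML) << BitsForProt)) {}

    // Mask off the lifetime bits to recover the protection flags.
    MemProt getMemProt() const {
      return static_cast<MemProt>(Id & ((1U << BitsForProt) - 1));
    }

    // Everything above the protection bits is the lifetime.
    MemLifetime getMemLifetime() const {
      return static_cast<MemLifetime>(Id >> BitsForProt);
    }
  };

  int main() {
    PackedAllocGroup AG(MemProt::Read, MemLifetime::Finalize);
    assert(AG.getMemProt() == MemProt::Read);
    assert(AG.getMemLifetime() == MemLifetime::Finalize);
    return 0;
  }

Keeping the whole group in one small integer is what lets JITLink key per-segment maps directly on a (MemProt, MemLifetime) pair, as in Segments[{Sec.getMemProt(), Sec.getMemLifetime()}] in JITLinkMemoryManager.cpp further down.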
diff --git a/llvm/include/llvm/Support/raw_ostream.h b/llvm/include/llvm/Support/raw_ostream.h
index 1e01eb9ea19c4187302a91457b6d34fbe5b67584..a1ad1f8f5333244b65c4461a1225e8cd26b2895a 100644
--- a/llvm/include/llvm/Support/raw_ostream.h
+++ b/llvm/include/llvm/Support/raw_ostream.h
@@ -437,8 +437,8 @@ public:
 #ifndef NDEBUG
     uint64_t Pos = tell();
     // /dev/null always reports a pos of 0, so we cannot perform this check
-    // in that case.
-    if (Pos)
+    // in that case. Likewise, when Size is 0, no extending will occur.
+    if (Pos && Size)
       assert(Size + Offset <= Pos && "We don't support extending the stream");
 #endif
     pwrite_impl(Ptr, Size, Offset);
diff --git a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h
index 9e121d9c6f4ed1000f8ddcfd3c5e3658e4729572..0be3146f695a67607435c28535f6a5ec79f0db41 100644
--- a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h
+++ b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h
@@ -243,10 +243,18 @@ void updatePublicTypeTestCalls(Module &M,
                                bool WholeProgramVisibilityEnabledInLTO);
 void updateVCallVisibilityInModule(
     Module &M, bool WholeProgramVisibilityEnabledInLTO,
-    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols);
+    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols,
+    bool ValidateAllVtablesHaveTypeInfos,
+    function_ref<bool(StringRef)> IsVisibleToRegularObj);
 void updateVCallVisibilityInIndex(
     ModuleSummaryIndex &Index, bool WholeProgramVisibilityEnabledInLTO,
-    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols);
+    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols,
+    const DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols);
+
+void getVisibleToRegularObjVtableGUIDs(
+    ModuleSummaryIndex &Index,
+    DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols,
+    function_ref<bool(StringRef)> IsVisibleToRegularObj);
 
 /// Perform index-based whole program devirtualization on the \p Summary
 /// index. Any devirtualized targets used by a type test in another module
diff --git a/llvm/include/llvm/Transforms/Utils/CodeLayout.h b/llvm/include/llvm/Transforms/Utils/CodeLayout.h
index e8106e474332199a9e49a19b04fe0d91725a90ca..f5127cff24af0dfd3901d19706db6f36656adcb8 100644
--- a/llvm/include/llvm/Transforms/Utils/CodeLayout.h
+++ b/llvm/include/llvm/Transforms/Utils/CodeLayout.h
@@ -14,14 +14,21 @@
 #ifndef LLVM_TRANSFORMS_UTILS_CODELAYOUT_H
 #define LLVM_TRANSFORMS_UTILS_CODELAYOUT_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include <utility>
 #include <vector>
 
-namespace llvm {
+namespace llvm::codelayout {
 
 using EdgeT = std::pair<uint64_t, uint64_t>;
-using EdgeCountT = std::pair<EdgeT, uint64_t>;
+
+struct EdgeCount {
+  uint64_t src;
+  uint64_t dst;
+  uint64_t count;
+};
 
 /// Find a layout of nodes (basic blocks) of a given CFG optimizing jump
 /// locality and thus processor I-cache utilization. This is achieved via
@@ -34,25 +41,55 @@ using EdgeCountT = std::pair<EdgeT, uint64_t>;
 /// \p EdgeCounts: The execution counts of every edge (jump) in the profile. The
 /// map also defines the edges in CFG and should include 0-count edges.
 /// \returns The best block order found.
-std::vector<uint64_t>
-applyExtTspLayout(const std::vector<uint64_t> &NodeSizes,
-                  const std::vector<uint64_t> &NodeCounts,
-                  const std::vector<EdgeCountT> &EdgeCounts);
+std::vector<uint64_t> computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,
+                                          ArrayRef<uint64_t> NodeCounts,
+                                          ArrayRef<EdgeCount> EdgeCounts);
 
 /// Estimate the "quality" of a given node order in CFG. The higher the score,
 /// the better the order is. The score is designed to reflect the locality of
 /// the given order, which is anti-correlated with the number of I-cache misses
 /// in a typical execution of the function.
-double calcExtTspScore(const std::vector<uint64_t> &Order,
-                       const std::vector<uint64_t> &NodeSizes,
-                       const std::vector<uint64_t> &NodeCounts,
-                       const std::vector<EdgeCountT> &EdgeCounts);
+double calcExtTspScore(ArrayRef<uint64_t> Order, ArrayRef<uint64_t> NodeSizes,
+                       ArrayRef<uint64_t> NodeCounts,
+                       ArrayRef<EdgeCount> EdgeCounts);
 
 /// Estimate the "quality" of the current node order in CFG.
-double calcExtTspScore(const std::vector<uint64_t> &NodeSizes,
-                       const std::vector<uint64_t> &NodeCounts,
-                       const std::vector<EdgeCountT> &EdgeCounts);
+double calcExtTspScore(ArrayRef<uint64_t> NodeSizes,
+                       ArrayRef<uint64_t> NodeCounts,
+                       ArrayRef<EdgeCount> EdgeCounts);
+
+/// Algorithm-specific params for Cache-Directed Sort. The values are tuned for
+/// the best performance of large-scale front-end bound binaries.
+struct CDSortConfig {
+  /// The size of the cache.
+  unsigned CacheEntries = 16;
+  /// The size of a line in the cache.
+  unsigned CacheSize = 2048;
+  /// The power exponent for the distance-based locality.
+  double DistancePower = 0.25;
+  /// The scale factor for the frequency-based locality.
+  double FrequencyScale = 0.25;
+};
+
+/// Apply a Cache-Directed Sort for functions represented by a call graph.
+/// The placement is done by optimizing the call locality by co-locating
+/// frequently executed functions.
+/// \p FuncSizes: The sizes of the nodes (in bytes).
+/// \p FuncCounts: The execution counts of the nodes in the profile.
+/// \p CallCounts: The execution counts of every edge (jump) in the profile. The
+/// map also defines the edges in CFG and should include 0-count edges.
+/// \p CallOffsets: The offsets of the calls from their source nodes.
+/// \returns The best function order found.
+std::vector<uint64_t> computeCacheDirectedLayout(
+    ArrayRef<uint64_t> FuncSizes, ArrayRef<uint64_t> FuncCounts,
+    ArrayRef<EdgeCount> CallCounts, ArrayRef<uint64_t> CallOffsets);
+
+/// Apply a Cache-Directed Sort with a custom config.
+std::vector<uint64_t> computeCacheDirectedLayout(
+    const CDSortConfig &Config, ArrayRef<uint64_t> FuncSizes,
+    ArrayRef<uint64_t> FuncCounts, ArrayRef<EdgeCount> CallCounts,
+    ArrayRef<uint64_t> CallOffsets);
 
-} // end namespace llvm
+} // namespace llvm::codelayout
 
 #endif // LLVM_TRANSFORMS_UTILS_CODELAYOUT_H
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 6913165add258561237efa246dd0edc37a8c447a..b69045b4d61f28b47ad70377002a3d938b9bcbeb 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -3502,7 +3502,7 @@ void MachineBlockPlacement::applyExtTsp() {
 
   auto BlockSizes = std::vector<uint64_t>(F->size());
   auto BlockCounts = std::vector<uint64_t>(F->size());
-  std::vector<EdgeCountT> JumpCounts;
+  std::vector<codelayout::EdgeCount> JumpCounts;
   for (MachineBasicBlock &MBB : *F) {
     // Getting the block frequency.
     BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
@@ -3521,8 +3521,8 @@ void MachineBlockPlacement::applyExtTsp() {
     for (MachineBasicBlock *Succ : MBB.successors()) {
       auto EP = MBPI->getEdgeProbability(&MBB, Succ);
       BlockFrequency JumpFreq = BlockFreq * EP;
-      auto Jump = std::make_pair(BlockIndex[&MBB], BlockIndex[Succ]);
-      JumpCounts.push_back(std::make_pair(Jump, JumpFreq.getFrequency()));
+      JumpCounts.push_back(
+          {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
     }
   }
 
@@ -3535,7 +3535,7 @@ void MachineBlockPlacement::applyExtTsp() {
                     calcExtTspScore(BlockSizes, BlockCounts, JumpCounts)));
 
   // Run the layout algorithm.
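   // (computeExtTspLayout returns a permutation of the dense block indices
   // built above; the loop that follows maps it back to MachineBasicBlocks.)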
-  auto NewOrder = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
+  auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
   std::vector<const MachineBasicBlock *> NewBlockOrder;
   NewBlockOrder.reserve(F->size());
   for (uint64_t Node : NewOrder) {
diff --git a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp
index 6668854e1a6a6854b47c94f8726ff73b5573f27d..3bf7c9edb8bc596b05d86a1326d80ee4c637e4fc 100644
--- a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp
@@ -161,7 +161,7 @@ Error COFFLinkGraphBuilder::graphifySections() {
     if (!GraphSec) {
       GraphSec = &G->createSection(SectionName, Prot);
       if ((*Sec)->Characteristics & COFF::IMAGE_SCN_LNK_REMOVE)
-        GraphSec->setMemLifetimePolicy(orc::MemLifetimePolicy::NoAlloc);
+        GraphSec->setMemLifetime(orc::MemLifetime::NoAlloc);
     }
     if (GraphSec->getMemProt() != Prot)
       return make_error<JITLinkError>("MemProt should match");
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
index e726457983490df0557bb63b0a8eb5e8d6bcb2eb..127f33aad2eada4362a1ce31322811c0f3bd54a7 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
+++ b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
@@ -366,7 +366,7 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::graphifySections() {
       GraphSec = &G->createSection(*Name, Prot);
       // Non-SHF_ALLOC sections get NoAlloc memory lifetimes.
       if (!(Sec.sh_flags & ELF::SHF_ALLOC)) {
-        GraphSec->setMemLifetimePolicy(orc::MemLifetimePolicy::NoAlloc);
+        GraphSec->setMemLifetime(orc::MemLifetime::NoAlloc);
         LLVM_DEBUG({
           dbgs() << "    " << SecIndex << ": \"" << *Name
                  << "\" is not a SHF_ALLOC section. Using NoAlloc lifetime.\n";
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
index e69eddd6e1194479a10bbec118238ab1df5559f3..25569d63daa298a330169254c864d9d189e503ea 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
@@ -124,8 +124,7 @@ private:
     LLVM_DEBUG(dbgs() << "Fixing up blocks:\n");
 
     for (auto &Sec : G.sections()) {
-      bool NoAllocSection =
-          Sec.getMemLifetimePolicy() == orc::MemLifetimePolicy::NoAlloc;
+      bool NoAllocSection = Sec.getMemLifetime() == orc::MemLifetime::NoAlloc;
 
       for (auto *B : Sec.blocks()) {
         LLVM_DEBUG(dbgs() << "  " << *B << ":\n");
@@ -153,12 +152,11 @@ private:
 
         // If B is a block in a Standard or Finalize section then make sure
         // that no edges point to symbols in NoAlloc sections.
-        assert(
-            (NoAllocSection || !E.getTarget().isDefined() ||
-             E.getTarget().getBlock().getSection().getMemLifetimePolicy() !=
-                 orc::MemLifetimePolicy::NoAlloc) &&
-            "Block in allocated section has edge pointing to no-alloc "
-            "section");
+        assert((NoAllocSection || !E.getTarget().isDefined() ||
+                E.getTarget().getBlock().getSection().getMemLifetime() !=
+                    orc::MemLifetime::NoAlloc) &&
+               "Block in allocated section has edge pointing to no-alloc "
+               "section");
 
         // Dispatch to LinkerImpl for fixup.
if (auto Err = impl().applyFixup(G, *B, E)) diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp index f481504135a5fba88f2e8354638a94ca6c497e0b..57e17aa78fed919f4cf7bb4fb422619dd4084bf7 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp @@ -26,10 +26,10 @@ BasicLayout::BasicLayout(LinkGraph &G) : G(G) { for (auto &Sec : G.sections()) { // Skip empty sections, and sections with NoAlloc lifetime policies. if (Sec.blocks().empty() || - Sec.getMemLifetimePolicy() == orc::MemLifetimePolicy::NoAlloc) + Sec.getMemLifetime() == orc::MemLifetime::NoAlloc) continue; - auto &Seg = Segments[{Sec.getMemProt(), Sec.getMemLifetimePolicy()}]; + auto &Seg = Segments[{Sec.getMemProt(), Sec.getMemLifetime()}]; for (auto *B : Sec.blocks()) if (LLVM_LIKELY(!B->isZeroFill())) Seg.ContentBlocks.push_back(B); @@ -90,7 +90,7 @@ BasicLayout::getContiguousPageBasedLayoutSizes(uint64_t PageSize) { inconvertibleErrorCode()); uint64_t SegSize = alignTo(Seg.ContentSize + Seg.ZeroFillSize, PageSize); - if (AG.getMemLifetimePolicy() == orc::MemLifetimePolicy::Standard) + if (AG.getMemLifetime() == orc::MemLifetime::Standard) SegsSizes.StandardSegs += SegSize; else SegsSizes.FinalizeSegs += SegSize; @@ -164,15 +164,15 @@ void SimpleSegmentAlloc::Create(JITLinkMemoryManager &MemMgr, auto &AG = KV.first; auto &Seg = KV.second; - assert(AG.getMemLifetimePolicy() != orc::MemLifetimePolicy::NoAlloc && + assert(AG.getMemLifetime() != orc::MemLifetime::NoAlloc && "NoAlloc segments are not supported by SimpleSegmentAlloc"); auto AGSectionName = AGSectionNames[static_cast(AG.getMemProt()) | - static_cast(AG.getMemLifetimePolicy()) << 3]; + static_cast(AG.getMemLifetime()) << 3]; auto &Sec = G->createSection(AGSectionName, AG.getMemProt()); - Sec.setMemLifetimePolicy(AG.getMemLifetimePolicy()); + Sec.setMemLifetime(AG.getMemLifetime()); if (Seg.ContentSize != 0) { NextAddr = @@ -419,10 +419,9 @@ void InProcessMemoryManager::allocate(const JITLinkDylib *JD, LinkGraph &G, auto &AG = KV.first; auto &Seg = KV.second; - auto &SegAddr = - (AG.getMemLifetimePolicy() == orc::MemLifetimePolicy::Standard) - ? NextStandardSegAddr - : NextFinalizeSegAddr; + auto &SegAddr = (AG.getMemLifetime() == orc::MemLifetime::Standard) + ? NextStandardSegAddr + : NextFinalizeSegAddr; Seg.WorkingMem = SegAddr.toPtr(); Seg.Addr = SegAddr; diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp index c40e0f9ffc8d4740accf7cb93e7e5dc986fff469..45385eb6f76dc25c848b0f5a289685b27ecac18c 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp @@ -192,7 +192,7 @@ Error MachOLinkGraphBuilder::createNormalizedSections() { // TODO: Are there any other criteria for NoAlloc lifetime? 
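     // For reference, the renamed orc::MemLifetime distinguishes three
     // policies: Standard memory stays resident after finalization, Finalize
     // memory is released once finalization completes, and NoAlloc sections
     // (like the debug sections below) are never allocated in the executing
     // process and are only materialized on the linker side.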
if (NSec.Flags & MachO::S_ATTR_DEBUG) - NSec.GraphSection->setMemLifetimePolicy(orc::MemLifetimePolicy::NoAlloc); + NSec.GraphSection->setMemLifetime(orc::MemLifetime::NoAlloc); IndexToSection.insert(std::make_pair(SecIndex, std::move(NSec))); } diff --git a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp index ca4950077ffe92a4b46bb79a6ebc8ba11644a542..9cfe547c84c310b75c85c6204500cdd30342eb7a 100644 --- a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp @@ -322,8 +322,8 @@ void SharedMemoryMapper::initialize(MemoryMapper::AllocInfo &AI, std::memset(Base + Segment.ContentSize, 0, Segment.ZeroFillSize); tpctypes::SharedMemorySegFinalizeRequest SegReq; - SegReq.RAG = {Segment.AG.getMemProt(), Segment.AG.getMemLifetimePolicy() == - MemLifetimePolicy::Finalize}; + SegReq.RAG = {Segment.AG.getMemProt(), + Segment.AG.getMemLifetime() == MemLifetime::Finalize}; SegReq.Addr = AI.MappingBase + Segment.Offset; SegReq.Size = Segment.ContentSize + Segment.ZeroFillSize; diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index bc8abb751221ceda41c7eba05460c50d6719ebd6..6efdf6a7c3c926a57bf3e55ed3f325677309fecf 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1285,13 +1285,27 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { updateMemProfAttributes(*RegularLTO.CombinedModule, ThinLTO.CombinedIndex); + bool WholeProgramVisibilityEnabledInLTO = + Conf.HasWholeProgramVisibility && + // If validation is enabled, upgrade visibility only when all vtables + // have typeinfos. + (!Conf.ValidateAllVtablesHaveTypeInfos || Conf.AllVtablesHaveTypeInfos); + + // This returns true when the name is local or not defined. Locals are + // expected to be handled separately. + auto IsVisibleToRegularObj = [&](StringRef name) { + auto It = GlobalResolutions.find(name); + return (It == GlobalResolutions.end() || It->second.VisibleOutsideSummary); + }; + // If allowed, upgrade public vcall visibility metadata to linkage unit // visibility before whole program devirtualization in the optimizer. - updateVCallVisibilityInModule(*RegularLTO.CombinedModule, - Conf.HasWholeProgramVisibility, - DynamicExportSymbols); + updateVCallVisibilityInModule( + *RegularLTO.CombinedModule, WholeProgramVisibilityEnabledInLTO, + DynamicExportSymbols, Conf.ValidateAllVtablesHaveTypeInfos, + IsVisibleToRegularObj); updatePublicTypeTestCalls(*RegularLTO.CombinedModule, - Conf.HasWholeProgramVisibility); + WholeProgramVisibilityEnabledInLTO); if (Conf.PreOptModuleHook && !Conf.PreOptModuleHook(0, *RegularLTO.CombinedModule)) @@ -1693,13 +1707,38 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, std::set ExportedGUIDs; - if (hasWholeProgramVisibility(Conf.HasWholeProgramVisibility)) + bool WholeProgramVisibilityEnabledInLTO = + Conf.HasWholeProgramVisibility && + // If validation is enabled, upgrade visibility only when all vtables + // have typeinfos. + (!Conf.ValidateAllVtablesHaveTypeInfos || Conf.AllVtablesHaveTypeInfos); + if (hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) ThinLTO.CombinedIndex.setWithWholeProgramVisibility(); + + // If we're validating, get the vtable symbols that should not be + // upgraded because they correspond to typeIDs outside of index-based + // WPD info. + DenseSet VisibleToRegularObjSymbols; + if (WholeProgramVisibilityEnabledInLTO && + Conf.ValidateAllVtablesHaveTypeInfos) { + // This returns true when the name is local or not defined. 
Locals are + // expected to be handled separately. + auto IsVisibleToRegularObj = [&](StringRef name) { + auto It = GlobalResolutions.find(name); + return (It == GlobalResolutions.end() || + It->second.VisibleOutsideSummary); + }; + + getVisibleToRegularObjVtableGUIDs(ThinLTO.CombinedIndex, + VisibleToRegularObjSymbols, + IsVisibleToRegularObj); + } + // If allowed, upgrade public vcall visibility to linkage unit visibility in // the summaries before whole program devirtualization below. - updateVCallVisibilityInIndex(ThinLTO.CombinedIndex, - Conf.HasWholeProgramVisibility, - DynamicExportSymbols); + updateVCallVisibilityInIndex( + ThinLTO.CombinedIndex, WholeProgramVisibilityEnabledInLTO, + DynamicExportSymbols, VisibleToRegularObjSymbols); // Perform index-based WPD. This will return immediately if there are // no index entries in the typeIdMetadata map (e.g. if we are instead diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp index 1402da7fbbd2774a52dae95776f0a2a4318994d4..3e2216ca61a2c7281bd8143ff0d495bf2f57751c 100644 --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -604,11 +604,14 @@ bool LTOCodeGenerator::optimize() { // pipeline run below. updatePublicTypeTestCalls(*MergedModule, /* WholeProgramVisibilityEnabledInLTO */ false); - updateVCallVisibilityInModule(*MergedModule, - /* WholeProgramVisibilityEnabledInLTO */ false, - // FIXME: This needs linker information via a - // TBD new interface. - /* DynamicExportSymbols */ {}); + updateVCallVisibilityInModule( + *MergedModule, + /* WholeProgramVisibilityEnabledInLTO */ false, + // FIXME: These need linker information via a + // TBD new interface. + /*DynamicExportSymbols=*/{}, + /*ValidateAllVtablesHaveTypeInfos=*/false, + /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); // We always run the verifier once on the merged module, the `DisableVerify` // parameter only applies to subsequent verify. diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index 24cd6e1a0b415a00c03013446d9f81410f324bda..152f708969e13fb5a9f363f9c75b02c417b8818e 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -1058,11 +1058,14 @@ void ThinLTOCodeGenerator::run() { // via the internal option. Must be done before WPD below. if (hasWholeProgramVisibility(/* WholeProgramVisibilityEnabledInLTO */ false)) Index->setWithWholeProgramVisibility(); + + // FIXME: This needs linker information via a TBD new interface updateVCallVisibilityInIndex(*Index, - /* WholeProgramVisibilityEnabledInLTO */ false, - // FIXME: This needs linker information via a + /*WholeProgramVisibilityEnabledInLTO=*/false, + // FIXME: These need linker information via a // TBD new interface. - /* DynamicExportSymbols */ {}); + /*DynamicExportSymbols=*/{}, + /*VisibleToRegularObjSymbols=*/{}); // Perform index-based WPD. This will return immediately if there are // no index entries in the typeIdMetadata map (e.g. 
if we are instead diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index d332586423651c8cb004218914a0fb356cff8cf2..3406595950b58cb477ed7a6f5085ccce7b2461d8 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -784,12 +784,52 @@ bool hasWholeProgramVisibility(bool WholeProgramVisibilityEnabledInLTO) { !DisableWholeProgramVisibility; } +static bool +typeIDVisibleToRegularObj(StringRef TypeID, + function_ref IsVisibleToRegularObj) { + // TypeID for member function pointer type is an internal construct + // and won't exist in IsVisibleToRegularObj. The full TypeID + // will be present and participate in invalidation. + if (TypeID.ends_with(".virtual")) + return false; + + // TypeID that doesn't start with Itanium mangling (_ZTS) will be + // non-externally visible types which cannot interact with + // external native files. See CodeGenModule::CreateMetadataIdentifierImpl. + if (!TypeID.consume_front("_ZTS")) + return false; + + // TypeID is keyed off the type name symbol (_ZTS). However, the native + // object may not contain this symbol if it does not contain a key + // function for the base type and thus only contains a reference to the + // type info (_ZTI). To catch this case we query using the type info + // symbol corresponding to the TypeID. + std::string typeInfo = ("_ZTI" + TypeID).str(); + return IsVisibleToRegularObj(typeInfo); +} + +static bool +skipUpdateDueToValidation(GlobalVariable &GV, + function_ref IsVisibleToRegularObj) { + SmallVector Types; + GV.getMetadata(LLVMContext::MD_type, Types); + + for (auto Type : Types) + if (auto *TypeID = dyn_cast(Type->getOperand(1).get())) + return typeIDVisibleToRegularObj(TypeID->getString(), + IsVisibleToRegularObj); + + return false; +} + /// If whole program visibility asserted, then upgrade all public vcall /// visibility metadata on vtable definitions to linkage unit visibility in /// Module IR (for regular or hybrid LTO). void updateVCallVisibilityInModule( Module &M, bool WholeProgramVisibilityEnabledInLTO, - const DenseSet &DynamicExportSymbols) { + const DenseSet &DynamicExportSymbols, + bool ValidateAllVtablesHaveTypeInfos, + function_ref IsVisibleToRegularObj) { if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) return; for (GlobalVariable &GV : M.globals()) { @@ -800,7 +840,13 @@ void updateVCallVisibilityInModule( GV.getVCallVisibility() == GlobalObject::VCallVisibilityPublic && // Don't upgrade the visibility for symbols exported to the dynamic // linker, as we have no information on their eventual use. - !DynamicExportSymbols.count(GV.getGUID())) + !DynamicExportSymbols.count(GV.getGUID()) && + // With validation enabled, we want to exclude symbols visible to + // regular objects. Local symbols will be in this group due to the + // current implementation but those with VCallVisibilityTranslationUnit + // will have already been marked in clang so are unaffected. + !(ValidateAllVtablesHaveTypeInfos && + skipUpdateDueToValidation(GV, IsVisibleToRegularObj))) GV.setVCallVisibilityMetadata(GlobalObject::VCallVisibilityLinkageUnit); } } @@ -832,12 +878,26 @@ void updatePublicTypeTestCalls(Module &M, } } +/// Based on typeID string, get all associated vtable GUIDS that are +/// visible to regular objects. 
+void getVisibleToRegularObjVtableGUIDs( + ModuleSummaryIndex &Index, + DenseSet &VisibleToRegularObjSymbols, + function_ref IsVisibleToRegularObj) { + for (const auto &typeID : Index.typeIdCompatibleVtableMap()) { + if (typeIDVisibleToRegularObj(typeID.first, IsVisibleToRegularObj)) + for (const TypeIdOffsetVtableInfo &P : typeID.second) + VisibleToRegularObjSymbols.insert(P.VTableVI.getGUID()); + } +} + /// If whole program visibility asserted, then upgrade all public vcall /// visibility metadata on vtable definition summaries to linkage unit /// visibility in Module summary index (for ThinLTO). void updateVCallVisibilityInIndex( ModuleSummaryIndex &Index, bool WholeProgramVisibilityEnabledInLTO, - const DenseSet &DynamicExportSymbols) { + const DenseSet &DynamicExportSymbols, + const DenseSet &VisibleToRegularObjSymbols) { if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) return; for (auto &P : Index) { @@ -850,6 +910,12 @@ void updateVCallVisibilityInIndex( if (!GVar || GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic) continue; + // With validation enabled, we want to exclude symbols visible to regular + // objects. Local symbols will be in this group due to the current + // implementation but those with VCallVisibilityTranslationUnit will have + // already been marked in clang so are unaffected. + if (VisibleToRegularObjSymbols.count(P.first)) + continue; GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit); } } @@ -1045,8 +1111,8 @@ bool DevirtModule::tryFindVirtualCallTargets( } bool DevirtIndex::tryFindVirtualCallTargets( - std::vector &TargetsForSlot, const TypeIdCompatibleVtableInfo TIdInfo, - uint64_t ByteOffset) { + std::vector &TargetsForSlot, + const TypeIdCompatibleVtableInfo TIdInfo, uint64_t ByteOffset) { for (const TypeIdOffsetVtableInfo &P : TIdInfo) { // Find a representative copy of the vtable initializer. // We can have multiple available_externally, linkonce_odr and weak_odr diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp index ac74a1c116cce05ccb04133a6eeff668ba10910c..58b5afbc869e907de1fd33a838dc9b4ccedc60c7 100644 --- a/llvm/lib/Transforms/Utils/CodeLayout.cpp +++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp @@ -45,8 +45,11 @@ #include "llvm/Support/Debug.h" #include +#include using namespace llvm; +using namespace llvm::codelayout; + #define DEBUG_TYPE "code-layout" namespace llvm { @@ -61,8 +64,8 @@ cl::opt ApplyExtTspWithoutProfile( cl::init(true), cl::Hidden); } // namespace llvm -// Algorithm-specific params. The values are tuned for the best performance -// of large-scale front-end bound binaries. +// Algorithm-specific params for Ext-TSP. The values are tuned for the best +// performance of large-scale front-end bound binaries. static cl::opt ForwardWeightCond( "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1), cl::desc("The weight of conditional forward jumps for ExtTSP value")); @@ -113,6 +116,21 @@ static cl::opt EnableChainSplitAlongJumps( "ext-tsp-enable-chain-split-along-jumps", cl::ReallyHidden, cl::init(true), cl::desc("The maximum size of a chain to apply splitting")); +// Algorithm-specific options for CDS. 
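// The four flags below mirror the fields of CDSortConfig; when a flag is set
// explicitly, computeCacheDirectedLayout (at the bottom of this file) copies
// its value over the config's default. An illustrative invocation, assuming a
// driver that forwards LLVM options:
//
//   -mllvm -cds-cache-entries=32 -mllvm -cds-distance-power=0.5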
+static cl::opt CacheEntries("cds-cache-entries", cl::ReallyHidden, + cl::desc("The size of the cache")); + +static cl::opt CacheSize("cds-cache-size", cl::ReallyHidden, + cl::desc("The size of a line in the cache")); + +static cl::opt DistancePower( + "cds-distance-power", cl::ReallyHidden, + cl::desc("The power exponent for the distance-based locality")); + +static cl::opt FrequencyScale( + "cds-frequency-scale", cl::ReallyHidden, + cl::desc("The scale factor for the frequency-based locality")); + namespace { // Epsilon for comparison of doubles. @@ -280,9 +298,9 @@ struct ChainT { } ChainEdge *getEdge(ChainT *Other) const { - for (auto It : Edges) { - if (It.first == Other) - return It.second; + for (const auto &[Chain, ChainEdge] : Edges) { + if (Chain == Other) + return ChainEdge; } return nullptr; } @@ -302,13 +320,13 @@ struct ChainT { Edges.push_back(std::make_pair(Other, Edge)); } - void merge(ChainT *Other, const std::vector &MergedBlocks) { - Nodes = MergedBlocks; - // Update the chain's data + void merge(ChainT *Other, std::vector MergedBlocks) { + Nodes = std::move(MergedBlocks); + // Update the chain's data. ExecutionCount += Other->ExecutionCount; Size += Other->Size; Id = Nodes[0]->Index; - // Update the node's data + // Update the node's data. for (size_t Idx = 0; Idx < Nodes.size(); Idx++) { Nodes[Idx]->CurChain = this; Nodes[Idx]->CurIndex = Idx; @@ -340,7 +358,7 @@ struct ChainT { /// An edge in the graph representing jumps between two chains. /// When nodes are merged into chains, the edges are combined too so that -/// there is always at most one edge between a pair of chains +/// there is always at most one edge between a pair of chains. struct ChainEdge { ChainEdge(const ChainEdge &) = delete; ChainEdge(ChainEdge &&) = default; @@ -426,40 +444,34 @@ private: uint64_t NodeT::outCount() const { uint64_t Count = 0; - for (JumpT *Jump : OutJumps) { + for (JumpT *Jump : OutJumps) Count += Jump->ExecutionCount; - } return Count; } uint64_t NodeT::inCount() const { uint64_t Count = 0; - for (JumpT *Jump : InJumps) { + for (JumpT *Jump : InJumps) Count += Jump->ExecutionCount; - } return Count; } void ChainT::mergeEdges(ChainT *Other) { - // Update edges adjacent to chain Other - for (auto EdgeIt : Other->Edges) { - ChainT *DstChain = EdgeIt.first; - ChainEdge *DstEdge = EdgeIt.second; + // Update edges adjacent to chain Other. + for (const auto &[DstChain, DstEdge] : Other->Edges) { ChainT *TargetChain = DstChain == Other ? this : DstChain; ChainEdge *CurEdge = getEdge(TargetChain); if (CurEdge == nullptr) { DstEdge->changeEndpoint(Other, this); this->addEdge(TargetChain, DstEdge); - if (DstChain != this && DstChain != Other) { + if (DstChain != this && DstChain != Other) DstChain->addEdge(this, DstEdge); - } } else { CurEdge->moveJumps(DstEdge); } - // Cleanup leftover edge - if (DstChain != Other) { + // Cleanup leftover edge. + if (DstChain != Other) DstChain->removeEdge(Other); - } } } @@ -512,7 +524,7 @@ private: MergedChain mergeNodes(const std::vector &X, const std::vector &Y, size_t MergeOffset, MergeTypeT MergeType) { - // Split the first chain, X, into X1 and X2 + // Split the first chain, X, into X1 and X2. NodeIter BeginX1 = X.begin(); NodeIter EndX1 = X.begin() + MergeOffset; NodeIter BeginX2 = X.begin() + MergeOffset; @@ -520,7 +532,7 @@ MergedChain mergeNodes(const std::vector &X, NodeIter BeginY = Y.begin(); NodeIter EndY = Y.end(); - // Construct a new chain from the three existing ones + // Construct a new chain from the three existing ones. 
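  // For example, with X = [x1, x2, x3, x4] split at MergeOffset = 2 into
  // X1 = [x1, x2], X2 = [x3, x4], and Y = [y1, y2], the cases below yield:
  //   X_Y:     x1 x2 x3 x4 y1 y2   (plain concatenation)
  //   X1_Y_X2: x1 x2 y1 y2 x3 x4
  //   Y_X2_X1: y1 y2 x3 x4 x1 x2
  //   X2_X1_Y: x3 x4 x1 x2 y1 y2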
switch (MergeType) { case MergeTypeT::X_Y: return MergedChain(BeginX1, EndX2, BeginY, EndY); @@ -539,15 +551,14 @@ MergedChain mergeNodes(const std::vector &X, /// The implementation of the ExtTSP algorithm. class ExtTSPImpl { public: - ExtTSPImpl(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) + ExtTSPImpl(ArrayRef NodeSizes, ArrayRef NodeCounts, + ArrayRef EdgeCounts) : NumNodes(NodeSizes.size()) { initialize(NodeSizes, NodeCounts, EdgeCounts); } /// Run the algorithm and return an optimized ordering of nodes. - void run(std::vector &Result) { + std::vector run() { // Pass 1: Merge nodes with their mutually forced successors mergeForcedPairs(); @@ -558,20 +569,20 @@ public: mergeColdChains(); // Collect nodes from all chains - concatChains(Result); + return concatChains(); } private: /// Initialize the algorithm's data structures. - void initialize(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) { + void initialize(const ArrayRef &NodeSizes, + const ArrayRef &NodeCounts, + const ArrayRef &EdgeCounts) { // Initialize nodes AllNodes.reserve(NumNodes); for (uint64_t Idx = 0; Idx < NumNodes; Idx++) { uint64_t Size = std::max(NodeSizes[Idx], 1ULL); uint64_t ExecutionCount = NodeCounts[Idx]; - // The execution count of the entry node is set to at least one + // The execution count of the entry node is set to at least one. if (Idx == 0 && ExecutionCount == 0) ExecutionCount = 1; AllNodes.emplace_back(Idx, Size, ExecutionCount); @@ -582,21 +593,18 @@ private: PredNodes.resize(NumNodes); std::vector OutDegree(NumNodes, 0); AllJumps.reserve(EdgeCounts.size()); - for (auto It : EdgeCounts) { - uint64_t Pred = It.first.first; - uint64_t Succ = It.first.second; - OutDegree[Pred]++; - // Ignore self-edges - if (Pred == Succ) + for (auto Edge : EdgeCounts) { + ++OutDegree[Edge.src]; + // Ignore self-edges. + if (Edge.src == Edge.dst) continue; - SuccNodes[Pred].push_back(Succ); - PredNodes[Succ].push_back(Pred); - uint64_t ExecutionCount = It.second; - if (ExecutionCount > 0) { - NodeT &PredNode = AllNodes[Pred]; - NodeT &SuccNode = AllNodes[Succ]; - AllJumps.emplace_back(&PredNode, &SuccNode, ExecutionCount); + SuccNodes[Edge.src].push_back(Edge.dst); + PredNodes[Edge.dst].push_back(Edge.src); + if (Edge.count > 0) { + NodeT &PredNode = AllNodes[Edge.src]; + NodeT &SuccNode = AllNodes[Edge.dst]; + AllJumps.emplace_back(&PredNode, &SuccNode, Edge.count); SuccNode.InJumps.push_back(&AllJumps.back()); PredNode.OutJumps.push_back(&AllJumps.back()); } @@ -606,30 +614,29 @@ private: Jump.IsConditional = OutDegree[Jump.Source->Index] > 1; } - // Initialize chains + // Initialize chains. AllChains.reserve(NumNodes); HotChains.reserve(NumNodes); for (NodeT &Node : AllNodes) { AllChains.emplace_back(Node.Index, &Node); Node.CurChain = &AllChains.back(); - if (Node.ExecutionCount > 0) { + if (Node.ExecutionCount > 0) HotChains.push_back(&AllChains.back()); - } } - // Initialize chain edges + // Initialize chain edges. AllEdges.reserve(AllJumps.size()); for (NodeT &PredNode : AllNodes) { for (JumpT *Jump : PredNode.OutJumps) { NodeT *SuccNode = Jump->Target; ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain); - // this edge is already present in the graph + // this edge is already present in the graph. if (CurEdge != nullptr) { assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr); CurEdge->appendJump(Jump); continue; } - // this is a new edge + // this is a new edge. 
AllEdges.emplace_back(Jump); PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back()); SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back()); @@ -642,7 +649,7 @@ private: /// to B are from A. Such nodes should be adjacent in the optimal ordering; /// the method finds and merges such pairs of nodes. void mergeForcedPairs() { - // Find fallthroughs based on edge weights + // Find fallthroughs based on edge weights. for (NodeT &Node : AllNodes) { if (SuccNodes[Node.Index].size() == 1 && PredNodes[SuccNodes[Node.Index][0]].size() == 1 && @@ -669,12 +676,12 @@ private: } if (SuccNode == nullptr) continue; - // Break the cycle + // Break the cycle. AllNodes[Node.ForcedPred->Index].ForcedSucc = nullptr; Node.ForcedPred = nullptr; } - // Merge nodes with their fallthrough successors + // Merge nodes with their fallthrough successors. for (NodeT &Node : AllNodes) { if (Node.ForcedPred == nullptr && Node.ForcedSucc != nullptr) { const NodeT *CurBlock = &Node; @@ -689,7 +696,7 @@ private: /// Merge pairs of chains while improving the ExtTSP objective. void mergeChainPairs() { - /// Deterministically compare pairs of chains + /// Deterministically compare pairs of chains. auto compareChainPairs = [](const ChainT *A1, const ChainT *B1, const ChainT *A2, const ChainT *B2) { if (A1 != A2) @@ -701,21 +708,19 @@ private: ChainT *BestChainPred = nullptr; ChainT *BestChainSucc = nullptr; MergeGainT BestGain; - // Iterate over all pairs of chains + // Iterate over all pairs of chains. for (ChainT *ChainPred : HotChains) { - // Get candidates for merging with the current chain - for (auto EdgeIt : ChainPred->Edges) { - ChainT *ChainSucc = EdgeIt.first; - ChainEdge *Edge = EdgeIt.second; - // Ignore loop edges + // Get candidates for merging with the current chain. + for (const auto &[ChainSucc, Edge] : ChainPred->Edges) { + // Ignore loop edges. if (ChainPred == ChainSucc) continue; - // Stop early if the combined chain violates the maximum allowed size + // Stop early if the combined chain violates the maximum allowed size. if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize) continue; - // Compute the gain of merging the two chains + // Compute the gain of merging the two chains. MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge); if (CurGain.score() <= EPS) continue; @@ -731,11 +736,11 @@ private: } } - // Stop merging when there is no improvement + // Stop merging when there is no improvement. if (BestGain.score() <= EPS) break; - // Merge the best pair of chains + // Merge the best pair of chains. mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(), BestGain.mergeType()); } @@ -743,7 +748,7 @@ private: /// Merge remaining nodes into chains w/o taking jump counts into /// consideration. This allows to maintain the original node order in the - /// absence of profile data + /// absence of profile data. void mergeColdChains() { for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) { // Iterating in reverse order to make sure original fallthrough jumps are @@ -797,7 +802,7 @@ private: return Edge->getCachedMergeGain(ChainPred, ChainSucc); } - // Precompute jumps between ChainPred and ChainSucc + // Precompute jumps between ChainPred and ChainSucc. 
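    // Jumps internal to ChainPred (its self-edge, queried via
    // getEdge(ChainPred) below) are also collected: if ChainPred gets split
    // by the merge, the distance between their endpoints changes, so they
    // contribute to the gain as well.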
auto Jumps = Edge->jumps(); ChainEdge *EdgePP = ChainPred->getEdge(ChainPred); if (EdgePP != nullptr) { @@ -805,34 +810,34 @@ private: } assert(!Jumps.empty() && "trying to merge chains w/o jumps"); - // The object holds the best currently chosen gain of merging the two chains + // This object holds the best chosen gain of merging two chains. MergeGainT Gain = MergeGainT(); /// Given a merge offset and a list of merge types, try to merge two chains - /// and update Gain with a better alternative + /// and update Gain with a better alternative. auto tryChainMerging = [&](size_t Offset, const std::vector &MergeTypes) { - // Skip merging corresponding to concatenation w/o splitting + // Skip merging corresponding to concatenation w/o splitting. if (Offset == 0 || Offset == ChainPred->Nodes.size()) return; - // Skip merging if it breaks Forced successors + // Skip merging if it breaks Forced successors. NodeT *Node = ChainPred->Nodes[Offset - 1]; if (Node->ForcedSucc != nullptr) return; // Apply the merge, compute the corresponding gain, and update the best - // value, if the merge is beneficial + // value, if the merge is beneficial. for (const MergeTypeT &MergeType : MergeTypes) { Gain.updateIfLessThan( computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType)); } }; - // Try to concatenate two chains w/o splitting + // Try to concatenate two chains w/o splitting. Gain.updateIfLessThan( computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y)); if (EnableChainSplitAlongJumps) { - // Attach (a part of) ChainPred before the first node of ChainSucc + // Attach (a part of) ChainPred before the first node of ChainSucc. for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) { const NodeT *SrcBlock = Jump->Source; if (SrcBlock->CurChain != ChainPred) @@ -841,7 +846,7 @@ private: tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y}); } - // Attach (a part of) ChainPred after the last node of ChainSucc + // Attach (a part of) ChainPred after the last node of ChainSucc. for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) { const NodeT *DstBlock = Jump->Source; if (DstBlock->CurChain != ChainPred) @@ -851,12 +856,12 @@ private: } } - // Try to break ChainPred in various ways and concatenate with ChainSucc + // Try to break ChainPred in various ways and concatenate with ChainSucc. if (ChainPred->Nodes.size() <= ChainSplitThreshold) { for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) { // Try to split the chain in different ways. In practice, applying // X2_Y_X1 merging is almost never provides benefits; thus, we exclude - // it from consideration to reduce the search space + // it from consideration to reduce the search space. tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1, MergeTypeT::X2_X1_Y}); } @@ -875,12 +880,12 @@ private: auto MergedBlocks = mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType); - // Do not allow a merge that does not preserve the original entry point + // Do not allow a merge that does not preserve the original entry point. if ((ChainPred->isEntry() || ChainSucc->isEntry()) && !MergedBlocks.getFirstNode()->isEntry()) return MergeGainT(); - // The gain for the new chain + // The gain for the new chain. auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->Score; return MergeGainT(NewGainScore, MergeOffset, MergeType); } @@ -891,39 +896,39 @@ private: MergeTypeT MergeType) { assert(Into != From && "a chain cannot be merged with itself"); - // Merge the nodes + // Merge the nodes. 
    MergedChain MergedNodes =
        mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
    Into->merge(From, MergedNodes.getNodes());
 
-    // Merge the edges
+    // Merge the edges.
    Into->mergeEdges(From);
    From->clear();
 
-    // Update cached ext-tsp score for the new chain
+    // Update cached ext-tsp score for the new chain.
    ChainEdge *SelfEdge = Into->getEdge(Into);
    if (SelfEdge != nullptr) {
      MergedNodes = MergedChain(Into->Nodes.begin(), Into->Nodes.end());
      Into->Score = extTSPScore(MergedNodes, SelfEdge->jumps());
    }
 
-    // Remove the chain from the list of active chains
+    // Remove the chain from the list of active chains.
    llvm::erase_value(HotChains, From);
 
-    // Invalidate caches
+    // Invalidate caches.
    for (auto EdgeIt : Into->Edges)
      EdgeIt.second->invalidateCache();
  }
 
  /// Concatenate all chains into the final order.
-  void concatChains(std::vector<uint64_t> &Order) {
-    // Collect chains and calculate density stats for their sorting
+  std::vector<uint64_t> concatChains() {
+    // Collect chains and calculate density stats for their sorting.
    std::vector<ChainT *> SortedChains;
    DenseMap<const ChainT *, double> ChainDensity;
    for (ChainT &Chain : AllChains) {
      if (!Chain.Nodes.empty()) {
        SortedChains.push_back(&Chain);
-        // Using doubles to avoid overflow of ExecutionCounts
+        // Using doubles to avoid overflow of ExecutionCounts.
        double Size = 0;
        double ExecutionCount = 0;
        for (NodeT *Node : Chain.Nodes) {
@@ -935,27 +940,28 @@ private:
      }
    }
 
-    // Sorting chains by density in the decreasing order
-    std::stable_sort(SortedChains.begin(), SortedChains.end(),
-                     [&](const ChainT *L, const ChainT *R) {
-                       // Make sure the original entry point is at the
-                       // beginning of the order
-                       if (L->isEntry() != R->isEntry())
-                         return L->isEntry();
-
-                       const double DL = ChainDensity[L];
-                       const double DR = ChainDensity[R];
-                       // Compare by density and break ties by chain identifiers
-                       return (DL != DR) ? (DL > DR) : (L->Id < R->Id);
-                     });
-
-    // Collect the nodes in the order specified by their chains
+    // Sorting chains by density in the decreasing order.
+    std::sort(SortedChains.begin(), SortedChains.end(),
+              [&](const ChainT *L, const ChainT *R) {
+                // Place the entry point at the beginning of the order.
+                if (L->isEntry() != R->isEntry())
+                  return L->isEntry();
+
+                const double DL = ChainDensity[L];
+                const double DR = ChainDensity[R];
+                // Compare by density and break ties by chain identifiers.
+                return std::make_tuple(-DL, L->Id) <
+                       std::make_tuple(-DR, R->Id);
+              });
+
+    // Collect the nodes in the order specified by their chains.
+    std::vector<uint64_t> Order;
    Order.reserve(NumNodes);
-    for (const ChainT *Chain : SortedChains) {
-      for (NodeT *Node : Chain->Nodes) {
+    for (const ChainT *Chain : SortedChains)
+      for (NodeT *Node : Chain->Nodes)
        Order.push_back(Node->Index);
-      }
-    }
+    return Order;
  }
 
private:
@@ -984,61 +990,466 @@ private:
  std::vector<ChainT *> HotChains;
};
 
+/// The implementation of the Cache-Directed Sort (CDS) algorithm for ordering
+/// functions represented by a call graph.
+class CDSortImpl {
+public:
+  CDSortImpl(const CDSortConfig &Config, ArrayRef<uint64_t> NodeSizes,
+             ArrayRef<uint64_t> NodeCounts, ArrayRef<EdgeCount> EdgeCounts,
+             ArrayRef<uint64_t> EdgeOffsets)
+      : Config(Config), NumNodes(NodeSizes.size()) {
+    initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets);
+  }
+
+  /// Run the algorithm and return an ordered set of function clusters.
+  std::vector<uint64_t> run() {
+    // Merge pairs of chains while improving the objective.
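    // The objective combines the two locality terms computed in
    // computeMergeGain below:
    //   Gain = DistGain + Config.FrequencyScale * FreqGain
    // where FreqGain models cache misses through chain densities and
    // DistGain rewards short caller-callee distances, damped by
    // pow(Distance, -Config.DistancePower).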
+    mergeChainPairs();
+
+    LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number"
+                      << " of chains from " << NumNodes << " to "
+                      << HotChains.size() << "\n");
+
+    // Collect nodes from all the chains.
+    return concatChains();
+  }
+
+private:
+  /// Initialize the algorithm's data structures.
+  void initialize(const ArrayRef<uint64_t> &NodeSizes,
+                  const ArrayRef<uint64_t> &NodeCounts,
+                  const ArrayRef<EdgeCount> &EdgeCounts,
+                  const ArrayRef<uint64_t> &EdgeOffsets) {
+    // Initialize nodes.
+    AllNodes.reserve(NumNodes);
+    for (uint64_t Node = 0; Node < NumNodes; Node++) {
+      uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
+      uint64_t ExecutionCount = NodeCounts[Node];
+      AllNodes.emplace_back(Node, Size, ExecutionCount);
+      TotalSamples += ExecutionCount;
+      if (ExecutionCount > 0)
+        TotalSize += Size;
+    }
+
+    // Initialize jumps between the nodes.
+    SuccNodes.resize(NumNodes);
+    PredNodes.resize(NumNodes);
+    AllJumps.reserve(EdgeCounts.size());
+    for (size_t I = 0; I < EdgeCounts.size(); I++) {
+      auto [Pred, Succ, Count] = EdgeCounts[I];
+      // Ignore recursive calls.
+      if (Pred == Succ)
+        continue;
+
+      SuccNodes[Pred].push_back(Succ);
+      PredNodes[Succ].push_back(Pred);
+      if (Count > 0) {
+        NodeT &PredNode = AllNodes[Pred];
+        NodeT &SuccNode = AllNodes[Succ];
+        AllJumps.emplace_back(&PredNode, &SuccNode, Count);
+        AllJumps.back().Offset = EdgeOffsets[I];
+        SuccNode.InJumps.push_back(&AllJumps.back());
+        PredNode.OutJumps.push_back(&AllJumps.back());
+      }
+    }
+
+    // Initialize chains.
+    AllChains.reserve(NumNodes);
+    HotChains.reserve(NumNodes);
+    for (NodeT &Node : AllNodes) {
+      // Adjust execution counts.
+      Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount());
+      Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount());
+      // Create chain.
+      AllChains.emplace_back(Node.Index, &Node);
+      Node.CurChain = &AllChains.back();
+      if (Node.ExecutionCount > 0)
+        HotChains.push_back(&AllChains.back());
+    }
+
+    // Initialize chain edges.
+    AllEdges.reserve(AllJumps.size());
+    for (NodeT &PredNode : AllNodes) {
+      for (JumpT *Jump : PredNode.OutJumps) {
+        NodeT *SuccNode = Jump->Target;
+        ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
+        // this edge is already present in the graph.
+        if (CurEdge != nullptr) {
+          assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
+          CurEdge->appendJump(Jump);
+          continue;
+        }
+        // this is a new edge.
+        AllEdges.emplace_back(Jump);
+        PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
+        SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
+      }
+    }
+  }
+
+  /// Merge pairs of chains while there is an improvement in the objective.
+  void mergeChainPairs() {
+    // Create a priority queue containing all edges ordered by the merge gain.
+    auto GainComparator = [](ChainEdge *L, ChainEdge *R) {
+      return std::make_tuple(-L->gain(), L->srcChain()->Id, L->dstChain()->Id) <
+             std::make_tuple(-R->gain(), R->srcChain()->Id, R->dstChain()->Id);
+    };
+    std::set<ChainEdge *, decltype(GainComparator)> Queue(GainComparator);
+
+    // Insert the edges into the queue.
+    for (ChainT *ChainPred : HotChains) {
+      for (const auto &[Chain, Edge] : ChainPred->Edges) {
+        // Ignore self-edges.
+        if (Edge->isSelfEdge())
+          continue;
+        // Ignore already processed edges.
+        if (Edge->gain() != -1.0)
+          continue;
+
+        // Compute the gain of merging the two chains.
+        MergeGainT Gain = getBestMergeGain(Edge);
+        Edge->setMergeGain(Gain);
+
+        if (Edge->gain() > EPS)
+          Queue.insert(Edge);
+      }
+    }
+
+    // Merge the chains while the gain of merging is positive.
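    // A merge invalidates the cached gains of every edge incident to the two
    // chains involved, so the loop below erases those edges from the queue
    // before merging and re-inserts them with recomputed gains afterwards;
    // the self-edge and gain checks at the top of the loop drop entries that
    // went stale while queued.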
+ while (!Queue.empty()) { + // Extract the best (top) edge for merging. + ChainEdge *BestEdge = *Queue.begin(); + Queue.erase(Queue.begin()); + // Ignore self-edges. + if (BestEdge->isSelfEdge()) + continue; + // Ignore edges with non-positive gains. + if (BestEdge->gain() <= EPS) + continue; + + ChainT *BestSrcChain = BestEdge->srcChain(); + ChainT *BestDstChain = BestEdge->dstChain(); + + // Remove outdated edges from the queue. + for (const auto &[Chain, ChainEdge] : BestSrcChain->Edges) + Queue.erase(ChainEdge); + for (const auto &[Chain, ChainEdge] : BestDstChain->Edges) + Queue.erase(ChainEdge); + + // Merge the best pair of chains. + MergeGainT BestGain = BestEdge->getMergeGain(); + mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(), + BestGain.mergeType()); + + // Insert newly created edges into the queue. + for (const auto &[Chain, Edge] : BestSrcChain->Edges) { + // Ignore loop edges. + if (Edge->isSelfEdge()) + continue; + + // Compute the gain of merging the two chains. + MergeGainT Gain = getBestMergeGain(Edge); + Edge->setMergeGain(Gain); + + if (Edge->gain() > EPS) + Queue.insert(Edge); + } + } + } + + /// Compute the gain of merging two chains. + /// + /// The function considers all possible ways of merging two chains and + /// computes the one having the largest increase in ExtTSP objective. The + /// result is a pair with the first element being the gain and the second + /// element being the corresponding merging type. + MergeGainT getBestMergeGain(ChainEdge *Edge) const { + // Precompute jumps between ChainPred and ChainSucc. + auto Jumps = Edge->jumps(); + assert(!Jumps.empty() && "trying to merge chains w/o jumps"); + ChainT *SrcChain = Edge->srcChain(); + ChainT *DstChain = Edge->dstChain(); + + // This object holds the best currently chosen gain of merging two chains. + MergeGainT Gain = MergeGainT(); + + /// Given a list of merge types, try to merge two chains and update Gain + /// with a better alternative. + auto tryChainMerging = [&](const std::vector &MergeTypes) { + // Apply the merge, compute the corresponding gain, and update the best + // value, if the merge is beneficial. + for (const MergeTypeT &MergeType : MergeTypes) { + MergeGainT NewGain = + computeMergeGain(SrcChain, DstChain, Jumps, MergeType); + + // When forward and backward gains are the same, prioritize merging that + // preserves the original order of the functions in the binary. + if (std::abs(Gain.score() - NewGain.score()) < EPS) { + if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) || + (MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) { + Gain = NewGain; + } + } else if (NewGain.score() > Gain.score() + EPS) { + Gain = NewGain; + } + } + }; + + // Try to concatenate two chains w/o splitting. + tryChainMerging({MergeTypeT::X_Y, MergeTypeT::Y_X}); + + return Gain; + } + + /// Compute the score gain of merging two chains, respecting a given type. + /// + /// The two chains are not modified in the method. + MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc, + const std::vector &Jumps, + MergeTypeT MergeType) const { + // This doesn't depend on the ordering of the nodes + double FreqGain = freqBasedLocalityGain(ChainPred, ChainSucc); + + // Merge offset is always 0, as the chains are not split. 
+ size_t MergeOffset = 0; + auto MergedBlocks = + mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType); + double DistGain = distBasedLocalityGain(MergedBlocks, Jumps); + + double GainScore = DistGain + Config.FrequencyScale * FreqGain; + // Scale the result to increase the importance of merging short chains. + if (GainScore >= 0.0) + GainScore /= std::min(ChainPred->Size, ChainSucc->Size); + + return MergeGainT(GainScore, MergeOffset, MergeType); + } + + /// Compute the change of the frequency locality after merging the chains. + double freqBasedLocalityGain(ChainT *ChainPred, ChainT *ChainSucc) const { + auto missProbability = [&](double ChainDensity) { + double PageSamples = ChainDensity * Config.CacheSize; + if (PageSamples >= TotalSamples) + return 0.0; + double P = PageSamples / TotalSamples; + return pow(1.0 - P, static_cast(Config.CacheEntries)); + }; + + // Cache misses on the chains before merging. + double CurScore = + ChainPred->ExecutionCount * missProbability(ChainPred->density()) + + ChainSucc->ExecutionCount * missProbability(ChainSucc->density()); + + // Cache misses on the merged chain + double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount; + double MergedSize = ChainPred->Size + ChainSucc->Size; + double MergedDensity = static_cast(MergedCounts) / MergedSize; + double NewScore = MergedCounts * missProbability(MergedDensity); + + return CurScore - NewScore; + } + + /// Compute the distance locality for a jump / call. + double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const { + uint64_t Dist = SrcAddr <= DstAddr ? DstAddr - SrcAddr : SrcAddr - DstAddr; + double D = Dist == 0 ? 0.1 : static_cast(Dist); + return static_cast(Count) * std::pow(D, -Config.DistancePower); + } + + /// Compute the change of the distance locality after merging the chains. + double distBasedLocalityGain(const MergedChain &MergedBlocks, + const std::vector &Jumps) const { + if (Jumps.empty()) + return 0.0; + uint64_t CurAddr = 0; + MergedBlocks.forEach([&](const NodeT *Node) { + Node->EstimatedAddr = CurAddr; + CurAddr += Node->Size; + }); + + double CurScore = 0; + double NewScore = 0; + for (const JumpT *Arc : Jumps) { + uint64_t SrcAddr = Arc->Source->EstimatedAddr + Arc->Offset; + uint64_t DstAddr = Arc->Target->EstimatedAddr; + NewScore += distScore(SrcAddr, DstAddr, Arc->ExecutionCount); + CurScore += distScore(0, TotalSize, Arc->ExecutionCount); + } + return NewScore - CurScore; + } + + /// Merge chain From into chain Into, update the list of active chains, + /// adjacency information, and the corresponding cached values. + void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset, + MergeTypeT MergeType) { + assert(Into != From && "a chain cannot be merged with itself"); + + // Merge the nodes. + MergedChain MergedNodes = + mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType); + Into->merge(From, MergedNodes.getNodes()); + + // Merge the edges. + Into->mergeEdges(From); + From->clear(); + + // Remove the chain from the list of active chains. + llvm::erase_value(HotChains, From); + } + + /// Concatenate all chains into the final order. + std::vector concatChains() { + // Collect chains and calculate density stats for their sorting. + std::vector SortedChains; + DenseMap ChainDensity; + for (ChainT &Chain : AllChains) { + if (!Chain.Nodes.empty()) { + SortedChains.push_back(&Chain); + // Using doubles to avoid overflow of ExecutionCounts. 
+        double Size = 0;
+        double ExecutionCount = 0;
+        for (NodeT *Node : Chain.Nodes) {
+          Size += static_cast<double>(Node->Size);
+          ExecutionCount += static_cast<double>(Node->ExecutionCount);
+        }
+        assert(Size > 0 && "a chain of zero size");
+        ChainDensity[&Chain] = ExecutionCount / Size;
+      }
+    }
+
+    // Sort chains by density in the decreasing order.
+    std::sort(SortedChains.begin(), SortedChains.end(),
+              [&](const ChainT *L, const ChainT *R) {
+                const double DL = ChainDensity[L];
+                const double DR = ChainDensity[R];
+                // Compare by density and break ties by chain identifiers.
+                return std::make_tuple(-DL, L->Id) <
+                       std::make_tuple(-DR, R->Id);
+              });
+
+    // Collect the nodes in the order specified by their chains.
+    std::vector<uint64_t> Order;
+    Order.reserve(NumNodes);
+    for (const ChainT *Chain : SortedChains)
+      for (NodeT *Node : Chain->Nodes)
+        Order.push_back(Node->Index);
+    return Order;
+  }
+
+private:
+  /// Config for the algorithm.
+  const CDSortConfig Config;
+
+  /// The number of nodes in the graph.
+  const size_t NumNodes;
+
+  /// Successors of each node.
+  std::vector<std::vector<uint64_t>> SuccNodes;
+
+  /// Predecessors of each node.
+  std::vector<std::vector<uint64_t>> PredNodes;
+
+  /// All nodes (functions) in the graph.
+  std::vector<NodeT> AllNodes;
+
+  /// All jumps (function calls) between the nodes.
+  std::vector<JumpT> AllJumps;
+
+  /// All chains of nodes.
+  std::vector<ChainT> AllChains;
+
+  /// All edges between the chains.
+  std::vector<ChainEdge> AllEdges;
+
+  /// Active chains. The vector gets updated at runtime when chains are merged.
+  std::vector<ChainT *> HotChains;
+
+  /// The total number of samples in the graph.
+  uint64_t TotalSamples{0};
+
+  /// The total size of the nodes in the graph.
+  uint64_t TotalSize{0};
+};
+
 } // end of anonymous namespace
 
 std::vector<uint64_t>
-llvm::applyExtTspLayout(const std::vector<uint64_t> &NodeSizes,
-                        const std::vector<uint64_t> &NodeCounts,
-                        const std::vector<EdgeCountT> &EdgeCounts) {
-  // Verify correctness of the input data
+codelayout::computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,
+                                ArrayRef<uint64_t> NodeCounts,
+                                ArrayRef<EdgeCount> EdgeCounts) {
+  // Verify correctness of the input data.
   assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
   assert(NodeSizes.size() > 2 && "Incorrect input");
 
-  // Apply the reordering algorithm
+  // Apply the reordering algorithm.
   ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts);
-  std::vector<uint64_t> Result;
-  Alg.run(Result);
+  std::vector<uint64_t> Result = Alg.run();
 
-  // Verify correctness of the output
+  // Verify correctness of the output.
   assert(Result.front() == 0 && "Original entry point is not preserved");
   assert(Result.size() == NodeSizes.size() && "Incorrect size of layout");
   return Result;
 }
 
-double llvm::calcExtTspScore(const std::vector<uint64_t> &Order,
-                             const std::vector<uint64_t> &NodeSizes,
-                             const std::vector<uint64_t> &NodeCounts,
-                             const std::vector<EdgeCountT> &EdgeCounts) {
-  // Estimate addresses of the blocks in memory
+double codelayout::calcExtTspScore(ArrayRef<uint64_t> Order,
+                                   ArrayRef<uint64_t> NodeSizes,
+                                   ArrayRef<uint64_t> NodeCounts,
+                                   ArrayRef<EdgeCount> EdgeCounts) {
+  // Estimate addresses of the blocks in memory.
   std::vector<uint64_t> Addr(NodeSizes.size(), 0);
   for (size_t Idx = 1; Idx < Order.size(); Idx++) {
     Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
   }
   std::vector<uint64_t> OutDegree(NodeSizes.size(), 0);
-  for (auto It : EdgeCounts) {
-    uint64_t Pred = It.first.first;
-    OutDegree[Pred]++;
-  }
+  for (auto Edge : EdgeCounts)
+    ++OutDegree[Edge.src];
 
-  // Increase the score for each jump
+  // Increase the score for each jump.
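  // Each jump contributes extTSPScore(SrcAddr, SrcSize, DstAddr, Count,
  // IsConditional): a fallthrough earns the count times a fallthrough weight,
  // a short forward or backward jump earns the count times a distance-damped
  // weight, and jumps beyond the distance thresholds earn nothing.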
double Score = 0; - for (auto It : EdgeCounts) { - uint64_t Pred = It.first.first; - uint64_t Succ = It.first.second; - uint64_t Count = It.second; - bool IsConditional = OutDegree[Pred] > 1; - Score += ::extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count, - IsConditional); + for (auto Edge : EdgeCounts) { + bool IsConditional = OutDegree[Edge.src] > 1; + Score += ::extTSPScore(Addr[Edge.src], NodeSizes[Edge.src], Addr[Edge.dst], + Edge.count, IsConditional); } return Score; } -double llvm::calcExtTspScore(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) { +double codelayout::calcExtTspScore(ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts) { std::vector Order(NodeSizes.size()); for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) { Order[Idx] = Idx; } return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts); } + +std::vector codelayout::computeCacheDirectedLayout( + const CDSortConfig &Config, ArrayRef FuncSizes, + ArrayRef FuncCounts, ArrayRef CallCounts, + ArrayRef CallOffsets) { + // Verify correctness of the input data. + assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input"); + + // Apply the reordering algorithm. + CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets); + std::vector Result = Alg.run(); + assert(Result.size() == FuncSizes.size() && "Incorrect size of layout"); + return Result; +} + +std::vector codelayout::computeCacheDirectedLayout( + ArrayRef FuncSizes, ArrayRef FuncCounts, + ArrayRef CallCounts, ArrayRef CallOffsets) { + CDSortConfig Config; + // Populate the config from the command-line options. + if (CacheEntries.getNumOccurrences() > 0) + Config.CacheEntries = CacheEntries; + if (CacheSize.getNumOccurrences() > 0) + Config.CacheSize = CacheSize; + if (DistancePower.getNumOccurrences() > 0) + Config.DistancePower = DistancePower; + if (FrequencyScale.getNumOccurrences() > 0) + Config.FrequencyScale = FrequencyScale; + return computeCacheDirectedLayout(Config, FuncSizes, FuncCounts, CallCounts, + CallOffsets); +} diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 00dd5206d75a756005e663bc2400faf3811e4793..8e17f53c62c6ef9f83061f4246eb00396ed4d0dd 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -501,7 +501,7 @@ public: auto FixedAI = std::move(AI); FixedAI.MappingBase -= DeltaAddr; for (auto &Seg : FixedAI.Segments) - Seg.AG = {MemProt::Read | MemProt::Write, Seg.AG.getMemLifetimePolicy()}; + Seg.AG = {MemProt::Read | MemProt::Write, Seg.AG.getMemLifetime()}; FixedAI.Actions.clear(); InProcessMemoryMapper::initialize( FixedAI, [this, OnInitialized = std::move(OnInitialized)]( diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp index 28ecf1af321f1eda338155de5c0f982c6ca77efc..abaf1e123145cd25ab051c07b2054065f2499645 100644 --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -581,9 +581,14 @@ int main(int argc, char **argv) { // the facility for updating public visibility to linkage unit visibility when // specified by an internal option. This is normally done during LTO which is // not performed via opt. - updateVCallVisibilityInModule(*M, - /* WholeProgramVisibilityEnabledInLTO */ false, - /* DynamicExportSymbols */ {}); + updateVCallVisibilityInModule( + *M, + /*WholeProgramVisibilityEnabledInLTO=*/false, + // FIXME: These need linker information via a + // TBD new interface. 
+ /*DynamicExportSymbols=*/{}, + /*ValidateAllVtablesHaveTypeInfos=*/false, + /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); // Figure out what stream we are supposed to write to... std::unique_ptr Out; diff --git a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp index ff153f6d4b32766c0924604ac1fbcbb8934db08f..711f35fc7683c3190e785d0cf5ea5f0d0f65a9c8 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp @@ -798,7 +798,7 @@ TEST(LinkGraphTest, BasicLayoutHonorsNoAlloc) { // Create a NoAlloc section and block. auto &Sec2 = G.createSection("__metadata", orc::MemProt::Read | orc::MemProt::Write); - Sec2.setMemLifetimePolicy(orc::MemLifetimePolicy::NoAlloc); + Sec2.setMemLifetime(orc::MemLifetime::NoAlloc); G.createContentBlock(Sec2, BlockContent.slice(0, 8), orc::ExecutorAddr(), 8, 0);
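For reference, a minimal sketch of driving the new CDS entry point directly;
the call-graph data is hypothetical and only the declarations this patch adds
to llvm/include/llvm/Transforms/Utils/CodeLayout.h are assumed:

  #include "llvm/Transforms/Utils/CodeLayout.h"
  using namespace llvm;

  std::vector<uint64_t> orderThreeFunctions() {
    // Functions 0..2; calls: 0->1 (hot, offset 4) and 0->2 (cold, offset 8).
    std::vector<uint64_t> FuncSizes = {32, 16, 64};
    std::vector<uint64_t> FuncCounts = {100, 95, 5};
    std::vector<codelayout::EdgeCount> CallCounts = {{0, 1, 95}, {0, 2, 5}};
    std::vector<uint64_t> CallOffsets = {4, 8};
    // Returns a permutation of {0, 1, 2} co-locating hot caller-callee pairs.
    return codelayout::computeCacheDirectedLayout(FuncSizes, FuncCounts,
                                                  CallCounts, CallOffsets);
  }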