diff --git a/.gitattributes b/.gitattributes index 05a0e946187b8160d0c54c23a9f8100f44e0f43b..c9a618551d7d788cde5f4326178966a3917277e9 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1 @@ -*.xz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text diff --git a/0001-Fix-trap-value-for-non-X86.patch b/0001-Fix-trap-value-for-non-X86.patch deleted file mode 100644 index 83542e4a2372b0dafd3cff6cb91677e6335a9e1d..0000000000000000000000000000000000000000 --- a/0001-Fix-trap-value-for-non-X86.patch +++ /dev/null @@ -1,126 +0,0 @@ -From 868d8c360b3e1e5f291cb3e0dae0777a4529228f Mon Sep 17 00:00:00 2001 -From: Denis Revunov -Date: Thu, 27 Jul 2023 11:48:08 -0400 -Subject: [PATCH] Fix trap value for non-X86 - -The trap value used by BOLT was assumed to be single-byte instruction. -It made some functions unaligned on AArch64(e.g exceptions-instrumentation test) -and caused emission failures. Fix that by changing fill value to StringRef. - -Reviewed By: rafauler - -Differential Revision: https://reviews.llvm.org/D158191 ---- - bolt/include/bolt/Core/MCPlusBuilder.h | 9 ++++++--- - bolt/lib/Core/BinaryEmitter.cpp | 4 ++-- - bolt/lib/Rewrite/RewriteInstance.cpp | 6 ++++-- - bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 4 ++++ - bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp | 4 ++++ - bolt/lib/Target/X86/X86MCPlusBuilder.cpp | 2 +- - 6 files changed, 21 insertions(+), 8 deletions(-) - -diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h -index 56d0228cd..beb06751d 100644 ---- a/bolt/include/bolt/Core/MCPlusBuilder.h -+++ b/bolt/include/bolt/Core/MCPlusBuilder.h -@@ -636,9 +636,12 @@ public: - return false; - } - -- /// If non-zero, this is used to fill the executable space with instructions -- /// that will trap. Defaults to 0. -- virtual unsigned getTrapFillValue() const { return 0; } -+ /// Used to fill the executable space with instructions -+ /// that will trap. -+ virtual StringRef getTrapFillValue() const { -+ llvm_unreachable("not implemented"); -+ return StringRef(); -+ } - - /// Interface and basic functionality of a MCInstMatcher. The idea is to make - /// it easy to match one or more MCInsts against a tree-like pattern and -diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp -index c4129615a..df076c81d 100644 ---- a/bolt/lib/Core/BinaryEmitter.cpp -+++ b/bolt/lib/Core/BinaryEmitter.cpp -@@ -376,7 +376,7 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function, - } - - if (opts::MarkFuncs) -- Streamer.emitIntValue(BC.MIB->getTrapFillValue(), 1); -+ Streamer.emitBytes(BC.MIB->getTrapFillValue()); - - // Emit CFI end - if (Function.hasCFI()) -@@ -420,7 +420,7 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF, - // case, the call site entries in that LSDA have 0 as offset to the landing - // pad, which the runtime interprets as "no handler". To prevent this, - // insert some padding. -- Streamer.emitIntValue(BC.MIB->getTrapFillValue(), 1); -+ Streamer.emitBytes(BC.MIB->getTrapFillValue()); - } - - // Track the first emitted instruction with debug info. -diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp -index fe8c134b8..c6ea0b009 100644 ---- a/bolt/lib/Rewrite/RewriteInstance.cpp -+++ b/bolt/lib/Rewrite/RewriteInstance.cpp -@@ -5273,8 +5273,10 @@ void RewriteInstance::rewriteFile() { - if (!BF.getFileOffset() || !BF.isEmitted()) - continue; - OS.seek(BF.getFileOffset()); -- for (unsigned I = 0; I < BF.getMaxSize(); ++I) -- OS.write((unsigned char)BC->MIB->getTrapFillValue()); -+ StringRef TrapInstr = BC->MIB->getTrapFillValue(); -+ unsigned NInstr = BF.getMaxSize() / TrapInstr.size(); -+ for (unsigned I = 0; I < NInstr; ++I) -+ OS.write(TrapInstr.data(), TrapInstr.size()); - } - OS.seek(SavedPos); - } -diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp -index acf21ba23..cd66b654e 100644 ---- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp -+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp -@@ -1142,6 +1142,10 @@ public: - } - } - -+ StringRef getTrapFillValue() const override { -+ return StringRef("\0\0\0\0", 4); -+ } -+ - bool createReturn(MCInst &Inst) const override { - Inst.setOpcode(AArch64::RET); - Inst.clear(); -diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp -index ec5bca852..badc1bde8 100644 ---- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp -+++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp -@@ -171,6 +171,10 @@ public: - return true; - } - -+ StringRef getTrapFillValue() const override { -+ return StringRef("\0\0\0\0", 4); -+ } -+ - bool analyzeBranch(InstructionIterator Begin, InstructionIterator End, - const MCSymbol *&TBB, const MCSymbol *&FBB, - MCInst *&CondBranch, -diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp -index 3ee161d0b..5e3c01a1c 100644 ---- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp -+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp -@@ -397,7 +397,7 @@ public: - } - } - -- unsigned getTrapFillValue() const override { return 0xCC; } -+ StringRef getTrapFillValue() const override { return StringRef("\314", 1); } - - struct IndJmpMatcherFrag1 : MCInstMatcher { - std::unique_ptr Base; --- -2.33.0 - diff --git a/0002-Add-test-for-emitting-trap-value.patch b/0002-Add-test-for-emitting-trap-value.patch deleted file mode 100644 index 8cc1c6d8308dc848072e0b6be000f9fc12c96068..0000000000000000000000000000000000000000 --- a/0002-Add-test-for-emitting-trap-value.patch +++ /dev/null @@ -1,44 +0,0 @@ -From e4ae238a42296a84bc819dd1fb61f3c699952f17 Mon Sep 17 00:00:00 2001 -From: Denis Revunov -Date: Thu, 17 Aug 2023 18:30:07 +0300 -Subject: [PATCH] Add test for emitting trap value - -Reviewed By: rafauler - -Differential Revision: https://reviews.llvm.org/D158191 ---- - bolt/test/runtime/mark-funcs.c | 22 ++++++++++++++++++++++ - 1 file changed, 22 insertions(+) - create mode 100644 bolt/test/runtime/mark-funcs.c - -diff --git a/bolt/test/runtime/mark-funcs.c b/bolt/test/runtime/mark-funcs.c -new file mode 100644 -index 000000000..a8586ca8b ---- /dev/null -+++ b/bolt/test/runtime/mark-funcs.c -@@ -0,0 +1,22 @@ -+#include -+ -+int dummy() { -+ printf("Dummy called\n"); -+ return 0; -+} -+ -+int main(int argc, char **argv) { -+ if (dummy() != 0) -+ return 1; -+ printf("Main called\n"); -+ return 0; -+} -+// Check that emitting trap value works properly and -+// does not break functions -+// REQUIRES: system-linux -+// RUN: %clangxx -Wl,-q %s -o %t.exe -+// RUN: %t.exe | FileCheck %s -+// CHECK: Dummy called -+// CHECK-NEXT: Main called -+// RUN: llvm-bolt %t.exe -o %t.exe.bolt -lite=false --mark-funcs -+// RUN: %t.exe.bolt | FileCheck %s --- -2.33.0 - diff --git a/0003-AArch64-Add-AArch64-support-for-inline.patch b/0003-AArch64-Add-AArch64-support-for-inline.patch index cb64595fbbeddb127f769c8facf7676b1178cdf9..521f3e52fcce63dd13c858438960a95a6aa35745 100644 --- a/0003-AArch64-Add-AArch64-support-for-inline.patch +++ b/0003-AArch64-Add-AArch64-support-for-inline.patch @@ -75,7 +75,7 @@ index 8dcb8934f..67dd294fb 100644 + } + } + } -+ if (skip) ++ if (skip) + break; + } + if (skip) { @@ -112,9 +112,9 @@ index d109a5d52..acf21ba23 100644 + return true; + } + - bool createTailCall(MCInst &Inst, const MCSymbol *Target, - MCContext *Ctx) override { - Inst.setOpcode(AArch64::B); + InstructionListType createCmpJE(MCPhysReg RegNo, int64_t Imm, + const MCSymbol *Target, + MCContext *Ctx) const override { diff --git a/bolt/test/AArch64/Inputs/inline-foo.c b/bolt/test/AArch64/Inputs/inline-foo.c new file mode 100644 index 000000000..1307c13f2 diff --git a/0006-AArch64-Add-CFG-block-count-correction-optimization.patch b/0006-AArch64-Add-CFG-block-count-correction-optimization.patch index b90b76d9461c27694516afc455562b5889485b0b..00cd7046a2070e71cc17fc0efc18d7eea9c6aac0 100644 --- a/0006-AArch64-Add-CFG-block-count-correction-optimization.patch +++ b/0006-AArch64-Add-CFG-block-count-correction-optimization.patch @@ -867,8 +867,8 @@ index a4612fb93..f93147d39 100644 --- a/bolt/lib/Core/CMakeLists.txt +++ b/bolt/lib/Core/CMakeLists.txt @@ -12,6 +12,7 @@ set(LLVM_LINK_COMPONENTS - add_llvm_library(LLVMBOLTCore + AddressMap.cpp BinaryBasicBlock.cpp + BinaryBasicBlockFeature.cpp BinaryContext.cpp @@ -1086,7 +1086,7 @@ index 000000000..d93aef648 + unsigned NumIndirectCalls{0}; + + for (auto &Inst : BB) { -+ if (BC.MIB->isLoad(Inst)) { ++ if (BC.MIB->mayLoad(Inst)) { + ++NumLoads; + } else if (BC.MIB->isCall(Inst)) { + ++NumCalls; @@ -1548,7 +1548,7 @@ index 000000000..585dbcae2 + for (auto &Inst : BB) { + if (BC.MIB->isCall(Inst)) + CallSet.insert(&BB); -+ else if (BC.MIB->isStore(Inst)) ++ else if (BC.MIB->mayStore(Inst)) + StoreSet.insert(&BB); + } + } diff --git a/0009-support-aarch64-instrumentation.patch b/0009-support-aarch64-instrumentation.patch deleted file mode 100644 index df97a14f08116a4370e2475b2b2403f3b5505d24..0000000000000000000000000000000000000000 --- a/0009-support-aarch64-instrumentation.patch +++ /dev/null @@ -1,2630 +0,0 @@ -From a7d826d3985dd886523df050949f1c3c151df636 Mon Sep 17 00:00:00 2001 -From: rfwang07 -Date: Thu, 31 Oct 2024 15:34:10 +0800 -Subject: [PATCH] support aarch64 instrumentation - ---- - bolt/CMakeLists.txt | 6 +- - bolt/include/bolt/Core/MCPlusBuilder.h | 24 +- - bolt/lib/Core/BinaryFunction.cpp | 6 + - bolt/lib/Passes/Instrumentation.cpp | 28 +- - bolt/lib/Passes/MCF.cpp | 1 + - bolt/lib/Passes/TailDuplication.cpp | 2 +- - .../Target/AArch64/AArch64MCPlusBuilder.cpp | 446 +++++++++++++++++- - bolt/lib/Target/X86/X86MCPlusBuilder.cpp | 67 +-- - bolt/runtime/CMakeLists.txt | 12 +- - bolt/runtime/common.h | 417 ++-------------- - bolt/runtime/instr.cpp | 61 ++- - bolt/runtime/sys_aarch64.h | 394 ++++++++++++++++ - bolt/runtime/sys_x86_64.h | 360 ++++++++++++++ - bolt/test/AArch64/exclusive-instrument.s | 39 ++ - bolt/test/X86/asm-dump.c | 5 +- - ...olt-address-translation-internal-call.test | 9 +- - .../test/X86/instrumentation-eh_frame_hdr.cpp | 2 +- - bolt/test/X86/internal-call-instrument.s | 24 +- - bolt/test/X86/tail-duplication-pass.s | 9 + - bolt/test/assume-abi.test | 7 + - .../AArch64/Inputs/basic-instrumentation.s | 9 + - .../AArch64/basic-instrumentation.test | 22 + - .../AArch64/instrumentation-ind-call.c | 38 ++ - .../{X86 => }/Inputs/exceptions_split.cpp | 16 +- - .../runtime/X86/instrumentation-tail-call.s | 6 +- - .../{X86 => }/exceptions-instrumentation.test | 0 - .../{X86 => }/pie-exceptions-split.test | 4 +- - 27 files changed, 1545 insertions(+), 469 deletions(-) - create mode 100644 bolt/runtime/sys_aarch64.h - create mode 100644 bolt/runtime/sys_x86_64.h - create mode 100644 bolt/test/AArch64/exclusive-instrument.s - create mode 100644 bolt/test/assume-abi.test - create mode 100644 bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s - create mode 100644 bolt/test/runtime/AArch64/basic-instrumentation.test - create mode 100644 bolt/test/runtime/AArch64/instrumentation-ind-call.c - rename bolt/test/runtime/{X86 => }/Inputs/exceptions_split.cpp (85%) - rename bolt/test/runtime/{X86 => }/exceptions-instrumentation.test (100%) - rename bolt/test/runtime/{X86 => }/pie-exceptions-split.test (95%) - -diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt -index 4ff90c1..89462f8 100644 ---- a/bolt/CMakeLists.txt -+++ b/bolt/CMakeLists.txt -@@ -32,10 +32,10 @@ foreach (tgt ${BOLT_TARGETS_TO_BUILD}) - endforeach() - - set(BOLT_ENABLE_RUNTIME_default OFF) --if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" -+if ((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" -+ OR CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") - AND (CMAKE_SYSTEM_NAME STREQUAL "Linux" -- OR CMAKE_SYSTEM_NAME STREQUAL "Darwin") -- AND "X86" IN_LIST BOLT_TARGETS_TO_BUILD) -+ OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")) - set(BOLT_ENABLE_RUNTIME_default ON) - endif() - option(BOLT_ENABLE_RUNTIME "Enable BOLT runtime" ${BOLT_ENABLE_RUNTIME_default}) -diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h -index beb0675..e6945c9 100644 ---- a/bolt/include/bolt/Core/MCPlusBuilder.h -+++ b/bolt/include/bolt/Core/MCPlusBuilder.h -@@ -498,9 +498,9 @@ public: - } - - /// Create increment contents of target by 1 for Instrumentation -- virtual InstructionListType createInstrIncMemory(const MCSymbol *Target, -- MCContext *Ctx, -- bool IsLeaf) const { -+ virtual InstructionListType -+ createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf, -+ unsigned CodePointerSize) const { - llvm_unreachable("not implemented"); - return InstructionListType(); - } -@@ -620,6 +620,11 @@ public: - return false; - } - -+ virtual bool isAArch64Exclusive(const MCInst &Inst) const { -+ llvm_unreachable("not implemented"); -+ return false; -+ } -+ - virtual bool isCleanRegXOR(const MCInst &Inst) const { - llvm_unreachable("not implemented"); - return false; -@@ -1597,18 +1602,11 @@ public: - return false; - } - -- virtual void createLoadImmediate(MCInst &Inst, const MCPhysReg Dest, -- uint32_t Imm) const { -+ virtual InstructionListType createLoadImmediate(const MCPhysReg Dest, -+ uint64_t Imm) const { - llvm_unreachable("not implemented"); - } - -- /// Create instruction to increment contents of target by 1 -- virtual bool createIncMemory(MCInst &Inst, const MCSymbol *Target, -- MCContext *Ctx) const { -- llvm_unreachable("not implemented"); -- return false; -- } -- - /// Create a fragment of code (sequence of instructions) that load a 32-bit - /// address from memory, zero-extends it to 64 and jump to it (indirect jump). - virtual bool -@@ -1969,7 +1967,7 @@ public: - } - - virtual InstructionListType createSymbolTrampoline(const MCSymbol *TgtSym, -- MCContext *Ctx) const { -+ MCContext *Ctx) { - llvm_unreachable("not implemented"); - return InstructionListType(); - } -diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp -index 5b44a76..b79bd58 100644 ---- a/bolt/lib/Core/BinaryFunction.cpp -+++ b/bolt/lib/Core/BinaryFunction.cpp -@@ -2305,6 +2305,12 @@ void BinaryFunction::removeConditionalTailCalls() { - - // This branch is no longer a conditional tail call. - BC.MIB->unsetConditionalTailCall(*CTCInstr); -+ -+ // Move offset from CTCInstr to TailCallInstr. -+ if (std::optional Offset = BC.MIB->getOffset(*CTCInstr)) { -+ BC.MIB->setOffset(TailCallInstr, *Offset); -+ BC.MIB->clearOffset(*CTCInstr); -+ } - } - - insertBasicBlocks(std::prev(end()), std::move(NewBlocks), -diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp -index fae6770..72adb31 100644 ---- a/bolt/lib/Passes/Instrumentation.cpp -+++ b/bolt/lib/Passes/Instrumentation.cpp -@@ -13,6 +13,7 @@ - #include "bolt/Passes/Instrumentation.h" - #include "bolt/Core/ParallelUtilities.h" - #include "bolt/RuntimeLibs/InstrumentationRuntimeLibrary.h" -+#include "bolt/Utils/CommandLineOpts.h" - #include "bolt/Utils/Utils.h" - #include "llvm/Support/CommandLine.h" - #include "llvm/Support/RWMutex.h" -@@ -85,6 +86,24 @@ cl::opt InstrumentCalls("instrument-calls", - namespace llvm { - namespace bolt { - -+static bool hasAArch64ExclusiveMemop(BinaryFunction &Function) { -+ // FIXME ARMv8-a architecture reference manual says that software must avoid -+ // having any explicit memory accesses between exclusive load and associated -+ // store instruction. So for now skip instrumentation for functions that have -+ // these instructions, since it might lead to runtime deadlock. -+ BinaryContext &BC = Function.getBinaryContext(); -+ for (const BinaryBasicBlock &BB : Function) -+ for (const MCInst &Inst : BB) -+ if (BC.MIB->isAArch64Exclusive(Inst)) { -+ if (opts::Verbosity >= 1) -+ outs() << "BOLT-INSTRUMENTER: Function " << Function -+ << " has exclusive instructions, skip instrumentation\n"; -+ return true; -+ } -+ -+ return false; -+} -+ - uint32_t Instrumentation::getFunctionNameIndex(const BinaryFunction &Function) { - auto Iter = FuncToStringIdx.find(&Function); - if (Iter != FuncToStringIdx.end()) -@@ -176,7 +195,8 @@ Instrumentation::createInstrumentationSnippet(BinaryContext &BC, bool IsLeaf) { - auto L = BC.scopeLock(); - MCSymbol *Label = BC.Ctx->createNamedTempSymbol("InstrEntry"); - Summary->Counters.emplace_back(Label); -- return BC.MIB->createInstrIncMemory(Label, BC.Ctx.get(), IsLeaf); -+ return BC.MIB->createInstrIncMemory(Label, BC.Ctx.get(), IsLeaf, -+ BC.AsmInfo->getCodePointerSize()); - } - - // Helper instruction sequence insertion function -@@ -287,6 +307,9 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function, - if (BC.isMachO() && Function.hasName("___GLOBAL_init_65535/1")) - return; - -+ if (BC.isAArch64() && hasAArch64ExclusiveMemop(Function)) -+ return; -+ - SplitWorklistTy SplitWorklist; - SplitInstrsTy SplitInstrs; - -@@ -504,9 +527,6 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function, - } - - void Instrumentation::runOnFunctions(BinaryContext &BC) { -- if (!BC.isX86()) -- return; -- - const unsigned Flags = BinarySection::getFlags(/*IsReadOnly=*/false, - /*IsText=*/false, - /*IsAllocatable=*/true); -diff --git a/bolt/lib/Passes/MCF.cpp b/bolt/lib/Passes/MCF.cpp -index ec04012..c3898d2 100644 ---- a/bolt/lib/Passes/MCF.cpp -+++ b/bolt/lib/Passes/MCF.cpp -@@ -262,6 +262,7 @@ bool guessPredEdgeCounts(BinaryBasicBlock *BB, ArcSet &GuessedArcs) { - continue; - - Pred->getBranchInfo(*BB).Count = Guessed; -+ GuessedArcs.insert(std::make_pair(Pred, BB)); - return true; - } - llvm_unreachable("Expected unguessed arc"); -diff --git a/bolt/lib/Passes/TailDuplication.cpp b/bolt/lib/Passes/TailDuplication.cpp -index c04efd7..7141d5d 100644 ---- a/bolt/lib/Passes/TailDuplication.cpp -+++ b/bolt/lib/Passes/TailDuplication.cpp -@@ -303,7 +303,7 @@ TailDuplication::aggressiveDuplicate(BinaryBasicBlock &BB, - if (isInCacheLine(BB, Tail)) - return BlocksToDuplicate; - -- BinaryBasicBlock *CurrBB = &BB; -+ BinaryBasicBlock *CurrBB = &Tail; - while (CurrBB) { - LLVM_DEBUG(dbgs() << "Aggressive tail duplication: adding " - << CurrBB->getName() << " to duplication list\n";); -diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp -index cd66b65..3f6497e 100644 ---- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp -+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp -@@ -16,6 +16,9 @@ - #include "Utils/AArch64BaseInfo.h" - #include "bolt/Core/MCPlusBuilder.h" - #include "llvm/BinaryFormat/ELF.h" -+#include "llvm/MC/MCContext.h" -+#include "llvm/MC/MCFixupKindInfo.h" -+#include "llvm/MC/MCInstBuilder.h" - #include "llvm/MC/MCInstrInfo.h" - #include "llvm/MC/MCRegisterInfo.h" - #include "llvm/Support/Debug.h" -@@ -28,6 +31,100 @@ using namespace bolt; - - namespace { - -+static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) { -+ Inst.setOpcode(AArch64::MRS); -+ Inst.clear(); -+ Inst.addOperand(MCOperand::createReg(RegName)); -+ Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV)); -+} -+ -+static void setSystemFlag(MCInst &Inst, MCPhysReg RegName) { -+ Inst.setOpcode(AArch64::MSR); -+ Inst.clear(); -+ Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV)); -+ Inst.addOperand(MCOperand::createReg(RegName)); -+} -+ -+static void createPushRegisters(MCInst &Inst, MCPhysReg Reg1, MCPhysReg Reg2) { -+ Inst.clear(); -+ unsigned NewOpcode = AArch64::STPXpre; -+ Inst.setOpcode(NewOpcode); -+ Inst.addOperand(MCOperand::createReg(AArch64::SP)); -+ Inst.addOperand(MCOperand::createReg(Reg1)); -+ Inst.addOperand(MCOperand::createReg(Reg2)); -+ Inst.addOperand(MCOperand::createReg(AArch64::SP)); -+ Inst.addOperand(MCOperand::createImm(-2)); -+} -+ -+static void createPopRegisters(MCInst &Inst, MCPhysReg Reg1, MCPhysReg Reg2) { -+ Inst.clear(); -+ unsigned NewOpcode = AArch64::LDPXpost; -+ Inst.setOpcode(NewOpcode); -+ Inst.addOperand(MCOperand::createReg(AArch64::SP)); -+ Inst.addOperand(MCOperand::createReg(Reg1)); -+ Inst.addOperand(MCOperand::createReg(Reg2)); -+ Inst.addOperand(MCOperand::createReg(AArch64::SP)); -+ Inst.addOperand(MCOperand::createImm(2)); -+} -+ -+static void loadReg(MCInst &Inst, MCPhysReg To, MCPhysReg From) { -+ Inst.setOpcode(AArch64::LDRXui); -+ Inst.clear(); -+ if (From == AArch64::SP) { -+ Inst.setOpcode(AArch64::LDRXpost); -+ Inst.addOperand(MCOperand::createReg(From)); -+ Inst.addOperand(MCOperand::createReg(To)); -+ Inst.addOperand(MCOperand::createReg(From)); -+ Inst.addOperand(MCOperand::createImm(16)); -+ } else { -+ Inst.addOperand(MCOperand::createReg(To)); -+ Inst.addOperand(MCOperand::createReg(From)); -+ Inst.addOperand(MCOperand::createImm(0)); -+ } -+} -+ -+static void storeReg(MCInst &Inst, MCPhysReg From, MCPhysReg To) { -+ Inst.setOpcode(AArch64::STRXui); -+ Inst.clear(); -+ if (To == AArch64::SP) { -+ Inst.setOpcode(AArch64::STRXpre); -+ Inst.addOperand(MCOperand::createReg(To)); -+ Inst.addOperand(MCOperand::createReg(From)); -+ Inst.addOperand(MCOperand::createReg(To)); -+ Inst.addOperand(MCOperand::createImm(-16)); -+ } else { -+ Inst.addOperand(MCOperand::createReg(From)); -+ Inst.addOperand(MCOperand::createReg(To)); -+ Inst.addOperand(MCOperand::createImm(0)); -+ } -+} -+ -+static void atomicAdd(MCInst &Inst, MCPhysReg RegTo, MCPhysReg RegCnt) { -+ // NOTE: Supports only ARM with LSE extension -+ Inst.setOpcode(AArch64::LDADDX); -+ Inst.clear(); -+ Inst.addOperand(MCOperand::createReg(AArch64::XZR)); -+ Inst.addOperand(MCOperand::createReg(RegCnt)); -+ Inst.addOperand(MCOperand::createReg(RegTo)); -+} -+ -+static void createMovz(MCInst &Inst, MCPhysReg Reg, uint64_t Imm) { -+ assert(Imm <= UINT16_MAX && "Invalid Imm size"); -+ Inst.clear(); -+ Inst.setOpcode(AArch64::MOVZXi); -+ Inst.addOperand(MCOperand::createReg(Reg)); -+ Inst.addOperand(MCOperand::createImm(Imm & 0xFFFF)); -+ Inst.addOperand(MCOperand::createImm(0)); -+} -+ -+static InstructionListType createIncMemory(MCPhysReg RegTo, MCPhysReg RegTmp) { -+ InstructionListType Insts; -+ Insts.emplace_back(); -+ createMovz(Insts.back(), RegTmp, 1); -+ Insts.emplace_back(); -+ atomicAdd(Insts.back(), RegTo, RegTmp); -+ return Insts; -+} - class AArch64MCPlusBuilder : public MCPlusBuilder { - public: - AArch64MCPlusBuilder(const MCInstrAnalysis *Analysis, const MCInstrInfo *Info, -@@ -176,6 +273,34 @@ public: - return isLDRB(Inst) || isLDRH(Inst) || isLDRW(Inst) || isLDRX(Inst); - } - -+ bool isAArch64Exclusive(const MCInst &Inst) const override { -+ return (Inst.getOpcode() == AArch64::LDXPX || -+ Inst.getOpcode() == AArch64::LDXPW || -+ Inst.getOpcode() == AArch64::LDXRX || -+ Inst.getOpcode() == AArch64::LDXRW || -+ Inst.getOpcode() == AArch64::LDXRH || -+ Inst.getOpcode() == AArch64::LDXRB || -+ Inst.getOpcode() == AArch64::STXPX || -+ Inst.getOpcode() == AArch64::STXPW || -+ Inst.getOpcode() == AArch64::STXRX || -+ Inst.getOpcode() == AArch64::STXRW || -+ Inst.getOpcode() == AArch64::STXRH || -+ Inst.getOpcode() == AArch64::STXRB || -+ Inst.getOpcode() == AArch64::LDAXPX || -+ Inst.getOpcode() == AArch64::LDAXPW || -+ Inst.getOpcode() == AArch64::LDAXRX || -+ Inst.getOpcode() == AArch64::LDAXRW || -+ Inst.getOpcode() == AArch64::LDAXRH || -+ Inst.getOpcode() == AArch64::LDAXRB || -+ Inst.getOpcode() == AArch64::STLXPX || -+ Inst.getOpcode() == AArch64::STLXPW || -+ Inst.getOpcode() == AArch64::STLXRX || -+ Inst.getOpcode() == AArch64::STLXRW || -+ Inst.getOpcode() == AArch64::STLXRH || -+ Inst.getOpcode() == AArch64::STLXRB || -+ Inst.getOpcode() == AArch64::CLREX); -+ } -+ - bool isLoadFromStack(const MCInst &Inst) const { - if (!isLoad(Inst)) - return false; -@@ -207,6 +332,40 @@ public: - return Inst.getOpcode() == AArch64::BLR; - } - -+ MCPhysReg getSpRegister(int Size) const { -+ switch (Size) { -+ case 4: -+ return AArch64::WSP; -+ case 8: -+ return AArch64::SP; -+ default: -+ llvm_unreachable("Unexpected size"); -+ } -+ } -+ -+ MCPhysReg getIntArgRegister(unsigned ArgNo) const override { -+ switch (ArgNo) { -+ case 0: -+ return AArch64::X0; -+ case 1: -+ return AArch64::X1; -+ case 2: -+ return AArch64::X2; -+ case 3: -+ return AArch64::X3; -+ case 4: -+ return AArch64::X4; -+ case 5: -+ return AArch64::X5; -+ case 6: -+ return AArch64::X6; -+ case 7: -+ return AArch64::X7; -+ default: -+ return getNoRegister(); -+ } -+ } -+ - bool hasPCRelOperand(const MCInst &Inst) const override { - // ADRP is blacklisted and is an exception. Even though it has a - // PC-relative operand, this operand is not a complete symbol reference -@@ -313,6 +472,22 @@ public: - return true; - } - -+ void getCalleeSavedRegs(BitVector &Regs) const override { -+ Regs |= getAliases(AArch64::X18); -+ Regs |= getAliases(AArch64::X19); -+ Regs |= getAliases(AArch64::X20); -+ Regs |= getAliases(AArch64::X21); -+ Regs |= getAliases(AArch64::X22); -+ Regs |= getAliases(AArch64::X23); -+ Regs |= getAliases(AArch64::X24); -+ Regs |= getAliases(AArch64::X25); -+ Regs |= getAliases(AArch64::X26); -+ Regs |= getAliases(AArch64::X27); -+ Regs |= getAliases(AArch64::X28); -+ Regs |= getAliases(AArch64::LR); -+ Regs |= getAliases(AArch64::FP); -+ } -+ - const MCExpr *getTargetExprFor(MCInst &Inst, const MCExpr *Expr, - MCContext &Ctx, - uint64_t RelType) const override { -@@ -818,6 +993,22 @@ public: - - int getUncondBranchEncodingSize() const override { return 28; } - -+ InstructionListType createCmpJE(MCPhysReg RegNo, int64_t Imm, -+ const MCSymbol *Target, -+ MCContext *Ctx) const override { -+ InstructionListType Code; -+ Code.emplace_back(MCInstBuilder(AArch64::SUBSXri) -+ .addReg(RegNo) -+ .addReg(RegNo) -+ .addImm(Imm) -+ .addImm(0)); -+ Code.emplace_back(MCInstBuilder(AArch64::Bcc) -+ .addImm(Imm) -+ .addExpr(MCSymbolRefExpr::create( -+ Target, MCSymbolRefExpr::VK_None, *Ctx))); -+ return Code; -+ } -+ - bool createCall(MCInst &Inst, const MCSymbol *Target, - MCContext *Ctx) override { - Inst.setOpcode(AArch64::BL); -@@ -828,12 +1019,7 @@ public: - - bool createTailCall(MCInst &Inst, const MCSymbol *Target, - MCContext *Ctx) override { -- Inst.setOpcode(AArch64::B); -- Inst.addOperand(MCOperand::createExpr(getTargetExprFor( -- Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx), -- *Ctx, 0))); -- setTailCall(Inst); -- return true; -+ return createDirectCall(Inst, Target, Ctx, /*IsTailCall*/ true); - } - - void createLongTailCall(InstructionListType &Seq, const MCSymbol *Target, -@@ -882,6 +1068,18 @@ public: - - bool isStore(const MCInst &Inst) const override { return false; } - -+ bool createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx, -+ bool IsTailCall) override { -+ Inst.setOpcode(IsTailCall ? AArch64::B : AArch64::BL); -+ Inst.clear(); -+ Inst.addOperand(MCOperand::createExpr(getTargetExprFor( -+ Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx), -+ *Ctx, 0))); -+ if (IsTailCall) -+ convertJmpToTailCall(Inst); -+ return true; -+ } -+ - bool analyzeBranch(InstructionIterator Begin, InstructionIterator End, - const MCSymbol *&TBB, const MCSymbol *&FBB, - MCInst *&CondBranch, -@@ -1153,6 +1351,242 @@ public: - return true; - } - -+ bool createStackPointerIncrement( -+ MCInst &Inst, int Size, -+ bool NoFlagsClobber = false /*unused for AArch64*/) const override { -+ Inst.setOpcode(AArch64::SUBXri); -+ Inst.clear(); -+ Inst.addOperand(MCOperand::createReg(AArch64::SP)); -+ Inst.addOperand(MCOperand::createReg(AArch64::SP)); -+ Inst.addOperand(MCOperand::createImm(Size)); -+ Inst.addOperand(MCOperand::createImm(0)); -+ return true; -+ } -+ -+ bool createStackPointerDecrement( -+ MCInst &Inst, int Size, -+ bool NoFlagsClobber = false /*unused for AArch64*/) const override { -+ Inst.setOpcode(AArch64::ADDXri); -+ Inst.clear(); -+ Inst.addOperand(MCOperand::createReg(AArch64::SP)); -+ Inst.addOperand(MCOperand::createReg(AArch64::SP)); -+ Inst.addOperand(MCOperand::createImm(Size)); -+ Inst.addOperand(MCOperand::createImm(0)); -+ return true; -+ } -+ -+ void createIndirectBranch(MCInst &Inst, MCPhysReg MemBaseReg, -+ int64_t Disp) const { -+ Inst.setOpcode(AArch64::BR); -+ Inst.addOperand(MCOperand::createReg(MemBaseReg)); -+ } -+ -+ InstructionListType createInstrumentedIndCallHandlerExitBB() const override { -+ InstructionListType Insts(5); -+ // Code sequence for instrumented indirect call handler: -+ // msr nzcv, x1 -+ // ldp x0, x1, [sp], #16 -+ // ldr x16, [sp], #16 -+ // ldp x0, x1, [sp], #16 -+ // br x16 -+ setSystemFlag(Insts[0], AArch64::X1); -+ createPopRegisters(Insts[1], AArch64::X0, AArch64::X1); -+ // Here we load address of the next function which should be called in the -+ // original binary to X16 register. Writing to X16 is permitted without -+ // needing to restore. -+ loadReg(Insts[2], AArch64::X16, AArch64::SP); -+ createPopRegisters(Insts[3], AArch64::X0, AArch64::X1); -+ createIndirectBranch(Insts[4], AArch64::X16, 0); -+ return Insts; -+ } -+ -+ InstructionListType -+ createInstrumentedIndTailCallHandlerExitBB() const override { -+ return createInstrumentedIndCallHandlerExitBB(); -+ } -+ -+ InstructionListType createGetter(MCContext *Ctx, const char *name) const { -+ InstructionListType Insts(4); -+ MCSymbol *Locs = Ctx->getOrCreateSymbol(name); -+ InstructionListType Addr = materializeAddress(Locs, Ctx, AArch64::X0); -+ std::copy(Addr.begin(), Addr.end(), Insts.begin()); -+ assert(Addr.size() == 2 && "Invalid Addr size"); -+ loadReg(Insts[2], AArch64::X0, AArch64::X0); -+ createReturn(Insts[3]); -+ return Insts; -+ } -+ -+ InstructionListType createNumCountersGetter(MCContext *Ctx) const override { -+ return createGetter(Ctx, "__bolt_num_counters"); -+ } -+ -+ InstructionListType -+ createInstrLocationsGetter(MCContext *Ctx) const override { -+ return createGetter(Ctx, "__bolt_instr_locations"); -+ } -+ -+ InstructionListType createInstrTablesGetter(MCContext *Ctx) const override { -+ return createGetter(Ctx, "__bolt_instr_tables"); -+ } -+ -+ InstructionListType createInstrNumFuncsGetter(MCContext *Ctx) const override { -+ return createGetter(Ctx, "__bolt_instr_num_funcs"); -+ } -+ -+ void convertIndirectCallToLoad(MCInst &Inst, MCPhysReg Reg) override { -+ bool IsTailCall = isTailCall(Inst); -+ if (IsTailCall) -+ removeAnnotation(Inst, MCPlus::MCAnnotation::kTailCall); -+ if (Inst.getOpcode() == AArch64::BR || Inst.getOpcode() == AArch64::BLR) { -+ Inst.setOpcode(AArch64::ORRXrs); -+ Inst.insert(Inst.begin(), MCOperand::createReg(Reg)); -+ Inst.insert(Inst.begin() + 1, MCOperand::createReg(AArch64::XZR)); -+ Inst.insert(Inst.begin() + 3, MCOperand::createImm(0)); -+ return; -+ } -+ llvm_unreachable("not implemented"); -+ } -+ -+ InstructionListType createLoadImmediate(const MCPhysReg Dest, -+ uint64_t Imm) const override { -+ InstructionListType Insts(4); -+ int Shift = 48; -+ for (int I = 0; I < 4; I++, Shift -= 16) { -+ Insts[I].setOpcode(AArch64::MOVKXi); -+ Insts[I].addOperand(MCOperand::createReg(Dest)); -+ Insts[I].addOperand(MCOperand::createReg(Dest)); -+ Insts[I].addOperand(MCOperand::createImm((Imm >> Shift) & 0xFFFF)); -+ Insts[I].addOperand(MCOperand::createImm(Shift)); -+ } -+ return Insts; -+ } -+ -+ void createIndirectCallInst(MCInst &Inst, bool IsTailCall, -+ MCPhysReg Reg) const { -+ Inst.clear(); -+ Inst.setOpcode(IsTailCall ? AArch64::BR : AArch64::BLR); -+ Inst.addOperand(MCOperand::createReg(Reg)); -+ } -+ -+ InstructionListType createInstrumentedIndirectCall(MCInst &&CallInst, -+ MCSymbol *HandlerFuncAddr, -+ int CallSiteID, -+ MCContext *Ctx) override { -+ InstructionListType Insts; -+ // Code sequence used to enter indirect call instrumentation helper: -+ // stp x0, x1, [sp, #-16]! createPushRegisters -+ // mov target x0 convertIndirectCallToLoad -> orr x0 target xzr -+ // mov x1 CallSiteID createLoadImmediate -> -+ // movk x1, #0x0, lsl #48 -+ // movk x1, #0x0, lsl #32 -+ // movk x1, #0x0, lsl #16 -+ // movk x1, #0x0 -+ // stp x0, x1, [sp, #-16]! -+ // bl *HandlerFuncAddr createIndirectCall -> -+ // adr x0 *HandlerFuncAddr -> adrp + add -+ // blr x0 -+ Insts.emplace_back(); -+ createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1); -+ Insts.emplace_back(CallInst); -+ convertIndirectCallToLoad(Insts.back(), AArch64::X0); -+ InstructionListType LoadImm = -+ createLoadImmediate(getIntArgRegister(1), CallSiteID); -+ Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end()); -+ Insts.emplace_back(); -+ createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1); -+ Insts.resize(Insts.size() + 2); -+ InstructionListType Addr = -+ materializeAddress(HandlerFuncAddr, Ctx, AArch64::X0); -+ assert(Addr.size() == 2 && "Invalid Addr size"); -+ std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size()); -+ Insts.emplace_back(); -+ createIndirectCallInst(Insts.back(), isTailCall(CallInst), AArch64::X0); -+ -+ // Carry over metadata including tail call marker if present. -+ stripAnnotations(Insts.back()); -+ moveAnnotations(std::move(CallInst), Insts.back()); -+ -+ return Insts; -+ } -+ -+ InstructionListType -+ createInstrumentedIndCallHandlerEntryBB(const MCSymbol *InstrTrampoline, -+ const MCSymbol *IndCallHandler, -+ MCContext *Ctx) override { -+ // Code sequence used to check whether InstrTampoline was initialized -+ // and call it if so, returns via IndCallHandler -+ // stp x0, x1, [sp, #-16]! -+ // mrs x1, nzcv -+ // adr x0, InstrTrampoline -> adrp + add -+ // ldr x0, [x0] -+ // subs x0, x0, #0x0 -+ // b.eq IndCallHandler -+ // str x30, [sp, #-16]! -+ // blr x0 -+ // ldr x30, [sp], #16 -+ // b IndCallHandler -+ InstructionListType Insts; -+ Insts.emplace_back(); -+ createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1); -+ Insts.emplace_back(); -+ getSystemFlag(Insts.back(), getIntArgRegister(1)); -+ Insts.emplace_back(); -+ Insts.emplace_back(); -+ InstructionListType Addr = -+ materializeAddress(InstrTrampoline, Ctx, AArch64::X0); -+ std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size()); -+ assert(Addr.size() == 2 && "Invalid Addr size"); -+ Insts.emplace_back(); -+ loadReg(Insts.back(), AArch64::X0, AArch64::X0); -+ InstructionListType cmpJmp = -+ createCmpJE(AArch64::X0, 0, IndCallHandler, Ctx); -+ Insts.insert(Insts.end(), cmpJmp.begin(), cmpJmp.end()); -+ Insts.emplace_back(); -+ storeReg(Insts.back(), AArch64::LR, AArch64::SP); -+ Insts.emplace_back(); -+ Insts.back().setOpcode(AArch64::BLR); -+ Insts.back().addOperand(MCOperand::createReg(AArch64::X0)); -+ Insts.emplace_back(); -+ loadReg(Insts.back(), AArch64::LR, AArch64::SP); -+ Insts.emplace_back(); -+ createDirectCall(Insts.back(), IndCallHandler, Ctx, /*IsTailCall*/ true); -+ return Insts; -+ } -+ -+ InstructionListType -+ createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf, -+ unsigned CodePointerSize) const override { -+ unsigned int I = 0; -+ InstructionListType Instrs(IsLeaf ? 12 : 10); -+ -+ if (IsLeaf) -+ createStackPointerIncrement(Instrs[I++], 128); -+ createPushRegisters(Instrs[I++], AArch64::X0, AArch64::X1); -+ getSystemFlag(Instrs[I++], AArch64::X1); -+ InstructionListType Addr = materializeAddress(Target, Ctx, AArch64::X0); -+ assert(Addr.size() == 2 && "Invalid Addr size"); -+ std::copy(Addr.begin(), Addr.end(), Instrs.begin() + I); -+ I += Addr.size(); -+ storeReg(Instrs[I++], AArch64::X2, AArch64::SP); -+ InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X2); -+ assert(Insts.size() == 2 && "Invalid Insts size"); -+ std::copy(Insts.begin(), Insts.end(), Instrs.begin() + I); -+ I += Insts.size(); -+ loadReg(Instrs[I++], AArch64::X2, AArch64::SP); -+ setSystemFlag(Instrs[I++], AArch64::X1); -+ createPopRegisters(Instrs[I++], AArch64::X0, AArch64::X1); -+ if (IsLeaf) -+ createStackPointerDecrement(Instrs[I++], 128); -+ return Instrs; -+ } -+ -+ std::vector createSymbolTrampoline(const MCSymbol *TgtSym, -+ MCContext *Ctx) override { -+ std::vector Insts; -+ createShortJmp(Insts, TgtSym, Ctx, /*IsTailCall*/ true); -+ return Insts; -+ } -+ - InstructionListType materializeAddress(const MCSymbol *Target, MCContext *Ctx, - MCPhysReg RegName, - int64_t Addend = 0) const override { -diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp -index 5e3c01a..25b6970 100644 ---- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp -+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp -@@ -61,6 +61,25 @@ bool isADDri(const MCInst &Inst) { - Inst.getOpcode() == X86::ADD64ri8; - } - -+// Create instruction to increment contents of target by 1 -+static InstructionListType createIncMemory(const MCSymbol *Target, -+ MCContext *Ctx) { -+ InstructionListType Insts; -+ Insts.emplace_back(); -+ Insts.back().setOpcode(X86::LOCK_INC64m); -+ Insts.back().clear(); -+ Insts.back().addOperand(MCOperand::createReg(X86::RIP)); // BaseReg -+ Insts.back().addOperand(MCOperand::createImm(1)); // ScaleAmt -+ Insts.back().addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg -+ -+ Insts.back().addOperand(MCOperand::createExpr( -+ MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, -+ *Ctx))); // Displacement -+ Insts.back().addOperand( -+ MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg -+ return Insts; -+} -+ - #define GET_INSTRINFO_OPERAND_TYPES_ENUM - #define GET_INSTRINFO_OPERAND_TYPE - #define GET_INSTRINFO_MEM_OPERAND_SIZE -@@ -2309,28 +2328,15 @@ public: - return true; - } - -- void createLoadImmediate(MCInst &Inst, const MCPhysReg Dest, -- uint32_t Imm) const override { -- Inst.setOpcode(X86::MOV64ri32); -- Inst.clear(); -- Inst.addOperand(MCOperand::createReg(Dest)); -- Inst.addOperand(MCOperand::createImm(Imm)); -- } -- -- bool createIncMemory(MCInst &Inst, const MCSymbol *Target, -- MCContext *Ctx) const override { -- -- Inst.setOpcode(X86::LOCK_INC64m); -- Inst.clear(); -- Inst.addOperand(MCOperand::createReg(X86::RIP)); // BaseReg -- Inst.addOperand(MCOperand::createImm(1)); // ScaleAmt -- Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg -- -- Inst.addOperand(MCOperand::createExpr( -- MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, -- *Ctx))); // Displacement -- Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg -- return true; -+ InstructionListType createLoadImmediate(const MCPhysReg Dest, -+ uint64_t Imm) const override { -+ InstructionListType Insts; -+ Insts.emplace_back(); -+ Insts.back().setOpcode(X86::MOV64ri32); -+ Insts.back().clear(); -+ Insts.back().addOperand(MCOperand::createReg(Dest)); -+ Insts.back().addOperand(MCOperand::createImm(Imm)); -+ return Insts; - } - - bool createIJmp32Frag(SmallVectorImpl &Insts, -@@ -3057,9 +3063,9 @@ public: - Inst.clear(); - } - -- InstructionListType createInstrIncMemory(const MCSymbol *Target, -- MCContext *Ctx, -- bool IsLeaf) const override { -+ InstructionListType -+ createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf, -+ unsigned CodePointerSize) const override { - InstructionListType Instrs(IsLeaf ? 13 : 11); - unsigned int I = 0; - -@@ -3079,7 +3085,10 @@ public: - createClearRegWithNoEFlagsUpdate(Instrs[I++], X86::RAX, 8); - createX86SaveOVFlagToRegister(Instrs[I++], X86::AL); - // LOCK INC -- createIncMemory(Instrs[I++], Target, Ctx); -+ InstructionListType IncMem = createIncMemory(Target, Ctx); -+ assert(IncMem.size() == 1 && "Invalid IncMem size"); -+ std::copy(IncMem.begin(), IncMem.end(), Instrs.begin() + I); -+ I += IncMem.size(); - // POPF - createAddRegImm(Instrs[I++], X86::AL, 127, 1); - createPopRegister(Instrs[I++], X86::RAX, 8); -@@ -3153,8 +3162,8 @@ public: - } - Insts.emplace_back(); - createPushRegister(Insts.back(), TempReg, 8); -- Insts.emplace_back(); -- createLoadImmediate(Insts.back(), TempReg, CallSiteID); -+ InstructionListType LoadImm = createLoadImmediate(TempReg, CallSiteID); -+ Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end()); - Insts.emplace_back(); - createPushRegister(Insts.back(), TempReg, 8); - -@@ -3264,7 +3273,7 @@ public: - } - - InstructionListType createSymbolTrampoline(const MCSymbol *TgtSym, -- MCContext *Ctx) const override { -+ MCContext *Ctx) override { - InstructionListType Insts(1); - createUncondBranch(Insts[0], TgtSym, Ctx); - return Insts; -diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt -index 8472ce0..838c8cb 100644 ---- a/bolt/runtime/CMakeLists.txt -+++ b/bolt/runtime/CMakeLists.txt -@@ -27,8 +27,14 @@ set(BOLT_RT_FLAGS - -fno-exceptions - -fno-rtti - -fno-stack-protector -- -mno-sse -- -fPIC) -+ -fPIC -+ -mgeneral-regs-only) -+if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") -+ set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-sse") -+endif() -+if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") -+ set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-outline-atomics") -+endif() - - # Don't let the compiler think it can create calls to standard libs - target_compile_options(bolt_rt_instr PRIVATE ${BOLT_RT_FLAGS}) -@@ -39,7 +45,7 @@ target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) - install(TARGETS bolt_rt_instr DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") - install(TARGETS bolt_rt_hugify DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") - --if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*") -+if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Darwin") - add_library(bolt_rt_instr_osx STATIC - instr.cpp - ${CMAKE_CURRENT_BINARY_DIR}/config.h -diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h -index 9e6f175..9b9965b 100644 ---- a/bolt/runtime/common.h -+++ b/bolt/runtime/common.h -@@ -6,10 +6,6 @@ - // - //===----------------------------------------------------------------------===// - --#if !defined(__x86_64__) --#error "For x86_64 only" --#endif -- - #if defined(__linux__) - - #include -@@ -44,44 +40,6 @@ typedef int int32_t; - #error "For Linux or MacOS only" - #endif - --// Save all registers while keeping 16B stack alignment --#define SAVE_ALL \ -- "push %%rax\n" \ -- "push %%rbx\n" \ -- "push %%rcx\n" \ -- "push %%rdx\n" \ -- "push %%rdi\n" \ -- "push %%rsi\n" \ -- "push %%rbp\n" \ -- "push %%r8\n" \ -- "push %%r9\n" \ -- "push %%r10\n" \ -- "push %%r11\n" \ -- "push %%r12\n" \ -- "push %%r13\n" \ -- "push %%r14\n" \ -- "push %%r15\n" \ -- "sub $8, %%rsp\n" -- --// Mirrors SAVE_ALL --#define RESTORE_ALL \ -- "add $8, %%rsp\n" \ -- "pop %%r15\n" \ -- "pop %%r14\n" \ -- "pop %%r13\n" \ -- "pop %%r12\n" \ -- "pop %%r11\n" \ -- "pop %%r10\n" \ -- "pop %%r9\n" \ -- "pop %%r8\n" \ -- "pop %%rbp\n" \ -- "pop %%rsi\n" \ -- "pop %%rdi\n" \ -- "pop %%rdx\n" \ -- "pop %%rcx\n" \ -- "pop %%rbx\n" \ -- "pop %%rax\n" -- - #define PROT_READ 0x1 /* Page can be read. */ - #define PROT_WRITE 0x2 /* Page can be written. */ - #define PROT_EXEC 0x4 /* Page can be executed. */ -@@ -165,141 +123,41 @@ int memcmp(const void *s1, const void *s2, size_t n) { - // Anonymous namespace covering everything but our library entry point - namespace { - --// Get the difference between runtime addrress of .text section and --// static address in section header table. Can be extracted from arbitrary --// pc value recorded at runtime to get the corresponding static address, which --// in turn can be used to search for indirect call description. Needed because --// indirect call descriptions are read-only non-relocatable data. --uint64_t getTextBaseAddress() { -- uint64_t DynAddr; -- uint64_t StaticAddr; -- __asm__ volatile("leaq __hot_end(%%rip), %0\n\t" -- "movabsq $__hot_end, %1\n\t" -- : "=r"(DynAddr), "=r"(StaticAddr)); -- return DynAddr - StaticAddr; --} -- --constexpr uint32_t BufSize = 10240; -- --#define _STRINGIFY(x) #x --#define STRINGIFY(x) _STRINGIFY(x) -- --uint64_t __read(uint64_t fd, const void *buf, uint64_t count) { -- uint64_t ret; --#if defined(__APPLE__) --#define READ_SYSCALL 0x2000003 --#else --#define READ_SYSCALL 0 --#endif -- __asm__ __volatile__("movq $" STRINGIFY(READ_SYSCALL) ", %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : "D"(fd), "S"(buf), "d"(count) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --uint64_t __write(uint64_t fd, const void *buf, uint64_t count) { -- uint64_t ret; --#if defined(__APPLE__) --#define WRITE_SYSCALL 0x2000004 --#else --#define WRITE_SYSCALL 1 --#endif -- __asm__ __volatile__("movq $" STRINGIFY(WRITE_SYSCALL) ", %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : "D"(fd), "S"(buf), "d"(count) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags, -- uint64_t fd, uint64_t offset) { --#if defined(__APPLE__) --#define MMAP_SYSCALL 0x20000c5 --#else --#define MMAP_SYSCALL 9 --#endif -- void *ret; -- register uint64_t r8 asm("r8") = fd; -- register uint64_t r9 asm("r9") = offset; -- register uint64_t r10 asm("r10") = flags; -- __asm__ __volatile__("movq $" STRINGIFY(MMAP_SYSCALL) ", %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8), -- "r"(r9) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --uint64_t __munmap(void *addr, uint64_t size) { --#if defined(__APPLE__) --#define MUNMAP_SYSCALL 0x2000049 --#else --#define MUNMAP_SYSCALL 11 --#endif -- uint64_t ret; -- __asm__ __volatile__("movq $" STRINGIFY(MUNMAP_SYSCALL) ", %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : "D"(addr), "S"(size) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -+struct dirent64 { -+ uint64_t d_ino; /* Inode number */ -+ int64_t d_off; /* Offset to next linux_dirent */ -+ unsigned short d_reclen; /* Length of this linux_dirent */ -+ unsigned char d_type; -+ char d_name[]; /* Filename (null-terminated) */ -+ /* length is actually (d_reclen - 2 - -+ offsetof(struct linux_dirent, d_name)) */ -+}; - --#define SIG_BLOCK 0 --#define SIG_UNBLOCK 1 --#define SIG_SETMASK 2 -+/* Length of the entries in `struct utsname' is 65. */ -+#define _UTSNAME_LENGTH 65 - --static const uint64_t MaskAllSignals[] = {-1ULL}; -+struct UtsNameTy { -+ char sysname[_UTSNAME_LENGTH]; /* Operating system name (e.g., "Linux") */ -+ char nodename[_UTSNAME_LENGTH]; /* Name within "some implementation-defined -+ network" */ -+ char release[_UTSNAME_LENGTH]; /* Operating system release (e.g., "2.6.28") */ -+ char version[_UTSNAME_LENGTH]; /* Operating system version */ -+ char machine[_UTSNAME_LENGTH]; /* Hardware identifier */ -+ char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */ -+}; - --uint64_t __sigprocmask(int how, const void *set, void *oldset) { --#if defined(__APPLE__) --#define SIGPROCMASK_SYSCALL 0x2000030 --#else --#define SIGPROCMASK_SYSCALL 14 --#endif -- uint64_t ret; -- register long r10 asm("r10") = sizeof(uint64_t); -- __asm__ __volatile__("movq $" STRINGIFY(SIGPROCMASK_SYSCALL) ", %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : "D"(how), "S"(set), "d"(oldset), "r"(r10) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -+struct timespec { -+ uint64_t tv_sec; /* seconds */ -+ uint64_t tv_nsec; /* nanoseconds */ -+}; - --uint64_t __getpid() { -- uint64_t ret; --#if defined(__APPLE__) --#define GETPID_SYSCALL 20 -+#if defined(__aarch64__) -+#include "sys_aarch64.h" - #else --#define GETPID_SYSCALL 39 -+#include "sys_x86_64.h" - #endif -- __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : -- : "cc", "rcx", "r11", "memory"); -- return ret; --} - --uint64_t __exit(uint64_t code) { --#if defined(__APPLE__) --#define EXIT_SYSCALL 0x2000001 --#else --#define EXIT_SYSCALL 231 --#endif -- uint64_t ret; -- __asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : "D"(code) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -+constexpr uint32_t BufSize = 10240; - - // Helper functions for writing strings to the .fdata file. We intentionally - // avoid using libc names to make it clear it is our impl. -@@ -415,219 +273,6 @@ static bool scanUInt32(const char *&Buf, const char *End, uint32_t &Ret) { - return false; - } - --#if !defined(__APPLE__) --// We use a stack-allocated buffer for string manipulation in many pieces of --// this code, including the code that prints each line of the fdata file. This --// buffer needs to accomodate large function names, but shouldn't be arbitrarily --// large (dynamically allocated) for simplicity of our memory space usage. -- --// Declare some syscall wrappers we use throughout this code to avoid linking --// against system libc. --uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) { -- uint64_t ret; -- __asm__ __volatile__("movq $2, %%rax\n" -- "syscall" -- : "=a"(ret) -- : "D"(pathname), "S"(flags), "d"(mode) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --struct dirent { -- unsigned long d_ino; /* Inode number */ -- unsigned long d_off; /* Offset to next linux_dirent */ -- unsigned short d_reclen; /* Length of this linux_dirent */ -- char d_name[]; /* Filename (null-terminated) */ -- /* length is actually (d_reclen - 2 - -- offsetof(struct linux_dirent, d_name)) */ --}; -- --long __getdents(unsigned int fd, dirent *dirp, size_t count) { -- long ret; -- __asm__ __volatile__("movq $78, %%rax\n" -- "syscall" -- : "=a"(ret) -- : "D"(fd), "S"(dirp), "d"(count) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) { -- uint64_t ret; -- __asm__ __volatile__("movq $89, %%rax\n" -- "syscall" -- : "=a"(ret) -- : "D"(pathname), "S"(buf), "d"(bufsize) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) { -- uint64_t ret; -- __asm__ __volatile__("movq $8, %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : "D"(fd), "S"(pos), "d"(whence) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --int __ftruncate(uint64_t fd, uint64_t length) { -- int ret; -- __asm__ __volatile__("movq $77, %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : "D"(fd), "S"(length) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --int __close(uint64_t fd) { -- uint64_t ret; -- __asm__ __volatile__("movq $3, %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : "D"(fd) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --int __madvise(void *addr, size_t length, int advice) { -- int ret; -- __asm__ __volatile__("movq $28, %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : "D"(addr), "S"(length), "d"(advice) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --#define _UTSNAME_LENGTH 65 -- --struct UtsNameTy { -- char sysname[_UTSNAME_LENGTH]; /* Operating system name (e.g., "Linux") */ -- char nodename[_UTSNAME_LENGTH]; /* Name within "some implementation-defined -- network" */ -- char release[_UTSNAME_LENGTH]; /* Operating system release (e.g., "2.6.28") */ -- char version[_UTSNAME_LENGTH]; /* Operating system version */ -- char machine[_UTSNAME_LENGTH]; /* Hardware identifier */ -- char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */ --}; -- --int __uname(struct UtsNameTy *Buf) { -- int Ret; -- __asm__ __volatile__("movq $63, %%rax\n" -- "syscall\n" -- : "=a"(Ret) -- : "D"(Buf) -- : "cc", "rcx", "r11", "memory"); -- return Ret; --} -- --struct timespec { -- uint64_t tv_sec; /* seconds */ -- uint64_t tv_nsec; /* nanoseconds */ --}; -- --uint64_t __nanosleep(const timespec *req, timespec *rem) { -- uint64_t ret; -- __asm__ __volatile__("movq $35, %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : "D"(req), "S"(rem) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --int64_t __fork() { -- uint64_t ret; -- __asm__ __volatile__("movq $57, %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --int __mprotect(void *addr, size_t len, int prot) { -- int ret; -- __asm__ __volatile__("movq $10, %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : "D"(addr), "S"(len), "d"(prot) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --uint64_t __getppid() { -- uint64_t ret; -- __asm__ __volatile__("movq $110, %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --int __setpgid(uint64_t pid, uint64_t pgid) { -- int ret; -- __asm__ __volatile__("movq $109, %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : "D"(pid), "S"(pgid) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --uint64_t __getpgid(uint64_t pid) { -- uint64_t ret; -- __asm__ __volatile__("movq $121, %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : "D"(pid) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --int __kill(uint64_t pid, int sig) { -- int ret; -- __asm__ __volatile__("movq $62, %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : "D"(pid), "S"(sig) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --int __fsync(int fd) { -- int ret; -- __asm__ __volatile__("movq $74, %%rax\n" -- "syscall\n" -- : "=a"(ret) -- : "D"(fd) -- : "cc", "rcx", "r11", "memory"); -- return ret; --} -- --// %rdi %rsi %rdx %r10 %r8 --// sys_prctl int option unsigned unsigned unsigned unsigned --// long arg2 long arg3 long arg4 long arg5 --int __prctl(int Option, unsigned long Arg2, unsigned long Arg3, -- unsigned long Arg4, unsigned long Arg5) { -- int Ret; -- register long rdx asm("rdx") = Arg3; -- register long r8 asm("r8") = Arg5; -- register long r10 asm("r10") = Arg4; -- __asm__ __volatile__("movq $157, %%rax\n" -- "syscall\n" -- : "=a"(Ret) -- : "D"(Option), "S"(Arg2), "d"(rdx), "r"(r10), "r"(r8) -- :); -- return Ret; --} -- --#endif -- - void reportError(const char *Msg, uint64_t Size) { - __write(2, Msg, Size); - __exit(1); -@@ -644,6 +289,12 @@ void assert(bool Assertion, const char *Msg) { - reportError(Buf, Ptr - Buf); - } - -+#define SIG_BLOCK 0 -+#define SIG_UNBLOCK 1 -+#define SIG_SETMASK 2 -+ -+static const uint64_t MaskAllSignals[] = {-1ULL}; -+ - class Mutex { - volatile bool InUse{false}; - -diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp -index 96a43f6..cfd113e 100644 ---- a/bolt/runtime/instr.cpp -+++ b/bolt/runtime/instr.cpp -@@ -40,7 +40,6 @@ - // - //===----------------------------------------------------------------------===// - --#if defined (__x86_64__) - #include "common.h" - - // Enables a very verbose logging to stderr useful when debugging -@@ -695,12 +694,12 @@ static char *getBinaryPath() { - assert(static_cast(FDdir) >= 0, - "failed to open /proc/self/map_files"); - -- while (long Nread = __getdents(FDdir, (struct dirent *)Buf, BufSize)) { -+ while (long Nread = __getdents64(FDdir, (struct dirent64 *)Buf, BufSize)) { - assert(static_cast(Nread) != -1, "failed to get folder entries"); - -- struct dirent *d; -+ struct dirent64 *d; - for (long Bpos = 0; Bpos < Nread; Bpos += d->d_reclen) { -- d = (struct dirent *)(Buf + Bpos); -+ d = (struct dirent64 *)(Buf + Bpos); - - uint64_t StartAddress, EndAddress; - if (!parseAddressRange(d->d_name, StartAddress, EndAddress)) -@@ -1668,6 +1667,17 @@ instrumentIndirectCall(uint64_t Target, uint64_t IndCallID) { - /// as well as the target address for the call - extern "C" __attribute((naked)) void __bolt_instr_indirect_call() - { -+#if defined(__aarch64__) -+ // clang-format off -+ __asm__ __volatile__(SAVE_ALL -+ "ldp x0, x1, [sp, #288]\n" -+ "bl instrumentIndirectCall\n" -+ RESTORE_ALL -+ "ret\n" -+ :::); -+ // clang-format on -+#else -+ // clang-format off - __asm__ __volatile__(SAVE_ALL - "mov 0xa0(%%rsp), %%rdi\n" - "mov 0x98(%%rsp), %%rsi\n" -@@ -1675,10 +1685,23 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_call() - RESTORE_ALL - "ret\n" - :::); -+ // clang-format on -+#endif - } - - extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall() - { -+#if defined(__aarch64__) -+ // clang-format off -+ __asm__ __volatile__(SAVE_ALL -+ "ldp x0, x1, [sp, #288]\n" -+ "bl instrumentIndirectCall\n" -+ RESTORE_ALL -+ "ret\n" -+ :::); -+ // clang-format on -+#else -+ // clang-format off - __asm__ __volatile__(SAVE_ALL - "mov 0x98(%%rsp), %%rdi\n" - "mov 0x90(%%rsp), %%rsi\n" -@@ -1686,21 +1709,48 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall() - RESTORE_ALL - "ret\n" - :::); -+ // clang-format on -+#endif - } - - /// This is hooking ELF's entry, it needs to save all machine state. - extern "C" __attribute((naked)) void __bolt_instr_start() - { -+#if defined(__aarch64__) -+ // clang-format off -+ __asm__ __volatile__(SAVE_ALL -+ "bl __bolt_instr_setup\n" -+ RESTORE_ALL -+ "adrp x16, __bolt_start_trampoline\n" -+ "add x16, x16, #:lo12:__bolt_start_trampoline\n" -+ "br x16\n" -+ :::); -+ // clang-format on -+#else -+ // clang-format off - __asm__ __volatile__(SAVE_ALL - "call __bolt_instr_setup\n" - RESTORE_ALL - "jmp __bolt_start_trampoline\n" - :::); -+ // clang-format on -+#endif - } - - /// This is hooking into ELF's DT_FINI - extern "C" void __bolt_instr_fini() { -- __bolt_fini_trampoline(); -+#if defined(__aarch64__) -+ // clang-format off -+ __asm__ __volatile__(SAVE_ALL -+ "adrp x16, __bolt_fini_trampoline\n" -+ "add x16, x16, #:lo12:__bolt_fini_trampoline\n" -+ "blr x16\n" -+ RESTORE_ALL -+ :::); -+ // clang-format on -+#else -+ __asm__ __volatile__("call __bolt_fini_trampoline\n" :::); -+#endif - if (__bolt_instr_sleep_time == 0) { - int FD = openProfile(); - __bolt_instr_data_dump(FD); -@@ -1752,4 +1802,3 @@ void _bolt_instr_fini() { - } - - #endif --#endif -diff --git a/bolt/runtime/sys_aarch64.h b/bolt/runtime/sys_aarch64.h -new file mode 100644 -index 0000000..77c9cfc ---- /dev/null -+++ b/bolt/runtime/sys_aarch64.h -@@ -0,0 +1,394 @@ -+#ifndef LLVM_TOOLS_LLVM_BOLT_SYS_AARCH64 -+#define LLVM_TOOLS_LLVM_BOLT_SYS_AARCH64 -+ -+// Save all registers while keeping 16B stack alignment -+#define SAVE_ALL \ -+ "stp x0, x1, [sp, #-16]!\n" \ -+ "stp x2, x3, [sp, #-16]!\n" \ -+ "stp x4, x5, [sp, #-16]!\n" \ -+ "stp x6, x7, [sp, #-16]!\n" \ -+ "stp x8, x9, [sp, #-16]!\n" \ -+ "stp x10, x11, [sp, #-16]!\n" \ -+ "stp x12, x13, [sp, #-16]!\n" \ -+ "stp x14, x15, [sp, #-16]!\n" \ -+ "stp x16, x17, [sp, #-16]!\n" \ -+ "stp x18, x19, [sp, #-16]!\n" \ -+ "stp x20, x21, [sp, #-16]!\n" \ -+ "stp x22, x23, [sp, #-16]!\n" \ -+ "stp x24, x25, [sp, #-16]!\n" \ -+ "stp x26, x27, [sp, #-16]!\n" \ -+ "stp x28, x29, [sp, #-16]!\n" \ -+ "str x30, [sp,#-16]!\n" -+// Mirrors SAVE_ALL -+#define RESTORE_ALL \ -+ "ldr x30, [sp], #16\n" \ -+ "ldp x28, x29, [sp], #16\n" \ -+ "ldp x26, x27, [sp], #16\n" \ -+ "ldp x24, x25, [sp], #16\n" \ -+ "ldp x22, x23, [sp], #16\n" \ -+ "ldp x20, x21, [sp], #16\n" \ -+ "ldp x18, x19, [sp], #16\n" \ -+ "ldp x16, x17, [sp], #16\n" \ -+ "ldp x14, x15, [sp], #16\n" \ -+ "ldp x12, x13, [sp], #16\n" \ -+ "ldp x10, x11, [sp], #16\n" \ -+ "ldp x8, x9, [sp], #16\n" \ -+ "ldp x6, x7, [sp], #16\n" \ -+ "ldp x4, x5, [sp], #16\n" \ -+ "ldp x2, x3, [sp], #16\n" \ -+ "ldp x0, x1, [sp], #16\n" -+ -+// Anonymous namespace covering everything but our library entry point -+namespace { -+ -+// Get the difference between runtime addrress of .text section and -+// static address in section header table. Can be extracted from arbitrary -+// pc value recorded at runtime to get the corresponding static address, which -+// in turn can be used to search for indirect call description. Needed because -+// indirect call descriptions are read-only non-relocatable data. -+uint64_t getTextBaseAddress() { -+ uint64_t DynAddr; -+ uint64_t StaticAddr; -+ __asm__ volatile("b .instr%=\n\t" -+ ".StaticAddr%=:\n\t" -+ ".dword __hot_end\n\t" -+ ".instr%=:\n\t" -+ "ldr %0, .StaticAddr%=\n\t" -+ "adrp %1, __hot_end\n\t" -+ "add %1, %1, :lo12:__hot_end\n\t" -+ : "=r"(StaticAddr), "=r"(DynAddr)); -+ return DynAddr - StaticAddr; -+} -+ -+uint64_t __read(uint64_t fd, const void *buf, uint64_t count) { -+ uint64_t ret; -+ register uint64_t x0 __asm__("x0") = fd; -+ register const void *x1 __asm__("x1") = buf; -+ register uint64_t x2 __asm__("x2") = count; -+ register uint32_t w8 __asm__("w8") = 63; -+ __asm__ __volatile__("svc #0\n" -+ "mov %0, x0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(x2), "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+uint64_t __write(uint64_t fd, const void *buf, uint64_t count) { -+ uint64_t ret; -+ register uint64_t x0 __asm__("x0") = fd; -+ register const void *x1 __asm__("x1") = buf; -+ register uint64_t x2 __asm__("x2") = count; -+ register uint32_t w8 __asm__("w8") = 64; -+ __asm__ __volatile__("svc #0\n" -+ "mov %0, x0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(x2), "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags, -+ uint64_t fd, uint64_t offset) { -+ void *ret; -+ register uint64_t x0 __asm__("x0") = addr; -+ register uint64_t x1 __asm__("x1") = size; -+ register uint64_t x2 __asm__("x2") = prot; -+ register uint64_t x3 __asm__("x3") = flags; -+ register uint64_t x4 __asm__("x4") = fd; -+ register uint64_t x5 __asm__("x5") = offset; -+ register uint32_t w8 __asm__("w8") = 222; -+ __asm__ __volatile__("svc #0\n" -+ "mov %0, x0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(x2), "r"(x3), "r"(x4), "r"(x5), "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+uint64_t __munmap(void *addr, uint64_t size) { -+ uint64_t ret; -+ register void *x0 __asm__("x0") = addr; -+ register uint64_t x1 __asm__("x1") = size; -+ register uint32_t w8 __asm__("w8") = 215; -+ __asm__ __volatile__("svc #0\n" -+ "mov %0, x0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+uint64_t __exit(uint64_t code) { -+ uint64_t ret; -+ register uint64_t x0 __asm__("x0") = code; -+ register uint32_t w8 __asm__("w8") = 94; -+ __asm__ __volatile__("svc #0\n" -+ "mov %0, x0" -+ : "=r"(ret), "+r"(x0) -+ : "r"(w8) -+ : "cc", "memory", "x1"); -+ return ret; -+} -+ -+uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) { -+ uint64_t ret; -+ register int x0 __asm__("x0") = -100; -+ register const char *x1 __asm__("x1") = pathname; -+ register uint64_t x2 __asm__("x2") = flags; -+ register uint64_t x3 __asm__("x3") = mode; -+ register uint32_t w8 __asm__("w8") = 56; -+ __asm__ __volatile__("svc #0\n" -+ "mov %0, x0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(x2), "r"(x3), "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+long __getdents64(unsigned int fd, dirent64 *dirp, size_t count) { -+ long ret; -+ register unsigned int x0 __asm__("x0") = fd; -+ register dirent64 *x1 __asm__("x1") = dirp; -+ register size_t x2 __asm__("x2") = count; -+ register uint32_t w8 __asm__("w8") = 61; -+ __asm__ __volatile__("svc #0\n" -+ "mov %0, x0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(x2), "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) { -+ uint64_t ret; -+ register int x0 __asm__("x0") = -100; -+ register const char *x1 __asm__("x1") = pathname; -+ register char *x2 __asm__("x2") = buf; -+ register size_t x3 __asm__("x3") = bufsize; -+ register uint32_t w8 __asm__("w8") = 78; // readlinkat -+ __asm__ __volatile__("svc #0\n" -+ "mov %0, x0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(x2), "r"(x3), "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) { -+ uint64_t ret; -+ register uint64_t x0 __asm__("x0") = fd; -+ register uint64_t x1 __asm__("x1") = pos; -+ register uint64_t x2 __asm__("x2") = whence; -+ register uint32_t w8 __asm__("w8") = 62; -+ __asm__ __volatile__("svc #0\n" -+ "mov %0, x0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(x2), "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+int __ftruncate(uint64_t fd, uint64_t length) { -+ int ret; -+ register uint64_t x0 __asm__("x0") = fd; -+ register uint64_t x1 __asm__("x1") = length; -+ register uint32_t w8 __asm__("w8") = 46; -+ __asm__ __volatile__("svc #0\n" -+ "mov %w0, w0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+int __close(uint64_t fd) { -+ int ret; -+ register uint64_t x0 __asm__("x0") = fd; -+ register uint32_t w8 __asm__("w8") = 57; -+ __asm__ __volatile__("svc #0\n" -+ "mov %w0, w0" -+ : "=r"(ret), "+r"(x0) -+ : "r"(w8) -+ : "cc", "memory", "x1"); -+ return ret; -+} -+ -+int __madvise(void *addr, size_t length, int advice) { -+ int ret; -+ register void *x0 __asm__("x0") = addr; -+ register size_t x1 __asm__("x1") = length; -+ register int x2 __asm__("x2") = advice; -+ register uint32_t w8 __asm__("w8") = 233; -+ __asm__ __volatile__("svc #0\n" -+ "mov %w0, w0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(x2), "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+int __uname(struct UtsNameTy *buf) { -+ int ret; -+ register UtsNameTy *x0 __asm__("x0") = buf; -+ register uint32_t w8 __asm__("w8") = 160; -+ __asm__ __volatile__("svc #0\n" -+ "mov %w0, w0" -+ : "=r"(ret), "+r"(x0) -+ : "r"(w8) -+ : "cc", "memory", "x1"); -+ return ret; -+} -+ -+uint64_t __nanosleep(const timespec *req, timespec *rem) { -+ uint64_t ret; -+ register const timespec *x0 __asm__("x0") = req; -+ register timespec *x1 __asm__("x1") = rem; -+ register uint32_t w8 __asm__("w8") = 101; -+ __asm__ __volatile__("svc #0\n" -+ "mov %0, x0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+int64_t __fork() { -+ uint64_t ret; -+ // clone instead of fork with flags -+ // "CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD" -+ register uint64_t x0 __asm__("x0") = 0x1200011; -+ register uint64_t x1 __asm__("x1") = 0; -+ register uint64_t x2 __asm__("x2") = 0; -+ register uint64_t x3 __asm__("x3") = 0; -+ register uint64_t x4 __asm__("x4") = 0; -+ register uint32_t w8 __asm__("w8") = 220; -+ __asm__ __volatile__("svc #0\n" -+ "mov %0, x0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(x2), "r"(x3), "r"(x4), "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+int __mprotect(void *addr, size_t len, int prot) { -+ int ret; -+ register void *x0 __asm__("x0") = addr; -+ register size_t x1 __asm__("x1") = len; -+ register int x2 __asm__("x2") = prot; -+ register uint32_t w8 __asm__("w8") = 226; -+ __asm__ __volatile__("svc #0\n" -+ "mov %w0, w0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(x2), "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+uint64_t __getpid() { -+ uint64_t ret; -+ register uint32_t w8 __asm__("w8") = 172; -+ __asm__ __volatile__("svc #0\n" -+ "mov %0, x0" -+ : "=r"(ret) -+ : "r"(w8) -+ : "cc", "memory", "x0", "x1"); -+ return ret; -+} -+ -+uint64_t __getppid() { -+ uint64_t ret; -+ register uint32_t w8 __asm__("w8") = 173; -+ __asm__ __volatile__("svc #0\n" -+ "mov %0, x0" -+ : "=r"(ret) -+ : "r"(w8) -+ : "cc", "memory", "x0", "x1"); -+ return ret; -+} -+ -+int __setpgid(uint64_t pid, uint64_t pgid) { -+ int ret; -+ register uint64_t x0 __asm__("x0") = pid; -+ register uint64_t x1 __asm__("x1") = pgid; -+ register uint32_t w8 __asm__("w8") = 154; -+ __asm__ __volatile__("svc #0\n" -+ "mov %w0, w0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+uint64_t __getpgid(uint64_t pid) { -+ uint64_t ret; -+ register uint64_t x0 __asm__("x0") = pid; -+ register uint32_t w8 __asm__("w8") = 155; -+ __asm__ __volatile__("svc #0\n" -+ "mov %0, x0" -+ : "=r"(ret), "+r"(x0) -+ : "r"(w8) -+ : "cc", "memory", "x1"); -+ return ret; -+} -+ -+int __kill(uint64_t pid, int sig) { -+ int ret; -+ register uint64_t x0 __asm__("x0") = pid; -+ register int x1 __asm__("x1") = sig; -+ register uint32_t w8 __asm__("w8") = 129; -+ __asm__ __volatile__("svc #0\n" -+ "mov %w0, w0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+int __fsync(int fd) { -+ int ret; -+ register int x0 __asm__("x0") = fd; -+ register uint32_t w8 __asm__("w8") = 82; -+ __asm__ __volatile__("svc #0\n" -+ "mov %w0, w0" -+ : "=r"(ret), "+r"(x0) -+ : "r"(w8) -+ : "cc", "memory", "x1"); -+ return ret; -+} -+ -+uint64_t __sigprocmask(int how, const void *set, void *oldset) { -+ uint64_t ret; -+ register int x0 __asm__("x0") = how; -+ register const void *x1 __asm__("x1") = set; -+ register void *x2 __asm__("x2") = oldset; -+ register long x3 asm("x3") = 8; -+ register uint32_t w8 __asm__("w8") = 135; -+ __asm__ __volatile__("svc #0\n" -+ "mov %0, x0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(x2), "r"(x3), "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+int __prctl(int option, unsigned long arg2, unsigned long arg3, -+ unsigned long arg4, unsigned long arg5) { -+ int ret; -+ register int x0 __asm__("x0") = option; -+ register unsigned long x1 __asm__("x1") = arg2; -+ register unsigned long x2 __asm__("x2") = arg3; -+ register unsigned long x3 __asm__("x3") = arg4; -+ register unsigned long x4 __asm__("x4") = arg5; -+ register uint32_t w8 __asm__("w8") = 167; -+ __asm__ __volatile__("svc #0\n" -+ "mov %w0, w0" -+ : "=r"(ret), "+r"(x0), "+r"(x1) -+ : "r"(x2), "r"(x3), "r"(x4), "r"(w8) -+ : "cc", "memory"); -+ return ret; -+} -+ -+} // anonymous namespace -+ -+#endif -diff --git a/bolt/runtime/sys_x86_64.h b/bolt/runtime/sys_x86_64.h -new file mode 100644 -index 0000000..ca2c693 ---- /dev/null -+++ b/bolt/runtime/sys_x86_64.h -@@ -0,0 +1,360 @@ -+#ifndef LLVM_TOOLS_LLVM_BOLT_SYS_X86_64 -+#define LLVM_TOOLS_LLVM_BOLT_SYS_X86_64 -+ -+// Save all registers while keeping 16B stack alignment -+#define SAVE_ALL \ -+ "push %%rax\n" \ -+ "push %%rbx\n" \ -+ "push %%rcx\n" \ -+ "push %%rdx\n" \ -+ "push %%rdi\n" \ -+ "push %%rsi\n" \ -+ "push %%rbp\n" \ -+ "push %%r8\n" \ -+ "push %%r9\n" \ -+ "push %%r10\n" \ -+ "push %%r11\n" \ -+ "push %%r12\n" \ -+ "push %%r13\n" \ -+ "push %%r14\n" \ -+ "push %%r15\n" \ -+ "sub $8, %%rsp\n" -+// Mirrors SAVE_ALL -+#define RESTORE_ALL \ -+ "add $8, %%rsp\n" \ -+ "pop %%r15\n" \ -+ "pop %%r14\n" \ -+ "pop %%r13\n" \ -+ "pop %%r12\n" \ -+ "pop %%r11\n" \ -+ "pop %%r10\n" \ -+ "pop %%r9\n" \ -+ "pop %%r8\n" \ -+ "pop %%rbp\n" \ -+ "pop %%rsi\n" \ -+ "pop %%rdi\n" \ -+ "pop %%rdx\n" \ -+ "pop %%rcx\n" \ -+ "pop %%rbx\n" \ -+ "pop %%rax\n" -+ -+namespace { -+ -+// Get the difference between runtime addrress of .text section and -+// static address in section header table. Can be extracted from arbitrary -+// pc value recorded at runtime to get the corresponding static address, which -+// in turn can be used to search for indirect call description. Needed because -+// indirect call descriptions are read-only non-relocatable data. -+uint64_t getTextBaseAddress() { -+ uint64_t DynAddr; -+ uint64_t StaticAddr; -+ __asm__ volatile("leaq __hot_end(%%rip), %0\n\t" -+ "movabsq $__hot_end, %1\n\t" -+ : "=r"(DynAddr), "=r"(StaticAddr)); -+ return DynAddr - StaticAddr; -+} -+ -+#define _STRINGIFY(x) #x -+#define STRINGIFY(x) _STRINGIFY(x) -+ -+uint64_t __read(uint64_t fd, const void *buf, uint64_t count) { -+ uint64_t ret; -+#if defined(__APPLE__) -+#define READ_SYSCALL 0x2000003 -+#else -+#define READ_SYSCALL 0 -+#endif -+ __asm__ __volatile__("movq $" STRINGIFY(READ_SYSCALL) ", %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : "D"(fd), "S"(buf), "d"(count) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+uint64_t __write(uint64_t fd, const void *buf, uint64_t count) { -+ uint64_t ret; -+#if defined(__APPLE__) -+#define WRITE_SYSCALL 0x2000004 -+#else -+#define WRITE_SYSCALL 1 -+#endif -+ __asm__ __volatile__("movq $" STRINGIFY(WRITE_SYSCALL) ", %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : "D"(fd), "S"(buf), "d"(count) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags, -+ uint64_t fd, uint64_t offset) { -+#if defined(__APPLE__) -+#define MMAP_SYSCALL 0x20000c5 -+#else -+#define MMAP_SYSCALL 9 -+#endif -+ void *ret; -+ register uint64_t r8 asm("r8") = fd; -+ register uint64_t r9 asm("r9") = offset; -+ register uint64_t r10 asm("r10") = flags; -+ __asm__ __volatile__("movq $" STRINGIFY(MMAP_SYSCALL) ", %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8), -+ "r"(r9) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+uint64_t __munmap(void *addr, uint64_t size) { -+#if defined(__APPLE__) -+#define MUNMAP_SYSCALL 0x2000049 -+#else -+#define MUNMAP_SYSCALL 11 -+#endif -+ uint64_t ret; -+ __asm__ __volatile__("movq $" STRINGIFY(MUNMAP_SYSCALL) ", %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : "D"(addr), "S"(size) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+uint64_t __sigprocmask(int how, const void *set, void *oldset) { -+#if defined(__APPLE__) -+#define SIGPROCMASK_SYSCALL 0x2000030 -+#else -+#define SIGPROCMASK_SYSCALL 14 -+#endif -+ uint64_t ret; -+ register long r10 asm("r10") = sizeof(uint64_t); -+ __asm__ __volatile__("movq $" STRINGIFY(SIGPROCMASK_SYSCALL) ", %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : "D"(how), "S"(set), "d"(oldset), "r"(r10) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+uint64_t __getpid() { -+ uint64_t ret; -+#if defined(__APPLE__) -+#define GETPID_SYSCALL 20 -+#else -+#define GETPID_SYSCALL 39 -+#endif -+ __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+uint64_t __exit(uint64_t code) { -+#if defined(__APPLE__) -+#define EXIT_SYSCALL 0x2000001 -+#else -+#define EXIT_SYSCALL 231 -+#endif -+ uint64_t ret; -+ __asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : "D"(code) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+#if !defined(__APPLE__) -+// We use a stack-allocated buffer for string manipulation in many pieces of -+// this code, including the code that prints each line of the fdata file. This -+// buffer needs to accomodate large function names, but shouldn't be arbitrarily -+// large (dynamically allocated) for simplicity of our memory space usage. -+ -+// Declare some syscall wrappers we use throughout this code to avoid linking -+// against system libc. -+uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) { -+ uint64_t ret; -+ __asm__ __volatile__("movq $2, %%rax\n" -+ "syscall" -+ : "=a"(ret) -+ : "D"(pathname), "S"(flags), "d"(mode) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+long __getdents64(unsigned int fd, dirent64 *dirp, size_t count) { -+ long ret; -+ __asm__ __volatile__("movq $217, %%rax\n" -+ "syscall" -+ : "=a"(ret) -+ : "D"(fd), "S"(dirp), "d"(count) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) { -+ uint64_t ret; -+ __asm__ __volatile__("movq $89, %%rax\n" -+ "syscall" -+ : "=a"(ret) -+ : "D"(pathname), "S"(buf), "d"(bufsize) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) { -+ uint64_t ret; -+ __asm__ __volatile__("movq $8, %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : "D"(fd), "S"(pos), "d"(whence) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+int __ftruncate(uint64_t fd, uint64_t length) { -+ int ret; -+ __asm__ __volatile__("movq $77, %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : "D"(fd), "S"(length) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+int __close(uint64_t fd) { -+ uint64_t ret; -+ __asm__ __volatile__("movq $3, %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : "D"(fd) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+int __madvise(void *addr, size_t length, int advice) { -+ int ret; -+ __asm__ __volatile__("movq $28, %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : "D"(addr), "S"(length), "d"(advice) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+int __uname(struct UtsNameTy *Buf) { -+ int Ret; -+ __asm__ __volatile__("movq $63, %%rax\n" -+ "syscall\n" -+ : "=a"(Ret) -+ : "D"(Buf) -+ : "cc", "rcx", "r11", "memory"); -+ return Ret; -+} -+ -+uint64_t __nanosleep(const timespec *req, timespec *rem) { -+ uint64_t ret; -+ __asm__ __volatile__("movq $35, %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : "D"(req), "S"(rem) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+int64_t __fork() { -+ uint64_t ret; -+ __asm__ __volatile__("movq $57, %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+int __mprotect(void *addr, size_t len, int prot) { -+ int ret; -+ __asm__ __volatile__("movq $10, %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : "D"(addr), "S"(len), "d"(prot) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+uint64_t __getppid() { -+ uint64_t ret; -+ __asm__ __volatile__("movq $110, %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+int __setpgid(uint64_t pid, uint64_t pgid) { -+ int ret; -+ __asm__ __volatile__("movq $109, %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : "D"(pid), "S"(pgid) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+uint64_t __getpgid(uint64_t pid) { -+ uint64_t ret; -+ __asm__ __volatile__("movq $121, %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : "D"(pid) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+int __kill(uint64_t pid, int sig) { -+ int ret; -+ __asm__ __volatile__("movq $62, %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : "D"(pid), "S"(sig) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+int __fsync(int fd) { -+ int ret; -+ __asm__ __volatile__("movq $74, %%rax\n" -+ "syscall\n" -+ : "=a"(ret) -+ : "D"(fd) -+ : "cc", "rcx", "r11", "memory"); -+ return ret; -+} -+ -+// %rdi %rsi %rdx %r10 %r8 -+// sys_prctl int option unsigned unsigned unsigned unsigned -+// long arg2 long arg3 long arg4 long arg5 -+int __prctl(int Option, unsigned long Arg2, unsigned long Arg3, -+ unsigned long Arg4, unsigned long Arg5) { -+ int Ret; -+ register long rdx asm("rdx") = Arg3; -+ register long r8 asm("r8") = Arg5; -+ register long r10 asm("r10") = Arg4; -+ __asm__ __volatile__("movq $157, %%rax\n" -+ "syscall\n" -+ : "=a"(Ret) -+ : "D"(Option), "S"(Arg2), "d"(rdx), "r"(r10), "r"(r8) -+ :); -+ return Ret; -+} -+ -+#endif -+ -+} // anonymous namespace -+ -+#endif -diff --git a/bolt/test/AArch64/exclusive-instrument.s b/bolt/test/AArch64/exclusive-instrument.s -new file mode 100644 -index 0000000..502dd83 ---- /dev/null -+++ b/bolt/test/AArch64/exclusive-instrument.s -@@ -0,0 +1,39 @@ -+// This test checks that the foo function having exclusive memory access -+// instructions won't be instrumented. -+ -+// REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}} -+ -+// RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ -+// RUN: %s -o %t.o -+// RUN: %clang %cflags -fPIC -pie %t.o -o %t.exe -nostdlib -Wl,-q -Wl,-fini=dummy -+// RUN: llvm-bolt %t.exe -o %t.bolt -instrument -v=1 | FileCheck %s -+ -+// CHECK: Function foo has exclusive instructions, skip instrumentation -+ -+.global foo -+.type foo, %function -+foo: -+ ldaxr w9, [x10] -+ cbnz w9, .Lret -+ stlxr w12, w11, [x9] -+ cbz w12, foo -+ clrex -+.Lret: -+ ret -+.size foo, .-foo -+ -+.global _start -+.type _start, %function -+_start: -+ cmp x0, #0 -+ b.eq .Lexit -+ bl foo -+.Lexit: -+ ret -+.size _start, .-_start -+ -+.global dummy -+.type dummy, %function -+dummy: -+ ret -+.size dummy, .-dummy -diff --git a/bolt/test/X86/asm-dump.c b/bolt/test/X86/asm-dump.c -index 5d85e2a..fdd448e 100644 ---- a/bolt/test/X86/asm-dump.c -+++ b/bolt/test/X86/asm-dump.c -@@ -1,13 +1,14 @@ - /** - * Test for asm-dump functionality. - * -- * REQUIRES: system-linux,bolt-runtime -+ * REQUIRES: x86_64-linux,bolt-runtime - * - * Compile the source - * RUN: %clang -fPIC %s -o %t.exe -Wl,-q - * - * Profile collection: instrument the binary -- * RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata -o %t.instr -+ * RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata -o \ -+ * RUN: %t.instr - * - * Profile collection: run instrumented binary (and capture output) - * RUN: %t.instr > %t.result -diff --git a/bolt/test/X86/bolt-address-translation-internal-call.test b/bolt/test/X86/bolt-address-translation-internal-call.test -index edc32d9..24cb635 100644 ---- a/bolt/test/X86/bolt-address-translation-internal-call.test -+++ b/bolt/test/X86/bolt-address-translation-internal-call.test -@@ -4,12 +4,12 @@ - # internal calls) might create new blocks without a mapping to an - # input block. - --# REQUIRES: system-linux,bolt-runtime -+# REQUIRES: x86_64-linux,bolt-runtime - - # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o - # Delete our BB symbols so BOLT doesn't mark them as entry points - # RUN: llvm-strip --strip-unneeded %t.o --# RUN: %clang %t.o -o %t.exe -Wl,-q -+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q - - # RUN: llvm-bolt --enable-bat %t.exe --relocs -o %t.out | FileCheck %s - # CHECK: BOLT-INFO: Wrote {{.*}} BAT maps -@@ -29,6 +29,7 @@ main: - push %rbx - sub $0x120,%rsp - mov $0x3,%rbx -+ movq rel(%rip), %rdi - .J1: - cmp $0x0,%rbx - je .J2 -@@ -49,4 +50,8 @@ main: - .J4: - pop %rbp - retq -+end: - .size main, .-main -+ -+ .data -+rel: .quad end -diff --git a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp -index f6ebd6b..4ed8be4 100644 ---- a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp -+++ b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp -@@ -1,7 +1,7 @@ - // This test checks that .eh_frame_hdr address is in bounds of the last LOAD - // end address i.e. the section address is smaller then the LOAD end address. - --// REQUIRES: system-linux,bolt-runtime -+// REQUIRES: system-linux,bolt-runtime,target=x86_64{{.*}} - - // RUN: %clangxx %cxxflags -static -Wl,-q %s -o %t.exe -Wl,--entry=_start - // RUN: llvm-bolt %t.exe -o %t.instr -instrument \ -diff --git a/bolt/test/X86/internal-call-instrument.s b/bolt/test/X86/internal-call-instrument.s -index c137174..c393f1d 100644 ---- a/bolt/test/X86/internal-call-instrument.s -+++ b/bolt/test/X86/internal-call-instrument.s -@@ -1,15 +1,23 @@ - # This reproduces a bug with instrumentation crashes on internal call - --# REQUIRES: system-linux,bolt-runtime -+# REQUIRES: x86_64-linux,bolt-runtime,target=x86_64{{.*}} - - # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o - # Delete our BB symbols so BOLT doesn't mark them as entry points - # RUN: llvm-strip --strip-unneeded %t.o --# RUN: %clang %t.o -o %t.exe -Wl,-q -+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q - - # RUN: llvm-bolt --instrument %t.exe --relocs -o %t.out - - .text -+ .globl _start -+ .type _start, %function -+ .p2align 4 -+_start: -+ call main -+ ret -+ .size _start, .-_start -+ - .globl main - .type main, %function - .p2align 4 -@@ -20,6 +28,7 @@ main: - push %rbx - sub $0x120,%rsp - mov $0x3,%rbx -+ movq rel(%rip), %rdi - .J1: - cmp $0x0,%rbx - je .J2 -@@ -40,4 +49,15 @@ main: - .J4: - pop %rbp - retq -+end: - .size main, .-main -+ -+ .globl _fini -+ .type _fini, %function -+ .p2align 4 -+_fini: -+ hlt -+ .size _fini, .-_fini -+ -+ .data -+rel: .quad end -diff --git a/bolt/test/X86/tail-duplication-pass.s b/bolt/test/X86/tail-duplication-pass.s -index 677f498..ed50cc5 100644 ---- a/bolt/test/X86/tail-duplication-pass.s -+++ b/bolt/test/X86/tail-duplication-pass.s -@@ -7,12 +7,21 @@ - # RUN: llvm-bolt %t.exe --data %t.fdata --reorder-blocks=ext-tsp \ - # RUN: --print-finalized --tail-duplication=moderate \ - # RUN: --tail-duplication-minimum-offset=1 -o %t.out | FileCheck %s -+# RUN: llvm-bolt %t.exe --data %t.fdata --print-finalized \ -+# RUN: --tail-duplication=aggressive --tail-duplication-minimum-offset=1 \ -+# RUN: -o %t.out | FileCheck %s --check-prefix CHECK-NOLOOP - - # FDATA: 1 main 2 1 main #.BB2# 0 10 - # FDATA: 1 main 4 1 main #.BB2# 0 20 - # CHECK: BOLT-INFO: tail duplication modified 1 ({{.*}}%) functions; duplicated 1 blocks (1 bytes) responsible for {{.*}} dynamic executions ({{.*}}% of all block executions) - # CHECK: BB Layout : .LBB00, .Ltail-dup0, .Ltmp0, .Ltmp1 - -+# Check that the successor of Ltail-dup0 is .LBB00, not itself. -+# CHECK-NOLOOP: .Ltail-dup0 (1 instructions, align : 1) -+# CHECK-NOLOOP: Predecessors: .LBB00 -+# CHECK-NOLOOP: retq -+# CHECK-NOLOOP: .Ltmp0 (1 instructions, align : 1) -+ - .text - .globl main - .type main, %function -diff --git a/bolt/test/assume-abi.test b/bolt/test/assume-abi.test -new file mode 100644 -index 0000000..688ab01 ---- /dev/null -+++ b/bolt/test/assume-abi.test -@@ -0,0 +1,7 @@ -+# Validate the usage of the `--assume-abi` option in conjunction with -+# options related to the RegAnalysis Pass. -+ -+REQUIRES: system-linux -+ -+RUN: %clang %cflags %p/Inputs/hello.c -o %t -Wl,-q -+RUN: llvm-bolt %t -o %t.bolt --assume-abi --indirect-call-promotion=all -diff --git a/bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s b/bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s -new file mode 100644 -index 0000000..fa1ac35 ---- /dev/null -+++ b/bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s -@@ -0,0 +1,9 @@ -+ .globl main -+ .type main, %function -+main: -+ sub sp, sp, #16 -+ mov w0, wzr -+ str wzr, [sp, #12] -+ add sp, sp, #16 -+ ret -+.size main, .-main -diff --git a/bolt/test/runtime/AArch64/basic-instrumentation.test b/bolt/test/runtime/AArch64/basic-instrumentation.test -new file mode 100644 -index 0000000..0f77b0c ---- /dev/null -+++ b/bolt/test/runtime/AArch64/basic-instrumentation.test -@@ -0,0 +1,22 @@ -+# Try to instrument a very fast test. Input bin will not execute any code during -+# runtime besides returning zero in main, so it is a good trivial case. -+REQUIRES: system-linux,bolt-runtime -+ -+RUN: %clang %p/Inputs/basic-instrumentation.s -Wl,-q -o %t.exe -+RUN: llvm-bolt %t.exe -o %t --instrument \ -+RUN: --instrumentation-file=%t \ -+RUN: --instrumentation-file-append-pid -+ -+# Execute program to collect profile -+RUN: rm %t.*.fdata || echo Nothing to remove -+RUN: %t -+ -+# Profile should be written to %t.PID.fdata, check it -+RUN: mv %t.*.fdata %t.fdata -+RUN: cat %t.fdata | FileCheck -check-prefix=CHECK %s -+ -+# Check BOLT works with this profile -+RUN: llvm-bolt %t.exe --data %t.fdata -o %t.2 --reorder-blocks=cache -+ -+# The instrumented profile should at least say main was called once -+CHECK: main 0 0 1{{$}} -diff --git a/bolt/test/runtime/AArch64/instrumentation-ind-call.c b/bolt/test/runtime/AArch64/instrumentation-ind-call.c -new file mode 100644 -index 0000000..76ee8c0 ---- /dev/null -+++ b/bolt/test/runtime/AArch64/instrumentation-ind-call.c -@@ -0,0 +1,38 @@ -+#include -+ -+typedef int (*func_ptr)(int, int); -+ -+int add(int a, int b) { return a + b; } -+ -+int main() { -+ func_ptr fun; -+ fun = add; -+ int sum = fun(10, 20); // indirect call to 'add' -+ printf("The sum is: %d\n", sum); -+ return 0; -+} -+/* -+REQUIRES: system-linux,bolt-runtime -+ -+RUN: %clang %cflags %s -o %t.exe -Wl,-q -nopie -fpie -+ -+RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata \ -+RUN: -o %t.instrumented -+ -+# Instrumented program needs to finish returning zero -+RUN: %t.instrumented | FileCheck %s -check-prefix=CHECK-OUTPUT -+ -+# Test that the instrumented data makes sense -+RUN: llvm-bolt %t.exe -o %t.bolted --data %t.fdata \ -+RUN: --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \ -+RUN: --print-only=main --print-finalized | FileCheck %s -+ -+RUN: %t.bolted | FileCheck %s -check-prefix=CHECK-OUTPUT -+ -+CHECK-OUTPUT: The sum is: 30 -+ -+# Check that our indirect call has 1 hit recorded in the fdata file and that -+# this was processed correctly by BOLT -+CHECK: blr x8 # CallProfile: 1 (0 misses) : -+CHECK-NEXT: { add: 1 (0 misses) } -+*/ -diff --git a/bolt/test/runtime/X86/Inputs/exceptions_split.cpp b/bolt/test/runtime/Inputs/exceptions_split.cpp -similarity index 85% -rename from bolt/test/runtime/X86/Inputs/exceptions_split.cpp -rename to bolt/test/runtime/Inputs/exceptions_split.cpp -index 2c136b9..de81adf 100644 ---- a/bolt/test/runtime/X86/Inputs/exceptions_split.cpp -+++ b/bolt/test/runtime/Inputs/exceptions_split.cpp -@@ -3,31 +3,25 @@ - // - // Record performance data with no args. Run test with 2 args. - --#include - #include -+#include - --int foo() --{ -- return 0; --} -+int foo() { return 0; } - - void bar(int a) { - if (a > 2 && a % 2) - throw new int(); - } - --void filter_only(){ -- foo(); --} -+void filter_only() { foo(); } - --int main(int argc, char **argv) --{ -+int main(int argc, char **argv) { - unsigned r = 0; - - uint64_t limit = (argc >= 2 ? 10 : 5000); - for (uint64_t i = 0; i < limit; ++i) { - i += foo(); -- try { -+ try { - bar(argc); - try { - if (argc >= 2) -diff --git a/bolt/test/runtime/X86/instrumentation-tail-call.s b/bolt/test/runtime/X86/instrumentation-tail-call.s -index 792d084..dfb12f0 100644 ---- a/bolt/test/runtime/X86/instrumentation-tail-call.s -+++ b/bolt/test/runtime/X86/instrumentation-tail-call.s -@@ -14,6 +14,9 @@ - - # CHECK: leaq 0x80(%rsp), %rsp - -+# RUN: FileCheck %s --input-file %t.fdata --check-prefix=CHECK-FDATA -+# CHECK-FDATA: 1 main {{.*}} 1 targetFunc 0 0 1 -+ - .text - .globl main - .type main, %function -@@ -32,7 +35,8 @@ main: - movq %rbp, %rsp - pop %rbp - mov -0x10(%rsp),%rax -- jmp targetFunc -+ test %rsp, %rsp -+ jne targetFunc - - .LBBerror: - addq $0x20, %rsp -diff --git a/bolt/test/runtime/X86/exceptions-instrumentation.test b/bolt/test/runtime/exceptions-instrumentation.test -similarity index 100% -rename from bolt/test/runtime/X86/exceptions-instrumentation.test -rename to bolt/test/runtime/exceptions-instrumentation.test -diff --git a/bolt/test/runtime/X86/pie-exceptions-split.test b/bolt/test/runtime/pie-exceptions-split.test -similarity index 95% -rename from bolt/test/runtime/X86/pie-exceptions-split.test -rename to bolt/test/runtime/pie-exceptions-split.test -index 124fef6..30f2d02 100644 ---- a/bolt/test/runtime/X86/pie-exceptions-split.test -+++ b/bolt/test/runtime/pie-exceptions-split.test -@@ -16,9 +16,9 @@ RUN: --print-only=main 2>&1 | FileCheck %s - ## All calls to printf() should be from exception handling code that was - ## recorded as cold during the profile collection run. Check that the calls - ## are placed after the split point. --CHECK-NOT: callq printf -+CHECK-NOT: printf - CHECK: HOT-COLD SPLIT POINT --CHECK: callq printf -+CHECK: printf - - ## Verify the output still executes correctly when the exception path is being - ## taken. --- -2.39.5 (Apple Git-154) - diff --git a/0011-support-D-FOT-addrs-data-parsing-for-optimized-binary.patch b/0011-support-D-FOT-addrs-data-parsing-for-optimized-binary.patch index 8a3c9e074ef4a6ee4effb9a1cca285138649476d..b020a7870f02d994168af09d49e7777b32e80f5f 100644 --- a/0011-support-D-FOT-addrs-data-parsing-for-optimized-binary.patch +++ b/0011-support-D-FOT-addrs-data-parsing-for-optimized-binary.patch @@ -179,7 +179,7 @@ index 24dbe34..509e7c9 100644 + ErrorOr Count = parseNumberField(FieldSeparator, true); + if (std::error_code EC = Count.getError()) + return EC; -+ ++ + if (!checkAndConsumeNewLine()) { + reportError("expected end of line"); + return make_error_code(llvm::errc::io_error); @@ -211,7 +211,7 @@ index 24dbe34..509e7c9 100644 + ErrorOr KperfEntry = parseLibkperfDataEntry(); + if (std::error_code EC = KperfEntry.getError()) + return EC; -+ ++ + BasicSamples[KperfEntry->Addr] += KperfEntry->Count; + } + diff --git a/0012-Add-Om-for-Kunpeng-Opts.patch b/0012-Add-Om-for-Kunpeng-Opts.patch index 0f8a8eed318725ffb1936a98e54b0a156eb82de2..17046e36588074c18791ea16c3bb6bf1e474e65d 100644 --- a/0012-Add-Om-for-Kunpeng-Opts.patch +++ b/0012-Add-Om-for-Kunpeng-Opts.patch @@ -168,11 +168,11 @@ index 2fc99f6..359cd05 100644 cl::desc("ignore recursive calls when constructing the call graph"), cl::init(true), cl::cat(BoltOptCategory)); --static cl::opt -+cl::opt - CgUseSplitHotSize("cg-use-split-hot-size", - cl::desc("use hot/cold data on basic blocks to determine hot sizes for " - "call graph functions"), +-static cl::opt CgUseSplitHotSize( ++cl::opt CgUseSplitHotSize( + "cg-use-split-hot-size", + cl::desc("use hot/cold data on basic blocks to determine hot sizes for " + "call graph functions"), diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp index 34973ce..e934b75 100644 --- a/bolt/lib/Passes/SplitFunctions.cpp diff --git a/0014-Fix-compilation-error.patch b/0014-Fix-compilation-error.patch new file mode 100644 index 0000000000000000000000000000000000000000..0edb60ad1c3fc3c37461112775a77722364c3348 --- /dev/null +++ b/0014-Fix-compilation-error.patch @@ -0,0 +1,24 @@ +From 19362cc8cbb9c4e8fc06a81ed0e887f249142c6a Mon Sep 17 00:00:00 2001 +From: eastb233 +Date: Thu, 28 Aug 2025 17:05:53 +0800 +Subject: [PATCH] Fix compilation error + +--- + .../include/llvm/Transforms/Instrumentation/PGOInstrumentation.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h b/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h +index 6d2ad3d75744..5b1977b7de9a 100644 +--- a/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h ++++ b/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h +@@ -18,6 +18,7 @@ + #include "llvm/ADT/ArrayRef.h" + #include "llvm/ADT/IntrusiveRefCntPtr.h" + #include "llvm/IR/PassManager.h" ++#include "llvm/Support/CommandLine.h" + #include + #include + +-- +2.43.0 + diff --git a/llvm-bolt.spec b/llvm-bolt.spec index 654fef005e26cebb936f34d783e5ea0dbfc044d5..c2f786273f8e3dac9b08d6dcd81352c35d4c55f6 100644 --- a/llvm-bolt.spec +++ b/llvm-bolt.spec @@ -9,8 +9,8 @@ %global maj_ver 17 %global min_ver 0 %global patch_ver 6 -%global bolt_version %{maj_ver}.%{min_ver}.%{patch_ver} -%global bolt_srcdir llvm-project-%{bolt_version}.src +%global bolt_version llvm-for-oE-17.0.6-2509.0.1 +%global bolt_srcdir llvm-project-%{bolt_version} %if %{with sys_llvm} %global pkg_name llvm-bolt @@ -26,28 +26,25 @@ %global max_link_jobs 2 Name: %{pkg_name} -Version: %{bolt_version} -Release: 7 +Version: %{maj_ver}.%{min_ver}.%{patch_ver} +Release: 8 Summary: BOLT is a post-link optimizer developed to speed up large applications License: Apache-2.0 -URL: https://github.com/llvm/llvm-project/tree/main/bolt +URL: https://gitee.com/openeuler/llvm-project -Source0: https://github.com/llvm/llvm-project/releases/download/llvmorg-%{bolt_version}/%{bolt_srcdir}.tar.xz -Source1: https://github.com/llvm/llvm-project/releases/download/llvmorg-%{bolt_version}/%{bolt_srcdir}.tar.xz.sig +Source0: llvm-project-%{bolt_version}.tar.gz -Patch1: 0001-Fix-trap-value-for-non-X86.patch -Patch2: 0002-Add-test-for-emitting-trap-value.patch Patch3: 0003-AArch64-Add-AArch64-support-for-inline.patch Patch4: 0004-Bolt-Solving-pie-support-issue.patch Patch5: 0005-BOLT-AArch64-Don-t-change-layout-in-PatchEntries.patch Patch6: 0006-AArch64-Add-CFG-block-count-correction-optimization.patch Patch7: 0007-BOLT-Skip-PLT-search-for-zero-value-weak-reference-symbols.patch Patch8: 0008-merge-fdata-Support-process-no_lbr-profile-file.patch -Patch9: 0009-support-aarch64-instrumentation.patch Patch10: 0010-AArch64-Add-hybrid-guess-approach-for-edge-weight-estimation.patch Patch11: 0011-support-D-FOT-addrs-data-parsing-for-optimized-binary.patch Patch12: 0012-Add-Om-for-Kunpeng-Opts.patch Patch13: 0013-add-strict-to-Om.patch +Patch14: 0014-Fix-compilation-error.patch BuildRequires: gcc BuildRequires: gcc-c++ @@ -161,6 +158,12 @@ rm -f %{buildroot}/%{_builddir}/%{bolt_srcdir}/%{__cmake_builddir}/%{_lib}/lib*. %doc %{install_docdir} %changelog +* Sat Aug 30 2025 eastb233 17.0.6-8 +- Type: Update +- ID:NA +- SUG:NA +- DESC: Update https://gitee.com/openeuler/llvm-project/archive/refs/tags/llvm-for-oE-17.0.6-2509.0.1.tar.gz + * Thu May 22 2025 rfwang07 17.0.6-7 - Type:backport - ID:NA diff --git a/llvm-project-17.0.6.src.tar.xz.sig b/llvm-project-17.0.6.src.tar.xz.sig deleted file mode 100644 index 59c3713127177606f6f6d95bf61790e3d7ad15cf..0000000000000000000000000000000000000000 Binary files a/llvm-project-17.0.6.src.tar.xz.sig and /dev/null differ diff --git a/llvm-project-17.0.6.src.tar.xz b/llvm-project-llvm-for-oE-17.0.6-2509.0.1.tar.gz similarity index 32% rename from llvm-project-17.0.6.src.tar.xz rename to llvm-project-llvm-for-oE-17.0.6-2509.0.1.tar.gz index 9a7fe059772df27d4d1f353636990ff8a8f33c4f..df9aef5c5ba96d6b6c1b13397801837eac539d25 100644 --- a/llvm-project-17.0.6.src.tar.xz +++ b/llvm-project-llvm-for-oE-17.0.6-2509.0.1.tar.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58a8818c60e6627064f312dbf46c02d9949956558340938b71cf731ad8bc0813 -size 127838860 +oid sha256:3437c2ca8c6e2fc0083e3db05e884bd5cd2c124ba51b902a8cb2e5bb13312a04 +size 212672038