diff --git a/0001-Fix-trap-value-for-non-X86.patch b/0001-Fix-trap-value-for-non-X86.patch
new file mode 100644
index 0000000000000000000000000000000000000000..83542e4a2372b0dafd3cff6cb91677e6335a9e1d
--- /dev/null
+++ b/0001-Fix-trap-value-for-non-X86.patch
@@ -0,0 +1,126 @@
+From 868d8c360b3e1e5f291cb3e0dae0777a4529228f Mon Sep 17 00:00:00 2001
+From: Denis Revunov
+Date: Thu, 27 Jul 2023 11:48:08 -0400
+Subject: [PATCH] Fix trap value for non-X86
+
+The trap value used by BOLT was assumed to be a single-byte instruction.
+This made some functions unaligned on AArch64 (e.g., in the
+exceptions-instrumentation test) and caused emission failures. Fix that by
+changing the fill value to a StringRef.
+
+Reviewed By: rafauler
+
+Differential Revision: https://reviews.llvm.org/D158191
+---
+ bolt/include/bolt/Core/MCPlusBuilder.h | 9 ++++++---
+ bolt/lib/Core/BinaryEmitter.cpp | 4 ++--
+ bolt/lib/Rewrite/RewriteInstance.cpp | 6 ++++--
+ bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 4 ++++
+ bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp | 4 ++++
+ bolt/lib/Target/X86/X86MCPlusBuilder.cpp | 2 +-
+ 6 files changed, 21 insertions(+), 8 deletions(-)
+
+diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
+index 56d0228cd..beb06751d 100644
+--- a/bolt/include/bolt/Core/MCPlusBuilder.h
++++ b/bolt/include/bolt/Core/MCPlusBuilder.h
+@@ -636,9 +636,12 @@ public:
+     return false;
+   }
+
+-  /// If non-zero, this is used to fill the executable space with instructions
+-  /// that will trap. Defaults to 0.
+-  virtual unsigned getTrapFillValue() const { return 0; }
++  /// Used to fill the executable space with instructions
++  /// that will trap.
++  virtual StringRef getTrapFillValue() const {
++    llvm_unreachable("not implemented");
++    return StringRef();
++  }
+
+   /// Interface and basic functionality of a MCInstMatcher. The idea is to make
+   /// it easy to match one or more MCInsts against a tree-like pattern and
+diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
+index c4129615a..df076c81d 100644
+--- a/bolt/lib/Core/BinaryEmitter.cpp
++++ b/bolt/lib/Core/BinaryEmitter.cpp
+@@ -376,7 +376,7 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function,
+   }
+
+   if (opts::MarkFuncs)
+-    Streamer.emitIntValue(BC.MIB->getTrapFillValue(), 1);
++    Streamer.emitBytes(BC.MIB->getTrapFillValue());
+
+   // Emit CFI end
+   if (Function.hasCFI())
+@@ -420,7 +420,7 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF,
+     // case, the call site entries in that LSDA have 0 as offset to the landing
+     // pad, which the runtime interprets as "no handler". To prevent this,
+     // insert some padding.
+     Streamer.emitBytes(BC.MIB->getTrapFillValue());
+   }
+
+   // Track the first emitted instruction with debug info.
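The two emitter hunks above replace a one-byte emitIntValue() with emitBytes(), so the trap fill is always written in whole instruction-sized units. Below is a minimal standalone sketch of that filling discipline, not BOLT code: the helper name fillWithTrap is made up here, while the 4-byte all-zero AArch64 trap pattern comes from the target hunks in this patch.

#include <cstddef>
#include <cstring>
#include <string_view>

// Fill a code region with copies of a multi-byte trap pattern. On AArch64
// every instruction slot is 4 bytes, so only whole 4-byte slots are written
// and instruction alignment is preserved; a single-byte 0xCC-style fill
// would leave subsequent instructions misaligned.
static void fillWithTrap(char *Buf, size_t Size, std::string_view Trap) {
  const size_t NumSlots = Size / Trap.size(); // whole instructions only
  for (size_t I = 0; I < NumSlots; ++I)
    std::memcpy(Buf + I * Trap.size(), Trap.data(), Trap.size());
}

int main() {
  char Region[10] = {};
  // 4-byte AArch64 trap pattern from this patch; 2 slots fit, 2 bytes remain.
  fillWithTrap(Region, sizeof(Region), std::string_view("\0\0\0\0", 4));
  return 0;
}

This mirrors the RewriteInstance::rewriteFile() loop in the next hunk, which computes BF.getMaxSize() / TrapInstr.size() iterations instead of writing one byte per iteration.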
+diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
+index fe8c134b8..c6ea0b009 100644
+--- a/bolt/lib/Rewrite/RewriteInstance.cpp
++++ b/bolt/lib/Rewrite/RewriteInstance.cpp
+@@ -5273,8 +5273,10 @@ void RewriteInstance::rewriteFile() {
+     if (!BF.getFileOffset() || !BF.isEmitted())
+       continue;
+     OS.seek(BF.getFileOffset());
+-    for (unsigned I = 0; I < BF.getMaxSize(); ++I)
+-      OS.write((unsigned char)BC->MIB->getTrapFillValue());
++    StringRef TrapInstr = BC->MIB->getTrapFillValue();
++    unsigned NInstr = BF.getMaxSize() / TrapInstr.size();
++    for (unsigned I = 0; I < NInstr; ++I)
++      OS.write(TrapInstr.data(), TrapInstr.size());
+   }
+   OS.seek(SavedPos);
+ }
+diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+index acf21ba23..cd66b654e 100644
+--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
++++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+@@ -1142,6 +1142,10 @@ public:
+     }
+   }
+
++  StringRef getTrapFillValue() const override {
++    return StringRef("\0\0\0\0", 4);
++  }
++
+   bool createReturn(MCInst &Inst) const override {
+     Inst.setOpcode(AArch64::RET);
+     Inst.clear();
+diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
+index ec5bca852..badc1bde8 100644
+--- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
++++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
+@@ -171,6 +171,10 @@ public:
+     return true;
+   }
+
++  StringRef getTrapFillValue() const override {
++    return StringRef("\0\0\0\0", 4);
++  }
++
+   bool analyzeBranch(InstructionIterator Begin, InstructionIterator End,
+                      const MCSymbol *&TBB, const MCSymbol *&FBB,
+                      MCInst *&CondBranch,
+diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+index 3ee161d0b..5e3c01a1c 100644
+--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
++++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+@@ -397,7 +397,7 @@ public:
+     }
+   }
+
+-  unsigned getTrapFillValue() const override { return 0xCC; }
++  StringRef getTrapFillValue() const override { return StringRef("\314", 1); }
+
+   struct IndJmpMatcherFrag1 : MCInstMatcher {
+     std::unique_ptr<MCInstMatcher> Base;
+--
+2.33.0
+
diff --git a/0002-Add-test-for-emitting-trap-value.patch b/0002-Add-test-for-emitting-trap-value.patch
new file mode 100644
index 0000000000000000000000000000000000000000..8cc1c6d8308dc848072e0b6be000f9fc12c96068
--- /dev/null
+++ b/0002-Add-test-for-emitting-trap-value.patch
@@ -0,0 +1,44 @@
+From e4ae238a42296a84bc819dd1fb61f3c699952f17 Mon Sep 17 00:00:00 2001
+From: Denis Revunov
+Date: Thu, 17 Aug 2023 18:30:07 +0300
+Subject: [PATCH] Add test for emitting trap value
+
+Reviewed By: rafauler
+
+Differential Revision: https://reviews.llvm.org/D158191
+---
+ bolt/test/runtime/mark-funcs.c | 22 ++++++++++++++++++++++
+ 1 file changed, 22 insertions(+)
+ create mode 100644 bolt/test/runtime/mark-funcs.c
+
+diff --git a/bolt/test/runtime/mark-funcs.c b/bolt/test/runtime/mark-funcs.c
+new file mode 100644
+index 000000000..a8586ca8b
+--- /dev/null
++++ b/bolt/test/runtime/mark-funcs.c
+@@ -0,0 +1,22 @@
++#include <stdio.h>
++
++int dummy() {
++  printf("Dummy called\n");
++  return 0;
++}
++
++int main(int argc, char **argv) {
++  if (dummy() != 0)
++    return 1;
++  printf("Main called\n");
++  return 0;
++}
++// Check that emitting the trap value works properly and
++// does not break functions
++// REQUIRES: system-linux
++// RUN: %clangxx -Wl,-q %s -o %t.exe
++// RUN: %t.exe | FileCheck %s
++// CHECK: Dummy called
++//
CHECK-NEXT: Main called ++// RUN: llvm-bolt %t.exe -o %t.exe.bolt -lite=false --mark-funcs ++// RUN: %t.exe.bolt | FileCheck %s +-- +2.33.0 + diff --git a/0003-AArch64-Add-AArch64-support-for-inline.patch b/0003-AArch64-Add-AArch64-support-for-inline.patch new file mode 100644 index 0000000000000000000000000000000000000000..cb64595fbbeddb127f769c8facf7676b1178cdf9 --- /dev/null +++ b/0003-AArch64-Add-AArch64-support-for-inline.patch @@ -0,0 +1,274 @@ +From a09ea2c3534d12f194f740180e09a229e0b2200f Mon Sep 17 00:00:00 2001 +From: xiongzhou4 +Date: Wed, 12 Jun 2024 17:12:36 +0800 +Subject: [PATCH 1/2] [AArch64] Add AArch64 support for inline. + +--- + bolt/include/bolt/Core/MCPlusBuilder.h | 5 +-- + bolt/lib/Passes/Inliner.cpp | 31 +++++++++++++++++++ + .../Target/AArch64/AArch64MCPlusBuilder.cpp | 10 ++++++ + bolt/test/AArch64/Inputs/inline-foo.c | 5 +++ + bolt/test/AArch64/Inputs/inline-main.c | 5 +++ + bolt/test/AArch64/Inputs/inlined.cpp | 23 ++++++++++++++ + bolt/test/AArch64/Inputs/inlinee.cpp | 3 ++ + bolt/test/AArch64/Inputs/jmp_opt.cpp | 7 +++++ + bolt/test/AArch64/Inputs/jmp_opt2.cpp | 3 ++ + bolt/test/AArch64/Inputs/jmp_opt3.cpp | 3 ++ + bolt/test/AArch64/inline-debug-info.test | 20 ++++++++++++ + bolt/test/AArch64/inlined-function-mixed.test | 11 +++++++ + bolt/test/AArch64/jmp-optimization.test | 14 +++++++++ + 13 files changed, 136 insertions(+), 4 deletions(-) + create mode 100644 bolt/test/AArch64/Inputs/inline-foo.c + create mode 100644 bolt/test/AArch64/Inputs/inline-main.c + create mode 100644 bolt/test/AArch64/Inputs/inlined.cpp + create mode 100644 bolt/test/AArch64/Inputs/inlinee.cpp + create mode 100644 bolt/test/AArch64/Inputs/jmp_opt.cpp + create mode 100644 bolt/test/AArch64/Inputs/jmp_opt2.cpp + create mode 100644 bolt/test/AArch64/Inputs/jmp_opt3.cpp + create mode 100644 bolt/test/AArch64/inline-debug-info.test + create mode 100644 bolt/test/AArch64/inlined-function-mixed.test + create mode 100644 bolt/test/AArch64/jmp-optimization.test + +diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h +index db3f7e7f1..56d0228cd 100644 +--- a/bolt/include/bolt/Core/MCPlusBuilder.h ++++ b/bolt/include/bolt/Core/MCPlusBuilder.h +@@ -573,10 +573,7 @@ public: + return 0; + } + +- virtual bool isPush(const MCInst &Inst) const { +- llvm_unreachable("not implemented"); +- return false; +- } ++ virtual bool isPush(const MCInst &Inst) const { return false; } + + /// Return the width, in bytes, of the memory access performed by \p Inst, if + /// this is a push instruction. Return zero otherwise. +diff --git a/bolt/lib/Passes/Inliner.cpp b/bolt/lib/Passes/Inliner.cpp +index 8dcb8934f..67dd294fb 100644 +--- a/bolt/lib/Passes/Inliner.cpp ++++ b/bolt/lib/Passes/Inliner.cpp +@@ -465,6 +465,37 @@ bool Inliner::inlineCallsInFunction(BinaryFunction &Function) { + << ". Size change: " << SizeAfterInlining + << " bytes.\n"); + ++// Skip situations where some A64 instructions can't be inlined: ++// # Indirect branch, e.g., BR. ++// # Branch instructions but used to make a function call. 
++ if (BC.isAArch64()) { ++ auto &MIB = *BC.MIB; ++ bool skip = false; ++ for (const BinaryBasicBlock &BB : *TargetFunction) { ++ for (MCInst Inst : BB) { ++ if (MIB.isPseudo(Inst)) ++ continue; ++ ++ MIB.stripAnnotations(Inst, false); ++ ++ if (MIB.isBranch(Inst)) { ++ const BinaryBasicBlock *TargetBB = ++ TargetFunction->getBasicBlockForLabel(MIB.getTargetSymbol(Inst)); ++ if (MIB.isIndirectBranch(Inst) || !TargetBB) { ++ skip = true; ++ break; ++ } ++ } ++ } ++ if (skip) ++ break; ++ } ++ if (skip) { ++ ++InstIt; ++ continue; ++ } ++ } ++ + std::tie(BB, InstIt) = inlineCall(*BB, InstIt, *TargetFunction); + + DidInlining = true; +diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +index d109a5d52..acf21ba23 100644 +--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp ++++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +@@ -34,6 +34,8 @@ public: + const MCRegisterInfo *RegInfo) + : MCPlusBuilder(Analysis, Info, RegInfo) {} + ++ MCPhysReg getStackPointer() const override { return AArch64::SP; } ++ + bool equals(const MCTargetExpr &A, const MCTargetExpr &B, + CompFuncTy Comp) const override { + const auto &AArch64ExprA = cast(A); +@@ -816,6 +818,14 @@ public: + + int getUncondBranchEncodingSize() const override { return 28; } + ++ bool createCall(MCInst &Inst, const MCSymbol *Target, ++ MCContext *Ctx) override { ++ Inst.setOpcode(AArch64::BL); ++ Inst.addOperand(MCOperand::createExpr( ++ MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx))); ++ return true; ++ } ++ + bool createTailCall(MCInst &Inst, const MCSymbol *Target, + MCContext *Ctx) override { + Inst.setOpcode(AArch64::B); +diff --git a/bolt/test/AArch64/Inputs/inline-foo.c b/bolt/test/AArch64/Inputs/inline-foo.c +new file mode 100644 +index 000000000..1307c13f2 +--- /dev/null ++++ b/bolt/test/AArch64/Inputs/inline-foo.c +@@ -0,0 +1,5 @@ ++#include "stub.h" ++ ++void foo() { ++ puts("Hello world!\n"); ++} +diff --git a/bolt/test/AArch64/Inputs/inline-main.c b/bolt/test/AArch64/Inputs/inline-main.c +new file mode 100644 +index 000000000..7853d2b63 +--- /dev/null ++++ b/bolt/test/AArch64/Inputs/inline-main.c +@@ -0,0 +1,5 @@ ++extern void foo(); ++int main() { ++ foo(); ++ return 0; ++} +diff --git a/bolt/test/AArch64/Inputs/inlined.cpp b/bolt/test/AArch64/Inputs/inlined.cpp +new file mode 100644 +index 000000000..a6ff9e262 +--- /dev/null ++++ b/bolt/test/AArch64/Inputs/inlined.cpp +@@ -0,0 +1,23 @@ ++extern "C" int printf(const char*, ...); ++extern const char* question(); ++ ++inline int answer() __attribute__((always_inline)); ++inline int answer() { return 42; } ++ ++int main(int argc, char *argv[]) { ++ int ans; ++ if (argc == 1) { ++ ans = 0; ++ } else { ++ ans = argc; ++ } ++ printf("%s\n", question()); ++ for (int i = 0; i < 10; ++i) { ++ int x = answer(); ++ int y = answer(); ++ ans += x - y; ++ } ++ // padding to make sure question() is inlineable ++ asm("nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;"); ++ return ans; ++} +diff --git a/bolt/test/AArch64/Inputs/inlinee.cpp b/bolt/test/AArch64/Inputs/inlinee.cpp +new file mode 100644 +index 000000000..edb7ab145 +--- /dev/null ++++ b/bolt/test/AArch64/Inputs/inlinee.cpp +@@ -0,0 +1,3 @@ ++const char* question() { ++ return "What do you get if you multiply six by nine?"; ++} +diff --git a/bolt/test/AArch64/Inputs/jmp_opt.cpp b/bolt/test/AArch64/Inputs/jmp_opt.cpp +new file mode 100644 +index 000000000..cd6d53c35 +--- /dev/null ++++ b/bolt/test/AArch64/Inputs/jmp_opt.cpp +@@ -0,0 
+1,7 @@ ++int g(); ++ ++int main() { ++ int x = g(); ++ int y = x*x; ++ return y; ++} +diff --git a/bolt/test/AArch64/Inputs/jmp_opt2.cpp b/bolt/test/AArch64/Inputs/jmp_opt2.cpp +new file mode 100644 +index 000000000..80b853d63 +--- /dev/null ++++ b/bolt/test/AArch64/Inputs/jmp_opt2.cpp +@@ -0,0 +1,3 @@ ++int f() { ++ return 0; ++} +diff --git a/bolt/test/AArch64/Inputs/jmp_opt3.cpp b/bolt/test/AArch64/Inputs/jmp_opt3.cpp +new file mode 100644 +index 000000000..7fb551163 +--- /dev/null ++++ b/bolt/test/AArch64/Inputs/jmp_opt3.cpp +@@ -0,0 +1,3 @@ ++int f(); ++ ++int g() { return f(); } +diff --git a/bolt/test/AArch64/inline-debug-info.test b/bolt/test/AArch64/inline-debug-info.test +new file mode 100644 +index 000000000..e20e5e31e +--- /dev/null ++++ b/bolt/test/AArch64/inline-debug-info.test +@@ -0,0 +1,20 @@ ++## Check that BOLT correctly prints and updates debug info for inlined ++## functions. ++ ++# REQUIRES: system-linux ++ ++# RUN: %clang %cflags -O1 -g %p/Inputs/inline-main.c %p/Inputs/inline-foo.c \ ++# RUN: -I%p/../Inputs -o %t.exe -Wl,-q ++# RUN: llvm-bolt %t.exe --update-debug-sections --print-debug-info \ ++# RUN: --print-only=main --print-after-lowering --force-inline=foo \ ++# RUN: -o %t.bolt \ ++# RUN: | FileCheck %s ++ ++## The call to puts() should come from inline-foo.c: ++# CHECK: callq {{.*}} # debug line {{.*}}inline-foo.c:4:3 ++ ++# RUN: llvm-objdump --disassemble-symbols=main -d --line-numbers %t.bolt \ ++# RUN: | FileCheck %s -check-prefix=CHECK-OBJDUMP ++ ++## Dump of main() should include debug info from inline-foo.c after inlining: ++# CHECK-OBJDUMP: inline-foo.c:4 +diff --git a/bolt/test/AArch64/inlined-function-mixed.test b/bolt/test/AArch64/inlined-function-mixed.test +new file mode 100644 +index 000000000..5a87bdde9 +--- /dev/null ++++ b/bolt/test/AArch64/inlined-function-mixed.test +@@ -0,0 +1,11 @@ ++# Make sure inlining from a unit with debug info into unit without ++# debug info does not cause a crash. ++ ++RUN: %clangxx %cxxflags %S/Inputs/inlined.cpp -c -o %T/inlined.o ++RUN: %clangxx %cxxflags %S/Inputs/inlinee.cpp -c -o %T/inlinee.o -g ++RUN: %clangxx %cxxflags %T/inlined.o %T/inlinee.o -o %t ++ ++RUN: llvm-bolt %t -o %t.bolt --update-debug-sections --reorder-blocks=reverse \ ++RUN: --inline-small-functions --force-inline=main | FileCheck %s ++ ++CHECK-NOT: BOLT: 0 out of {{.*}} functions were overwritten +diff --git a/bolt/test/AArch64/jmp-optimization.test b/bolt/test/AArch64/jmp-optimization.test +new file mode 100644 +index 000000000..92f4b9a14 +--- /dev/null ++++ b/bolt/test/AArch64/jmp-optimization.test +@@ -0,0 +1,14 @@ ++# Tests the optimization of functions that just do a tail call in the beginning. ++ ++# This test has commands that rely on shell capabilities that won't execute ++# correctly on Windows e.g. unsupported parameter expansion ++REQUIRES: shell ++ ++RUN: %clang %cflags -O2 %S/Inputs/jmp_opt{,2,3}.cpp -o %t ++RUN: llvm-bolt -inline-small-functions %t -o %t.bolt ++RUN: llvm-objdump -d %t.bolt --print-imm-hex | FileCheck %s ++ ++CHECK:
<main>:
++CHECK-NOT: call
++CHECK: xorl %eax, %eax
++CHECK: retq
+--
+2.33.0
+
diff --git a/0004-Bolt-Solving-pie-support-issue.patch b/0004-Bolt-Solving-pie-support-issue.patch
new file mode 100644
index 0000000000000000000000000000000000000000..b26d9fcdf1eb2fbfc4f02094a06ffcf3ea1fae2c
--- /dev/null
+++ b/0004-Bolt-Solving-pie-support-issue.patch
@@ -0,0 +1,170 @@
+From a28084a4adff2340dd02c2c0c42f4997f76b3ffa Mon Sep 17 00:00:00 2001
+From: rfwang07
+Date: Fri, 21 Jun 2024 11:16:44 +0800
+Subject: [PATCH] [Bolt] Solving pie support issue
+
+---
+ bolt/lib/Core/BinaryContext.cpp | 25 +++++++++++++++++++----
+ bolt/test/perf2bolt/Inputs/perf_test.c | 26 ++++++++++++++++++++++++
+ bolt/test/perf2bolt/Inputs/perf_test.lds | 13 ++++++++++++
+ bolt/test/perf2bolt/lit.local.cfg | 4 ++++
+ bolt/test/perf2bolt/perf_test.test | 17 ++++++++++++++++
+ bolt/unittests/Core/BinaryContext.cpp | 21 +++++++++++++++++++
+ 6 files changed, 102 insertions(+), 4 deletions(-)
+ create mode 100644 bolt/test/perf2bolt/Inputs/perf_test.c
+ create mode 100644 bolt/test/perf2bolt/Inputs/perf_test.lds
+ create mode 100644 bolt/test/perf2bolt/lit.local.cfg
+ create mode 100644 bolt/test/perf2bolt/perf_test.test
+
+diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
+index 2d2b35ee2..ab9f0b844 100644
+--- a/bolt/lib/Core/BinaryContext.cpp
++++ b/bolt/lib/Core/BinaryContext.cpp
+@@ -1880,10 +1880,27 @@ BinaryContext::getBaseAddressForMapping(uint64_t MMapAddress,
+   // Find a segment with a matching file offset.
+   for (auto &KV : SegmentMapInfo) {
+     const SegmentInfo &SegInfo = KV.second;
+-    if (alignDown(SegInfo.FileOffset, SegInfo.Alignment) == FileOffset) {
+-      // Use segment's aligned memory offset to calculate the base address.
+-      const uint64_t MemOffset = alignDown(SegInfo.Address, SegInfo.Alignment);
+-      return MMapAddress - MemOffset;
++    // FileOffset comes from a perf event and is equal to
++    // alignDown(SegInfo.FileOffset, pagesize).
++    // If the pagesize is not equal to SegInfo.Alignment, FileOffset and
++    // SegInfo.FileOffset must both be aligned down first before comparing
++    // them.
++    if (alignDown(SegInfo.FileOffset, SegInfo.Alignment) ==
++        alignDown(FileOffset, SegInfo.Alignment)) {
++      // The function's offset from the base address in the VAS is aligned by
++      // the pagesize rather than by SegInfo.Alignment, and the pagesize
++      // cannot be obtained from perf events. However, the ELF specification
++      // requires that SegInfo.FileOffset and SegInfo.Address be congruent
++      // modulo the pagesize.
++ // Reference: https://refspecs.linuxfoundation.org/elf/elf.pdf ++ ++ // So alignDown(SegInfo.Address, pagesize) can be calculated by: ++ // alignDown(SegInfo.Address, pagesize) ++ // = SegInfo.Address - (SegInfo.Address % pagesize) ++ // = SegInfo.Address - (SegInfo.FileOffset % pagesize) ++ // = SegInfo.Address - SegInfo.FileOffset + ++ // alignDown(SegInfo.FileOffset, pagesize) ++ // = SegInfo.Address - SegInfo.FileOffset + FileOffset ++ return MMapAddress - (SegInfo.Address - SegInfo.FileOffset + FileOffset); + } + } + +diff --git a/bolt/test/perf2bolt/Inputs/perf_test.c b/bolt/test/perf2bolt/Inputs/perf_test.c +new file mode 100644 +index 000000000..ff5ecf7a8 +--- /dev/null ++++ b/bolt/test/perf2bolt/Inputs/perf_test.c +@@ -0,0 +1,26 @@ ++#include ++#include ++#include ++ ++int add(int a, int b) { return a + b; } ++int minus(int a, int b) { return a - b; } ++int multiple(int a, int b) { return a * b; } ++int divide(int a, int b) { ++ if (b == 0) ++ return 0; ++ return a / b; ++} ++ ++int main() { ++ int a = 16; ++ int b = 8; ++ ++ for (int i = 1; i < 100000; i++) { ++ add(a, b); ++ minus(a, b); ++ multiple(a, b); ++ divide(a, b); ++ } ++ ++ return 0; ++} +diff --git a/bolt/test/perf2bolt/Inputs/perf_test.lds b/bolt/test/perf2bolt/Inputs/perf_test.lds +new file mode 100644 +index 000000000..9cb4ebbf1 +--- /dev/null ++++ b/bolt/test/perf2bolt/Inputs/perf_test.lds +@@ -0,0 +1,13 @@ ++SECTIONS { ++ . = SIZEOF_HEADERS; ++ .interp : { *(.interp) } ++ .note.gnu.build-id : { *(.note.gnu.build-id) } ++ . = 0x212e8; ++ .dynsym : { *(.dynsym) } ++ . = 0x31860; ++ .text : { *(.text*) } ++ . = 0x41c20; ++ .fini_array : { *(.fini_array) } ++ . = 0x54e18; ++ .data : { *(.data) } ++} +diff --git a/bolt/test/perf2bolt/lit.local.cfg b/bolt/test/perf2bolt/lit.local.cfg +new file mode 100644 +index 000000000..87a96ec34 +--- /dev/null ++++ b/bolt/test/perf2bolt/lit.local.cfg +@@ -0,0 +1,4 @@ ++import shutil ++ ++if shutil.which("perf") != None: ++ config.available_features.add("perf") +diff --git a/bolt/test/perf2bolt/perf_test.test b/bolt/test/perf2bolt/perf_test.test +new file mode 100644 +index 000000000..fe6e015ab +--- /dev/null ++++ b/bolt/test/perf2bolt/perf_test.test +@@ -0,0 +1,17 @@ ++# Check perf2bolt binary function which was compiled with pie ++ ++REQUIRES: system-linux, perf ++ ++RUN: %clang %S/Inputs/perf_test.c -fuse-ld=lld -Wl,--script=%S/Inputs/perf_test.lds -o %t ++RUN: perf record -e cycles:u -o %t2 -- %t ++RUN: perf2bolt %t -p=%t2 -o %t3 -nl -ignore-build-id 2>&1 | FileCheck %s ++ ++CHECK-NOT: PERF2BOLT-ERROR ++CHECK-NOT: !! WARNING !! This high mismatch ratio indicates the input binary is probably not the same binary used during profiling collection. ++ ++RUN: %clang %S/Inputs/perf_test.c -no-pie -fuse-ld=lld -o %t4 ++RUN: perf record -e cycles:u -o %t5 -- %t4 ++RUN: perf2bolt %t4 -p=%t5 -o %t6 -nl -ignore-build-id 2>&1 | FileCheck %s --check-prefix=CHECK-NO-PIE ++ ++CHECK-NO-PIE-NOT: PERF2BOLT-ERROR ++CHECK-NO-PIE-NOT: !! WARNING !! This high mismatch ratio indicates the input binary is probably not the same binary used during profiling collection. 
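Before the unit-test hunk that follows, the formula above can be sanity-checked against the numbers the BaseAddress2 test uses. This is a standalone check, not BOLT code: the segment values are copied from the test's segment table, and alignDown here is the usual power-of-two mask.

#include <cassert>
#include <cstdint>

// alignDown for power-of-two alignments only.
static uint64_t alignDown(uint64_t V, uint64_t Align) { return V & ~(Align - 1); }

int main() {
  // Segment at memory address 0x31860, file offset 0x21860, alignment 0x10000.
  const uint64_t SegAddress = 0x31860, SegFileOffset = 0x21860, Align = 0x10000;
  // Values as they would arrive from a perf mmap event.
  const uint64_t MMapAddress = 0xaaaaea444000, FileOffset = 0x21000;

  // Both file offsets align down to 0x20000, so this segment matches even
  // though FileOffset != SegInfo.FileOffset.
  assert(alignDown(SegFileOffset, Align) == alignDown(FileOffset, Align));

  // Base = MMapAddress - (SegInfo.Address - SegInfo.FileOffset + FileOffset)
  //      = 0xaaaaea444000 - (0x31860 - 0x21860 + 0x21000) = 0xaaaaea413000
  const uint64_t Base = MMapAddress - (SegAddress - SegFileOffset + FileOffset);
  assert(Base == 0xaaaaea413000ULL);
  return 0;
}

The second query in the unit test (file offset 0x11000) aligns down to 0x10000, which matches no segment's aligned file offset, so no base address is returned — exactly what the test's ASSERT_FALSE expects.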
+diff --git a/bolt/unittests/Core/BinaryContext.cpp b/bolt/unittests/Core/BinaryContext.cpp
+index bac264141..5a80cb4a2 100644
+--- a/bolt/unittests/Core/BinaryContext.cpp
++++ b/bolt/unittests/Core/BinaryContext.cpp
+@@ -83,3 +83,24 @@ TEST_P(BinaryContextTester, BaseAddress) {
+   BaseAddress = BC->getBaseAddressForMapping(0x7f13f5556000, 0x137a000);
+   ASSERT_FALSE(BaseAddress.has_value());
+ }
++
++TEST_P(BinaryContextTester, BaseAddress2) {
++  // Check that the base address calculation is correct for a binary whose
++  // ELF segment alignment differs from the pagesize.
++  // The segment layout is as follows:
++  BC->SegmentMapInfo[0] = SegmentInfo{0, 0x2177c, 0, 0x2177c, 0x10000};
++  BC->SegmentMapInfo[0x31860] =
++      SegmentInfo{0x31860, 0x370, 0x21860, 0x370, 0x10000};
++  BC->SegmentMapInfo[0x41c20] =
++      SegmentInfo{0x41c20, 0x1f8, 0x21c20, 0x1f8, 0x10000};
++  BC->SegmentMapInfo[0x54e18] =
++      SegmentInfo{0x54e18, 0x51, 0x24e18, 0x51, 0x10000};
++
++  std::optional<uint64_t> BaseAddress =
++      BC->getBaseAddressForMapping(0xaaaaea444000, 0x21000);
++  ASSERT_TRUE(BaseAddress.has_value());
++  ASSERT_EQ(*BaseAddress, 0xaaaaea413000ULL);
++
++  BaseAddress = BC->getBaseAddressForMapping(0xaaaaea444000, 0x11000);
++  ASSERT_FALSE(BaseAddress.has_value());
++}
+--
+2.39.2 (Apple Git-143)
+
diff --git a/0005-BOLT-AArch64-Don-t-change-layout-in-PatchEntries.patch b/0005-BOLT-AArch64-Don-t-change-layout-in-PatchEntries.patch
new file mode 100644
index 0000000000000000000000000000000000000000..eda8d214b70aac572d487ff8a70e5233955d4ba5
--- /dev/null
+++ b/0005-BOLT-AArch64-Don-t-change-layout-in-PatchEntries.patch
@@ -0,0 +1,130 @@
+From 28e7e71251dc4b79c29aa0d4904cb424f9081455 Mon Sep 17 00:00:00 2001
+From: rfwang07
+Date: Fri, 21 Jun 2024 11:23:42 +0800
+Subject: [PATCH] [BOLT][AArch64] Don't change layout in PatchEntries
+
+---
+ bolt/lib/Passes/PatchEntries.cpp | 11 ++++++++
+ bolt/test/AArch64/patch-entries.s | 36 ++++++++++++++++++++++++
+ bolt/unittests/Core/BinaryContext.cpp | 40 +++++++++++++++++++++++++++
+ 3 files changed, 87 insertions(+)
+ create mode 100644 bolt/test/AArch64/patch-entries.s
+
+diff --git a/bolt/lib/Passes/PatchEntries.cpp b/bolt/lib/Passes/PatchEntries.cpp
+index 02a044d8b..ee7512d89 100644
+--- a/bolt/lib/Passes/PatchEntries.cpp
++++ b/bolt/lib/Passes/PatchEntries.cpp
+@@ -98,6 +98,17 @@ void PatchEntries::runOnFunctions(BinaryContext &BC) {
+     });
+
+     if (!Success) {
++      // We can't change the output layout for AArch64 due to the LongJmp pass.
++      if (BC.isAArch64()) {
++        if (opts::ForcePatch) {
++          errs() << "BOLT-ERROR: unable to patch entries in " << Function
++                 << "\n";
++          exit(1);
++        }
++
++        continue;
++      }
++
+       // If the original function entries cannot be patched, then we cannot
+       // safely emit new function body.
+       errs() << "BOLT-WARNING: failed to patch entries in " << Function
+diff --git a/bolt/test/AArch64/patch-entries.s b/bolt/test/AArch64/patch-entries.s
+new file mode 100644
+index 000000000..cf6f72a0b
+--- /dev/null
++++ b/bolt/test/AArch64/patch-entries.s
+@@ -0,0 +1,36 @@
++# This test checks the patch entries functionality
++
++# REQUIRES: system-linux
++
++# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \
++# RUN:   %s -o %t.o
++# RUN: %clang %cflags -pie %t.o -o %t.exe -nostdlib -Wl,-q
++# RUN: llvm-bolt %t.exe -o %t.bolt --use-old-text=0 --lite=0 --skip-funcs=_start
++# RUN: llvm-objdump -dz %t.bolt | FileCheck %s
++
++# CHECK: <pathedEntries.org.0>:
++# CHECK-NEXT: adrp x16, 0x[[#%x,ADRP:]]
++# CHECK-NEXT: add x16, x16, #0x[[#%x,ADD:]]
++# CHECK-NEXT: br x16
++
++# CHECK: [[#ADRP + ADD]] <pathedEntries>:
++# CHECK-NEXT: [[#ADRP + ADD]]: {{.*}} ret
++
++.text
++.balign 4
++.global pathedEntries
++.type pathedEntries, %function
++pathedEntries:
++  .rept 32
++  nop
++  .endr
++  ret
++.size pathedEntries, .-pathedEntries
++
++.global _start
++.type _start, %function
++_start:
++  bl pathedEntries
++  .inst 0xdeadbeef
++  ret
++.size _start, .-_start
+diff --git a/bolt/unittests/Core/BinaryContext.cpp b/bolt/unittests/Core/BinaryContext.cpp
+index 5a80cb4a2..7ac1c1435 100644
+--- a/bolt/unittests/Core/BinaryContext.cpp
++++ b/bolt/unittests/Core/BinaryContext.cpp
+@@ -62,6 +62,46 @@ INSTANTIATE_TEST_SUITE_P(X86, BinaryContextTester,
+ INSTANTIATE_TEST_SUITE_P(AArch64, BinaryContextTester,
+                          ::testing::Values(Triple::aarch64));
+
++TEST_P(BinaryContextTester, FlushPendingRelocCALL26) {
++  if (GetParam() != Triple::aarch64)
++    GTEST_SKIP();
++
++  // This test checks that encodeValueAArch64, used by flushPendingRelocations,
++  // returns correctly encoded values for the CALL26 relocation for both
++  // backward and forward branches.
++  //
++  // The offsets layout is:
++  // 4: func1
++  // 8: bl func1
++  // 12: bl func2
++  // 16: func2
++
++  char Data[20] = {};
++  BinarySection &BS = BC->registerOrUpdateSection(
++      ".text", ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC,
++      (uint8_t *)Data, sizeof(Data), 4);
++  MCSymbol *RelSymbol1 = BC->getOrCreateGlobalSymbol(4, "Func1");
++  ASSERT_TRUE(RelSymbol1);
++  BS.addRelocation(8, RelSymbol1, ELF::R_AARCH64_CALL26, 0, 0, true);
++  MCSymbol *RelSymbol2 = BC->getOrCreateGlobalSymbol(16, "Func2");
++  ASSERT_TRUE(RelSymbol2);
++  BS.addRelocation(12, RelSymbol2, ELF::R_AARCH64_CALL26, 0, 0, true);
++
++  std::error_code EC;
++  SmallVector<char> Vect(sizeof(Data));
++  raw_svector_ostream OS(Vect);
++
++  BS.flushPendingRelocations(OS, [&](const MCSymbol *S) {
++    return S == RelSymbol1 ? 4 : S == RelSymbol2 ?
16 : 0; ++ }); ++ ++ const uint8_t Func1Call[4] = {255, 255, 255, 151}; ++ const uint8_t Func2Call[4] = {1, 0, 0, 148}; ++ ++ EXPECT_FALSE(memcmp(Func1Call, &Vect[8], 4)) << "Wrong backward call value\n"; ++ EXPECT_FALSE(memcmp(Func2Call, &Vect[12], 4)) << "Wrong forward call value\n"; ++} ++ + #endif + + TEST_P(BinaryContextTester, BaseAddress) { +-- +2.39.2 (Apple Git-143) + diff --git a/0006-AArch64-Add-CFG-block-count-correction-optimization.patch b/0006-AArch64-Add-CFG-block-count-correction-optimization.patch new file mode 100644 index 0000000000000000000000000000000000000000..b90b76d9461c27694516afc455562b5889485b0b --- /dev/null +++ b/0006-AArch64-Add-CFG-block-count-correction-optimization.patch @@ -0,0 +1,1820 @@ +From 25c9e9c7d4532f6e8962a25c5c7087bf3e3b8445 Mon Sep 17 00:00:00 2001 +From: rfwang07 +Date: Thu, 25 Jul 2024 14:45:53 +0800 +Subject: [PATCH] Add CFG block count correction optimization. + +--- + bolt/include/bolt/Core/BinaryBasicBlock.h | 59 +- + .../bolt/Core/BinaryBasicBlockFeature.h | 268 ++++++++ + bolt/include/bolt/Passes/FeatureMiner.h | 176 ++++++ + bolt/include/bolt/Passes/StaticBranchInfo.h | 108 ++++ + bolt/include/bolt/Profile/DataReader.h | 93 ++- + bolt/lib/Core/BinaryBasicBlockFeature.cpp | 21 + + bolt/lib/Core/CMakeLists.txt | 1 + + bolt/lib/Passes/CMakeLists.txt | 2 + + bolt/lib/Passes/FeatureMiner.cpp | 572 ++++++++++++++++++ + bolt/lib/Passes/StaticBranchInfo.cpp | 143 +++++ + bolt/lib/Profile/DataReader.cpp | 120 +++- + bolt/lib/Rewrite/RewriteInstance.cpp | 6 + + 12 files changed, 1557 insertions(+), 12 deletions(-) + create mode 100644 bolt/include/bolt/Core/BinaryBasicBlockFeature.h + create mode 100644 bolt/include/bolt/Passes/FeatureMiner.h + create mode 100644 bolt/include/bolt/Passes/StaticBranchInfo.h + create mode 100644 bolt/lib/Core/BinaryBasicBlockFeature.cpp + create mode 100644 bolt/lib/Passes/FeatureMiner.cpp + create mode 100644 bolt/lib/Passes/StaticBranchInfo.cpp + +diff --git a/bolt/include/bolt/Core/BinaryBasicBlock.h b/bolt/include/bolt/Core/BinaryBasicBlock.h +index 02be9c1d4..a39d38d6b 100644 +--- a/bolt/include/bolt/Core/BinaryBasicBlock.h ++++ b/bolt/include/bolt/Core/BinaryBasicBlock.h +@@ -15,6 +15,7 @@ + #ifndef BOLT_CORE_BINARY_BASIC_BLOCK_H + #define BOLT_CORE_BINARY_BASIC_BLOCK_H + ++#include "bolt/Core/BinaryBasicBlockFeature.h" + #include "bolt/Core/FunctionLayout.h" + #include "bolt/Core/MCPlus.h" + #include "llvm/ADT/GraphTraits.h" +@@ -25,6 +26,7 @@ + #include "llvm/Support/raw_ostream.h" + #include + #include ++#include + + namespace llvm { + class MCCodeEmitter; +@@ -147,6 +149,12 @@ private: + /// Last computed hash value. + mutable uint64_t Hash{0}; + ++ std::set ChildrenSet; ++ ++ std::set ParentSet; ++ ++ BinaryBasicBlockFeature BlockFeatures; ++ + private: + BinaryBasicBlock() = delete; + BinaryBasicBlock(const BinaryBasicBlock &) = delete; +@@ -385,11 +393,14 @@ public: + /// If the basic block ends with a conditional branch (possibly followed by + /// an unconditional branch) and thus has 2 successors, return a successor + /// corresponding to a jump condition which could be true or false. +- /// Return nullptr if the basic block does not have a conditional jump. ++ /// Return the only successor if it's followed by an unconditional branch. ++ /// Return nullptr otherwise. + BinaryBasicBlock *getConditionalSuccessor(bool Condition) { +- if (succ_size() != 2) +- return nullptr; +- return Successors[Condition == true ? 0 : 1]; ++ if (succ_size() == 2) ++ return Successors[Condition == true ? 
0 : 1];
++    if (succ_size() == 1)
++      return Successors[0];
++    return nullptr;
+   }
+
+   const BinaryBasicBlock *getConditionalSuccessor(bool Condition) const {
+@@ -410,6 +421,13 @@ public:
+     return const_cast<BinaryBasicBlock *>(this)->getFallthrough();
+   }
+
++  /// Return branch info corresponding to the only branch.
++  const BinaryBranchInfo &getOnlyBranchInfo() const {
++    assert(BranchInfo.size() > 0 &&
++           "can only be called for blocks with at least 1 successor");
++    return BranchInfo[0];
++  }
++
+   /// Return branch info corresponding to a taken branch.
+   const BinaryBranchInfo &getTakenBranchInfo() const {
+     assert(BranchInfo.size() == 2 &&
+@@ -818,6 +836,36 @@ public:
+     OutputAddressRange.second = Address;
+   }
+
++  /// Sets the features of this BB.
++  void setFeatures(BinaryBasicBlockFeature BBF) {
++    BlockFeatures = BBF;
++  }
++
++  /// Gets the numeric features of this BB.
++  BinaryBasicBlockFeature getFeatures() {
++    return BlockFeatures;
++  }
++
++  /// Gets the children set of this BB.
++  std::set<BinaryBasicBlock *> getChildrenSet() {
++    return ChildrenSet;
++  }
++
++  /// Gets the parent set of this BB.
++  std::set<BinaryBasicBlock *> getParentSet() {
++    return ParentSet;
++  }
++
++  /// Inserts a node into the children set of this BB.
++  void insertChildrenSet(BinaryBasicBlock *Node) {
++    ChildrenSet.insert(Node);
++  }
++
++  /// Inserts a node into the parent set of this BB.
++  void insertParentSet(BinaryBasicBlock *Node) {
++    ParentSet.insert(Node);
++  }
++
+   /// Gets the memory address range of this BB in the input binary.
+   std::pair<uint64_t, uint64_t> getInputAddressRange() const {
+     return InputRange;
+@@ -991,7 +1039,8 @@ private:
+ #if defined(LLVM_ON_UNIX)
+ /// Keep the size of the BinaryBasicBlock within a reasonable size class
+ /// (jemalloc bucket) on Linux
+-static_assert(sizeof(BinaryBasicBlock) <= 256);
++/// The size threshold is expanded from 256 to 2048 to hold the extra BB features.
++static_assert(sizeof(BinaryBasicBlock) <= 2048, "");
+ #endif
+
+ bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS);
+diff --git a/bolt/include/bolt/Core/BinaryBasicBlockFeature.h b/bolt/include/bolt/Core/BinaryBasicBlockFeature.h
+new file mode 100644
+index 000000000..2b4809b1a
+--- /dev/null
++++ b/bolt/include/bolt/Core/BinaryBasicBlockFeature.h
+@@ -0,0 +1,268 @@
++//===- bolt/Core/BinaryBasicBlockFeature.h - Basic block features -*- C++ -*-===//
++//
++// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
++// See https://llvm.org/LICENSE.txt for license information.
++// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception ++// ++//===----------------------------------------------------------------------===// ++// ++// Features of BinaryBasicBlock ++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef BOLT_CORE_BINARY_BASIC_BLOCK_FEATURE_H ++#define BOLT_CORE_BINARY_BASIC_BLOCK_FEATURE_H ++ ++#include "bolt/Core/FunctionLayout.h" ++#include "bolt/Core/MCPlus.h" ++#include "llvm/ADT/GraphTraits.h" ++#include "llvm/ADT/StringRef.h" ++#include "llvm/MC/MCInst.h" ++#include "llvm/MC/MCSymbol.h" ++#include "llvm/Support/ErrorOr.h" ++#include "llvm/Support/raw_ostream.h" ++#include ++#include ++ ++namespace llvm { ++ ++namespace bolt { ++ ++class BinaryBasicBlockFeature { ++ ++public: ++ int32_t Opcode; ++ ++ int16_t Direction; ++ ++ int32_t CmpOpcode; ++ ++ int16_t LoopHeader; ++ ++ int16_t ProcedureType; ++ ++ int64_t Count; ++ ++ int64_t FallthroughCount; ++ ++ int64_t TotalLoops; ++ ++ int64_t LoopDepth; ++ ++ int64_t LoopNumBlocks; ++ ++ int64_t LocalExitingBlock; ++ ++ int64_t LocalLatchBlock; ++ ++ int64_t LocalLoopHeader; ++ ++ int64_t Call; ++ ++ int64_t DeltaTaken; ++ ++ int64_t NumLoads; ++ ++ int64_t NumCalls; ++ ++ int64_t OperandRAType; ++ ++ int64_t OperandRBType; ++ ++ int64_t BasicBlockSize; ++ ++ int64_t NumBasicBlocks; ++ ++ int64_t HasIndirectCalls; ++ ++ std::vector EndOpcode_vec; ++ ++ std::vector LoopHeader_vec; ++ ++ std::vector Backedge_vec; ++ ++ std::vector Exit_vec; ++ ++ std::vector Call_vec; ++ ++ std::vector BasicBlockSize_vec; ++ ++ std::vector InferenceFeatures; ++ ++ uint64_t FuncExec; ++ ++ int32_t ParentChildNum; ++ ++ int32_t ParentCount; ++ ++ int32_t ChildParentNum; ++ ++ int32_t ChildCount; ++ ++public: ++ void setOpcode(const int32_t &BlockOpcode) { Opcode = BlockOpcode; } ++ ++ void setDirection(const int16_t &BlockDirection) { ++ Direction = BlockDirection; ++ } ++ ++ void setCmpOpcode(const int32_t &BlockCmpOpcode) { ++ CmpOpcode = BlockCmpOpcode; ++ } ++ ++ void setLoopHeader(const int16_t &BlockLoopHeader) { ++ LoopHeader = BlockLoopHeader; ++ } ++ ++ void setProcedureType(const int16_t &BlockProcedureType) { ++ ProcedureType = BlockProcedureType; ++ } ++ ++ void setCount(const int64_t &BlockCount) { Count = BlockCount; } ++ ++ void setFallthroughCount(const int64_t &BlockFallthroughCount) { ++ FallthroughCount = BlockFallthroughCount; ++ } ++ ++ void setTotalLoops(const int64_t &BlockTotalLoops) { ++ TotalLoops = BlockTotalLoops; ++ } ++ ++ void setLoopDepth(const int64_t &BlockLoopDepth) { ++ LoopDepth = BlockLoopDepth; ++ } ++ ++ void setLoopNumBlocks(const int64_t &BlockLoopNumBlocks) { ++ LoopNumBlocks = BlockLoopNumBlocks; ++ } ++ ++ void setLocalExitingBlock(const int64_t &BlockLocalExitingBlock) { ++ LocalExitingBlock = BlockLocalExitingBlock; ++ } ++ ++ void setLocalLatchBlock(const int64_t &BlockLocalLatchBlock) { ++ LocalLatchBlock = BlockLocalLatchBlock; ++ } ++ ++ void setLocalLoopHeader(const int64_t &BlockLocalLoopHeader) { ++ LocalLoopHeader = BlockLocalLoopHeader; ++ } ++ ++ void setDeltaTaken(const int64_t &BlockDeltaTaken) { ++ DeltaTaken = BlockDeltaTaken; ++ } ++ ++ void setNumLoads(const int64_t &BlockNumLoads) { NumLoads = BlockNumLoads; } ++ ++ void setNumCalls(const int64_t &BlockNumCalls) { NumCalls = BlockNumCalls; } ++ ++ void setOperandRAType(const int64_t &BlockOperandRAType) { ++ OperandRAType = BlockOperandRAType; ++ } ++ ++ void setOperandRBType(const int64_t &BlockOperandRBType) { ++ OperandRBType = BlockOperandRBType; ++ } ++ ++ 
void setBasicBlockSize(const int64_t &BlockBasicBlockSize) { ++ BasicBlockSize = BlockBasicBlockSize; ++ } ++ ++ void setNumBasicBlocks(const int64_t &BlockNumBasicBlocks) { ++ NumBasicBlocks = BlockNumBasicBlocks; ++ } ++ ++ void setHasIndirectCalls(const int64_t &BlockHasIndirectCalls) { ++ HasIndirectCalls = BlockHasIndirectCalls; ++ } ++ ++ void setEndOpcodeVec(const int32_t &EndOpcode) { ++ EndOpcode_vec.push_back(EndOpcode); ++ } ++ ++ void setLoopHeaderVec(const int16_t &LoopHeader) { ++ LoopHeader_vec.push_back(LoopHeader); ++ } ++ ++ void setBackedgeVec(const int16_t &Backedge) { ++ Backedge_vec.push_back(Backedge); ++ } ++ ++ void setExitVec(const int16_t &Exit) { Exit_vec.push_back(Exit); } ++ ++ void setCallVec(const int16_t &Call) { Call_vec.push_back(Call); } ++ ++ void setBasicBlockSizeVec(const int64_t &BasicBlockSize) { ++ BasicBlockSize_vec.push_back(BasicBlockSize); ++ } ++ ++ void setFunExec(const uint64_t &BlockFuncExec) { FuncExec = BlockFuncExec; } ++ ++ void setParentChildNum(const int32_t &BlockParentChildNum) { ++ ParentChildNum = BlockParentChildNum; ++ } ++ ++ void setParentCount(const int32_t &BlockParentCount) { ++ ParentCount = BlockParentCount; ++ } ++ ++ void setChildParentNum(const int32_t &BlockChildParentNum) { ++ ChildParentNum = BlockChildParentNum; ++ } ++ ++ void setChildCount(const int32_t &BlockChildCount) { ++ ChildCount = BlockChildCount; ++ } ++ ++ void setInferenceFeatures() { ++ ++ if (Count == -1 || FallthroughCount == -1) { ++ return; ++ } ++ if (ParentChildNum == -1 && ParentCount == -1 && ChildParentNum == -1 && ++ ChildCount == -1) { ++ return; ++ } ++ ++ InferenceFeatures.push_back(static_cast(Direction)); ++ InferenceFeatures.push_back(static_cast(LoopHeader)); ++ InferenceFeatures.push_back(static_cast(ProcedureType)); ++ InferenceFeatures.push_back(static_cast(OperandRAType)); ++ InferenceFeatures.push_back(static_cast(OperandRBType)); ++ InferenceFeatures.push_back(static_cast(LoopHeader_vec[0])); ++ InferenceFeatures.push_back(static_cast(Backedge_vec[0])); ++ InferenceFeatures.push_back(static_cast(Exit_vec[0])); ++ InferenceFeatures.push_back(static_cast(LoopHeader_vec[1])); ++ InferenceFeatures.push_back(static_cast(Call_vec[0])); ++ InferenceFeatures.push_back(static_cast(LocalExitingBlock)); ++ InferenceFeatures.push_back(static_cast(HasIndirectCalls)); ++ InferenceFeatures.push_back(static_cast(LocalLatchBlock)); ++ InferenceFeatures.push_back(static_cast(LocalLoopHeader)); ++ InferenceFeatures.push_back(static_cast(Opcode)); ++ InferenceFeatures.push_back(static_cast(CmpOpcode)); ++ InferenceFeatures.push_back(static_cast(EndOpcode_vec[0])); ++ InferenceFeatures.push_back(static_cast(EndOpcode_vec[1])); ++ InferenceFeatures.push_back(static_cast(FuncExec)); ++ InferenceFeatures.push_back(static_cast(NumBasicBlocks)); ++ InferenceFeatures.push_back(static_cast(BasicBlockSize)); ++ InferenceFeatures.push_back(static_cast(BasicBlockSize_vec[0])); ++ InferenceFeatures.push_back(static_cast(BasicBlockSize_vec[1])); ++ InferenceFeatures.push_back(static_cast(LoopNumBlocks)); ++ InferenceFeatures.push_back(static_cast(NumLoads)); ++ InferenceFeatures.push_back(static_cast(NumCalls)); ++ InferenceFeatures.push_back(static_cast(TotalLoops)); ++ InferenceFeatures.push_back(static_cast(DeltaTaken)); ++ InferenceFeatures.push_back(static_cast(LoopDepth)); ++ InferenceFeatures.push_back(static_cast(ParentChildNum)); ++ InferenceFeatures.push_back(static_cast(ParentCount)); ++ InferenceFeatures.push_back(static_cast(ChildParentNum)); ++ 
InferenceFeatures.push_back(static_cast(ChildCount)); ++ } ++ ++ std::vector getInferenceFeatures() { return InferenceFeatures; } ++}; ++} // namespace bolt ++} // namespace llvm ++ ++#endif +\ No newline at end of file +diff --git a/bolt/include/bolt/Passes/FeatureMiner.h b/bolt/include/bolt/Passes/FeatureMiner.h +new file mode 100644 +index 000000000..6170aa62d +--- /dev/null ++++ b/bolt/include/bolt/Passes/FeatureMiner.h +@@ -0,0 +1,176 @@ ++//===--- Passes/FeatureMiner.h ++//---------------------------------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// A very simple feature extractor based on Calder's paper ++// Evidence-based static branch prediction using machine learning ++// https://dl.acm.org/doi/10.1145/239912.239923 ++//===----------------------------------------------------------------------===// ++ ++#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_FEATUREMINER_H_ ++#define LLVM_TOOLS_LLVM_BOLT_PASSES_FEATUREMINER_H_ ++ ++#include "bolt/Core/BinaryData.h" ++#include "bolt/Core/BinaryFunction.h" ++#include "bolt/Core/BinaryLoop.h" ++#include "bolt/Passes/BinaryPasses.h" ++#include "bolt/Passes/DominatorAnalysis.h" ++#include "bolt/Passes/StaticBranchInfo.h" ++#include "llvm/ADT/DenseMap.h" ++#include "llvm/ADT/StringRef.h" ++#include "llvm/MC/MCInst.h" ++#include "llvm/Support/raw_ostream.h" ++#include ++#include ++#include ++#include ++#include ++ ++namespace llvm { ++namespace bolt { ++ ++class FeatureMiner : public BinaryFunctionPass { ++private: ++ std::unique_ptr SBI; ++ /// BasicBlockInfo - This structure holds feature information about the target ++ /// BasicBlock of either the taken or the fallthrough paths of a given branch. ++ struct BasicBlockInfo { ++ std::optional BranchDominates; // 1 - dominates, 0 - does not dominate ++ std::optional ++ BranchPostdominates; // 1 - postdominates, 0 - does not PD ++ std::optional LoopHeader; // 1 - loop header, 0 - not a loop header ++ std::optional Backedge; // 1 - loop back, 0 - not a loop back ++ std::optional Exit; // 1 - loop exit, 0 - not a loop exit ++ std::optional Call; // 1 - program call, 0 - not a program call ++ std::optional NumCalls; ++ std::optional NumLoads; ++ std::optional NumStores; ++ std::optional EndOpcode; // 0 = NOTHING ++ std::string EndOpcodeStr = "UNDEF"; ++ std::optional BasicBlockSize; ++ std::string FromFunName = "UNDEF"; ++ uint32_t FromBb; ++ std::string ToFunName = "UNDEF"; ++ uint32_t ToBb; ++ ++ std::optional NumCallsExit; ++ std::optional NumCallsInvoke; ++ std::optional NumIndirectCalls; ++ std::optional NumTailCalls; ++ }; ++ ++ typedef std::unique_ptr BBIPtr; ++ ++ /// BranchFeaturesInfo - This structure holds feature information about each ++ /// two-way branch from the program. 
++ struct BranchFeaturesInfo { ++ std::string OpcodeStr = "UNDEF"; ++ std::string CmpOpcodeStr = "UNDEF"; ++ bool Simple = 0; ++ ++ std::optional Opcode; ++ std::optional CmpOpcode; ++ std::optional Count; ++ std::optional MissPredicted; ++ std::optional FallthroughCount; ++ std::optional FallthroughMissPredicted; ++ BBIPtr TrueSuccessor = std::make_unique(); ++ BBIPtr FalseSuccessor = std::make_unique(); ++ std::optional ProcedureType; // 1 - Leaf, 0 - NonLeaf, 2 - CallSelf ++ std::optional LoopHeader; // 1 — loop header, 0 - not a loop header ++ std::optional Direction; // 1 - Forward Branch, 0 - Backward Branch ++ ++ std::optional NumOuterLoops; ++ std::optional TotalLoops; ++ std::optional MaximumLoopDepth; ++ std::optional LoopDepth; ++ std::optional LoopNumExitEdges; ++ std::optional LoopNumExitBlocks; ++ std::optional LoopNumExitingBlocks; ++ std::optional LoopNumLatches; ++ std::optional LoopNumBlocks; ++ std::optional LoopNumBackEdges; ++ std::optional NumLoads; ++ std::optional NumStores; ++ ++ std::optional LocalExitingBlock; ++ std::optional LocalLatchBlock; ++ std::optional LocalLoopHeader; ++ std::optional Call; ++ ++ std::optional NumCalls; ++ std::optional NumCallsExit; ++ std::optional NumCallsInvoke; ++ std::optional NumIndirectCalls; ++ std::optional NumTailCalls; ++ std::optional NumSelfCalls; ++ ++ std::optional NumBasicBlocks; ++ ++ std::optional DeltaTaken; ++ ++ std::optional OperandRAType; ++ std::optional OperandRBType; ++ ++ std::optional BasicBlockSize; ++ ++ std::optional BranchOffset; ++ }; ++ ++ typedef std::unique_ptr BFIPtr; ++ ++ std::vector BranchesInfoSet; ++ ++ /// getProcedureType - Determines which category the function falls into: ++ /// Leaf, Non-leaf or Calls-self. ++ int8_t getProcedureType(BinaryFunction &Function, BinaryContext &BC); ++ ++ /// addSuccessorInfo - Discovers feature information for the target successor ++ /// basic block, and inserts it into the static branch info container. ++ void addSuccessorInfo(BFIPtr const &BFI, BinaryFunction &Function, ++ BinaryContext &BC, BinaryBasicBlock &BB, bool SuccType); ++ ++ /// extractFeatures - Extracts the feature information for each two-way branch ++ /// from the program. ++ void extractFeatures(BinaryFunction &Function, BinaryContext &BC); ++ ++ void generateInstFeatures(BinaryContext &BC, BinaryBasicBlock &BB, ++ BFIPtr const &BFI, int Index); ++ /// dumpSuccessorFeatures - Dumps the feature information about the target ++ /// BasicBlock of either the taken or the fallthrough paths of a given branch. ++ void generateSuccessorFeatures(BBIPtr &Successor, ++ BinaryBasicBlockFeature *BBF); ++ ++ /// dumpFeatures - Dumps the feature information about each two-way branch ++ /// from the program. ++ void dumpFeatures(raw_ostream &Printer, uint64_t FunctionAddress, ++ uint64_t FunctionFrequency); ++ ++ /// dumpProfileData - Dumps a limited version of the inout profile data ++ /// that contains only profile for conditional branches, unconditional ++ /// branches and terminators that aren't branches. 
++  void dumpProfileData(BinaryFunction &Function, raw_ostream &Printer);
++
++public:
++  explicit FeatureMiner(const cl::opt<bool> &PrintPass)
++      : BinaryFunctionPass(PrintPass) {}
++
++  std::ofstream trainPrinter;
++
++  const char *getName() const override { return "feature-miner"; }
++
++  void runOnFunctions(BinaryContext &BC) override;
++  void inferenceFeatures(BinaryFunction &Function);
++  void generateProfileFeatures(BinaryBasicBlock *BB,
++                               BinaryBasicBlockFeature *BBF);
++};
++
++} // namespace bolt
++} // namespace llvm
++
++#endif /* LLVM_TOOLS_LLVM_BOLT_PASSES_FEATUREMINER_H_ */
+diff --git a/bolt/include/bolt/Passes/StaticBranchInfo.h b/bolt/include/bolt/Passes/StaticBranchInfo.h
+new file mode 100644
+index 000000000..8de8df793
+--- /dev/null
++++ b/bolt/include/bolt/Passes/StaticBranchInfo.h
+@@ -0,0 +1,108 @@
++//===------ Passes/StaticBranchInfo.h -------------------------------------===//
++//
++// The LLVM Compiler Infrastructure
++//
++// This file is distributed under the University of Illinois Open Source
++// License. See LICENSE.TXT for details.
++//
++//===----------------------------------------------------------------------===//
++//
++// This is an auxiliary class to the feature miner, static branch probability
++// and frequency passes. This class is responsible for finding loop info (loop
++// back edges, loop exit edges and loop headers) of a function. It also finds
++// basic block info (if a block contains store and call instructions) and if a
++// basic block contains a call to the exit.
++//
++//===----------------------------------------------------------------------===//
++
++#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_STATICBRANCHINFO_H_
++#define LLVM_TOOLS_LLVM_BOLT_PASSES_STATICBRANCHINFO_H_
++
++#include "bolt/Core/BinaryContext.h"
++#include "bolt/Core/BinaryFunction.h"
++#include "bolt/Core/BinaryLoop.h"
++#include "llvm/MC/MCSymbol.h"
++#include
++
++namespace llvm {
++namespace bolt {
++
++class StaticBranchInfo {
++
++public:
++  /// An edge indicates that control flow may go from one basic block (source)
++  /// to another one (destination), and this pair of basic blocks will be used
++  /// to index maps and retrieve the content of sets.
++  typedef std::pair<const BinaryBasicBlock *, const BinaryBasicBlock *> Edge;
++
++private:
++  /// Holds the loop headers of a given function.
++  DenseSet<const BinaryBasicBlock *> LoopHeaders;
++
++  /// Holds the loop backedges of a given function.
++  DenseSet<Edge> BackEdges;
++
++  /// Holds the loop exit edges of a given function.
++  DenseSet<BinaryLoop::Edge> ExitEdges;
++
++  /// Holds the basic blocks of a given function
++  /// that contain at least one call instruction.
++  DenseSet<const BinaryBasicBlock *> CallSet;
++
++  /// Holds the basic blocks of a given function
++  /// that contain at least one store instruction.
++  DenseSet<const BinaryBasicBlock *> StoreSet;
++
++  unsigned NumLoads;
++  unsigned NumStores;
++
++public:
++  unsigned getNumLoads() { return NumLoads; }
++
++  unsigned getNumStores() { return NumStores; }
++
++  /// findLoopEdgesInfo - Finds all loop back edges, loop exit edges
++  /// and loop headers within the function.
++  void findLoopEdgesInfo(const BinaryLoopInfo &LoopsInfo);
++
++  /// findBasicBlockInfo - Finds all call and store instructions within
++  /// the basic blocks of a given function.
++  void findBasicBlockInfo(const BinaryFunction &Function, BinaryContext &BC);
++
++  /// isBackEdge - Checks if the edge is a loop back edge.
++  bool isBackEdge(const Edge &CFGEdge) const;
++
++  /// isBackEdge - Checks if the edge is a loop back edge.
++  bool isBackEdge(const BinaryBasicBlock *SrcBB,
++                  const BinaryBasicBlock *DstBB) const;
++
++  /// isExitEdge - Checks if the edge is a loop exit edge.
++  bool isExitEdge(const BinaryLoop::Edge &CFGEdge) const;
++
++  /// isExitEdge - Checks if the edge is a loop exit edge.
++  bool isExitEdge(const BinaryBasicBlock *SrcBB,
++                  const BinaryBasicBlock *DstBB) const;
++
++  /// isLoopHeader - Checks if the basic block is a loop header.
++  bool isLoopHeader(const BinaryBasicBlock *BB) const;
++
++  /// hasCallInst - Checks if the basic block has a call instruction.
++  bool hasCallInst(const BinaryBasicBlock *BB) const;
++
++  /// hasStoreInst - Checks if the basic block has a store instruction.
++  bool hasStoreInst(const BinaryBasicBlock *BB) const;
++
++  /// countBackEdges - Compute the number of BB's successors that are back edges.
++  unsigned countBackEdges(BinaryBasicBlock *BB) const;
++
++  /// countExitEdges - Compute the number of BB's successors that are exit edges.
++  unsigned countExitEdges(BinaryBasicBlock *BB) const;
++
++  /// clear - Cleans up all the content from the data structs used.
++  void clear();
++};
++
++} // namespace bolt
++} // namespace llvm
++
++#endif /* LLVM_TOOLS_LLVM_BOLT_PASSES_STATICBRANCHINFO_H_ */
+diff --git a/bolt/include/bolt/Profile/DataReader.h b/bolt/include/bolt/Profile/DataReader.h
+index 916b4f7e2..bf732d47c 100644
+--- a/bolt/include/bolt/Profile/DataReader.h
++++ b/bolt/include/bolt/Profile/DataReader.h
+@@ -22,6 +22,7 @@
+ #include "llvm/Support/MemoryBuffer.h"
+ #include "llvm/Support/raw_ostream.h"
+ #include <map>
++#include <dlfcn.h>
+ #include <unordered_map>
+ #include <vector>
+
+@@ -44,6 +45,15 @@ inline raw_ostream &operator<<(raw_ostream &OS, const LBREntry &LBR) {
+   return OS;
+ }
+
++extern "C" {
++typedef void *(*CreateONNXRunnerFunc)(const char *);
++typedef void (*DeleteONNXRunnerFunc)(void *);
++typedef std::vector<float> (*RunONNXModelFunc)(void *,
++                                               const std::vector<std::string> &,
++                                               const std::vector<int64_t> &,
++                                               const std::vector<float> &, int);
++}
++
+ struct Location {
+   bool IsSymbol;
+   StringRef Name;
+@@ -263,7 +273,8 @@ struct FuncSampleData {
+ class DataReader : public ProfileReaderBase {
+ public:
+   explicit DataReader(StringRef Filename)
+-      : ProfileReaderBase(Filename), Diag(errs()) {}
++      : ProfileReaderBase(Filename), Diag(errs()), onnxRunner(nullptr),
++        libHandle(nullptr), handleOnnxRuntime(nullptr) {}
+
+   StringRef getReaderName() const override { return "branch profile reader"; }
+
+@@ -282,7 +293,87 @@ public:
+   /// Return all event names used to collect this profile
+   StringSet<> getEventNames() const override { return EventNames; }
+
++  ~DataReader() {
++    if (onnxRunner && libHandle && handleOnnxRuntime) {
++      DeleteONNXRunnerFunc deleteONNXRunner =
++          (DeleteONNXRunnerFunc)dlsym(libHandle, "deleteONNXRunner");
++      deleteONNXRunner(onnxRunner);
++      dlclose(libHandle);
++      dlclose(handleOnnxRuntime);
++    }
++  }
++
++  /// Initialize the onnxruntime model.
++  void initializeONNXRunner(const std::string &modelPath) {
++    if (!onnxRunner && !libHandle && !handleOnnxRuntime) {
++      handleOnnxRuntime =
++          dlopen("libonnxruntime.so", RTLD_LAZY | RTLD_GLOBAL);
++      if (handleOnnxRuntime == nullptr) {
++        outs() << "error: llvm-bolt failed to load libonnxruntime.so.\n";
++        exit(1);
++      }
++      libHandle = dlopen("libONNXRunner.so", RTLD_LAZY);
++      if (libHandle == nullptr) {
++        outs() << "error: llvm-bolt failed to load libONNXRunner.so.\n";
++        exit(1);
++      }
++      CreateONNXRunnerFunc createONNXRunner =
++          (CreateONNXRunnerFunc)dlsym(libHandle, "createONNXRunner");
++      onnxRunner = createONNXRunner(modelPath.c_str());
++    }
++  }
++
++  /// Inference step for predicting the BB counts based on the BB features.
++  float ONNXInference(const std::vector<std::string> &input_string,
++                      const std::vector<int64_t> &input_int64,
++                      const std::vector<float> &input_float, int batch_size = 1) {
++    if (onnxRunner && libHandle) {
++      RunONNXModelFunc runONNXModel =
++          (RunONNXModelFunc)dlsym(libHandle, "runONNXModel");
++      std::vector<float> model_preds = runONNXModel(
++          onnxRunner, input_string, input_int64, input_float, batch_size);
++      if (model_preds.empty()) {
++        outs() << "error: llvm-bolt model prediction result cannot be empty.\n";
++        exit(1);
++      }
++      float pred = model_preds[0];
++      return pred;
++    }
++    return -1.0;
++  }
++
++  /// Set the annotating threshold for the model prediction.
++  void setThreshold(float annotate_threshold) {
++    threshold = annotate_threshold;
++  }
++
+ protected:
++  /// The onnxruntime model pointer read from the input model path.
++  void *onnxRunner;
++
++  /// The library handle of the ai4compiler framework.
++  void *libHandle;
++
++  /// The library handle of the onnxruntime.
++  void *handleOnnxRuntime;
++
++  /// The annotating threshold for the model prediction.
++  float threshold;
++
++  /// Return the annotating threshold for the model prediction.
++  float getThreshold() const { return threshold; }
++
++  /// The counting value of the total modified BB-count number.
++  uint64_t modified_BB_total = 0;
++
++  /// Add the number of BBs modified within the function to the total
++  /// modified BB-count number.
++  void addModifiedBBTotal(uint64_t &value) { modified_BB_total += value; }
++
++  /// Return the counting value of the total modified BB-count number.
++  uint64_t getModifiedBBTotal() const { return modified_BB_total; }
++
+   /// Read profile information available for the function.
+   void readProfile(BinaryFunction &BF);
+
+diff --git a/bolt/lib/Core/BinaryBasicBlockFeature.cpp b/bolt/lib/Core/BinaryBasicBlockFeature.cpp
+new file mode 100644
+index 000000000..e1a2a3dd8
+--- /dev/null
++++ b/bolt/lib/Core/BinaryBasicBlockFeature.cpp
+@@ -0,0 +1,21 @@
++//===- bolt/Core/BinaryBasicBlockFeature.cpp - Basic block features ------===//
++//
++// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
++// See https://llvm.org/LICENSE.txt for license information.
++// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
++//
++//===----------------------------------------------------------------------===//
++//
++// This file implements the BinaryBasicBlockFeature class.
++// ++//===----------------------------------------------------------------------===// ++ ++#include "bolt/Core/BinaryBasicBlock.h" ++#include "bolt/Core/BinaryBasicBlockFeature.h" ++ ++#define DEBUG_TYPE "bolt" ++ ++namespace llvm { ++namespace bolt {} // namespace bolt ++} // namespace llvm +\ No newline at end of file +diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt +index a4612fb93..f93147d39 100644 +--- a/bolt/lib/Core/CMakeLists.txt ++++ b/bolt/lib/Core/CMakeLists.txt +@@ -12,6 +12,7 @@ set(LLVM_LINK_COMPONENTS + + add_llvm_library(LLVMBOLTCore + BinaryBasicBlock.cpp ++ BinaryBasicBlockFeature.cpp + BinaryContext.cpp + BinaryData.cpp + BinaryEmitter.cpp +diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt +index b8bbe59a6..e9ccea17c 100644 +--- a/bolt/lib/Passes/CMakeLists.txt ++++ b/bolt/lib/Passes/CMakeLists.txt +@@ -13,6 +13,7 @@ add_llvm_library(LLVMBOLTPasses + DataflowInfoManager.cpp + FrameAnalysis.cpp + FrameOptimizer.cpp ++ FeatureMiner.cpp + FixRelaxationPass.cpp + FixRISCVCallsPass.cpp + HFSort.cpp +@@ -41,6 +42,7 @@ add_llvm_library(LLVMBOLTPasses + StackAvailableExpressions.cpp + StackPointerTracking.cpp + StackReachingUses.cpp ++ StaticBranchInfo.cpp + StokeInfo.cpp + TailDuplication.cpp + ThreeWayBranch.cpp +diff --git a/bolt/lib/Passes/FeatureMiner.cpp b/bolt/lib/Passes/FeatureMiner.cpp +new file mode 100644 +index 000000000..d93aef648 +--- /dev/null ++++ b/bolt/lib/Passes/FeatureMiner.cpp +@@ -0,0 +1,572 @@ ++//===--- Passes/FeatureMiner.cpp ------------------------------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// A very simple feature extractor based on Calder's paper ++// Evidence-based static branch prediction using machine learning ++// https://dl.acm.org/doi/10.1145/239912.239923 ++//===----------------------------------------------------------------------===// ++ ++#include "bolt/Passes/DataflowInfoManager.h" ++#include "bolt/Passes/FeatureMiner.h" ++#include "bolt/Passes/StaticBranchInfo.h" ++#include "llvm/Support/CommandLine.h" ++#include "llvm/Support/FileSystem.h" ++ ++#undef DEBUG_TYPE ++#define DEBUG_TYPE "bolt-feature-miner" ++ ++using namespace llvm; ++using namespace bolt; ++ ++namespace opts { ++extern cl::opt BlockCorrection; ++ ++} // namespace opts ++ ++namespace llvm { ++namespace bolt { ++ ++class BinaryFunction; ++ ++int8_t FeatureMiner::getProcedureType(BinaryFunction &Function, ++ BinaryContext &BC) { ++ int8_t ProcedureType = 1; ++ for (auto &BB : Function) { ++ for (auto &Inst : BB) { ++ if (BC.MIB->isCall(Inst)) { ++ ProcedureType = 0; // non-leaf type ++ if (const auto *CalleeSymbol = BC.MIB->getTargetSymbol(Inst)) { ++ const auto *Callee = BC.getFunctionForSymbol(CalleeSymbol); ++ if (Callee && ++ Callee->getFunctionNumber() == Function.getFunctionNumber()) { ++ return 2; // call self type ++ } ++ } ++ } ++ } ++ } ++ return ProcedureType; // leaf type ++} ++ ++void FeatureMiner::addSuccessorInfo(BFIPtr const &BFI, BinaryFunction &Function, ++ BinaryContext &BC, BinaryBasicBlock &BB, ++ bool SuccType) { ++ ++ BinaryBasicBlock *Successor = BB.getConditionalSuccessor(SuccType); ++ ++ if (!Successor) ++ return; ++ ++ unsigned NumCalls{0}; ++ ++ for (auto &Inst : BB) { ++ if (BC.MIB->isCall(Inst)) { ++ ++NumCalls; ++ } ++ } ++ ++ BBIPtr SuccBBInfo = 
std::make_unique();
++
++  // Check if the successor basic block is a loop header and store it.
++  SuccBBInfo->LoopHeader = SBI->isLoopHeader(Successor);
++
++  SuccBBInfo->BasicBlockSize = Successor->size();
++
++  // Check if the edge getting to the successor basic block is a loop
++  // exit edge and store it.
++  SuccBBInfo->Exit = SBI->isExitEdge(&BB, Successor);
++
++  // Check if the edge getting to the successor basic block is a loop
++  // back edge and store it.
++  SuccBBInfo->Backedge = SBI->isBackEdge(&BB, Successor);
++
++  MCInst *SuccInst = Successor->getTerminatorBefore(nullptr);
++
++  // Store information about the branch type ending the successor basic block.
++  SuccBBInfo->EndOpcode = (SuccInst && BC.MIA->isBranch(*SuccInst))
++                              ? SuccInst->getOpcode()
++                              : 0; // 0 = NOTHING
++
++  // Check if the successor basic block contains
++  // a procedure call and store it.
++  SuccBBInfo->Call = (NumCalls > 0) ? 1  // Contains a call instruction
++                                    : 0; // Does not contain a call instruction
++
++  uint32_t Offset = BB.getEndOffset();
++
++  if (SuccType) {
++    BFI->TrueSuccessor = std::move(SuccBBInfo);
++    // Check if the taken branch is a forward
++    // or a backwards branch and store it.
++    BFI->Direction = Function.isForwardBranch(&BB, Successor)
++                         ? 1  // Forward branch
++                         : 0; // Backwards branch
++
++    auto OnlyBranchInfo = BB.getOnlyBranchInfo();
++    BFI->Count = OnlyBranchInfo.Count;
++
++    if (Offset) {
++      uint32_t TargetOffset = Successor->getInputOffset();
++      uint32_t BranchOffset = Offset;
++      if (BranchOffset != UINT32_MAX && TargetOffset != UINT32_MAX) {
++        int64_t Delta = static_cast<int64_t>(TargetOffset) -
++                        static_cast<int64_t>(BranchOffset);
++        BFI->DeltaTaken = std::abs(Delta);
++      }
++    }
++  } else {
++    if (BB.succ_size() == 2) {
++      auto FallthroughBranchInfo = BB.getFallthroughBranchInfo();
++      BFI->FallthroughCount = FallthroughBranchInfo.Count;
++    } else {
++      auto OnlyBranchInfo = BB.getOnlyBranchInfo();
++      BFI->FallthroughCount = OnlyBranchInfo.Count;
++    }
++    BFI->FalseSuccessor = std::move(SuccBBInfo);
++  }
++}
++
++void FeatureMiner::extractFeatures(BinaryFunction &Function,
++                                   BinaryContext &BC) {
++  int8_t ProcedureType = getProcedureType(Function, BC);
++  auto Info = DataflowInfoManager(Function, nullptr, nullptr);
++  const BinaryLoopInfo &LoopsInfo = Function.getLoopInfo();
++
++  bool Simple = Function.isSimple();
++  const auto &Order = Function.dfs();
++  std::string Function_name = Function.getPrintName();
++
++  for (auto *BBA : Order) {
++
++    auto &BB = *BBA;
++
++    BinaryBasicBlockFeature BBF = BB.getFeatures();
++
++    unsigned TotalLoops{0};
++    unsigned LoopDepth{0};
++    unsigned LoopNumBlocks{0};
++
++    bool LocalExitingBlock{false};
++    bool LocalLatchBlock{false};
++    bool LocalLoopHeader{false};
++
++    generateProfileFeatures(&BB, &BBF);
++
++    BinaryLoop *Loop = LoopsInfo.getLoopFor(&BB);
++    if (Loop) {
++      SmallVector<BinaryBasicBlock *, 1> ExitingBlocks;
++      Loop->getExitingBlocks(ExitingBlocks);
++
++      SmallVector<BinaryBasicBlock *, 1> ExitBlocks;
++      Loop->getExitBlocks(ExitBlocks);
++
++      SmallVector<BinaryLoop::Edge, 1> ExitEdges;
++      Loop->getExitEdges(ExitEdges);
++
++      SmallVector<BinaryBasicBlock *, 1> Latches;
++      Loop->getLoopLatches(Latches);
++
++      TotalLoops = LoopsInfo.TotalLoops;
++      LoopDepth = Loop->getLoopDepth();
++      LoopNumBlocks = Loop->getNumBlocks();
++      LocalExitingBlock = Loop->isLoopExiting(&BB);
++      LocalLatchBlock = Loop->isLoopLatch(&BB);
++      LocalLoopHeader = ((Loop->getHeader() == (&BB)) ?
1 : 0); ++ } ++ ++ unsigned NumLoads{0}; ++ unsigned NumCalls{0}; ++ unsigned NumIndirectCalls{0}; ++ ++ for (auto &Inst : BB) { ++ if (BC.MIB->isLoad(Inst)) { ++ ++NumLoads; ++ } else if (BC.MIB->isCall(Inst)) { ++ ++NumCalls; ++ if (BC.MIB->isIndirectCall(Inst)) ++ ++NumIndirectCalls; ++ } ++ } ++ ++ int Index = -2; ++ bool LoopHeader = SBI->isLoopHeader(&BB); ++ ++ BFIPtr BFI = std::make_unique(); ++ ++ BFI->TotalLoops = TotalLoops; ++ BFI->LoopDepth = LoopDepth; ++ BFI->LoopNumBlocks = LoopNumBlocks; ++ BFI->LocalExitingBlock = LocalExitingBlock; ++ BFI->LocalLatchBlock = LocalLatchBlock; ++ BFI->LocalLoopHeader = LocalLoopHeader; ++ BFI->NumCalls = NumCalls; ++ BFI->BasicBlockSize = BB.size(); ++ BFI->NumBasicBlocks = Function.size(); ++ ++ BFI->NumLoads = NumLoads; ++ BFI->NumIndirectCalls = NumIndirectCalls; ++ BFI->LoopHeader = LoopHeader; ++ BFI->ProcedureType = ProcedureType; ++ ++ // Adding taken successor info. ++ addSuccessorInfo(BFI, Function, BC, BB, true); ++ // Adding fall through successor info. ++ addSuccessorInfo(BFI, Function, BC, BB, false); ++ ++ MCInst ConditionalInst; ++ bool hasConditionalBranch = false; ++ MCInst UnconditionalInst; ++ bool hasUnconditionalBranch = false; ++ ++ for (auto &Inst : BB) { ++ ++Index; ++ if (!BC.MIA->isConditionalBranch(Inst) && ++ !BC.MIA->isUnconditionalBranch(Inst)) ++ continue; ++ ++ generateInstFeatures(BC, BB, BFI, Index); ++ ++ if (BC.MIA->isConditionalBranch(Inst)) { ++ ConditionalInst = Inst; ++ hasConditionalBranch = true; ++ } ++ ++ if (BC.MIA->isUnconditionalBranch(Inst)) { ++ UnconditionalInst = Inst; ++ hasUnconditionalBranch = true; ++ } ++ } ++ ++ if (hasConditionalBranch) { ++ BFI->Opcode = ConditionalInst.getOpcode(); ++ ++ } else { ++ if (hasUnconditionalBranch) { ++ BFI->Opcode = UnconditionalInst.getOpcode(); ++ ++ } else { ++ auto Inst = BB.getLastNonPseudoInstr(); ++ BFI->Opcode = Inst->getOpcode(); ++ generateInstFeatures(BC, BB, BFI, Index); ++ } ++ } ++ ++ auto &FalseSuccessor = BFI->FalseSuccessor; ++ auto &TrueSuccessor = BFI->TrueSuccessor; ++ ++ int16_t ProcedureType = (BFI->ProcedureType.has_value()) ++ ? static_cast(*(BFI->ProcedureType)) ++ : -1; ++ ++ int64_t Count = ++ (BFI->Count.has_value()) ? static_cast(*(BFI->Count)) : -1; ++ ++ int64_t FallthroughCount = ++ (BFI->FallthroughCount.has_value()) ++ ? static_cast(*(BFI->FallthroughCount)) ++ : -1; ++ ++ int16_t LoopHeaderValid = (BFI->LoopHeader.has_value()) ++ ? static_cast(*(BFI->LoopHeader)) ++ : -1; ++ ++ int64_t TotalLoopsValid = (BFI->TotalLoops.has_value()) ++ ? static_cast(*(BFI->TotalLoops)) ++ : -1; ++ int64_t LoopDepthValid = (BFI->LoopDepth.has_value()) ++ ? static_cast(*(BFI->LoopDepth)) ++ : -1; ++ int64_t LoopNumBlocksValid = ++ (BFI->LoopNumBlocks.has_value()) ++ ? static_cast(*(BFI->LoopNumBlocks)) ++ : -1; ++ int64_t LocalExitingBlockValid = ++ (BFI->LocalExitingBlock.has_value()) ++ ? static_cast(*(BFI->LocalExitingBlock)) ++ : -1; ++ ++ int64_t LocalLatchBlockValid = ++ (BFI->LocalLatchBlock.has_value()) ++ ? static_cast(*(BFI->LocalLatchBlock)) ++ : -1; ++ ++ int64_t LocalLoopHeaderValid = ++ (BFI->LocalLoopHeader.has_value()) ++ ? static_cast(*(BFI->LocalLoopHeader)) ++ : -1; ++ ++ int32_t CmpOpcode = (BFI->CmpOpcode.has_value()) ++ ? static_cast(*(BFI->CmpOpcode)) ++ : -1; ++ ++ int64_t OperandRAType = (BFI->OperandRAType.has_value()) ++ ? static_cast(*(BFI->OperandRAType)) ++ : 10; ++ ++ int64_t OperandRBType = (BFI->OperandRBType.has_value()) ++ ? 
static_cast(*(BFI->OperandRBType)) ++ : 10; ++ int16_t Direction = (BFI->Direction.has_value()) ++ ? static_cast(*(BFI->Direction)) ++ : -1; ++ ++ int64_t DeltaTaken = (BFI->DeltaTaken.has_value()) ++ ? static_cast(*(BFI->DeltaTaken)) ++ : -1; ++ ++ int64_t NumLoadsValid = (BFI->NumLoads.has_value()) ++ ? static_cast(*(BFI->NumLoads)) ++ : -1; ++ ++ int64_t BasicBlockSize = (BFI->BasicBlockSize.has_value()) ++ ? static_cast(*(BFI->BasicBlockSize)) ++ : -1; ++ ++ int64_t NumBasicBlocks = (BFI->NumBasicBlocks.has_value()) ++ ? static_cast(*(BFI->NumBasicBlocks)) ++ : -1; ++ ++ int64_t NumCallsValid = (BFI->NumCalls.has_value()) ++ ? static_cast(*(BFI->NumCalls)) ++ : -1; ++ ++ int64_t NumIndirectCallsValid = ++ (BFI->NumIndirectCalls.has_value()) ++ ? static_cast(*(BFI->NumIndirectCalls)) ++ : -1; ++ ++ int64_t HasIndirectCalls = (NumIndirectCallsValid > 0) ? 1 : 0; ++ ++ int32_t Opcode = ++ (BFI->Opcode.has_value()) ? static_cast(*(BFI->Opcode)) : -1; ++ ++ uint64_t fun_exec = Function.getExecutionCount(); ++ fun_exec = (fun_exec != UINT64_MAX) ? fun_exec : 0; ++ ++ BBF.setDirection(Direction); ++ BBF.setDeltaTaken(DeltaTaken); ++ BBF.setOpcode(Opcode); ++ BBF.setCmpOpcode(CmpOpcode); ++ BBF.setOperandRAType(OperandRAType); ++ BBF.setOperandRBType(OperandRBType); ++ BBF.setFunExec(fun_exec); ++ BBF.setTotalLoops(TotalLoopsValid); ++ BBF.setLoopDepth(LoopDepthValid); ++ BBF.setLoopNumBlocks(LoopNumBlocksValid); ++ BBF.setLocalExitingBlock(LocalExitingBlockValid); ++ BBF.setLocalLatchBlock(LocalLatchBlockValid); ++ BBF.setLocalLoopHeader(LocalLoopHeaderValid); ++ BBF.setNumCalls(NumCallsValid); ++ BBF.setBasicBlockSize(BasicBlockSize); ++ BBF.setNumBasicBlocks(NumBasicBlocks); ++ BBF.setNumLoads(NumLoadsValid); ++ BBF.setHasIndirectCalls(HasIndirectCalls); ++ BBF.setLoopHeader(LoopHeaderValid); ++ BBF.setProcedureType(ProcedureType); ++ BBF.setCount(Count); ++ BBF.setFallthroughCount(FallthroughCount); ++ ++ generateSuccessorFeatures(TrueSuccessor, &BBF); ++ generateSuccessorFeatures(FalseSuccessor, &BBF); ++ ++ FalseSuccessor.reset(); ++ TrueSuccessor.reset(); ++ ++ BBF.setInferenceFeatures(); ++ BB.setFeatures(BBF); ++ ++ BFI.reset(); ++ } ++} ++ ++void FeatureMiner::generateInstFeatures(BinaryContext &BC, BinaryBasicBlock &BB, ++ BFIPtr const &BFI, int Index) { ++ ++ // Holds the branch opcode info. ++ ++ BFI->CmpOpcode = 0; ++ if (Index > -1) { ++ auto Cmp = BB.begin() + Index; ++ if (BC.MII->get((*Cmp).getOpcode()).isCompare()) { ++ // Holding the branch comparison opcode info. 
++ BFI->CmpOpcode = (*Cmp).getOpcode(); ++ auto getOperandType = [&](const MCOperand &Operand) -> int32_t { ++ if (Operand.isReg()) ++ return 0; ++ else if (Operand.isImm()) ++ return 1; ++ else if (Operand.isSFPImm()) ++ return 2; ++ else if (Operand.isExpr()) ++ return 3; ++ else ++ return -1; ++ }; ++ ++ const auto InstInfo = BC.MII->get((*Cmp).getOpcode()); ++ unsigned NumDefs = InstInfo.getNumDefs(); ++ int32_t NumPrimeOperands = MCPlus::getNumPrimeOperands(*Cmp) - NumDefs; ++ switch (NumPrimeOperands) { ++ case 6: { ++ int32_t RBType = getOperandType((*Cmp).getOperand(NumDefs)); ++ int32_t RAType = getOperandType((*Cmp).getOperand(NumDefs + 1)); ++ ++ if (RBType == 0 && RAType == 0) { ++ BFI->OperandRBType = RBType; ++ BFI->OperandRAType = RAType; ++ } else if (RBType == 0 && (RAType == 1 || RAType == 2)) { ++ RAType = getOperandType((*Cmp).getOperand(NumPrimeOperands - 1)); ++ ++ if (RAType != 1 && RAType != 2) { ++ RAType = -1; ++ } ++ ++ BFI->OperandRBType = RBType; ++ BFI->OperandRAType = RAType; ++ } else { ++ BFI->OperandRAType = -1; ++ BFI->OperandRBType = -1; ++ } ++ break; ++ } ++ case 2: ++ BFI->OperandRBType = getOperandType((*Cmp).getOperand(NumDefs)); ++ BFI->OperandRAType = getOperandType((*Cmp).getOperand(NumDefs + 1)); ++ break; ++ case 3: ++ BFI->OperandRBType = getOperandType((*Cmp).getOperand(NumDefs)); ++ BFI->OperandRAType = getOperandType((*Cmp).getOperand(NumDefs + 2)); ++ break; ++ case 1: ++ BFI->OperandRAType = getOperandType((*Cmp).getOperand(NumDefs)); ++ break; ++ default: ++ BFI->OperandRAType = -1; ++ BFI->OperandRBType = -1; ++ break; ++ } ++ ++ } else { ++ Index -= 1; ++ for (int Idx = Index; Idx > -1; Idx--) { ++ auto Cmp = BB.begin() + Idx; ++ if (BC.MII->get((*Cmp).getOpcode()).isCompare()) { ++ // Holding the branch comparison opcode info. ++ BFI->CmpOpcode = (*Cmp).getOpcode(); ++ break; ++ } ++ } ++ } ++ } ++} ++ ++void FeatureMiner::generateSuccessorFeatures(BBIPtr &Successor, ++ BinaryBasicBlockFeature *BBF) { ++ ++ int16_t LoopHeader = (Successor->LoopHeader.has_value()) ++ ? static_cast(*(Successor->LoopHeader)) ++ : -1; ++ ++ int16_t Backedge = (Successor->Backedge.has_value()) ++ ? static_cast(*(Successor->Backedge)) ++ : -1; ++ ++ int16_t Exit = (Successor->Exit.has_value()) ++ ? static_cast(*(Successor->Exit)) ++ : -1; ++ ++ int16_t Call = (Successor->Call.has_value()) ++ ? static_cast(*(Successor->Call)) ++ : -1; ++ ++ int32_t EndOpcode = (Successor->EndOpcode.has_value()) ++ ? static_cast(*(Successor->EndOpcode)) ++ : -1; ++ ++ int64_t BasicBlockSize = ++ (Successor->BasicBlockSize.has_value()) ++ ? 
static_cast(*(Successor->BasicBlockSize)) ++ : -1; ++ ++ BBF->setEndOpcodeVec(EndOpcode); ++ BBF->setLoopHeaderVec(LoopHeader); ++ BBF->setBackedgeVec(Backedge); ++ BBF->setExitVec(Exit); ++ BBF->setCallVec(Call); ++ BBF->setBasicBlockSizeVec(BasicBlockSize); ++} ++ ++void FeatureMiner::runOnFunctions(BinaryContext &BC) {} ++ ++void FeatureMiner::inferenceFeatures(BinaryFunction &Function) { ++ ++ SBI = std::make_unique(); ++ ++ if (Function.empty()) ++ return; ++ ++ if (!Function.isLoopFree()) { ++ const BinaryLoopInfo &LoopsInfo = Function.getLoopInfo(); ++ SBI->findLoopEdgesInfo(LoopsInfo); ++ } ++ ++ BinaryContext &BC = Function.getBinaryContext(); ++ extractFeatures(Function, BC); ++ ++ SBI->clear(); ++} ++ ++void FeatureMiner::generateProfileFeatures(BinaryBasicBlock *BB, ++ BinaryBasicBlockFeature *BBF) { ++ int32_t parentChildNum, parentCount, childParentNum, childCount; ++ ++ if (BB->getParentSet().size() == 0) { ++ parentChildNum = -1; ++ parentCount = -1; ++ } else { ++ parentChildNum = std::numeric_limits::max(); ++ parentCount = 0; ++ for (BinaryBasicBlock *parent : BB->getParentSet()) { ++ if (parent->getChildrenSet().size() < parentChildNum) { ++ parentChildNum = parent->getChildrenSet().size(); ++ parentCount = parent->getExecutionCount(); ++ } else if (parent->getChildrenSet().size() == parentChildNum && ++ parent->getExecutionCount() > parentCount) { ++ parentCount = parent->getExecutionCount(); ++ } ++ } ++ } ++ ++ if (BB->getChildrenSet().size() == 0) { ++ childParentNum = -1; ++ childCount = -1; ++ } else { ++ childParentNum = std::numeric_limits::max(); ++ childCount = 0; ++ for (BinaryBasicBlock *child : BB->getChildrenSet()) { ++ if (child->getParentSet().size() < childParentNum) { ++ childParentNum = child->getParentSet().size(); ++ childCount = child->getExecutionCount(); ++ } else if (child->getParentSet().size() == childParentNum && ++ child->getExecutionCount() > childCount) { ++ childCount = child->getExecutionCount(); ++ } ++ } ++ } ++ ++ int64_t parentCountCatch = parentCount > 0 ? 1 : 0; ++ int64_t childCountCatch = childCount > 0 ? 1 : 0; ++ ++ BBF->setParentChildNum(parentChildNum); ++ BBF->setParentCount(parentCountCatch); ++ BBF->setChildParentNum(childParentNum); ++ BBF->setChildCount(childCountCatch); ++} ++ ++} // namespace bolt ++} // namespace llvm +\ No newline at end of file +diff --git a/bolt/lib/Passes/StaticBranchInfo.cpp b/bolt/lib/Passes/StaticBranchInfo.cpp +new file mode 100644 +index 000000000..585dbcae2 +--- /dev/null ++++ b/bolt/lib/Passes/StaticBranchInfo.cpp +@@ -0,0 +1,143 @@ ++//===------ Passes/StaticBranchInfo.cpp -----------------------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This is an auxiliary class to the feature miner, static branch probability ++// and frequency passes. This class is responsible for finding loop info (loop ++// back edges, loop exit edges and loop headers) of a function. It also finds ++// basic block info (if a block contains store and call instructions) and if a ++// basic block contains a call to the exit. 
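++//
++// For example, given a loop with header H, latch L, and a body block B that
++// can branch to a block X outside the loop, findLoopEdgesInfo records H in
++// LoopHeaders, the CFG edge (L, H) in BackEdges, and (B, X) in ExitEdges.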
++// ++//===----------------------------------------------------------------------===// ++ ++#include "bolt/Core/BinaryBasicBlock.h" ++#include "bolt/Core/BinaryLoop.h" ++#include "bolt/Passes/StaticBranchInfo.h" ++ ++namespace llvm { ++namespace bolt { ++ ++void StaticBranchInfo::findLoopEdgesInfo(const BinaryLoopInfo &LoopsInfo) { ++ // Traverse discovered loops ++ std::stack Loops; ++ for (BinaryLoop *BL : LoopsInfo) ++ Loops.push(BL); ++ ++ while (!Loops.empty()) { ++ BinaryLoop *Loop = Loops.top(); ++ Loops.pop(); ++ BinaryBasicBlock *LoopHeader = Loop->getHeader(); ++ LoopHeaders.insert(LoopHeader); ++ ++ // Add nested loops in the stack. ++ for (BinaryLoop::iterator I = Loop->begin(), E = Loop->end(); I != E; ++I) { ++ Loops.push(*I); ++ } ++ ++ SmallVector Latches; ++ Loop->getLoopLatches(Latches); ++ ++ // Find back edges. ++ for (BinaryBasicBlock *Latch : Latches) { ++ for (BinaryBasicBlock *Succ : Latch->successors()) { ++ if (Succ == LoopHeader) { ++ Edge CFGEdge = std::make_pair(Latch->getLabel(), Succ->getLabel()); ++ BackEdges.insert(CFGEdge); ++ } ++ } ++ } ++ ++ // Find exit edges. ++ SmallVector AuxExitEdges; ++ Loop->getExitEdges(AuxExitEdges); ++ for (BinaryLoop::Edge &Exit : AuxExitEdges) { ++ ExitEdges.insert(Exit); ++ } ++ } ++} ++ ++void StaticBranchInfo::findBasicBlockInfo(const BinaryFunction &Function, ++ BinaryContext &BC) { ++ for (auto &BB : Function) { ++ for (auto &Inst : BB) { ++ if (BC.MIB->isCall(Inst)) ++ CallSet.insert(&BB); ++ else if (BC.MIB->isStore(Inst)) ++ StoreSet.insert(&BB); ++ } ++ } ++} ++ ++bool StaticBranchInfo::isBackEdge(const Edge &CFGEdge) const { ++ return BackEdges.count(CFGEdge); ++} ++ ++bool StaticBranchInfo::isBackEdge(const BinaryBasicBlock *SrcBB, ++ const BinaryBasicBlock *DstBB) const { ++ const Edge CFGEdge = std::make_pair(SrcBB->getLabel(), DstBB->getLabel()); ++ return isBackEdge(CFGEdge); ++} ++ ++bool StaticBranchInfo::isExitEdge(const BinaryLoop::Edge &CFGEdge) const { ++ return ExitEdges.count(CFGEdge); ++} ++ ++bool StaticBranchInfo::isExitEdge(const BinaryBasicBlock *SrcBB, ++ const BinaryBasicBlock *DstBB) const { ++ const BinaryLoop::Edge CFGEdge = ++ std::make_pair(const_cast(SrcBB), ++ const_cast(DstBB)); ++ return isExitEdge(CFGEdge); ++} ++ ++bool StaticBranchInfo::isLoopHeader(const BinaryBasicBlock *BB) const { ++ return LoopHeaders.count(BB); ++} ++ ++bool StaticBranchInfo::hasCallInst(const BinaryBasicBlock *BB) const { ++ return CallSet.count(BB); ++} ++ ++bool StaticBranchInfo::hasStoreInst(const BinaryBasicBlock *BB) const { ++ return StoreSet.count(BB); ++} ++ ++unsigned StaticBranchInfo::countBackEdges(BinaryBasicBlock *BB) const { ++ unsigned CountEdges = 0; ++ ++ for (BinaryBasicBlock *SuccBB : BB->successors()) { ++ const Edge CFGEdge = std::make_pair(BB->getLabel(), SuccBB->getLabel()); ++ if (BackEdges.count(CFGEdge)) ++ ++CountEdges; ++ } ++ ++ return CountEdges; ++} ++ ++unsigned StaticBranchInfo::countExitEdges(BinaryBasicBlock *BB) const { ++ unsigned CountEdges = 0; ++ ++ for (BinaryBasicBlock *SuccBB : BB->successors()) { ++ const BinaryLoop::Edge CFGEdge = std::make_pair(BB, SuccBB); ++ if (ExitEdges.count(CFGEdge)) ++ ++CountEdges; ++ } ++ ++ return CountEdges; ++} ++ ++void StaticBranchInfo::clear() { ++ LoopHeaders.clear(); ++ BackEdges.clear(); ++ ExitEdges.clear(); ++ CallSet.clear(); ++ StoreSet.clear(); ++} ++ ++} // namespace bolt ++} // namespace llvm +diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp +index 0e12e8cb3..447b71fe7 100644 +--- 
a/bolt/lib/Profile/DataReader.cpp
++++ b/bolt/lib/Profile/DataReader.cpp
+@@ -12,13 +12,16 @@
+ //===----------------------------------------------------------------------===//
+
+ #include "bolt/Profile/DataReader.h"
++#include "bolt/Passes/FeatureMiner.h"
+ #include "bolt/Core/BinaryFunction.h"
+ #include "bolt/Passes/MCF.h"
+ #include "bolt/Utils/Utils.h"
+ #include "llvm/Support/CommandLine.h"
+ #include "llvm/Support/Debug.h"
+ #include "llvm/Support/Errc.h"
++#include
+ #include
++#include
+
+ #undef DEBUG_TYPE
+ #define DEBUG_TYPE "bolt-prof"
+@@ -26,15 +29,23 @@
+ using namespace llvm;
+
+ namespace opts {
+-
++extern cl::opt<bool> BlockCorrection;
+ extern cl::OptionCategory BoltCategory;
+ extern llvm::cl::opt<unsigned> Verbosity;
+
+-static cl::opt<bool>
+-DumpData("dump-data",
+-  cl::desc("dump parsed bolt data for debugging"),
+-  cl::Hidden,
+-  cl::cat(BoltCategory));
++static cl::opt<std::string> InputModelFilename("model-path",
++                                               cl::desc(""),
++                                               cl::Optional,
++                                               cl::cat(BoltCategory));
++
++static cl::opt<float> AnnotateThreshold(
++    "annotate-threshold",
++    cl::desc(""),
++    cl::init(0.85f), cl::Optional, cl::cat(BoltCategory));
++
++static cl::opt<bool> DumpData("dump-data",
++                              cl::desc("dump parsed bolt data for debugging"),
++                              cl::Hidden, cl::cat(BoltCategory));
+
+ } // namespace opts
+
+@@ -311,6 +322,17 @@ Error DataReader::readProfilePreCFG(BinaryContext &BC) {
+ }
+
+ Error DataReader::readProfile(BinaryContext &BC) {
++
++  if (opts::BlockCorrection) {
++    if (opts::InputModelFilename.empty()) {
++      outs() << "error: llvm-bolt expected -model-path= option.\n";
++      exit(1);
++    } else {
++      DataReader::initializeONNXRunner(opts::InputModelFilename);
++      DataReader::setThreshold(opts::AnnotateThreshold);
++    }
++  }
++
+   for (auto &BFI : BC.getBinaryFunctions()) {
+     BinaryFunction &Function = BFI.second;
+     readProfile(Function);
+@@ -324,6 +346,12 @@ Error DataReader::readProfile(BinaryContext &BC) {
+   }
+   BC.setNumUnusedProfiledObjects(NumUnused);
+
++  if (opts::BlockCorrection) {
++    uint64_t modified_total = DataReader::getModifiedBBTotal();
++    outs() << "BOLT-INFO: total number of modified CFG BB counts is "
++           << modified_total << ".\n";
++  }
++
+   return Error::success();
+ }
+
+@@ -555,6 +583,75 @@ float DataReader::evaluateProfileData(BinaryFunction &BF,
+   return MatchRatio;
+ }
+
++void generateChildrenParentCount(BinaryBasicBlock *BB) {
++  typedef GraphTraits<BinaryBasicBlock *> GraphT;
++
++  for (typename GraphT::ChildIteratorType CI = GraphT::child_begin(BB),
++                                          E = GraphT::child_end(BB);
++       CI != E; ++CI) {
++    typename GraphT::NodeRef Child = *CI;
++    BB->insertChildrenSet(Child);
++    Child->insertParentSet(BB);
++  }
++}
++
++void generateChildrenParentCount(BinaryFunction &BF) {
++  for (BinaryBasicBlock &BB : BF) {
++    generateChildrenParentCount(&BB);
++  }
++}
++
++uint64_t estimateBBCount(DataReader *dataReaderRef, BinaryBasicBlock *BB,
++                         float threshold) {
++  uint64_t modified = 0;
++  if (BB->getExecutionCount() != 0) {
++    return modified;
++  }
++
++  std::vector<std::string> input_string;
++  std::vector<int64_t> input_int64;
++  std::vector<float> input_float;
++
++  BinaryBasicBlockFeature BBF = BB->getFeatures();
++  input_int64 = BBF.getInferenceFeatures();
++
++  if (input_int64.empty()) {
++    return 0;
++  }
++
++  float model_pred =
++      dataReaderRef->ONNXInference(input_string, input_int64, input_float);
++  if (model_pred >= threshold) {
++    uint64_t min_neighbor_count = std::numeric_limits<uint64_t>::max();
++    for (BinaryBasicBlock *parent : BB->getParentSet()) {
++      if (parent->getExecutionCount() > 0 &&
++          parent->getExecutionCount() < min_neighbor_count)
++
min_neighbor_count = parent->getExecutionCount();
++    }
++    for (BinaryBasicBlock *child : BB->getChildrenSet()) {
++      if (child->getExecutionCount() > 0 &&
++          child->getExecutionCount() < min_neighbor_count)
++        min_neighbor_count = child->getExecutionCount();
++    }
++    if (min_neighbor_count != std::numeric_limits<uint64_t>::max()) {
++      BB->setExecutionCount(min_neighbor_count);
++      modified = 1;
++    }
++  }
++  return modified;
++}
++
++uint64_t estimateBBCount(DataReader *dataReaderRef, BinaryFunction &BF,
++                         float threshold) {
++  uint64_t modified_total_func = 0;
++  const auto &Order = BF.dfs();
++  for (auto *BBA : Order) {
++    auto &BB = *BBA;
++    modified_total_func += estimateBBCount(dataReaderRef, &BB, threshold);
++  }
++  return modified_total_func;
++}
++
+ void DataReader::readSampleData(BinaryFunction &BF) {
+   FuncSampleData *SampleDataOrErr = getFuncSampleData(BF.getNames());
+   if (!SampleDataOrErr)
+@@ -600,6 +697,17 @@ void DataReader::readSampleData(BinaryFunction &BF) {
+
+   BF.ExecutionCount = TotalEntryCount;
+
++  if (opts::BlockCorrection) {
++    generateChildrenParentCount(BF);
++    std::unique_ptr<FeatureMiner> FM =
++        std::make_unique<FeatureMiner>(opts::BlockCorrection);
++    FM->inferenceFeatures(BF);
++
++    float threshold = DataReader::getThreshold();
++    uint64_t modified_total_func = estimateBBCount(this, BF, threshold);
++    DataReader::addModifiedBBTotal(modified_total_func);
++  }
++
+   estimateEdgeCounts(BF);
+ }
+
+diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
+index c6ea0b009..4191e18bd 100644
+--- a/bolt/lib/Rewrite/RewriteInstance.cpp
++++ b/bolt/lib/Rewrite/RewriteInstance.cpp
+@@ -106,6 +106,12 @@ cl::opt<bool> DumpDotAll(
+     "enable '-print-loops' for color-coded blocks"),
+     cl::Hidden, cl::cat(BoltCategory));
+
++cl::opt<bool> BlockCorrection(
++    "block-correction",
++    cl::desc("capture features for the ML model to infer binary basic block "
++             "counts and correct them on the CFG"),
++    cl::ZeroOrMore, cl::cat(BoltOptCategory));
++
+ static cl::list<std::string>
+ ForceFunctionNames("funcs",
+   cl::CommaSeparated,
+--
+2.39.3 (Apple Git-146)
+
diff --git a/0007-BOLT-Skip-PLT-search-for-zero-value-weak-reference-symbols.patch b/0007-BOLT-Skip-PLT-search-for-zero-value-weak-reference-symbols.patch
new file mode 100644
index 0000000000000000000000000000000000000000..905962d09037238fc1ddf960de19c8dda3f57969
--- /dev/null
+++ b/0007-BOLT-Skip-PLT-search-for-zero-value-weak-reference-symbols.patch
@@ -0,0 +1,96 @@
+From 6c8933e1a095028d648a5a26aecee0f569304dd0 Mon Sep 17 00:00:00 2001
+From: sinan
+Date: Wed, 7 Aug 2024 18:02:42 +0800
+Subject: [PATCH] [BOLT] Skip PLT search for zero-value weak reference symbols
+ (#69136)
+
+Take a common weak reference pattern for example
+```
+ __attribute__((weak)) void undef_weak_fun();
+
+ if (&undef_weak_fun)
+   undef_weak_fun();
+```
+
+In this case, an undefined weak symbol `undef_weak_fun` has an address
+of zero, and Bolt incorrectly changes the relocation for the
+corresponding symbol to symbol@PLT, leading to incorrect runtime
+behavior.
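+
+With this change, BOLT leaves the zero symbol value in place for symbols that
+are both undefined and weak, so `&undef_weak_fun` still evaluates to null at
+runtime and the guarded call is skipped. A rough sketch of the added check
+(mirroring the IsWeakReference lambda in the diff below):
+
+```
+ bool IsWeakReference = (*SymFlagsOrErr & SymbolRef::SF_Undefined) &&
+                        (*SymFlagsOrErr & SymbolRef::SF_Weak);
+```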
+--- + bolt/lib/Rewrite/RewriteInstance.cpp | 11 +++++- + .../AArch64/update-weak-reference-symbol.s | 34 +++++++++++++++++++ + 2 files changed, 44 insertions(+), 1 deletion(-) + create mode 100644 bolt/test/AArch64/update-weak-reference-symbol.s + +diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp +index 78b4889bf2ae..d2e2ca2f7553 100644 +--- a/bolt/lib/Rewrite/RewriteInstance.cpp ++++ b/bolt/lib/Rewrite/RewriteInstance.cpp +@@ -2143,6 +2143,14 @@ bool RewriteInstance::analyzeRelocation( + if (!Relocation::isSupported(RType)) + return false; + ++ auto IsWeakReference = [](const SymbolRef &Symbol) { ++ Expected SymFlagsOrErr = Symbol.getFlags(); ++ if (!SymFlagsOrErr) ++ return false; ++ return (*SymFlagsOrErr & SymbolRef::SF_Undefined) && ++ (*SymFlagsOrErr & SymbolRef::SF_Weak); ++ }; ++ + const bool IsAArch64 = BC->isAArch64(); + + const size_t RelSize = Relocation::getSizeForType(RType); +@@ -2174,7 +2182,8 @@ bool RewriteInstance::analyzeRelocation( + // Section symbols are marked as ST_Debug. + IsSectionRelocation = (cantFail(Symbol.getType()) == SymbolRef::ST_Debug); + // Check for PLT entry registered with symbol name +- if (!SymbolAddress && (IsAArch64 || BC->isRISCV())) { ++ if (!SymbolAddress && !IsWeakReference(Symbol) && ++ (IsAArch64 || BC->isRISCV())) { + const BinaryData *BD = BC->getPLTBinaryDataByName(SymbolName); + SymbolAddress = BD ? BD->getAddress() : 0; + } +diff --git a/bolt/test/AArch64/update-weak-reference-symbol.s b/bolt/test/AArch64/update-weak-reference-symbol.s +new file mode 100644 +index 000000000000..600a06b8b6d8 +--- /dev/null ++++ b/bolt/test/AArch64/update-weak-reference-symbol.s +@@ -0,0 +1,34 @@ ++// This test checks whether BOLT can correctly handle relocations against weak symbols. 
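++// func_1 remains an undefined weak reference, so its .rodata slot must stay
++// zero after BOLT, while func_2 is a defined weak symbol whose (possibly
++// relocated) address must appear in .rodata, as the CHECK lines below verify.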
++ ++// RUN: %clang %cflags -Wl,-z,notext -shared -Wl,-q %s -o %t.so ++// RUN: llvm-bolt %t.so -o %t.so.bolt ++// RUN: llvm-nm -n %t.so.bolt > %t.out.txt ++// RUN: llvm-objdump -dj .rodata %t.so.bolt >> %t.out.txt ++// RUN: FileCheck %s --input-file=%t.out.txt ++ ++# CHECK: w func_1 ++# CHECK: {{0+}}[[#%x,ADDR:]] W func_2 ++ ++# CHECK: {{.*}} <.rodata>: ++# CHECK-NEXT: {{.*}} .word 0x00000000 ++# CHECK-NEXT: {{.*}} .word 0x00000000 ++# CHECK-NEXT: {{.*}} .word 0x{{[0]+}}[[#ADDR]] ++# CHECK-NEXT: {{.*}} .word 0x00000000 ++ ++ .text ++ .weak func_2 ++ .weak func_1 ++ .global wow ++ .type wow, %function ++wow: ++ bl func_1 ++ bl func_2 ++ ret ++ .type func_2, %function ++func_2: ++ ret ++ .section .rodata ++.LC0: ++ .xword func_1 ++.LC1: ++ .xword func_2 +-- +2.39.3 (Apple Git-146) + diff --git a/0008-merge-fdata-Support-process-no_lbr-profile-file.patch b/0008-merge-fdata-Support-process-no_lbr-profile-file.patch new file mode 100644 index 0000000000000000000000000000000000000000..3ba709f6bb54bb678707d533288527db6eb5a354 --- /dev/null +++ b/0008-merge-fdata-Support-process-no_lbr-profile-file.patch @@ -0,0 +1,58 @@ +From 583d41ce046670eae7a59fb678a9e959cf0af061 Mon Sep 17 00:00:00 2001 +From: liyancheng <412998149@qq.com> +Date: Tue, 10 Sep 2024 15:09:51 +0800 +Subject: [PATCH] [merge-fdata] Support processing no_lbr profile file + +--- + bolt/tools/merge-fdata/merge-fdata.cpp | 21 ++++++++++++++++++++- + 1 file changed, 20 insertions(+), 1 deletion(-) + +diff --git a/bolt/tools/merge-fdata/merge-fdata.cpp b/bolt/tools/merge-fdata/merge-fdata.cpp +index 757f05366..147e18639 100644 +--- a/bolt/tools/merge-fdata/merge-fdata.cpp ++++ b/bolt/tools/merge-fdata/merge-fdata.cpp +@@ -261,6 +261,7 @@ bool isYAML(const StringRef Filename) { + void mergeLegacyProfiles(const SmallVectorImpl &Filenames) { + errs() << "Using legacy profile format.\n"; + std::optional BoltedCollection; ++ std::optional NoLBRMode; + std::mutex BoltedCollectionMutex; + typedef StringMap ProfileTy; + +@@ -294,6 +295,22 @@ void mergeLegacyProfiles(const SmallVectorImpl &Filenames) { + BoltedCollection = false; + } + ++ // Check if the string "no_lbr" is in the first line ++ if (Buf.startswith("no_lbr")) { ++ if (!NoLBRMode.value_or(true)) ++ report_error( ++ Filename, ++ "cannot mix profile collected with lbr and non-lbr info"); ++ NoLBRMode = true; ++ Buf = Buf.drop_front(Buf.find_first_of("\n")); ++ } else { ++ if (NoLBRMode.value_or(false)) ++ report_error( ++ Filename, ++ "cannot mix profile collected with lbr and non-lbr info"); ++ NoLBRMode = false; ++ } ++ + Profile = &Profiles[tid]; + } + +@@ -329,7 +346,9 @@ void mergeLegacyProfiles(const SmallVectorImpl &Filenames) { + MergedProfile.insert_or_assign(Key, Count); + } + +- if (BoltedCollection) ++ if (NoLBRMode) ++ output() << "no_lbr cycles:u:\n"; ++ else if (BoltedCollection) + output() << "boltedcollection\n"; + for (const auto &[Key, Value] : MergedProfile) + output() << Key << " " << Value << "\n"; +-- +2.33.0 + diff --git a/0009-support-aarch64-instrumentation.patch b/0009-support-aarch64-instrumentation.patch new file mode 100644 index 0000000000000000000000000000000000000000..df97a14f08116a4370e2475b2b2403f3b5505d24 --- /dev/null +++ b/0009-support-aarch64-instrumentation.patch @@ -0,0 +1,2630 @@ +From a7d826d3985dd886523df050949f1c3c151df636 Mon Sep 17 00:00:00 2001 +From: rfwang07 +Date: Thu, 31 Oct 2024 15:34:10 +0800 +Subject: [PATCH] support aarch64 instrumentation + +--- + bolt/CMakeLists.txt | 6 +- + bolt/include/bolt/Core/MCPlusBuilder.h | 24 +- + 
bolt/lib/Core/BinaryFunction.cpp | 6 + + bolt/lib/Passes/Instrumentation.cpp | 28 +- + bolt/lib/Passes/MCF.cpp | 1 + + bolt/lib/Passes/TailDuplication.cpp | 2 +- + .../Target/AArch64/AArch64MCPlusBuilder.cpp | 446 +++++++++++++++++- + bolt/lib/Target/X86/X86MCPlusBuilder.cpp | 67 +-- + bolt/runtime/CMakeLists.txt | 12 +- + bolt/runtime/common.h | 417 ++-------------- + bolt/runtime/instr.cpp | 61 ++- + bolt/runtime/sys_aarch64.h | 394 ++++++++++++++++ + bolt/runtime/sys_x86_64.h | 360 ++++++++++++++ + bolt/test/AArch64/exclusive-instrument.s | 39 ++ + bolt/test/X86/asm-dump.c | 5 +- + ...olt-address-translation-internal-call.test | 9 +- + .../test/X86/instrumentation-eh_frame_hdr.cpp | 2 +- + bolt/test/X86/internal-call-instrument.s | 24 +- + bolt/test/X86/tail-duplication-pass.s | 9 + + bolt/test/assume-abi.test | 7 + + .../AArch64/Inputs/basic-instrumentation.s | 9 + + .../AArch64/basic-instrumentation.test | 22 + + .../AArch64/instrumentation-ind-call.c | 38 ++ + .../{X86 => }/Inputs/exceptions_split.cpp | 16 +- + .../runtime/X86/instrumentation-tail-call.s | 6 +- + .../{X86 => }/exceptions-instrumentation.test | 0 + .../{X86 => }/pie-exceptions-split.test | 4 +- + 27 files changed, 1545 insertions(+), 469 deletions(-) + create mode 100644 bolt/runtime/sys_aarch64.h + create mode 100644 bolt/runtime/sys_x86_64.h + create mode 100644 bolt/test/AArch64/exclusive-instrument.s + create mode 100644 bolt/test/assume-abi.test + create mode 100644 bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s + create mode 100644 bolt/test/runtime/AArch64/basic-instrumentation.test + create mode 100644 bolt/test/runtime/AArch64/instrumentation-ind-call.c + rename bolt/test/runtime/{X86 => }/Inputs/exceptions_split.cpp (85%) + rename bolt/test/runtime/{X86 => }/exceptions-instrumentation.test (100%) + rename bolt/test/runtime/{X86 => }/pie-exceptions-split.test (95%) + +diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt +index 4ff90c1..89462f8 100644 +--- a/bolt/CMakeLists.txt ++++ b/bolt/CMakeLists.txt +@@ -32,10 +32,10 @@ foreach (tgt ${BOLT_TARGETS_TO_BUILD}) + endforeach() + + set(BOLT_ENABLE_RUNTIME_default OFF) +-if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" ++if ((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" ++ OR CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") + AND (CMAKE_SYSTEM_NAME STREQUAL "Linux" +- OR CMAKE_SYSTEM_NAME STREQUAL "Darwin") +- AND "X86" IN_LIST BOLT_TARGETS_TO_BUILD) ++ OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")) + set(BOLT_ENABLE_RUNTIME_default ON) + endif() + option(BOLT_ENABLE_RUNTIME "Enable BOLT runtime" ${BOLT_ENABLE_RUNTIME_default}) +diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h +index beb0675..e6945c9 100644 +--- a/bolt/include/bolt/Core/MCPlusBuilder.h ++++ b/bolt/include/bolt/Core/MCPlusBuilder.h +@@ -498,9 +498,9 @@ public: + } + + /// Create increment contents of target by 1 for Instrumentation +- virtual InstructionListType createInstrIncMemory(const MCSymbol *Target, +- MCContext *Ctx, +- bool IsLeaf) const { ++ virtual InstructionListType ++ createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf, ++ unsigned CodePointerSize) const { + llvm_unreachable("not implemented"); + return InstructionListType(); + } +@@ -620,6 +620,11 @@ public: + return false; + } + ++ virtual bool isAArch64Exclusive(const MCInst &Inst) const { ++ llvm_unreachable("not implemented"); ++ return false; ++ } ++ + virtual bool isCleanRegXOR(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; +@@ 
-1597,18 +1602,11 @@ public: + return false; + } + +- virtual void createLoadImmediate(MCInst &Inst, const MCPhysReg Dest, +- uint32_t Imm) const { ++ virtual InstructionListType createLoadImmediate(const MCPhysReg Dest, ++ uint64_t Imm) const { + llvm_unreachable("not implemented"); + } + +- /// Create instruction to increment contents of target by 1 +- virtual bool createIncMemory(MCInst &Inst, const MCSymbol *Target, +- MCContext *Ctx) const { +- llvm_unreachable("not implemented"); +- return false; +- } +- + /// Create a fragment of code (sequence of instructions) that load a 32-bit + /// address from memory, zero-extends it to 64 and jump to it (indirect jump). + virtual bool +@@ -1969,7 +1967,7 @@ public: + } + + virtual InstructionListType createSymbolTrampoline(const MCSymbol *TgtSym, +- MCContext *Ctx) const { ++ MCContext *Ctx) { + llvm_unreachable("not implemented"); + return InstructionListType(); + } +diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp +index 5b44a76..b79bd58 100644 +--- a/bolt/lib/Core/BinaryFunction.cpp ++++ b/bolt/lib/Core/BinaryFunction.cpp +@@ -2305,6 +2305,12 @@ void BinaryFunction::removeConditionalTailCalls() { + + // This branch is no longer a conditional tail call. + BC.MIB->unsetConditionalTailCall(*CTCInstr); ++ ++ // Move offset from CTCInstr to TailCallInstr. ++ if (std::optional Offset = BC.MIB->getOffset(*CTCInstr)) { ++ BC.MIB->setOffset(TailCallInstr, *Offset); ++ BC.MIB->clearOffset(*CTCInstr); ++ } + } + + insertBasicBlocks(std::prev(end()), std::move(NewBlocks), +diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp +index fae6770..72adb31 100644 +--- a/bolt/lib/Passes/Instrumentation.cpp ++++ b/bolt/lib/Passes/Instrumentation.cpp +@@ -13,6 +13,7 @@ + #include "bolt/Passes/Instrumentation.h" + #include "bolt/Core/ParallelUtilities.h" + #include "bolt/RuntimeLibs/InstrumentationRuntimeLibrary.h" ++#include "bolt/Utils/CommandLineOpts.h" + #include "bolt/Utils/Utils.h" + #include "llvm/Support/CommandLine.h" + #include "llvm/Support/RWMutex.h" +@@ -85,6 +86,24 @@ cl::opt InstrumentCalls("instrument-calls", + namespace llvm { + namespace bolt { + ++static bool hasAArch64ExclusiveMemop(BinaryFunction &Function) { ++ // FIXME ARMv8-a architecture reference manual says that software must avoid ++ // having any explicit memory accesses between exclusive load and associated ++ // store instruction. So for now skip instrumentation for functions that have ++ // these instructions, since it might lead to runtime deadlock. 
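++  //
++  // An illustrative (not from this patch) LL/SC retry loop:
++  //
++  //   .Lretry:
++  //     ldaxr x0, [x1]       // load-exclusive sets the exclusive monitor
++  //     add x0, x0, #1
++  //     stlxr w2, x0, [x1]   // store fails if the monitor was cleared
++  //     cbnz w2, .Lretry
++  //
++  // Counter updates inserted between the ldaxr and the stlxr perform extra
++  // memory accesses that can clear the monitor, so the store-exclusive may
++  // fail on every iteration and the loop would spin forever.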
++ BinaryContext &BC = Function.getBinaryContext(); ++ for (const BinaryBasicBlock &BB : Function) ++ for (const MCInst &Inst : BB) ++ if (BC.MIB->isAArch64Exclusive(Inst)) { ++ if (opts::Verbosity >= 1) ++ outs() << "BOLT-INSTRUMENTER: Function " << Function ++ << " has exclusive instructions, skip instrumentation\n"; ++ return true; ++ } ++ ++ return false; ++} ++ + uint32_t Instrumentation::getFunctionNameIndex(const BinaryFunction &Function) { + auto Iter = FuncToStringIdx.find(&Function); + if (Iter != FuncToStringIdx.end()) +@@ -176,7 +195,8 @@ Instrumentation::createInstrumentationSnippet(BinaryContext &BC, bool IsLeaf) { + auto L = BC.scopeLock(); + MCSymbol *Label = BC.Ctx->createNamedTempSymbol("InstrEntry"); + Summary->Counters.emplace_back(Label); +- return BC.MIB->createInstrIncMemory(Label, BC.Ctx.get(), IsLeaf); ++ return BC.MIB->createInstrIncMemory(Label, BC.Ctx.get(), IsLeaf, ++ BC.AsmInfo->getCodePointerSize()); + } + + // Helper instruction sequence insertion function +@@ -287,6 +307,9 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function, + if (BC.isMachO() && Function.hasName("___GLOBAL_init_65535/1")) + return; + ++ if (BC.isAArch64() && hasAArch64ExclusiveMemop(Function)) ++ return; ++ + SplitWorklistTy SplitWorklist; + SplitInstrsTy SplitInstrs; + +@@ -504,9 +527,6 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function, + } + + void Instrumentation::runOnFunctions(BinaryContext &BC) { +- if (!BC.isX86()) +- return; +- + const unsigned Flags = BinarySection::getFlags(/*IsReadOnly=*/false, + /*IsText=*/false, + /*IsAllocatable=*/true); +diff --git a/bolt/lib/Passes/MCF.cpp b/bolt/lib/Passes/MCF.cpp +index ec04012..c3898d2 100644 +--- a/bolt/lib/Passes/MCF.cpp ++++ b/bolt/lib/Passes/MCF.cpp +@@ -262,6 +262,7 @@ bool guessPredEdgeCounts(BinaryBasicBlock *BB, ArcSet &GuessedArcs) { + continue; + + Pred->getBranchInfo(*BB).Count = Guessed; ++ GuessedArcs.insert(std::make_pair(Pred, BB)); + return true; + } + llvm_unreachable("Expected unguessed arc"); +diff --git a/bolt/lib/Passes/TailDuplication.cpp b/bolt/lib/Passes/TailDuplication.cpp +index c04efd7..7141d5d 100644 +--- a/bolt/lib/Passes/TailDuplication.cpp ++++ b/bolt/lib/Passes/TailDuplication.cpp +@@ -303,7 +303,7 @@ TailDuplication::aggressiveDuplicate(BinaryBasicBlock &BB, + if (isInCacheLine(BB, Tail)) + return BlocksToDuplicate; + +- BinaryBasicBlock *CurrBB = &BB; ++ BinaryBasicBlock *CurrBB = &Tail; + while (CurrBB) { + LLVM_DEBUG(dbgs() << "Aggressive tail duplication: adding " + << CurrBB->getName() << " to duplication list\n";); +diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +index cd66b65..3f6497e 100644 +--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp ++++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +@@ -16,6 +16,9 @@ + #include "Utils/AArch64BaseInfo.h" + #include "bolt/Core/MCPlusBuilder.h" + #include "llvm/BinaryFormat/ELF.h" ++#include "llvm/MC/MCContext.h" ++#include "llvm/MC/MCFixupKindInfo.h" ++#include "llvm/MC/MCInstBuilder.h" + #include "llvm/MC/MCInstrInfo.h" + #include "llvm/MC/MCRegisterInfo.h" + #include "llvm/Support/Debug.h" +@@ -28,6 +31,100 @@ using namespace bolt; + + namespace { + ++static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) { ++ Inst.setOpcode(AArch64::MRS); ++ Inst.clear(); ++ Inst.addOperand(MCOperand::createReg(RegName)); ++ Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV)); ++} ++ ++static void setSystemFlag(MCInst &Inst, MCPhysReg RegName) { ++ 
Inst.setOpcode(AArch64::MSR); ++ Inst.clear(); ++ Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV)); ++ Inst.addOperand(MCOperand::createReg(RegName)); ++} ++ ++static void createPushRegisters(MCInst &Inst, MCPhysReg Reg1, MCPhysReg Reg2) { ++ Inst.clear(); ++ unsigned NewOpcode = AArch64::STPXpre; ++ Inst.setOpcode(NewOpcode); ++ Inst.addOperand(MCOperand::createReg(AArch64::SP)); ++ Inst.addOperand(MCOperand::createReg(Reg1)); ++ Inst.addOperand(MCOperand::createReg(Reg2)); ++ Inst.addOperand(MCOperand::createReg(AArch64::SP)); ++ Inst.addOperand(MCOperand::createImm(-2)); ++} ++ ++static void createPopRegisters(MCInst &Inst, MCPhysReg Reg1, MCPhysReg Reg2) { ++ Inst.clear(); ++ unsigned NewOpcode = AArch64::LDPXpost; ++ Inst.setOpcode(NewOpcode); ++ Inst.addOperand(MCOperand::createReg(AArch64::SP)); ++ Inst.addOperand(MCOperand::createReg(Reg1)); ++ Inst.addOperand(MCOperand::createReg(Reg2)); ++ Inst.addOperand(MCOperand::createReg(AArch64::SP)); ++ Inst.addOperand(MCOperand::createImm(2)); ++} ++ ++static void loadReg(MCInst &Inst, MCPhysReg To, MCPhysReg From) { ++ Inst.setOpcode(AArch64::LDRXui); ++ Inst.clear(); ++ if (From == AArch64::SP) { ++ Inst.setOpcode(AArch64::LDRXpost); ++ Inst.addOperand(MCOperand::createReg(From)); ++ Inst.addOperand(MCOperand::createReg(To)); ++ Inst.addOperand(MCOperand::createReg(From)); ++ Inst.addOperand(MCOperand::createImm(16)); ++ } else { ++ Inst.addOperand(MCOperand::createReg(To)); ++ Inst.addOperand(MCOperand::createReg(From)); ++ Inst.addOperand(MCOperand::createImm(0)); ++ } ++} ++ ++static void storeReg(MCInst &Inst, MCPhysReg From, MCPhysReg To) { ++ Inst.setOpcode(AArch64::STRXui); ++ Inst.clear(); ++ if (To == AArch64::SP) { ++ Inst.setOpcode(AArch64::STRXpre); ++ Inst.addOperand(MCOperand::createReg(To)); ++ Inst.addOperand(MCOperand::createReg(From)); ++ Inst.addOperand(MCOperand::createReg(To)); ++ Inst.addOperand(MCOperand::createImm(-16)); ++ } else { ++ Inst.addOperand(MCOperand::createReg(From)); ++ Inst.addOperand(MCOperand::createReg(To)); ++ Inst.addOperand(MCOperand::createImm(0)); ++ } ++} ++ ++static void atomicAdd(MCInst &Inst, MCPhysReg RegTo, MCPhysReg RegCnt) { ++ // NOTE: Supports only ARM with LSE extension ++ Inst.setOpcode(AArch64::LDADDX); ++ Inst.clear(); ++ Inst.addOperand(MCOperand::createReg(AArch64::XZR)); ++ Inst.addOperand(MCOperand::createReg(RegCnt)); ++ Inst.addOperand(MCOperand::createReg(RegTo)); ++} ++ ++static void createMovz(MCInst &Inst, MCPhysReg Reg, uint64_t Imm) { ++ assert(Imm <= UINT16_MAX && "Invalid Imm size"); ++ Inst.clear(); ++ Inst.setOpcode(AArch64::MOVZXi); ++ Inst.addOperand(MCOperand::createReg(Reg)); ++ Inst.addOperand(MCOperand::createImm(Imm & 0xFFFF)); ++ Inst.addOperand(MCOperand::createImm(0)); ++} ++ ++static InstructionListType createIncMemory(MCPhysReg RegTo, MCPhysReg RegTmp) { ++ InstructionListType Insts; ++ Insts.emplace_back(); ++ createMovz(Insts.back(), RegTmp, 1); ++ Insts.emplace_back(); ++ atomicAdd(Insts.back(), RegTo, RegTmp); ++ return Insts; ++} + class AArch64MCPlusBuilder : public MCPlusBuilder { + public: + AArch64MCPlusBuilder(const MCInstrAnalysis *Analysis, const MCInstrInfo *Info, +@@ -176,6 +273,34 @@ public: + return isLDRB(Inst) || isLDRH(Inst) || isLDRW(Inst) || isLDRX(Inst); + } + ++ bool isAArch64Exclusive(const MCInst &Inst) const override { ++ return (Inst.getOpcode() == AArch64::LDXPX || ++ Inst.getOpcode() == AArch64::LDXPW || ++ Inst.getOpcode() == AArch64::LDXRX || ++ Inst.getOpcode() == AArch64::LDXRW || ++ Inst.getOpcode() == 
AArch64::LDXRH || ++ Inst.getOpcode() == AArch64::LDXRB || ++ Inst.getOpcode() == AArch64::STXPX || ++ Inst.getOpcode() == AArch64::STXPW || ++ Inst.getOpcode() == AArch64::STXRX || ++ Inst.getOpcode() == AArch64::STXRW || ++ Inst.getOpcode() == AArch64::STXRH || ++ Inst.getOpcode() == AArch64::STXRB || ++ Inst.getOpcode() == AArch64::LDAXPX || ++ Inst.getOpcode() == AArch64::LDAXPW || ++ Inst.getOpcode() == AArch64::LDAXRX || ++ Inst.getOpcode() == AArch64::LDAXRW || ++ Inst.getOpcode() == AArch64::LDAXRH || ++ Inst.getOpcode() == AArch64::LDAXRB || ++ Inst.getOpcode() == AArch64::STLXPX || ++ Inst.getOpcode() == AArch64::STLXPW || ++ Inst.getOpcode() == AArch64::STLXRX || ++ Inst.getOpcode() == AArch64::STLXRW || ++ Inst.getOpcode() == AArch64::STLXRH || ++ Inst.getOpcode() == AArch64::STLXRB || ++ Inst.getOpcode() == AArch64::CLREX); ++ } ++ + bool isLoadFromStack(const MCInst &Inst) const { + if (!isLoad(Inst)) + return false; +@@ -207,6 +332,40 @@ public: + return Inst.getOpcode() == AArch64::BLR; + } + ++ MCPhysReg getSpRegister(int Size) const { ++ switch (Size) { ++ case 4: ++ return AArch64::WSP; ++ case 8: ++ return AArch64::SP; ++ default: ++ llvm_unreachable("Unexpected size"); ++ } ++ } ++ ++ MCPhysReg getIntArgRegister(unsigned ArgNo) const override { ++ switch (ArgNo) { ++ case 0: ++ return AArch64::X0; ++ case 1: ++ return AArch64::X1; ++ case 2: ++ return AArch64::X2; ++ case 3: ++ return AArch64::X3; ++ case 4: ++ return AArch64::X4; ++ case 5: ++ return AArch64::X5; ++ case 6: ++ return AArch64::X6; ++ case 7: ++ return AArch64::X7; ++ default: ++ return getNoRegister(); ++ } ++ } ++ + bool hasPCRelOperand(const MCInst &Inst) const override { + // ADRP is blacklisted and is an exception. Even though it has a + // PC-relative operand, this operand is not a complete symbol reference +@@ -313,6 +472,22 @@ public: + return true; + } + ++ void getCalleeSavedRegs(BitVector &Regs) const override { ++ Regs |= getAliases(AArch64::X18); ++ Regs |= getAliases(AArch64::X19); ++ Regs |= getAliases(AArch64::X20); ++ Regs |= getAliases(AArch64::X21); ++ Regs |= getAliases(AArch64::X22); ++ Regs |= getAliases(AArch64::X23); ++ Regs |= getAliases(AArch64::X24); ++ Regs |= getAliases(AArch64::X25); ++ Regs |= getAliases(AArch64::X26); ++ Regs |= getAliases(AArch64::X27); ++ Regs |= getAliases(AArch64::X28); ++ Regs |= getAliases(AArch64::LR); ++ Regs |= getAliases(AArch64::FP); ++ } ++ + const MCExpr *getTargetExprFor(MCInst &Inst, const MCExpr *Expr, + MCContext &Ctx, + uint64_t RelType) const override { +@@ -818,6 +993,22 @@ public: + + int getUncondBranchEncodingSize() const override { return 28; } + ++ InstructionListType createCmpJE(MCPhysReg RegNo, int64_t Imm, ++ const MCSymbol *Target, ++ MCContext *Ctx) const override { ++ InstructionListType Code; ++ Code.emplace_back(MCInstBuilder(AArch64::SUBSXri) ++ .addReg(RegNo) ++ .addReg(RegNo) ++ .addImm(Imm) ++ .addImm(0)); ++ Code.emplace_back(MCInstBuilder(AArch64::Bcc) ++ .addImm(Imm) ++ .addExpr(MCSymbolRefExpr::create( ++ Target, MCSymbolRefExpr::VK_None, *Ctx))); ++ return Code; ++ } ++ + bool createCall(MCInst &Inst, const MCSymbol *Target, + MCContext *Ctx) override { + Inst.setOpcode(AArch64::BL); +@@ -828,12 +1019,7 @@ public: + + bool createTailCall(MCInst &Inst, const MCSymbol *Target, + MCContext *Ctx) override { +- Inst.setOpcode(AArch64::B); +- Inst.addOperand(MCOperand::createExpr(getTargetExprFor( +- Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx), +- *Ctx, 0))); +- setTailCall(Inst); +- return 
true; ++ return createDirectCall(Inst, Target, Ctx, /*IsTailCall*/ true); + } + + void createLongTailCall(InstructionListType &Seq, const MCSymbol *Target, +@@ -882,6 +1068,18 @@ public: + + bool isStore(const MCInst &Inst) const override { return false; } + ++ bool createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx, ++ bool IsTailCall) override { ++ Inst.setOpcode(IsTailCall ? AArch64::B : AArch64::BL); ++ Inst.clear(); ++ Inst.addOperand(MCOperand::createExpr(getTargetExprFor( ++ Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx), ++ *Ctx, 0))); ++ if (IsTailCall) ++ convertJmpToTailCall(Inst); ++ return true; ++ } ++ + bool analyzeBranch(InstructionIterator Begin, InstructionIterator End, + const MCSymbol *&TBB, const MCSymbol *&FBB, + MCInst *&CondBranch, +@@ -1153,6 +1351,242 @@ public: + return true; + } + ++ bool createStackPointerIncrement( ++ MCInst &Inst, int Size, ++ bool NoFlagsClobber = false /*unused for AArch64*/) const override { ++ Inst.setOpcode(AArch64::SUBXri); ++ Inst.clear(); ++ Inst.addOperand(MCOperand::createReg(AArch64::SP)); ++ Inst.addOperand(MCOperand::createReg(AArch64::SP)); ++ Inst.addOperand(MCOperand::createImm(Size)); ++ Inst.addOperand(MCOperand::createImm(0)); ++ return true; ++ } ++ ++ bool createStackPointerDecrement( ++ MCInst &Inst, int Size, ++ bool NoFlagsClobber = false /*unused for AArch64*/) const override { ++ Inst.setOpcode(AArch64::ADDXri); ++ Inst.clear(); ++ Inst.addOperand(MCOperand::createReg(AArch64::SP)); ++ Inst.addOperand(MCOperand::createReg(AArch64::SP)); ++ Inst.addOperand(MCOperand::createImm(Size)); ++ Inst.addOperand(MCOperand::createImm(0)); ++ return true; ++ } ++ ++ void createIndirectBranch(MCInst &Inst, MCPhysReg MemBaseReg, ++ int64_t Disp) const { ++ Inst.setOpcode(AArch64::BR); ++ Inst.addOperand(MCOperand::createReg(MemBaseReg)); ++ } ++ ++ InstructionListType createInstrumentedIndCallHandlerExitBB() const override { ++ InstructionListType Insts(5); ++ // Code sequence for instrumented indirect call handler: ++ // msr nzcv, x1 ++ // ldp x0, x1, [sp], #16 ++ // ldr x16, [sp], #16 ++ // ldp x0, x1, [sp], #16 ++ // br x16 ++ setSystemFlag(Insts[0], AArch64::X1); ++ createPopRegisters(Insts[1], AArch64::X0, AArch64::X1); ++ // Here we load address of the next function which should be called in the ++ // original binary to X16 register. Writing to X16 is permitted without ++ // needing to restore. 
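++    // (In AAPCS64, X16 is the IP0 intra-procedure-call scratch register, so
++    // the branch target is entitled to assume it may be clobbered.)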
++ loadReg(Insts[2], AArch64::X16, AArch64::SP); ++ createPopRegisters(Insts[3], AArch64::X0, AArch64::X1); ++ createIndirectBranch(Insts[4], AArch64::X16, 0); ++ return Insts; ++ } ++ ++ InstructionListType ++ createInstrumentedIndTailCallHandlerExitBB() const override { ++ return createInstrumentedIndCallHandlerExitBB(); ++ } ++ ++ InstructionListType createGetter(MCContext *Ctx, const char *name) const { ++ InstructionListType Insts(4); ++ MCSymbol *Locs = Ctx->getOrCreateSymbol(name); ++ InstructionListType Addr = materializeAddress(Locs, Ctx, AArch64::X0); ++ std::copy(Addr.begin(), Addr.end(), Insts.begin()); ++ assert(Addr.size() == 2 && "Invalid Addr size"); ++ loadReg(Insts[2], AArch64::X0, AArch64::X0); ++ createReturn(Insts[3]); ++ return Insts; ++ } ++ ++ InstructionListType createNumCountersGetter(MCContext *Ctx) const override { ++ return createGetter(Ctx, "__bolt_num_counters"); ++ } ++ ++ InstructionListType ++ createInstrLocationsGetter(MCContext *Ctx) const override { ++ return createGetter(Ctx, "__bolt_instr_locations"); ++ } ++ ++ InstructionListType createInstrTablesGetter(MCContext *Ctx) const override { ++ return createGetter(Ctx, "__bolt_instr_tables"); ++ } ++ ++ InstructionListType createInstrNumFuncsGetter(MCContext *Ctx) const override { ++ return createGetter(Ctx, "__bolt_instr_num_funcs"); ++ } ++ ++ void convertIndirectCallToLoad(MCInst &Inst, MCPhysReg Reg) override { ++ bool IsTailCall = isTailCall(Inst); ++ if (IsTailCall) ++ removeAnnotation(Inst, MCPlus::MCAnnotation::kTailCall); ++ if (Inst.getOpcode() == AArch64::BR || Inst.getOpcode() == AArch64::BLR) { ++ Inst.setOpcode(AArch64::ORRXrs); ++ Inst.insert(Inst.begin(), MCOperand::createReg(Reg)); ++ Inst.insert(Inst.begin() + 1, MCOperand::createReg(AArch64::XZR)); ++ Inst.insert(Inst.begin() + 3, MCOperand::createImm(0)); ++ return; ++ } ++ llvm_unreachable("not implemented"); ++ } ++ ++ InstructionListType createLoadImmediate(const MCPhysReg Dest, ++ uint64_t Imm) const override { ++ InstructionListType Insts(4); ++ int Shift = 48; ++ for (int I = 0; I < 4; I++, Shift -= 16) { ++ Insts[I].setOpcode(AArch64::MOVKXi); ++ Insts[I].addOperand(MCOperand::createReg(Dest)); ++ Insts[I].addOperand(MCOperand::createReg(Dest)); ++ Insts[I].addOperand(MCOperand::createImm((Imm >> Shift) & 0xFFFF)); ++ Insts[I].addOperand(MCOperand::createImm(Shift)); ++ } ++ return Insts; ++ } ++ ++ void createIndirectCallInst(MCInst &Inst, bool IsTailCall, ++ MCPhysReg Reg) const { ++ Inst.clear(); ++ Inst.setOpcode(IsTailCall ? AArch64::BR : AArch64::BLR); ++ Inst.addOperand(MCOperand::createReg(Reg)); ++ } ++ ++ InstructionListType createInstrumentedIndirectCall(MCInst &&CallInst, ++ MCSymbol *HandlerFuncAddr, ++ int CallSiteID, ++ MCContext *Ctx) override { ++ InstructionListType Insts; ++ // Code sequence used to enter indirect call instrumentation helper: ++ // stp x0, x1, [sp, #-16]! createPushRegisters ++ // mov target x0 convertIndirectCallToLoad -> orr x0 target xzr ++ // mov x1 CallSiteID createLoadImmediate -> ++ // movk x1, #0x0, lsl #48 ++ // movk x1, #0x0, lsl #32 ++ // movk x1, #0x0, lsl #16 ++ // movk x1, #0x0 ++ // stp x0, x1, [sp, #-16]! 
++    // bl *HandlerFuncAddr createIndirectCall ->
++    // adr x0 *HandlerFuncAddr -> adrp + add
++    // blr x0
++    Insts.emplace_back();
++    createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
++    Insts.emplace_back(CallInst);
++    convertIndirectCallToLoad(Insts.back(), AArch64::X0);
++    InstructionListType LoadImm =
++        createLoadImmediate(getIntArgRegister(1), CallSiteID);
++    Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end());
++    Insts.emplace_back();
++    createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
++    Insts.resize(Insts.size() + 2);
++    InstructionListType Addr =
++        materializeAddress(HandlerFuncAddr, Ctx, AArch64::X0);
++    assert(Addr.size() == 2 && "Invalid Addr size");
++    std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size());
++    Insts.emplace_back();
++    createIndirectCallInst(Insts.back(), isTailCall(CallInst), AArch64::X0);
++
++    // Carry over metadata including tail call marker if present.
++    stripAnnotations(Insts.back());
++    moveAnnotations(std::move(CallInst), Insts.back());
++
++    return Insts;
++  }
++
++  InstructionListType
++  createInstrumentedIndCallHandlerEntryBB(const MCSymbol *InstrTrampoline,
++                                          const MCSymbol *IndCallHandler,
++                                          MCContext *Ctx) override {
++    // Code sequence used to check whether InstrTrampoline was initialized,
++    // and to call it if so; returns via IndCallHandler.
++    // stp x0, x1, [sp, #-16]!
++    // mrs x1, nzcv
++    // adr x0, InstrTrampoline -> adrp + add
++    // ldr x0, [x0]
++    // subs x0, x0, #0x0
++    // b.eq IndCallHandler
++    // str x30, [sp, #-16]!
++    // blr x0
++    // ldr x30, [sp], #16
++    // b IndCallHandler
++    InstructionListType Insts;
++    Insts.emplace_back();
++    createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
++    Insts.emplace_back();
++    getSystemFlag(Insts.back(), getIntArgRegister(1));
++    Insts.emplace_back();
++    Insts.emplace_back();
++    InstructionListType Addr =
++        materializeAddress(InstrTrampoline, Ctx, AArch64::X0);
++    std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size());
++    assert(Addr.size() == 2 && "Invalid Addr size");
++    Insts.emplace_back();
++    loadReg(Insts.back(), AArch64::X0, AArch64::X0);
++    InstructionListType cmpJmp =
++        createCmpJE(AArch64::X0, 0, IndCallHandler, Ctx);
++    Insts.insert(Insts.end(), cmpJmp.begin(), cmpJmp.end());
++    Insts.emplace_back();
++    storeReg(Insts.back(), AArch64::LR, AArch64::SP);
++    Insts.emplace_back();
++    Insts.back().setOpcode(AArch64::BLR);
++    Insts.back().addOperand(MCOperand::createReg(AArch64::X0));
++    Insts.emplace_back();
++    loadReg(Insts.back(), AArch64::LR, AArch64::SP);
++    Insts.emplace_back();
++    createDirectCall(Insts.back(), IndCallHandler, Ctx, /*IsTailCall*/ true);
++    return Insts;
++  }
++
++  InstructionListType
++  createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
++                       unsigned CodePointerSize) const override {
++    unsigned int I = 0;
++    InstructionListType Instrs(IsLeaf ?
12 : 10); ++ ++ if (IsLeaf) ++ createStackPointerIncrement(Instrs[I++], 128); ++ createPushRegisters(Instrs[I++], AArch64::X0, AArch64::X1); ++ getSystemFlag(Instrs[I++], AArch64::X1); ++ InstructionListType Addr = materializeAddress(Target, Ctx, AArch64::X0); ++ assert(Addr.size() == 2 && "Invalid Addr size"); ++ std::copy(Addr.begin(), Addr.end(), Instrs.begin() + I); ++ I += Addr.size(); ++ storeReg(Instrs[I++], AArch64::X2, AArch64::SP); ++ InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X2); ++ assert(Insts.size() == 2 && "Invalid Insts size"); ++ std::copy(Insts.begin(), Insts.end(), Instrs.begin() + I); ++ I += Insts.size(); ++ loadReg(Instrs[I++], AArch64::X2, AArch64::SP); ++ setSystemFlag(Instrs[I++], AArch64::X1); ++ createPopRegisters(Instrs[I++], AArch64::X0, AArch64::X1); ++ if (IsLeaf) ++ createStackPointerDecrement(Instrs[I++], 128); ++ return Instrs; ++ } ++ ++ std::vector<MCInst> createSymbolTrampoline(const MCSymbol *TgtSym, ++ MCContext *Ctx) override { ++ std::vector<MCInst> Insts; ++ createShortJmp(Insts, TgtSym, Ctx, /*IsTailCall*/ true); ++ return Insts; ++ } ++ + InstructionListType materializeAddress(const MCSymbol *Target, MCContext *Ctx, + MCPhysReg RegName, + int64_t Addend = 0) const override { +diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +index 5e3c01a..25b6970 100644 +--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp ++++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +@@ -61,6 +61,25 @@ bool isADDri(const MCInst &Inst) { + Inst.getOpcode() == X86::ADD64ri8; + } + ++// Create instruction to increment contents of target by 1 ++static InstructionListType createIncMemory(const MCSymbol *Target, ++ MCContext *Ctx) { ++ InstructionListType Insts; ++ Insts.emplace_back(); ++ Insts.back().setOpcode(X86::LOCK_INC64m); ++ Insts.back().clear(); ++ Insts.back().addOperand(MCOperand::createReg(X86::RIP)); // BaseReg ++ Insts.back().addOperand(MCOperand::createImm(1)); // ScaleAmt ++ Insts.back().addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg ++ ++ Insts.back().addOperand(MCOperand::createExpr( ++ MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, ++ *Ctx))); // Displacement ++ Insts.back().addOperand( ++ MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg ++ return Insts; ++} ++ + #define GET_INSTRINFO_OPERAND_TYPES_ENUM + #define GET_INSTRINFO_OPERAND_TYPE + #define GET_INSTRINFO_MEM_OPERAND_SIZE +@@ -2309,28 +2328,15 @@ public: + return true; + } + +- void createLoadImmediate(MCInst &Inst, const MCPhysReg Dest, +- uint32_t Imm) const override { +- Inst.setOpcode(X86::MOV64ri32); +- Inst.clear(); +- Inst.addOperand(MCOperand::createReg(Dest)); +- Inst.addOperand(MCOperand::createImm(Imm)); +- } +- +- bool createIncMemory(MCInst &Inst, const MCSymbol *Target, +- MCContext *Ctx) const override { +- +- Inst.setOpcode(X86::LOCK_INC64m); +- Inst.clear(); +- Inst.addOperand(MCOperand::createReg(X86::RIP)); // BaseReg +- Inst.addOperand(MCOperand::createImm(1)); // ScaleAmt +- Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg +- +- Inst.addOperand(MCOperand::createExpr( +- MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, +- *Ctx))); // Displacement +- Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg +- return true; ++ InstructionListType createLoadImmediate(const MCPhysReg Dest, ++ uint64_t Imm) const override { ++ InstructionListType Insts; ++ Insts.emplace_back(); ++ Insts.back().setOpcode(X86::MOV64ri32); ++ Insts.back().clear(); ++
Insts.back().addOperand(MCOperand::createReg(Dest)); ++ Insts.back().addOperand(MCOperand::createImm(Imm)); ++ return Insts; + } + + bool createIJmp32Frag(SmallVectorImpl<MCInst> &Insts, +@@ -3057,9 +3063,9 @@ public: + Inst.clear(); + } + +- InstructionListType createInstrIncMemory(const MCSymbol *Target, +- MCContext *Ctx, +- bool IsLeaf) const override { ++ InstructionListType ++ createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf, ++ unsigned CodePointerSize) const override { + InstructionListType Instrs(IsLeaf ? 13 : 11); + unsigned int I = 0; + +@@ -3079,7 +3085,10 @@ public: + createClearRegWithNoEFlagsUpdate(Instrs[I++], X86::RAX, 8); + createX86SaveOVFlagToRegister(Instrs[I++], X86::AL); + // LOCK INC +- createIncMemory(Instrs[I++], Target, Ctx); ++ InstructionListType IncMem = createIncMemory(Target, Ctx); ++ assert(IncMem.size() == 1 && "Invalid IncMem size"); ++ std::copy(IncMem.begin(), IncMem.end(), Instrs.begin() + I); ++ I += IncMem.size(); + // POPF + createAddRegImm(Instrs[I++], X86::AL, 127, 1); + createPopRegister(Instrs[I++], X86::RAX, 8); +@@ -3153,8 +3162,8 @@ public: + } + Insts.emplace_back(); + createPushRegister(Insts.back(), TempReg, 8); +- Insts.emplace_back(); +- createLoadImmediate(Insts.back(), TempReg, CallSiteID); ++ InstructionListType LoadImm = createLoadImmediate(TempReg, CallSiteID); ++ Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end()); + Insts.emplace_back(); + createPushRegister(Insts.back(), TempReg, 8); + +@@ -3264,7 +3273,7 @@ public: + } + + InstructionListType createSymbolTrampoline(const MCSymbol *TgtSym, +- MCContext *Ctx) const override { ++ MCContext *Ctx) override { + InstructionListType Insts(1); + createUncondBranch(Insts[0], TgtSym, Ctx); + return Insts; +diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt +index 8472ce0..838c8cb 100644 +--- a/bolt/runtime/CMakeLists.txt ++++ b/bolt/runtime/CMakeLists.txt +@@ -27,8 +27,14 @@ set(BOLT_RT_FLAGS + -fno-exceptions + -fno-rtti + -fno-stack-protector +- -mno-sse +- -fPIC) ++ -fPIC ++ -mgeneral-regs-only) ++if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") ++ set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-sse") ++endif() ++if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") ++ set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-outline-atomics") ++endif() + + # Don't let the compiler think it can create calls to standard libs + target_compile_options(bolt_rt_instr PRIVATE ${BOLT_RT_FLAGS}) +@@ -39,7 +45,7 @@ target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + install(TARGETS bolt_rt_instr DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") + install(TARGETS bolt_rt_hugify DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") + +-if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*") ++if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Darwin") + add_library(bolt_rt_instr_osx STATIC + instr.cpp + ${CMAKE_CURRENT_BINARY_DIR}/config.h +diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h +index 9e6f175..9b9965b 100644 +--- a/bolt/runtime/common.h ++++ b/bolt/runtime/common.h +@@ -6,10 +6,6 @@ + // + //===----------------------------------------------------------------------===// + +-#if !defined(__x86_64__) +-#error "For x86_64 only" +-#endif +- + #if defined(__linux__) + +#include <cstddef> +@@ -44,44 +40,6 @@ typedef int int32_t; + #error "For Linux or MacOS only" + #endif + +-// Save all registers while keeping 16B stack alignment +-#define SAVE_ALL \ +- "push %%rax\n" \ +- "push %%rbx\n" \ +- "push %%rcx\n" \ +- "push %%rdx\n" \ +- "push %%rdi\n" \ +- "push
%%rsi\n" \ +- "push %%rbp\n" \ +- "push %%r8\n" \ +- "push %%r9\n" \ +- "push %%r10\n" \ +- "push %%r11\n" \ +- "push %%r12\n" \ +- "push %%r13\n" \ +- "push %%r14\n" \ +- "push %%r15\n" \ +- "sub $8, %%rsp\n" +- +-// Mirrors SAVE_ALL +-#define RESTORE_ALL \ +- "add $8, %%rsp\n" \ +- "pop %%r15\n" \ +- "pop %%r14\n" \ +- "pop %%r13\n" \ +- "pop %%r12\n" \ +- "pop %%r11\n" \ +- "pop %%r10\n" \ +- "pop %%r9\n" \ +- "pop %%r8\n" \ +- "pop %%rbp\n" \ +- "pop %%rsi\n" \ +- "pop %%rdi\n" \ +- "pop %%rdx\n" \ +- "pop %%rcx\n" \ +- "pop %%rbx\n" \ +- "pop %%rax\n" +- + #define PROT_READ 0x1 /* Page can be read. */ + #define PROT_WRITE 0x2 /* Page can be written. */ + #define PROT_EXEC 0x4 /* Page can be executed. */ +@@ -165,141 +123,41 @@ int memcmp(const void *s1, const void *s2, size_t n) { + // Anonymous namespace covering everything but our library entry point + namespace { + +-// Get the difference between runtime addrress of .text section and +-// static address in section header table. Can be extracted from arbitrary +-// pc value recorded at runtime to get the corresponding static address, which +-// in turn can be used to search for indirect call description. Needed because +-// indirect call descriptions are read-only non-relocatable data. +-uint64_t getTextBaseAddress() { +- uint64_t DynAddr; +- uint64_t StaticAddr; +- __asm__ volatile("leaq __hot_end(%%rip), %0\n\t" +- "movabsq $__hot_end, %1\n\t" +- : "=r"(DynAddr), "=r"(StaticAddr)); +- return DynAddr - StaticAddr; +-} +- +-constexpr uint32_t BufSize = 10240; +- +-#define _STRINGIFY(x) #x +-#define STRINGIFY(x) _STRINGIFY(x) +- +-uint64_t __read(uint64_t fd, const void *buf, uint64_t count) { +- uint64_t ret; +-#if defined(__APPLE__) +-#define READ_SYSCALL 0x2000003 +-#else +-#define READ_SYSCALL 0 +-#endif +- __asm__ __volatile__("movq $" STRINGIFY(READ_SYSCALL) ", %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(fd), "S"(buf), "d"(count) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-uint64_t __write(uint64_t fd, const void *buf, uint64_t count) { +- uint64_t ret; +-#if defined(__APPLE__) +-#define WRITE_SYSCALL 0x2000004 +-#else +-#define WRITE_SYSCALL 1 +-#endif +- __asm__ __volatile__("movq $" STRINGIFY(WRITE_SYSCALL) ", %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(fd), "S"(buf), "d"(count) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags, +- uint64_t fd, uint64_t offset) { +-#if defined(__APPLE__) +-#define MMAP_SYSCALL 0x20000c5 +-#else +-#define MMAP_SYSCALL 9 +-#endif +- void *ret; +- register uint64_t r8 asm("r8") = fd; +- register uint64_t r9 asm("r9") = offset; +- register uint64_t r10 asm("r10") = flags; +- __asm__ __volatile__("movq $" STRINGIFY(MMAP_SYSCALL) ", %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8), +- "r"(r9) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-uint64_t __munmap(void *addr, uint64_t size) { +-#if defined(__APPLE__) +-#define MUNMAP_SYSCALL 0x2000049 +-#else +-#define MUNMAP_SYSCALL 11 +-#endif +- uint64_t ret; +- __asm__ __volatile__("movq $" STRINGIFY(MUNMAP_SYSCALL) ", %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(addr), "S"(size) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} ++struct dirent64 { ++ uint64_t d_ino; /* Inode number */ ++ int64_t d_off; /* Offset to next linux_dirent */ ++ unsigned short d_reclen; /* Length of this linux_dirent */ ++ unsigned char d_type; ++ char d_name[]; /* Filename (null-terminated) */ ++ /* 
length is actually (d_reclen - 2 - ++ offsetof(struct linux_dirent, d_name)) */ ++}; + +-#define SIG_BLOCK 0 +-#define SIG_UNBLOCK 1 +-#define SIG_SETMASK 2 ++/* Length of the entries in `struct utsname' is 65. */ ++#define _UTSNAME_LENGTH 65 + +-static const uint64_t MaskAllSignals[] = {-1ULL}; ++struct UtsNameTy { ++ char sysname[_UTSNAME_LENGTH]; /* Operating system name (e.g., "Linux") */ ++ char nodename[_UTSNAME_LENGTH]; /* Name within "some implementation-defined ++ network" */ ++ char release[_UTSNAME_LENGTH]; /* Operating system release (e.g., "2.6.28") */ ++ char version[_UTSNAME_LENGTH]; /* Operating system version */ ++ char machine[_UTSNAME_LENGTH]; /* Hardware identifier */ ++ char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */ ++}; + +-uint64_t __sigprocmask(int how, const void *set, void *oldset) { +-#if defined(__APPLE__) +-#define SIGPROCMASK_SYSCALL 0x2000030 +-#else +-#define SIGPROCMASK_SYSCALL 14 +-#endif +- uint64_t ret; +- register long r10 asm("r10") = sizeof(uint64_t); +- __asm__ __volatile__("movq $" STRINGIFY(SIGPROCMASK_SYSCALL) ", %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(how), "S"(set), "d"(oldset), "r"(r10) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} ++struct timespec { ++ uint64_t tv_sec; /* seconds */ ++ uint64_t tv_nsec; /* nanoseconds */ ++}; + +-uint64_t __getpid() { +- uint64_t ret; +-#if defined(__APPLE__) +-#define GETPID_SYSCALL 20 ++#if defined(__aarch64__) ++#include "sys_aarch64.h" + #else +-#define GETPID_SYSCALL 39 ++#include "sys_x86_64.h" + #endif +- __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} + +-uint64_t __exit(uint64_t code) { +-#if defined(__APPLE__) +-#define EXIT_SYSCALL 0x2000001 +-#else +-#define EXIT_SYSCALL 231 +-#endif +- uint64_t ret; +- __asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(code) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} ++constexpr uint32_t BufSize = 10240; + + // Helper functions for writing strings to the .fdata file. We intentionally + // avoid using libc names to make it clear it is our impl. +@@ -415,219 +273,6 @@ static bool scanUInt32(const char *&Buf, const char *End, uint32_t &Ret) { + return false; + } + +-#if !defined(__APPLE__) +-// We use a stack-allocated buffer for string manipulation in many pieces of +-// this code, including the code that prints each line of the fdata file. This +-// buffer needs to accomodate large function names, but shouldn't be arbitrarily +-// large (dynamically allocated) for simplicity of our memory space usage. +- +-// Declare some syscall wrappers we use throughout this code to avoid linking +-// against system libc. 
+-uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) { +- uint64_t ret; +- __asm__ __volatile__("movq $2, %%rax\n" +- "syscall" +- : "=a"(ret) +- : "D"(pathname), "S"(flags), "d"(mode) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-struct dirent { +- unsigned long d_ino; /* Inode number */ +- unsigned long d_off; /* Offset to next linux_dirent */ +- unsigned short d_reclen; /* Length of this linux_dirent */ +- char d_name[]; /* Filename (null-terminated) */ +- /* length is actually (d_reclen - 2 - +- offsetof(struct linux_dirent, d_name)) */ +-}; +- +-long __getdents(unsigned int fd, dirent *dirp, size_t count) { +- long ret; +- __asm__ __volatile__("movq $78, %%rax\n" +- "syscall" +- : "=a"(ret) +- : "D"(fd), "S"(dirp), "d"(count) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) { +- uint64_t ret; +- __asm__ __volatile__("movq $89, %%rax\n" +- "syscall" +- : "=a"(ret) +- : "D"(pathname), "S"(buf), "d"(bufsize) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) { +- uint64_t ret; +- __asm__ __volatile__("movq $8, %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(fd), "S"(pos), "d"(whence) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-int __ftruncate(uint64_t fd, uint64_t length) { +- int ret; +- __asm__ __volatile__("movq $77, %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(fd), "S"(length) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-int __close(uint64_t fd) { +- uint64_t ret; +- __asm__ __volatile__("movq $3, %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(fd) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-int __madvise(void *addr, size_t length, int advice) { +- int ret; +- __asm__ __volatile__("movq $28, %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(addr), "S"(length), "d"(advice) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-#define _UTSNAME_LENGTH 65 +- +-struct UtsNameTy { +- char sysname[_UTSNAME_LENGTH]; /* Operating system name (e.g., "Linux") */ +- char nodename[_UTSNAME_LENGTH]; /* Name within "some implementation-defined +- network" */ +- char release[_UTSNAME_LENGTH]; /* Operating system release (e.g., "2.6.28") */ +- char version[_UTSNAME_LENGTH]; /* Operating system version */ +- char machine[_UTSNAME_LENGTH]; /* Hardware identifier */ +- char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */ +-}; +- +-int __uname(struct UtsNameTy *Buf) { +- int Ret; +- __asm__ __volatile__("movq $63, %%rax\n" +- "syscall\n" +- : "=a"(Ret) +- : "D"(Buf) +- : "cc", "rcx", "r11", "memory"); +- return Ret; +-} +- +-struct timespec { +- uint64_t tv_sec; /* seconds */ +- uint64_t tv_nsec; /* nanoseconds */ +-}; +- +-uint64_t __nanosleep(const timespec *req, timespec *rem) { +- uint64_t ret; +- __asm__ __volatile__("movq $35, %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(req), "S"(rem) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-int64_t __fork() { +- uint64_t ret; +- __asm__ __volatile__("movq $57, %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-int __mprotect(void *addr, size_t len, int prot) { +- int ret; +- __asm__ __volatile__("movq $10, %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(addr), "S"(len), "d"(prot) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-uint64_t __getppid() { +- uint64_t ret; +- __asm__ __volatile__("movq $110, %%rax\n" +- "syscall\n" +- : 
"=a"(ret) +- : +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-int __setpgid(uint64_t pid, uint64_t pgid) { +- int ret; +- __asm__ __volatile__("movq $109, %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(pid), "S"(pgid) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-uint64_t __getpgid(uint64_t pid) { +- uint64_t ret; +- __asm__ __volatile__("movq $121, %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(pid) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-int __kill(uint64_t pid, int sig) { +- int ret; +- __asm__ __volatile__("movq $62, %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(pid), "S"(sig) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-int __fsync(int fd) { +- int ret; +- __asm__ __volatile__("movq $74, %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(fd) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-// %rdi %rsi %rdx %r10 %r8 +-// sys_prctl int option unsigned unsigned unsigned unsigned +-// long arg2 long arg3 long arg4 long arg5 +-int __prctl(int Option, unsigned long Arg2, unsigned long Arg3, +- unsigned long Arg4, unsigned long Arg5) { +- int Ret; +- register long rdx asm("rdx") = Arg3; +- register long r8 asm("r8") = Arg5; +- register long r10 asm("r10") = Arg4; +- __asm__ __volatile__("movq $157, %%rax\n" +- "syscall\n" +- : "=a"(Ret) +- : "D"(Option), "S"(Arg2), "d"(rdx), "r"(r10), "r"(r8) +- :); +- return Ret; +-} +- +-#endif +- + void reportError(const char *Msg, uint64_t Size) { + __write(2, Msg, Size); + __exit(1); +@@ -644,6 +289,12 @@ void assert(bool Assertion, const char *Msg) { + reportError(Buf, Ptr - Buf); + } + ++#define SIG_BLOCK 0 ++#define SIG_UNBLOCK 1 ++#define SIG_SETMASK 2 ++ ++static const uint64_t MaskAllSignals[] = {-1ULL}; ++ + class Mutex { + volatile bool InUse{false}; + +diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp +index 96a43f6..cfd113e 100644 +--- a/bolt/runtime/instr.cpp ++++ b/bolt/runtime/instr.cpp +@@ -40,7 +40,6 @@ + // + //===----------------------------------------------------------------------===// + +-#if defined (__x86_64__) + #include "common.h" + + // Enables a very verbose logging to stderr useful when debugging +@@ -695,12 +694,12 @@ static char *getBinaryPath() { + assert(static_cast(FDdir) >= 0, + "failed to open /proc/self/map_files"); + +- while (long Nread = __getdents(FDdir, (struct dirent *)Buf, BufSize)) { ++ while (long Nread = __getdents64(FDdir, (struct dirent64 *)Buf, BufSize)) { + assert(static_cast(Nread) != -1, "failed to get folder entries"); + +- struct dirent *d; ++ struct dirent64 *d; + for (long Bpos = 0; Bpos < Nread; Bpos += d->d_reclen) { +- d = (struct dirent *)(Buf + Bpos); ++ d = (struct dirent64 *)(Buf + Bpos); + + uint64_t StartAddress, EndAddress; + if (!parseAddressRange(d->d_name, StartAddress, EndAddress)) +@@ -1668,6 +1667,17 @@ instrumentIndirectCall(uint64_t Target, uint64_t IndCallID) { + /// as well as the target address for the call + extern "C" __attribute((naked)) void __bolt_instr_indirect_call() + { ++#if defined(__aarch64__) ++ // clang-format off ++ __asm__ __volatile__(SAVE_ALL ++ "ldp x0, x1, [sp, #288]\n" ++ "bl instrumentIndirectCall\n" ++ RESTORE_ALL ++ "ret\n" ++ :::); ++ // clang-format on ++#else ++ // clang-format off + __asm__ __volatile__(SAVE_ALL + "mov 0xa0(%%rsp), %%rdi\n" + "mov 0x98(%%rsp), %%rsi\n" +@@ -1675,10 +1685,23 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_call() + RESTORE_ALL + "ret\n" + :::); ++ // clang-format on ++#endif + } + + extern "C" __attribute((naked)) void 
__bolt_instr_indirect_tailcall() + { ++#if defined(__aarch64__) ++ // clang-format off ++ __asm__ __volatile__(SAVE_ALL ++ "ldp x0, x1, [sp, #288]\n" ++ "bl instrumentIndirectCall\n" ++ RESTORE_ALL ++ "ret\n" ++ :::); ++ // clang-format on ++#else ++ // clang-format off + __asm__ __volatile__(SAVE_ALL + "mov 0x98(%%rsp), %%rdi\n" + "mov 0x90(%%rsp), %%rsi\n" +@@ -1686,21 +1709,48 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall() + RESTORE_ALL + "ret\n" + :::); ++ // clang-format on ++#endif + } + + /// This is hooking ELF's entry, it needs to save all machine state. + extern "C" __attribute((naked)) void __bolt_instr_start() + { ++#if defined(__aarch64__) ++ // clang-format off ++ __asm__ __volatile__(SAVE_ALL ++ "bl __bolt_instr_setup\n" ++ RESTORE_ALL ++ "adrp x16, __bolt_start_trampoline\n" ++ "add x16, x16, #:lo12:__bolt_start_trampoline\n" ++ "br x16\n" ++ :::); ++ // clang-format on ++#else ++ // clang-format off + __asm__ __volatile__(SAVE_ALL + "call __bolt_instr_setup\n" + RESTORE_ALL + "jmp __bolt_start_trampoline\n" + :::); ++ // clang-format on ++#endif + } + + /// This is hooking into ELF's DT_FINI + extern "C" void __bolt_instr_fini() { +- __bolt_fini_trampoline(); ++#if defined(__aarch64__) ++ // clang-format off ++ __asm__ __volatile__(SAVE_ALL ++ "adrp x16, __bolt_fini_trampoline\n" ++ "add x16, x16, #:lo12:__bolt_fini_trampoline\n" ++ "blr x16\n" ++ RESTORE_ALL ++ :::); ++ // clang-format on ++#else ++ __asm__ __volatile__("call __bolt_fini_trampoline\n" :::); ++#endif + if (__bolt_instr_sleep_time == 0) { + int FD = openProfile(); + __bolt_instr_data_dump(FD); +@@ -1752,4 +1802,3 @@ void _bolt_instr_fini() { + } + + #endif +-#endif +diff --git a/bolt/runtime/sys_aarch64.h b/bolt/runtime/sys_aarch64.h +new file mode 100644 +index 0000000..77c9cfc +--- /dev/null ++++ b/bolt/runtime/sys_aarch64.h +@@ -0,0 +1,394 @@ ++#ifndef LLVM_TOOLS_LLVM_BOLT_SYS_AARCH64 ++#define LLVM_TOOLS_LLVM_BOLT_SYS_AARCH64 ++ ++// Save all registers while keeping 16B stack alignment ++#define SAVE_ALL \ ++ "stp x0, x1, [sp, #-16]!\n" \ ++ "stp x2, x3, [sp, #-16]!\n" \ ++ "stp x4, x5, [sp, #-16]!\n" \ ++ "stp x6, x7, [sp, #-16]!\n" \ ++ "stp x8, x9, [sp, #-16]!\n" \ ++ "stp x10, x11, [sp, #-16]!\n" \ ++ "stp x12, x13, [sp, #-16]!\n" \ ++ "stp x14, x15, [sp, #-16]!\n" \ ++ "stp x16, x17, [sp, #-16]!\n" \ ++ "stp x18, x19, [sp, #-16]!\n" \ ++ "stp x20, x21, [sp, #-16]!\n" \ ++ "stp x22, x23, [sp, #-16]!\n" \ ++ "stp x24, x25, [sp, #-16]!\n" \ ++ "stp x26, x27, [sp, #-16]!\n" \ ++ "stp x28, x29, [sp, #-16]!\n" \ ++ "str x30, [sp,#-16]!\n" ++// Mirrors SAVE_ALL ++#define RESTORE_ALL \ ++ "ldr x30, [sp], #16\n" \ ++ "ldp x28, x29, [sp], #16\n" \ ++ "ldp x26, x27, [sp], #16\n" \ ++ "ldp x24, x25, [sp], #16\n" \ ++ "ldp x22, x23, [sp], #16\n" \ ++ "ldp x20, x21, [sp], #16\n" \ ++ "ldp x18, x19, [sp], #16\n" \ ++ "ldp x16, x17, [sp], #16\n" \ ++ "ldp x14, x15, [sp], #16\n" \ ++ "ldp x12, x13, [sp], #16\n" \ ++ "ldp x10, x11, [sp], #16\n" \ ++ "ldp x8, x9, [sp], #16\n" \ ++ "ldp x6, x7, [sp], #16\n" \ ++ "ldp x4, x5, [sp], #16\n" \ ++ "ldp x2, x3, [sp], #16\n" \ ++ "ldp x0, x1, [sp], #16\n" ++ ++// Anonymous namespace covering everything but our library entry point ++namespace { ++ ++// Get the difference between runtime addrress of .text section and ++// static address in section header table. Can be extracted from arbitrary ++// pc value recorded at runtime to get the corresponding static address, which ++// in turn can be used to search for indirect call description. 
Needed because ++// indirect call descriptions are read-only non-relocatable data. ++uint64_t getTextBaseAddress() { ++ uint64_t DynAddr; ++ uint64_t StaticAddr; ++ __asm__ volatile("b .instr%=\n\t" ++ ".StaticAddr%=:\n\t" ++ ".dword __hot_end\n\t" ++ ".instr%=:\n\t" ++ "ldr %0, .StaticAddr%=\n\t" ++ "adrp %1, __hot_end\n\t" ++ "add %1, %1, :lo12:__hot_end\n\t" ++ : "=r"(StaticAddr), "=r"(DynAddr)); ++ return DynAddr - StaticAddr; ++} ++ ++uint64_t __read(uint64_t fd, const void *buf, uint64_t count) { ++ uint64_t ret; ++ register uint64_t x0 __asm__("x0") = fd; ++ register const void *x1 __asm__("x1") = buf; ++ register uint64_t x2 __asm__("x2") = count; ++ register uint32_t w8 __asm__("w8") = 63; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(x2), "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++uint64_t __write(uint64_t fd, const void *buf, uint64_t count) { ++ uint64_t ret; ++ register uint64_t x0 __asm__("x0") = fd; ++ register const void *x1 __asm__("x1") = buf; ++ register uint64_t x2 __asm__("x2") = count; ++ register uint32_t w8 __asm__("w8") = 64; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(x2), "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags, ++ uint64_t fd, uint64_t offset) { ++ void *ret; ++ register uint64_t x0 __asm__("x0") = addr; ++ register uint64_t x1 __asm__("x1") = size; ++ register uint64_t x2 __asm__("x2") = prot; ++ register uint64_t x3 __asm__("x3") = flags; ++ register uint64_t x4 __asm__("x4") = fd; ++ register uint64_t x5 __asm__("x5") = offset; ++ register uint32_t w8 __asm__("w8") = 222; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(x2), "r"(x3), "r"(x4), "r"(x5), "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++uint64_t __munmap(void *addr, uint64_t size) { ++ uint64_t ret; ++ register void *x0 __asm__("x0") = addr; ++ register uint64_t x1 __asm__("x1") = size; ++ register uint32_t w8 __asm__("w8") = 215; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++uint64_t __exit(uint64_t code) { ++ uint64_t ret; ++ register uint64_t x0 __asm__("x0") = code; ++ register uint32_t w8 __asm__("w8") = 94; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret), "+r"(x0) ++ : "r"(w8) ++ : "cc", "memory", "x1"); ++ return ret; ++} ++ ++uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) { ++ uint64_t ret; ++ register int x0 __asm__("x0") = -100; ++ register const char *x1 __asm__("x1") = pathname; ++ register uint64_t x2 __asm__("x2") = flags; ++ register uint64_t x3 __asm__("x3") = mode; ++ register uint32_t w8 __asm__("w8") = 56; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(x2), "r"(x3), "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++long __getdents64(unsigned int fd, dirent64 *dirp, size_t count) { ++ long ret; ++ register unsigned int x0 __asm__("x0") = fd; ++ register dirent64 *x1 __asm__("x1") = dirp; ++ register size_t x2 __asm__("x2") = count; ++ register uint32_t w8 __asm__("w8") = 61; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(x2), "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) { ++ uint64_t ret; ++ register int x0 __asm__("x0") = 
-100; ++ register const char *x1 __asm__("x1") = pathname; ++ register char *x2 __asm__("x2") = buf; ++ register size_t x3 __asm__("x3") = bufsize; ++ register uint32_t w8 __asm__("w8") = 78; // readlinkat ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(x2), "r"(x3), "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) { ++ uint64_t ret; ++ register uint64_t x0 __asm__("x0") = fd; ++ register uint64_t x1 __asm__("x1") = pos; ++ register uint64_t x2 __asm__("x2") = whence; ++ register uint32_t w8 __asm__("w8") = 62; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(x2), "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++int __ftruncate(uint64_t fd, uint64_t length) { ++ int ret; ++ register uint64_t x0 __asm__("x0") = fd; ++ register uint64_t x1 __asm__("x1") = length; ++ register uint32_t w8 __asm__("w8") = 46; ++ __asm__ __volatile__("svc #0\n" ++ "mov %w0, w0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++int __close(uint64_t fd) { ++ int ret; ++ register uint64_t x0 __asm__("x0") = fd; ++ register uint32_t w8 __asm__("w8") = 57; ++ __asm__ __volatile__("svc #0\n" ++ "mov %w0, w0" ++ : "=r"(ret), "+r"(x0) ++ : "r"(w8) ++ : "cc", "memory", "x1"); ++ return ret; ++} ++ ++int __madvise(void *addr, size_t length, int advice) { ++ int ret; ++ register void *x0 __asm__("x0") = addr; ++ register size_t x1 __asm__("x1") = length; ++ register int x2 __asm__("x2") = advice; ++ register uint32_t w8 __asm__("w8") = 233; ++ __asm__ __volatile__("svc #0\n" ++ "mov %w0, w0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(x2), "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++int __uname(struct UtsNameTy *buf) { ++ int ret; ++ register UtsNameTy *x0 __asm__("x0") = buf; ++ register uint32_t w8 __asm__("w8") = 160; ++ __asm__ __volatile__("svc #0\n" ++ "mov %w0, w0" ++ : "=r"(ret), "+r"(x0) ++ : "r"(w8) ++ : "cc", "memory", "x1"); ++ return ret; ++} ++ ++uint64_t __nanosleep(const timespec *req, timespec *rem) { ++ uint64_t ret; ++ register const timespec *x0 __asm__("x0") = req; ++ register timespec *x1 __asm__("x1") = rem; ++ register uint32_t w8 __asm__("w8") = 101; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++int64_t __fork() { ++ uint64_t ret; ++ // clone instead of fork with flags ++ // "CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD" ++ register uint64_t x0 __asm__("x0") = 0x1200011; ++ register uint64_t x1 __asm__("x1") = 0; ++ register uint64_t x2 __asm__("x2") = 0; ++ register uint64_t x3 __asm__("x3") = 0; ++ register uint64_t x4 __asm__("x4") = 0; ++ register uint32_t w8 __asm__("w8") = 220; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(x2), "r"(x3), "r"(x4), "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++int __mprotect(void *addr, size_t len, int prot) { ++ int ret; ++ register void *x0 __asm__("x0") = addr; ++ register size_t x1 __asm__("x1") = len; ++ register int x2 __asm__("x2") = prot; ++ register uint32_t w8 __asm__("w8") = 226; ++ __asm__ __volatile__("svc #0\n" ++ "mov %w0, w0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(x2), "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++uint64_t __getpid() { ++ uint64_t ret; ++ register uint32_t w8 __asm__("w8") = 172; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret) ++ 
: "r"(w8) ++ : "cc", "memory", "x0", "x1"); ++ return ret; ++} ++ ++uint64_t __getppid() { ++ uint64_t ret; ++ register uint32_t w8 __asm__("w8") = 173; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret) ++ : "r"(w8) ++ : "cc", "memory", "x0", "x1"); ++ return ret; ++} ++ ++int __setpgid(uint64_t pid, uint64_t pgid) { ++ int ret; ++ register uint64_t x0 __asm__("x0") = pid; ++ register uint64_t x1 __asm__("x1") = pgid; ++ register uint32_t w8 __asm__("w8") = 154; ++ __asm__ __volatile__("svc #0\n" ++ "mov %w0, w0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++uint64_t __getpgid(uint64_t pid) { ++ uint64_t ret; ++ register uint64_t x0 __asm__("x0") = pid; ++ register uint32_t w8 __asm__("w8") = 155; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret), "+r"(x0) ++ : "r"(w8) ++ : "cc", "memory", "x1"); ++ return ret; ++} ++ ++int __kill(uint64_t pid, int sig) { ++ int ret; ++ register uint64_t x0 __asm__("x0") = pid; ++ register int x1 __asm__("x1") = sig; ++ register uint32_t w8 __asm__("w8") = 129; ++ __asm__ __volatile__("svc #0\n" ++ "mov %w0, w0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++int __fsync(int fd) { ++ int ret; ++ register int x0 __asm__("x0") = fd; ++ register uint32_t w8 __asm__("w8") = 82; ++ __asm__ __volatile__("svc #0\n" ++ "mov %w0, w0" ++ : "=r"(ret), "+r"(x0) ++ : "r"(w8) ++ : "cc", "memory", "x1"); ++ return ret; ++} ++ ++uint64_t __sigprocmask(int how, const void *set, void *oldset) { ++ uint64_t ret; ++ register int x0 __asm__("x0") = how; ++ register const void *x1 __asm__("x1") = set; ++ register void *x2 __asm__("x2") = oldset; ++ register long x3 asm("x3") = 8; ++ register uint32_t w8 __asm__("w8") = 135; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(x2), "r"(x3), "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++int __prctl(int option, unsigned long arg2, unsigned long arg3, ++ unsigned long arg4, unsigned long arg5) { ++ int ret; ++ register int x0 __asm__("x0") = option; ++ register unsigned long x1 __asm__("x1") = arg2; ++ register unsigned long x2 __asm__("x2") = arg3; ++ register unsigned long x3 __asm__("x3") = arg4; ++ register unsigned long x4 __asm__("x4") = arg5; ++ register uint32_t w8 __asm__("w8") = 167; ++ __asm__ __volatile__("svc #0\n" ++ "mov %w0, w0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(x2), "r"(x3), "r"(x4), "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++} // anonymous namespace ++ ++#endif +diff --git a/bolt/runtime/sys_x86_64.h b/bolt/runtime/sys_x86_64.h +new file mode 100644 +index 0000000..ca2c693 +--- /dev/null ++++ b/bolt/runtime/sys_x86_64.h +@@ -0,0 +1,360 @@ ++#ifndef LLVM_TOOLS_LLVM_BOLT_SYS_X86_64 ++#define LLVM_TOOLS_LLVM_BOLT_SYS_X86_64 ++ ++// Save all registers while keeping 16B stack alignment ++#define SAVE_ALL \ ++ "push %%rax\n" \ ++ "push %%rbx\n" \ ++ "push %%rcx\n" \ ++ "push %%rdx\n" \ ++ "push %%rdi\n" \ ++ "push %%rsi\n" \ ++ "push %%rbp\n" \ ++ "push %%r8\n" \ ++ "push %%r9\n" \ ++ "push %%r10\n" \ ++ "push %%r11\n" \ ++ "push %%r12\n" \ ++ "push %%r13\n" \ ++ "push %%r14\n" \ ++ "push %%r15\n" \ ++ "sub $8, %%rsp\n" ++// Mirrors SAVE_ALL ++#define RESTORE_ALL \ ++ "add $8, %%rsp\n" \ ++ "pop %%r15\n" \ ++ "pop %%r14\n" \ ++ "pop %%r13\n" \ ++ "pop %%r12\n" \ ++ "pop %%r11\n" \ ++ "pop %%r10\n" \ ++ "pop %%r9\n" \ ++ "pop %%r8\n" \ ++ "pop %%rbp\n" \ ++ "pop %%rsi\n" \ ++ "pop %%rdi\n" \ ++ "pop %%rdx\n" \ ++ "pop %%rcx\n" 
\ ++ "pop %%rbx\n" \ ++ "pop %%rax\n" ++ ++namespace { ++ ++// Get the difference between runtime addrress of .text section and ++// static address in section header table. Can be extracted from arbitrary ++// pc value recorded at runtime to get the corresponding static address, which ++// in turn can be used to search for indirect call description. Needed because ++// indirect call descriptions are read-only non-relocatable data. ++uint64_t getTextBaseAddress() { ++ uint64_t DynAddr; ++ uint64_t StaticAddr; ++ __asm__ volatile("leaq __hot_end(%%rip), %0\n\t" ++ "movabsq $__hot_end, %1\n\t" ++ : "=r"(DynAddr), "=r"(StaticAddr)); ++ return DynAddr - StaticAddr; ++} ++ ++#define _STRINGIFY(x) #x ++#define STRINGIFY(x) _STRINGIFY(x) ++ ++uint64_t __read(uint64_t fd, const void *buf, uint64_t count) { ++ uint64_t ret; ++#if defined(__APPLE__) ++#define READ_SYSCALL 0x2000003 ++#else ++#define READ_SYSCALL 0 ++#endif ++ __asm__ __volatile__("movq $" STRINGIFY(READ_SYSCALL) ", %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(fd), "S"(buf), "d"(count) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++uint64_t __write(uint64_t fd, const void *buf, uint64_t count) { ++ uint64_t ret; ++#if defined(__APPLE__) ++#define WRITE_SYSCALL 0x2000004 ++#else ++#define WRITE_SYSCALL 1 ++#endif ++ __asm__ __volatile__("movq $" STRINGIFY(WRITE_SYSCALL) ", %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(fd), "S"(buf), "d"(count) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags, ++ uint64_t fd, uint64_t offset) { ++#if defined(__APPLE__) ++#define MMAP_SYSCALL 0x20000c5 ++#else ++#define MMAP_SYSCALL 9 ++#endif ++ void *ret; ++ register uint64_t r8 asm("r8") = fd; ++ register uint64_t r9 asm("r9") = offset; ++ register uint64_t r10 asm("r10") = flags; ++ __asm__ __volatile__("movq $" STRINGIFY(MMAP_SYSCALL) ", %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8), ++ "r"(r9) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++uint64_t __munmap(void *addr, uint64_t size) { ++#if defined(__APPLE__) ++#define MUNMAP_SYSCALL 0x2000049 ++#else ++#define MUNMAP_SYSCALL 11 ++#endif ++ uint64_t ret; ++ __asm__ __volatile__("movq $" STRINGIFY(MUNMAP_SYSCALL) ", %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(addr), "S"(size) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++uint64_t __sigprocmask(int how, const void *set, void *oldset) { ++#if defined(__APPLE__) ++#define SIGPROCMASK_SYSCALL 0x2000030 ++#else ++#define SIGPROCMASK_SYSCALL 14 ++#endif ++ uint64_t ret; ++ register long r10 asm("r10") = sizeof(uint64_t); ++ __asm__ __volatile__("movq $" STRINGIFY(SIGPROCMASK_SYSCALL) ", %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(how), "S"(set), "d"(oldset), "r"(r10) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++uint64_t __getpid() { ++ uint64_t ret; ++#if defined(__APPLE__) ++#define GETPID_SYSCALL 20 ++#else ++#define GETPID_SYSCALL 39 ++#endif ++ __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++uint64_t __exit(uint64_t code) { ++#if defined(__APPLE__) ++#define EXIT_SYSCALL 0x2000001 ++#else ++#define EXIT_SYSCALL 231 ++#endif ++ uint64_t ret; ++ __asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(code) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++#if !defined(__APPLE__) ++// 
We use a stack-allocated buffer for string manipulation in many pieces of ++// this code, including the code that prints each line of the fdata file. This ++// buffer needs to accomodate large function names, but shouldn't be arbitrarily ++// large (dynamically allocated) for simplicity of our memory space usage. ++ ++// Declare some syscall wrappers we use throughout this code to avoid linking ++// against system libc. ++uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) { ++ uint64_t ret; ++ __asm__ __volatile__("movq $2, %%rax\n" ++ "syscall" ++ : "=a"(ret) ++ : "D"(pathname), "S"(flags), "d"(mode) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++long __getdents64(unsigned int fd, dirent64 *dirp, size_t count) { ++ long ret; ++ __asm__ __volatile__("movq $217, %%rax\n" ++ "syscall" ++ : "=a"(ret) ++ : "D"(fd), "S"(dirp), "d"(count) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) { ++ uint64_t ret; ++ __asm__ __volatile__("movq $89, %%rax\n" ++ "syscall" ++ : "=a"(ret) ++ : "D"(pathname), "S"(buf), "d"(bufsize) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) { ++ uint64_t ret; ++ __asm__ __volatile__("movq $8, %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(fd), "S"(pos), "d"(whence) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++int __ftruncate(uint64_t fd, uint64_t length) { ++ int ret; ++ __asm__ __volatile__("movq $77, %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(fd), "S"(length) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++int __close(uint64_t fd) { ++ uint64_t ret; ++ __asm__ __volatile__("movq $3, %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(fd) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++int __madvise(void *addr, size_t length, int advice) { ++ int ret; ++ __asm__ __volatile__("movq $28, %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(addr), "S"(length), "d"(advice) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++int __uname(struct UtsNameTy *Buf) { ++ int Ret; ++ __asm__ __volatile__("movq $63, %%rax\n" ++ "syscall\n" ++ : "=a"(Ret) ++ : "D"(Buf) ++ : "cc", "rcx", "r11", "memory"); ++ return Ret; ++} ++ ++uint64_t __nanosleep(const timespec *req, timespec *rem) { ++ uint64_t ret; ++ __asm__ __volatile__("movq $35, %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(req), "S"(rem) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++int64_t __fork() { ++ uint64_t ret; ++ __asm__ __volatile__("movq $57, %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++int __mprotect(void *addr, size_t len, int prot) { ++ int ret; ++ __asm__ __volatile__("movq $10, %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(addr), "S"(len), "d"(prot) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++uint64_t __getppid() { ++ uint64_t ret; ++ __asm__ __volatile__("movq $110, %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++int __setpgid(uint64_t pid, uint64_t pgid) { ++ int ret; ++ __asm__ __volatile__("movq $109, %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(pid), "S"(pgid) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++uint64_t __getpgid(uint64_t pid) { ++ uint64_t ret; ++ __asm__ __volatile__("movq $121, %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(pid) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++int 
__kill(uint64_t pid, int sig) { ++ int ret; ++ __asm__ __volatile__("movq $62, %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(pid), "S"(sig) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++int __fsync(int fd) { ++ int ret; ++ __asm__ __volatile__("movq $74, %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(fd) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++// %rdi %rsi %rdx %r10 %r8 ++// sys_prctl int option unsigned unsigned unsigned unsigned ++// long arg2 long arg3 long arg4 long arg5 ++int __prctl(int Option, unsigned long Arg2, unsigned long Arg3, ++ unsigned long Arg4, unsigned long Arg5) { ++ int Ret; ++ register long rdx asm("rdx") = Arg3; ++ register long r8 asm("r8") = Arg5; ++ register long r10 asm("r10") = Arg4; ++ __asm__ __volatile__("movq $157, %%rax\n" ++ "syscall\n" ++ : "=a"(Ret) ++ : "D"(Option), "S"(Arg2), "d"(rdx), "r"(r10), "r"(r8) ++ :); ++ return Ret; ++} ++ ++#endif ++ ++} // anonymous namespace ++ ++#endif +diff --git a/bolt/test/AArch64/exclusive-instrument.s b/bolt/test/AArch64/exclusive-instrument.s +new file mode 100644 +index 0000000..502dd83 +--- /dev/null ++++ b/bolt/test/AArch64/exclusive-instrument.s +@@ -0,0 +1,39 @@ ++// This test checks that the foo function having exclusive memory access ++// instructions won't be instrumented. ++ ++// REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}} ++ ++// RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ ++// RUN: %s -o %t.o ++// RUN: %clang %cflags -fPIC -pie %t.o -o %t.exe -nostdlib -Wl,-q -Wl,-fini=dummy ++// RUN: llvm-bolt %t.exe -o %t.bolt -instrument -v=1 | FileCheck %s ++ ++// CHECK: Function foo has exclusive instructions, skip instrumentation ++ ++.global foo ++.type foo, %function ++foo: ++ ldaxr w9, [x10] ++ cbnz w9, .Lret ++ stlxr w12, w11, [x9] ++ cbz w12, foo ++ clrex ++.Lret: ++ ret ++.size foo, .-foo ++ ++.global _start ++.type _start, %function ++_start: ++ cmp x0, #0 ++ b.eq .Lexit ++ bl foo ++.Lexit: ++ ret ++.size _start, .-_start ++ ++.global dummy ++.type dummy, %function ++dummy: ++ ret ++.size dummy, .-dummy +diff --git a/bolt/test/X86/asm-dump.c b/bolt/test/X86/asm-dump.c +index 5d85e2a..fdd448e 100644 +--- a/bolt/test/X86/asm-dump.c ++++ b/bolt/test/X86/asm-dump.c +@@ -1,13 +1,14 @@ + /** + * Test for asm-dump functionality. + * +- * REQUIRES: system-linux,bolt-runtime ++ * REQUIRES: x86_64-linux,bolt-runtime + * + * Compile the source + * RUN: %clang -fPIC %s -o %t.exe -Wl,-q + * + * Profile collection: instrument the binary +- * RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata -o %t.instr ++ * RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata -o \ ++ * RUN: %t.instr + * + * Profile collection: run instrumented binary (and capture output) + * RUN: %t.instr > %t.result +diff --git a/bolt/test/X86/bolt-address-translation-internal-call.test b/bolt/test/X86/bolt-address-translation-internal-call.test +index edc32d9..24cb635 100644 +--- a/bolt/test/X86/bolt-address-translation-internal-call.test ++++ b/bolt/test/X86/bolt-address-translation-internal-call.test +@@ -4,12 +4,12 @@ + # internal calls) might create new blocks without a mapping to an + # input block. 
+ +-# REQUIRES: system-linux,bolt-runtime ++# REQUIRES: x86_64-linux,bolt-runtime + + # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o + # Delete our BB symbols so BOLT doesn't mark them as entry points + # RUN: llvm-strip --strip-unneeded %t.o +-# RUN: %clang %t.o -o %t.exe -Wl,-q ++# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q + + # RUN: llvm-bolt --enable-bat %t.exe --relocs -o %t.out | FileCheck %s + # CHECK: BOLT-INFO: Wrote {{.*}} BAT maps +@@ -29,6 +29,7 @@ main: + push %rbx + sub $0x120,%rsp + mov $0x3,%rbx ++ movq rel(%rip), %rdi + .J1: + cmp $0x0,%rbx + je .J2 +@@ -49,4 +50,8 @@ main: + .J4: + pop %rbp + retq ++end: + .size main, .-main ++ ++ .data ++rel: .quad end +diff --git a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp +index f6ebd6b..4ed8be4 100644 +--- a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp ++++ b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp +@@ -1,7 +1,7 @@ + // This test checks that .eh_frame_hdr address is in bounds of the last LOAD + // end address i.e. the section address is smaller then the LOAD end address. + +-// REQUIRES: system-linux,bolt-runtime ++// REQUIRES: system-linux,bolt-runtime,target=x86_64{{.*}} + + // RUN: %clangxx %cxxflags -static -Wl,-q %s -o %t.exe -Wl,--entry=_start + // RUN: llvm-bolt %t.exe -o %t.instr -instrument \ +diff --git a/bolt/test/X86/internal-call-instrument.s b/bolt/test/X86/internal-call-instrument.s +index c137174..c393f1d 100644 +--- a/bolt/test/X86/internal-call-instrument.s ++++ b/bolt/test/X86/internal-call-instrument.s +@@ -1,15 +1,23 @@ + # This reproduces a bug with instrumentation crashes on internal call + +-# REQUIRES: system-linux,bolt-runtime ++# REQUIRES: x86_64-linux,bolt-runtime,target=x86_64{{.*}} + + # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o + # Delete our BB symbols so BOLT doesn't mark them as entry points + # RUN: llvm-strip --strip-unneeded %t.o +-# RUN: %clang %t.o -o %t.exe -Wl,-q ++# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q + + # RUN: llvm-bolt --instrument %t.exe --relocs -o %t.out + + .text ++ .globl _start ++ .type _start, %function ++ .p2align 4 ++_start: ++ call main ++ ret ++ .size _start, .-_start ++ + .globl main + .type main, %function + .p2align 4 +@@ -20,6 +28,7 @@ main: + push %rbx + sub $0x120,%rsp + mov $0x3,%rbx ++ movq rel(%rip), %rdi + .J1: + cmp $0x0,%rbx + je .J2 +@@ -40,4 +49,15 @@ main: + .J4: + pop %rbp + retq ++end: + .size main, .-main ++ ++ .globl _fini ++ .type _fini, %function ++ .p2align 4 ++_fini: ++ hlt ++ .size _fini, .-_fini ++ ++ .data ++rel: .quad end +diff --git a/bolt/test/X86/tail-duplication-pass.s b/bolt/test/X86/tail-duplication-pass.s +index 677f498..ed50cc5 100644 +--- a/bolt/test/X86/tail-duplication-pass.s ++++ b/bolt/test/X86/tail-duplication-pass.s +@@ -7,12 +7,21 @@ + # RUN: llvm-bolt %t.exe --data %t.fdata --reorder-blocks=ext-tsp \ + # RUN: --print-finalized --tail-duplication=moderate \ + # RUN: --tail-duplication-minimum-offset=1 -o %t.out | FileCheck %s ++# RUN: llvm-bolt %t.exe --data %t.fdata --print-finalized \ ++# RUN: --tail-duplication=aggressive --tail-duplication-minimum-offset=1 \ ++# RUN: -o %t.out | FileCheck %s --check-prefix CHECK-NOLOOP + + # FDATA: 1 main 2 1 main #.BB2# 0 10 + # FDATA: 1 main 4 1 main #.BB2# 0 20 + # CHECK: BOLT-INFO: tail duplication modified 1 ({{.*}}%) functions; duplicated 1 blocks (1 bytes) responsible for {{.*}} dynamic executions ({{.*}}% of all block executions) + # CHECK: BB Layout : .LBB00, .Ltail-dup0, 
.Ltmp0, .Ltmp1 + ++# Check that the successor of Ltail-dup0 is .LBB00, not itself. ++# CHECK-NOLOOP: .Ltail-dup0 (1 instructions, align : 1) ++# CHECK-NOLOOP: Predecessors: .LBB00 ++# CHECK-NOLOOP: retq ++# CHECK-NOLOOP: .Ltmp0 (1 instructions, align : 1) ++ + .text + .globl main + .type main, %function +diff --git a/bolt/test/assume-abi.test b/bolt/test/assume-abi.test +new file mode 100644 +index 0000000..688ab01 +--- /dev/null ++++ b/bolt/test/assume-abi.test +@@ -0,0 +1,7 @@ ++# Validate the usage of the `--assume-abi` option in conjunction with ++# options related to the RegAnalysis Pass. ++ ++REQUIRES: system-linux ++ ++RUN: %clang %cflags %p/Inputs/hello.c -o %t -Wl,-q ++RUN: llvm-bolt %t -o %t.bolt --assume-abi --indirect-call-promotion=all +diff --git a/bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s b/bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s +new file mode 100644 +index 0000000..fa1ac35 +--- /dev/null ++++ b/bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s +@@ -0,0 +1,9 @@ ++ .globl main ++ .type main, %function ++main: ++ sub sp, sp, #16 ++ mov w0, wzr ++ str wzr, [sp, #12] ++ add sp, sp, #16 ++ ret ++.size main, .-main +diff --git a/bolt/test/runtime/AArch64/basic-instrumentation.test b/bolt/test/runtime/AArch64/basic-instrumentation.test +new file mode 100644 +index 0000000..0f77b0c +--- /dev/null ++++ b/bolt/test/runtime/AArch64/basic-instrumentation.test +@@ -0,0 +1,22 @@ ++# Try to instrument a very fast test. Input bin will not execute any code during ++# runtime besides returning zero in main, so it is a good trivial case. ++REQUIRES: system-linux,bolt-runtime ++ ++RUN: %clang %p/Inputs/basic-instrumentation.s -Wl,-q -o %t.exe ++RUN: llvm-bolt %t.exe -o %t --instrument \ ++RUN: --instrumentation-file=%t \ ++RUN: --instrumentation-file-append-pid ++ ++# Execute program to collect profile ++RUN: rm %t.*.fdata || echo Nothing to remove ++RUN: %t ++ ++# Profile should be written to %t.PID.fdata, check it ++RUN: mv %t.*.fdata %t.fdata ++RUN: cat %t.fdata | FileCheck -check-prefix=CHECK %s ++ ++# Check BOLT works with this profile ++RUN: llvm-bolt %t.exe --data %t.fdata -o %t.2 --reorder-blocks=cache ++ ++# The instrumented profile should at least say main was called once ++CHECK: main 0 0 1{{$}} +diff --git a/bolt/test/runtime/AArch64/instrumentation-ind-call.c b/bolt/test/runtime/AArch64/instrumentation-ind-call.c +new file mode 100644 +index 0000000..76ee8c0 +--- /dev/null ++++ b/bolt/test/runtime/AArch64/instrumentation-ind-call.c +@@ -0,0 +1,38 @@ ++#include <stdio.h> ++ ++typedef int (*func_ptr)(int, int); ++ ++int add(int a, int b) { return a + b; } ++ ++int main() { ++ func_ptr fun; ++ fun = add; ++ int sum = fun(10, 20); // indirect call to 'add' ++ printf("The sum is: %d\n", sum); ++ return 0; ++} ++/* ++REQUIRES: system-linux,bolt-runtime ++ ++RUN: %clang %cflags %s -o %t.exe -Wl,-q -nopie -fpie ++ ++RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata \ ++RUN: -o %t.instrumented ++ ++# Instrumented program needs to finish returning zero ++RUN: %t.instrumented | FileCheck %s -check-prefix=CHECK-OUTPUT ++ ++# Test that the instrumented data makes sense ++RUN: llvm-bolt %t.exe -o %t.bolted --data %t.fdata \ ++RUN: --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \ ++RUN: --print-only=main --print-finalized | FileCheck %s ++ ++RUN: %t.bolted | FileCheck %s -check-prefix=CHECK-OUTPUT ++ ++CHECK-OUTPUT: The sum is: 30 ++ ++# Check that our indirect call has 1 hit recorded in the fdata file and that ++# this was processed correctly
by BOLT ++CHECK: blr x8 # CallProfile: 1 (0 misses) : ++CHECK-NEXT: { add: 1 (0 misses) } ++*/ +diff --git a/bolt/test/runtime/X86/Inputs/exceptions_split.cpp b/bolt/test/runtime/Inputs/exceptions_split.cpp +similarity index 85% +rename from bolt/test/runtime/X86/Inputs/exceptions_split.cpp +rename to bolt/test/runtime/Inputs/exceptions_split.cpp +index 2c136b9..de81adf 100644 +--- a/bolt/test/runtime/X86/Inputs/exceptions_split.cpp ++++ b/bolt/test/runtime/Inputs/exceptions_split.cpp +@@ -3,31 +3,25 @@ + // + // Record performance data with no args. Run test with 2 args. + +-#include <stdio.h> + #include <stdint.h> ++#include <stdio.h> + +-int foo() +-{ +- return 0; +-} ++int foo() { return 0; } + + void bar(int a) { + if (a > 2 && a % 2) + throw new int(); + } + +-void filter_only(){ +- foo(); +-} ++void filter_only() { foo(); } + +-int main(int argc, char **argv) +-{ ++int main(int argc, char **argv) { + unsigned r = 0; + + uint64_t limit = (argc >= 2 ? 10 : 5000); + for (uint64_t i = 0; i < limit; ++i) { + i += foo(); +- try { ++ try { + bar(argc); + try { + if (argc >= 2) +diff --git a/bolt/test/runtime/X86/instrumentation-tail-call.s b/bolt/test/runtime/X86/instrumentation-tail-call.s +index 792d084..dfb12f0 100644 +--- a/bolt/test/runtime/X86/instrumentation-tail-call.s ++++ b/bolt/test/runtime/X86/instrumentation-tail-call.s +@@ -14,6 +14,9 @@ + + # CHECK: leaq 0x80(%rsp), %rsp + ++# RUN: FileCheck %s --input-file %t.fdata --check-prefix=CHECK-FDATA ++# CHECK-FDATA: 1 main {{.*}} 1 targetFunc 0 0 1 ++ + .text + .globl main + .type main, %function +@@ -32,7 +35,8 @@ main: + movq %rbp, %rsp + pop %rbp + mov -0x10(%rsp),%rax +- jmp targetFunc ++ test %rsp, %rsp ++ jne targetFunc + + .LBBerror: + addq $0x20, %rsp +diff --git a/bolt/test/runtime/X86/exceptions-instrumentation.test b/bolt/test/runtime/exceptions-instrumentation.test +similarity index 100% +rename from bolt/test/runtime/X86/exceptions-instrumentation.test +rename to bolt/test/runtime/exceptions-instrumentation.test +diff --git a/bolt/test/runtime/X86/pie-exceptions-split.test b/bolt/test/runtime/pie-exceptions-split.test +similarity index 95% +rename from bolt/test/runtime/X86/pie-exceptions-split.test +rename to bolt/test/runtime/pie-exceptions-split.test +index 124fef6..30f2d02 100644 +--- a/bolt/test/runtime/X86/pie-exceptions-split.test ++++ b/bolt/test/runtime/pie-exceptions-split.test +@@ -16,9 +16,9 @@ RUN: --print-only=main 2>&1 | FileCheck %s + ## All calls to printf() should be from exception handling code that was + ## recorded as cold during the profile collection run. Check that the calls + ## are placed after the split point. +-CHECK-NOT: callq printf ++CHECK-NOT: printf + CHECK: HOT-COLD SPLIT POINT +-CHECK: callq printf ++CHECK: printf + + ## Verify the output still executes correctly when the exception path is being ## taken.
+-- +2.39.5 (Apple Git-154) + diff --git a/0010-AArch64-Add-hybrid-guess-approach-for-edge-weight-estimation.patch b/0010-AArch64-Add-hybrid-guess-approach-for-edge-weight-estimation.patch new file mode 100644 index 0000000000000000000000000000000000000000..206136ce1262bbcef70b6f6529b0eae7673d32e3 --- /dev/null +++ b/0010-AArch64-Add-hybrid-guess-approach-for-edge-weight-estimation.patch @@ -0,0 +1,74 @@ +From 43aa1ec5b46baf032cf2fee22d765a195d40cf59 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E7=89=9F=E6=96=87=E9=BE=99?= +Date: Mon, 18 Nov 2024 02:13:25 +0000 +Subject: [PATCH] [AArch64] Add hybrid guess approach for edge weight + estimation + +--- + bolt/lib/Passes/MCF.cpp | 33 +++++++++++++++++++++++++++++++-- + 1 file changed, 31 insertions(+), 2 deletions(-) + +diff --git a/bolt/lib/Passes/MCF.cpp b/bolt/lib/Passes/MCF.cpp +index c3898d2dc..a6455bbeb 100644 +--- a/bolt/lib/Passes/MCF.cpp ++++ b/bolt/lib/Passes/MCF.cpp +@@ -36,6 +36,11 @@ static cl::opt<bool> IterativeGuess( + cl::desc("in non-LBR mode, guess edge counts using iterative technique"), + cl::Hidden, cl::cat(BoltOptCategory)); + ++static cl::opt<bool> HybridGuess( ++ "hybrid-guess", ++ cl::desc("in non-LBR mode, guess edge counts using hybird estimation technique"), ++ cl::Hidden, cl::cat(BoltOptCategory)); ++ + static cl::opt<bool> UseRArcs( + "mcf-use-rarcs", + cl::desc("in MCF, consider the possibility of cancelling flow to balance " +@@ -350,6 +355,27 @@ void guessEdgeByIterativeApproach(BinaryFunction &BF) { + } + } + ++void guessEdgeByHybridApproach(BinaryFunction &BF, ++ EdgeWeightMap &PredEdgeWeights, ++ EdgeWeightMap &SuccEdgeWeights) { ++ for (BinaryBasicBlock &BB : BF) { ++ for (BinaryBasicBlock *Pred : BB.predecessors()) { ++ double RelativeExecSucc = SuccEdgeWeights[std::make_pair(Pred, &BB)]; ++ double RelativeExec = PredEdgeWeights[std::make_pair(Pred, &BB)]; ++ RelativeExec *= BB.getExecutionCount(); ++ RelativeExecSucc *= Pred->getExecutionCount(); ++ BinaryBasicBlock::BinaryBranchInfo &BI = Pred->getBranchInfo(BB); ++ if ((static_cast<uint64_t>(RelativeExec) != 0) && (static_cast<uint64_t>(RelativeExecSucc) != 0)) { ++ BI.Count = (static_cast<uint64_t>(RelativeExec) + RelativeExecSucc) / 2; ++ } else if (static_cast<uint64_t>(RelativeExec) != 0) { ++ BI.Count = static_cast<uint64_t>(RelativeExec); ++ } else if (static_cast<uint64_t>(RelativeExecSucc) != 0) { ++ BI.Count = static_cast<uint64_t>(RelativeExecSucc); ++ } ++ } ++ } ++} ++ + /// Associate each basic block with the BinaryLoop object corresponding to the + /// innermost loop containing this block.
+ DenseMap<const BinaryBasicBlock *, const BinaryLoop *> +@@ -454,11 +480,14 @@ void estimateEdgeCounts(BinaryFunction &BF) { + equalizeBBCounts(Info, BF); + LLVM_DEBUG(BF.print(dbgs(), "after equalize BB counts")); + } +- if (opts::IterativeGuess) ++ if (opts::IterativeGuess) { + guessEdgeByIterativeApproach(BF); +- else ++ } else if (opts::HybridGuess) { ++ guessEdgeByHybridApproach(BF, PredEdgeWeights, SuccEdgeWeights); ++ } else { + guessEdgeByRelHotness(BF, /*UseSuccs=*/false, PredEdgeWeights, + SuccEdgeWeights); ++ } + recalculateBBCounts(BF, /*AllEdges=*/false); + } + +-- +2.25.1 + diff --git a/0011-support-D-FOT-addrs-data-parsing-for-optimized-binary.patch b/0011-support-D-FOT-addrs-data-parsing-for-optimized-binary.patch new file mode 100644 index 0000000000000000000000000000000000000000..8a3c9e074ef4a6ee4effb9a1cca285138649476d --- /dev/null +++ b/0011-support-D-FOT-addrs-data-parsing-for-optimized-binary.patch @@ -0,0 +1,226 @@ +From 525a2d44443547c0349198df18286f594d62d557 Mon Sep 17 00:00:00 2001 +From: rfwang07 +Date: Tue, 19 Nov 2024 09:48:40 +0800 +Subject: [PATCH] support D-FOT addrs data parsing for optimized binary + +--- + bolt/include/bolt/Profile/DataAggregator.h | 31 ++++++++ + bolt/lib/Profile/DataAggregator.cpp | 86 +++++++++++++++++++++- + 2 files changed, 113 insertions(+), 4 deletions(-) + +diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h +index cc237a6..d352f1b 100644 +--- a/bolt/include/bolt/Profile/DataAggregator.h ++++ b/bolt/include/bolt/Profile/DataAggregator.h +@@ -102,6 +102,12 @@ private: + Type EntryType; + }; + ++ /// Used for parsing specific libkperf input files. ++ struct LibkperfDataEntry { ++ uint64_t Addr; ++ uint64_t Count; ++ }; ++ + struct Trace { + uint64_t From; + uint64_t To; +@@ -300,6 +306,9 @@ private: + /// Parse pre-aggregated LBR samples created by an external tool + ErrorOr<AggregatedLBREntry> parseAggregatedLBREntry(); + ++ /// Parse libkperf samples created by D-FOT ++ ErrorOr<LibkperfDataEntry> parseLibkperfDataEntry(); ++ + /// Parse either buildid:offset or just offset, representing a location in the + /// binary. Used exclusevely for pre-aggregated LBR samples. + ErrorOr<Location> parseLocationOrOffset(); +@@ -417,10 +426,32 @@ private: + /// B 4b196f 4b19e0 2 0 + void parsePreAggregated(); + ++ /// Coordinate reading and parsing of libkperf file ++ /// The regular perf2bolt aggregation job is to read perf output directly. ++ /// But in the oeaware framework, sampling is done by libkperf. ++ /// For data collected by sampling the BOLT-optimized binary, ++ /// oeaware can export addrs and counts. ++ /// In perf2bolt, with the help of the BAT section, ++ /// this data is converted to profile that is usable for the original binary. ++ /// ++ /// File format syntax: ++ /// - first line: <event name> ++ /// - the other lines: <addr> <count> ++ ++ /// Example: ++ /// cycles ++ /// 40f544 1 ++ /// 40f750 2 ++ /// 40f810 53 ++ void parseLibkperfFile(); ++ + /// Parse the full output of pre-aggregated LBR samples generated by + /// an external tool. + std::error_code parsePreAggregatedLBRSamples(); + ++ /// Parse the libkperf samples ++ std::error_code parseLibkperfSamples(); ++ + /// Process parsed pre-aggregated data.
+   void processPreAggregated();
+ 
+diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
+index 24dbe34..509e7c9 100644
+--- a/bolt/lib/Profile/DataAggregator.cpp
++++ b/bolt/lib/Profile/DataAggregator.cpp
+@@ -85,6 +85,11 @@ cl::opt<bool> ReadPreAggregated(
+     "pa", cl::desc("skip perf and read data from a pre-aggregated file format"),
+     cl::cat(AggregatorCategory));
+ 
++cl::opt<bool> ReadLibkperfFile(
++    "libkperf", cl::desc("skip perf and read data from a libkperf file format, "
++                         "only for continuous optimizing with BAT"),
++    cl::cat(AggregatorCategory));
++
+ static cl::opt<bool>
+     TimeAggregator("time-aggr",
+                    cl::desc("time BOLT aggregator"),
+@@ -157,8 +162,8 @@ void DataAggregator::findPerfExecutable() {
+ void DataAggregator::start() {
+   outs() << "PERF2BOLT: Starting data aggregation job for " << Filename << "\n";
+ 
+-  // Don't launch perf for pre-aggregated files
+-  if (opts::ReadPreAggregated)
++  // Don't launch perf for pre-aggregated files and libkperf files
++  if (opts::ReadPreAggregated || opts::ReadLibkperfFile)
+     return;
+ 
+   findPerfExecutable();
+@@ -193,7 +198,7 @@ void DataAggregator::start() {
+ }
+ 
+ void DataAggregator::abort() {
+-  if (opts::ReadPreAggregated)
++  if (opts::ReadPreAggregated || opts::ReadLibkperfFile)
+     return;
+ 
+   std::string Error;
+@@ -313,6 +318,8 @@ void DataAggregator::processFileBuildID(StringRef FileBuildID) {
+ bool DataAggregator::checkPerfDataMagic(StringRef FileName) {
+   if (opts::ReadPreAggregated)
+     return true;
++  if (opts::ReadLibkperfFile)
++    return true;
+ 
+   Expected<sys::fs::file_t> FD = sys::fs::openNativeFileForRead(FileName);
+   if (!FD) {
+@@ -359,6 +366,27 @@ void DataAggregator::parsePreAggregated() {
+   }
+ }
+ 
++void DataAggregator::parseLibkperfFile() {
++  std::string Error;
++
++  ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
++      MemoryBuffer::getFileOrSTDIN(Filename);
++  if (std::error_code EC = MB.getError()) {
++    errs() << "PERF2BOLT-ERROR: cannot open " << Filename << ": "
++           << EC.message() << "\n";
++    exit(1);
++  }
++
++  FileBuf = std::move(*MB);
++  ParsingBuf = FileBuf->getBuffer();
++  Col = 0;
++  Line = 0;
++  if (parseLibkperfSamples()) {
++    errs() << "PERF2BOLT: failed to parse libkperf samples\n";
++    exit(1);
++  }
++}
++
+ std::error_code DataAggregator::writeAutoFDOData(StringRef OutputFilename) {
+   outs() << "PERF2BOLT: writing data for autofdo tools...\n";
+   NamedRegionTimer T("writeAutoFDO", "Processing branch events", TimerGroupName,
+@@ -502,6 +530,11 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
+     return Error::success();
+   }
+ 
++  if (opts::ReadLibkperfFile) {
++    parseLibkperfFile();
++    return Error::success();
++  }
++
+   if (std::optional<StringRef> FileBuildID = BC.getFileBuildID()) {
+     outs() << "BOLT-INFO: binary build-id is: " << *FileBuildID << "\n";
+     processFileBuildID(*FileBuildID);
+@@ -608,7 +641,7 @@ bool DataAggregator::mayHaveProfileData(const BinaryFunction &Function) {
+ void DataAggregator::processProfile(BinaryContext &BC) {
+   if (opts::ReadPreAggregated)
+     processPreAggregated();
+-  else if (opts::BasicAggregation)
++  else if (opts::BasicAggregation || opts::ReadLibkperfFile)
+     processBasicEvents();
+   else
+     processBranchEvents();
+@@ -1206,6 +1239,28 @@ ErrorOr<DataAggregator::Location> DataAggregator::parseLocationOrOffset() {
+   return Location(true, BuildID.get(), Offset.get());
+ }
+ 
++ErrorOr<DataAggregator::LibkperfDataEntry>
++DataAggregator::parseLibkperfDataEntry() {
++  // <addr> <count>
++  while (checkAndConsumeFS()) {
++  }
++  ErrorOr<uint64_t> Addr = parseHexField(FieldSeparator);
++  if (std::error_code EC = Addr.getError())
++    return EC;
++  while (checkAndConsumeFS()) {
++  }
++  ErrorOr<int64_t> Count = parseNumberField(FieldSeparator, true);
++  if (std::error_code EC = Count.getError())
++    return EC;
++
++  if (!checkAndConsumeNewLine()) {
++    reportError("expected end of line");
++    return make_error_code(llvm::errc::io_error);
++  }
++
++  return LibkperfDataEntry{Addr.get(), Count.get()};
++}
++
+ ErrorOr<DataAggregator::AggregatedLBREntry>
+ DataAggregator::parseAggregatedLBREntry() {
+   while (checkAndConsumeFS()) {
+@@ -1712,6 +1767,29 @@ void DataAggregator::processMemEvents() {
+   }
+ }
+ 
++std::error_code DataAggregator::parseLibkperfSamples() {
++  outs() << "PERF2BOLT: parsing libkperf data...\n";
++  NamedRegionTimer T("parseLibkperfData", "Parsing libkperf data",
++                     TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
++  bool FirstLine = true;
++  while (hasData()) {
++    if (FirstLine) {
++      ErrorOr<StringRef> Event = parseString('\n');
++      if (std::error_code EC = Event.getError())
++        return EC;
++      EventNames.insert(Event.get());
++      FirstLine = false;
++    }
++    ErrorOr<LibkperfDataEntry> KperfEntry = parseLibkperfDataEntry();
++    if (std::error_code EC = KperfEntry.getError())
++      return EC;
++
++    BasicSamples[KperfEntry->Addr] += KperfEntry->Count;
++  }
++
++  return std::error_code();
++}
++
+ std::error_code DataAggregator::parsePreAggregatedLBRSamples() {
+   outs() << "PERF2BOLT: parsing pre-aggregated profile...\n";
+   NamedRegionTimer T("parseAggregated", "Parsing aggregated branch events",
+--
+2.39.5 (Apple Git-154)
+
diff --git a/llvm-bolt.spec b/llvm-bolt.spec
index 1be40dc137f07cbb590ff7f3c7738f40bd559ee3..d37cc3b36767016294866b94b6328a3da53a0fd3 100644
--- a/llvm-bolt.spec
+++ b/llvm-bolt.spec
@@ -22,7 +22,7 @@
 Name: %{pkg_name}
 Version: %{bolt_version}
-Release: 1
+Release: 2
 Summary: BOLT is a post-link optimizer developed to speed up large applications
 License: Apache 2.0
 URL: https://github.com/llvm/llvm-project/tree/main/bolt
 
@@ -30,10 +30,17 @@
 Source0: https://github.com/llvm/llvm-project/releases/download/llvmorg-%{bolt_version}/%{bolt_srcdir}.tar.xz
 Source1: https://github.com/llvm/llvm-project/releases/download/llvmorg-%{bolt_version}/%{bolt_srcdir}.tar.xz.sig
 
-# BOLT is not respecting the component split of LLVM and requires some private
-# headers in order to compile itself. Try to disable as much libraries as
-# possible in order to reduce build time.
-#Patch0: rm-llvm-libs.diff
+Patch1: 0001-Fix-trap-value-for-non-X86.patch
+Patch2: 0002-Add-test-for-emitting-trap-value.patch
+Patch3: 0003-AArch64-Add-AArch64-support-for-inline.patch
+Patch4: 0004-Bolt-Solving-pie-support-issue.patch
+Patch5: 0005-BOLT-AArch64-Don-t-change-layout-in-PatchEntries.patch
+Patch6: 0006-AArch64-Add-CFG-block-count-correction-optimization.patch
+Patch7: 0007-BOLT-Skip-PLT-search-for-zero-value-weak-reference-symbols.patch
+Patch8: 0008-merge-fdata-Support-process-no_lbr-profile-file.patch
+Patch9: 0009-support-aarch64-instrumentation.patch
+Patch10: 0010-AArch64-Add-hybrid-guess-approach-for-edge-weight-estimation.patch
+Patch11: 0011-support-D-FOT-addrs-data-parsing-for-optimized-binary.patch
 
 BuildRequires: gcc
 BuildRequires: gcc-c++
@@ -84,7 +91,6 @@
 	-DLLVM_TARGETS_TO_BUILD="AArch64"
 %endif
 
-
 # Set LD_LIBRARY_PATH now because we skip rpath generation and the build uses
 # some just built libraries.
 export LD_LIBRARY_PATH=%{_builddir}/%{bolt_srcdir}/%{_vpath_builddir}/%{_lib}
@@ -104,6 +110,9 @@
 find %{buildroot}%{install_prefix} \
-name "libbolt_rt_instr.a" \ -type f,l -exec rm -f '{}' \; +%ifarch aarch64 +find %{buildroot}%{install_prefix} -name "libbolt_rt_hugify.a" -type f,l -exec rm -f '{}' \; +%endif # Remove files installed during the build phase. rm -f %{buildroot}/%{_builddir}/%{bolt_srcdir}/%{_vpath_builddir}/%{_lib}/lib*.a @@ -120,7 +129,7 @@ mv bolt/README.md bolt/docs/*.md %{buildroot}%{install_docdir} rm bolt/test/cache+-deprecated.test bolt/test/bolt-icf.test bolt/test/R_ABS.pic.lld.cpp %endif -export LD_LIBRARY_PATH=%{_builddir}/%{bolt_srcdir}//%{_vpath_builddir}/%{_lib} +export LD_LIBRARY_PATH=%{_builddir}/%{bolt_srcdir}/%{_vpath_builddir}/%{_lib} export DESTDIR=%{buildroot} %ninja_build check-bolt @@ -136,9 +145,9 @@ rm -f %{buildroot}/%{_builddir}/%{bolt_srcdir}/%{_vpath_builddir}/%{_lib}/lib*.a %{install_bindir}/perf2bolt %{install_bindir}/llvm-bolt-heatmap +%{install_libdir}/libbolt_rt_instr.a %ifarch x86_64 %{install_libdir}/libbolt_rt_hugify.a -%{install_libdir}/libbolt_rt_instr.a %endif %exclude %{_builddir}/%{bolt_srcdir}/lib/* @@ -146,8 +155,13 @@ rm -f %{buildroot}/%{_builddir}/%{bolt_srcdir}/%{_vpath_builddir}/%{_lib}/lib*.a %files doc %doc %{install_docdir} - %changelog +* Thu Nov 21 2024 rfwang07 17.0.6-2 +- Type:backport +- ID:NA +- SUG:NA +- DESC: Sync patch from 2203sp4 + * Mon Dec 4 2023 zhoujing 17.0.6-1 - Update to 17.0.6 @@ -167,4 +181,4 @@ rm -f %{buildroot}/%{_builddir}/%{bolt_srcdir}/%{_vpath_builddir}/%{_lib}/lib*.a - Type:Init - ID:NA - SUG:NA -- DESC:Init llvm-bolt repository \ No newline at end of file +- DESC:Init llvm-bolt repository