diff --git a/0009-support-aarch64-instrumentation.patch b/0009-support-aarch64-instrumentation.patch
new file mode 100644
index 0000000000000000000000000000000000000000..df97a14f08116a4370e2475b2b2403f3b5505d24
--- /dev/null
+++ b/0009-support-aarch64-instrumentation.patch
@@ -0,0 +1,2630 @@
+From a7d826d3985dd886523df050949f1c3c151df636 Mon Sep 17 00:00:00 2001
+From: rfwang07
+Date: Thu, 31 Oct 2024 15:34:10 +0800
+Subject: [PATCH] support aarch64 instrumentation
+
+---
+ bolt/CMakeLists.txt                           |   6 +-
+ bolt/include/bolt/Core/MCPlusBuilder.h        |  24 +-
+ bolt/lib/Core/BinaryFunction.cpp              |   6 +
+ bolt/lib/Passes/Instrumentation.cpp           |  28 +-
+ bolt/lib/Passes/MCF.cpp                       |   1 +
+ bolt/lib/Passes/TailDuplication.cpp           |   2 +-
+ .../Target/AArch64/AArch64MCPlusBuilder.cpp   | 446 +++++++++++++++++-
+ bolt/lib/Target/X86/X86MCPlusBuilder.cpp      |  67 +--
+ bolt/runtime/CMakeLists.txt                   |  12 +-
+ bolt/runtime/common.h                         | 417 ++--
+ bolt/runtime/instr.cpp                        |  61 ++-
+ bolt/runtime/sys_aarch64.h                    | 394 ++++
+ bolt/runtime/sys_x86_64.h                     | 360 ++++
+ bolt/test/AArch64/exclusive-instrument.s      |  39 ++
+ bolt/test/X86/asm-dump.c                      |   5 +-
+ ...olt-address-translation-internal-call.test |   9 +-
+ .../test/X86/instrumentation-eh_frame_hdr.cpp |   2 +-
+ bolt/test/X86/internal-call-instrument.s      |  24 +-
+ bolt/test/X86/tail-duplication-pass.s         |   9 +
+ bolt/test/assume-abi.test                     |   7 +
+ .../AArch64/Inputs/basic-instrumentation.s    |   9 +
+ .../AArch64/basic-instrumentation.test        |  22 +
+ .../AArch64/instrumentation-ind-call.c        |  38 ++
+ .../{X86 => }/Inputs/exceptions_split.cpp     |  16 +-
+ .../runtime/X86/instrumentation-tail-call.s   |   6 +-
+ .../{X86 => }/exceptions-instrumentation.test |   0
+ .../{X86 => }/pie-exceptions-split.test       |   4 +-
+ 27 files changed, 1545 insertions(+), 469 deletions(-)
+ create mode 100644 bolt/runtime/sys_aarch64.h
+ create mode 100644 bolt/runtime/sys_x86_64.h
+ create mode 100644 bolt/test/AArch64/exclusive-instrument.s
+ create mode 100644 bolt/test/assume-abi.test
+ create mode 100644 bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s
+ create mode 100644 bolt/test/runtime/AArch64/basic-instrumentation.test
+ create mode 100644 bolt/test/runtime/AArch64/instrumentation-ind-call.c
+ rename bolt/test/runtime/{X86 => }/Inputs/exceptions_split.cpp (85%)
+ rename bolt/test/runtime/{X86 => }/exceptions-instrumentation.test (100%)
+ rename bolt/test/runtime/{X86 => }/pie-exceptions-split.test (95%)
+
+diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt
+index 4ff90c1..89462f8 100644
+--- a/bolt/CMakeLists.txt
++++ b/bolt/CMakeLists.txt
+@@ -32,10 +32,10 @@ foreach (tgt ${BOLT_TARGETS_TO_BUILD})
+ endforeach()
+
+ set(BOLT_ENABLE_RUNTIME_default OFF)
+-if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64"
++if ((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64"
++     OR CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+     AND (CMAKE_SYSTEM_NAME STREQUAL "Linux"
+-        OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+-    AND "X86" IN_LIST BOLT_TARGETS_TO_BUILD)
++        OR CMAKE_SYSTEM_NAME STREQUAL "Darwin"))
+   set(BOLT_ENABLE_RUNTIME_default ON)
+ endif()
+ option(BOLT_ENABLE_RUNTIME "Enable BOLT runtime" ${BOLT_ENABLE_RUNTIME_default})
+diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
+index beb0675..e6945c9 100644
+--- a/bolt/include/bolt/Core/MCPlusBuilder.h
++++ b/bolt/include/bolt/Core/MCPlusBuilder.h
+@@ -498,9 +498,9 @@ public:
+   }
+
+   /// Create increment contents of target by 1 for Instrumentation
+-  virtual InstructionListType createInstrIncMemory(const MCSymbol *Target,
+-                                                   MCContext *Ctx,
+-                                                   bool IsLeaf) const {
++  virtual InstructionListType
++  createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
++                       unsigned CodePointerSize) const {
+     llvm_unreachable("not implemented");
+     return InstructionListType();
+   }
+@@ -620,6 +620,11 @@ public:
+     return false;
+   }
+
++  virtual bool isAArch64Exclusive(const MCInst &Inst) const {
++    llvm_unreachable("not implemented");
++    return false;
++  }
++
+   virtual bool isCleanRegXOR(const MCInst &Inst) const {
+     llvm_unreachable("not implemented");
+     return false;
+@@ -1597,18 +1602,11 @@ public:
+     return false;
+   }
+
+-  virtual void createLoadImmediate(MCInst &Inst, const MCPhysReg Dest,
+-                                   uint32_t Imm) const {
++  virtual InstructionListType createLoadImmediate(const MCPhysReg Dest,
++                                                  uint64_t Imm) const {
+     llvm_unreachable("not implemented");
+   }
+
+-  /// Create instruction to increment contents of target by 1
+-  virtual bool createIncMemory(MCInst &Inst, const MCSymbol *Target,
+-                               MCContext *Ctx) const {
+-    llvm_unreachable("not implemented");
+-    return false;
+-  }
+-
+   /// Create a fragment of code (sequence of instructions) that load a 32-bit
+   /// address from memory, zero-extends it to 64 and jump to it (indirect jump).
+   virtual bool
+@@ -1969,7 +1967,7 @@ public:
+   }
+
+   virtual InstructionListType createSymbolTrampoline(const MCSymbol *TgtSym,
+-                                                     MCContext *Ctx) const {
++                                                     MCContext *Ctx) {
+     llvm_unreachable("not implemented");
+     return InstructionListType();
+   }
+diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
+index 5b44a76..b79bd58 100644
+--- a/bolt/lib/Core/BinaryFunction.cpp
++++ b/bolt/lib/Core/BinaryFunction.cpp
+@@ -2305,6 +2305,12 @@ void BinaryFunction::removeConditionalTailCalls() {
+
+     // This branch is no longer a conditional tail call.
+     BC.MIB->unsetConditionalTailCall(*CTCInstr);
++
++    // Move offset from CTCInstr to TailCallInstr.
++    if (std::optional<uint32_t> Offset = BC.MIB->getOffset(*CTCInstr)) {
++      BC.MIB->setOffset(TailCallInstr, *Offset);
++      BC.MIB->clearOffset(*CTCInstr);
++    }
+   }
+
+   insertBasicBlocks(std::prev(end()), std::move(NewBlocks),
+diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp
+index fae6770..72adb31 100644
+--- a/bolt/lib/Passes/Instrumentation.cpp
++++ b/bolt/lib/Passes/Instrumentation.cpp
+@@ -13,6 +13,7 @@
+ #include "bolt/Passes/Instrumentation.h"
+ #include "bolt/Core/ParallelUtilities.h"
+ #include "bolt/RuntimeLibs/InstrumentationRuntimeLibrary.h"
++#include "bolt/Utils/CommandLineOpts.h"
+ #include "bolt/Utils/Utils.h"
+ #include "llvm/Support/CommandLine.h"
+ #include "llvm/Support/RWMutex.h"
+@@ -85,6 +86,24 @@ cl::opt<bool> InstrumentCalls("instrument-calls",
+ namespace llvm {
+ namespace bolt {
+
++static bool hasAArch64ExclusiveMemop(BinaryFunction &Function) {
++  // FIXME: the ARMv8-A architecture reference manual says that software
++  // must avoid any explicit memory accesses between an exclusive load and
++  // the associated store instruction. So for now, skip instrumentation of
++  // functions that contain these instructions, as it may cause a deadlock.
++  BinaryContext &BC = Function.getBinaryContext();
++  for (const BinaryBasicBlock &BB : Function)
++    for (const MCInst &Inst : BB)
++      if (BC.MIB->isAArch64Exclusive(Inst)) {
++        if (opts::Verbosity >= 1)
++          outs() << "BOLT-INSTRUMENTER: Function " << Function
++                 << " has exclusive instructions, skip instrumentation\n";
++        return true;
++      }
++
++  return false;
++}
++
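// Illustration (not part of the patch): the kind of code the check above
// guards against. A typical atomic read-modify-write built on exclusive
// accesses looks like this:
//
//   .Lretry:
//     ldaxr x1, [x0]        // exclusive load; sets the exclusive monitor
//     add   x1, x1, #1
//     stlxr w2, x1, [x0]    // succeeds only while the monitor is still set
//     cbnz  w2, .Lretry     // otherwise try again
//
// The instrumentation snippet performs loads and stores of its own, so
// inserting it between the ldaxr and the stlxr could clear the monitor on
// every iteration, making the stlxr fail forever and the loop spin.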
+ uint32_t Instrumentation::getFunctionNameIndex(const BinaryFunction &Function) {
+   auto Iter = FuncToStringIdx.find(&Function);
+   if (Iter != FuncToStringIdx.end())
+@@ -176,7 +195,8 @@ Instrumentation::createInstrumentationSnippet(BinaryContext &BC, bool IsLeaf) {
+   auto L = BC.scopeLock();
+   MCSymbol *Label = BC.Ctx->createNamedTempSymbol("InstrEntry");
+   Summary->Counters.emplace_back(Label);
+-  return BC.MIB->createInstrIncMemory(Label, BC.Ctx.get(), IsLeaf);
++  return BC.MIB->createInstrIncMemory(Label, BC.Ctx.get(), IsLeaf,
++                                      BC.AsmInfo->getCodePointerSize());
+ }
+
+ // Helper instruction sequence insertion function
+@@ -287,6 +307,9 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function,
+   if (BC.isMachO() && Function.hasName("___GLOBAL_init_65535/1"))
+     return;
+
++  if (BC.isAArch64() && hasAArch64ExclusiveMemop(Function))
++    return;
++
+   SplitWorklistTy SplitWorklist;
+   SplitInstrsTy SplitInstrs;
+
+@@ -504,9 +527,6 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function,
+ }
+
+ void Instrumentation::runOnFunctions(BinaryContext &BC) {
+-  if (!BC.isX86())
+-    return;
+-
+   const unsigned Flags = BinarySection::getFlags(/*IsReadOnly=*/false,
+                                                  /*IsText=*/false,
+                                                  /*IsAllocatable=*/true);
+diff --git a/bolt/lib/Passes/MCF.cpp b/bolt/lib/Passes/MCF.cpp
+index ec04012..c3898d2 100644
+--- a/bolt/lib/Passes/MCF.cpp
++++ b/bolt/lib/Passes/MCF.cpp
+@@ -262,6 +262,7 @@ bool guessPredEdgeCounts(BinaryBasicBlock *BB, ArcSet &GuessedArcs) {
+       continue;
+
+     Pred->getBranchInfo(*BB).Count = Guessed;
++    GuessedArcs.insert(std::make_pair(Pred, BB));
+     return true;
+   }
+   llvm_unreachable("Expected unguessed arc");
+diff --git a/bolt/lib/Passes/TailDuplication.cpp b/bolt/lib/Passes/TailDuplication.cpp
+index c04efd7..7141d5d 100644
+--- a/bolt/lib/Passes/TailDuplication.cpp
++++ b/bolt/lib/Passes/TailDuplication.cpp
+@@ -303,7 +303,7 @@ TailDuplication::aggressiveDuplicate(BinaryBasicBlock &BB,
+   if (isInCacheLine(BB, Tail))
+     return BlocksToDuplicate;
+
+-  BinaryBasicBlock *CurrBB = &BB;
++  BinaryBasicBlock *CurrBB = &Tail;
+   while (CurrBB) {
+     LLVM_DEBUG(dbgs() << "Aggressive tail duplication: adding "
+                       << CurrBB->getName() << " to duplication list\n";);
+diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+index cd66b65..3f6497e 100644
+--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
++++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+@@ -16,6 +16,9 @@
+ #include "Utils/AArch64BaseInfo.h"
+ #include "bolt/Core/MCPlusBuilder.h"
+ #include "llvm/BinaryFormat/ELF.h"
++#include "llvm/MC/MCContext.h"
++#include "llvm/MC/MCFixupKindInfo.h"
++#include "llvm/MC/MCInstBuilder.h"
+ #include "llvm/MC/MCInstrInfo.h"
+ #include "llvm/MC/MCRegisterInfo.h"
+ #include "llvm/Support/Debug.h"
+@@ -28,6 +31,100 @@ using namespace bolt;
+
+ namespace {
+
++static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) {
++  Inst.setOpcode(AArch64::MRS);
++  Inst.clear();
++  Inst.addOperand(MCOperand::createReg(RegName));
++  Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV));
++}
++
++static void setSystemFlag(MCInst &Inst, MCPhysReg RegName) {
++  Inst.setOpcode(AArch64::MSR);
++  Inst.clear();
++  Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV));
++  Inst.addOperand(MCOperand::createReg(RegName));
++}
++
++static void createPushRegisters(MCInst &Inst, MCPhysReg Reg1, MCPhysReg Reg2) {
++  Inst.clear();
++  unsigned NewOpcode = AArch64::STPXpre;
++  Inst.setOpcode(NewOpcode);
++  Inst.addOperand(MCOperand::createReg(AArch64::SP));
++  Inst.addOperand(MCOperand::createReg(Reg1));
++  Inst.addOperand(MCOperand::createReg(Reg2));
++  Inst.addOperand(MCOperand::createReg(AArch64::SP));
++  Inst.addOperand(MCOperand::createImm(-2));
++}
++
++static void createPopRegisters(MCInst &Inst, MCPhysReg Reg1, MCPhysReg Reg2) {
++  Inst.clear();
++  unsigned NewOpcode = AArch64::LDPXpost;
++  Inst.setOpcode(NewOpcode);
++  Inst.addOperand(MCOperand::createReg(AArch64::SP));
++  Inst.addOperand(MCOperand::createReg(Reg1));
++  Inst.addOperand(MCOperand::createReg(Reg2));
++  Inst.addOperand(MCOperand::createReg(AArch64::SP));
++  Inst.addOperand(MCOperand::createImm(2));
++}
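// Note (illustrative, not from the patch): for STPXpre/LDPXpost the MC-level
// immediate is scaled by the access size, 8 bytes for X registers, so the
// -2/+2 above assemble to
//
//   stp x0, x1, [sp, #-16]!   // push: pre-decrement sp by 16 bytes
//   ldp x0, x1, [sp], #16     // pop: post-increment sp by 16 bytes
//
// which keeps the stack 16-byte aligned as AAPCS64 requires.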
++static void loadReg(MCInst &Inst, MCPhysReg To, MCPhysReg From) {
++  Inst.setOpcode(AArch64::LDRXui);
++  Inst.clear();
++  if (From == AArch64::SP) {
++    Inst.setOpcode(AArch64::LDRXpost);
++    Inst.addOperand(MCOperand::createReg(From));
++    Inst.addOperand(MCOperand::createReg(To));
++    Inst.addOperand(MCOperand::createReg(From));
++    Inst.addOperand(MCOperand::createImm(16));
++  } else {
++    Inst.addOperand(MCOperand::createReg(To));
++    Inst.addOperand(MCOperand::createReg(From));
++    Inst.addOperand(MCOperand::createImm(0));
++  }
++}
++
++static void storeReg(MCInst &Inst, MCPhysReg From, MCPhysReg To) {
++  Inst.setOpcode(AArch64::STRXui);
++  Inst.clear();
++  if (To == AArch64::SP) {
++    Inst.setOpcode(AArch64::STRXpre);
++    Inst.addOperand(MCOperand::createReg(To));
++    Inst.addOperand(MCOperand::createReg(From));
++    Inst.addOperand(MCOperand::createReg(To));
++    Inst.addOperand(MCOperand::createImm(-16));
++  } else {
++    Inst.addOperand(MCOperand::createReg(From));
++    Inst.addOperand(MCOperand::createReg(To));
++    Inst.addOperand(MCOperand::createImm(0));
++  }
++}
++
++static void atomicAdd(MCInst &Inst, MCPhysReg RegTo, MCPhysReg RegCnt) {
++  // NOTE: Supports only ARM with LSE extension
++  Inst.setOpcode(AArch64::LDADDX);
++  Inst.clear();
++  Inst.addOperand(MCOperand::createReg(AArch64::XZR));
++  Inst.addOperand(MCOperand::createReg(RegCnt));
++  Inst.addOperand(MCOperand::createReg(RegTo));
++}
++
++static void createMovz(MCInst &Inst, MCPhysReg Reg, uint64_t Imm) {
++  assert(Imm <= UINT16_MAX && "Invalid Imm size");
++  Inst.clear();
++  Inst.setOpcode(AArch64::MOVZXi);
++  Inst.addOperand(MCOperand::createReg(Reg));
++  Inst.addOperand(MCOperand::createImm(Imm & 0xFFFF));
++  Inst.addOperand(MCOperand::createImm(0));
++}
++
++static InstructionListType createIncMemory(MCPhysReg RegTo, MCPhysReg RegTmp) {
++  InstructionListType Insts;
++  Insts.emplace_back();
++  createMovz(Insts.back(), RegTmp, 1);
++  Insts.emplace_back();
++  atomicAdd(Insts.back(), RegTo, RegTmp);
++  return Insts;
++}
+ class AArch64MCPlusBuilder : public MCPlusBuilder {
+ public:
+   AArch64MCPlusBuilder(const MCInstrAnalysis *Analysis, const MCInstrInfo *Info,
+@@ -176,6 +273,34 @@ public:
+     return isLDRB(Inst) || isLDRH(Inst) || isLDRW(Inst) || isLDRX(Inst);
+   }
+
++  bool isAArch64Exclusive(const MCInst &Inst) const override {
++    return (Inst.getOpcode() == AArch64::LDXPX ||
++            Inst.getOpcode() == AArch64::LDXPW ||
++            Inst.getOpcode() == AArch64::LDXRX ||
++            Inst.getOpcode() == AArch64::LDXRW ||
++            Inst.getOpcode() == AArch64::LDXRH ||
++            Inst.getOpcode() == AArch64::LDXRB ||
++            Inst.getOpcode() == AArch64::STXPX ||
++            Inst.getOpcode() == AArch64::STXPW ||
++            Inst.getOpcode() == AArch64::STXRX ||
++            Inst.getOpcode() == AArch64::STXRW ||
++            Inst.getOpcode() == AArch64::STXRH ||
++            Inst.getOpcode() == AArch64::STXRB ||
++            Inst.getOpcode() == AArch64::LDAXPX ||
++            Inst.getOpcode() == AArch64::LDAXPW ||
++            Inst.getOpcode() == AArch64::LDAXRX ||
++            Inst.getOpcode() == AArch64::LDAXRW ||
++            Inst.getOpcode() == AArch64::LDAXRH ||
++            Inst.getOpcode() == AArch64::LDAXRB ||
++            Inst.getOpcode() == AArch64::STLXPX ||
++            Inst.getOpcode() == AArch64::STLXPW ||
++            Inst.getOpcode() == AArch64::STLXRX ||
++            Inst.getOpcode() == AArch64::STLXRW ||
++            Inst.getOpcode() == AArch64::STLXRH ||
++            Inst.getOpcode() == AArch64::STLXRB ||
++            Inst.getOpcode() == AArch64::CLREX);
++  }
++
+   bool isLoadFromStack(const MCInst &Inst) const {
+     if (!isLoad(Inst))
+       return false;
+@@ -207,6 +332,40 @@ public:
+     return Inst.getOpcode() == AArch64::BLR;
+   }
+
++  MCPhysReg getSpRegister(int Size) const {
++    switch (Size) {
++    case 4:
++      return AArch64::WSP;
++    case 8:
++      return AArch64::SP;
++    default:
++      llvm_unreachable("Unexpected size");
++    }
++  }
++
++  MCPhysReg getIntArgRegister(unsigned ArgNo) const override {
++    switch (ArgNo) {
++    case 0:
++      return AArch64::X0;
++    case 1:
++      return AArch64::X1;
++    case 2:
++      return AArch64::X2;
++    case 3:
++      return AArch64::X3;
++    case 4:
++      return AArch64::X4;
++    case 5:
++      return AArch64::X5;
++    case 6:
++      return AArch64::X6;
++    case 7:
++      return AArch64::X7;
++    default:
++      return getNoRegister();
++    }
++  }
++
+   bool hasPCRelOperand(const MCInst &Inst) const override {
+     // ADRP is blacklisted and is an exception. Even though it has a
+     // PC-relative operand, this operand is not a complete symbol reference
+@@ -313,6 +472,22 @@ public:
+     return true;
+   }
+
++  void getCalleeSavedRegs(BitVector &Regs) const override {
++    Regs |= getAliases(AArch64::X18);
++    Regs |= getAliases(AArch64::X19);
++    Regs |= getAliases(AArch64::X20);
++    Regs |= getAliases(AArch64::X21);
++    Regs |= getAliases(AArch64::X22);
++    Regs |= getAliases(AArch64::X23);
++    Regs |= getAliases(AArch64::X24);
++    Regs |= getAliases(AArch64::X25);
++    Regs |= getAliases(AArch64::X26);
++    Regs |= getAliases(AArch64::X27);
++    Regs |= getAliases(AArch64::X28);
++    Regs |= getAliases(AArch64::LR);
++    Regs |= getAliases(AArch64::FP);
++  }
++
+   const MCExpr *getTargetExprFor(MCInst &Inst, const MCExpr *Expr,
+                                  MCContext &Ctx,
+                                  uint64_t RelType) const override {
+@@ -818,6 +993,22 @@ public:
+
+   int getUncondBranchEncodingSize() const override { return 28; }
+
++  InstructionListType createCmpJE(MCPhysReg RegNo, int64_t Imm,
++                                  const MCSymbol *Target,
++                                  MCContext *Ctx) const override {
++    InstructionListType Code;
++    Code.emplace_back(MCInstBuilder(AArch64::SUBSXri)
++                          .addReg(RegNo)
++                          .addReg(RegNo)
++                          .addImm(Imm)
++                          .addImm(0));
++    Code.emplace_back(MCInstBuilder(AArch64::Bcc)
++                          .addImm(Imm)
++                          .addExpr(MCSymbolRefExpr::create(
++                              Target, MCSymbolRefExpr::VK_None, *Ctx)));
++    return Code;
++  }
++
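// Observation (not a change): the Bcc above takes a condition code as its
// first operand, and this helper reuses Imm for it. That works only because
// the sole caller, createInstrumentedIndCallHandlerEntryBB below, passes
// Imm == 0, which happens to equal AArch64CC::EQ. A sketch of a more explicit
// variant would be
//
//   MCInstBuilder(AArch64::Bcc).addImm(AArch64CC::EQ).addExpr(...)
//
// so that the compare value and the branch condition stay independent.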
+   bool createCall(MCInst &Inst, const MCSymbol *Target,
+                   MCContext *Ctx) override {
+     Inst.setOpcode(AArch64::BL);
+@@ -828,12 +1019,7 @@ public:
+
+   bool createTailCall(MCInst &Inst, const MCSymbol *Target,
+                       MCContext *Ctx) override {
+-    Inst.setOpcode(AArch64::B);
+-    Inst.addOperand(MCOperand::createExpr(getTargetExprFor(
+-        Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
+-        *Ctx, 0)));
+-    setTailCall(Inst);
+-    return true;
++    return createDirectCall(Inst, Target, Ctx, /*IsTailCall*/ true);
+   }
+
+   void createLongTailCall(InstructionListType &Seq, const MCSymbol *Target,
+@@ -882,6 +1068,18 @@ public:
+
+   bool isStore(const MCInst &Inst) const override { return false; }
+
++  bool createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx,
++                        bool IsTailCall) override {
++    Inst.setOpcode(IsTailCall ? AArch64::B : AArch64::BL);
++    Inst.clear();
++    Inst.addOperand(MCOperand::createExpr(getTargetExprFor(
++        Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
++        *Ctx, 0)));
++    if (IsTailCall)
++      convertJmpToTailCall(Inst);
++    return true;
++  }
++
+   bool analyzeBranch(InstructionIterator Begin, InstructionIterator End,
+                      const MCSymbol *&TBB, const MCSymbol *&FBB,
+                      MCInst *&CondBranch,
+@@ -1153,6 +1351,242 @@ public:
+     return true;
+   }
+
++  bool createStackPointerIncrement(
++      MCInst &Inst, int Size,
++      bool NoFlagsClobber = false /*unused for AArch64*/) const override {
++    Inst.setOpcode(AArch64::SUBXri);
++    Inst.clear();
++    Inst.addOperand(MCOperand::createReg(AArch64::SP));
++    Inst.addOperand(MCOperand::createReg(AArch64::SP));
++    Inst.addOperand(MCOperand::createImm(Size));
++    Inst.addOperand(MCOperand::createImm(0));
++    return true;
++  }
++
++  bool createStackPointerDecrement(
++      MCInst &Inst, int Size,
++      bool NoFlagsClobber = false /*unused for AArch64*/) const override {
++    Inst.setOpcode(AArch64::ADDXri);
++    Inst.clear();
++    Inst.addOperand(MCOperand::createReg(AArch64::SP));
++    Inst.addOperand(MCOperand::createReg(AArch64::SP));
++    Inst.addOperand(MCOperand::createImm(Size));
++    Inst.addOperand(MCOperand::createImm(0));
++    return true;
++  }
++
++  void createIndirectBranch(MCInst &Inst, MCPhysReg MemBaseReg,
++                            int64_t Disp) const {
++    Inst.setOpcode(AArch64::BR);
++    Inst.addOperand(MCOperand::createReg(MemBaseReg));
++  }
++
++  InstructionListType createInstrumentedIndCallHandlerExitBB() const override {
++    InstructionListType Insts(5);
++    // Code sequence for instrumented indirect call handler:
++    //   msr nzcv, x1
++    //   ldp x0, x1, [sp], #16
++    //   ldr x16, [sp], #16
++    //   ldp x0, x1, [sp], #16
++    //   br x16
++    setSystemFlag(Insts[0], AArch64::X1);
++    createPopRegisters(Insts[1], AArch64::X0, AArch64::X1);
++    // Here we load the address of the function that should be called in the
++    // original binary into X16. X16 is an intra-procedure-call scratch
++    // register, so it may be clobbered here without saving and restoring.
++    loadReg(Insts[2], AArch64::X16, AArch64::SP);
++    createPopRegisters(Insts[3], AArch64::X0, AArch64::X1);
++    createIndirectBranch(Insts[4], AArch64::X16, 0);
++    return Insts;
++  }
++
++  InstructionListType
++  createInstrumentedIndTailCallHandlerExitBB() const override {
++    return createInstrumentedIndCallHandlerExitBB();
++  }
++
++  InstructionListType createGetter(MCContext *Ctx, const char *name) const {
++    InstructionListType Insts(4);
++    MCSymbol *Locs = Ctx->getOrCreateSymbol(name);
++    InstructionListType Addr = materializeAddress(Locs, Ctx, AArch64::X0);
++    std::copy(Addr.begin(), Addr.end(), Insts.begin());
++    assert(Addr.size() == 2 && "Invalid Addr size");
++    loadReg(Insts[2], AArch64::X0, AArch64::X0);
++    createReturn(Insts[3]);
++    return Insts;
++  }
++
++  InstructionListType createNumCountersGetter(MCContext *Ctx) const override {
++    return createGetter(Ctx, "__bolt_num_counters");
++  }
++
++  InstructionListType
++  createInstrLocationsGetter(MCContext *Ctx) const override {
++    return createGetter(Ctx, "__bolt_instr_locations");
++  }
++
++  InstructionListType createInstrTablesGetter(MCContext *Ctx) const override {
++    return createGetter(Ctx, "__bolt_instr_tables");
++  }
++
++  InstructionListType createInstrNumFuncsGetter(MCContext *Ctx) const override {
++    return createGetter(Ctx, "__bolt_instr_num_funcs");
++  }
++
++  void convertIndirectCallToLoad(MCInst &Inst, MCPhysReg Reg) override {
++    bool IsTailCall = isTailCall(Inst);
++    if (IsTailCall)
++      removeAnnotation(Inst, MCPlus::MCAnnotation::kTailCall);
++    if (Inst.getOpcode() == AArch64::BR || Inst.getOpcode() == AArch64::BLR) {
++      Inst.setOpcode(AArch64::ORRXrs);
++      Inst.insert(Inst.begin(), MCOperand::createReg(Reg));
++      Inst.insert(Inst.begin() + 1, MCOperand::createReg(AArch64::XZR));
++      Inst.insert(Inst.begin() + 3, MCOperand::createImm(0));
++      return;
++    }
++    llvm_unreachable("not implemented");
++  }
++
++  InstructionListType createLoadImmediate(const MCPhysReg Dest,
++                                          uint64_t Imm) const override {
++    InstructionListType Insts(4);
++    int Shift = 48;
++    for (int I = 0; I < 4; I++, Shift -= 16) {
++      Insts[I].setOpcode(AArch64::MOVKXi);
++      Insts[I].addOperand(MCOperand::createReg(Dest));
++      Insts[I].addOperand(MCOperand::createReg(Dest));
++      Insts[I].addOperand(MCOperand::createImm((Imm >> Shift) & 0xFFFF));
++      Insts[I].addOperand(MCOperand::createImm(Shift));
++    }
++    return Insts;
++  }
++
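// Worked example (illustrative): loading Imm = 0x0000112233445566 with the
// MOVK chain above produces
//
//   movk x1, #0x0000, lsl #48
//   movk x1, #0x1122, lsl #32
//   movk x1, #0x3344, lsl #16
//   movk x1, #0x5566
//
// MOVK replaces only one 16-bit slice per instruction, but since all four
// slices are written, the previous contents of the register do not matter
// and no leading MOVZ is needed.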
++  void createIndirectCallInst(MCInst &Inst, bool IsTailCall,
++                              MCPhysReg Reg) const {
++    Inst.clear();
++    Inst.setOpcode(IsTailCall ? AArch64::BR : AArch64::BLR);
++    Inst.addOperand(MCOperand::createReg(Reg));
++  }
++
++  InstructionListType createInstrumentedIndirectCall(MCInst &&CallInst,
++                                                     MCSymbol *HandlerFuncAddr,
++                                                     int CallSiteID,
++                                                     MCContext *Ctx) override {
++    InstructionListType Insts;
++    // Code sequence used to enter indirect call instrumentation helper:
++    //   stp x0, x1, [sp, #-16]!  createPushRegisters
++    //   mov target x0            convertIndirectCallToLoad -> orr x0 target xzr
++    //   mov x1 CallSiteID        createLoadImmediate ->
++    //     movk x1, #0x0, lsl #48
++    //     movk x1, #0x0, lsl #32
++    //     movk x1, #0x0, lsl #16
++    //     movk x1, #0x0
++    //   stp x0, x1, [sp, #-16]!
++    //   bl *HandlerFuncAddr      createIndirectCall ->
++    //     adr x0 *HandlerFuncAddr -> adrp + add
++    //     blr x0
++    Insts.emplace_back();
++    createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
++    Insts.emplace_back(CallInst);
++    convertIndirectCallToLoad(Insts.back(), AArch64::X0);
++    InstructionListType LoadImm =
++        createLoadImmediate(getIntArgRegister(1), CallSiteID);
++    Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end());
++    Insts.emplace_back();
++    createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
++    Insts.resize(Insts.size() + 2);
++    InstructionListType Addr =
++        materializeAddress(HandlerFuncAddr, Ctx, AArch64::X0);
++    assert(Addr.size() == 2 && "Invalid Addr size");
++    std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size());
++    Insts.emplace_back();
++    createIndirectCallInst(Insts.back(), isTailCall(CallInst), AArch64::X0);
++
++    // Carry over metadata including tail call marker if present.
++    stripAnnotations(Insts.back());
++    moveAnnotations(std::move(CallInst), Insts.back());
++
++    return Insts;
++  }
++
++  InstructionListType
++  createInstrumentedIndCallHandlerEntryBB(const MCSymbol *InstrTrampoline,
++                                          const MCSymbol *IndCallHandler,
++                                          MCContext *Ctx) override {
++    // Code sequence used to check whether InstrTrampoline was initialized
++    // and call it if so; returns via IndCallHandler:
++    //   stp x0, x1, [sp, #-16]!
++    //   mrs x1, nzcv
++    //   adr x0, InstrTrampoline -> adrp + add
++    //   ldr x0, [x0]
++    //   subs x0, x0, #0x0
++    //   b.eq IndCallHandler
++    //   str x30, [sp, #-16]!
++    //   blr x0
++    //   ldr x30, [sp], #16
++    //   b IndCallHandler
++    InstructionListType Insts;
++    Insts.emplace_back();
++    createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1);
++    Insts.emplace_back();
++    getSystemFlag(Insts.back(), getIntArgRegister(1));
++    Insts.emplace_back();
++    Insts.emplace_back();
++    InstructionListType Addr =
++        materializeAddress(InstrTrampoline, Ctx, AArch64::X0);
++    std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size());
++    assert(Addr.size() == 2 && "Invalid Addr size");
++    Insts.emplace_back();
++    loadReg(Insts.back(), AArch64::X0, AArch64::X0);
++    InstructionListType cmpJmp =
++        createCmpJE(AArch64::X0, 0, IndCallHandler, Ctx);
++    Insts.insert(Insts.end(), cmpJmp.begin(), cmpJmp.end());
++    Insts.emplace_back();
++    storeReg(Insts.back(), AArch64::LR, AArch64::SP);
++    Insts.emplace_back();
++    Insts.back().setOpcode(AArch64::BLR);
++    Insts.back().addOperand(MCOperand::createReg(AArch64::X0));
++    Insts.emplace_back();
++    loadReg(Insts.back(), AArch64::LR, AArch64::SP);
++    Insts.emplace_back();
++    createDirectCall(Insts.back(), IndCallHandler, Ctx, /*IsTailCall*/ true);
++    return Insts;
++  }
++
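// The next method builds the per-location counter increment. Roughly, the
// emitted sequence is (a sketch; the exact registers come from the code
// below, and the ldadd requires LSE atomics):
//
//   sub  sp, sp, #128        // leaf functions only: step over the red zone
//   stp  x0, x1, [sp, #-16]!
//   mrs  x1, nzcv            // preserve NZCV while the snippet runs
//   adrp x0, Target          // materializeAddress
//   add  x0, x0, :lo12:Target
//   str  x2, [sp, #-16]!
//   mov  x2, #1              // movz
//   ldadd x2, xzr, [x0]      // atomic *Target += 1
//   ldr  x2, [sp], #16
//   msr  nzcv, x1            // restore flags
//   ldp  x0, x1, [sp], #16
//   add  sp, sp, #128        // leaf functions only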
++  InstructionListType
++  createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
++                       unsigned CodePointerSize) const override {
++    unsigned int I = 0;
++    InstructionListType Instrs(IsLeaf ? 12 : 10);
++
++    if (IsLeaf)
++      createStackPointerIncrement(Instrs[I++], 128);
++    createPushRegisters(Instrs[I++], AArch64::X0, AArch64::X1);
++    getSystemFlag(Instrs[I++], AArch64::X1);
++    InstructionListType Addr = materializeAddress(Target, Ctx, AArch64::X0);
++    assert(Addr.size() == 2 && "Invalid Addr size");
++    std::copy(Addr.begin(), Addr.end(), Instrs.begin() + I);
++    I += Addr.size();
++    storeReg(Instrs[I++], AArch64::X2, AArch64::SP);
++    InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X2);
++    assert(Insts.size() == 2 && "Invalid Insts size");
++    std::copy(Insts.begin(), Insts.end(), Instrs.begin() + I);
++    I += Insts.size();
++    loadReg(Instrs[I++], AArch64::X2, AArch64::SP);
++    setSystemFlag(Instrs[I++], AArch64::X1);
++    createPopRegisters(Instrs[I++], AArch64::X0, AArch64::X1);
++    if (IsLeaf)
++      createStackPointerDecrement(Instrs[I++], 128);
++    return Instrs;
++  }
++
++  std::vector<MCInst> createSymbolTrampoline(const MCSymbol *TgtSym,
++                                             MCContext *Ctx) override {
++    std::vector<MCInst> Insts;
++    createShortJmp(Insts, TgtSym, Ctx, /*IsTailCall*/ true);
++    return Insts;
++  }
++
+   InstructionListType materializeAddress(const MCSymbol *Target, MCContext *Ctx,
+                                          MCPhysReg RegName,
+                                          int64_t Addend = 0) const override {
+diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+index 5e3c01a..25b6970 100644
+--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
++++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+@@ -61,6 +61,25 @@ bool isADDri(const MCInst &Inst) {
+          Inst.getOpcode() == X86::ADD64ri8;
+ }
+
++// Create instruction to increment contents of target by 1
++static InstructionListType createIncMemory(const MCSymbol *Target,
++                                           MCContext *Ctx) {
++  InstructionListType Insts;
++  Insts.emplace_back();
++  Insts.back().setOpcode(X86::LOCK_INC64m);
++  Insts.back().clear();
++  Insts.back().addOperand(MCOperand::createReg(X86::RIP));        // BaseReg
++  Insts.back().addOperand(MCOperand::createImm(1));               // ScaleAmt
++  Insts.back().addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg
++
++  Insts.back().addOperand(MCOperand::createExpr(
++      MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None,
++                              *Ctx))); // Displacement
++  Insts.back().addOperand(
++      MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
++  return Insts;
++}
++
+ #define GET_INSTRINFO_OPERAND_TYPES_ENUM
+ #define GET_INSTRINFO_OPERAND_TYPE
+ #define GET_INSTRINFO_MEM_OPERAND_SIZE
+@@ -2309,28 +2328,15 @@ public:
+     return true;
+   }
+
+-  void createLoadImmediate(MCInst &Inst, const MCPhysReg Dest,
+-                           uint32_t Imm) const override {
+-    Inst.setOpcode(X86::MOV64ri32);
+-    Inst.clear();
+-    Inst.addOperand(MCOperand::createReg(Dest));
+-    Inst.addOperand(MCOperand::createImm(Imm));
+-  }
+-
+-  bool createIncMemory(MCInst &Inst, const MCSymbol *Target,
+-                       MCContext *Ctx) const override {
+-
+-    Inst.setOpcode(X86::LOCK_INC64m);
+-    Inst.clear();
+-    Inst.addOperand(MCOperand::createReg(X86::RIP));        // BaseReg
+-    Inst.addOperand(MCOperand::createImm(1));               // ScaleAmt
+-    Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg
+-
+-    Inst.addOperand(MCOperand::createExpr(
+-        MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None,
+-                                *Ctx))); // Displacement
+-    Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
+-    return true;
++  InstructionListType createLoadImmediate(const MCPhysReg Dest,
++                                          uint64_t Imm) const override {
++    InstructionListType Insts;
++    Insts.emplace_back();
++    Insts.back().setOpcode(X86::MOV64ri32);
++    Insts.back().clear();
++    Insts.back().addOperand(MCOperand::createReg(Dest));
++    Insts.back().addOperand(MCOperand::createImm(Imm));
++    return Insts;
+   }
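// Aside (illustrative): MOV64ri32 encodes a 32-bit immediate that the CPU
// sign-extends to 64 bits, so even though the parameter is now uint64_t,
// values outside the int32_t range cannot be represented by this single
// instruction. The instrumentation call sites only pass a CallSiteID, which
// fits comfortably.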
+
+   bool createIJmp32Frag(SmallVectorImpl<MCInst> &Insts,
+@@ -3057,9 +3063,9 @@ public:
+     Inst.clear();
+   }
+
+-  InstructionListType createInstrIncMemory(const MCSymbol *Target,
+-                                           MCContext *Ctx,
+-                                           bool IsLeaf) const override {
++  InstructionListType
++  createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
++                       unsigned CodePointerSize) const override {
+     InstructionListType Instrs(IsLeaf ? 13 : 11);
+     unsigned int I = 0;
+
+@@ -3079,7 +3085,10 @@ public:
+     createClearRegWithNoEFlagsUpdate(Instrs[I++], X86::RAX, 8);
+     createX86SaveOVFlagToRegister(Instrs[I++], X86::AL);
+     // LOCK INC
+-    createIncMemory(Instrs[I++], Target, Ctx);
++    InstructionListType IncMem = createIncMemory(Target, Ctx);
++    assert(IncMem.size() == 1 && "Invalid IncMem size");
++    std::copy(IncMem.begin(), IncMem.end(), Instrs.begin() + I);
++    I += IncMem.size();
+     // POPF
+     createAddRegImm(Instrs[I++], X86::AL, 127, 1);
+     createPopRegister(Instrs[I++], X86::RAX, 8);
+@@ -3153,8 +3162,8 @@ public:
+     }
+     Insts.emplace_back();
+     createPushRegister(Insts.back(), TempReg, 8);
+-    Insts.emplace_back();
+-    createLoadImmediate(Insts.back(), TempReg, CallSiteID);
++    InstructionListType LoadImm = createLoadImmediate(TempReg, CallSiteID);
++    Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end());
+     Insts.emplace_back();
+     createPushRegister(Insts.back(), TempReg, 8);
+
+@@ -3264,7 +3273,7 @@ public:
+   }
+
+   InstructionListType createSymbolTrampoline(const MCSymbol *TgtSym,
+-                                             MCContext *Ctx) const override {
++                                             MCContext *Ctx) override {
+     InstructionListType Insts(1);
+     createUncondBranch(Insts[0], TgtSym, Ctx);
+     return Insts;
+   }
+diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt
+index 8472ce0..838c8cb 100644
+--- a/bolt/runtime/CMakeLists.txt
++++ b/bolt/runtime/CMakeLists.txt
+@@ -27,8 +27,14 @@ set(BOLT_RT_FLAGS
+   -fno-exceptions
+   -fno-rtti
+   -fno-stack-protector
+-  -mno-sse
+-  -fPIC)
++  -fPIC
++  -mgeneral-regs-only)
++if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
++  set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-sse")
++endif()
++if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
++  set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-outline-atomics")
++endif()
+
+ # Don't let the compiler think it can create calls to standard libs
+ target_compile_options(bolt_rt_instr PRIVATE ${BOLT_RT_FLAGS})
+@@ -39,7 +45,7 @@ target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+ install(TARGETS bolt_rt_instr DESTINATION "lib${LLVM_LIBDIR_SUFFIX}")
+ install(TARGETS bolt_rt_hugify DESTINATION "lib${LLVM_LIBDIR_SUFFIX}")
+
+-if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*")
++if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+   add_library(bolt_rt_instr_osx STATIC
+     instr.cpp
+     ${CMAKE_CURRENT_BINARY_DIR}/config.h
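# Context (illustrative summary, not part of the patch): the runtime is built
# freestanding. -mgeneral-regs-only keeps the compiler away from FP/SIMD
# registers, which the runtime's SAVE_ALL/RESTORE_ALL never preserve; on
# x86_64, -mno-sse serves the same purpose. -mno-outline-atomics stops the
# AArch64 compiler from emitting calls to the libgcc/compiler-rt out-of-line
# atomic helpers, which a runtime that links no external libraries cannot use.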
%%rsi\n" \ +- "push %%rbp\n" \ +- "push %%r8\n" \ +- "push %%r9\n" \ +- "push %%r10\n" \ +- "push %%r11\n" \ +- "push %%r12\n" \ +- "push %%r13\n" \ +- "push %%r14\n" \ +- "push %%r15\n" \ +- "sub $8, %%rsp\n" +- +-// Mirrors SAVE_ALL +-#define RESTORE_ALL \ +- "add $8, %%rsp\n" \ +- "pop %%r15\n" \ +- "pop %%r14\n" \ +- "pop %%r13\n" \ +- "pop %%r12\n" \ +- "pop %%r11\n" \ +- "pop %%r10\n" \ +- "pop %%r9\n" \ +- "pop %%r8\n" \ +- "pop %%rbp\n" \ +- "pop %%rsi\n" \ +- "pop %%rdi\n" \ +- "pop %%rdx\n" \ +- "pop %%rcx\n" \ +- "pop %%rbx\n" \ +- "pop %%rax\n" +- + #define PROT_READ 0x1 /* Page can be read. */ + #define PROT_WRITE 0x2 /* Page can be written. */ + #define PROT_EXEC 0x4 /* Page can be executed. */ +@@ -165,141 +123,41 @@ int memcmp(const void *s1, const void *s2, size_t n) { + // Anonymous namespace covering everything but our library entry point + namespace { + +-// Get the difference between runtime addrress of .text section and +-// static address in section header table. Can be extracted from arbitrary +-// pc value recorded at runtime to get the corresponding static address, which +-// in turn can be used to search for indirect call description. Needed because +-// indirect call descriptions are read-only non-relocatable data. +-uint64_t getTextBaseAddress() { +- uint64_t DynAddr; +- uint64_t StaticAddr; +- __asm__ volatile("leaq __hot_end(%%rip), %0\n\t" +- "movabsq $__hot_end, %1\n\t" +- : "=r"(DynAddr), "=r"(StaticAddr)); +- return DynAddr - StaticAddr; +-} +- +-constexpr uint32_t BufSize = 10240; +- +-#define _STRINGIFY(x) #x +-#define STRINGIFY(x) _STRINGIFY(x) +- +-uint64_t __read(uint64_t fd, const void *buf, uint64_t count) { +- uint64_t ret; +-#if defined(__APPLE__) +-#define READ_SYSCALL 0x2000003 +-#else +-#define READ_SYSCALL 0 +-#endif +- __asm__ __volatile__("movq $" STRINGIFY(READ_SYSCALL) ", %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(fd), "S"(buf), "d"(count) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-uint64_t __write(uint64_t fd, const void *buf, uint64_t count) { +- uint64_t ret; +-#if defined(__APPLE__) +-#define WRITE_SYSCALL 0x2000004 +-#else +-#define WRITE_SYSCALL 1 +-#endif +- __asm__ __volatile__("movq $" STRINGIFY(WRITE_SYSCALL) ", %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(fd), "S"(buf), "d"(count) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags, +- uint64_t fd, uint64_t offset) { +-#if defined(__APPLE__) +-#define MMAP_SYSCALL 0x20000c5 +-#else +-#define MMAP_SYSCALL 9 +-#endif +- void *ret; +- register uint64_t r8 asm("r8") = fd; +- register uint64_t r9 asm("r9") = offset; +- register uint64_t r10 asm("r10") = flags; +- __asm__ __volatile__("movq $" STRINGIFY(MMAP_SYSCALL) ", %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8), +- "r"(r9) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-uint64_t __munmap(void *addr, uint64_t size) { +-#if defined(__APPLE__) +-#define MUNMAP_SYSCALL 0x2000049 +-#else +-#define MUNMAP_SYSCALL 11 +-#endif +- uint64_t ret; +- __asm__ __volatile__("movq $" STRINGIFY(MUNMAP_SYSCALL) ", %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(addr), "S"(size) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} ++struct dirent64 { ++ uint64_t d_ino; /* Inode number */ ++ int64_t d_off; /* Offset to next linux_dirent */ ++ unsigned short d_reclen; /* Length of this linux_dirent */ ++ unsigned char d_type; ++ char d_name[]; /* Filename (null-terminated) */ ++ /* 
++struct dirent64 {
++  uint64_t d_ino;          /* Inode number */
++  int64_t d_off;           /* Offset to next linux_dirent */
++  unsigned short d_reclen; /* Length of this linux_dirent */
++  unsigned char d_type;
++  char d_name[];           /* Filename (null-terminated) */
++                           /* length is actually (d_reclen - 2 -
++                              offsetof(struct linux_dirent, d_name)) */
++};
+
+-#define SIG_BLOCK 0
+-#define SIG_UNBLOCK 1
+-#define SIG_SETMASK 2
++
++/* Length of the entries in `struct utsname' is 65. */
++#define _UTSNAME_LENGTH 65
+
+-static const uint64_t MaskAllSignals[] = {-1ULL};
++struct UtsNameTy {
++  char sysname[_UTSNAME_LENGTH];    /* Operating system name (e.g., "Linux") */
++  char nodename[_UTSNAME_LENGTH];   /* Name within "some implementation-defined
++                                       network" */
++  char release[_UTSNAME_LENGTH];    /* Operating system release (e.g., "2.6.28") */
++  char version[_UTSNAME_LENGTH];    /* Operating system version */
++  char machine[_UTSNAME_LENGTH];    /* Hardware identifier */
++  char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */
++};
+
+-uint64_t __sigprocmask(int how, const void *set, void *oldset) {
+-#if defined(__APPLE__)
+-#define SIGPROCMASK_SYSCALL 0x2000030
+-#else
+-#define SIGPROCMASK_SYSCALL 14
+-#endif
+-  uint64_t ret;
+-  register long r10 asm("r10") = sizeof(uint64_t);
+-  __asm__ __volatile__("movq $" STRINGIFY(SIGPROCMASK_SYSCALL) ", %%rax\n"
+-                       "syscall\n"
+-                       : "=a"(ret)
+-                       : "D"(how), "S"(set), "d"(oldset), "r"(r10)
+-                       : "cc", "rcx", "r11", "memory");
+-  return ret;
+-}
++struct timespec {
++  uint64_t tv_sec;  /* seconds */
++  uint64_t tv_nsec; /* nanoseconds */
++};
+
+-uint64_t __getpid() {
+-  uint64_t ret;
+-#if defined(__APPLE__)
+-#define GETPID_SYSCALL 20
++#if defined(__aarch64__)
++#include "sys_aarch64.h"
+ #else
+-#define GETPID_SYSCALL 39
++#include "sys_x86_64.h"
+ #endif
+-  __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n"
+-                       "syscall\n"
+-                       : "=a"(ret)
+-                       :
+-                       : "cc", "rcx", "r11", "memory");
+-  return ret;
+-}
+
+-uint64_t __exit(uint64_t code) {
+-#if defined(__APPLE__)
+-#define EXIT_SYSCALL 0x2000001
+-#else
+-#define EXIT_SYSCALL 231
+-#endif
+-  uint64_t ret;
+-  __asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n"
+-                       "syscall\n"
+-                       : "=a"(ret)
+-                       : "D"(code)
+-                       : "cc", "rcx", "r11", "memory");
+-  return ret;
+-}
++constexpr uint32_t BufSize = 10240;
+
+ // Helper functions for writing strings to the .fdata file. We intentionally
+ // avoid using libc names to make it clear it is our impl.
+@@ -415,219 +273,6 @@ static bool scanUInt32(const char *&Buf, const char *End, uint32_t &Ret) {
+   return false;
+ }
+
+-#if !defined(__APPLE__)
+-// We use a stack-allocated buffer for string manipulation in many pieces of
+-// this code, including the code that prints each line of the fdata file. This
+-// buffer needs to accomodate large function names, but shouldn't be arbitrarily
+-// large (dynamically allocated) for simplicity of our memory space usage.
+-
+-// Declare some syscall wrappers we use throughout this code to avoid linking
+-// against system libc.
+-uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
+-  uint64_t ret;
+-  __asm__ __volatile__("movq $2, %%rax\n"
+-                       "syscall"
+-                       : "=a"(ret)
+-                       : "D"(pathname), "S"(flags), "d"(mode)
+-                       : "cc", "rcx", "r11", "memory");
+-  return ret;
+-}
+-
+-struct dirent {
+-  unsigned long d_ino;     /* Inode number */
+-  unsigned long d_off;     /* Offset to next linux_dirent */
+-  unsigned short d_reclen; /* Length of this linux_dirent */
+-  char d_name[];           /* Filename (null-terminated) */
+-                           /* length is actually (d_reclen - 2 -
+-                              offsetof(struct linux_dirent, d_name)) */
+-};
+-
+-long __getdents(unsigned int fd, dirent *dirp, size_t count) {
+-  long ret;
+-  __asm__ __volatile__("movq $78, %%rax\n"
+-                       "syscall"
+-                       : "=a"(ret)
+-                       : "D"(fd), "S"(dirp), "d"(count)
+-                       : "cc", "rcx", "r11", "memory");
+-  return ret;
+-}
+-
+-uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) {
+-  uint64_t ret;
+-  __asm__ __volatile__("movq $89, %%rax\n"
+-                       "syscall"
+-                       : "=a"(ret)
+-                       : "D"(pathname), "S"(buf), "d"(bufsize)
+-                       : "cc", "rcx", "r11", "memory");
+-  return ret;
+-}
+-
+-uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
+-  uint64_t ret;
+-  __asm__ __volatile__("movq $8, %%rax\n"
+-                       "syscall\n"
+-                       : "=a"(ret)
+-                       : "D"(fd), "S"(pos), "d"(whence)
+-                       : "cc", "rcx", "r11", "memory");
+-  return ret;
+-}
+-
+-int __ftruncate(uint64_t fd, uint64_t length) {
+-  int ret;
+-  __asm__ __volatile__("movq $77, %%rax\n"
+-                       "syscall\n"
+-                       : "=a"(ret)
+-                       : "D"(fd), "S"(length)
+-                       : "cc", "rcx", "r11", "memory");
+-  return ret;
+-}
+-
+-int __close(uint64_t fd) {
+-  uint64_t ret;
+-  __asm__ __volatile__("movq $3, %%rax\n"
+-                       "syscall\n"
+-                       : "=a"(ret)
+-                       : "D"(fd)
+-                       : "cc", "rcx", "r11", "memory");
+-  return ret;
+-}
+-
+-int __madvise(void *addr, size_t length, int advice) {
+-  int ret;
+-  __asm__ __volatile__("movq $28, %%rax\n"
+-                       "syscall\n"
+-                       : "=a"(ret)
+-                       : "D"(addr), "S"(length), "d"(advice)
+-                       : "cc", "rcx", "r11", "memory");
+-  return ret;
+-}
+-
+-#define _UTSNAME_LENGTH 65
+-
+-struct UtsNameTy {
+-  char sysname[_UTSNAME_LENGTH];    /* Operating system name (e.g., "Linux") */
+-  char nodename[_UTSNAME_LENGTH];   /* Name within "some implementation-defined
+-                                       network" */
+-  char release[_UTSNAME_LENGTH];    /* Operating system release (e.g., "2.6.28") */
+-  char version[_UTSNAME_LENGTH];    /* Operating system version */
+-  char machine[_UTSNAME_LENGTH];    /* Hardware identifier */
+-  char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */
+-};
+-
+-int __uname(struct UtsNameTy *Buf) {
+-  int Ret;
+-  __asm__ __volatile__("movq $63, %%rax\n"
+-                       "syscall\n"
+-                       : "=a"(Ret)
+-                       : "D"(Buf)
+-                       : "cc", "rcx", "r11", "memory");
+-  return Ret;
+-}
+-
+-struct timespec {
+-  uint64_t tv_sec;  /* seconds */
+-  uint64_t tv_nsec; /* nanoseconds */
+-};
+-
+-uint64_t __nanosleep(const timespec *req, timespec *rem) {
+-  uint64_t ret;
+-  __asm__ __volatile__("movq $35, %%rax\n"
+-                       "syscall\n"
+-                       : "=a"(ret)
+-                       : "D"(req), "S"(rem)
+-                       : "cc", "rcx", "r11", "memory");
+-  return ret;
+-}
+-
+-int64_t __fork() {
+-  uint64_t ret;
+-  __asm__ __volatile__("movq $57, %%rax\n"
+-                       "syscall\n"
+-                       : "=a"(ret)
+-                       :
+-                       : "cc", "rcx", "r11", "memory");
+-  return ret;
+-}
+-
+-int __mprotect(void *addr, size_t len, int prot) {
+-  int ret;
+-  __asm__ __volatile__("movq $10, %%rax\n"
+-                       "syscall\n"
+-                       : "=a"(ret)
+-                       : "D"(addr), "S"(len), "d"(prot)
+-                       : "cc", "rcx", "r11", "memory");
+-  return ret;
+-}
+-
"=a"(ret) +- : +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-int __setpgid(uint64_t pid, uint64_t pgid) { +- int ret; +- __asm__ __volatile__("movq $109, %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(pid), "S"(pgid) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-uint64_t __getpgid(uint64_t pid) { +- uint64_t ret; +- __asm__ __volatile__("movq $121, %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(pid) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-int __kill(uint64_t pid, int sig) { +- int ret; +- __asm__ __volatile__("movq $62, %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(pid), "S"(sig) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-int __fsync(int fd) { +- int ret; +- __asm__ __volatile__("movq $74, %%rax\n" +- "syscall\n" +- : "=a"(ret) +- : "D"(fd) +- : "cc", "rcx", "r11", "memory"); +- return ret; +-} +- +-// %rdi %rsi %rdx %r10 %r8 +-// sys_prctl int option unsigned unsigned unsigned unsigned +-// long arg2 long arg3 long arg4 long arg5 +-int __prctl(int Option, unsigned long Arg2, unsigned long Arg3, +- unsigned long Arg4, unsigned long Arg5) { +- int Ret; +- register long rdx asm("rdx") = Arg3; +- register long r8 asm("r8") = Arg5; +- register long r10 asm("r10") = Arg4; +- __asm__ __volatile__("movq $157, %%rax\n" +- "syscall\n" +- : "=a"(Ret) +- : "D"(Option), "S"(Arg2), "d"(rdx), "r"(r10), "r"(r8) +- :); +- return Ret; +-} +- +-#endif +- + void reportError(const char *Msg, uint64_t Size) { + __write(2, Msg, Size); + __exit(1); +@@ -644,6 +289,12 @@ void assert(bool Assertion, const char *Msg) { + reportError(Buf, Ptr - Buf); + } + ++#define SIG_BLOCK 0 ++#define SIG_UNBLOCK 1 ++#define SIG_SETMASK 2 ++ ++static const uint64_t MaskAllSignals[] = {-1ULL}; ++ + class Mutex { + volatile bool InUse{false}; + +diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp +index 96a43f6..cfd113e 100644 +--- a/bolt/runtime/instr.cpp ++++ b/bolt/runtime/instr.cpp +@@ -40,7 +40,6 @@ + // + //===----------------------------------------------------------------------===// + +-#if defined (__x86_64__) + #include "common.h" + + // Enables a very verbose logging to stderr useful when debugging +@@ -695,12 +694,12 @@ static char *getBinaryPath() { + assert(static_cast(FDdir) >= 0, + "failed to open /proc/self/map_files"); + +- while (long Nread = __getdents(FDdir, (struct dirent *)Buf, BufSize)) { ++ while (long Nread = __getdents64(FDdir, (struct dirent64 *)Buf, BufSize)) { + assert(static_cast(Nread) != -1, "failed to get folder entries"); + +- struct dirent *d; ++ struct dirent64 *d; + for (long Bpos = 0; Bpos < Nread; Bpos += d->d_reclen) { +- d = (struct dirent *)(Buf + Bpos); ++ d = (struct dirent64 *)(Buf + Bpos); + + uint64_t StartAddress, EndAddress; + if (!parseAddressRange(d->d_name, StartAddress, EndAddress)) +@@ -1668,6 +1667,17 @@ instrumentIndirectCall(uint64_t Target, uint64_t IndCallID) { + /// as well as the target address for the call + extern "C" __attribute((naked)) void __bolt_instr_indirect_call() + { ++#if defined(__aarch64__) ++ // clang-format off ++ __asm__ __volatile__(SAVE_ALL ++ "ldp x0, x1, [sp, #288]\n" ++ "bl instrumentIndirectCall\n" ++ RESTORE_ALL ++ "ret\n" ++ :::); ++ // clang-format on ++#else ++ // clang-format off + __asm__ __volatile__(SAVE_ALL + "mov 0xa0(%%rsp), %%rdi\n" + "mov 0x98(%%rsp), %%rsi\n" +@@ -1675,10 +1685,23 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_call() + RESTORE_ALL + "ret\n" + :::); ++ // clang-format on ++#endif + } + + extern "C" __attribute((naked)) void 
+
+ extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
+ {
++#if defined(__aarch64__)
++  // clang-format off
++  __asm__ __volatile__(SAVE_ALL
++                       "ldp x0, x1, [sp, #288]\n"
++                       "bl instrumentIndirectCall\n"
++                       RESTORE_ALL
++                       "ret\n"
++                       :::);
++  // clang-format on
++#else
++  // clang-format off
+   __asm__ __volatile__(SAVE_ALL
+                        "mov 0x98(%%rsp), %%rdi\n"
+@@ -1686,21 +1709,48 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
+                        "mov 0x90(%%rsp), %%rsi\n"
+                        "call instrumentIndirectCall\n"
+                        RESTORE_ALL
+                        "ret\n"
+                        :::);
++  // clang-format on
++#endif
+ }
+
+ /// This is hooking ELF's entry, it needs to save all machine state.
+ extern "C" __attribute((naked)) void __bolt_instr_start()
+ {
++#if defined(__aarch64__)
++  // clang-format off
++  __asm__ __volatile__(SAVE_ALL
++                       "bl __bolt_instr_setup\n"
++                       RESTORE_ALL
++                       "adrp x16, __bolt_start_trampoline\n"
++                       "add x16, x16, #:lo12:__bolt_start_trampoline\n"
++                       "br x16\n"
++                       :::);
++  // clang-format on
++#else
++  // clang-format off
+   __asm__ __volatile__(SAVE_ALL
+                        "call __bolt_instr_setup\n"
+                        RESTORE_ALL
+                        "jmp __bolt_start_trampoline\n"
+                        :::);
++  // clang-format on
++#endif
+ }
+
+ /// This is hooking into ELF's DT_FINI
+ extern "C" void __bolt_instr_fini() {
+-  __bolt_fini_trampoline();
++#if defined(__aarch64__)
++  // clang-format off
++  __asm__ __volatile__(SAVE_ALL
++                       "adrp x16, __bolt_fini_trampoline\n"
++                       "add x16, x16, #:lo12:__bolt_fini_trampoline\n"
++                       "blr x16\n"
++                       RESTORE_ALL
++                       :::);
++  // clang-format on
++#else
++  __asm__ __volatile__("call __bolt_fini_trampoline\n" :::);
++#endif
+   if (__bolt_instr_sleep_time == 0) {
+     int FD = openProfile();
+     __bolt_instr_data_dump(FD);
+@@ -1752,4 +1802,3 @@ void _bolt_instr_fini() {
+ }
+
+ #endif
+-#endif
+diff --git a/bolt/runtime/sys_aarch64.h b/bolt/runtime/sys_aarch64.h
+new file mode 100644
+index 0000000..77c9cfc
+--- /dev/null
++++ b/bolt/runtime/sys_aarch64.h
+@@ -0,0 +1,394 @@
++#ifndef LLVM_TOOLS_LLVM_BOLT_SYS_AARCH64
++#define LLVM_TOOLS_LLVM_BOLT_SYS_AARCH64
++
++// Save all registers while keeping 16B stack alignment
++#define SAVE_ALL \
++  "stp x0, x1, [sp, #-16]!\n" \
++  "stp x2, x3, [sp, #-16]!\n" \
++  "stp x4, x5, [sp, #-16]!\n" \
++  "stp x6, x7, [sp, #-16]!\n" \
++  "stp x8, x9, [sp, #-16]!\n" \
++  "stp x10, x11, [sp, #-16]!\n" \
++  "stp x12, x13, [sp, #-16]!\n" \
++  "stp x14, x15, [sp, #-16]!\n" \
++  "stp x16, x17, [sp, #-16]!\n" \
++  "stp x18, x19, [sp, #-16]!\n" \
++  "stp x20, x21, [sp, #-16]!\n" \
++  "stp x22, x23, [sp, #-16]!\n" \
++  "stp x24, x25, [sp, #-16]!\n" \
++  "stp x26, x27, [sp, #-16]!\n" \
++  "stp x28, x29, [sp, #-16]!\n" \
++  "str x30, [sp,#-16]!\n"
++// Mirrors SAVE_ALL
++#define RESTORE_ALL \
++  "ldr x30, [sp], #16\n" \
++  "ldp x28, x29, [sp], #16\n" \
++  "ldp x26, x27, [sp], #16\n" \
++  "ldp x24, x25, [sp], #16\n" \
++  "ldp x22, x23, [sp], #16\n" \
++  "ldp x20, x21, [sp], #16\n" \
++  "ldp x18, x19, [sp], #16\n" \
++  "ldp x16, x17, [sp], #16\n" \
++  "ldp x14, x15, [sp], #16\n" \
++  "ldp x12, x13, [sp], #16\n" \
++  "ldp x10, x11, [sp], #16\n" \
++  "ldp x8, x9, [sp], #16\n" \
++  "ldp x6, x7, [sp], #16\n" \
++  "ldp x4, x5, [sp], #16\n" \
++  "ldp x2, x3, [sp], #16\n" \
++  "ldp x0, x1, [sp], #16\n"
++
++// Anonymous namespace covering everything but our library entry point
++namespace {
++
++// Get the difference between the runtime address of the .text section and
++// the static address in the section header table. It can be subtracted from
++// an arbitrary pc value recorded at runtime to get the corresponding static
++// address, which in turn can be used to search for an indirect call
++// description. Needed because indirect call descriptions are read-only
++// non-relocatable data.
++uint64_t getTextBaseAddress() {
++  uint64_t DynAddr;
++  uint64_t StaticAddr;
++  __asm__ volatile("b .instr%=\n\t"
++                   ".StaticAddr%=:\n\t"
++                   ".dword __hot_end\n\t"
++                   ".instr%=:\n\t"
++                   "ldr %0, .StaticAddr%=\n\t"
++                   "adrp %1, __hot_end\n\t"
++                   "add %1, %1, :lo12:__hot_end\n\t"
++                   : "=r"(StaticAddr), "=r"(DynAddr));
++  return DynAddr - StaticAddr;
++}
++
++uint64_t __read(uint64_t fd, const void *buf, uint64_t count) {
++  uint64_t ret;
++  register uint64_t x0 __asm__("x0") = fd;
++  register const void *x1 __asm__("x1") = buf;
++  register uint64_t x2 __asm__("x2") = count;
++  register uint32_t w8 __asm__("w8") = 63;
++  __asm__ __volatile__("svc #0\n"
++                       "mov %0, x0"
++                       : "=r"(ret), "+r"(x0), "+r"(x1)
++                       : "r"(x2), "r"(w8)
++                       : "cc", "memory");
++  return ret;
++}
++
++uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
++  uint64_t ret;
++  register uint64_t x0 __asm__("x0") = fd;
++  register const void *x1 __asm__("x1") = buf;
++  register uint64_t x2 __asm__("x2") = count;
++  register uint32_t w8 __asm__("w8") = 64;
++  __asm__ __volatile__("svc #0\n"
++                       "mov %0, x0"
++                       : "=r"(ret), "+r"(x0), "+r"(x1)
++                       : "r"(x2), "r"(w8)
++                       : "cc", "memory");
++  return ret;
++}
++
++void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags,
++             uint64_t fd, uint64_t offset) {
++  void *ret;
++  register uint64_t x0 __asm__("x0") = addr;
++  register uint64_t x1 __asm__("x1") = size;
++  register uint64_t x2 __asm__("x2") = prot;
++  register uint64_t x3 __asm__("x3") = flags;
++  register uint64_t x4 __asm__("x4") = fd;
++  register uint64_t x5 __asm__("x5") = offset;
++  register uint32_t w8 __asm__("w8") = 222;
++  __asm__ __volatile__("svc #0\n"
++                       "mov %0, x0"
++                       : "=r"(ret), "+r"(x0), "+r"(x1)
++                       : "r"(x2), "r"(x3), "r"(x4), "r"(x5), "r"(w8)
++                       : "cc", "memory");
++  return ret;
++}
++
++uint64_t __munmap(void *addr, uint64_t size) {
++  uint64_t ret;
++  register void *x0 __asm__("x0") = addr;
++  register uint64_t x1 __asm__("x1") = size;
++  register uint32_t w8 __asm__("w8") = 215;
++  __asm__ __volatile__("svc #0\n"
++                       "mov %0, x0"
++                       : "=r"(ret), "+r"(x0), "+r"(x1)
++                       : "r"(w8)
++                       : "cc", "memory");
++  return ret;
++}
++
++uint64_t __exit(uint64_t code) {
++  uint64_t ret;
++  register uint64_t x0 __asm__("x0") = code;
++  register uint32_t w8 __asm__("w8") = 94;
++  __asm__ __volatile__("svc #0\n"
++                       "mov %0, x0"
++                       : "=r"(ret), "+r"(x0)
++                       : "r"(w8)
++                       : "cc", "memory", "x1");
++  return ret;
++}
++
++uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
++  uint64_t ret;
++  register int x0 __asm__("x0") = -100;
++  register const char *x1 __asm__("x1") = pathname;
++  register uint64_t x2 __asm__("x2") = flags;
++  register uint64_t x3 __asm__("x3") = mode;
++  register uint32_t w8 __asm__("w8") = 56;
++  __asm__ __volatile__("svc #0\n"
++                       "mov %0, x0"
++                       : "=r"(ret), "+r"(x0), "+r"(x1)
++                       : "r"(x2), "r"(x3), "r"(w8)
++                       : "cc", "memory");
++  return ret;
++}
++
++long __getdents64(unsigned int fd, dirent64 *dirp, size_t count) {
++  long ret;
++  register unsigned int x0 __asm__("x0") = fd;
++  register dirent64 *x1 __asm__("x1") = dirp;
++  register size_t x2 __asm__("x2") = count;
++  register uint32_t w8 __asm__("w8") = 61;
++  __asm__ __volatile__("svc #0\n"
++                       "mov %0, x0"
++                       : "=r"(ret), "+r"(x0), "+r"(x1)
++                       : "r"(x2), "r"(w8)
++                       : "cc", "memory");
++  return ret;
++}
++
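// Note (illustrative): AArch64 Linux provides only the *at() family of these
// syscalls, so __open above is really openat(2) (w8 = 56) and __readlink
// below is readlinkat(2) (w8 = 78). The -100 passed in x0 is AT_FDCWD, which
// makes them behave like the classic cwd-relative open(2)/readlink(2).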
++uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) {
++  uint64_t ret;
++  register int x0 __asm__("x0") = -100;
++  register const char *x1 __asm__("x1") = pathname;
++  register char *x2 __asm__("x2") = buf;
++  register size_t x3 __asm__("x3") = bufsize;
++  register uint32_t w8 __asm__("w8") = 78; // readlinkat
++  __asm__ __volatile__("svc #0\n"
++                       "mov %0, x0"
++                       : "=r"(ret), "+r"(x0), "+r"(x1)
++                       : "r"(x2), "r"(x3), "r"(w8)
++                       : "cc", "memory");
++  return ret;
++}
++
++uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
++  uint64_t ret;
++  register uint64_t x0 __asm__("x0") = fd;
++  register uint64_t x1 __asm__("x1") = pos;
++  register uint64_t x2 __asm__("x2") = whence;
++  register uint32_t w8 __asm__("w8") = 62;
++  __asm__ __volatile__("svc #0\n"
++                       "mov %0, x0"
++                       : "=r"(ret), "+r"(x0), "+r"(x1)
++                       : "r"(x2), "r"(w8)
++                       : "cc", "memory");
++  return ret;
++}
++
++int __ftruncate(uint64_t fd, uint64_t length) {
++  int ret;
++  register uint64_t x0 __asm__("x0") = fd;
++  register uint64_t x1 __asm__("x1") = length;
++  register uint32_t w8 __asm__("w8") = 46;
++  __asm__ __volatile__("svc #0\n"
++                       "mov %w0, w0"
++                       : "=r"(ret), "+r"(x0), "+r"(x1)
++                       : "r"(w8)
++                       : "cc", "memory");
++  return ret;
++}
++
++int __close(uint64_t fd) {
++  int ret;
++  register uint64_t x0 __asm__("x0") = fd;
++  register uint32_t w8 __asm__("w8") = 57;
++  __asm__ __volatile__("svc #0\n"
++                       "mov %w0, w0"
++                       : "=r"(ret), "+r"(x0)
++                       : "r"(w8)
++                       : "cc", "memory", "x1");
++  return ret;
++}
++
++int __madvise(void *addr, size_t length, int advice) {
++  int ret;
++  register void *x0 __asm__("x0") = addr;
++  register size_t x1 __asm__("x1") = length;
++  register int x2 __asm__("x2") = advice;
++  register uint32_t w8 __asm__("w8") = 233;
++  __asm__ __volatile__("svc #0\n"
++                       "mov %w0, w0"
++                       : "=r"(ret), "+r"(x0), "+r"(x1)
++                       : "r"(x2), "r"(w8)
++                       : "cc", "memory");
++  return ret;
++}
++
++int __uname(struct UtsNameTy *buf) {
++  int ret;
++  register UtsNameTy *x0 __asm__("x0") = buf;
++  register uint32_t w8 __asm__("w8") = 160;
++  __asm__ __volatile__("svc #0\n"
++                       "mov %w0, w0"
++                       : "=r"(ret), "+r"(x0)
++                       : "r"(w8)
++                       : "cc", "memory", "x1");
++  return ret;
++}
++
++uint64_t __nanosleep(const timespec *req, timespec *rem) {
++  uint64_t ret;
++  register const timespec *x0 __asm__("x0") = req;
++  register timespec *x1 __asm__("x1") = rem;
++  register uint32_t w8 __asm__("w8") = 101;
++  __asm__ __volatile__("svc #0\n"
++                       "mov %0, x0"
++                       : "=r"(ret), "+r"(x0), "+r"(x1)
++                       : "r"(w8)
++                       : "cc", "memory");
++  return ret;
++}
++
++int64_t __fork() {
++  uint64_t ret;
++  // clone instead of fork with flags
++  // "CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD"
++  register uint64_t x0 __asm__("x0") = 0x1200011;
++  register uint64_t x1 __asm__("x1") = 0;
++  register uint64_t x2 __asm__("x2") = 0;
++  register uint64_t x3 __asm__("x3") = 0;
++  register uint64_t x4 __asm__("x4") = 0;
++  register uint32_t w8 __asm__("w8") = 220;
++  __asm__ __volatile__("svc #0\n"
++                       "mov %0, x0"
++                       : "=r"(ret), "+r"(x0), "+r"(x1)
++                       : "r"(x2), "r"(x3), "r"(x4), "r"(w8)
++                       : "cc", "memory");
++  return ret;
++}
++
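// Flag check (illustrative arithmetic): CLONE_CHILD_CLEARTID is 0x00200000,
// CLONE_CHILD_SETTID is 0x01000000, and SIGCHLD is 17 = 0x11, so
// 0x01000000 | 0x00200000 | 0x11 == 0x1200011, matching the x0 value above.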
: "r"(w8) ++ : "cc", "memory", "x0", "x1"); ++ return ret; ++} ++ ++uint64_t __getppid() { ++ uint64_t ret; ++ register uint32_t w8 __asm__("w8") = 173; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret) ++ : "r"(w8) ++ : "cc", "memory", "x0", "x1"); ++ return ret; ++} ++ ++int __setpgid(uint64_t pid, uint64_t pgid) { ++ int ret; ++ register uint64_t x0 __asm__("x0") = pid; ++ register uint64_t x1 __asm__("x1") = pgid; ++ register uint32_t w8 __asm__("w8") = 154; ++ __asm__ __volatile__("svc #0\n" ++ "mov %w0, w0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++uint64_t __getpgid(uint64_t pid) { ++ uint64_t ret; ++ register uint64_t x0 __asm__("x0") = pid; ++ register uint32_t w8 __asm__("w8") = 155; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret), "+r"(x0) ++ : "r"(w8) ++ : "cc", "memory", "x1"); ++ return ret; ++} ++ ++int __kill(uint64_t pid, int sig) { ++ int ret; ++ register uint64_t x0 __asm__("x0") = pid; ++ register int x1 __asm__("x1") = sig; ++ register uint32_t w8 __asm__("w8") = 129; ++ __asm__ __volatile__("svc #0\n" ++ "mov %w0, w0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++int __fsync(int fd) { ++ int ret; ++ register int x0 __asm__("x0") = fd; ++ register uint32_t w8 __asm__("w8") = 82; ++ __asm__ __volatile__("svc #0\n" ++ "mov %w0, w0" ++ : "=r"(ret), "+r"(x0) ++ : "r"(w8) ++ : "cc", "memory", "x1"); ++ return ret; ++} ++ ++uint64_t __sigprocmask(int how, const void *set, void *oldset) { ++ uint64_t ret; ++ register int x0 __asm__("x0") = how; ++ register const void *x1 __asm__("x1") = set; ++ register void *x2 __asm__("x2") = oldset; ++ register long x3 asm("x3") = 8; ++ register uint32_t w8 __asm__("w8") = 135; ++ __asm__ __volatile__("svc #0\n" ++ "mov %0, x0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(x2), "r"(x3), "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++int __prctl(int option, unsigned long arg2, unsigned long arg3, ++ unsigned long arg4, unsigned long arg5) { ++ int ret; ++ register int x0 __asm__("x0") = option; ++ register unsigned long x1 __asm__("x1") = arg2; ++ register unsigned long x2 __asm__("x2") = arg3; ++ register unsigned long x3 __asm__("x3") = arg4; ++ register unsigned long x4 __asm__("x4") = arg5; ++ register uint32_t w8 __asm__("w8") = 167; ++ __asm__ __volatile__("svc #0\n" ++ "mov %w0, w0" ++ : "=r"(ret), "+r"(x0), "+r"(x1) ++ : "r"(x2), "r"(x3), "r"(x4), "r"(w8) ++ : "cc", "memory"); ++ return ret; ++} ++ ++} // anonymous namespace ++ ++#endif +diff --git a/bolt/runtime/sys_x86_64.h b/bolt/runtime/sys_x86_64.h +new file mode 100644 +index 0000000..ca2c693 +--- /dev/null ++++ b/bolt/runtime/sys_x86_64.h +@@ -0,0 +1,360 @@ ++#ifndef LLVM_TOOLS_LLVM_BOLT_SYS_X86_64 ++#define LLVM_TOOLS_LLVM_BOLT_SYS_X86_64 ++ ++// Save all registers while keeping 16B stack alignment ++#define SAVE_ALL \ ++ "push %%rax\n" \ ++ "push %%rbx\n" \ ++ "push %%rcx\n" \ ++ "push %%rdx\n" \ ++ "push %%rdi\n" \ ++ "push %%rsi\n" \ ++ "push %%rbp\n" \ ++ "push %%r8\n" \ ++ "push %%r9\n" \ ++ "push %%r10\n" \ ++ "push %%r11\n" \ ++ "push %%r12\n" \ ++ "push %%r13\n" \ ++ "push %%r14\n" \ ++ "push %%r15\n" \ ++ "sub $8, %%rsp\n" ++// Mirrors SAVE_ALL ++#define RESTORE_ALL \ ++ "add $8, %%rsp\n" \ ++ "pop %%r15\n" \ ++ "pop %%r14\n" \ ++ "pop %%r13\n" \ ++ "pop %%r12\n" \ ++ "pop %%r11\n" \ ++ "pop %%r10\n" \ ++ "pop %%r9\n" \ ++ "pop %%r8\n" \ ++ "pop %%rbp\n" \ ++ "pop %%rsi\n" \ ++ "pop %%rdi\n" \ ++ "pop %%rdx\n" \ ++ "pop %%rcx\n" 
\
++  "pop %%rbx\n" \
++  "pop %%rax\n"
++
++namespace {
++
++// Get the difference between the runtime address of the .text section and
++// its static address in the section header table. Subtracting it from an
++// arbitrary pc value recorded at runtime yields the corresponding static
++// address, which in turn can be used to search for an indirect call
++// description. Needed because indirect call descriptions are read-only
++// non-relocatable data.
++uint64_t getTextBaseAddress() {
++  uint64_t DynAddr;
++  uint64_t StaticAddr;
++  __asm__ volatile("leaq __hot_end(%%rip), %0\n\t"
++                   "movabsq $__hot_end, %1\n\t"
++                   : "=r"(DynAddr), "=r"(StaticAddr));
++  return DynAddr - StaticAddr;
++}
++
++#define _STRINGIFY(x) #x
++#define STRINGIFY(x) _STRINGIFY(x)
++
++uint64_t __read(uint64_t fd, const void *buf, uint64_t count) {
++  uint64_t ret;
++#if defined(__APPLE__)
++#define READ_SYSCALL 0x2000003
++#else
++#define READ_SYSCALL 0
++#endif
++  __asm__ __volatile__("movq $" STRINGIFY(READ_SYSCALL) ", %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       : "D"(fd), "S"(buf), "d"(count)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
++  uint64_t ret;
++#if defined(__APPLE__)
++#define WRITE_SYSCALL 0x2000004
++#else
++#define WRITE_SYSCALL 1
++#endif
++  __asm__ __volatile__("movq $" STRINGIFY(WRITE_SYSCALL) ", %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       : "D"(fd), "S"(buf), "d"(count)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags,
++             uint64_t fd, uint64_t offset) {
++#if defined(__APPLE__)
++#define MMAP_SYSCALL 0x20000c5
++#else
++#define MMAP_SYSCALL 9
++#endif
++  void *ret;
++  register uint64_t r8 asm("r8") = fd;
++  register uint64_t r9 asm("r9") = offset;
++  register uint64_t r10 asm("r10") = flags;
++  __asm__ __volatile__("movq $" STRINGIFY(MMAP_SYSCALL) ", %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8),
++                         "r"(r9)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++uint64_t __munmap(void *addr, uint64_t size) {
++#if defined(__APPLE__)
++#define MUNMAP_SYSCALL 0x2000049
++#else
++#define MUNMAP_SYSCALL 11
++#endif
++  uint64_t ret;
++  __asm__ __volatile__("movq $" STRINGIFY(MUNMAP_SYSCALL) ", %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       : "D"(addr), "S"(size)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++uint64_t __sigprocmask(int how, const void *set, void *oldset) {
++#if defined(__APPLE__)
++#define SIGPROCMASK_SYSCALL 0x2000030
++#else
++#define SIGPROCMASK_SYSCALL 14
++#endif
++  uint64_t ret;
++  register long r10 asm("r10") = sizeof(uint64_t);
++  __asm__ __volatile__("movq $" STRINGIFY(SIGPROCMASK_SYSCALL) ", %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       : "D"(how), "S"(set), "d"(oldset), "r"(r10)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++uint64_t __getpid() {
++  uint64_t ret;
++#if defined(__APPLE__)
++#define GETPID_SYSCALL 20
++#else
++#define GETPID_SYSCALL 39
++#endif
++  __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       :
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++uint64_t __exit(uint64_t code) {
++#if defined(__APPLE__)
++#define EXIT_SYSCALL 0x2000001
++#else
++#define EXIT_SYSCALL 231
++#endif
++  uint64_t ret;
++  __asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       : "D"(code)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++#if !defined(__APPLE__)
++// We use a stack-allocated buffer for string manipulation in many pieces of
++// this code, including the code that prints each line of the fdata file. This
++// buffer needs to accommodate large function names, but shouldn't be
++// arbitrarily large (dynamically allocated), to keep our memory usage simple.
++
++// Declare some syscall wrappers we use throughout this code to avoid linking
++// against system libc.
++uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
++  uint64_t ret;
++  __asm__ __volatile__("movq $2, %%rax\n"
++                       "syscall"
++                       : "=a"(ret)
++                       : "D"(pathname), "S"(flags), "d"(mode)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++long __getdents64(unsigned int fd, dirent64 *dirp, size_t count) {
++  long ret;
++  __asm__ __volatile__("movq $217, %%rax\n"
++                       "syscall"
++                       : "=a"(ret)
++                       : "D"(fd), "S"(dirp), "d"(count)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) {
++  uint64_t ret;
++  __asm__ __volatile__("movq $89, %%rax\n"
++                       "syscall"
++                       : "=a"(ret)
++                       : "D"(pathname), "S"(buf), "d"(bufsize)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
++  uint64_t ret;
++  __asm__ __volatile__("movq $8, %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       : "D"(fd), "S"(pos), "d"(whence)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++int __ftruncate(uint64_t fd, uint64_t length) {
++  int ret;
++  __asm__ __volatile__("movq $77, %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       : "D"(fd), "S"(length)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++int __close(uint64_t fd) {
++  uint64_t ret;
++  __asm__ __volatile__("movq $3, %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       : "D"(fd)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++int __madvise(void *addr, size_t length, int advice) {
++  int ret;
++  __asm__ __volatile__("movq $28, %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       : "D"(addr), "S"(length), "d"(advice)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++int __uname(struct UtsNameTy *Buf) {
++  int Ret;
++  __asm__ __volatile__("movq $63, %%rax\n"
++                       "syscall\n"
++                       : "=a"(Ret)
++                       : "D"(Buf)
++                       : "cc", "rcx", "r11", "memory");
++  return Ret;
++}
++
++uint64_t __nanosleep(const timespec *req, timespec *rem) {
++  uint64_t ret;
++  __asm__ __volatile__("movq $35, %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       : "D"(req), "S"(rem)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++int64_t __fork() {
++  uint64_t ret;
++  __asm__ __volatile__("movq $57, %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       :
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++int __mprotect(void *addr, size_t len, int prot) {
++  int ret;
++  __asm__ __volatile__("movq $10, %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       : "D"(addr), "S"(len), "d"(prot)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++uint64_t __getppid() {
++  uint64_t ret;
++  __asm__ __volatile__("movq $110, %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       :
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++int __setpgid(uint64_t pid, uint64_t pgid) {
++  int ret;
++  __asm__ __volatile__("movq $109, %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       : "D"(pid), "S"(pgid)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++uint64_t __getpgid(uint64_t pid) {
++  uint64_t ret;
++  __asm__ __volatile__("movq $121, %%rax\n"
++                       "syscall\n"
++                       : "=a"(ret)
++                       : "D"(pid)
++                       : "cc", "rcx", "r11", "memory");
++  return ret;
++}
++
++int 
__kill(uint64_t pid, int sig) { ++ int ret; ++ __asm__ __volatile__("movq $62, %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(pid), "S"(sig) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++int __fsync(int fd) { ++ int ret; ++ __asm__ __volatile__("movq $74, %%rax\n" ++ "syscall\n" ++ : "=a"(ret) ++ : "D"(fd) ++ : "cc", "rcx", "r11", "memory"); ++ return ret; ++} ++ ++// %rdi %rsi %rdx %r10 %r8 ++// sys_prctl int option unsigned unsigned unsigned unsigned ++// long arg2 long arg3 long arg4 long arg5 ++int __prctl(int Option, unsigned long Arg2, unsigned long Arg3, ++ unsigned long Arg4, unsigned long Arg5) { ++ int Ret; ++ register long rdx asm("rdx") = Arg3; ++ register long r8 asm("r8") = Arg5; ++ register long r10 asm("r10") = Arg4; ++ __asm__ __volatile__("movq $157, %%rax\n" ++ "syscall\n" ++ : "=a"(Ret) ++ : "D"(Option), "S"(Arg2), "d"(rdx), "r"(r10), "r"(r8) ++ :); ++ return Ret; ++} ++ ++#endif ++ ++} // anonymous namespace ++ ++#endif +diff --git a/bolt/test/AArch64/exclusive-instrument.s b/bolt/test/AArch64/exclusive-instrument.s +new file mode 100644 +index 0000000..502dd83 +--- /dev/null ++++ b/bolt/test/AArch64/exclusive-instrument.s +@@ -0,0 +1,39 @@ ++// This test checks that the foo function having exclusive memory access ++// instructions won't be instrumented. ++ ++// REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}} ++ ++// RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ ++// RUN: %s -o %t.o ++// RUN: %clang %cflags -fPIC -pie %t.o -o %t.exe -nostdlib -Wl,-q -Wl,-fini=dummy ++// RUN: llvm-bolt %t.exe -o %t.bolt -instrument -v=1 | FileCheck %s ++ ++// CHECK: Function foo has exclusive instructions, skip instrumentation ++ ++.global foo ++.type foo, %function ++foo: ++ ldaxr w9, [x10] ++ cbnz w9, .Lret ++ stlxr w12, w11, [x9] ++ cbz w12, foo ++ clrex ++.Lret: ++ ret ++.size foo, .-foo ++ ++.global _start ++.type _start, %function ++_start: ++ cmp x0, #0 ++ b.eq .Lexit ++ bl foo ++.Lexit: ++ ret ++.size _start, .-_start ++ ++.global dummy ++.type dummy, %function ++dummy: ++ ret ++.size dummy, .-dummy +diff --git a/bolt/test/X86/asm-dump.c b/bolt/test/X86/asm-dump.c +index 5d85e2a..fdd448e 100644 +--- a/bolt/test/X86/asm-dump.c ++++ b/bolt/test/X86/asm-dump.c +@@ -1,13 +1,14 @@ + /** + * Test for asm-dump functionality. + * +- * REQUIRES: system-linux,bolt-runtime ++ * REQUIRES: x86_64-linux,bolt-runtime + * + * Compile the source + * RUN: %clang -fPIC %s -o %t.exe -Wl,-q + * + * Profile collection: instrument the binary +- * RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata -o %t.instr ++ * RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata -o \ ++ * RUN: %t.instr + * + * Profile collection: run instrumented binary (and capture output) + * RUN: %t.instr > %t.result +diff --git a/bolt/test/X86/bolt-address-translation-internal-call.test b/bolt/test/X86/bolt-address-translation-internal-call.test +index edc32d9..24cb635 100644 +--- a/bolt/test/X86/bolt-address-translation-internal-call.test ++++ b/bolt/test/X86/bolt-address-translation-internal-call.test +@@ -4,12 +4,12 @@ + # internal calls) might create new blocks without a mapping to an + # input block. 
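
As background for the exclusive-instrument.s test above: on AArch64, atomic
read-modify-write code compiled without LSE atomics lowers to a
load-exclusive/store-exclusive (ldaxr/stlxr) retry loop, and software should
avoid other memory accesses between the exclusive pair. BOLT's counter updates
are plain loads and stores into the profile area, so inserting them inside such
a loop could keep clearing the exclusive monitor and stall the loop, which is
what the "skip instrumentation" message checked above guards against. The C11
sketch below is illustrative only, not part of this patch, and the function
name is hypothetical; it shows the kind of source that produces these loops:

    // Illustrative only: built for plain ARMv8-A (no LSE atomics), the weak
    // compare-exchange below becomes a ldaxr/stlxr retry loop. An extra
    // load or store inserted between the pair could make stlxr fail forever.
    #include <stdatomic.h>

    int increment_if_below(atomic_int *v, int limit) {
      int cur = atomic_load_explicit(v, memory_order_relaxed);
      while (cur < limit) {
        // On failure, cur is refreshed with the current value of *v.
        if (atomic_compare_exchange_weak_explicit(v, &cur, cur + 1,
                                                  memory_order_acq_rel,
                                                  memory_order_relaxed))
          return 1; // stored cur + 1
      }
      return 0; // *v already reached limit
    }

This is presumably why the pass conservatively skips whole functions containing
exclusive instructions instead of trying to instrument around the loop.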
+ +-# REQUIRES: system-linux,bolt-runtime ++# REQUIRES: x86_64-linux,bolt-runtime + + # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o + # Delete our BB symbols so BOLT doesn't mark them as entry points + # RUN: llvm-strip --strip-unneeded %t.o +-# RUN: %clang %t.o -o %t.exe -Wl,-q ++# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q + + # RUN: llvm-bolt --enable-bat %t.exe --relocs -o %t.out | FileCheck %s + # CHECK: BOLT-INFO: Wrote {{.*}} BAT maps +@@ -29,6 +29,7 @@ main: + push %rbx + sub $0x120,%rsp + mov $0x3,%rbx ++ movq rel(%rip), %rdi + .J1: + cmp $0x0,%rbx + je .J2 +@@ -49,4 +50,8 @@ main: + .J4: + pop %rbp + retq ++end: + .size main, .-main ++ ++ .data ++rel: .quad end +diff --git a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp +index f6ebd6b..4ed8be4 100644 +--- a/bolt/test/X86/instrumentation-eh_frame_hdr.cpp ++++ b/bolt/test/X86/instrumentation-eh_frame_hdr.cpp +@@ -1,7 +1,7 @@ + // This test checks that .eh_frame_hdr address is in bounds of the last LOAD + // end address i.e. the section address is smaller then the LOAD end address. + +-// REQUIRES: system-linux,bolt-runtime ++// REQUIRES: system-linux,bolt-runtime,target=x86_64{{.*}} + + // RUN: %clangxx %cxxflags -static -Wl,-q %s -o %t.exe -Wl,--entry=_start + // RUN: llvm-bolt %t.exe -o %t.instr -instrument \ +diff --git a/bolt/test/X86/internal-call-instrument.s b/bolt/test/X86/internal-call-instrument.s +index c137174..c393f1d 100644 +--- a/bolt/test/X86/internal-call-instrument.s ++++ b/bolt/test/X86/internal-call-instrument.s +@@ -1,15 +1,23 @@ + # This reproduces a bug with instrumentation crashes on internal call + +-# REQUIRES: system-linux,bolt-runtime ++# REQUIRES: x86_64-linux,bolt-runtime,target=x86_64{{.*}} + + # RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o + # Delete our BB symbols so BOLT doesn't mark them as entry points + # RUN: llvm-strip --strip-unneeded %t.o +-# RUN: %clang %t.o -o %t.exe -Wl,-q ++# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q + + # RUN: llvm-bolt --instrument %t.exe --relocs -o %t.out + + .text ++ .globl _start ++ .type _start, %function ++ .p2align 4 ++_start: ++ call main ++ ret ++ .size _start, .-_start ++ + .globl main + .type main, %function + .p2align 4 +@@ -20,6 +28,7 @@ main: + push %rbx + sub $0x120,%rsp + mov $0x3,%rbx ++ movq rel(%rip), %rdi + .J1: + cmp $0x0,%rbx + je .J2 +@@ -40,4 +49,15 @@ main: + .J4: + pop %rbp + retq ++end: + .size main, .-main ++ ++ .globl _fini ++ .type _fini, %function ++ .p2align 4 ++_fini: ++ hlt ++ .size _fini, .-_fini ++ ++ .data ++rel: .quad end +diff --git a/bolt/test/X86/tail-duplication-pass.s b/bolt/test/X86/tail-duplication-pass.s +index 677f498..ed50cc5 100644 +--- a/bolt/test/X86/tail-duplication-pass.s ++++ b/bolt/test/X86/tail-duplication-pass.s +@@ -7,12 +7,21 @@ + # RUN: llvm-bolt %t.exe --data %t.fdata --reorder-blocks=ext-tsp \ + # RUN: --print-finalized --tail-duplication=moderate \ + # RUN: --tail-duplication-minimum-offset=1 -o %t.out | FileCheck %s ++# RUN: llvm-bolt %t.exe --data %t.fdata --print-finalized \ ++# RUN: --tail-duplication=aggressive --tail-duplication-minimum-offset=1 \ ++# RUN: -o %t.out | FileCheck %s --check-prefix CHECK-NOLOOP + + # FDATA: 1 main 2 1 main #.BB2# 0 10 + # FDATA: 1 main 4 1 main #.BB2# 0 20 + # CHECK: BOLT-INFO: tail duplication modified 1 ({{.*}}%) functions; duplicated 1 blocks (1 bytes) responsible for {{.*}} dynamic executions ({{.*}}% of all block executions) + # CHECK: BB Layout : .LBB00, .Ltail-dup0, 
.Ltmp0, .Ltmp1
+
++# Check that the successor of Ltail-dup0 is .LBB00, not itself.
++# CHECK-NOLOOP: .Ltail-dup0 (1 instructions, align : 1)
++# CHECK-NOLOOP: Predecessors: .LBB00
++# CHECK-NOLOOP: retq
++# CHECK-NOLOOP: .Ltmp0 (1 instructions, align : 1)
++
+ .text
+ .globl main
+ .type main, %function
+diff --git a/bolt/test/assume-abi.test b/bolt/test/assume-abi.test
+new file mode 100644
+index 0000000..688ab01
+--- /dev/null
++++ b/bolt/test/assume-abi.test
+@@ -0,0 +1,7 @@
++# Validate the usage of the `--assume-abi` option in conjunction with
++# options related to the RegAnalysis Pass.
++
++REQUIRES: system-linux
++
++RUN: %clang %cflags %p/Inputs/hello.c -o %t -Wl,-q
++RUN: llvm-bolt %t -o %t.bolt --assume-abi --indirect-call-promotion=all
+diff --git a/bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s b/bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s
+new file mode 100644
+index 0000000..fa1ac35
+--- /dev/null
++++ b/bolt/test/runtime/AArch64/Inputs/basic-instrumentation.s
+@@ -0,0 +1,9 @@
++  .globl main
++  .type main, %function
++main:
++  sub sp, sp, #16
++  mov w0, wzr
++  str wzr, [sp, #12]
++  add sp, sp, #16
++  ret
++.size main, .-main
+diff --git a/bolt/test/runtime/AArch64/basic-instrumentation.test b/bolt/test/runtime/AArch64/basic-instrumentation.test
+new file mode 100644
+index 0000000..0f77b0c
+--- /dev/null
++++ b/bolt/test/runtime/AArch64/basic-instrumentation.test
+@@ -0,0 +1,22 @@
++# Try to instrument a very fast test. Input bin will not execute any code during
++# runtime besides returning zero in main, so it is a good trivial case.
++REQUIRES: system-linux,bolt-runtime
++
++RUN: %clang %p/Inputs/basic-instrumentation.s -Wl,-q -o %t.exe
++RUN: llvm-bolt %t.exe -o %t --instrument \
++RUN:   --instrumentation-file=%t \
++RUN:   --instrumentation-file-append-pid
++
++# Execute program to collect profile
++RUN: rm %t.*.fdata || echo Nothing to remove
++RUN: %t
++
++# Profile should be written to %t.PID.fdata, check it
++RUN: mv %t.*.fdata %t.fdata
++RUN: cat %t.fdata | FileCheck -check-prefix=CHECK %s
++
++# Check BOLT works with this profile
++RUN: llvm-bolt %t.exe --data %t.fdata -o %t.2 --reorder-blocks=cache
++
++# The instrumented profile should at least say main was called once
++CHECK: main 0 0 1{{$}}
+diff --git a/bolt/test/runtime/AArch64/instrumentation-ind-call.c b/bolt/test/runtime/AArch64/instrumentation-ind-call.c
+new file mode 100644
+index 0000000..76ee8c0
+--- /dev/null
++++ b/bolt/test/runtime/AArch64/instrumentation-ind-call.c
+@@ -0,0 +1,38 @@
++#include <stdio.h>
++
++typedef int (*func_ptr)(int, int);
++
++int add(int a, int b) { return a + b; }
++
++int main() {
++  func_ptr fun;
++  fun = add;
++  int sum = fun(10, 20); // indirect call to 'add'
++  printf("The sum is: %d\n", sum);
++  return 0;
++}
++/*
++REQUIRES: system-linux,bolt-runtime
++
++RUN: %clang %cflags %s -o %t.exe -Wl,-q -nopie -fpie
++
++RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata \
++RUN:   -o %t.instrumented
++
++# Instrumented program needs to finish returning zero
++RUN: %t.instrumented | FileCheck %s -check-prefix=CHECK-OUTPUT
++
++# Test that the instrumented data makes sense
++RUN: llvm-bolt %t.exe -o %t.bolted --data %t.fdata \
++RUN:   --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \
++RUN:   --print-only=main --print-finalized | FileCheck %s
++
++RUN: %t.bolted | FileCheck %s -check-prefix=CHECK-OUTPUT
++
++CHECK-OUTPUT: The sum is: 30
++
++# Check that our indirect call has 1 hit recorded in the fdata file and that
++# this was processed correctly by BOLT
++CHECK: blr x8 # CallProfile: 1 (0 misses) :
++CHECK-NEXT: { add: 1 (0 misses) }
++*/
+diff --git a/bolt/test/runtime/X86/Inputs/exceptions_split.cpp b/bolt/test/runtime/Inputs/exceptions_split.cpp
+similarity index 85%
+rename from bolt/test/runtime/X86/Inputs/exceptions_split.cpp
+rename to bolt/test/runtime/Inputs/exceptions_split.cpp
+index 2c136b9..de81adf 100644
+--- a/bolt/test/runtime/X86/Inputs/exceptions_split.cpp
++++ b/bolt/test/runtime/Inputs/exceptions_split.cpp
+@@ -3,31 +3,25 @@
+ //
+ // Record performance data with no args. Run test with 2 args.
+
+-#include <stdint.h>
+ #include <stdexcept>
++#include <stdint.h>
+
+-int foo()
+-{
+-  return 0;
+-}
++int foo() { return 0; }
+
+ void bar(int a) {
+   if (a > 2 && a % 2)
+     throw new int();
+ }
+
+-void filter_only(){
+-  foo();
+-}
++void filter_only() { foo(); }
+
+-int main(int argc, char **argv)
+-{
++int main(int argc, char **argv) {
+   unsigned r = 0;
+
+   uint64_t limit = (argc >= 2 ? 10 : 5000);
+   for (uint64_t i = 0; i < limit; ++i) {
+     i += foo();
+-    try {
++    try {
+       bar(argc);
+       try {
+         if (argc >= 2)
+diff --git a/bolt/test/runtime/X86/instrumentation-tail-call.s b/bolt/test/runtime/X86/instrumentation-tail-call.s
+index 792d084..dfb12f0 100644
+--- a/bolt/test/runtime/X86/instrumentation-tail-call.s
++++ b/bolt/test/runtime/X86/instrumentation-tail-call.s
+@@ -14,6 +14,9 @@
+
+ # CHECK: leaq 0x80(%rsp), %rsp
+
++# RUN: FileCheck %s --input-file %t.fdata --check-prefix=CHECK-FDATA
++# CHECK-FDATA: 1 main {{.*}} 1 targetFunc 0 0 1
++
+ .text
+ .globl main
+ .type main, %function
+@@ -32,7 +35,8 @@ main:
+   movq %rbp, %rsp
+   pop %rbp
+   mov -0x10(%rsp),%rax
+-  jmp targetFunc
++  test %rsp, %rsp
++  jne targetFunc
+
+ .LBBerror:
+   addq $0x20, %rsp
+diff --git a/bolt/test/runtime/X86/exceptions-instrumentation.test b/bolt/test/runtime/exceptions-instrumentation.test
+similarity index 100%
+rename from bolt/test/runtime/X86/exceptions-instrumentation.test
+rename to bolt/test/runtime/exceptions-instrumentation.test
+diff --git a/bolt/test/runtime/X86/pie-exceptions-split.test b/bolt/test/runtime/pie-exceptions-split.test
+similarity index 95%
+rename from bolt/test/runtime/X86/pie-exceptions-split.test
+rename to bolt/test/runtime/pie-exceptions-split.test
+index 124fef6..30f2d02 100644
+--- a/bolt/test/runtime/X86/pie-exceptions-split.test
++++ b/bolt/test/runtime/pie-exceptions-split.test
+@@ -16,9 +16,9 @@ RUN: --print-only=main 2>&1 | FileCheck %s
+ ## All calls to printf() should be from exception handling code that was
+ ## recorded as cold during the profile collection run. Check that the calls
+ ## are placed after the split point.
+-CHECK-NOT: callq printf
++CHECK-NOT: printf
+ CHECK: HOT-COLD SPLIT POINT
+-CHECK: callq printf
++CHECK: printf
+
+ ## Verify the output still executes correctly when the exception path is being
+ ## taken.
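
Taken together, the sys_aarch64.h and sys_x86_64.h headers added by this patch
encode the raw Linux syscall ABI for their targets so that the instrumentation
runtime never has to link against libc. A condensed sketch of the two calling
conventions follows; it is illustrative only, not part of the patch, and
raw_syscall3 is a hypothetical name (the real headers hard-code one wrapper
per call):

    // Illustrative three-argument Linux syscall wrapper in the style of the
    // runtime headers above. Not part of this patch.
    #include <stdint.h>

    static inline uint64_t raw_syscall3(uint64_t nr, uint64_t a0, uint64_t a1,
                                        uint64_t a2) {
    #if defined(__aarch64__)
      // AArch64: number in w8, args in x0..x2, entry via svc #0, result in x0.
      register uint64_t x0 __asm__("x0") = a0;
      register uint64_t x1 __asm__("x1") = a1;
      register uint64_t x2 __asm__("x2") = a2;
      register uint32_t w8 __asm__("w8") = (uint32_t)nr;
      __asm__ __volatile__("svc #0"
                           : "+r"(x0)
                           : "r"(x1), "r"(x2), "r"(w8)
                           : "cc", "memory");
      return x0;
    #elif defined(__x86_64__)
      // x86_64: number in rax, args in rdi/rsi/rdx; the syscall instruction
      // itself clobbers rcx and r11, hence the clobber list.
      uint64_t ret;
      __asm__ __volatile__("syscall"
                           : "=a"(ret)
                           : "0"(nr), "D"(a0), "S"(a1), "d"(a2)
                           : "cc", "rcx", "r11", "memory");
      return ret;
    #endif
    }

The same logical call takes different numbers per architecture, e.g. write(2)
is 1 on x86_64 but 64 on AArch64, which is why the runtime keeps one header per
target rather than a shared syscall table.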
+-- +2.39.5 (Apple Git-154) + diff --git a/llvm-bolt.spec b/llvm-bolt.spec index 832a69fd9b3f52be94f7dae83dd26988344f3eca..5154975000eacbc864dc1339a3cb244f65010003 100644 --- a/llvm-bolt.spec +++ b/llvm-bolt.spec @@ -22,7 +22,7 @@ Name: %{pkg_name} Version: %{bolt_version} -Release: 9 +Release: 10 Summary: BOLT is a post-link optimizer developed to speed up large applications License: Apache 2.0 URL: https://github.com/llvm/llvm-project/tree/main/bolt @@ -38,6 +38,7 @@ Patch5: 0005-BOLT-AArch64-Don-t-change-layout-in-PatchEntries.patch Patch6: 0006-AArch64-Add-CFG-block-count-correction-optimization.patch Patch7: 0007-BOLT-Skip-PLT-search-for-zero-value-weak-reference-symbols.patch Patch8: 0008-merge-fdata-Support-process-no_lbr-profile-file.patch +Patch9: 0009-support-aarch64-instrumentation.patch BuildRequires: gcc BuildRequires: gcc-c++ @@ -107,6 +108,10 @@ find %{buildroot}%{install_prefix} \ ! -name "libbolt_rt_instr.a" \ -type f,l -exec rm -f '{}' \; +%ifarch aarch64 +find %{buildroot}%{install_prefix} -name "libbolt_rt_hugify.a" -type f,l -exec rm -f '{}' \; +%endif + # Remove files installed during the build phase. rm -f %{buildroot}/%{_builddir}/%{bolt_srcdir}/%{_vpath_builddir}/%{_lib}/lib*.a @@ -138,9 +143,9 @@ rm -f %{buildroot}/%{_builddir}/%{bolt_srcdir}/%{_vpath_builddir}/%{_lib}/lib*.a %{install_bindir}/perf2bolt %{install_bindir}/llvm-bolt-heatmap +%{install_libdir}/libbolt_rt_instr.a %ifarch x86_64 %{install_libdir}/libbolt_rt_hugify.a -%{install_libdir}/libbolt_rt_instr.a %endif %exclude %{_builddir}/%{bolt_srcdir}/lib/* @@ -149,6 +154,12 @@ rm -f %{buildroot}/%{_builddir}/%{bolt_srcdir}/%{_vpath_builddir}/%{_lib}/lib*.a %doc %{install_docdir} %changelog +* Thu Oct 31 2024 rfwang07 17.0.6-10 +- Type:Backport +- ID:NA +- SUG:NA +- DESC: Backport aarch64 instrumentation support from 18.1.0 + * Tue Sep 10 2024 liyancheng <412998149@qq.com> 17.0.6-9 - Type:Bugfix - ID:NA