diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index f09d1129b128a35b75eb2479de23bee88a953b3c..eabc4aabea0628ecae8c46686b93cbbb85ccdd28 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1067,6 +1067,15 @@ void CodeGenModule::Release() { "sign-return-address-with-bkey", 1); } + if (CodeGenOpts.StackClashProtector) + getModule().addModuleFlag( + llvm::Module::Override, "probe-stack", + llvm::MDString::get(TheModule.getContext(), "inline-asm")); + + if (CodeGenOpts.StackProbeSize && CodeGenOpts.StackProbeSize != 4096) + getModule().addModuleFlag(llvm::Module::Min, "stack-probe-size", + CodeGenOpts.StackProbeSize); + if (!CodeGenOpts.MemoryProfileOutput.empty()) { llvm::LLVMContext &Ctx = TheModule.getContext(); getModule().addModuleFlag( @@ -2261,6 +2270,10 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D, if (CodeGenOpts.StackClashProtector) B.addAttribute("probe-stack", "inline-asm"); + if (CodeGenOpts.StackProbeSize && CodeGenOpts.StackProbeSize != 4096) + B.addAttribute("stack-probe-size", + std::to_string(CodeGenOpts.StackProbeSize)); + if (!hasUnwindExceptions(LangOpts)) B.addAttribute(llvm::Attribute::NoUnwind); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 6b5930990f11713da647de740bd3436570b6c415..25627c72f85892dee0e9aacb60f28971ad99c156 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -3473,7 +3473,7 @@ static void RenderSCPOptions(const ToolChain &TC, const ArgList &Args, return; if (!EffectiveTriple.isX86() && !EffectiveTriple.isSystemZ() && - !EffectiveTriple.isPPC64()) + !EffectiveTriple.isPPC64() && !EffectiveTriple.isAArch64()) return; Args.addOptInFlag(CmdArgs, options::OPT_fstack_clash_protection, diff --git a/clang/test/CodeGen/stack-clash-protection.c b/clang/test/CodeGen/stack-clash-protection.c index 67571f5cdb2c14c3bf92efe60a6be4a47e297a8d..dab9ee768c28745040923b8a5bba85f97755db26 100644 --- a/clang/test/CodeGen/stack-clash-protection.c +++ b/clang/test/CodeGen/stack-clash-protection.c @@ -1,8 +1,9 @@ // Check the correct function attributes are generated -// RUN: %clang_cc1 -triple x86_64-linux -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s -// RUN: %clang_cc1 -triple s390x-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s -// RUN: %clang_cc1 -triple powerpc64le-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s -// RUN: %clang_cc1 -triple powerpc64-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-linux -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s +// RUN: %clang_cc1 -triple s390x-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s +// RUN: %clang_cc1 -triple powerpc64le-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s +// RUN: %clang_cc1 -triple powerpc64-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s // CHECK: define{{.*}} void @large_stack() #[[A:.*]] { void large_stack(void) { @@ -11,15 +12,18 @@ void large_stack(void) { stack[i] = i; } -// CHECK: define{{.*}} void @vla({{.*}}) #[[A:.*]] { +// CHECK: 
define{{.*}} void @vla({{.*}}) #[[A]] { void vla(int n) { volatile int vla[n]; __builtin_memset(&vla[0], 0, 1); } -// CHECK: define{{.*}} void @builtin_alloca({{.*}}) #[[A:.*]] { +// CHECK: define{{.*}} void @builtin_alloca({{.*}}) #[[A]] { void builtin_alloca(int n) { volatile void *mem = __builtin_alloca(n); } -// CHECK: attributes #[[A]] = {{.*}} "probe-stack"="inline-asm" +// CHECK: attributes #[[A]] = {{.*}}"probe-stack"="inline-asm" {{.*}}"stack-probe-size"="8192" + +// CHECK: !{i32 4, !"probe-stack", !"inline-asm"} +// CHECK: !{i32 8, !"stack-probe-size", i32 8192} diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index a568edd0e640d93873a19adf5872ef04428f82bd..7abbd1f03f16324456ef45d95df7dc842a54ded3 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -400,7 +400,11 @@ public: LegalizeResult lowerUnmergeValues(MachineInstr &MI); LegalizeResult lowerExtractInsertVectorElt(MachineInstr &MI); LegalizeResult lowerShuffleVector(MachineInstr &MI); + Register getDynStackAllocTargetPtr(Register SPReg, Register AllocSize, + Align Alignment, LLT PtrTy); LegalizeResult lowerDynStackAlloc(MachineInstr &MI); + LegalizeResult lowerStackSave(MachineInstr &MI); + LegalizeResult lowerStackRestore(MachineInstr &MI); LegalizeResult lowerExtract(MachineInstr &MI); LegalizeResult lowerInsert(MachineInstr &MI); LegalizeResult lowerSADDO_SSUBO(MachineInstr &MI); diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 186bea75ae96475a7980736873a061ee07e731d9..c92ce6dc701c8ed8c3ef1b276ed0c8dbad04128e 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -763,6 +763,12 @@ HANDLE_TARGET_OPCODE(G_JUMP_TABLE) /// Generic dynamic stack allocation. HANDLE_TARGET_OPCODE(G_DYN_STACKALLOC) +/// Generic stack pointer save. +HANDLE_TARGET_OPCODE(G_STACKSAVE) + +/// Generic stack pointer restore. +HANDLE_TARGET_OPCODE(G_STACKRESTORE) + /// Strict floating point instructions. HANDLE_TARGET_OPCODE(G_STRICT_FADD) HANDLE_TARGET_OPCODE(G_STRICT_FSUB) diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 00d56d1c4bd55ced90f6f3cacadfd86f6f78ef51..e8cfaeab3cd805ca6a2fbda51c1fabd29190390a 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -225,6 +225,18 @@ def G_DYN_STACKALLOC : GenericInstruction { let hasSideEffects = true; } +def G_STACKSAVE : GenericInstruction { + let OutOperandList = (outs ptype0:$dst); + let InOperandList = (ins); + let hasSideEffects = true; +} + +def G_STACKRESTORE : GenericInstruction { + let OutOperandList = (outs); + let InOperandList = (ins ptype0:$src); + let hasSideEffects = true; +} + def G_FREEZE : GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$src); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 9a67a8d05a4dda80b078d1ad48de6299a6a00276..e4b837c6b8ce90ec1ddd7daf69478e86d1179104 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2229,31 +2229,12 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return true; } case Intrinsic::stacksave: { - // Save the stack pointer to the location provided by the intrinsic. 
- Register Reg = getOrCreateVReg(CI); - Register StackPtr = MF->getSubtarget() - .getTargetLowering() - ->getStackPointerRegisterToSaveRestore(); - - // If the target doesn't specify a stack pointer, then fall back. - if (!StackPtr) - return false; - - MIRBuilder.buildCopy(Reg, StackPtr); + MIRBuilder.buildInstr(TargetOpcode::G_STACKSAVE, {getOrCreateVReg(CI)}, {}); return true; } case Intrinsic::stackrestore: { - // Restore the stack pointer from the location provided by the intrinsic. - Register Reg = getOrCreateVReg(*CI.getArgOperand(0)); - Register StackPtr = MF->getSubtarget() - .getTargetLowering() - ->getStackPointerRegisterToSaveRestore(); - - // If the target doesn't specify a stack pointer, then fall back. - if (!StackPtr) - return false; - - MIRBuilder.buildCopy(StackPtr, Reg); + MIRBuilder.buildInstr(TargetOpcode::G_STACKRESTORE, {}, + {getOrCreateVReg(*CI.getArgOperand(0))}); return true; } case Intrinsic::cttz: diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index f0da0d88140f2d1a51fe62a8199161206517bdbe..5557456e706d2a97699d82680dab8fd808d552b4 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -3503,6 +3503,10 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { return lowerShuffleVector(MI); case G_DYN_STACKALLOC: return lowerDynStackAlloc(MI); + case G_STACKSAVE: + return lowerStackSave(MI); + case G_STACKRESTORE: + return lowerStackRestore(MI); case G_EXTRACT: return lowerExtract(MI); case G_INSERT: @@ -6773,21 +6777,12 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { return Legalized; } -LegalizerHelper::LegalizeResult -LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) { - const auto &MF = *MI.getMF(); - const auto &TFI = *MF.getSubtarget().getFrameLowering(); - if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp) - return UnableToLegalize; - - Register Dst = MI.getOperand(0).getReg(); - Register AllocSize = MI.getOperand(1).getReg(); - Align Alignment = assumeAligned(MI.getOperand(2).getImm()); - - LLT PtrTy = MRI.getType(Dst); +Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg, + Register AllocSize, + Align Alignment, + LLT PtrTy) { LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); - Register SPReg = TLI.getStackPointerRegisterToSaveRestore(); auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg); SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp); @@ -6802,7 +6797,25 @@ LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) { Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst); } - SPTmp = MIRBuilder.buildCast(PtrTy, Alloc); + return MIRBuilder.buildCast(PtrTy, Alloc).getReg(0); +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) { + const auto &MF = *MI.getMF(); + const auto &TFI = *MF.getSubtarget().getFrameLowering(); + if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp) + return UnableToLegalize; + + Register Dst = MI.getOperand(0).getReg(); + Register AllocSize = MI.getOperand(1).getReg(); + Align Alignment = assumeAligned(MI.getOperand(2).getImm()); + + LLT PtrTy = MRI.getType(Dst); + Register SPReg = TLI.getStackPointerRegisterToSaveRestore(); + Register SPTmp = + getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy); + MIRBuilder.buildCopy(SPReg, SPTmp); MIRBuilder.buildCopy(Dst, SPTmp); @@ -6810,6 +6823,28 @@ LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) { return Legalized; } 
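The refactoring above only splits out the address computation: getDynStackAllocTargetPtr produces the prospective stack pointer, and lowerDynStackAlloc still writes it back to SP, so a target (such as the AArch64 GlobalISel change later in this patch) can reuse the address math while substituting its own probing sequence for the SP update. As a minimal sketch of what the emitted instructions compute, using plain integer arithmetic to stand in for the G_PTRTOINT/G_SUB/G_AND/G_INTTOPTR sequence checked in legalize-dyn-alloca.mir (this is not the MIR-building code itself):

    #include <cstdint>

    // Prospective SP for a downward-growing stack: subtract the (already
    // size-rounded) allocation and, if the alloca asks for extra alignment,
    // round the result down to that alignment.
    uint64_t dynAllocTargetSP(uint64_t SP, uint64_t AllocSize,
                              uint64_t Alignment) {
      uint64_t NewSP = SP - AllocSize;   // G_PTRTOINT + G_SUB
      if (Alignment > 1)                 // e.g. the align-32 test case
        NewSP &= ~(Alignment - 1);       // G_AND with -Alignment
      return NewSP;                      // G_INTTOPTR back to p0
    }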
+LegalizerHelper::LegalizeResult +LegalizerHelper::lowerStackSave(MachineInstr &MI) { + Register StackPtr = TLI.getStackPointerRegisterToSaveRestore(); + if (!StackPtr) + return UnableToLegalize; + + MIRBuilder.buildCopy(MI.getOperand(0), StackPtr); + MI.eraseFromParent(); + return Legalized; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerStackRestore(MachineInstr &MI) { + Register StackPtr = TLI.getStackPointerRegisterToSaveRestore(); + if (!StackPtr) + return UnableToLegalize; + + MIRBuilder.buildCopy(StackPtr, MI.getOperand(0)); + MI.eraseFromParent(); + return Legalized; +} + LegalizerHelper::LegalizeResult LegalizerHelper::lowerExtract(MachineInstr &MI) { auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 4d5676f341017270c556407795a2b5b209b0692e..fe21173f531fd72ab31808b51e9302af1349ae54 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -300,6 +300,7 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF, static bool produceCompactUnwindFrame(MachineFunction &MF); static bool needsWinCFI(const MachineFunction &MF); static StackOffset getSVEStackSize(const MachineFunction &MF); +static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB); static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF); /// Returns true if a homogeneous prolog or epilog code can be emitted @@ -461,6 +462,11 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { /// included as part of the stack frame. bool AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { + // The stack probing code for the dynamically allocated outgoing arguments + // area assumes that the stack is probed at the top - either by the prologue + // code, which issues a probe if `hasVarSizedObjects` return true, or by the + // most recent variable-sized object allocation. Changing the condition here + // may need to be followed up by changes to the probe issuing logic. return !MF.getFrameInfo().hasVarSizedObjects(); } @@ -469,6 +475,9 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( MachineBasicBlock::iterator I) const { const AArch64InstrInfo *TII = static_cast(MF.getSubtarget().getInstrInfo()); + const AArch64TargetLowering *TLI = + MF.getSubtarget().getTargetLowering(); + MachineFrameInfo &MFI = MF.getFrameInfo(); DebugLoc DL = I->getDebugLoc(); unsigned Opc = I->getOpcode(); bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); @@ -495,6 +504,24 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( // Most call frames will be allocated at the start of a function so // this is OK, but it is a limitation that needs dealing with. assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); + + if (TLI->hasInlineStackProbe(MF) && + -Amount >= AArch64::StackProbeMaxUnprobedStack) { + // When stack probing is enabled, the decrement of SP may need to be + // probed. We only need to do this if the call site needs 1024 bytes of + // space or more, because a region smaller than that is allowed to be + // unprobed at an ABI boundary. We rely on the fact that SP has been + // probed exactly at this point, either by the prologue or most recent + // dynamic allocation. 
+ assert(MFI.hasVarSizedObjects() && + "non-reserved call frame without var sized objects?"); + Register ScratchReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + inlineStackProbeFixed(I, ScratchReg, -Amount, StackOffset::get(0, 0)); + } else { + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(Amount), TII); + } emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(Amount), TII); } @@ -671,6 +698,154 @@ void AArch64FrameLowering::emitCalleeSavedSVERestores( emitCalleeSavedRestores(MBB, MBBI, true); } +// Return the maximum possible number of bytes for `Size` due to the +// architectural limit on the size of a SVE register. +static int64_t upperBound(StackOffset Size) { + static const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16; + return Size.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE + Size.getFixed(); +} + +void AArch64FrameLowering::allocateStackSpace( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + int64_t RealignmentPadding, StackOffset AllocSize, bool NeedsWinCFI, + bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset, + bool FollowupAllocs) const { + + if (!AllocSize) + return; + + DebugLoc DL; + MachineFunction &MF = *MBB.getParent(); + const AArch64Subtarget &Subtarget = MF.getSubtarget(); + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + AArch64FunctionInfo &AFI = *MF.getInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + + const int64_t MaxAlign = MFI.getMaxAlign().value(); + const uint64_t AndMask = ~(MaxAlign - 1); + + if (!Subtarget.getTargetLowering()->hasInlineStackProbe(MF)) { + Register TargetReg = RealignmentPadding + ? findScratchNonCalleeSaveRegister(&MBB) + : AArch64::SP; + // SUB Xd/SP, SP, AllocSize + emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII, + MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI, + EmitCFI, InitialOffset); + + if (RealignmentPadding) { + // AND SP, X9, 0b11111...0000 + BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP) + .addReg(TargetReg, RegState::Kill) + .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) + .setMIFlags(MachineInstr::FrameSetup); + AFI.setStackRealigned(true); + + // No need for SEH instructions here; if we're realigning the stack, + // we've set a frame pointer and already finished the SEH prologue. + assert(!NeedsWinCFI); + } + return; + } + + // + // Stack probing allocation. + // + + // Fixed length allocation. If we don't need to re-align the stack and don't + // have SVE objects, we can use a more efficient sequence for stack probing. + if (AllocSize.getScalable() == 0 && RealignmentPadding == 0) { + Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB); + assert(ScratchReg != AArch64::NoRegister); + BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC)) + .addDef(ScratchReg) + .addImm(AllocSize.getFixed()) + .addImm(InitialOffset.getFixed()) + .addImm(InitialOffset.getScalable()); + // The fixed allocation may leave unprobed bytes at the top of the + // stack. If we have subsequent alocation (e.g. if we have variable-sized + // objects), we need to issue an extra probe, so these allocations start in + // a known state. + if (FollowupAllocs) { + // STR XZR, [SP] + BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + } + + return; + } + + // Variable length allocation. 
+ + // If the (unknown) allocation size cannot exceed the probe size, decrement + // the stack pointer right away. + int64_t ProbeSize = AFI.getStackProbeSize(); + if (upperBound(AllocSize) + RealignmentPadding <= ProbeSize) { + Register ScratchReg = RealignmentPadding + ? findScratchNonCalleeSaveRegister(&MBB) + : AArch64::SP; + assert(ScratchReg != AArch64::NoRegister); + // SUB Xd, SP, AllocSize + emitFrameOffset(MBB, MBBI, DL, ScratchReg, AArch64::SP, -AllocSize, &TII, + MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI, + EmitCFI, InitialOffset); + if (RealignmentPadding) { + // AND SP, Xn, 0b11111...0000 + BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP) + .addReg(ScratchReg, RegState::Kill) + .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) + .setMIFlags(MachineInstr::FrameSetup); + AFI.setStackRealigned(true); + } + if (FollowupAllocs || upperBound(AllocSize) + RealignmentPadding > + AArch64::StackProbeMaxUnprobedStack) { + // STR XZR, [SP] + BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + } + return; + } + + // Emit a variable-length allocation probing loop. + // TODO: As an optimisation, the loop can be "unrolled" into a few parts, + // each of them guaranteed to adjust the stack by less than the probe size. + Register TargetReg = findScratchNonCalleeSaveRegister(&MBB); + assert(TargetReg != AArch64::NoRegister); + // SUB Xd, SP, AllocSize + emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII, + MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI, + EmitCFI, InitialOffset); + + if (RealignmentPadding) { + // AND Xn, Xn, 0b11111...0000 + BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), TargetReg) + .addReg(TargetReg, RegState::Kill) + .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)) + .setMIFlags(MachineInstr::FrameSetup); + } + + BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR)) + .addReg(TargetReg); + if (EmitCFI) { + // Set the CFA register back to SP. + unsigned Reg = + Subtarget.getRegisterInfo()->getDwarfRegNum(AArch64::SP, true); + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } + if (RealignmentPadding) + AFI.setStackRealigned(true); +} + static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) { switch (Reg.id()) { default: @@ -854,9 +1029,11 @@ bool AArch64FrameLowering::canUseAsPrologue( MachineBasicBlock *TmpMBB = const_cast(&MBB); const AArch64Subtarget &Subtarget = MF->getSubtarget(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const AArch64TargetLowering *TLI = Subtarget.getTargetLowering(); - // Don't need a scratch register if we're not going to re-align the stack. - if (!RegInfo->hasStackRealignment(*MF)) + // Don't need a scratch register if we're not going to re-align the stack or + // emit stack probes. + if (!RegInfo->hasStackRealignment(*MF) && TLI->hasInlineStackProbe(*MF)) return true; // Otherwise, we can use any block as long as it has a scratch register // available. 
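allocateStackSpace chooses between four expansion strategies, depending on whether inline probing is requested, whether the size is a compile-time constant (no SVE part, no realignment), and whether the worst-case size still fits under one guard page. A condensed restatement of that selection, as a reading aid with assumed parameter names (the real function also threads CFI/WinCFI emission through every path):

    #include <cstdint>

    enum class AllocStrategy {
      PlainSub,        // probing disabled: a single SUB (plus realignment AND)
      ProbedFixed,     // PROBED_STACKALLOC pseudo for a constant size
      SingleSubProbed, // one SUB, with at most one trailing STR XZR, [SP]
      ProbingLoop      // PROBED_STACKALLOC_VAR pseudo (loop emitted later)
    };

    // MaxAllocSize is upperBound(AllocSize); ProbeSize comes from the
    // "stack-probe-size" attribute (default 4096).
    AllocStrategy pickStrategy(bool InlineStackProbe, bool HasScalablePart,
                               int64_t MaxAllocSize, int64_t RealignmentPadding,
                               int64_t ProbeSize) {
      if (!InlineStackProbe)
        return AllocStrategy::PlainSub;
      if (!HasScalablePart && RealignmentPadding == 0)
        return AllocStrategy::ProbedFixed;
      // The worst case still fits under one guard page, so a single
      // decrement cannot jump over an unmapped page.
      if (MaxAllocSize + RealignmentPadding <= ProbeSize)
        return AllocStrategy::SingleSubProbed;
      return AllocStrategy::ProbingLoop;
    }

In the ProbedFixed and SingleSubProbed cases, the trailing STR XZR, [SP] is emitted only when further allocations follow or (for the variable case) when more than StackProbeMaxUnprobedStack (1024) bytes could remain unprobed, so whatever comes next always starts from a recently probed SP.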
@@ -866,15 +1043,11 @@ bool AArch64FrameLowering::canUseAsPrologue( static bool windowsRequiresStackProbe(MachineFunction &MF, uint64_t StackSizeInBytes) { const AArch64Subtarget &Subtarget = MF.getSubtarget(); - if (!Subtarget.isTargetWindows()) - return false; - const Function &F = MF.getFunction(); + const AArch64FunctionInfo &MFI = *MF.getInfo(); // TODO: When implementing stack protectors, take that into account // for the probe threshold. - unsigned StackProbeSize = - F.getFnAttributeAsParsedInteger("stack-probe-size", 4096); - return (StackSizeInBytes >= StackProbeSize) && - !F.hasFnAttribute("no-stack-arg-probe"); + return Subtarget.isTargetWindows() && MFI.hasStackProbing() && + StackSizeInBytes >= uint64_t(MFI.getStackProbeSize()); } static bool needsWinCFI(const MachineFunction &MF) { @@ -1639,7 +1812,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Alignment is required for the parent frame, not the funclet const bool NeedsRealignment = NumBytes && !IsFunclet && RegInfo->hasStackRealignment(MF); - int64_t RealignmentPadding = + const int64_t RealignmentPadding = (NeedsRealignment && MFI.getMaxAlign() > Align(16)) ? MFI.getMaxAlign().value() - 16 : 0; @@ -1769,12 +1942,14 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } } - StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {}; + StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize; MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI; // Process the SVE callee-saves to determine what space needs to be // allocated. if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) { + LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize + << "\n"); // Find callee save instructions in frame. CalleeSavesBegin = MBBI; assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); @@ -1782,67 +1957,34 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, ++MBBI; CalleeSavesEnd = MBBI; - AllocateBefore = StackOffset::getScalable(CalleeSavedSize); - AllocateAfter = SVEStackSize - AllocateBefore; + SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize); + SVELocalsSize = SVEStackSize - SVECalleeSavesSize; } // Allocate space for the callee saves (if any). - emitFrameOffset( - MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII, - MachineInstr::FrameSetup, false, false, nullptr, - EmitAsyncCFI && !HasFP && AllocateBefore, - StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); + StackOffset CFAOffset = + StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes); + StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes); + allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false, + nullptr, EmitAsyncCFI && !HasFP, CFAOffset, + MFI.hasVarSizedObjects() || LocalsSize); + CFAOffset += SVECalleeSavesSize; if (EmitAsyncCFI) emitCalleeSavedSVELocations(MBB, CalleeSavesEnd); - // Finally allocate remaining SVE stack space. - emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP, - -AllocateAfter, TII, MachineInstr::FrameSetup, false, false, - nullptr, EmitAsyncCFI && !HasFP && AllocateAfter, - AllocateBefore + StackOffset::getFixed( - (int64_t)MFI.getStackSize() - NumBytes)); - - // Allocate space for the rest of the frame. 
- if (NumBytes) { - unsigned scratchSPReg = AArch64::SP; - - if (NeedsRealignment) { - scratchSPReg = findScratchNonCalleeSaveRegister(&MBB); - assert(scratchSPReg != AArch64::NoRegister); - } - - // If we're a leaf function, try using the red zone. - if (!canUseRedZone(MF)) { - // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have - // the correct value here, as NumBytes also includes padding bytes, - // which shouldn't be counted here. - emitFrameOffset( - MBB, MBBI, DL, scratchSPReg, AArch64::SP, - StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup, - false, NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, - SVEStackSize + - StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes)); - } - if (NeedsRealignment) { - assert(MFI.getMaxAlign() > Align(1)); - assert(scratchSPReg != AArch64::SP); - - // SUB X9, SP, NumBytes - // -- X9 is temporary register, so shouldn't contain any live data here, - // -- free to use. This is already produced by emitFrameOffset above. - // AND SP, X9, 0b11111...0000 - uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1); - - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) - .addReg(scratchSPReg, RegState::Kill) - .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64)); - AFI->setStackRealigned(true); - - // No need for SEH instructions here; if we're realigning the stack, - // we've set a frame pointer and already finished the SEH prologue. - assert(!NeedsWinCFI); - } + // Allocate space for the rest of the frame including SVE locals. Align the + // stack as necessary. + assert(!(canUseRedZone(MF) && NeedsRealignment) && + "Cannot use redzone with stack realignment"); + if (!canUseRedZone(MF)) { + // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have + // the correct value here, as NumBytes also includes padding bytes, + // which shouldn't be counted here. + allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding, + SVELocalsSize + StackOffset::getFixed(NumBytes), + NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP, + CFAOffset, MFI.hasVarSizedObjects()); } // If we need a base pointer, set it up here. It's whatever the value of the @@ -4024,3 +4166,170 @@ void AArch64FrameLowering::orderFrameObjects( dbgs() << "\n"; }); } + +/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at +/// least every ProbeSize bytes. Returns an iterator of the first instruction +/// after the loop. The difference between SP and TargetReg must be an exact +/// multiple of ProbeSize. +MachineBasicBlock::iterator +AArch64FrameLowering::inlineStackProbeLoopExactMultiple( + MachineBasicBlock::iterator MBBI, int64_t ProbeSize, + Register TargetReg) const { + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); + MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopMBB); + MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, ExitMBB); + + // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable + // in SUB). 
+ emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(-ProbeSize), TII, + MachineInstr::FrameSetup); + // STR XZR, [SP] + BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + // CMP SP, TargetReg + BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64), + AArch64::XZR) + .addReg(AArch64::SP) + .addReg(TargetReg) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) + .setMIFlags(MachineInstr::FrameSetup); + // B.CC Loop + BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc)) + .addImm(AArch64CC::NE) + .addMBB(LoopMBB) + .setMIFlags(MachineInstr::FrameSetup); + + LoopMBB->addSuccessor(ExitMBB); + LoopMBB->addSuccessor(LoopMBB); + // Synthesize the exit MBB. + ExitMBB->splice(ExitMBB->end(), &MBB, MBBI, MBB.end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); + MBB.addSuccessor(LoopMBB); + // Update liveins. + recomputeLiveIns(*LoopMBB); + recomputeLiveIns(*ExitMBB); + + return ExitMBB->begin(); +} + +void AArch64FrameLowering::inlineStackProbeFixed( + MachineBasicBlock::iterator MBBI, Register ScratchReg, int64_t FrameSize, + StackOffset CFAOffset) const { + MachineBasicBlock *MBB = MBBI->getParent(); + MachineFunction &MF = *MBB->getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + AArch64FunctionInfo *AFI = MF.getInfo(); + bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF); + bool HasFP = hasFP(MF); + + DebugLoc DL; + int64_t ProbeSize = MF.getInfo()->getStackProbeSize(); + int64_t NumBlocks = FrameSize / ProbeSize; + int64_t ResidualSize = FrameSize % ProbeSize; + + LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, " + << NumBlocks << " blocks of " << ProbeSize + << " bytes, plus " << ResidualSize << " bytes\n"); + + // Decrement SP by NumBlock * ProbeSize bytes, with either unrolled or + // ordinary loop. + if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) { + for (int i = 0; i < NumBlocks; ++i) { + // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not + // encodable in a SUB). + emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(-ProbeSize), TII, + MachineInstr::FrameSetup, false, false, nullptr, + EmitAsyncCFI && !HasFP, CFAOffset); + CFAOffset += StackOffset::getFixed(ProbeSize); + // STR XZR, [SP] + BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + } + } else if (NumBlocks != 0) { + // SUB ScratchReg, SP, #FrameSize (or equivalent if FrameSize is not + // encodable in ADD). ScrathReg may temporarily become the CFA register. + emitFrameOffset(*MBB, MBBI, DL, ScratchReg, AArch64::SP, + StackOffset::getFixed(-ProbeSize * NumBlocks), TII, + MachineInstr::FrameSetup, false, false, nullptr, + EmitAsyncCFI && !HasFP, CFAOffset); + CFAOffset += StackOffset::getFixed(ProbeSize * NumBlocks); + MBBI = inlineStackProbeLoopExactMultiple(MBBI, ProbeSize, ScratchReg); + MBB = MBBI->getParent(); + if (EmitAsyncCFI && !HasFP) { + // Set the CFA register back to SP. 
+ const AArch64RegisterInfo &RegInfo = + *MF.getSubtarget().getRegisterInfo(); + unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true); + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); + BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + } + } + + if (ResidualSize != 0) { + // SUB SP, SP, #ResidualSize (or equivalent if ResidualSize is not encodable + // in SUB). + emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(-ResidualSize), TII, + MachineInstr::FrameSetup, false, false, nullptr, + EmitAsyncCFI && !HasFP, CFAOffset); + if (ResidualSize > AArch64::StackProbeMaxUnprobedStack) { + // STR XZR, [SP] + BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(MachineInstr::FrameSetup); + } + } +} + +void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &MBB) const { + // Get the instructions that need to be replaced. We emit at most two of + // these. Remember them in order to avoid complications coming from the need + // to traverse the block while potentially creating more blocks. + SmallVector ToReplace; + for (MachineInstr &MI : MBB) + if (MI.getOpcode() == AArch64::PROBED_STACKALLOC || + MI.getOpcode() == AArch64::PROBED_STACKALLOC_VAR) + ToReplace.push_back(&MI); + + for (MachineInstr *MI : ToReplace) { + if (MI->getOpcode() == AArch64::PROBED_STACKALLOC) { + Register ScratchReg = MI->getOperand(0).getReg(); + int64_t FrameSize = MI->getOperand(1).getImm(); + StackOffset CFAOffset = StackOffset::get(MI->getOperand(2).getImm(), + MI->getOperand(3).getImm()); + inlineStackProbeFixed(MI->getIterator(), ScratchReg, FrameSize, + CFAOffset); + } else { + assert(MI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR && + "Stack probe pseudo-instruction expected"); + const AArch64InstrInfo *TII = + MI->getMF()->getSubtarget().getInstrInfo(); + Register TargetReg = MI->getOperand(0).getReg(); + (void)TII->probedStackAlloc(MI->getIterator(), TargetReg, true); + } + MI->eraseFromParent(); + } +} \ No newline at end of file diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 147b5c181be5e53788804c5aafc00282e5b02f73..941af03a78b738703e60bc6e6fe57ea80dbac71a 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -150,10 +150,28 @@ private: MachineBasicBlock::iterator MBBI) const; void emitCalleeSavedSVERestores(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const; + void allocateStackSpace(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + int64_t RealignmentPadding, StackOffset AllocSize, + bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI, + StackOffset InitialOffset, bool FollowupAllocs) const; /// Emit target zero call-used regs. 
void emitZeroCallUsedRegs(BitVector RegsToZero, MachineBasicBlock &MBB) const override; + + /// Replace a StackProbe stub (if any) with the actual probe code inline + void inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologueMBB) const override; + + void inlineStackProbeFixed(MachineBasicBlock::iterator MBBI, + Register ScratchReg, int64_t FrameSize, + StackOffset CFAOffset) const; + + MachineBasicBlock::iterator + inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI, + int64_t NegProbeSize, + Register TargetReg) const; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6e721b9378468daf32522025d7192f03b88d679c..eff0722e1c778356a92723888bcb15380c5a362a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -556,10 +556,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - if (Subtarget->isTargetWindows()) - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); - else - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); // Constant pool entries setOperationAction(ISD::ConstantPool, MVT::i64, Custom); @@ -2288,6 +2285,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::CSINC) MAKE_CASE(AArch64ISD::THREAD_POINTER) MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) + MAKE_CASE(AArch64ISD::PROBED_ALLOCA) MAKE_CASE(AArch64ISD::ABDS_PRED) MAKE_CASE(AArch64ISD::ABDU_PRED) MAKE_CASE(AArch64ISD::HADDS_PRED) @@ -2646,6 +2644,22 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( return BB; } +MachineBasicBlock * +AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI, + MachineBasicBlock *MBB) const { + MachineFunction &MF = *MBB->getParent(); + MachineBasicBlock::iterator MBBI = MI.getIterator(); + DebugLoc DL = MBB->findDebugLoc(MBBI); + const AArch64InstrInfo &TII = + *MF.getSubtarget().getInstrInfo(); + Register TargetReg = MI.getOperand(0).getReg(); + MachineBasicBlock::iterator NextInst = + TII.probedStackAlloc(MBBI, TargetReg, false); + + MI.eraseFromParent(); + return NextInst->getParent(); +} + MachineBasicBlock * AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, @@ -2774,6 +2788,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( case AArch64::CATCHRET: return EmitLoweredCatchRet(MI, BB); + case AArch64::PROBED_STACKALLOC_DYN: + return EmitDynamicProbedAlloc(MI, BB); case AArch64::LD1_MXIPXX_H_PSEUDO_B: return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB); case AArch64::LD1_MXIPXX_H_PSEUDO_H: @@ -13666,9 +13682,34 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, AN->getMemOperand()); } -SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( - SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const { +SDValue +AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + // Get the inputs. 
+ SDNode *Node = Op.getNode(); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + MaybeAlign Align = + cast(Op.getOperand(2))->getMaybeAlignValue(); + EVT VT = Node->getValueType(0); + + if (DAG.getMachineFunction().getFunction().hasFnAttribute( + "no-stack-arg-probe")) { + SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); + Chain = SP.getValue(1); + SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); + if (Align) + SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); + SDValue Ops[2] = {SP, Chain}; + return DAG.getMergeValues(Ops, dl); + } + + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(), PtrVT, 0); @@ -13692,7 +13733,59 @@ SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size, DAG.getConstant(4, dl, MVT::i64)); - return Chain; + + SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); + Chain = SP.getValue(1); + SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); + if (Align) + SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); + + Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); + + SDValue Ops[2] = {SP, Chain}; + return DAG.getMergeValues(Ops, dl); +} + +SDValue +AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + // Get the inputs. + SDNode *Node = Op.getNode(); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + + MaybeAlign Align = + cast(Op.getOperand(2))->getMaybeAlignValue(); + SDLoc dl(Op); + EVT VT = Node->getValueType(0); + + // Construct the new SP value in a GPR. + SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); + Chain = SP.getValue(1); + SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); + if (Align) + SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); + + // Set the real SP to the new value with a probing loop. + Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP); + SDValue Ops[2] = {SP, Chain}; + return DAG.getMergeValues(Ops, dl); +} + +SDValue +AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + + if (Subtarget->isTargetWindows()) + return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG); + else if (hasInlineStackProbe(MF)) + return LowerInlineDYNAMIC_STACKALLOC(Op, DAG); + else + return SDValue(); } // When x and y are extended, lower: @@ -13746,51 +13839,6 @@ SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::ADD, dl, VT, Add, tmp); } -SDValue -AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, - SelectionDAG &DAG) const { - assert(Subtarget->isTargetWindows() && - "Only Windows alloca probing supported"); - SDLoc dl(Op); - // Get the inputs. 
- SDNode *Node = Op.getNode(); - SDValue Chain = Op.getOperand(0); - SDValue Size = Op.getOperand(1); - MaybeAlign Align = - cast(Op.getOperand(2))->getMaybeAlignValue(); - EVT VT = Node->getValueType(0); - - if (DAG.getMachineFunction().getFunction().hasFnAttribute( - "no-stack-arg-probe")) { - SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); - Chain = SP.getValue(1); - SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); - if (Align) - SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); - Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); - SDValue Ops[2] = {SP, Chain}; - return DAG.getMergeValues(Ops, dl); - } - - Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); - - Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG); - - SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); - Chain = SP.getValue(1); - SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); - if (Align) - SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align->value(), dl, VT)); - Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); - - Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); - - SDValue Ops[2] = {SP, Chain}; - return DAG.getMergeValues(Ops, dl); -} - SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); @@ -26051,3 +26099,9 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const { } return true; } + +bool AArch64TargetLowering::hasInlineStackProbe( + const MachineFunction &MF) const { + return !Subtarget->isTargetWindows() && + MF.getInfo()->hasStackProbing(); +} \ No newline at end of file diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index aca45f113e736679e22961efba1668df7e17ef19..9b388c7f8668ec8a052fa5dd04a303710cfec4fb 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -90,6 +90,10 @@ enum NodeType : unsigned { ADC, SBC, // adc, sbc instructions + // To avoid stack clash, allocation is performed by block and each block is + // probed. + PROBED_ALLOCA, + // Predicated instructions where inactive lanes produce undefined results. ABDS_PRED, ABDU_PRED, @@ -508,6 +512,13 @@ const unsigned RoundingBitsPos = 22; const ArrayRef getGPRArgRegs(); const ArrayRef getFPRArgRegs(); +/// Maximum allowed number of unprobed bytes above SP at an ABI +/// boundary. +const unsigned StackProbeMaxUnprobedStack = 1024; + +/// Maximum number of iterations to unroll for a constant size probing loop. +const unsigned StackProbeMaxLoopUnroll = 4; + } // namespace AArch64 class AArch64Subtarget; @@ -603,6 +614,9 @@ public: MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI, + MachineBasicBlock *MBB) const; + MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const; @@ -942,6 +956,9 @@ public: // used for 64bit and 128bit vectors as well. bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const; + /// True if stack clash protection is enabled for this functions. + bool hasInlineStackProbe(const MachineFunction &MF) const override; + private: /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. 
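With DYNAMIC_STACKALLOC now always Custom on AArch64, LowerDYNAMIC_STACKALLOC picks between three code paths, and the constants added to the AArch64 namespace bound the code size of the probed expansions. A compact restatement of the dispatch (the helper below is illustrative only and not part of the patch):

    enum class AllocaLowering {
      WindowsChkStk, // call __chkstk then adjust SP (skipped for
                     // "no-stack-arg-probe" functions)
      InlineProbed,  // compute the target SP, then AArch64ISD::PROBED_ALLOCA
      DefaultExpand  // plain SP adjustment, as before this patch
    };

    // hasInlineStackProbe(MF) is true when the function has stack probing
    // enabled ("probe-stack"="inline-asm") and the target is not Windows.
    AllocaLowering chooseAllocaLowering(bool IsTargetWindows,
                                        bool HasInlineStackProbe) {
      if (IsTargetWindows)
        return AllocaLowering::WindowsChkStk;
      if (HasInlineStackProbe)
        return AllocaLowering::InlineProbed;
      return AllocaLowering::DefaultExpand;
    }

StackProbeMaxLoopUnroll (4) caps how many constant-size probe blocks are emitted straight-line before falling back to a loop, and StackProbeMaxUnprobedStack (1024) is the most that may be left unprobed above SP at an ABI boundary.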
@@ -1103,10 +1120,10 @@ private: SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain, - SDValue &Size, - SelectionDAG &DAG) const; + SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const; SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 0691e07a639beee77f8096e2cdb1f70e259a0e03..b3b42a97e8c9114a3436e89e730dc54c6f8acd4b 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" +#include "AArch64ExpandImm.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" @@ -18,6 +19,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -8428,6 +8430,94 @@ unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) { return AArch64::BLR; } +MachineBasicBlock::iterator +AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI, + Register TargetReg, bool FrameSetup) const { + assert(TargetReg != AArch64::SP && "New top of stack cannot aleady be in SP"); + + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64InstrInfo *TII = + MF.getSubtarget().getInstrInfo(); + int64_t ProbeSize = MF.getInfo()->getStackProbeSize(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); + MachineBasicBlock *LoopTestMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopTestMBB); + MachineBasicBlock *LoopBodyMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopBodyMBB); + MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, ExitMBB); + MachineInstr::MIFlag Flags = + FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags; + + // LoopTest: + // SUB SP, SP, #ProbeSize + emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP, + AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags); + + // CMP SP, TargetReg + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64), + AArch64::XZR) + .addReg(AArch64::SP) + .addReg(TargetReg) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) + .setMIFlags(Flags); + + // B. 
LoopExit + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc)) + .addImm(AArch64CC::LE) + .addMBB(ExitMBB) + .setMIFlags(Flags); + + // STR XZR, [SP] + BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(Flags); + + // B loop + BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B)) + .addMBB(LoopTestMBB) + .setMIFlags(Flags); + + // LoopExit: + // MOV SP, TargetReg + BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP) + .addReg(TargetReg) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .setMIFlags(Flags); + + // STR XZR, [SP] + BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::STRXui)) + .addReg(AArch64::XZR) + .addReg(AArch64::SP) + .addImm(0) + .setMIFlags(Flags); + + ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); + + LoopTestMBB->addSuccessor(ExitMBB); + LoopTestMBB->addSuccessor(LoopBodyMBB); + LoopBodyMBB->addSuccessor(LoopTestMBB); + MBB.addSuccessor(LoopTestMBB); + + // Update liveins. + if (MF.getRegInfo().reservedRegsFrozen()) { + recomputeLiveIns(*LoopTestMBB); + recomputeLiveIns(*LoopBodyMBB); + recomputeLiveIns(*ExitMBB); + } + + return ExitMBB->begin(); +} + #define GET_INSTRINFO_HELPERS #define GET_INSTRMAP_INFO #include "AArch64GenInstrInfo.inc" diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 20210a96d67ad28def588797a694c958c5aaaa79..7e84b86fc52cd617490d83f20b972b98fb3cd3d0 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -340,6 +340,12 @@ public: static void decomposeStackOffsetForDwarfOffsets(const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized); + // Decrement the SP, issuing probes along the way. `TargetReg` is the new top + // of the stack. `FrameSetup` is passed as true, if the allocation is a part + // of constructing the activation frame of a function. + MachineBasicBlock::iterator probedStackAlloc(MachineBasicBlock::iterator MBBI, + Register TargetReg, + bool FrameSetup) const; #define GET_INSTRINFO_HELPER_DECLS #include "AArch64GenInstrInfo.inc" diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 9e72d37880c58fbe132f8a2379d6c197ee49c85e..9b9103e01d67d16a3426f20d4236c27d1e086f38 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -818,6 +818,12 @@ def AArch64stilp : SDNode<"AArch64ISD::STILP", SDT_AArch64stilp, [SDNPHasChain, def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>; + +def AArch64probedalloca + : SDNode<"AArch64ISD::PROBED_ALLOCA", + SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPMayStore]>; + def AArch64mrs : SDNode<"AArch64ISD::MRS", SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>, [SDNPHasChain, SDNPOutGlue]>; @@ -880,7 +886,8 @@ include "SMEInstrFormats.td" // Miscellaneous instructions. //===----------------------------------------------------------------------===// -let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { +let hasSideEffects = 1, isCodeGenOnly = 1 in { +let Defs = [SP], Uses = [SP] in { // We set Sched to empty list because we expect these instructions to simply get // removed in most cases. 
def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), @@ -889,7 +896,34 @@ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), [(AArch64callseq_end timm:$amt1, timm:$amt2)]>, Sched<[]>; -} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 +} + +let Defs = [SP, NZCV], Uses = [SP] in { +// Probed stack allocation of a constant size, used in function prologues when +// stack-clash protection is enabled. +def PROBED_STACKALLOC : Pseudo<(outs GPR64:$scratch), + (ins i64imm:$stacksize, i64imm:$fixed_offset, + i64imm:$scalable_offset), + []>, + Sched<[]>; + +// Probed stack allocation of a variable size, used in function prologues when +// stack-clash protection is enabled. +def PROBED_STACKALLOC_VAR : Pseudo<(outs), + (ins GPR64sp:$target), + []>, + Sched<[]>; + +// Probed stack allocations of a variable size, used for allocas of unknown size +// when stack-clash protection is enabled. +let usesCustomInserter = 1 in +def PROBED_STACKALLOC_DYN : Pseudo<(outs), + (ins GPR64common:$target), + [(AArch64probedalloca GPR64common:$target)]>, + Sched<[]>; + +} // Defs = [SP, NZCV], Uses = [SP] in +} // hasSideEffects = 1, isCodeGenOnly = 1 let isReMaterializable = 1, isCodeGenOnly = 1 in { // FIXME: The following pseudo instructions are only needed because remat diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp index 961a19317d6660b8a36f6ea0d13aa9cfeade80b2..0bef3c2d248310d635628549543269aa09d3b52d 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -97,14 +97,45 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F, if (const auto *BTE = mdconst::extract_or_null( F.getParent()->getModuleFlag("branch-target-enforcement"))) BranchTargetEnforcement = BTE->getZExtValue(); - return; + } else { + const StringRef BTIEnable = + F.getFnAttribute("branch-target-enforcement").getValueAsString(); + assert(BTIEnable.equals_insensitive("true") || + BTIEnable.equals_insensitive("false")); + BranchTargetEnforcement = BTIEnable.equals_insensitive("true"); } - const StringRef BTIEnable = - F.getFnAttribute("branch-target-enforcement").getValueAsString(); - assert(BTIEnable.equals_insensitive("true") || - BTIEnable.equals_insensitive("false")); - BranchTargetEnforcement = BTIEnable.equals_insensitive("true"); + // The default stack probe size is 4096 if the function has no + // stack-probe-size attribute. This is a safe default because it is the + // smallest possible guard page size. + uint64_t ProbeSize = 4096; + if (F.hasFnAttribute("stack-probe-size")) + ProbeSize = F.getFnAttributeAsParsedInteger("stack-probe-size"); + else if (const auto *PS = mdconst::extract_or_null( + F.getParent()->getModuleFlag("stack-probe-size"))) + ProbeSize = PS->getZExtValue(); + assert(int64_t(ProbeSize) > 0 && "Invalid stack probe size"); + + if (STI->isTargetWindows()) { + if (!F.hasFnAttribute("no-stack-arg-probe")) + StackProbeSize = ProbeSize; + } else { + // Round down to the stack alignment. 
+ uint64_t StackAlign = + STI->getFrameLowering()->getTransientStackAlign().value(); + ProbeSize = std::max(StackAlign, ProbeSize & ~(StackAlign - 1U)); + StringRef ProbeKind; + if (F.hasFnAttribute("probe-stack")) + ProbeKind = F.getFnAttribute("probe-stack").getValueAsString(); + else if (const auto *PS = dyn_cast_or_null( + F.getParent()->getModuleFlag("probe-stack"))) + ProbeKind = PS->getString(); + if (ProbeKind.size()) { + if (ProbeKind != "inline-asm") + report_fatal_error("Unsupported stack probing method"); + StackProbeSize = ProbeSize; + } + } } MachineFunctionInfo *AArch64FunctionInfo::clone( diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index d82fb436925ec6fa2bd1b7f61bb73b9caba5dc29..d50011594eb1ac0a7de1087185c8be0c09293e2c 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -192,6 +192,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// True if the function need asynchronous unwind information. mutable std::optional NeedsAsyncDwarfUnwindInfo; + int64_t StackProbeSize = 0; + public: AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI); @@ -447,6 +449,10 @@ public: bool needsDwarfUnwindInfo(const MachineFunction &MF) const; bool needsAsyncDwarfUnwindInfo(const MachineFunction &MF) const; + bool hasStackProbing() const { return StackProbeSize != 0; } + + int64_t getStackProbeSize() const { return StackProbeSize; } + private: // Hold the lists of LOHs. MILOHContainer LOHContainerSet; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index d905da4eaec335b87b025763e350a6618bb3a19b..0dd2b4d48dd69570dfd6241d2a5c87c19eb03311 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -797,7 +797,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) return Query.Types[0] == p0 && Query.Types[1] == s64; }); - getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower(); + getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom(); + + getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower(); if (ST.hasMOPS()) { // G_BZERO is not supported. Currently it is only emitted by @@ -991,6 +993,8 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, return legalizeMemOps(MI, Helper); case TargetOpcode::G_FCOPYSIGN: return legalizeFCopySign(MI, Helper); + case TargetOpcode::G_DYN_STACKALLOC: + return legalizeDynStackAlloc(MI, Helper); } llvm_unreachable("expected switch to return"); @@ -1687,3 +1691,42 @@ bool AArch64LegalizerInfo::legalizeFCopySign(MachineInstr &MI, MI.eraseFromParent(); return true; } + +bool AArch64LegalizerInfo::legalizeDynStackAlloc( + MachineInstr &MI, LegalizerHelper &Helper) const { + MachineFunction &MF = *MI.getParent()->getParent(); + MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); + + // If stack probing is not enabled for this function, use the default + // lowering. 
+ if (!MF.getFunction().hasFnAttribute("probe-stack") || + MF.getFunction().getFnAttribute("probe-stack").getValueAsString() != + "inline-asm") { + Helper.lowerDynStackAlloc(MI); + return true; + } + + Register Dst = MI.getOperand(0).getReg(); + Register AllocSize = MI.getOperand(1).getReg(); + Align Alignment = assumeAligned(MI.getOperand(2).getImm()); + + assert(MRI.getType(Dst) == LLT::pointer(0, 64) && + "Unexpected type for dynamic alloca"); + assert(MRI.getType(AllocSize) == LLT::scalar(64) && + "Unexpected type for dynamic alloca"); + + LLT PtrTy = MRI.getType(Dst); + Register SPReg = + Helper.getTargetLowering().getStackPointerRegisterToSaveRestore(); + Register SPTmp = + Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy); + auto NewMI = + MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp}); + MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass); + MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI); + MIRBuilder.buildCopy(Dst, SPTmp); + + MI.eraseFromParent(); + return true; +} \ No newline at end of file diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h index c10f6e071ed430c9f4b8cbe73646b4404d07f72a..94484ea59d1559e639cc36ba90e88aa3bd915ce8 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h @@ -58,6 +58,7 @@ private: bool legalizeCTTZ(MachineInstr &MI, LegalizerHelper &Helper) const; bool legalizeMemOps(MachineInstr &MI, LegalizerHelper &Helper) const; bool legalizeFCopySign(MachineInstr &MI, LegalizerHelper &Helper) const; + bool legalizeDynStackAlloc(MachineInstr &MI, LegalizerHelper &Helper) const; const AArch64Subtarget *ST; }; } // End llvm namespace. 
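Taken together with the clang changes at the top of the patch, the whole feature is driven by function attributes and module flags rather than a subtarget feature. An end-to-end example mirroring clang/test/CodeGen/stack-clash-protection.c (the file name and exact IR spelling are illustrative):

    // Build (assuming an AArch64-capable clang):
    //   clang --target=aarch64-linux-gnu -fstack-clash-protection \
    //         -mstack-probe-size=8192 -S -emit-llvm stack.c
    //
    // The function should carry "probe-stack"="inline-asm" and, because a
    // non-default probe size was requested, "stack-probe-size"="8192";
    // matching module flags are emitted as well, so the backend sees the
    // same settings for functions synthesized later in the pipeline.
    void large_stack(void) {
      volatile int stack[20000];
      for (int i = 0; i < 20000; ++i)
        stack[i] = i;
    }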
diff --git a/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/X86LegalizerInfo.cpp index a4a247f85f3d727149006ff84bc67153b60c2a95..104461cff0a91b77bdb5f69e43b045e610806a11 100644 --- a/llvm/lib/Target/X86/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/X86LegalizerInfo.cpp @@ -528,6 +528,10 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, // memory intrinsics getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall(); + getActionDefinitionsBuilder({G_DYN_STACKALLOC, + G_STACKSAVE, + G_STACKRESTORE}).lower(); + // fp intrinsics getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN) .scalarize(0) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll index 5f3544add39844aaafd33520f4f1e920cfb0366d..575cd6b874e3563152a996d3a11e6f09bd57d1b5 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -2392,8 +2392,8 @@ declare ptr @llvm.stacksave() declare void @llvm.stackrestore(ptr) define void @test_stacksaverestore() { ; CHECK-LABEL: name: test_stacksaverestore - ; CHECK: [[SAVE:%[0-9]+]]:_(p0) = COPY $sp - ; CHECK-NEXT: $sp = COPY [[SAVE]](p0) + ; CHECK: [[SAVE:%[0-9]+]]:_(p0) = G_STACKSAVE + ; CHECK-NEXT: G_STACKRESTORE [[SAVE]] ; CHECK-NEXT: RET_ReallyLR %sp = call ptr @llvm.stacksave() call void @llvm.stackrestore(ptr %sp) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir index e9188fb89f699a0434bbfdb8af9d4defa0544334..82781cebc55a93206c613faa4d977ee209ca57a3 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir @@ -19,6 +19,21 @@ ret i128* %addr } + define i8* @test_simple_alloca_stack_probing(i32 %numelts) "probe-stack"="inline-asm" { + %addr = alloca i8, i32 %numelts + ret i8* %addr + } + + define i8* @test_aligned_alloca_stack_probing(i32 %numelts) "probe-stack"="inline-asm" { + %addr = alloca i8, i32 %numelts, align 32 + ret i8* %addr + } + + define i128* @test_natural_alloca_stack_probing(i32 %numelts) "probe-stack"="inline-asm" { + %addr = alloca i128, i32 %numelts + ret i128* %addr + } + ... 
--- name: test_simple_alloca @@ -37,22 +52,23 @@ body: | ; CHECK-LABEL: name: test_simple_alloca ; CHECK: liveins: $w0 - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) - ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]] - ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 - ; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]] - ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 - ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] - ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp - ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) - ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] - ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64) - ; CHECK: $sp = COPY [[INTTOPTR]](p0) - ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) - ; CHECK: $x0 = COPY [[COPY2]](p0) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp + ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] + ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64) + ; CHECK-NEXT: $sp = COPY [[INTTOPTR]](p0) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) + ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(s32) = COPY $w0 %3:_(s64) = G_CONSTANT i64 1 %1:_(s64) = G_ZEXT %0(s32) @@ -83,24 +99,25 @@ body: | ; CHECK-LABEL: name: test_aligned_alloca ; CHECK: liveins: $w0 - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) - ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]] - ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 - ; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]] - ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 - ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] - ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp - ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) - ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] - ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -32 - ; CHECK: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]] - ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[AND1]](s64) - ; CHECK: $sp = COPY [[INTTOPTR]](p0) - ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) - ; CHECK: $x0 = COPY [[COPY2]](p0) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], 
[[C2]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp + ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -32 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]] + ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[AND1]](s64) + ; CHECK-NEXT: $sp = COPY [[INTTOPTR]](p0) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) + ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(s32) = COPY $w0 %3:_(s64) = G_CONSTANT i64 1 %1:_(s64) = G_ZEXT %0(s32) @@ -131,22 +148,23 @@ body: | ; CHECK-LABEL: name: test_natural_alloca ; CHECK: liveins: $w0 - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) - ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]] - ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 - ; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]] - ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 - ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] - ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp - ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) - ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] - ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64) - ; CHECK: $sp = COPY [[INTTOPTR]](p0) - ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) - ; CHECK: $x0 = COPY [[COPY2]](p0) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp + ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] + ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64) + ; CHECK-NEXT: $sp = COPY [[INTTOPTR]](p0) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) + ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(s32) = COPY $w0 %3:_(s64) = G_CONSTANT i64 16 %1:_(s64) = G_ZEXT %0(s32) @@ -160,3 +178,139 @@ body: | RET_ReallyLR implicit $x0 ... 
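As an aside on the arithmetic checked above and repeated in the probing variants added below: the zext, mul (or shl), add 15, and -16 sequence simply rounds the allocation size in bytes up to a 16-byte multiple before it is subtracted from SP. A standalone, illustrative IR equivalent (not part of the test) is:

define i64 @rounded_alloca_bytes(i32 %numelts, i64 %eltsize) {
  ; bytes = numelts * eltsize, rounded up to the next multiple of 16
  %n = zext i32 %numelts to i64
  %bytes = mul i64 %n, %eltsize
  %biased = add nuw i64 %bytes, 15
  %rounded = and i64 %biased, -16
  ret i64 %rounded
}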
+--- +name: test_simple_alloca_stack_probing +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$w0' } +frameInfo: + maxAlignment: 1 +stack: + - { id: 0, name: addr, type: variable-sized, alignment: 1 } +machineFunctionInfo: {} +body: | + bb.1 (%ir-block.0): + liveins: $w0 + ; CHECK-LABEL: name: test_simple_alloca_stack_probing + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[SHL]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp + ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] + ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:gpr64common(p0) = G_INTTOPTR [[SUB]](s64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) + ; CHECK-NEXT: PROBED_STACKALLOC_DYN [[INTTOPTR]](p0), implicit-def $sp, implicit-def $nzcv, implicit $sp + ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s32) = COPY $w0 + %1:_(s64) = G_ZEXT %0(s32) + %9:_(s64) = G_CONSTANT i64 0 + %2:_(s64) = G_SHL %1, %9(s64) + %4:_(s64) = G_CONSTANT i64 15 + %5:_(s64) = nuw G_ADD %2, %4 + %6:_(s64) = G_CONSTANT i64 -16 + %7:_(s64) = G_AND %5, %6 + %8:_(p0) = G_DYN_STACKALLOC %7(s64), 1 + $x0 = COPY %8(p0) + RET_ReallyLR implicit $x0 +... +--- +name: test_aligned_alloca_stack_probing +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$w0' } +frameInfo: + maxAlignment: 32 +stack: + - { id: 0, name: addr, type: variable-sized, alignment: 32 } +machineFunctionInfo: {} +body: | + bb.1 (%ir-block.0): + liveins: $w0 + ; CHECK-LABEL: name: test_aligned_alloca_stack_probing + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[SHL]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp + ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -32 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]] + ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:gpr64common(p0) = G_INTTOPTR [[AND1]](s64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) + ; CHECK-NEXT: PROBED_STACKALLOC_DYN [[INTTOPTR]](p0), implicit-def $sp, implicit-def $nzcv, implicit $sp + ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s32) = COPY $w0 + %1:_(s64) = G_ZEXT %0(s32) + %9:_(s64) = G_CONSTANT i64 0 + %2:_(s64) = G_SHL %1, %9(s64) + %4:_(s64) = G_CONSTANT i64 15 + %5:_(s64) = nuw G_ADD %2, %4 + %6:_(s64) = G_CONSTANT i64 -16 + %7:_(s64) = G_AND %5, %6 + %8:_(p0) = G_DYN_STACKALLOC %7(s64), 32 + $x0 = COPY %8(p0) + 
RET_ReallyLR implicit $x0 +... +--- +name: test_natural_alloca_stack_probing +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$w0' } +frameInfo: + maxAlignment: 1 +stack: + - { id: 0, name: addr, type: variable-sized, alignment: 1 } +machineFunctionInfo: {} +body: | + bb.1 (%ir-block.0): + liveins: $w0 + ; CHECK-LABEL: name: test_natural_alloca_stack_probing + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[SHL]], [[C1]] + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp + ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0) + ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]] + ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:gpr64common(p0) = G_INTTOPTR [[SUB]](s64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0) + ; CHECK-NEXT: PROBED_STACKALLOC_DYN [[INTTOPTR]](p0), implicit-def $sp, implicit-def $nzcv, implicit $sp + ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s32) = COPY $w0 + %1:_(s64) = G_ZEXT %0(s32) + %9:_(s64) = G_CONSTANT i64 4 + %2:_(s64) = G_SHL %1, %9(s64) + %4:_(s64) = G_CONSTANT i64 15 + %5:_(s64) = nuw G_ADD %2, %4 + %6:_(s64) = G_CONSTANT i64 -16 + %7:_(s64) = G_AND %5, %6 + %8:_(p0) = G_DYN_STACKALLOC %7(s64), 1 + $x0 = COPY %8(p0) + RET_ReallyLR implicit $x0 \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index b4fe73d29fa650873b9820cbe94a1b962c02a32f..efae9b66b53dee601b53a58b59528160606e4e58 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -641,7 +641,22 @@ # DEBUG-NEXT: G_JUMP_TABLE (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK -# DEBUG-NEXT: G_DYN_STACKALLOC (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: G_DYN_STACKALLOC (opcode [[DYN_STACKALLOC:[0-9]+]]): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_STACKSAVE (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[DYN_STACKALLOC]] +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_STACKRESTORE (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[DYN_STACKALLOC]] +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_STACKSAVE (opcode [[STACKSAVE:[0-9]+]]): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_STACKRESTORE (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[STACKSAVE]] # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_STRICT_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll new file mode 100644 index 0000000000000000000000000000000000000000..97ecca0bd77b09feb28eba293232a58688036ee4 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -global-isel=1 -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s + +declare void @use_addr(ptr) +declare ptr @llvm.stacksave.p0() +declare void @llvm.stackrestore.p0(ptr) + +define void @test_scoped_alloca(i64 %n) { +; CHECK-LABEL: test_scoped_alloca: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: bl llvm.stacksave.p0 +; CHECK-NEXT: add x9, x19, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: bl use_addr +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl llvm.stackrestore.p0 +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %sp = call ptr @llvm.stacksave.p0() + %addr = alloca i8, i64 %n + call void @use_addr(ptr %addr) + call void @llvm.stackrestore.p0(ptr %sp) + ret void +} diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir index 623c0f240be4fd7794123055f059cf8310f86331..265c474fbc5db48b94b24731711fdc824ec925c6 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve-basepointer.mir @@ -4,8 +4,8 @@ name: hasBasepointer # CHECK-LABEL: name: hasBasepointer # CHECK: bb.0: -# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK-NEXT: $x19 = ADDXri $sp, 0, 0 # CHECK: STRXui $x0, $x19, 0 tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-fixed-width-access.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-fixed-width-access.mir index e367a380f8ba9f07f3b4a99768c80134e553a136..35fd7ca77d5cf3efc35a2ad1a2457856c36898bc 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve-fixed-width-access.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve-fixed-width-access.mir @@ -7,9 +7,9 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp + ; CHECK-NEXT: sub sp, sp, #2064 ; CHECK-NEXT: addvl sp, sp, #-32 ; CHECK-NEXT: addvl sp, sp, #-28 - ; CHECK-NEXT: sub sp, sp, #2064 ; CHECK-NEXT: ldr x8, [sp, #2048] ; CHECK-NEXT: addvl sp, sp, #31 ; CHECK-NEXT: addvl sp, sp, #29 diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir index d54f67634d02a7b97d173159ba582f519dc00b38..680f9c335c250c54aed55bd8e14df87bedb02599 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir @@ -4,9 +4,9 @@ name: LateScavengingSlot # CHECK-LABEL: name: LateScavengingSlot # CHECK: bb.0: -# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 8, 12 +# CHECK: $sp = frame-setup SUBXri $sp, 8, 12 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK: STRXui killed $[[SCRATCH:x[0-9]+]], $sp, 0 # CHECK-NEXT: $[[SCRATCH]] = ADDVL_XXI $fp, -1 # CHECK-NEXT: STRXui $x0, killed $[[SCRATCH]], 0 diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir index 7c87587c6dc4e2c6b2d127b45d43e6d4b47ce49f..8b657c95bfc7c84c388d3802d121069e4e7a2329 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir @@ -60,10 +60,10 @@ # CHECK-NEXT: $sp = frame-setup STRXpre killed $[[SCRATCH:[a-z0-9]+]], $sp, -16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 -# CHECK-NEXT: CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 32 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 32 @@ -77,7 +77,7 @@ # ASM-LABEL: test_allocate_sve: # ASM: .cfi_def_cfa_offset 16 # ASM-NEXT: .cfi_offset w29, -16 -# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +# ASM: .cfi_def_cfa_offset 32 # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 16 * VG # ASM: .cfi_def_cfa wsp, 32 # ASM: .cfi_def_cfa_offset 16 @@ -87,7 +87,7 @@ # # UNWINDINFO: DW_CFA_def_cfa_offset: +16 # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_offset: +32 # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus # UNWINDINFO: DW_CFA_def_cfa: reg31 +32 # UNWINDINFO: DW_CFA_def_cfa_offset: +16 @@ -125,9 +125,9 @@ body: | # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w20, -8 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w21, -16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -32 
-# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 48 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 # # CHECK-NEXT: $x20 = IMPLICIT_DEF @@ -149,7 +149,7 @@ body: | # ASM: .cfi_offset w20, -8 # ASM-NEXT: .cfi_offset w21, -16 # ASM-NEXT: .cfi_offset w29, -32 -# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 16 * VG +# ASM: .cfi_def_cfa_offset 48 # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 48 + 16 * VG # # ASM: .cfi_def_cfa wsp, 48 @@ -164,7 +164,7 @@ body: | # UNWINDINFO: DW_CFA_offset: reg20 -8 # UNWINDINFO-NEXT: DW_CFA_offset: reg21 -16 # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -32 -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_offset: +48 # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +48, DW_OP_plus, DW_OP_consts +16, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus # # UNWINDINFO: DW_CFA_def_cfa: reg31 +48 @@ -205,9 +205,9 @@ body: | # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 -# CHECK-NEXT: $sp = ANDXri killed $[[TMP]] +# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2 +# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]] # CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 @@ -267,9 +267,9 @@ body: | # CHECK-NEXT: $sp = frame-setup STRXpre killed $[[SCRATCH:[a-z0-9]+]], $sp, -16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 32 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 16 @@ -292,7 +292,7 @@ body: | # ASM-LABEL: test_address_sve: # ASM: .cfi_def_cfa_offset 16 # ASM-NEXT: .cfi_offset w29, -16 -# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG +# ASM: .cfi_def_cfa_offset 32 # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 24 * VG # # ASM: .cfi_def_cfa wsp, 32 @@ -302,7 +302,7 @@ body: | # # UNWINDINFO: DW_CFA_def_cfa_offset: +16 # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e 
+0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_offset: +32 # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +24, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus # # UNWINDINFO: DW_CFA_def_cfa: reg31 +32 @@ -353,8 +353,8 @@ body: | # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3 # CHECK-NEXT: STR_ZXI $z0, $fp, -1 # CHECK-NEXT: STR_ZXI $z1, $fp, -2 @@ -429,9 +429,9 @@ body: | # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 32 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK: $[[TMP:x[0-9]+]] = ADDVL_XXI $sp, 1 # CHECK-NEXT: $x0 = LDRXui killed $[[TMP]], 4 @@ -448,7 +448,7 @@ body: | # ASM-LABEL: test_stack_arg_sve: # ASM: .cfi_def_cfa_offset 16 # ASM-NEXT: .cfi_offset w29, -16 -# ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +# ASM: .cfi_def_cfa_offset 32 # ASM: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG # # ASM: .cfi_def_cfa wsp, 32 @@ -458,7 +458,7 @@ body: | # UNWINDINFO: DW_CFA_def_cfa_offset: +16 # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +16, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_offset: +32 # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +8, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus # # UNWINDINFO: DW_CFA_def_cfa: reg31 +32 @@ -640,8 +640,8 @@ body: | # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w19, -16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -24 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -32 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK-NEXT: $x19 = ADDXri $sp, 0, 0 # CHECK-NEXT: STRXui $xzr, $x19, 0 # CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 @@ -863,9 +863,9 @@ body: | # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4d, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4e, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 -# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK: $sp = frame-setup SUBXri $sp, 32, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x90, 
0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 +# CHECK: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 @@ -916,7 +916,7 @@ body: | # ASM-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 32 - 48 * VG # ASM-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 32 - 56 * VG # ASM-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 32 - 64 * VG -# ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG +# ASM: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 64 + 144 * VG # ASM: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 64 + 152 * VG # # ASM: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG @@ -950,7 +950,7 @@ body: | # UNWINDINFO-NEXT: DW_CFA_expression: reg77 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -48, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus # UNWINDINFO-NEXT: DW_CFA_expression: reg78 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -56, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus # UNWINDINFO-NEXT: DW_CFA_expression: reg79 DW_OP_consts -32, DW_OP_plus, DW_OP_consts -64, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +152, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +64, DW_OP_plus, DW_OP_consts +144, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +64, DW_OP_plus, DW_OP_consts +152, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus # # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +0, DW_OP_consts +32, DW_OP_plus, DW_OP_consts +152, DW_OP_bregx 0x2e +0, DW_OP_mul, DW_OP_plus @@ -1031,9 +1031,9 @@ body: | # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1 # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 -# CHECK-NEXT: $sp = ANDXri killed $[[TMP]] +# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -1 +# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]] # CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18 # CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4 diff --git a/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir b/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir index 1b9411d07f433ab68de88285a02a83156c383db6..f6fc627ac2d3d87368af69375b8d5ee5fa070409 100644 --- a/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir +++ b/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir @@ -21,7 +21,7 @@ stack: - { id: 1, size: 4, alignment: 4, local-offset: -68 } # CHECK: body: -# CHECK: $sp = ANDXri killed ${{x[0-9]+}}, 7865 +# CHECK: $sp = frame-setup 
ANDXri killed ${{x[0-9]+}}, 7865 # CHECK: STRSui $s0, $sp, 0 # CHECK: STRSui $s0, $fp, 7 body: | diff --git a/llvm/test/CodeGen/AArch64/stack-guard-sve.ll b/llvm/test/CodeGen/AArch64/stack-guard-sve.ll index 1672a7eb8739779123248bdc9c661241293b80c3..5acbb22bf1ab5a4978caa5f4a8e5e36e320ad0c6 100644 --- a/llvm/test/CodeGen/AArch64/stack-guard-sve.ll +++ b/llvm/test/CodeGen/AArch64/stack-guard-sve.ll @@ -148,9 +148,9 @@ entry: ; CHECK-LABEL: local_stack_alloc: ; CHECK: mov x29, sp -; CHECK: addvl sp, sp, #-2 ; CHECK: sub sp, sp, #16, lsl #12 ; CHECK: sub sp, sp, #16 +; CHECK: addvl sp, sp, #-2 ; Stack guard is placed below the SVE stack area (and above all fixed-width objects) ; CHECK-DAG: add [[STACK_GUARD_SPILL_PART_LOC:x[0-9]+]], sp, #8, lsl #12 @@ -198,9 +198,9 @@ entry: ; CHECK-LABEL: local_stack_alloc_strong: ; CHECK: mov x29, sp -; CHECK: addvl sp, sp, #-3 ; CHECK: sub sp, sp, #16, lsl #12 ; CHECK: sub sp, sp, #16 +; CHECK: addvl sp, sp, #-3 ; Stack guard is placed at the top of the SVE stack area ; CHECK-DAG: ldr [[STACK_GUARD:x[0-9]+]], [{{x[0-9]+}}, :lo12:__stack_chk_guard] diff --git a/llvm/test/CodeGen/AArch64/stack-probing-64k.ll b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll new file mode 100644 index 0000000000000000000000000000000000000000..0a3198fc520e99a143e827b4285c986797d90051 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll @@ -0,0 +1,392 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s + +; Tests for prolog sequences for stack probing, when using a 64KiB stack guard. + +; 64k bytes is the largest frame we can probe in one go. +define void @static_65536(ptr %out) #0 { +; CHECK-LABEL: static_65536: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 65536, align 1 + store i8* %v, ptr %out, align 8 + ret void +} + +; 64k+16 bytes, still needs just one probe. +define void @static_65552(ptr %out) #0 { +; CHECK-LABEL: static_65552: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp], #-16 +; CHECK-NEXT: .cfi_def_cfa_offset 65568 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 65552, align 1 + store i8* %v, ptr %out, align 8 + ret void +} + +; 64k+1024 bytes, the largest frame which needs just one probe. 
+define void @static_66560(ptr %out) #0 { +; CHECK-LABEL: static_66560: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 66576 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 66560, align 1 + store i8* %v, ptr %out, align 8 + ret void +} + +; 64k+1024+16 bytes, the smallest frame which needs two probes. +define void @static_66576(ptr %out) #0 { +; CHECK-LABEL: static_66576: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 66592 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 66576, align 1 + store i8* %v, ptr %out, align 8 + ret void +} + +; 2*64k+1024, the largest frame needing two probes. +define void @static_132096(ptr %out) #0 { +; CHECK-LABEL: static_132096: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 131088 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 132112 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #32, lsl #12 // =131072 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 132096, align 1 + store i8* %v, ptr %out, align 8 + ret void +} + +; 5*64k-16, the largest frame probed without a loop. +define void @static_327664(ptr %out) #0 { +; CHECK-LABEL: static_327664: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 65552 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 131088 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 196624 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: .cfi_def_cfa_offset 262160 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #15, lsl #12 // =61440 +; CHECK-NEXT: .cfi_def_cfa_offset 323600 +; CHECK-NEXT: sub sp, sp, #4080 +; CHECK-NEXT: .cfi_def_cfa_offset 327680 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #79, lsl #12 // =323584 +; CHECK-NEXT: .cfi_def_cfa_offset 4096 +; CHECK-NEXT: add sp, sp, #4080 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 327664, align 1 + store i8* %v, ptr %out, align 8 + ret void +} + +; 5*64k, smallest frame probed with a loop. +define void @static_327680(ptr %out) #0 { +; CHECK-LABEL: static_327680: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680 +; CHECK-NEXT: .cfi_def_cfa w9, 327696 +; CHECK-NEXT: .LBB6_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB6_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 327680, align 1 + store i8* %v, ptr %out, align 8 + ret void +} + +; 5*64k+1024, large enough to use a loop, but not a multiple of 64KiB +; so has a reminder, but no extra probe. +define void @static_328704(ptr %out) #0 { +; CHECK-LABEL: static_328704: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680 +; CHECK-NEXT: .cfi_def_cfa w9, 327696 +; CHECK-NEXT: .LBB7_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB7_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 328720 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 328704, align 1 + store i8* %v, ptr %out, align 8 + ret void +} + +; 5*64k+1040, large enough to use a loop, has a reminder and +; an extra probe. +define void @static_328720(ptr %out) #0 { +; CHECK-LABEL: static_328720: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680 +; CHECK-NEXT: .cfi_def_cfa w9, 327696 +; CHECK-NEXT: .LBB8_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB8_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 328736 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 328720, align 1 + store i8* %v, ptr %out, align 8 + ret void +} + +; A small allocation, but with a very large alignment requirement. We do this +; by moving SP far enough that a sufficiently-aligned block will exist +; somewhere in the stack frame, so must probe the whole of that larger SP move. +define void @static_16_align_131072(ptr %out) #0 { +; CHECK-LABEL: static_16_align_131072: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #31, lsl #12 // =126976 +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and x9, x9, #0xfffffffffffe0000 +; CHECK-NEXT: .LBB9_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB9_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB9_1 +; CHECK-NEXT: .LBB9_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 16, align 131072 + store i8* %v, ptr %out, align 8 + ret void +} + +; A small allocation, but with a very large alignment requirement which +; is nevertheless small enough as to not need a loop. +define void @static_16_align_8192(ptr %out) #0 { +; CHECK-LABEL: static_16_align_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and sp, x9, #0xffffffffffffe000 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 16, align 8192 + store i8* %v, ptr %out, align 8 + ret void +} + +; A large allocation with a very large alignment requirement which +; is nevertheless small enough as to not need a loop. +define void @static_32752_align_32k(ptr %out) #0 { +; CHECK-LABEL: static_32752_align_32k: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #7, lsl #12 // =28672 +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and sp, x9, #0xffffffffffff8000 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 32752, align 32768 + store i8* %v, ptr %out, align 8 + ret void +} + +attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" } \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll new file mode 100644 index 0000000000000000000000000000000000000000..673f9038a35fe6afcc1b0c5247b92b4be9afc344 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll @@ -0,0 +1,14 @@ +; RUN: llc --stop-after=finalize-isel -o - | FileCheck %s +target triple = "aarch64-linux" + +; Check dynamic stack allocation and probing instructions do not have +; the FrameSetup flag. + +; CHECK-NOT: frame-setup +define void @no_frame_setup(i64 %size, ptr %out) #0 { + %v = alloca i8, i64 %size, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" } \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll new file mode 100644 index 0000000000000000000000000000000000000000..3cbcf7749b2ae977346da9e196f26081c33472d5 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll @@ -0,0 +1,365 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s + +; Dynamically-sized allocation, needs a loop which can handle any size at +; runtime. The final iteration of the loop will temporarily put SP below the +; target address, but this doesn't break any of the ABI constraints on the +; stack, and also doesn't probe below the target SP value. +define void @dynamic(i64 %size, ptr %out) #0 { +; CHECK-LABEL: dynamic: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB0_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB0_1 +; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + %v = alloca i8, i64 %size, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; This function has a fixed-size stack slot and a dynamic one. The fixed size +; slot isn't large enough that we would normally probe it, but we need to do so +; here otherwise the gap between the CSR save and the first probe of the +; dynamic allocation could be too far apart when the size of the dynamic +; allocation is close to the guard size. +define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 { +; CHECK-LABEL: dynamic_fixed: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: str xzr, [sp, #-64]! +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x10, x29, #64 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: str x10, [x1] +; CHECK-NEXT: .LBB1_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB1_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB1_1 +; CHECK-NEXT: .LBB1_3: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str x8, [x2] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + %v1 = alloca i8, i64 64, align 1 + store ptr %v1, ptr %out1, align 8 + %v2 = alloca i8, i64 %size, align 1 + store ptr %v2, ptr %out2, align 8 + ret void +} + +; Dynamic allocation, with an alignment requirement greater than the alignment +; of SP. Done by ANDing the target SP with a constant to align it down, then +; doing the loop as normal. Note that we also re-align the stack in the prolog, +; which isn't actually needed because the only aligned allocations are dynamic, +; this is done even without stack probing. +define void @dynamic_align_64(i64 %size, ptr %out) #0 { +; CHECK-LABEL: dynamic_align_64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: sub x9, sp, #32 +; CHECK-NEXT: and sp, x9, #0xffffffffffffffc0 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: and x8, x8, #0xffffffffffffffc0 +; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB2_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB2_1 +; CHECK-NEXT: .LBB2_3: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 32 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + %v = alloca i8, i64 %size, align 64 + store ptr %v, ptr %out, align 8 + ret void +} + +; Dynamic allocation, with an alignment greater than the stack guard size. The +; only difference to the dynamic allocation is the constant used for aligning +; the target SP, the loop will probe the whole allocation without needing to +; know about the alignment padding. +define void @dynamic_align_8192(i64 %size, ptr %out) #0 { +; CHECK-LABEL: dynamic_align_8192: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4064 +; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 +; CHECK-NEXT: .LBB3_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB3_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB3_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB3_1 +; CHECK-NEXT: .LBB3_3: +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: and x8, x8, #0xffffffffffffe000 +; CHECK-NEXT: .LBB3_4: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB3_6 +; CHECK-NEXT: // %bb.5: // in Loop: Header=BB3_4 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB3_4 +; CHECK-NEXT: .LBB3_6: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 32 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + %v = alloca i8, i64 %size, align 8192 + store ptr %v, ptr %out, align 8 + ret void +} + +; For 64k guard pages, the only difference is the constant subtracted from SP +; in the loop. +define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536" { +; CHECK-LABEL: dynamic_64k_guard: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: .LBB4_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB4_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB4_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB4_1 +; CHECK-NEXT: .LBB4_3: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + %v = alloca i8, i64 %size, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; If a function has variable-sized stack objects, then any function calls which +; need to pass arguments on the stack must allocate the stack space for them +; dynamically, to ensure they are at the bottom of the frame. We need to probe +; that space when it is larger than the unprobed space allowed by the ABI (1024 +; bytes), so this needs a very large number of arguments. 
+define void @no_reserved_call_frame(i64 %n) #0 { +; CHECK-LABEL: no_reserved_call_frame: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: lsl x9, x0, #2 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x0, x8, x9 +; CHECK-NEXT: .LBB5_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x0 +; CHECK-NEXT: b.le .LBB5_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB5_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB5_1 +; CHECK-NEXT: .LBB5_3: // %entry +; CHECK-NEXT: mov sp, x0 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1104 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1104 +; CHECK-NEXT: bl callee_stack_args +; CHECK-NEXT: add sp, sp, #1104 +; CHECK-NEXT: add sp, sp, #1104 +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i32, i64 %n + call void @callee_stack_args(ptr %v, [138 x i64] undef) + ret void +} + +; Same as above but without a variable-sized allocation, so the reserved call +; frame can be folded into the fixed-size allocation in the prologue. +define void @reserved_call_frame(i64 %n) #0 { +; CHECK-LABEL: reserved_call_frame: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: str x28, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w28, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: sub sp, sp, #1504 +; CHECK-NEXT: add x0, sp, #1104 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: bl callee_stack_args +; CHECK-NEXT: add sp, sp, #1504 +; CHECK-NEXT: .cfi_def_cfa wsp, 32 +; CHECK-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i32, i64 100 + call void @callee_stack_args(ptr %v, [138 x i64] undef) + ret void +} + +declare void @callee_stack_args(ptr, [138 x i64]) + +; Dynamic allocation of SVE vectors +define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" { +; CHECK-LABEL: dynamic_sve: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdvl x9, #1 +; CHECK-NEXT: mov x10, #15 // =0xf +; CHECK-NEXT: madd x9, x0, x9, x10 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB7_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB7_1 +; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 32 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + %v = alloca , i64 %size, align 16 + store ptr %v, ptr %out, align 8 + ret void +} + +attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" } \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir new file mode 100644 index 0000000000000000000000000000000000000000..9a173be5857ed73bb3efd709008160b7c3243885 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir @@ -0,0 +1,144 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -run-pass=prologepilog %s -o - | FileCheck %s +# Regression test for a crash when the probing instruction +# to replace is last in the block. +--- | + source_filename = "tt.ll" + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-linux" + + declare i1 @g(ptr) + + define void @f(ptr %out) #0 { + entry: + %p = alloca i32, i32 50000, align 4 + br label %loop + + loop: ; preds = %loop, %entry + %c = call i1 @g(ptr %p) + br i1 %c, label %loop, label %exit + + exit: ; preds = %loop + ret void + } + + attributes #0 = { uwtable "frame-pointer"="none" "probe-stack"="inline-asm" "target-features"="+sve" } + +... 
+--- +name: f +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: true +registers: [] +liveins: [] +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: true + hasCalls: true + stackProtector: '' + functionContext: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 200000 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: p, type: default, offset: 0, size: 200000, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + local-offset: -200000, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: f + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $lr, $fp + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.2), (store (s64) into %stack.1) + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 + ; CHECK-NEXT: $x9 = frame-setup SUBXri $sp, 48, 12 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w9, 196624 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.entry: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: liveins: $x9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1, 12 + ; CHECK-NEXT: frame-setup STRXui $xzr, $sp, 0 + ; CHECK-NEXT: $xzr = frame-setup SUBSXrx64 $sp, $x9, 24, implicit-def $nzcv + ; CHECK-NEXT: frame-setup Bcc 1, %bb.3, implicit $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $wsp + ; CHECK-NEXT: $sp = frame-setup SUBXri $sp, 3392, 0 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 200016 + ; CHECK-NEXT: frame-setup STRXui $xzr, $sp, 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.loop: + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x0 = ADDXri $sp, 0, 0 + ; CHECK-NEXT: BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 + ; CHECK-NEXT: TBNZW killed renamable $w0, 0, %bb.1 + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.exit: + ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 48, 12 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 3408 + ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 3392, 0 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 + ; CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.1) + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION 
restore $w30 + ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 + ; CHECK-NEXT: RET_ReallyLR + bb.0.entry: + successors: %bb.1(0x80000000) + + + bb.1.loop: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + + ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp + $x0 = ADDXri %stack.0.p, 0, 0 + BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0 + ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + TBNZW killed renamable $w0, 0, %bb.1 + B %bb.2 + + bb.2.exit: + RET_ReallyLR \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll new file mode 100644 index 0000000000000000000000000000000000000000..e765d071e722041a80fbe72955d5eb5c4728a815 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll @@ -0,0 +1,724 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s + +; Test prolog sequences for stack probing when SVE objects are involved. + +; The space for SVE objects needs probing in the general case, because +; the stack adjustment may happen to be too big (i.e. greater than the +; probe size) to allocate with a single `addvl`. +; When we do know that the stack adjustment cannot exceed the probe size +; we can avoid emitting a probe loop and emit a simple `addvl; str` +; sequence instead. + +define void @sve_1_vector(ptr %out) #0 { +; CHECK-LABEL: sve_1_vector: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + ret void +} + +; As above, but with 4 SVE vectors of stack space. +define void @sve_4_vector(ptr %out) #0 { +; CHECK-LABEL: sve_4_vector: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec1 = alloca , align 16 + %vec2 = alloca , align 16 + %vec3 = alloca , align 16 + %vec4 = alloca , align 16 + ret void +} + +; As above, but with 16 SVE vectors of stack space. +; The stack adjustment is less than or equal to 16 x 256 = 4096, so +; we can allocate the locals at once. +define void @sve_16_vector(ptr %out) #0 { +; CHECK-LABEL: sve_16_vector: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-16 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: addvl sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec1 = alloca , align 16 + %vec2 = alloca , align 16 + %vec3 = alloca , align 16 + %vec4 = alloca , align 16 + %vec5 = alloca , align 16 + %vec6 = alloca , align 16 + %vec7 = alloca , align 16 + %vec8 = alloca , align 16 + %vec9 = alloca , align 16 + %vec10 = alloca , align 16 + %vec11 = alloca , align 16 + %vec12 = alloca , align 16 + %vec13 = alloca , align 16 + %vec14 = alloca , align 16 + %vec15 = alloca , align 16 + %vec16 = alloca , align 16 + ret void +} + +; As above, but with 17 SVE vectors of stack space. Now we need +; a probing loop, since the stack adjustment may be greater than +; the probe size (17 x 256 = 4352 bytes). +; TODO: Allocating `k*16+r` SVE vectors can be unrolled into +; emitting the `k + r` sequences of `addvl sp, sp, #-N; str xzr, [sp]` +define void @sve_17_vector(ptr %out) #0 { +; CHECK-LABEL: sve_17_vector: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl x9, sp, #-17 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG +; CHECK-NEXT: .LBB3_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB3_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB3_1 +; CHECK-NEXT: .LBB3_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec1 = alloca , align 16 + %vec2 = alloca , align 16 + %vec3 = alloca , align 16 + %vec4 = alloca , align 16 + %vec5 = alloca , align 16 + %vec6 = alloca , align 16 + %vec7 = alloca , align 16 + %vec8 = alloca , align 16 + %vec9 = alloca , align 16 + %vec10 = alloca , align 16 + %vec11 = alloca , align 16 + %vec12 = alloca , align 16 + %vec13 = alloca , align 16 + %vec14 = alloca , align 16 + %vec15 = alloca , align 16 + %vec16 = alloca , align 16 + %vec17 = alloca , align 16 + ret void +} + +; Space for callee-saved SVE registers is allocated similarly to allocating +; space for SVE locals. When we know the stack adjustment cannot exceed the +; probe size we can skip the explicit probe, since saving SVE registers serves +; as an implicit probe. +define void @sve_1v_csr( %a) #0 { +; CHECK-LABEL: sve_1v_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: .cfi_restore z8 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{z8}" () + ret void +} + +define void @sve_4v_csr( %a) #0 { +; CHECK-LABEL: sve_4v_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; CHECK-NEXT: str z11, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr z11, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: .cfi_restore z8 +; CHECK-NEXT: .cfi_restore z9 +; CHECK-NEXT: .cfi_restore z10 +; CHECK-NEXT: .cfi_restore z11 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11}" () + ret void +} + +define void @sve_16v_csr( %a) #0 { +; CHECK-LABEL: sve_16v_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-16 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 
+; CHECK-NEXT: .cfi_restore z8 +; CHECK-NEXT: .cfi_restore z9 +; CHECK-NEXT: .cfi_restore z10 +; CHECK-NEXT: .cfi_restore z11 +; CHECK-NEXT: .cfi_restore z12 +; CHECK-NEXT: .cfi_restore z13 +; CHECK-NEXT: .cfi_restore z14 +; CHECK-NEXT: .cfi_restore z15 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" () + ret void +} + +define void @sve_1p_csr( %a) #0 { +; CHECK-LABEL: sve_1p_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{p8}" () + ret void +} + +define void @sve_4p_csr( %a) #0 { +; CHECK-LABEL: sve_4p_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str p11, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p11, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{p8},~{p9},~{p10},~{p11}" () + ret void +} + +define void @sve_16v_1p_csr( %a) #0 { +; CHECK-LABEL: sve_16v_1p_csr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl x9, sp, #-17 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG +; CHECK-NEXT: .LBB9_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB9_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB9_1 +; CHECK-NEXT: .LBB9_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, 
[sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: .cfi_restore z8 +; CHECK-NEXT: .cfi_restore z9 +; CHECK-NEXT: .cfi_restore z10 +; CHECK-NEXT: .cfi_restore z11 +; CHECK-NEXT: .cfi_restore z12 +; CHECK-NEXT: .cfi_restore z13 +; CHECK-NEXT: .cfi_restore z14 +; CHECK-NEXT: .cfi_restore z15 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{p8},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" () + ret void +} + +; A SVE vector and a 16-byte fixed size object. +define void @sve_1_vector_16_arr(ptr %out) #0 { +; CHECK-LABEL: sve_1_vector_16_arr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: .cfi_def_cfa wsp, 32 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 16, align 1 + ret void +} + +; A large SVE stack object and a large stack slot, both of which need probing. +; TODO: This could be optimised by combining the fixed-size offset into the +; loop. +define void @sve_1_vector_4096_arr(ptr %out) #0 { +; CHECK-LABEL: sve_1_vector_4096_arr: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: .cfi_def_cfa w9, 12304 +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 256 * VG +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 512 * VG +; CHECK-NEXT: .LBB11_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB11_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB11_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB11_1 +; CHECK-NEXT: .LBB11_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x88, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 264 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 16 * VG +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: .cfi_def_cfa wsp, 12304 +; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 12288, align 1 + ret void +} + +; Not tested: SVE stack objects with alignment >16 bytes, which isn't currently +; supported even without stack-probing. + +; An SVE vector, and a 16-byte fixed size object, which +; has a large alignment requirement. +define void @sve_1_vector_16_arr_align_8192(ptr %out) #0 { +; CHECK-LABEL: sve_1_vector_16_arr_align_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: addvl x9, x9, #-1 +; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 +; CHECK-NEXT: .LBB12_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB12_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB12_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB12_1 +; CHECK-NEXT: .LBB12_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %arr = alloca i8, i64 16, align 8192 + ret void +} + +; With 64k guard pages, we can allocate bigger SVE space without a probing loop. +define void @sve_1024_64k_guard(ptr %out) #0 "stack-probe-size"="65536" { +; CHECK-LABEL: sve_1024_64k_guard: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 256 * VG +; CHECK-NEXT: addvl sp, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 512 * VG +; CHECK-NEXT: addvl sp, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 768 * VG +; CHECK-NEXT: addvl sp, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1024 * VG +; CHECK-NEXT: addvl sp, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1280 * VG +; CHECK-NEXT: addvl sp, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1536 * VG +; CHECK-NEXT: addvl sp, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1792 * VG +; CHECK-NEXT: addvl sp, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 2048 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1800 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1552 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1304 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1056 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 808 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 560 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 312 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG +; CHECK-NEXT: addvl sp, sp, #8 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + ret void +} + +define void @sve_1028_64k_guard(ptr %out) #0 "stack-probe-size"="65536" { +; CHECK-LABEL: sve_1028_64k_guard: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl x9, sp, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 256 * VG +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 512 * VG +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 768 * VG +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1024 * VG +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1280 * VG +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1536 * VG +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1792 * VG +; CHECK-NEXT: addvl x9, x9, #-32 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2048 * VG +; CHECK-NEXT: addvl x9, x9, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2056 * VG +; CHECK-NEXT: .LBB14_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB14_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB14_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB14_1 +; CHECK-NEXT: .LBB14_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1808 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1560 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1312 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1064 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 816 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 568 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 320 * VG +; CHECK-NEXT: addvl sp, sp, #31 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG +; CHECK-NEXT: addvl sp, sp, #9 +; CHECK-NEXT: .cfi_def_cfa 
wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec = alloca , align 16 + %vec1 = alloca , align 16 + ret void +} + +; With 5 SVE vectors of stack space the unprobed area +; at the top of the stack can exceed 1024 bytes (5 x 256 == 1280), +; hence we need to issue a probe. +define void @sve_5_vector(ptr %out) #0 { +; CHECK-LABEL: sve_5_vector: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-5 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 40 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: addvl sp, sp, #5 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %vec1 = alloca , align 16 + %vec2 = alloca , align 16 + %vec3 = alloca , align 16 + %vec4 = alloca , align 16 + %vec5 = alloca , align 16 + ret void +} + +; Test with 14 scalable bytes (so up to 14 * 16 = 224) of unprobed +; area below the save location of `p9`. +define void @sve_unprobed_area( %a, i32 %n) #0 { +; CHECK-LABEL: sve_unprobed_area: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG +; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #4 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: .cfi_restore z8 +; CHECK-NEXT: .cfi_restore z9 +; CHECK-NEXT: .cfi_restore z10 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + call void asm sideeffect "", "~{z8},~{z9},~{z10},~{p9}" () + + %v0 = alloca , align 16 + %v1 = alloca , align 16 + %v2 = alloca , align 16 + %v3 = alloca , align 16 + + ret void +} + +attributes #0 
= { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" } \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/stack-probing.ll b/llvm/test/CodeGen/AArch64/stack-probing.ll new file mode 100644 index 0000000000000000000000000000000000000000..95001450622f433a2a5ae05f330d5789ff8f58c5 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/stack-probing.ll @@ -0,0 +1,539 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s +; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s + +; Tests for prolog sequences for stack probing, when using a 4KiB stack guard. + +; The stack probing parameters in function attributes take precedence over +; ones in the module flags. + +; Small stack frame, no probing required. +define void @static_64(ptr %out) #0 { +; CHECK-LABEL: static_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #64 +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 64, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; At 256 bytes we start to always create a frame pointer. No frame smaller than +; this needs a probe, so we can use the saving of at least one CSR as a probe +; at the top of our frame. +define void @static_256(ptr %out) #0 { +; CHECK-LABEL: static_256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #272 +; CHECK-NEXT: .cfi_def_cfa_offset 272 +; CHECK-NEXT: str x29, [sp, #256] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #272 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 256, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; At 1024 bytes, this is the largest frame which doesn't need probing. +define void @static_1024(ptr %out) #0 { +; CHECK-LABEL: static_1024: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 1024, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; At 1024+16 bytes, this is the smallest frame which needs probing. +define void @static_1040(ptr %out) #0 { +; CHECK-LABEL: static_1040: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 1040, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; 4k bytes is the largest frame we can probe in one go. +define void @static_4096(ptr %out) #0 { +; CHECK-LABEL: static_4096: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 4096, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; 4k+16 bytes, still needs just one probe. +define void @static_4112(ptr %out) #0 { +; CHECK-LABEL: static_4112: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp], #-16 +; CHECK-NEXT: .cfi_def_cfa_offset 4128 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 4112, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; 4k+1024 bytes, the largest frame which needs just one probe. +define void @static_5120(ptr %out) #0 { +; CHECK-LABEL: static_5120: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 5136 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 5120, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; 4k+1024+16, the smallest frame which needs two probes. +define void @static_5136(ptr %out) #0 { +; CHECK-LABEL: static_5136: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 5152 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 5136, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; 2*4k+1024, the largest frame needing two probes +define void @static_9216(ptr %out) #0 { +; CHECK-LABEL: static_9216: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 8208 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 9232 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 9216, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; 5*4k-16, the largest frame probed without a loop +define void @static_20464(ptr %out) #0 { +; CHECK-LABEL: static_20464: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 4112 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 8208 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 12304 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: .cfi_def_cfa_offset 16400 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #4080 +; CHECK-NEXT: .cfi_def_cfa_offset 20480 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #4, lsl #12 // =16384 +; CHECK-NEXT: .cfi_def_cfa_offset 4096 +; CHECK-NEXT: add sp, sp, #4080 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 20464, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; 5*4k, the smallest frame probed with a loop +define void @static_20480(ptr %out) #0 { +; CHECK-LABEL: static_20480: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480 +; CHECK-NEXT: .cfi_def_cfa w9, 20496 +; CHECK-NEXT: .LBB10_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB10_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 20480, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; 5*4k + 1024, large enough to use a loop, but not a multiple of 4KiB +; so has a remainder, but no extra probe. +define void @static_21504(ptr %out) #0 { +; CHECK-LABEL: static_21504: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480 +; CHECK-NEXT: .cfi_def_cfa w9, 20496 +; CHECK-NEXT: .LBB11_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB11_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 21520 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480 +; CHECK-NEXT: .cfi_def_cfa_offset 1040 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 21504, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; 5*4k+1040, large enough to use a loop, has a remainder and +; an extra probe. +define void @static_21520(ptr %out) #0 { +; CHECK-LABEL: static_21520: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #5, lsl #12 // =20480 +; CHECK-NEXT: .cfi_def_cfa w9, 20496 +; CHECK-NEXT: .LBB12_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.ne .LBB12_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: .cfi_def_cfa_register wsp +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 21536 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #5, lsl #12 // =20480 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 21520, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; A small allocation, but with a very large alignment requirement. 
We do this +; by moving SP far enough that a sufficiently-aligned block will exist +; somewhere in the stack frame, so must probe the whole of that larger SP move. +define void @static_16_align_8192(ptr %out) #0 { +; CHECK-LABEL: static_16_align_8192: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: sub x9, x9, #4080 +; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 +; CHECK-NEXT: .LBB13_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 +; CHECK-NEXT: cmp sp, x9 +; CHECK-NEXT: b.le .LBB13_3 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: // in Loop: Header=BB13_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB13_1 +; CHECK-NEXT: .LBB13_3: // %entry +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 16, align 8192 + store ptr %v, ptr %out, align 8 + ret void +} + +; A small allocation with a very large alignment requirement, but +; nevertheless small enough as to not need a loop. +define void @static_16_align_2048(ptr %out) #0 { +; CHECK-LABEL: static_16_align_2048: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #2032 +; CHECK-NEXT: and sp, x9, #0xfffffffffffff800 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 16, align 2048 + store ptr %v, ptr %out, align 8 + ret void +} + +; A large(-ish) allocation with a very large alignment requirement, but +; nevertheless small enough as to not need a loop. +define void @static_2032_align_2048(ptr %out) #0 { +; CHECK-LABEL: static_2032_align_2048: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #2032 +; CHECK-NEXT: and sp, x9, #0xfffffffffffff800 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 2032, align 2048 + store ptr %v, ptr %out, align 8 + ret void +} + +; Test stack probing is enabled by module flags +define void @static_9232(ptr %out) uwtable(async) { +; CHECK-LABEL: static_9232: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #2, lsl #12 // =8192 +; CHECK-NEXT: .cfi_def_cfa_offset 8208 +; CHECK-NEXT: sub sp, sp, #800 +; CHECK-NEXT: .cfi_def_cfa_offset 9008 +; CHECK-NEXT: str xzr, [sp], #-240 +; CHECK-NEXT: .cfi_def_cfa_offset 9248 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: add sp, sp, #2, lsl #12 // =8192 +; CHECK-NEXT: .cfi_def_cfa_offset 1056 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i64 9232, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; Test for a tight upper bound on the amount of stack adjustment +; due to stack realignment. No probes should appear. +define void @static_1008(ptr %out) #0 { +; CHECK-LABEL: static_1008: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x9, sp, #1008 +; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + %v = alloca i8, i32 1008, align 32 + store ptr %v, ptr %out, align 8 + ret void +} + +attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" } + +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 4, !"probe-stack", !"inline-asm"} +!1 = !{i32 8, !"stack-probe-size", i32 9000} \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll index a97649523565dc0f3fdb7053ee49cbdb5b68f68d..235364ac232183c53157ad6121fb9f1747949664 100644 --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -56,8 +56,8 @@ define float @foo2(ptr %x0, ptr %x1) nounwind { ; CHECK-LABEL: foo2: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ld4d { z1.d - z4.d }, p0/z, [x0] @@ -699,8 +699,8 @@ define void @verify_all_operands_are_initialised() { ; CHECK-LABEL: verify_all_operands_are_initialised: ; CHECK: // %bb.0: ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll index 31ff9287046cd5bc3ad89ea56a95610b1773d95c..b3529549c22b9cd0d19597687bbc6cc78a7c26ae 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll @@ -9,8 +9,8 @@ define void @fcvt_v4f64_v4f128(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: add x8, sp, #48 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] @@ -59,8 +59,8 @@ define void @fcvt_v4f128_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill ; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: sub sp, sp, #128 +; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: ldr q1, [x0, #64] ; CHECK-NEXT: mov x19, x1 ; CHECK-NEXT: ldr q0, [x0, #80] diff --git a/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll new file mode 100644 index 0000000000000000000000000000000000000000..8f665924577f5bcb6a5d530c360cb2967cf0b9c8 --- /dev/null +++ b/llvm/test/CodeGen/X86/GlobalISel/stacksave-stackrestore.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -global-isel=1 -mtriple=x86_64-linux-gnu -o - %s | FileCheck %s + +declare void @use_addr(ptr) +declare ptr @llvm.stacksave.p0() +declare void @llvm.stackrestore.p0(ptr) + +define void @test_scoped_alloca(i64 %n) { +; CHECK-LABEL: test_scoped_alloca: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_offset %rbx, -32 +; CHECK-NEXT: .cfi_offset %r14, -24 +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: callq llvm.stacksave.p0 +; CHECK-NEXT: movq %rax, %r14 +; CHECK-NEXT: movq %rsp, %rdi +; CHECK-NEXT: imulq $1, %rbx, %rax +; CHECK-NEXT: addq $15, %rax +; CHECK-NEXT: andq $-16, %rax +; CHECK-NEXT: subq %rax, %rdi +; CHECK-NEXT: movq %rdi, %rsp +; CHECK-NEXT: callq use_addr +; CHECK-NEXT: movq %r14, %rdi +; CHECK-NEXT: callq llvm.stackrestore.p0 +; CHECK-NEXT: leaq -16(%rbp), %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq + %sp = call ptr @llvm.stacksave.p0() + %addr = alloca i8, i64 %n + call void 
@use_addr(ptr %addr) + call void @llvm.stackrestore.p0(ptr %sp) + ret void +}
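
For orientation only, the sketch below is not part of the patch; the file name, flags, and function names (use_buffer, large_frame, scoped_vla) are illustrative. It shows the kind of C source that exercises both sides of this change: a large fixed-size frame that the prologue is expected to probe page by page when built with -fstack-clash-protection, and a VLA in a nested scope, which clang lowers through llvm.stacksave/llvm.stackrestore, the intrinsics now mapped to G_STACKSAVE/G_STACKRESTORE in GlobalISel.

/* Illustrative C sketch, not part of the patch. Assumes a build such as
 *   clang -O0 -fstack-clash-protection demo.c -S -emit-llvm
 * where "demo.c" and the exact flag set are examples only. */

void use_buffer(volatile char *p); /* hypothetical external consumer */

/* Large fixed-size local: with stack clash protection enabled, the AArch64
 * prologue is expected to probe each 4KiB page instead of one big SP move. */
void large_frame(void) {
  volatile char buf[20480];
  buf[0] = 0;
  use_buffer(buf);
}

/* VLA in a nested scope: clang saves the stack pointer before the dynamic
 * alloca and restores it when the block ends, i.e. it emits a
 * llvm.stacksave / llvm.stackrestore pair around the allocation. */
void scoped_vla(int n) {
  {
    volatile char vla[n];
    use_buffer(vla);
  }
}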