diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 28ccef34d8fc74a28edaab7752319882e1523e12..b7b5bc247899732530eb8fa6cb6d6008c45aaecf 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -545,6 +545,12 @@ option(LLVM_BUILD_RUNTIME "Build the LLVM runtime libraries." ON) option(LLVM_BUILD_EXAMPLES "Build the LLVM example programs. If OFF, just generate build targets." OFF) +option(BUILD_ARK_GC_SUPPORT + "ARK support GC. If ON, support GC." OFF) +if(BUILD_ARK_GC_SUPPORT) + add_definitions(-DARK_GC_SUPPORT) +endif(BUILD_ARK_GC_SUPPORT) + option(LLVM_INCLUDE_EXAMPLES "Generate build targets for the LLVM examples" ON) if(LLVM_BUILD_EXAMPLES) diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 2901ab715810fa43b9a6604db41b94bd909a4fda..74fcd1621856748904565f69838319d9efd5e54f 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -3946,6 +3946,12 @@ LLVMValueRef LLVMBuildCall(LLVMBuilderRef, LLVMValueRef Fn, LLVMValueRef LLVMBuildCall2(LLVMBuilderRef, LLVMTypeRef, LLVMValueRef Fn, LLVMValueRef *Args, unsigned NumArgs, const char *Name); +#ifdef ARK_GC_SUPPORT +LLVMValueRef LLVMBuildCall3(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Fn, + LLVMValueRef *Args, unsigned NumArgs, + const char *Name, LLVMValueRef *deoptVals, + int NumVals); +#endif LLVMValueRef LLVMBuildSelect(LLVMBuilderRef, LLVMValueRef If, LLVMValueRef Then, LLVMValueRef Else, const char *Name); diff --git a/llvm/include/llvm-c/ExecutionEngine.h b/llvm/include/llvm-c/ExecutionEngine.h index c5fc9bdb4d07f62462c65924e6ae8faf75748dec..ccd4e5164165ef040e3ff07957d79fa2d2877ce9 100644 --- a/llvm/include/llvm-c/ExecutionEngine.h +++ b/llvm/include/llvm-c/ExecutionEngine.h @@ -42,6 +42,9 @@ typedef struct LLVMOpaqueMCJITMemoryManager *LLVMMCJITMemoryManagerRef; struct LLVMMCJITCompilerOptions { unsigned OptLevel; +#ifdef ARK_GC_SUPPORT + LLVMRelocMode RelMode; +#endif LLVMCodeModel CodeModel; LLVMBool NoFramePointerElim; LLVMBool EnableFastISel; diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index 22d819032cde12105287aec8891a9b3b0cd4dfb6..ae107867087b8d7ea35ba93225e537052dfe7021 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -17,6 +17,9 @@ #include "llvm/CodeGen/StackProtectorRetLowering.h" #include "llvm/Support/TypeSize.h" #include +#ifdef ARK_GC_SUPPORT +#include "llvm/ADT/Triple.h" +#endif namespace llvm { class BitVector; @@ -209,6 +212,27 @@ public: MachineBasicBlock &MBB) const = 0; virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const = 0; +#ifdef ARK_GC_SUPPORT + +template +constexpr T RoundUp(T x, size_t n) const +{ + static_assert(std::is_integral::value, "T must be integral"); + return (static_cast(x) + n - 1U) & (-n); +} + +virtual Triple::ArchType GetArkSupportTarget() const +{ + return Triple::UnknownArch; +} + +virtual int GetFixedFpPosition() const +{ + return 2; +} + +virtual int GetFrameReserveSize(MachineFunction &MF) const; +#endif virtual const StackProtectorRetLowering *getStackProtectorRet() const { return nullptr; diff --git a/llvm/include/llvm/Target/CodeGenCWrappers.h b/llvm/include/llvm/Target/CodeGenCWrappers.h index a995463570535d04ccb0c378639c076760b88c73..5929c7efe2126d35ce3b07117042c4745e5d96cb 100644 --- a/llvm/include/llvm/Target/CodeGenCWrappers.h +++ b/llvm/include/llvm/Target/CodeGenCWrappers.h @@ -59,6 +59,37 @@ inline LLVMCodeModel wrap(CodeModel::Model Model) { } 
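The RelMode field added to LLVMMCJITCompilerOptions above is translated through the LLVMRelocMode/Reloc::Model helpers being added to CodeGenCWrappers.h here and is honoured by LLVMCreateMCJITCompilerForModule (see the ExecutionEngineBindings.cpp hunk below). A minimal usage sketch, assuming an existing LLVMModuleRef module and a build configured with BUILD_ARK_GC_SUPPORT=ON:

LLVMMCJITCompilerOptions options;
LLVMInitializeMCJITCompilerOptions(&options, sizeof(options));
options.OptLevel = 2;
options.RelMode = LLVMRelocPIC;   /* new field, only present under ARK_GC_SUPPORT */

LLVMExecutionEngineRef engine;
char *error = NULL;
if (LLVMCreateMCJITCompilerForModule(&engine, module, &options,
                                     sizeof(options), &error)) {
  /* creation failed; error holds the message and should be freed
     with LLVMDisposeMessage */
}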
llvm_unreachable("Bad CodeModel!"); } + +#ifdef ARK_GC_SUPPORT +inline Reloc::Model unwrap(LLVMRelocMode Model) { + switch (Model) { + case LLVMRelocDefault: + case LLVMRelocStatic: + return Reloc::Static; + case LLVMRelocPIC: + return Reloc::PIC_; + case LLVMRelocDynamicNoPic: + return Reloc::DynamicNoPIC; + } + llvm_unreachable("Invalid LLVMRelocMode!"); +} + +inline LLVMRelocMode unwrap(Reloc::Model Model) { + switch (Model) { + case Reloc::Static: + return LLVMRelocStatic; + case Reloc::PIC_: + return LLVMRelocPIC; + case Reloc::DynamicNoPIC: + return LLVMRelocDynamicNoPic; + case Reloc::ROPI: + case Reloc::RWPI: + case Reloc::ROPI_RWPI: + break; + } + llvm_unreachable("Invalid Reloc::Model!"); +} +#endif } // namespace llvm #endif diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 145d6ebff90c164dbd90a887b0d4b9511dd29452..41e11683e6688426c3e052401ad7e8a03f377808 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -70,6 +70,11 @@ #include #include +#ifdef ARK_GC_SUPPORT +#include +#include +#endif + using namespace llvm; #define DEBUG_TYPE "prologepilog" @@ -123,6 +128,9 @@ private: void calculateCallFrameInfo(MachineFunction &MF); void calculateSaveRestoreBlocks(MachineFunction &MF); +#ifdef ARK_GC_SUPPORT + void RecordCalleeSaveRegisterAndOffset(MachineFunction &MF, const std::vector &CSI); +#endif void spillCalleeSavedRegs(MachineFunction &MF); void calculateFrameObjectOffsets(MachineFunction &MF); @@ -301,6 +309,10 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) { RestoreBlocks.clear(); MFI.setSavePoint(nullptr); MFI.setRestorePoint(nullptr); +#ifdef ARK_GC_SUPPORT + std::vector &CSI = MFI.getCalleeSavedInfo(); + RecordCalleeSaveRegisterAndOffset(MF, CSI); +#endif return true; } @@ -617,6 +629,69 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock, } } +#ifdef ARK_GC_SUPPORT +void PEI::RecordCalleeSaveRegisterAndOffset(MachineFunction &MF, const std::vector &CSI) +{ + MachineModuleInfo &MMI = MF.getMMI(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); + Function &func = const_cast(MF.getFunction()); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + Triple::ArchType archType = TFI->GetArkSupportTarget(); + + if ((archType != Triple::aarch64 && archType != Triple::x86_64) || !(TFI->hasFP(MF))) { + return; + } + unsigned FpRegDwarfNum = 0; + if (archType == Triple::aarch64) { + FpRegDwarfNum = 29; // x29 + } else { + FpRegDwarfNum = 6; //rbp + } + int64_t FpOffset = 0; + int64_t deleta; + // nearest to rbp callee register + int64_t maxOffset = INT_MIN; + for (auto I : CSI) { + int64_t Offset = MFI.getObjectOffset(I.getFrameIdx()); + unsigned Reg = I.getReg(); + unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, true); + if (FpRegDwarfNum == DwarfRegNum) { + FpOffset = Offset; + } + maxOffset = std::max(Offset, maxOffset); + } + if (archType == Triple::x86_64) { + // rbp not existed in CSI + int64_t reseversize = TFI->GetFrameReserveSize(MF) + sizeof(uint64_t); // 1: rbp + deleta = maxOffset + reseversize; // nearest to rbp offset + } else { + deleta = FpOffset; + } + + const unsigned LinkRegDwarfNum = 30; + for (std::vector::const_iterator + I = CSI.begin(), E = CSI.end(); I != E; ++I) { + int64_t Offset = MFI.getObjectOffset(I->getFrameIdx()); + unsigned Reg = I->getReg(); + unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, true); + if ((DwarfRegNum == LinkRegDwarfNum || 
DwarfRegNum == FpRegDwarfNum) + && (archType == Triple::aarch64)) { + continue; + } + Offset = Offset - deleta; + std::string key = std::string("DwarfReg") + std::to_string(DwarfRegNum); + std::string value = std::to_string(Offset); + LLVM_DEBUG(dbgs() << "RecordCalleeSaveRegisterAndOffset DwarfRegNum :" + << DwarfRegNum << " key:" << key + << " value:" << value + << "]\n"); + Attribute attr = Attribute::get(func.getContext(), key.c_str(), value.c_str()); + func.addAttribute(AttributeList::FunctionIndex, attr); + } +} +#endif + void PEI::spillCalleeSavedRegs(MachineFunction &MF) { // We can't list this requirement in getRequiredProperties because some // targets (WebAssembly) use virtual registers past this point, and the pass @@ -905,6 +980,88 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { int64_t FixedCSEnd = Offset; Align MaxAlign = MFI.getMaxAlign(); +#ifdef ARK_GC_SUPPORT + int CalleeSavedFrameSize = 0; + Triple::ArchType archType = TFI.GetArkSupportTarget(); + if (archType == Triple::aarch64 && TFI.hasFP(MF)) { + int fpPosition = TFI.GetFixedFpPosition(); + int slotSize = sizeof(uint64_t); + int fpToCallerSpDelta = 0; + // 0:not exist +:count from head -:count from tail + // for x86-64 + // +--------------------------+ + // | caller Frame | + // +--------------------------+--- + // | returnAddr | ^ + // +--------------------------+ 2 slot(fpToCallerSpDelta) + // | Fp | V fpPosition = 2 + // +--------------------------+--- + // | type | + // +--------------------------+ + // | ReServeSize | + // +--------------------------+ + // | R14 | + // +--------------------------+ + // | R13 | + // +--------------------------+ + // | R12 | + // +--------------------------+ + // | RBX | + // +--------------------------+ + // for ARM64 + // +--------------------------+ + // | caller Frame | + // +--------------------------+--- + // | callee save registers | ^ + // | (exclude Fp) | | + // | | callee save registers size(fpToCallerSpDelta) + // +--------------------------+ | + // | Fp | V fpPosition = -1 + // +--------------------------+--- FixedCSEnd + // | type | + // +--------------------------+ + // | ReServeSize | + // +--------------------------+ + if (fpPosition >= 0) { + fpToCallerSpDelta = fpPosition * slotSize; + } else { + fpToCallerSpDelta = FixedCSEnd + (fpPosition + 1) * slotSize; + } + Function &func = const_cast(MF.getFunction()); + Attribute attr = Attribute::get(func.getContext(), "fpToCallerSpDelta", std::to_string(fpToCallerSpDelta).c_str()); + func.addAttribute(AttributeList::FunctionIndex, attr); + + CalleeSavedFrameSize = TFI.GetFrameReserveSize(MF); + Offset += CalleeSavedFrameSize; + } + + if ((archType == Triple::x86_64) && TFI.hasFP(MF)) { + // Determine which of the registers in the callee save list should be saved. 
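Each callee-saved register recorded above ends up as a string attribute of the form "DwarfReg<N>" -> "<offset from FP>" on the IR function, alongside the "fpToCallerSpDelta" attribute added just above in calculateFrameObjectOffsets. A hedged sketch of how a consumer could read those values back; the helper below is illustrative only and not part of this patch:

#include <map>
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"

// Sketch only: collect the offsets recorded by RecordCalleeSaveRegisterAndOffset.
// Attribute names follow the code above ("DwarfReg<N>", "fpToCallerSpDelta").
static std::map<unsigned, int64_t>
collectArkCalleeSaves(const llvm::Function &F, int64_t &fpToCallerSpDelta) {
  std::map<unsigned, int64_t> offsets;
  for (const llvm::Attribute &A : F.getAttributes().getFnAttributes()) {
    if (!A.isStringAttribute())
      continue;
    llvm::StringRef key = A.getKindAsString();
    if (key == "fpToCallerSpDelta") {
      A.getValueAsString().getAsInteger(10, fpToCallerSpDelta);
    } else if (key.consume_front("DwarfReg")) {
      unsigned dwarfReg = 0;
      int64_t offset = 0;
      if (!key.getAsInteger(10, dwarfReg) &&
          !A.getValueAsString().getAsInteger(10, offset))
        offsets[dwarfReg] = offset;  // offset is relative to the saved FP
    }
  }
  return offsets;
}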
+ int fpPosition = TFI.GetFixedFpPosition(); + int fpToCallerSpDelta = 0; + int slotSize = sizeof(uint64_t); + if (fpPosition >= 0) { + fpToCallerSpDelta = fpPosition * slotSize; + } else { + fpToCallerSpDelta = FixedCSEnd + (fpPosition + 1) * slotSize; + } + Function &func = const_cast(MF.getFunction()); + Attribute attr = Attribute::get(func.getContext(), "fpToCallerSpDelta", std::to_string(fpToCallerSpDelta).c_str()); + func.addAttribute(AttributeList::FunctionIndex, attr); + + CalleeSavedFrameSize = TFI.GetFrameReserveSize(MF); + std::vector &CSI = MFI.getCalleeSavedInfo(); + LLVM_DEBUG(dbgs() << " CSI size: " << CSI.size() << " CalleeSavedFrameSize " << CalleeSavedFrameSize << "\n"); + // if callee-saved is empty, the reserved-size can't be passed to the computation of local zone + // because the assignCalleeSavedSpillSlots() directly return. + // Otherwise, the reserved-size don't need to add to the computation of local zone because it has been considered + // while computing the offsets of callee-saved-zone that will be passed to the computation of local-zone + if (CSI.empty()) { + Offset += CalleeSavedFrameSize; + } + } +#endif + // Make sure the special register scavenging spill slot is closest to the // incoming stack pointer if a frame pointer is required and is closer // to the incoming rather than the final stack pointer. diff --git a/llvm/lib/CodeGen/StackMaps.cpp b/llvm/lib/CodeGen/StackMaps.cpp index faf07e90c39cce215ec749e71796375a259a1c85..5152bbfda8326d93e121343ff5f6b035c8b8bc3e 100644 --- a/llvm/lib/CodeGen/StackMaps.cpp +++ b/llvm/lib/CodeGen/StackMaps.cpp @@ -29,6 +29,9 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#ifdef ARK_GC_SUPPORT +#include "llvm/Target/TargetMachine.h" +#endif #include #include #include @@ -599,10 +602,11 @@ void StackMaps::emitFunctionFrameRecords(MCStreamer &OS) { // Function Frame records. 
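The loop below emits one frame record per stack-mapped function into the .llvm_stackmaps section; under ARK_GC_SUPPORT the function address is written with the target's program pointer size instead of a hard-coded 8 bytes. A layout sketch for a 64-bit target, where the record stays identical to the default stack map format:

// Per-function frame record as emitted below (64-bit target assumed, so
// getProgramPointerSize() == 8; on a 32-bit target FunctionAddress would be
// 4 bytes under ARK_GC_SUPPORT while the other two fields stay 8 bytes).
struct StackMapFunctionRecord {
  uint64_t FunctionAddress; // OS.emitSymbolValue(FR.first, ...)
  uint64_t StackSize;       // OS.emitIntValue(FR.second.StackSize, 8)
  uint64_t RecordCount;     // OS.emitIntValue(FR.second.RecordCount, 8)
};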
LLVM_DEBUG(dbgs() << WSMP << "functions:\n"); for (auto const &FR : FnInfos) { - LLVM_DEBUG(dbgs() << WSMP << "function addr: " << FR.first - << " frame size: " << FR.second.StackSize - << " callsite count: " << FR.second.RecordCount << '\n'); - OS.emitSymbolValue(FR.first, 8); + #ifdef ARK_GC_SUPPORT + OS.emitSymbolValue(FR.first, AP.TM.getProgramPointerSize()); + #else + OS.emitSymbolValue(FR.first, 8); + #endif OS.emitIntValue(FR.second.StackSize, 8); OS.emitIntValue(FR.second.RecordCount, 8); } diff --git a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp index b0594ec086b28f69d7b1ee0eb2c2294ad7d3ccd1..5d2bedfe3600b4d4b1a806394bc8b301875067c6 100644 --- a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -162,3 +162,17 @@ TargetFrameLowering::getDwarfFrameBase(const MachineFunction &MF) const { const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); return DwarfFrameBase{DwarfFrameBase::Register, {RI->getFrameRegister(MF)}}; } + +#ifdef ARK_GC_SUPPORT +int TargetFrameLowering::GetFrameReserveSize(MachineFunction &MF) const +{ + int slotSize = sizeof(uint64_t); + int64_t marker = 0x0; + int reserveSize = 0; + MF.getFunction() + .getFnAttribute("frame-reserved-slots") + .getValueAsString() + .getAsInteger(10, marker); + return marker; +} +#endif diff --git a/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp index addec6871fa1001e1e737f2f9af1062d0f2619ec..cf8e43df97e5b4dd07eed8a0ba839e55cde0a043 100644 --- a/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp +++ b/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp @@ -198,6 +198,9 @@ LLVMBool LLVMCreateMCJITCompilerForModule( builder.setEngineKind(EngineKind::JIT) .setErrorStr(&Error) .setOptLevel((CodeGenOpt::Level)options.OptLevel) +#ifdef ARK_GC_SUPPORT + .setRelocationModel(unwrap(options.RelMode)) +#endif .setTargetOptions(targetOptions); bool JIT; if (Optional CM = unwrap(options.CodeModel, JIT)) diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 039b34ace6abe85795fa7c1baa1c01ed49c74e35..937134d9494f8fa9ebe2a3b2a399b5faedd5dffd 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -3914,6 +3914,25 @@ LLVMValueRef LLVMBuildCall2(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Fn, makeArrayRef(unwrap(Args), NumArgs), Name)); } +#ifdef ARK_GC_SUPPORT +LLVMValueRef LLVMBuildCall3(LLVMBuilderRef B, LLVMTypeRef Ty, LLVMValueRef Fn, + LLVMValueRef *Args, unsigned NumArgs, + const char *Name, LLVMValueRef *deoptVals, + int NumVals) { + FunctionType *FTy = unwrap(Ty); + std::vector vals; + for (int i = 0; i < NumVals; i++) { + vals.push_back(unwrap(deoptVals[i])); + } + OperandBundleDefT deoptBundle("deopt", vals); + + return wrap(unwrap(B)->CreateCall(FTy, unwrap(Fn), + makeArrayRef(unwrap(Args), NumArgs), // Args + {deoptBundle}, // ArrayRef + Name)); +} +#endif + LLVMValueRef LLVMBuildSelect(LLVMBuilderRef B, LLVMValueRef If, LLVMValueRef Then, LLVMValueRef Else, const char *Name) { diff --git a/llvm/lib/Target/AArch64/AArch64ArkGc.td b/llvm/lib/Target/AArch64/AArch64ArkGc.td new file mode 100644 index 0000000000000000000000000000000000000000..ccd982d4491930175e06e567b3ee8b33f527ec8e --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64ArkGc.td @@ -0,0 +1,1263 @@ +//=- AArch64ArkGc.td - Describe the AArch64 Target Machine --------*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
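LLVMBuildCall3 above behaves like LLVMBuildCall2 but additionally attaches the trailing values to the call as a "deopt" operand bundle. A minimal usage sketch; builder, calleeTy, callee, arg and state are placeholders assumed to exist and are not part of this patch:

/* Sketch: emits  %r = call i64 %callee(i64 %arg) [ "deopt"(i64 %state) ]  */
LLVMValueRef args[]  = { arg };    /* ordinary call arguments             */
LLVMValueRef deopt[] = { state };  /* values recorded for deoptimization  */
LLVMValueRef r = LLVMBuildCall3(builder, calleeTy, callee,
                                args, /*NumArgs=*/1, "r",
                                deopt, /*NumVals=*/1);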
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Target-independent interfaces which we are implementing. +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// AArch64 Subtarget features. +// + +def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", + "Enable ARMv8 FP">; + +def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", + "Enable Advanced SIMD instructions", [FeatureFPARMv8]>; + +def FeatureSM4 : SubtargetFeature< + "sm4", "HasSM4", "true", + "Enable SM3 and SM4 support", [FeatureNEON]>; + +def FeatureSHA2 : SubtargetFeature< + "sha2", "HasSHA2", "true", + "Enable SHA1 and SHA256 support", [FeatureNEON]>; + +def FeatureSHA3 : SubtargetFeature< + "sha3", "HasSHA3", "true", + "Enable SHA512 and SHA3 support", [FeatureNEON, FeatureSHA2]>; + +def FeatureAES : SubtargetFeature< + "aes", "HasAES", "true", + "Enable AES support", [FeatureNEON]>; + +// Crypto has been split up and any combination is now valid (see the +// crypto definitions above). Also, crypto is now context sensitive: +// it has a different meaning for e.g. Armv8.4 than it has for Armv8.2. +// Therefore, we rely on Clang, the user interacing tool, to pass on the +// appropriate crypto options. But here in the backend, crypto has very little +// meaning anymore. We kept the Crypto definition here for backward +// compatibility, and now imply features SHA2 and AES, which was the +// "traditional" meaning of Crypto. 
+def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", + "Enable cryptographic instructions", [FeatureNEON, FeatureSHA2, FeatureAES]>; + +def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", + "Enable ARMv8 CRC-32 checksum instructions">; + +def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", + "Enable ARMv8 Reliability, Availability and Serviceability Extensions">; + +def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true", + "Enable ARMv8.1 Large System Extension (LSE) atomic instructions">; + +def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true", + "Enable out of line atomics to support LSE instructions">; + +def FeatureRDM : SubtargetFeature<"rdm", "HasRDM", "true", + "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions">; + +def FeaturePAN : SubtargetFeature< + "pan", "HasPAN", "true", + "Enables ARM v8.1 Privileged Access-Never extension">; + +def FeatureLOR : SubtargetFeature< + "lor", "HasLOR", "true", + "Enables ARM v8.1 Limited Ordering Regions extension">; + +def FeatureCONTEXTIDREL2 : SubtargetFeature<"CONTEXTIDREL2", "HasCONTEXTIDREL2", + "true", "Enable RW operand CONTEXTIDR_EL2" >; + +def FeatureVH : SubtargetFeature<"vh", "HasVH", "true", + "Enables ARM v8.1 Virtual Host extension", [FeatureCONTEXTIDREL2] >; + +def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", + "Enable ARMv8 PMUv3 Performance Monitors extension">; + +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Full FP16", [FeatureFPARMv8]>; + +def FeatureFP16FML : SubtargetFeature<"fp16fml", "HasFP16FML", "true", + "Enable FP16 FML instructions", [FeatureFullFP16]>; + +def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", + "Enable Statistical Profiling extension">; + +def FeaturePAN_RWV : SubtargetFeature< + "pan-rwv", "HasPAN_RWV", "true", + "Enable v8.2 PAN s1e1R and s1e1W Variants", + [FeaturePAN]>; + +// UAO PState +def FeaturePsUAO : SubtargetFeature< "uaops", "HasPsUAO", "true", + "Enable v8.2 UAO PState">; + +def FeatureCCPP : SubtargetFeature<"ccpp", "HasCCPP", + "true", "Enable v8.2 data Cache Clean to Point of Persistence" >; + +def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true", + "Enable Scalable Vector Extension (SVE) instructions", [FeatureFullFP16]>; + +// This flag is currently still labeled as Experimental, but when fully +// implemented this should tell the compiler to use the zeroing pseudos to +// benefit from the reverse instructions (e.g. SUB vs SUBR) if the inactive +// lanes are known to be zero. The pseudos will then be expanded using the +// MOVPRFX instruction to zero the inactive lanes. This feature should only be +// enabled if MOVPRFX instructions are known to merge with the destructive +// operations they prefix. +// +// This feature could similarly be extended to support cheap merging of _any_ +// value into the inactive lanes using the MOVPRFX instruction that uses +// merging-predication. 
+def FeatureExperimentalZeroingPseudos + : SubtargetFeature<"use-experimental-zeroing-pseudos", + "UseExperimentalZeroingPseudos", "true", + "Hint to the compiler that the MOVPRFX instruction is " + "merged with destructive operations", + []>; + +def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true", + "Enable Scalable Vector Extension 2 (SVE2) instructions", [FeatureSVE]>; + +def FeatureSVE2AES : SubtargetFeature<"sve2-aes", "HasSVE2AES", "true", + "Enable AES SVE2 instructions", [FeatureSVE2, FeatureAES]>; + +def FeatureSVE2SM4 : SubtargetFeature<"sve2-sm4", "HasSVE2SM4", "true", + "Enable SM4 SVE2 instructions", [FeatureSVE2, FeatureSM4]>; + +def FeatureSVE2SHA3 : SubtargetFeature<"sve2-sha3", "HasSVE2SHA3", "true", + "Enable SHA3 SVE2 instructions", [FeatureSVE2, FeatureSHA3]>; + +def FeatureSVE2BitPerm : SubtargetFeature<"sve2-bitperm", "HasSVE2BitPerm", "true", + "Enable bit permutation SVE2 instructions", [FeatureSVE2]>; + +def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", + "Has zero-cycle register moves">; + +def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true", + "Has zero-cycle zeroing instructions for generic registers">; + +def FeatureZCZeroingFP : SubtargetFeature<"zcz-fp", "HasZeroCycleZeroingFP", "true", + "Has zero-cycle zeroing instructions for FP registers">; + +def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", + "Has zero-cycle zeroing instructions", + [FeatureZCZeroingGP, FeatureZCZeroingFP]>; + +/// ... but the floating-point version doesn't quite work in rare cases on older +/// CPUs. +def FeatureZCZeroingFPWorkaround : SubtargetFeature<"zcz-fp-workaround", + "HasZeroCycleZeroingFPWorkaround", "true", + "The zero-cycle floating-point zeroing instruction has a bug">; + +def FeatureStrictAlign : SubtargetFeature<"strict-align", + "StrictAlign", "true", + "Disallow all unaligned memory " + "access">; + +foreach i = {1-7,9-15,18,20-28,30} in + def FeatureReserveX#i : SubtargetFeature<"reserve-x"#i, "ReserveXRegister["#i#"]", "true", + "Reserve X"#i#", making it unavailable " + "as a GPR">; + +foreach i = {8-15,18} in + def FeatureCallSavedX#i : SubtargetFeature<"call-saved-x"#i, + "CustomCallSavedXRegs["#i#"]", "true", "Make X"#i#" callee saved.">; + +def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true", + "Use alias analysis during codegen">; + +def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps", + "true", + "balance mix of odd and even D-registers for fp multiply(-accumulate) ops">; + +def FeaturePredictableSelectIsExpensive : SubtargetFeature< + "predictable-select-expensive", "PredictableSelectIsExpensive", "true", + "Prefer likely predicted branches over selects">; + +def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move", + "CustomAsCheapAsMove", "true", + "Use custom handling of cheap instructions">; + +def FeatureExynosCheapAsMoveHandling : SubtargetFeature<"exynos-cheap-as-move", + "ExynosAsCheapAsMove", "true", + "Use Exynos specific handling of cheap instructions", + [FeatureCustomCheapAsMoveHandling]>; + +def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler", + "UsePostRAScheduler", "true", "Schedule again after register allocation">; + +def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store", + "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">; + +def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128", + "Paired128IsSlow", "true", "Paired 
128 bit loads and stores are slow">; + +def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "STRQroIsSlow", + "true", "STR of Q register with register offset is slow">; + +def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature< + "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern", + "true", "Use alternative pattern for sextload convert to f32">; + +def FeatureArithmeticBccFusion : SubtargetFeature< + "arith-bcc-fusion", "HasArithmeticBccFusion", "true", + "CPU fuses arithmetic+bcc operations">; + +def FeatureArithmeticCbzFusion : SubtargetFeature< + "arith-cbz-fusion", "HasArithmeticCbzFusion", "true", + "CPU fuses arithmetic + cbz/cbnz operations">; + +def FeatureCmpBccFusion : SubtargetFeature< + "cmp-bcc-fusion", "HasCmpBccFusion", "true", + "CPU fuses cmp+bcc operations">; + +def FeatureFuseAddress : SubtargetFeature< + "fuse-address", "HasFuseAddress", "true", + "CPU fuses address generation and memory operations">; + +def FeatureFuseAES : SubtargetFeature< + "fuse-aes", "HasFuseAES", "true", + "CPU fuses AES crypto operations">; + +def FeatureFuseArithmeticLogic : SubtargetFeature< + "fuse-arith-logic", "HasFuseArithmeticLogic", "true", + "CPU fuses arithmetic and logic operations">; + +def FeatureFuseCCSelect : SubtargetFeature< + "fuse-csel", "HasFuseCCSelect", "true", + "CPU fuses conditional select operations">; + +def FeatureFuseCryptoEOR : SubtargetFeature< + "fuse-crypto-eor", "HasFuseCryptoEOR", "true", + "CPU fuses AES/PMULL and EOR operations">; + +def FeatureFuseLiterals : SubtargetFeature< + "fuse-literals", "HasFuseLiterals", "true", + "CPU fuses literal generation operations">; + +def FeatureDisableLatencySchedHeuristic : SubtargetFeature< + "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true", + "Disable latency scheduling heuristic">; + +def FeatureForce32BitJumpTables + : SubtargetFeature<"force-32bit-jump-tables", "Force32BitJumpTables", "true", + "Force jump table entries to be 32-bits wide except at MinSize">; + +def FeatureRCPC : SubtargetFeature<"rcpc", "HasRCPC", "true", + "Enable support for RCPC extension">; + +def FeatureUseRSqrt : SubtargetFeature< + "use-reciprocal-square-root", "UseRSqrt", "true", + "Use the reciprocal square root approximation">; + +def FeatureDotProd : SubtargetFeature< + "dotprod", "HasDotProd", "true", + "Enable dot product support">; + +def FeaturePAuth : SubtargetFeature< + "pauth", "HasPAuth", "true", + "Enable v8.3-A Pointer Authentication extension">; + +def FeatureJS : SubtargetFeature< + "jsconv", "HasJS", "true", + "Enable v8.3-A JavaScript FP conversion instructions", + [FeatureFPARMv8]>; + +def FeatureCCIDX : SubtargetFeature< + "ccidx", "HasCCIDX", "true", + "Enable v8.3-A Extend of the CCSIDR number of sets">; + +def FeatureComplxNum : SubtargetFeature< + "complxnum", "HasComplxNum", "true", + "Enable v8.3-A Floating-point complex number support", + [FeatureNEON]>; + +def FeatureNV : SubtargetFeature< + "nv", "HasNV", "true", + "Enable v8.4-A Nested Virtualization Enchancement">; + +def FeatureMPAM : SubtargetFeature< + "mpam", "HasMPAM", "true", + "Enable v8.4-A Memory system Partitioning and Monitoring extension">; + +def FeatureDIT : SubtargetFeature< + "dit", "HasDIT", "true", + "Enable v8.4-A Data Independent Timing instructions">; + +def FeatureTRACEV8_4 : SubtargetFeature< + "tracev8.4", "HasTRACEV8_4", "true", + "Enable v8.4-A Trace extension">; + +def FeatureAM : SubtargetFeature< + "am", "HasAM", "true", + "Enable v8.4-A Activity Monitors extension">; + 
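The SubtargetFeature definitions in this file surface to users as "+<name>"/"-<name>" entries in the target feature string. A minimal sketch of selecting a few of them when constructing a TargetMachine through the standard TargetRegistry flow; nothing in it is specific to the Ark target description, and the triple/CPU choices are just examples:

#include <memory>
#include <string>
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"

// Sketch: request the crc, lse and fullfp16 features defined in this file.
std::unique_ptr<llvm::TargetMachine> makeAArch64TM() {
  llvm::InitializeAllTargetInfos();
  llvm::InitializeAllTargets();
  llvm::InitializeAllTargetMCs();
  std::string err;
  const std::string triple = "aarch64-unknown-linux-gnu";
  const llvm::Target *target = llvm::TargetRegistry::lookupTarget(triple, err);
  if (!target)
    return nullptr;
  llvm::TargetOptions opts;
  return std::unique_ptr<llvm::TargetMachine>(target->createTargetMachine(
      triple, /*CPU=*/"cortex-a55", /*Features=*/"+crc,+lse,+fullfp16", opts,
      llvm::Reloc::PIC_));
}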
+def FeatureAMVS : SubtargetFeature< + "amvs", "HasAMVS", "true", + "Enable v8.6-A Activity Monitors Virtualization support", + [FeatureAM]>; + +def FeatureSEL2 : SubtargetFeature< + "sel2", "HasSEL2", "true", + "Enable v8.4-A Secure Exception Level 2 extension">; + +def FeaturePMU : SubtargetFeature< + "pmu", "HasPMU", "true", + "Enable v8.4-A PMU extension">; + +def FeatureTLB_RMI : SubtargetFeature< + "tlb-rmi", "HasTLB_RMI", "true", + "Enable v8.4-A TLB Range and Maintenance Instructions">; + +def FeatureFlagM : SubtargetFeature< + "flagm", "HasFlagM", "true", + "Enable v8.4-A Flag Manipulation Instructions">; + +// 8.4 RCPC enchancements: LDAPR & STLR instructions with Immediate Offset +def FeatureRCPC_IMMO : SubtargetFeature<"rcpc-immo", "HasRCPC_IMMO", "true", + "Enable v8.4-A RCPC instructions with Immediate Offsets", + [FeatureRCPC]>; + +def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", + "NegativeImmediates", "false", + "Convert immediates and instructions " + "to their negated or complemented " + "equivalent when the immediate does " + "not fit in the encoding.">; + +def FeatureLSLFast : SubtargetFeature< + "lsl-fast", "HasLSLFast", "true", + "CPU has a fastpath logical shift of up to 3 places">; + +def FeatureAggressiveFMA : + SubtargetFeature<"aggressive-fma", + "HasAggressiveFMA", + "true", + "Enable Aggressive FMA for floating-point.">; + +def FeatureAltFPCmp : SubtargetFeature<"altnzcv", "HasAlternativeNZCV", "true", + "Enable alternative NZCV format for floating point comparisons">; + +def FeatureFRInt3264 : SubtargetFeature<"fptoint", "HasFRInt3264", "true", + "Enable FRInt[32|64][Z|X] instructions that round a floating-point number to " + "an integer (in FP format) forcing it to fit into a 32- or 64-bit int" >; + +def FeatureSpecRestrict : SubtargetFeature<"specrestrict", "HasSpecRestrict", + "true", "Enable architectural speculation restriction" >; + +def FeatureSB : SubtargetFeature<"sb", "HasSB", + "true", "Enable v8.5 Speculation Barrier" >; + +def FeatureSSBS : SubtargetFeature<"ssbs", "HasSSBS", + "true", "Enable Speculative Store Bypass Safe bit" >; + +def FeaturePredRes : SubtargetFeature<"predres", "HasPredRes", "true", + "Enable v8.5a execution and data prediction invalidation instructions" >; + +def FeatureCacheDeepPersist : SubtargetFeature<"ccdp", "HasCCDP", + "true", "Enable v8.5 Cache Clean to Point of Deep Persistence" >; + +def FeatureBranchTargetId : SubtargetFeature<"bti", "HasBTI", + "true", "Enable Branch Target Identification" >; + +def FeatureRandGen : SubtargetFeature<"rand", "HasRandGen", + "true", "Enable Random Number generation instructions" >; + +def FeatureMTE : SubtargetFeature<"mte", "HasMTE", + "true", "Enable Memory Tagging Extension" >; + +def FeatureTRBE : SubtargetFeature<"trbe", "HasTRBE", + "true", "Enable Trace Buffer Extension">; + +def FeatureETE : SubtargetFeature<"ete", "HasETE", + "true", "Enable Embedded Trace Extension", + [FeatureTRBE]>; + +def FeatureTME : SubtargetFeature<"tme", "HasTME", + "true", "Enable Transactional Memory Extension" >; + +def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", + "AllowTaggedGlobals", + "true", "Use an instruction sequence for taking the address of a global " + "that allows a memory tag in the upper address bits">; + +def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16", + "true", "Enable BFloat16 Extension" >; + +def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8", + "true", "Enable Matrix Multiply Int8 Extension">; + +def FeatureMatMulFP32 : 
SubtargetFeature<"f32mm", "HasMatMulFP32", + "true", "Enable Matrix Multiply FP32 Extension", [FeatureSVE]>; + +def FeatureMatMulFP64 : SubtargetFeature<"f64mm", "HasMatMulFP64", + "true", "Enable Matrix Multiply FP64 Extension", [FeatureSVE]>; + +def FeatureXS : SubtargetFeature<"xs", "HasXS", + "true", "Enable Armv8.7-A limited-TLB-maintenance instruction">; + +def FeatureWFxT : SubtargetFeature<"wfxt", "HasWFxT", + "true", "Enable Armv8.7-A WFET and WFIT instruction">; + +def FeatureHCX : SubtargetFeature< + "hcx", "HasHCX", "true", "Enable Armv8.7-A HCRX_EL2 system register">; + +def FeatureLS64 : SubtargetFeature<"ls64", "HasLS64", + "true", "Enable Armv8.7-A LD64B/ST64B Accelerator Extension">; + +def FeatureBRBE : SubtargetFeature<"brbe", "HasBRBE", + "true", "Enable Branch Record Buffer Extension">; + +def FeatureSPE_EEF : SubtargetFeature<"spe-eef", "HasSPE_EEF", + "true", "Enable extra register in the Statistical Profiling Extension">; + +def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps", + "true", "Enable fine grained virtualization traps extension">; + +def FeatureEnhancedCounterVirtualization : + SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization", + "true", "Enable enhanced counter virtualization extension">; + +//===----------------------------------------------------------------------===// +// Architectures. +// + +def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", + "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE, FeatureRDM, + FeaturePAN, FeatureLOR, FeatureVH]>; + +def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", + "Support ARM v8.2a instructions", [HasV8_1aOps, FeaturePsUAO, + FeaturePAN_RWV, FeatureRAS, FeatureCCPP]>; + +def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true", + "Support ARM v8.3a instructions", [HasV8_2aOps, FeatureRCPC, FeaturePAuth, + FeatureJS, FeatureCCIDX, FeatureComplxNum]>; + +def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true", + "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd, + FeatureNV, FeatureMPAM, FeatureDIT, + FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeaturePMU, FeatureTLB_RMI, + FeatureFlagM, FeatureRCPC_IMMO]>; + +def HasV8_5aOps : SubtargetFeature< + "v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions", + [HasV8_4aOps, FeatureAltFPCmp, FeatureFRInt3264, FeatureSpecRestrict, + FeatureSSBS, FeatureSB, FeaturePredRes, FeatureCacheDeepPersist, + FeatureBranchTargetId]>; + +def HasV8_6aOps : SubtargetFeature< + "v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions", + [HasV8_5aOps, FeatureAMVS, FeatureBF16, FeatureFineGrainedTraps, + FeatureEnhancedCounterVirtualization, FeatureMatMulInt8]>; + +def HasV8_7aOps : SubtargetFeature< + "v8.7a", "HasV8_7aOps", "true", "Support ARM v8.7a instructions", + [HasV8_6aOps, FeatureXS, FeatureWFxT, FeatureHCX]>; + +def HasV8_0rOps : SubtargetFeature< + "v8r", "HasV8_0rOps", "true", "Support ARM v8r instructions", + [//v8.1 + FeatureCRC, FeaturePAN, FeatureRDM, FeatureLSE, FeatureCONTEXTIDREL2, + //v8.2 + FeaturePerfMon, FeatureRAS, FeaturePsUAO, FeatureSM4, + FeatureSHA3, FeatureCCPP, FeatureFullFP16, FeaturePAN_RWV, + //v8.3 + FeatureComplxNum, FeatureCCIDX, FeatureJS, + FeaturePAuth, FeatureRCPC, + //v8.4 + FeatureDotProd, FeatureFP16FML, FeatureTRACEV8_4, + FeatureTLB_RMI, FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO, + //v8.5 + FeatureSSBS, FeaturePredRes, FeatureSB, FeatureSpecRestrict]>; + 
+//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "AArch64RegisterInfo.td" +include "AArch64RegisterBanks.td" +include "AArch64ArkGcCallingConvention.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "AArch64Schedule.td" +include "AArch64InstrInfo.td" +include "AArch64SchedPredicates.td" +include "AArch64SchedPredExynos.td" +include "AArch64Combine.td" + +def AArch64InstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// Named operands for MRS/MSR/TLBI/... +//===----------------------------------------------------------------------===// + +include "AArch64SystemOperands.td" + +//===----------------------------------------------------------------------===// +// Access to privileged registers +//===----------------------------------------------------------------------===// + +foreach i = 1-3 in +def FeatureUseEL#i#ForTP : SubtargetFeature<"tpidr-el"#i, "UseEL"#i#"ForTP", + "true", "Permit use of TPIDR_EL"#i#" for the TLS base">; + +//===----------------------------------------------------------------------===// +// Control codegen mitigation against Straight Line Speculation vulnerability. +//===----------------------------------------------------------------------===// + +def FeatureHardenSlsRetBr : SubtargetFeature<"harden-sls-retbr", + "HardenSlsRetBr", "true", + "Harden against straight line speculation across RET and BR instructions">; +def FeatureHardenSlsBlr : SubtargetFeature<"harden-sls-blr", + "HardenSlsBlr", "true", + "Harden against straight line speculation across BLR instructions">; + +//===----------------------------------------------------------------------===// +// AArch64 Processors supported. 
+// + +//===----------------------------------------------------------------------===// +// Unsupported features to disable for scheduling models +//===----------------------------------------------------------------------===// + +class AArch64Unsupported { list F; } + +def SVEUnsupported : AArch64Unsupported { + let F = [HasSVE, HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3, + HasSVE2BitPerm]; +} + +def PAUnsupported : AArch64Unsupported { + let F = [HasPAuth]; +} + +include "AArch64SchedA53.td" +include "AArch64SchedA55.td" +include "AArch64SchedA57.td" +include "AArch64SchedCyclone.td" +include "AArch64SchedFalkor.td" +include "AArch64SchedKryo.td" +include "AArch64SchedExynosM3.td" +include "AArch64SchedExynosM4.td" +include "AArch64SchedExynosM5.td" +include "AArch64SchedThunderX.td" +include "AArch64SchedThunderX2T99.td" +include "AArch64SchedA64FX.td" +include "AArch64SchedThunderX3T110.td" +include "AArch64SchedTSV110.td" + +def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", + "Cortex-A35 ARM processors", [ + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon + ]>; + +def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", + "Cortex-A53 ARM processors", [ + FeatureBalanceFPOps, + FeatureCRC, + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureUseAA + ]>; + +def ProcA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55", + "Cortex-A55 ARM processors", [ + HasV8_2aOps, + FeatureCrypto, + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, + FeatureFullFP16, + FeatureDotProd, + FeatureRCPC, + FeaturePerfMon + ]>; + +def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", + "Cortex-A57 ARM processors", [ + FeatureBalanceFPOps, + FeatureCRC, + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureFuseAES, + FeatureFuseLiterals, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive + ]>; + +def ProcA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65", + "Cortex-A65 ARM processors", [ + HasV8_2aOps, + FeatureCrypto, + FeatureDotProd, + FeatureFPARMv8, + FeatureFullFP16, + FeatureFuseAddress, + FeatureFuseAES, + FeatureFuseLiterals, + FeatureNEON, + FeatureRAS, + FeatureRCPC, + FeatureSSBS, + ]>; + +def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", + "Cortex-A72 ARM processors", [ + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeatureFuseAES, + FeatureFuseLiterals, + FeatureNEON, + FeaturePerfMon + ]>; + +def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", + "Cortex-A73 ARM processors", [ + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, + FeaturePerfMon + ]>; + +def ProcA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75", + "Cortex-A75 ARM processors", [ + HasV8_2aOps, + FeatureCrypto, + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, + FeatureFullFP16, + FeatureDotProd, + FeatureRCPC, + FeaturePerfMon + ]>; + +def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76", + "Cortex-A76 ARM processors", [ + HasV8_2aOps, + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, + FeatureRCPC, + FeatureCrypto, + FeatureFullFP16, + FeatureDotProd, + FeatureSSBS + ]>; + +def ProcA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77", + "Cortex-A77 ARM processors", [ + HasV8_2aOps, + FeatureCmpBccFusion, + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, FeatureRCPC, + 
FeatureCrypto, + FeatureFullFP16, + FeatureDotProd + ]>; + +def ProcA78 : SubtargetFeature<"cortex-a78", "ARMProcFamily", + "CortexA78", + "Cortex-A78 ARM processors", [ + HasV8_2aOps, + FeatureCmpBccFusion, + FeatureCrypto, + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, + FeatureRCPC, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureSPE, + FeatureFullFP16, + FeatureSSBS, + FeatureDotProd]>; + +def ProcA78C : SubtargetFeature<"cortex-a78c", "ARMProcFamily", + "CortexA78C", + "Cortex-A78C ARM processors", [ + HasV8_2aOps, + FeatureCmpBccFusion, + FeatureCrypto, + FeatureDotProd, + FeatureFlagM, + FeatureFP16FML, + FeatureFPARMv8, + FeatureFullFP16, + FeatureFuseAES, + FeatureNEON, + FeaturePAuth, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureRCPC, + FeatureSPE, + FeatureSSBS]>; + +def ProcR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily", + "CortexR82", + "Cortex-R82 ARM Processors", [ + FeaturePostRAScheduler, + // TODO: crypto and FuseAES + // All other features are implied by v8_0r ops: + HasV8_0rOps, + ]>; + +def ProcX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1", + "Cortex-X1 ARM processors", [ + HasV8_2aOps, + FeatureCmpBccFusion, + FeatureCrypto, + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, + FeatureRCPC, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureSPE, + FeatureFullFP16, + FeatureDotProd]>; + +def ProcA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX", + "Fujitsu A64FX processors", [ + HasV8_2aOps, + FeatureFPARMv8, + FeatureNEON, + FeatureSHA2, + FeaturePerfMon, + FeatureFullFP16, + FeatureSVE, + FeaturePostRAScheduler, + FeatureComplxNum, + FeatureAggressiveFMA, + FeatureArithmeticBccFusion, + FeaturePredictableSelectIsExpensive + ]>; + +def ProcCarmel : SubtargetFeature<"carmel", "ARMProcFamily", "Carmel", + "Nvidia Carmel processors", [ + HasV8_2aOps, + FeatureNEON, + FeatureCrypto, + FeatureFullFP16 + ]>; + +// Note that cyclone does not fuse AES instructions, but newer apple chips do +// perform the fusion and cyclone is used by default when targetting apple OSes. 
+def ProcAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7", + "Apple A7 (the CPU formerly known as Cyclone)", [ + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureCrypto, + FeatureDisableLatencySchedHeuristic, + FeatureFPARMv8, + FeatureFuseAES, + FeatureFuseCryptoEOR, + FeatureNEON, + FeaturePerfMon, + FeatureZCRegMove, + FeatureZCZeroing, + FeatureZCZeroingFPWorkaround + ]>; + +def ProcAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10", + "Apple A10", [ + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureCrypto, + FeatureDisableLatencySchedHeuristic, + FeatureFPARMv8, + FeatureFuseAES, + FeatureFuseCryptoEOR, + FeatureNEON, + FeaturePerfMon, + FeatureZCRegMove, + FeatureZCZeroing, + FeatureCRC, + FeatureRDM, + FeaturePAN, + FeatureLOR, + FeatureVH, + ]>; + +def ProcAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", + "Apple A11", [ + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureCrypto, + FeatureDisableLatencySchedHeuristic, + FeatureFPARMv8, + FeatureFuseAES, + FeatureFuseCryptoEOR, + FeatureNEON, + FeaturePerfMon, + FeatureZCRegMove, + FeatureZCZeroing, + FeatureFullFP16, + HasV8_2aOps + ]>; + +def ProcAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12", + "Apple A12", [ + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureCrypto, + FeatureDisableLatencySchedHeuristic, + FeatureFPARMv8, + FeatureFuseAES, + FeatureFuseCryptoEOR, + FeatureNEON, + FeaturePerfMon, + FeatureZCRegMove, + FeatureZCZeroing, + FeatureFullFP16, + HasV8_3aOps + ]>; + +def ProcAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", + "Apple A13", [ + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureCrypto, + FeatureDisableLatencySchedHeuristic, + FeatureFPARMv8, + FeatureFuseAES, + FeatureFuseCryptoEOR, + FeatureNEON, + FeaturePerfMon, + FeatureZCRegMove, + FeatureZCZeroing, + FeatureFullFP16, + FeatureFP16FML, + FeatureSHA3, + HasV8_4aOps + ]>; + +def ProcAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", + "Apple A14", [ + FeatureAggressiveFMA, + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureAltFPCmp, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureCrypto, + FeatureDisableLatencySchedHeuristic, + FeatureFPARMv8, + FeatureFRInt3264, + FeatureFuseAddress, + FeatureFuseAES, + FeatureFuseArithmeticLogic, + FeatureFuseCCSelect, + FeatureFuseCryptoEOR, + FeatureFuseLiterals, + FeatureNEON, + FeaturePerfMon, + FeatureSpecRestrict, + FeatureSSBS, + FeatureSB, + FeaturePredRes, + FeatureCacheDeepPersist, + FeatureZCRegMove, + FeatureZCZeroing, + FeatureFullFP16, + FeatureFP16FML, + FeatureSHA3, + HasV8_4aOps + ]>; + +def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", + "Samsung Exynos-M3 processors", + [FeatureCRC, + FeatureCrypto, + FeatureExynosCheapAsMoveHandling, + FeatureForce32BitJumpTables, + FeatureFuseAddress, + FeatureFuseAES, + FeatureFuseCCSelect, + FeatureFuseLiterals, + FeatureLSLFast, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureZCZeroingFP]>; + +def ProcExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3", + "Samsung Exynos-M4 processors", + [HasV8_2aOps, + FeatureArithmeticBccFusion, + 
FeatureArithmeticCbzFusion, + FeatureCrypto, + FeatureDotProd, + FeatureExynosCheapAsMoveHandling, + FeatureForce32BitJumpTables, + FeatureFullFP16, + FeatureFuseAddress, + FeatureFuseAES, + FeatureFuseArithmeticLogic, + FeatureFuseCCSelect, + FeatureFuseLiterals, + FeatureLSLFast, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureZCZeroing]>; + +def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo", + "Qualcomm Kryo processors", [ + FeatureCRC, + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureZCZeroing, + FeatureLSLFast + ]>; + +def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor", + "Qualcomm Falkor processors", [ + FeatureCRC, + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureRDM, + FeatureZCZeroing, + FeatureLSLFast, + FeatureSlowSTRQro + ]>; + +def ProcNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily", + "NeoverseE1", + "Neoverse E1 ARM processors", [ + HasV8_2aOps, + FeatureCrypto, + FeatureDotProd, + FeatureFPARMv8, + FeatureFullFP16, + FeatureNEON, + FeatureRCPC, + FeatureSSBS, + ]>; + +def ProcNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", + "NeoverseN1", + "Neoverse N1 ARM processors", [ + HasV8_2aOps, + FeatureCrypto, + FeatureDotProd, + FeatureFPARMv8, + FeatureFullFP16, + FeatureNEON, + FeatureRCPC, + FeatureSPE, + FeatureSSBS, + ]>; + +def ProcNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", + "NeoverseN2", + "Neoverse N2 ARM processors", [ + HasV8_5aOps, + FeatureBF16, + FeatureETE, + FeatureMatMulInt8, + FeatureMTE, + FeatureSVE2, + FeatureSVE2BitPerm, + FeatureTRBE]>; + +def ProcNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", + "NeoverseV1", + "Neoverse V1 ARM processors", [ + HasV8_4aOps, + FeatureBF16, + FeatureCacheDeepPersist, + FeatureCrypto, + FeatureFPARMv8, + FeatureFP16FML, + FeatureFullFP16, + FeatureFuseAES, + FeatureMatMulInt8, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureRandGen, + FeatureSPE, + FeatureSSBS, + FeatureSVE]>; + +def ProcSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", + "Qualcomm Saphira processors", [ + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureNEON, + FeatureSPE, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureZCZeroing, + FeatureLSLFast, + HasV8_4aOps]>; + +def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", + "ThunderX2T99", + "Cavium ThunderX2 processors", [ + FeatureAggressiveFMA, + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeatureArithmeticBccFusion, + FeatureNEON, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureLSE, + HasV8_1aOps]>; + +def ProcThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily", + "ThunderX3T110", + "Marvell ThunderX3 processors", [ + FeatureAggressiveFMA, + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeatureArithmeticBccFusion, + FeatureNEON, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureLSE, + FeaturePAuth, + FeatureUseAA, + FeatureBalanceFPOps, + FeaturePerfMon, + FeatureStrictAlign, + HasV8_3aOps]>; + +def ProcThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX", + "Cavium ThunderX processors", [ + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + 
FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureNEON]>; + +def ProcThunderXT88 : SubtargetFeature<"thunderxt88", "ARMProcFamily", + "ThunderXT88", + "Cavium ThunderX processors", [ + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureNEON]>; + +def ProcThunderXT81 : SubtargetFeature<"thunderxt81", "ARMProcFamily", + "ThunderXT81", + "Cavium ThunderX processors", [ + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureNEON]>; + +def ProcThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily", + "ThunderXT83", + "Cavium ThunderX processors", [ + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureNEON]>; + +def ProcTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110", + "HiSilicon TS-V110 processors", [ + HasV8_2aOps, + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureSPE, + FeatureFullFP16, + FeatureFP16FML, + FeatureDotProd]>; + +def : ProcessorModel<"generic", NoSchedModel, [ + FeatureFPARMv8, + FeatureFuseAES, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, +// ETE and TRBE are future architecture extensions. We temporarily enable them +// by default for users targeting generic AArch64, until it is decided in which +// armv8.x-a architecture revision they will end up. The extensions do not +// affect code generated by the compiler and can be used only by explicitly +// mentioning the new system register names in assembly. 
+ FeatureETE + ]>; + +def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; +def : ProcessorModel<"cortex-a34", CortexA53Model, [ProcA35]>; +def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; +def : ProcessorModel<"cortex-a55", CortexA55Model, [ProcA55]>; +def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; +def : ProcessorModel<"cortex-a65", CortexA53Model, [ProcA65]>; +def : ProcessorModel<"cortex-a65ae", CortexA53Model, [ProcA65]>; +def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>; +def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>; +def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>; +def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>; +def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>; +def : ProcessorModel<"cortex-a77", CortexA57Model, [ProcA77]>; +def : ProcessorModel<"cortex-a78", CortexA57Model, [ProcA78]>; +def : ProcessorModel<"cortex-a78c", CortexA57Model, [ProcA78C]>; +def : ProcessorModel<"cortex-r82", CortexA55Model, [ProcR82]>; +def : ProcessorModel<"cortex-x1", CortexA57Model, [ProcX1]>; +def : ProcessorModel<"neoverse-e1", CortexA53Model, [ProcNeoverseE1]>; +def : ProcessorModel<"neoverse-n1", CortexA57Model, [ProcNeoverseN1]>; +def : ProcessorModel<"neoverse-n2", CortexA57Model, [ProcNeoverseN2]>; +def : ProcessorModel<"neoverse-v1", CortexA57Model, [ProcNeoverseV1]>; +def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>; +def : ProcessorModel<"exynos-m4", ExynosM4Model, [ProcExynosM4]>; +def : ProcessorModel<"exynos-m5", ExynosM5Model, [ProcExynosM4]>; +def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>; +def : ProcessorModel<"saphira", FalkorModel, [ProcSaphira]>; +def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>; +// Cavium ThunderX/ThunderX T8X Processors +def : ProcessorModel<"thunderx", ThunderXT8XModel, [ProcThunderX]>; +def : ProcessorModel<"thunderxt88", ThunderXT8XModel, [ProcThunderXT88]>; +def : ProcessorModel<"thunderxt81", ThunderXT8XModel, [ProcThunderXT81]>; +def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>; +// Cavium ThunderX2T9X Processors. Formerly Broadcom Vulcan. +def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>; +// Marvell ThunderX3T110 Processors. +def : ProcessorModel<"thunderx3t110", ThunderX3T110Model, [ProcThunderX3T110]>; +def : ProcessorModel<"tsv110", TSV110Model, [ProcTSV110]>; + +// Support cyclone as an alias for apple-a7 so we can still LTO old bitcode. +def : ProcessorModel<"cyclone", CycloneModel, [ProcAppleA7]>; + +// iPhone and iPad CPUs +def : ProcessorModel<"apple-a7", CycloneModel, [ProcAppleA7]>; +def : ProcessorModel<"apple-a8", CycloneModel, [ProcAppleA7]>; +def : ProcessorModel<"apple-a9", CycloneModel, [ProcAppleA7]>; +def : ProcessorModel<"apple-a10", CycloneModel, [ProcAppleA10]>; +def : ProcessorModel<"apple-a11", CycloneModel, [ProcAppleA11]>; +def : ProcessorModel<"apple-a12", CycloneModel, [ProcAppleA12]>; +def : ProcessorModel<"apple-a13", CycloneModel, [ProcAppleA13]>; +def : ProcessorModel<"apple-a14", CycloneModel, [ProcAppleA14]>; + +// watch CPUs. +def : ProcessorModel<"apple-s4", CycloneModel, [ProcAppleA12]>; +def : ProcessorModel<"apple-s5", CycloneModel, [ProcAppleA12]>; + +// Alias for the latest Apple processor model supported by LLVM. 
+def : ProcessorModel<"apple-latest", CycloneModel, [ProcAppleA13]>; + +// Fujitsu A64FX +def : ProcessorModel<"a64fx", A64FXModel, [ProcA64FX]>; + +// Nvidia Carmel +def : ProcessorModel<"carmel", NoSchedModel, [ProcCarmel]>; + +//===----------------------------------------------------------------------===// +// Assembly parser +//===----------------------------------------------------------------------===// + +def GenericAsmParserVariant : AsmParserVariant { + int Variant = 0; + string Name = "generic"; + string BreakCharacters = "."; + string TokenizingCharacters = "[]*!/"; +} + +def AppleAsmParserVariant : AsmParserVariant { + int Variant = 1; + string Name = "apple-neon"; + string BreakCharacters = "."; + string TokenizingCharacters = "[]*!/"; +} + +//===----------------------------------------------------------------------===// +// Assembly printer +//===----------------------------------------------------------------------===// +// AArch64 Uses the MC printer for asm output, so make sure the TableGen +// AsmWriter bits get associated with the correct class. +def GenericAsmWriter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + int PassSubtarget = 1; + int Variant = 0; + bit isMCAsmWriter = 1; +} + +def AppleAsmWriter : AsmWriter { + let AsmWriterClassName = "AppleInstPrinter"; + int PassSubtarget = 1; + int Variant = 1; + int isMCAsmWriter = 1; +} + +//===----------------------------------------------------------------------===// +// Target Declaration +//===----------------------------------------------------------------------===// + +def AArch64 : Target { + let InstructionSet = AArch64InstrInfo; + let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant]; + let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter]; + let AllowRegisterRenaming = 1; +} + +//===----------------------------------------------------------------------===// +// Pfm Counters +//===----------------------------------------------------------------------===// + +include "AArch64PfmCounters.td" diff --git a/llvm/lib/Target/AArch64/AArch64ArkGcCallingConvention.td b/llvm/lib/Target/AArch64/AArch64ArkGcCallingConvention.td new file mode 100644 index 0000000000000000000000000000000000000000..a55537c16a4af2d919afa980635c594733869cca --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64ArkGcCallingConvention.td @@ -0,0 +1,512 @@ +//=- AArch64ArkGcCallingConv.td - Calling Conventions for AArch64 -*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for AArch64 architecture. +// +//===----------------------------------------------------------------------===// + +/// CCIfBigEndian - Match only if we're in big endian mode. 
+class CCIfBigEndian : + CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>; + +class CCIfILP32 : + CCIf<"State.getMachineFunction().getDataLayout().getPointerSize() == 4", A>; + + +//===----------------------------------------------------------------------===// +// ARM AAPCS64 Calling Convention +//===----------------------------------------------------------------------===// + +let Entry = 1 in +def CC_AArch64_AAPCS : CallingConv<[ + CCIfType<[iPTR], CCBitConvertToType>, + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32], CCBitConvertToType>, + + // Big endian vectors must be passed as if they were 1-element vectors so that + // their lanes are in a consistent order. + CCIfBigEndian>>, + CCIfBigEndian>>, + + // In AAPCS, an SRet is passed in X8, not X0 like a normal pointer parameter. + // However, on windows, in some circumstances, the SRet is passed in X0 or X1 + // instead. The presence of the inreg attribute indicates that SRet is + // passed in the alternative register (X0 or X1), not X8: + // - X0 for non-instance methods. + // - X1 for instance methods. + + // The "sret" attribute identifies indirect returns. + // The "inreg" attribute identifies non-aggregate types. + // The position of the "sret" attribute identifies instance/non-instance + // methods. + // "sret" on argument 0 means non-instance methods. + // "sret" on argument 1 means instance methods. + + CCIfInReg>>>>, + + CCIfSRet>>, + + // Put ByVal arguments directly on the stack. Minimum size and alignment of a + // slot is 64-bit. + CCIfByVal>, + + // The 'nest' parameter, if any, is passed in X18. + // Darwin uses X18 as the platform register and hence 'nest' isn't currently + // supported there. + CCIfNest>, + + // Pass SwiftSelf in a callee saved register. + CCIfSwiftSelf>>, + + // A SwiftError is passed in X21. + CCIfSwiftError>>, + + CCIfConsecutiveRegs>, + + CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, + nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], + CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, + CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, + nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], + CCPassIndirect>, + + CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCAssignToReg<[P0, P1, P2, P3]>>, + CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCPassIndirect>, + + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, + // up to eight each of GPR and FPR. + CCIfType<[i1, i8, i16], CCPromoteToType>, + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + // i128 is split to two i64s, we can't fit half to register X7. + CCIfType<[i64], CCIfSplit>>, + + // i128 is split to two i64s, and its stack alignment is 16 bytes. 
+ CCIfType<[i64], CCIfSplit>>, + + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], + CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + + // If more than will fit in registers, pass them on the stack instead. + CCIfType<[i1, i8, i16, f16, bf16], CCAssignToStack<8, 8>>, + CCIfType<[i32, f32], CCAssignToStack<8, 8>>, + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16, v4bf16], + CCAssignToStack<8, 8>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], + CCAssignToStack<16, 16>> +]>; + +let Entry = 1 in +def RetCC_AArch64_AAPCS : CallingConv<[ + CCIfType<[iPTR], CCBitConvertToType>, + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32], CCBitConvertToType>, + + CCIfConsecutiveRegs>, + CCIfSwiftError>>, + + // Big endian vectors must be passed as if they were 1-element vectors so that + // their lanes are in a consistent order. + CCIfBigEndian>>, + CCIfBigEndian>>, + + CCIfType<[i1, i8, i16], CCPromoteToType>, + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], + CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + + CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, + nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], + CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, + + CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCAssignToReg<[P0, P1, P2, P3]>> +]>; + +// Vararg functions on windows pass floats in integer registers +let Entry = 1 in +def CC_AArch64_Win64_VarArg : CallingConv<[ + CCIfType<[f16, bf16, f32], CCPromoteToType>, + CCIfType<[f64], CCBitConvertToType>, + CCDelegateTo +]>; + +// Windows Control Flow Guard checks take a single argument (the target function +// address) and have no return value. 
+let Entry = 1 in +def CC_AArch64_Win64_CFGuard_Check : CallingConv<[ + CCIfType<[i64], CCAssignToReg<[X15]>> +]>; + + +// Darwin uses a calling convention which differs in only two ways +// from the standard one at this level: +// + i128s (i.e. split i64s) don't need even registers. +// + Stack slots are sized as needed rather than being at least 64-bit. +let Entry = 1 in +def CC_AArch64_DarwinPCS : CallingConv<[ + CCIfType<[iPTR], CCBitConvertToType>, + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, + + // An SRet is passed in X8, not X0 like a normal pointer parameter. + CCIfSRet>>, + + // Put ByVal arguments directly on the stack. Minimum size and alignment of a + // slot is 64-bit. + CCIfByVal>, + + // Pass SwiftSelf in a callee saved register. + CCIfSwiftSelf>>, + + // A SwiftError is passed in X21. + CCIfSwiftError>>, + + CCIfConsecutiveRegs>, + + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, + // up to eight each of GPR and FPR. + CCIfType<[i1, i8, i16], CCPromoteToType>, + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + // i128 is split to two i64s, we can't fit half to register X7. + CCIfType<[i64], + CCIfSplit>>, + // i128 is split to two i64s, and its stack alignment is 16 bytes. + CCIfType<[i64], CCIfSplit>>, + + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], + CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + + // If more than will fit in registers, pass them on the stack instead. + CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>, + CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16 || ValVT == MVT::bf16", + CCAssignToStack<2, 2>>, + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + + // Re-demote pointers to 32-bits so we don't end up storing 64-bit + // values and clobbering neighbouring stack locations. Not very pretty. + CCIfPtr>>, + CCIfPtr>>, + + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16, v4bf16], + CCAssignToStack<8, 8>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], + CCAssignToStack<16, 16>> +]>; + +let Entry = 1 in +def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ + CCIfType<[iPTR], CCBitConvertToType>, + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, + + CCIfConsecutiveRegs>, + + // Handle all scalar types as either i64 or f64. + CCIfType<[i8, i16, i32], CCPromoteToType>, + CCIfType<[f16, bf16, f32], CCPromoteToType>, + + // Everything is on the stack. + // i128 is split to two i64s, and its stack alignment is 16 bytes. 
+ CCIfType<[i64], CCIfSplit>>, + CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], + CCAssignToStack<8, 8>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], + CCAssignToStack<16, 16>> +]>; + +// In the ILP32 world, the minimum stack slot size is 4 bytes. Otherwise the +// same as the normal Darwin VarArgs handling. +let Entry = 1 in +def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, + + // Handle all scalar types as either i32 or f32. + CCIfType<[i8, i16], CCPromoteToType>, + CCIfType<[f16, bf16], CCPromoteToType>, + + // Everything is on the stack. + // i128 is split to two i64s, and its stack alignment is 16 bytes. + CCIfPtr>>, + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64], CCIfSplit>>, + CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], + CCAssignToStack<8, 8>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], + CCAssignToStack<16, 16>> +]>; + + +// The WebKit_JS calling convention only passes the first argument (the callee) +// in register and the remaining arguments on stack. We allow 32bit stack slots, +// so that WebKit can write partial values in the stack and define the other +// 32bit quantity as undef. +let Entry = 1 in +def CC_AArch64_WebKit_JS : CallingConv<[ + // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0). + CCIfType<[i1, i8, i16], CCPromoteToType>, + CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>, + + // Pass the remaining arguments on the stack instead. + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64], CCAssignToStack<8, 8>> +]>; + +let Entry = 1 in +def RetCC_AArch64_WebKit_JS : CallingConv<[ + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> +]>; + +//===----------------------------------------------------------------------===// +// ARM64 Calling Convention for GHC +//===----------------------------------------------------------------------===// + +// This calling convention is specific to the Glasgow Haskell Compiler. +// The only documentation is the GHC source code, specifically the C header +// file: +// +// https://github.com/ghc/ghc/blob/master/includes/stg/MachRegs.h +// +// which defines the registers for the Spineless Tagless G-Machine (STG) that +// GHC uses to implement lazy evaluation. The generic STG machine has a set of +// registers which are mapped to appropriate set of architecture specific +// registers for each CPU architecture. +// +// The STG Machine is documented here: +// +// https://ghc.haskell.org/trac/ghc/wiki/Commentary/Compiler/GeneratedCode +// +// The AArch64 register mapping is under the heading "The ARMv8/AArch64 ABI +// register mapping". + +let Entry = 1 in +def CC_AArch64_GHC : CallingConv<[ + CCIfType<[iPTR], CCBitConvertToType>, + + // Handle all vector types as either f64 or v2f64. 
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, f128], CCBitConvertToType>, + + CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>, + CCIfType<[f32], CCAssignToReg<[S8, S9, S10, S11]>>, + CCIfType<[f64], CCAssignToReg<[D12, D13, D14, D15]>>, + + // Promote i8/i16/i32 arguments to i64. + CCIfType<[i8, i16, i32], CCPromoteToType>, + + // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim + CCIfType<[i64], CCAssignToReg<[X19, FP, X20, X21, X22, X23, X24, X25, X26, X27, X28]>> +]>; + +// The order of the callee-saves in this file is important, because the +// FrameLowering code will use this order to determine the layout the +// callee-save area in the stack frame. As can be observed below, Darwin +// requires the frame-record (LR, FP) to be at the top the callee-save area, +// whereas for other platforms they are at the bottom. + +// FIXME: LR is only callee-saved in the sense that *we* preserve it and are +// presumably a callee to someone. External functions may not do so, but this +// is currently safe since BL has LR as an implicit-def and what happens after a +// tail call doesn't matter. +// +// It would be better to model its preservation semantics properly (create a +// vreg on entry, use it in RET & tail call generation; make that vreg def if we +// end up saving LR as part of a call frame). Watch this space... +def CSR_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, + D8, D9, D10, D11, + D12, D13, D14, D15, + LR, FP)>; + +// A variant for treating X18 as callee saved, when interfacing with +// code that needs X18 to be preserved. +def CSR_AArch64_AAPCS_X18 : CalleeSavedRegs<(add X18, CSR_AArch64_AAPCS)>; + +// Win64 has unwinding codes for an (FP,LR) pair, save_fplr and save_fplr_x. +// We put FP before LR, so that frame lowering logic generates (FP,LR) pairs, +// and not (LR,FP) pairs. +def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, FP, LR, + D8, D9, D10, D11, + D12, D13, D14, D15)>; + +// The Control Flow Guard check call uses a custom calling convention that also +// preserves X0-X8 and Q0-Q7. 
+def CSR_Win_AArch64_CFGuard_Check : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, + (sequence "X%u", 0, 8), + (sequence "Q%u", 0, 7))>; + +// AArch64 PCS for vector functions (VPCS) +// must (additionally) preserve full Q8-Q23 registers +def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, LR, FP, + (sequence "Q%u", 8, 23))>; + +// Functions taking SVE arguments or returning an SVE type +// must (additionally) preserve full Z8-Z23 and predicate registers P4-P15 +def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add (sequence "Z%u", 8, 23), + (sequence "P%u", 4, 15), + X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, LR, FP)>; + +// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since +// 'this' and the pointer return value are both passed in X0 in these cases, +// this can be partially modelled by treating X0 as a callee-saved register; +// only the resulting RegMask is used; the SaveList is ignored +// +// (For generic ARM 64-bit ABI code, clang will not generate constructors or +// destructors with 'this' returns, so this RegMask will not be used in that +// case) +def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>; + +def CSR_AArch64_AAPCS_SwiftError + : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X21)>; + +// The ELF stub used for TLS-descriptor access saves every feasible +// register. Only X0 and LR are clobbered. +def CSR_AArch64_TLS_ELF + : CalleeSavedRegs<(add (sequence "X%u", 1, 28), FP, + (sequence "Q%u", 0, 31))>; + +def CSR_AArch64_AllRegs + : CalleeSavedRegs<(add (sequence "W%u", 0, 30), WSP, + (sequence "X%u", 0, 28), FP, LR, SP, + (sequence "B%u", 0, 31), (sequence "H%u", 0, 31), + (sequence "S%u", 0, 31), (sequence "D%u", 0, 31), + (sequence "Q%u", 0, 31))>; + +def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>; + +def CSR_AArch64_RT_MostRegs : CalleeSavedRegs<(add CSR_AArch64_AAPCS, + (sequence "X%u", 9, 15))>; + +def CSR_AArch64_StackProbe_Windows + : CalleeSavedRegs<(add (sequence "X%u", 0, 15), + (sequence "X%u", 18, 28), FP, SP, + (sequence "Q%u", 0, 31))>; + +// Darwin variants of AAPCS. +// Darwin puts the frame-record at the top of the callee-save area. +def CSR_Darwin_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, + X23, X24, X25, X26, X27, X28, + D8, D9, D10, D11, + D12, D13, D14, D15)>; + +def CSR_Darwin_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, + X22, X23, X24, X25, X26, X27, + X28, (sequence "Q%u", 8, 23))>; +def CSR_Darwin_AArch64_AAPCS_ThisReturn + : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, X0)>; + +def CSR_Darwin_AArch64_AAPCS_SwiftError + : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>; + +// The function used by Darwin to obtain the address of a thread-local variable +// guarantees more than a normal AAPCS function. x16 and x17 are used on the +// fast path for calculation, but other registers except X0 (argument/return) +// and LR (it is a call, after all) are preserved. +def CSR_Darwin_AArch64_TLS + : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17), + FP, + (sequence "Q%u", 0, 31))>; + +// We can only handle a register pair with adjacent registers, the register pair +// should belong to the same class as well. Since the access function on the +// fast path calls a function that follows CSR_Darwin_AArch64_TLS, +// CSR_Darwin_AArch64_CXX_TLS should be a subset of CSR_Darwin_AArch64_TLS. 
+def CSR_Darwin_AArch64_CXX_TLS + : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, + (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), + (sequence "D%u", 0, 31))>; + +// CSRs that are handled by prologue, epilogue. +def CSR_Darwin_AArch64_CXX_TLS_PE + : CalleeSavedRegs<(add LR, FP)>; + +// CSRs that are handled explicitly via copies. +def CSR_Darwin_AArch64_CXX_TLS_ViaCopy + : CalleeSavedRegs<(sub CSR_Darwin_AArch64_CXX_TLS, LR, FP)>; + +def CSR_Darwin_AArch64_RT_MostRegs + : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, (sequence "X%u", 9, 15))>; + +// Variants of the standard calling conventions for shadow call stack. +// These all preserve x18 in addition to any other registers. +def CSR_AArch64_NoRegs_SCS + : CalleeSavedRegs<(add CSR_AArch64_NoRegs, X18)>; +def CSR_AArch64_AllRegs_SCS + : CalleeSavedRegs<(add CSR_AArch64_AllRegs, X18)>; +def CSR_AArch64_AAPCS_SwiftError_SCS + : CalleeSavedRegs<(add CSR_AArch64_AAPCS_SwiftError, X18)>; +def CSR_AArch64_RT_MostRegs_SCS + : CalleeSavedRegs<(add CSR_AArch64_RT_MostRegs, X18)>; +def CSR_AArch64_AAVPCS_SCS + : CalleeSavedRegs<(add CSR_AArch64_AAVPCS, X18)>; +def CSR_AArch64_SVE_AAPCS_SCS + : CalleeSavedRegs<(add CSR_AArch64_SVE_AAPCS, X18)>; +def CSR_AArch64_AAPCS_SCS + : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X18)>; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index bd3bc9ead40090a31d9bb0d9783aa7a0161ca1e3..b96b3f6b898c004152f711e9716db0ffc081f3c3 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1024,6 +1024,18 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) { } } +#ifdef ARK_GC_SUPPORT +Triple::ArchType AArch64FrameLowering::GetArkSupportTarget() const +{ + return Triple::aarch64; +} + +int AArch64FrameLowering::GetFixedFpPosition() const +{ + return -1; +} +#endif + void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); @@ -1073,8 +1085,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. +#ifndef ARK_GC_SUPPORT if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; +#endif + + // The Ark asm interpreter (GHC convention) calls WebKit_JS functions, so registers still need to be pushed to the stack. // Set tagged base pointer to the requested stack slot. // Ideally it should match SP value after prologue. @@ -1559,8 +1574,11 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. + #ifndef ARK_GC_SUPPORT if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; + #endif + + // The Ark asm interpreter (GHC convention) calls WebKit_JS functions, so registers still need to be pushed to the stack. // Initial and residual are named for consistency with the prologue. Note that // in the epilogue, the residual adjustment is executed first. @@ -2543,8 +2561,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, RegScavenger *RS) const { // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. + #ifndef ARK_GC_SUPPORT if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; + #endif + + // The Ark asm interpreter (GHC convention) calls WebKit_JS functions, so registers still need to be pushed to the stack.
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); const AArch64RegisterInfo *RegInfo = static_cast( diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 27450385f89a57526ae7f7a21c5af953f724339f..7a27625cf4337dd8fd42b3cd908481425ba823a5 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -16,6 +16,9 @@ #include "AArch64StackProtectorRetLowering.h" #include "llvm/Support/TypeSize.h" #include "llvm/CodeGen/TargetFrameLowering.h" +#ifdef ARK_GC_SUPPORT +#include "llvm/ADT/Triple.h" +#endif namespace llvm { @@ -42,6 +45,10 @@ public: /// the function. void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; +#ifdef ARK_GC_SUPPORT + Triple::ArchType GetArkSupportTarget() const override; + int GetFixedFpPosition() const override; +#endif const StackProtectorRetLowering *getStackProtectorRet() const override; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c522ee76626d2b1cdf148143413096d53988a988..a53a719cba72e0eabf6383a606e131c376e52ac9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4987,7 +4987,11 @@ SDValue AArch64TargetLowering::LowerCallResult( /// Return true if the calling convention is one that we can guarantee TCO for. static bool canGuaranteeTCO(CallingConv::ID CC) { +#ifdef ARK_GC_SUPPORT + return (CC == CallingConv::GHC) || (CC == CallingConv::Fast); +#else return CC == CallingConv::Fast; +#endif } /// Return true if we might ever do TCO for calls with this calling convention. 
@@ -5183,7 +5187,11 @@ SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const { +#ifdef ARK_GC_SUPPORT + return (CallCC == CallingConv::GHC || (CallCC == CallingConv::Fast)) && TailCallOpt; +#else return CallCC == CallingConv::Fast && TailCallOpt; +#endif } /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index f90856d14b2fe7616774edc0024de12bc57fedc0..3f1a9d4e8c9f82b284d007b23d8c0bd48f678566 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -312,6 +312,17 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { if (TFI->hasFP(MF) || TT.isOSDarwin()) markSuperRegs(Reserved, AArch64::W29); +#ifdef ARK_GC_SUPPORT + if (MF.getFunction().getCallingConv() == CallingConv::GHC) { + markSuperRegs(Reserved, AArch64::W29); + markSuperRegs(Reserved, AArch64::W30); + } + if ((MF.getFunction().getCallingConv() == CallingConv::WebKit_JS) || + (MF.getFunction().getCallingConv() == CallingConv::C)) { + markSuperRegs(Reserved, AArch64::W30); + } +#endif + for (size_t i = 0; i < AArch64::GPR32commonRegClass.getNumRegs(); ++i) { if (MF.getSubtarget().isXRegisterReserved(i)) markSuperRegs(Reserved, AArch64::GPR32commonRegClass.getRegister(i)); diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index 325f56484e74eb9687d13a6d806c35cc8cd38c82..442c7aa344525c03edf0237ab44add63e8a79e70 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -1,7 +1,10 @@ add_llvm_component_group(AArch64 HAS_JIT) -set(LLVM_TARGET_DEFINITIONS AArch64.td) - +if(BUILD_ARK_GC_SUPPORT) + set(LLVM_TARGET_DEFINITIONS AArch64ArkGc.td) +else() + set(LLVM_TARGET_DEFINITIONS AArch64.td) +endif(BUILD_ARK_GC_SUPPORT) tablegen(LLVM AArch64GenAsmMatcher.inc -gen-asm-matcher) tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer) tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 0f8b1d6584b1a09660033451ca56131e0be97db8..9dcb2e35e3e225a0a5fd1c8837808f4221a6d784 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -231,7 +231,11 @@ struct OutgoingArgHandler : public CallLowering::OutgoingValueHandler { } // namespace static bool doesCalleeRestoreStack(CallingConv::ID CallConv, bool TailCallOpt) { - return CallConv == CallingConv::Fast && TailCallOpt; + #ifdef ARK_GC_SUPPORT + return (CallConv == CallingConv::GHC || (CallConv == CallingConv::Fast)) && TailCallOpt; + #else + return CallConv == CallingConv::Fast && TailCallOpt; + #endif } void AArch64CallLowering::splitToValueTypes( @@ -522,7 +526,11 @@ bool AArch64CallLowering::lowerFormalArguments( /// Return true if the calling convention is one that we can guarantee TCO for. static bool canGuaranteeTCO(CallingConv::ID CC) { +#ifdef ARK_GC_SUPPORT + return (CC == CallingConv::GHC) || (CC == CallingConv::Fast); +#else return CC == CallingConv::Fast; +#endif } /// Return true if we might ever do TCO for calls with this calling convention. 
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 866f1136400477ea5f2cb3ac302567ba38765ec9..7451b6a65ed565734332b2d3608c025074870bc7 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -1286,6 +1286,37 @@ bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const { - for 32-bit code, substitute %e?? registers for %r?? */ +#ifdef ARK_GC_SUPPORT +Triple::ArchType X86FrameLowering::GetArkSupportTarget() const +{ + return Is64Bit ? Triple::x86_64 : Triple::x86; +} + +int X86FrameLowering::GetFixedFpPosition() const +{ + return 2; +} + +int X86FrameLowering::GetFrameReserveSize(MachineFunction &MF) const +{ + int slotSize = sizeof(uint64_t); + if (!Is64Bit) { + slotSize = sizeof(uint32_t); + } + int reserveSize = 0; + MF.getFunction() + .getFnAttribute("frame-reserved-slots") + .getValueAsString() + .getAsInteger(10, reserveSize); + + // x86-64 should align the reserved size to 16 bytes + if (Is64Bit) { + return RoundUp(reserveSize, 2 * sizeof(uint64_t)); + } + return reserveSize; +} +#endif + void X86FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { assert(&STI == &MF.getSubtarget() && @@ -1486,6 +1517,20 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, else MFI.setOffsetAdjustment(-StackSize); } +#ifdef ARK_GC_SUPPORT + // Reserve the stack slots requested via the "frame-reserved-slots" attribute (frame marker) + if (MF.getFunction().hasFnAttribute("frame-reserved-slots")) + { + unsigned StackPtr = TRI->getStackRegister(); + int reserveSize = GetFrameReserveSize(MF); + const unsigned SUBOpc = + getSUBriOpcode(Uses64BitFramePtr, reserveSize); + BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), StackPtr) + .addReg(StackPtr) + .addImm(reserveSize) + .setMIFlag(MachineInstr::FrameSetup); + } +#endif // For EH funclets, only allocate enough space for outgoing calls. Save the // NumBytes value that we would've used for the parent frame. @@ -1959,6 +2004,22 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // AfterPop is the position to insert .cfi_restore. MachineBasicBlock::iterator AfterPop = MBBI; if (HasFP) { +#ifdef ARK_GC_SUPPORT + if (MF.getFunction().hasFnAttribute("frame-reserved-slots")) + { + + int reserveSize = GetFrameReserveSize(MF); + int slotSize = sizeof(uint32_t); + if (Is64Bit) { + slotSize = sizeof(uint64_t); + } + for (int i = 0; i < reserveSize / slotSize; i++) { + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), + MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); + } + } +#endif // Pop EBP. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr) @@ -2354,7 +2415,11 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( } } } - +#ifdef ARK_GC_SUPPORT + int reserveSize = GetFrameReserveSize(MF); + SpillSlotOffset -= reserveSize; // skip the reserved frame slots + CalleeSavedFrameSize += reserveSize; +#endif // Assign slots for GPRs. It increases frame size. for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i - 1].getReg(); diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h index 26e80811af2e5aab3320a62e7ed41cb68eddb78c..81be7fb94bb9f506071739393a226747f6cb79de 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.h +++ b/llvm/lib/Target/X86/X86FrameLowering.h @@ -70,6 +70,11 @@ public: /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function.
+#ifdef ARK_GC_SUPPORT + Triple::ArchType GetArkSupportTarget() const override; + int GetFixedFpPosition() const override; + int GetFrameReserveSize(MachineFunction &MF) const override; +#endif void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1e2407c7e7f6b8b29cc964409a1274f71faeaf31..f20b78aa918a4d5a243ed480a4e9cd5226707d1a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3958,8 +3958,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ++NumTailCalls; } +#ifdef ARK_GC_SUPPORT + assert(!(isVarArg && canGuaranteeTCO(CallConv) && (CallConv != CallingConv::GHC)) && + "Var args not supported with calling convention fastcc, ghc or hipe"); +#else assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); +#endif // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index d90b4e7bdc7ea1549b51b333a8665fbc724f40f9..00d30e503f120471d23f6d81d39af01fff4f491a 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -541,6 +541,12 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { for (const MCPhysReg &SubReg : subregs_inclusive(X86::RBP)) Reserved.set(SubReg); } +#ifdef ARK_GC_SUPPORT + if (MF.getFunction().getCallingConv() == CallingConv::GHC) { + for (const MCPhysReg &SubReg : subregs_inclusive(X86::RBP)) + Reserved.set(SubReg); + } +#endif // Set the base-pointer register and its aliases as reserved if needed. if (hasBasePointer(MF)) { diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index b7830555bf737b3e6a5faf80a684db104c5c7e34..1bfdde5e6606fc7dfb72f738020ecee68a9a2f6d 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -521,6 +521,16 @@ static BaseDefiningValueResult findBaseDefiningValue(Value *I) { return BaseDefiningValueResult( ConstantPointerNull::get(cast(I->getType())), true); } +#ifdef ARK_GC_SUPPORT + // inttoptrs in an integral address space are currently ill-defined. We + // treat them as defining base pointers here for consistency with the + // constant rule above and because we don't really have a better semantic + // to give them. Note that the optimizer is always free to insert undefined + // behavior on dynamically dead paths as well. + // Associated LLVM review: https://reviews.llvm.org/D103492 + if (isa(I)) + return BaseDefiningValueResult(I, true); +#endif if (CastInst *CI = dyn_cast(I)) { Value *Def = CI->stripPointerCasts();