diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 4c1ed0f4fc9ac00e4bb6644cab71aef5c76d041c..e8c8886c53cdc5c10f7d690cda4087d6b65492c7 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -17678,6 +17678,45 @@ Arguments:
 
 The argument to this intrinsic must be a vector.
 
+'``llvm.experimental.cttz.elts``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.experimental.cttz.elts``
+on any vector of integer elements, both fixed-width and scalable.
+
+::
+
+      declare i8 @llvm.experimental.cttz.elts.i8.v8i1(<8 x i1> <src>, i1 <is_zero_poison>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.cttz.elts``' intrinsic counts the number of trailing
+zero elements of a vector.
+
+Arguments:
+""""""""""
+
+The first argument is the vector to be counted. This argument must be a vector
+with integer element type. The return type must also be an integer type which is
+wide enough to hold the maximum number of elements of the source vector. The
+behaviour of this intrinsic is undefined if the return type is not wide enough
+for the number of elements in the input vector.
+
+The second argument is a constant flag that indicates whether the intrinsic
+returns a valid result if the first argument is all zero. If the first argument
+is all zero and the second argument is true, the result is poison.
+
+Semantics:
+""""""""""
+
+The '``llvm.experimental.cttz.elts``' intrinsic counts the trailing (least
+significant) zero elements in a vector. If all elements of ``src`` are zero,
+the result is the number of elements in the input vector.
+
 '``llvm.experimental.vector.splice``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -17755,6 +17794,54 @@ Arguments:
 
 None.
 
+'``llvm.experimental.get.vector.length``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare i32 @llvm.experimental.get.vector.length.i32(i32 %cnt, i32 immarg %vf, i1 immarg %scalable)
+      declare i32 @llvm.experimental.get.vector.length.i64(i64 %cnt, i32 immarg %vf, i1 immarg %scalable)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.get.vector.length.*``' intrinsics take a number of
+elements to process and return how many of those elements can be processed
+with the requested vectorization factor.
+
+Arguments:
+""""""""""
+
+The first argument is an unsigned value of any scalar integer type and specifies
+the total number of elements to be processed. The second argument is an i32
+immediate for the vectorization factor. The third argument indicates if the
+vectorization factor should be multiplied by vscale.
+
+Semantics:
+""""""""""
+
+Returns a positive i32 value (explicit vector length) that is unknown at compile
+time and depends on the hardware specification.
+If the result value does not fit in the result type, then the result is
+a :ref:`poison value <poisonvalues>`.
+
+This intrinsic is intended to be used by loop vectorization with VP intrinsics
+in order to get the number of elements to process on each loop iteration. The
+result should be used to decrease the count for the next iteration until the
+count reaches zero.
+
+If the count is larger than the number of lanes in the type described by the
+last two arguments, this intrinsic may return a value less than the number of
+lanes implied by the type. The result on any given iteration will be at least
+as large as the result on any later loop iteration.
+
+This intrinsic will only return 0 if the input count is also 0. A non-zero input
+count will produce a non-zero result.
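+
+As an illustrative sketch only (the loop structure and the names ``%n``,
+``%entry`` and ``%loop`` below are assumptions for the example, not part of
+the intrinsic's definition), a stripmined loop over ``%n`` elements might use
+the intrinsic as follows:
+
+::
+
+      loop:
+        %count = phi i64 [ %n, %entry ], [ %count.next, %loop ]
+        ; How many of the remaining elements can be processed this iteration?
+        %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %count, i32 4, i1 true)
+        ; ... process %evl elements, e.g. with VP intrinsics ...
+        %evl.zext = zext i32 %evl to i64
+        %count.next = sub i64 %count, %evl.zext
+        %done = icmp eq i64 %count.next, 0
+        br i1 %done, label %exit, label %loop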
+
 Matrix Intrinsics
 -----------------
 
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 51b67d1756b9aa9f56f7409582cda34cc54d5a1d..078d9bd2c023d24428aed529ccb93fa6620ac1e2 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -566,6 +566,7 @@ public:
   const SCEV *getLosslessPtrToIntExpr(const SCEV *Op, unsigned Depth = 0);
   const SCEV *getPtrToIntExpr(const SCEV *Op, Type *Ty);
   const SCEV *getTruncateExpr(const SCEV *Op, Type *Ty, unsigned Depth = 0);
+  const SCEV *getVScale(Type *Ty);
   const SCEV *getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth = 0);
   const SCEV *getZeroExtendExprImpl(const SCEV *Op, Type *Ty,
                                     unsigned Depth = 0);
@@ -660,10 +661,8 @@ public:
     return getConstant(Ty, -1, /*isSigned=*/true);
   }
 
-  /// Return an expression for sizeof ScalableTy that is type IntTy, where
-  /// ScalableTy is a scalable vector type.
-  const SCEV *getSizeOfScalableVectorExpr(Type *IntTy,
-                                          ScalableVectorType *ScalableTy);
+  /// Return an expression for a TypeSize.
+  const SCEV *getSizeOfExpr(Type *IntTy, TypeSize Size);
 
   /// Return an expression for the alloc size of AllocTy that is type IntTy
   const SCEV *getSizeOfExpr(Type *IntTy, Type *AllocTy);
diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionDivision.h b/llvm/include/llvm/Analysis/ScalarEvolutionDivision.h
index 7d5902d317952039f9edf590ad035f5e94d3aed4..3283d438ccb515c88826225c01a38f760f3043d4 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolutionDivision.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolutionDivision.h
@@ -48,6 +48,8 @@ public:
 
   void visitConstant(const SCEVConstant *Numerator);
 
+  void visitVScale(const SCEVVScale *Numerator);
+
   void visitAddRecExpr(const SCEVAddRecExpr *Numerator);
 
   void visitAddExpr(const SCEVAddExpr *Numerator);
diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h b/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h
index 80443510d449c6dff8008b8bd45cdad0d07a6f9a..c7a27871bc84f93575f3a9070ba3d5b2fbff5017 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h
@@ -39,6 +39,7 @@ enum SCEVTypes : unsigned short {
   // These should be ordered in terms of increasing complexity to make the
   // folders simpler.
   scConstant,
+  scVScale,
   scTruncate,
   scZeroExtend,
   scSignExtend,
@@ -75,6 +76,23 @@ public:
   static bool classof(const SCEV *S) { return S->getSCEVType() == scConstant; }
 };
 
+/// This class represents the value of vscale, as used when defining the length
+/// of a scalable vector or returned by the llvm.vscale() intrinsic.
+class SCEVVScale : public SCEV {
+  friend class ScalarEvolution;
+
+  SCEVVScale(const FoldingSetNodeIDRef ID, Type *ty)
+      : SCEV(ID, scVScale, 0), Ty(ty) {}
+
+  Type *Ty;
+
+public:
+  Type *getType() const { return Ty; }
+
+  /// Methods for support type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const SCEV *S) { return S->getSCEVType() == scVScale; }
+};
+
 inline unsigned short computeExpressionSize(ArrayRef<const SCEV *> Args) {
   APInt Size(16, 1);
   for (const auto *Arg : Args)
@@ -579,17 +597,8 @@ class SCEVUnknown final : public SCEV, private CallbackVH {
 public:
   Value *getValue() const { return getValPtr(); }
 
-  /// @{
-  /// Test whether this is a special constant representing a type
-  /// size, alignment, or field offset in a target-independent
-  /// manner, and hasn't happened to have been folded with other
-  /// operations into something unrecognizable. This is mainly only
-  /// useful for pretty-printing and other situations where it isn't
-  /// absolutely required for these to succeed.
-  bool isSizeOf(Type *&AllocTy) const;
   bool isAlignOf(Type *&AllocTy) const;
   bool isOffsetOf(Type *&STy, Constant *&FieldNo) const;
-  /// @}
 
   Type *getType() const { return getValPtr()->getType(); }
 
@@ -604,6 +613,8 @@ template <typename SC, typename RetVal> struct SCEVVisitor {
     switch (S->getSCEVType()) {
     case scConstant:
       return ((SC *)this)->visitConstant((const SCEVConstant *)S);
+    case scVScale:
+      return ((SC *)this)->visitVScale((const SCEVVScale *)S);
     case scPtrToInt:
       return ((SC *)this)->visitPtrToIntExpr((const SCEVPtrToIntExpr *)S);
     case scTruncate:
@@ -671,6 +682,7 @@ public:
 
       switch (S->getSCEVType()) {
       case scConstant:
+      case scVScale:
       case scUnknown:
         continue;
      case scPtrToInt:
@@ -760,6 +772,8 @@ public:
 
   const SCEV *visitConstant(const SCEVConstant *Constant) { return Constant; }
 
+  const SCEV *visitVScale(const SCEVVScale *VScale) { return VScale; }
+
   const SCEV *visitPtrToIntExpr(const SCEVPtrToIntExpr *Expr) {
     const SCEV *Operand = ((SC *)this)->visit(Expr->getOperand());
     return Operand == Expr->getOperand()
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index ce3ff2ee7aecab20f159ffb99fda4b758951db09..ad30d83aec433af939aaa1bfc3f4d4fd33c947ae 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1039,6 +1039,9 @@ public:
   /// \return The associativity of the cache level, if available.
   std::optional<unsigned> getCacheAssociativity(CacheLevel Level) const;
 
+  /// \return The minimum architectural page size for the target.
+  std::optional<unsigned> getMinPageSize() const;
+
   /// \return How much before a load we should place the prefetch
   /// instruction. This is currently measured in number of
   /// instructions.
@@ -1741,6 +1744,7 @@ public:
   virtual std::optional<unsigned> getCacheSize(CacheLevel Level) const = 0;
   virtual std::optional<unsigned>
   getCacheAssociativity(CacheLevel Level) const = 0;
+  virtual std::optional<unsigned> getMinPageSize() const = 0;
 
   /// \return How much before a load we should place the prefetch
   /// instruction. This is currently measured in number of
@@ -2297,6 +2301,10 @@ public:
     return Impl.getCacheAssociativity(Level);
   }
 
+  std::optional<unsigned> getMinPageSize() const override {
+    return Impl.getMinPageSize();
+  }
+
   /// Return the preferred prefetch distance in terms of instructions.
   ///
   unsigned getPrefetchDistance() const override {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 3a99b02bc3637eed73572cb1d6891eef48e87689..719e6ce43c62e77094e4c7ffe52c54c9a137d832 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -483,6 +483,8 @@ public:
     llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
   }
 
+  std::optional<unsigned> getMinPageSize() const { return {}; }
+
   unsigned getPrefetchDistance() const { return 0; }
   unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                 unsigned NumStridedMemAccesses,
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index d1d31dc795b2a4db5464163756621ee49830a842..51e7a301b766d4846353debbf22c5b1a9fd0d379 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -554,6 +554,10 @@ OverflowResult computeOverflowForSignedSub(const Value *LHS, const Value *RHS,
 bool isOverflowIntrinsicNoWrap(const WithOverflowInst *WO,
                                const DominatorTree &DT);
 
+/// Determine the possible constant range of vscale with the given bit width,
+/// based on the vscale_range function attribute.
+ConstantRange getVScaleRange(const Function *F, unsigned BitWidth);
+
 /// Determine the possible constant range of an integer or vector of integer
 /// value. This is intended as a cheap, non-recursive check.
 ConstantRange computeConstantRange(const Value *V, bool ForSigned,
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index aa1936c2757e71e16befcd3774a650da33cd2223..e3391e43b6a37233629de91ae665afb5a0edd307 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1066,6 +1066,9 @@ public:
                    getConstant(MulImm.sextOrTrunc(VT.getSizeInBits()), DL, VT));
   }
 
+  SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
+                          bool ConstantFold = true);
+
   /// Return a GLOBAL_OFFSET_TABLE node. This does not have a useful SDLoc.
   SDValue getGLOBAL_OFFSET_TABLE(EVT VT) {
     return getNode(ISD::GLOBAL_OFFSET_TABLE, SDLoc(), VT);
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 575ed22fe549ad2360c6a9f3259a1e3c96777d79..49e2f10a55c912d34a063af59c67e41a04483549 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -444,6 +444,15 @@ public:
     return true;
   }
 
+  virtual bool shouldExpandGetVectorLength(EVT CountVT, unsigned VF,
+                                           bool IsScalable) const {
+    return true;
+  }
+
+  /// Return true if the @llvm.experimental.cttz.elts intrinsic should be
+  /// expanded using generic code in SelectionDAGBuilder.
+  virtual bool shouldExpandCttzElements(EVT VT) const { return true; }
+
   /// Return true if it is profitable to convert a select of FP constants into
   /// a constant pool load whose address depends on the select condition. The
   /// parameter may be used to differentiate a select with FP compare from
   /// integer compare.
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.h b/llvm/include/llvm/CodeGen/ValueTypes.h
index af4c8ab40e8236373d34c2122ccc439a87db6e31..f59a7f2676a3719febece27b2c999bb06285258d 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.h
+++ b/llvm/include/llvm/CodeGen/ValueTypes.h
@@ -122,7 +122,7 @@ namespace llvm {
     /// Test if the given EVT has zero size, this will fail if called on a
     /// scalable type
     bool isZeroSized() const {
-      return !isScalableVector() && getSizeInBits() == 0;
+      return getSizeInBits().isZero();
     }
 
     /// Test if the given EVT is simple (as opposed to being extended).
@@ -150,6 +150,12 @@ namespace llvm {
       return isSimple() ? V.isScalarInteger() : isExtendedScalarInteger();
     }
 
+    /// Return true if this is a vector type where the runtime
+    /// length is machine dependent
+    bool isScalableTargetExtVT() const {
+      return isSimple() && V.isScalableTargetExtVT();
+    }
+
     /// Return true if this is a vector value type.
     bool isVector() const {
       return isSimple() ? V.isVector() : isExtendedVector();
@@ -166,6 +172,11 @@ namespace llvm {
         : isExtendedFixedLengthVector();
     }
 
+    /// Return true if the type is a scalable type.
+    bool isScalableVT() const {
+      return isScalableVector() || isScalableTargetExtVT();
+    }
+
     /// Return true if this is a 16-bit vector type.
     bool is16BitVector() const {
       return isSimple() ? V.is16BitVector() : isExtended16BitVector();
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index c22553855c55601afddf389b9004cdaa3becca90..934800f1074732f6f0fc8e5c1348dff127dd6f13 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -236,6 +236,8 @@ def funcref : ValueType<0, 192>; // WebAssembly's funcref type
 def externref : ValueType<0, 193>; // WebAssembly's externref type
 def x86amx : ValueType<8192, 194>; // X86 AMX value
 def i64x8 : ValueType<512, 195>; // 8 Consecutive GPRs (AArch64)
+def aarch64svcount
+    : ValueType<16, 196>; // AArch64 predicate-as-counter
 
 def token : ValueType<0, 248>; // TokenTy
 def MetadataVT : ValueType<0, 249>; // Metadata
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index e0fd727607ce68435de55f1ea51c2b996ad4567a..8fd7ad473fa3d70db2bf68becf50ac46e8f7b636 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1839,6 +1839,17 @@ def int_get_active_lane_mask:
                         [llvm_anyint_ty, LLVMMatchType<1>],
                         [IntrNoMem, IntrNoSync, IntrWillReturn]>;
 
+def int_experimental_get_vector_length:
+  DefaultAttrsIntrinsic<[llvm_i32_ty],
+                        [llvm_anyint_ty, llvm_i32_ty, llvm_i1_ty],
+                        [IntrNoMem, IntrNoSync, IntrWillReturn,
+                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+
+def int_experimental_cttz_elts:
+  DefaultAttrsIntrinsic<[llvm_anyint_ty],
+                        [llvm_anyvector_ty, llvm_i1_ty],
+                        [IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
+
 def int_experimental_vp_splice:
   DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                         [LLVMMatchType<0>,
diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h
index 37e4c32ee25131e92a7825c6e55b05e46ed91f75..7d3a1d3f1c7b6c8959a53820d65f49cbf4bcedee 100644
--- a/llvm/include/llvm/IR/Type.h
+++ b/llvm/include/llvm/IR/Type.h
@@ -206,6 +206,15 @@ public:
   /// Return true if this is a target extension type.
   bool isTargetExtTy() const { return getTypeID() == TargetExtTyID; }
 
+  /// Return true if this is a target extension type with a scalable layout.
+  bool isScalableTargetExtTy() const;
+
+  /// Return true if this is a scalable vector type or a target extension type
+  /// with a scalable layout.
+  bool isScalableTy() const {
+    return getTypeID() == ScalableVectorTyID || isScalableTargetExtTy();
+  }
+
   /// Return true if this is a FP type or a vector of FP.
   bool isFPOrFPVectorTy() const { return getScalarType()->isFloatingPointTy(); }
 
diff --git a/llvm/include/llvm/Support/MachineValueType.h b/llvm/include/llvm/Support/MachineValueType.h
index d7ad32737a45706cfa40deeb6052e3fd16df8b4d..a7ea92aa6d365644e2e125c72fcf81c6b87dd144 100644
--- a/llvm/include/llvm/Support/MachineValueType.h
+++ b/llvm/include/llvm/Support/MachineValueType.h
@@ -291,9 +291,10 @@ namespace llvm {
       externref      = 193,   // WebAssembly's externref type
       x86amx         = 194,   // This is an X86 AMX value
       i64x8          = 195,   // 8 Consecutive GPRs (AArch64)
+      aarch64svcount = 196,   // AArch64 predicate-as-counter
 
       FIRST_VALUETYPE =  1,   // This is always the beginning of the list.
-      LAST_VALUETYPE = i64x8, // This always remains at the end of the list.
+      LAST_VALUETYPE = aarch64svcount, // This always remains at the end of the list.
       VALUETYPE_SIZE = LAST_VALUETYPE + 1,
 
       // This is the current maximum for LAST_VALUETYPE.
@@ -395,6 +396,16 @@ namespace llvm {
               SimpleTy <= MVT::LAST_SCALABLE_VECTOR_VALUETYPE);
     }
 
+    /// Return true if this is a custom target type that has a scalable size.
+    bool isScalableTargetExtVT() const {
+      return SimpleTy == MVT::aarch64svcount;
+    }
+
+    /// Return true if the type is a scalable type.
+    bool isScalableVT() const {
+      return isScalableVector() || isScalableTargetExtVT();
+    }
+
     bool isFixedLengthVector() const {
       return (SimpleTy >= MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE &&
               SimpleTy <= MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE);
@@ -956,6 +967,7 @@ namespace llvm {
     case v2i8:
     case v1i16:
     case v1f16: return TypeSize::Fixed(16);
+    case aarch64svcount:
     case nxv16i1:
     case nxv2i8:
     case nxv1i16:
diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
index 131e24f685e89f40fc4afe5b650686b06022acad..555897083469aeeb73bdf6f8102979ed9da852a9 100644
--- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
+++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h
@@ -457,6 +457,8 @@ private:
 
   Value *visitConstant(const SCEVConstant *S) { return S->getValue(); }
 
+  Value *visitVScale(const SCEVVScale *S);
+
   Value *visitPtrToIntExpr(const SCEVPtrToIntExpr *S);
 
   Value *visitTruncateExpr(const SCEVTruncateExpr *S);
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index f55333303f8d9de987183e52017bac39d5b43a6e..48e435a06ba1556b3e9193ea62580f6bacbeeec0 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -204,7 +204,7 @@ bool llvm::isDereferenceableAndAlignedPointer(
     const TargetLibraryInfo *TLI) {
   // For unsized types or scalable vectors we don't know exactly how many bytes
   // are dereferenced, so bail out.
-  if (!Ty->isSized() || isa<ScalableVectorType>(Ty))
+  if (!Ty->isSized() || Ty->isScalableTy())
     return false;
 
   // When dereferenceability information is provided by a dereferenceable
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 8c62fc37c4a36ca35506c3d23d1f5f7c182a4bed..4ff97688152cec2395e8e5008b395ff9821cb42d 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -271,6 +271,9 @@ void SCEV::print(raw_ostream &OS) const {
   case scConstant:
     cast<SCEVConstant>(this)->getValue()->printAsOperand(OS, false);
     return;
+  case scVScale:
+    OS << "vscale";
+    return;
   case scPtrToInt: {
     const SCEVPtrToIntExpr *PtrToInt = cast<SCEVPtrToIntExpr>(this);
     const SCEV *Op = PtrToInt->getOperand();
@@ -366,31 +369,9 @@ void SCEV::print(raw_ostream &OS) const {
     OS << "(" << *UDiv->getLHS() << " /u " << *UDiv->getRHS() << ")";
     return;
   }
-  case scUnknown: {
-    const SCEVUnknown *U = cast<SCEVUnknown>(this);
-    Type *AllocTy;
-    if (U->isSizeOf(AllocTy)) {
-      OS << "sizeof(" << *AllocTy << ")";
-      return;
-    }
-    if (U->isAlignOf(AllocTy)) {
-      OS << "alignof(" << *AllocTy << ")";
-      return;
-    }
-
-    Type *CTy;
-    Constant *FieldNo;
-    if (U->isOffsetOf(CTy, FieldNo)) {
-      OS << "offsetof(" << *CTy << ", ";
-      FieldNo->printAsOperand(OS, false);
-      OS << ")";
-      return;
-    }
-
-    // Otherwise just print it normally.
-    U->getValue()->printAsOperand(OS, false);
+  case scUnknown:
+    cast<SCEVUnknown>(this)->getValue()->printAsOperand(OS, false);
     return;
-  }
   case scCouldNotCompute:
     OS << "***COULDNOTCOMPUTE***";
     return;
@@ -402,6 +383,8 @@ Type *SCEV::getType() const {
   switch (getSCEVType()) {
   case scConstant:
     return cast<SCEVConstant>(this)->getType();
+  case scVScale:
+    return cast<SCEVVScale>(this)->getType();
   case scPtrToInt:
   case scTruncate:
   case scZeroExtend:
@@ -433,6 +416,7 @@ ArrayRef<const SCEV *> SCEV::operands() const {
   switch (getSCEVType()) {
   case scConstant:
+  case scVScale:
   case scUnknown:
     return {};
   case scPtrToInt:
@@ -515,6 +499,18 @@ ScalarEvolution::getConstant(Type *Ty, uint64_t V, bool isSigned) {
   return getConstant(ConstantInt::get(ITy, V, isSigned));
 }
 
+const SCEV *ScalarEvolution::getVScale(Type *Ty) {
+  FoldingSetNodeID ID;
+  ID.AddInteger(scVScale);
+  ID.AddPointer(Ty);
+  void *IP = nullptr;
+  if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP))
+    return S;
+  SCEV *S = new (SCEVAllocator) SCEVVScale(ID.Intern(SCEVAllocator), Ty);
+  UniqueSCEVs.InsertNode(S, IP);
+  return S;
+}
+
 SCEVCastExpr::SCEVCastExpr(const FoldingSetNodeIDRef ID, SCEVTypes SCEVTy,
                            const SCEV *op, Type *ty)
     : SCEV(ID, SCEVTy, computeExpressionSize(op)), Op(op), Ty(ty) {}
@@ -574,22 +570,6 @@ void SCEVUnknown::allUsesReplacedWith(Value *New) {
   setValPtr(New);
 }
 
-bool SCEVUnknown::isSizeOf(Type *&AllocTy) const {
-  if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(getValue()))
-    if (VCE->getOpcode() == Instruction::PtrToInt)
-      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(VCE->getOperand(0)))
-        if (CE->getOpcode() == Instruction::GetElementPtr &&
-            CE->getOperand(0)->isNullValue() &&
-            CE->getNumOperands() == 2)
-          if (ConstantInt *CI = dyn_cast<ConstantInt>(CE->getOperand(1)))
-            if (CI->isOne()) {
-              AllocTy = cast<GEPOperator>(CE)->getSourceElementType();
-              return true;
-            }
-
-  return false;
-}
-
 bool SCEVUnknown::isAlignOf(Type *&AllocTy) const {
   if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(getValue()))
     if (VCE->getOpcode() == Instruction::PtrToInt)
@@ -785,6 +765,12 @@ CompareSCEVComplexity(EquivalenceClasses<const SCEV *> &EqCacheSCEV,
     return LA.ult(RA) ? -1 : 1;
   }
+  case scVScale: {
+    const auto *LTy = cast<IntegerType>(cast<SCEVVScale>(LHS)->getType());
+    const auto *RTy = cast<IntegerType>(cast<SCEVVScale>(RHS)->getType());
+    return LTy->getBitWidth() - RTy->getBitWidth();
+  }
+
   case scAddRecExpr: {
     const SCEVAddRecExpr *LA = cast<SCEVAddRecExpr>(LHS);
     const SCEVAddRecExpr *RA = cast<SCEVAddRecExpr>(RHS);
@@ -4050,6 +4036,8 @@ public:
 
   RetVal visitConstant(const SCEVConstant *Constant) { return Constant; }
 
+  RetVal visitVScale(const SCEVVScale *VScale) { return VScale; }
+
   RetVal visitPtrToIntExpr(const SCEVPtrToIntExpr *Expr) { return Expr; }
 
   RetVal visitTruncateExpr(const SCEVTruncateExpr *Expr) { return Expr; }
@@ -4096,6 +4084,7 @@ public:
 static bool scevUnconditionallyPropagatesPoisonFromOperands(SCEVTypes Kind) {
   switch (Kind) {
   case scConstant:
+  case scVScale:
  case scTruncate:
  case scZeroExtend:
  case scSignExtend:
@@ -4139,6 +4128,7 @@ static bool impliesPoison(const SCEV *AssumedPoison, const SCEV *S) {
   if (!scevUnconditionallyPropagatesPoisonFromOperands(S->getSCEVType())) {
     switch (S->getSCEVType()) {
     case scConstant:
+    case scVScale:
    case scTruncate:
    case scZeroExtend:
    case scSignExtend:
@@ -4348,33 +4338,19 @@ const SCEV *ScalarEvolution::getUMinExpr(SmallVectorImpl<const SCEV *> &Ops,
 }
 
 const SCEV *
-ScalarEvolution::getSizeOfScalableVectorExpr(Type *IntTy,
-                                             ScalableVectorType *ScalableTy) {
-  Constant *NullPtr = Constant::getNullValue(ScalableTy->getPointerTo());
-  Constant *One = ConstantInt::get(IntTy, 1);
-  Constant *GEP = ConstantExpr::getGetElementPtr(ScalableTy, NullPtr, One);
-  // Note that the expression we created is the final expression, we don't
-  // want to simplify it any further Also, if we call a normal getSCEV(),
-  // we'll end up in an endless recursion. So just create an SCEVUnknown.
-  return getUnknown(ConstantExpr::getPtrToInt(GEP, IntTy));
+ScalarEvolution::getSizeOfExpr(Type *IntTy, TypeSize Size) {
+  const SCEV *Res = getConstant(IntTy, Size.getKnownMinValue());
+  if (Size.isScalable())
+    Res = getMulExpr(Res, getVScale(IntTy));
+  return Res;
 }
 
 const SCEV *ScalarEvolution::getSizeOfExpr(Type *IntTy, Type *AllocTy) {
-  if (auto *ScalableAllocTy = dyn_cast<ScalableVectorType>(AllocTy))
-    return getSizeOfScalableVectorExpr(IntTy, ScalableAllocTy);
-  // We can bypass creating a target-independent constant expression and then
-  // folding it back into a ConstantInt. This is just a compile-time
-  // optimization.
-  return getConstant(IntTy, getDataLayout().getTypeAllocSize(AllocTy));
+  return getSizeOfExpr(IntTy, getDataLayout().getTypeAllocSize(AllocTy));
 }
 
 const SCEV *ScalarEvolution::getStoreSizeOfExpr(Type *IntTy, Type *StoreTy) {
-  if (auto *ScalableStoreTy = dyn_cast<ScalableVectorType>(StoreTy))
-    return getSizeOfScalableVectorExpr(IntTy, ScalableStoreTy);
-  // We can bypass creating a target-independent constant expression and then
-  // folding it back into a ConstantInt. This is just a compile-time
-  // optimization.
-  return getConstant(IntTy, getDataLayout().getTypeStoreSize(StoreTy));
+  return getSizeOfExpr(IntTy, getDataLayout().getTypeStoreSize(StoreTy));
 }
 
 const SCEV *ScalarEvolution::getOffsetOfExpr(Type *IntTy,
@@ -5927,6 +5903,7 @@ static bool IsAvailableOnEntry(const Loop *L, DominatorTree &DT, const SCEV *S,
     bool follow(const SCEV *S) {
       switch (S->getSCEVType()) {
       case scConstant:
+      case scVScale:
      case scPtrToInt:
      case scTruncate:
      case scZeroExtend:
@@ -6314,6 +6291,8 @@ uint32_t ScalarEvolution::GetMinTrailingZerosImpl(const SCEV *S) {
   switch (S->getSCEVType()) {
   case scConstant:
     return cast<SCEVConstant>(S)->getAPInt().countTrailingZeros();
+  case scVScale:
+    return 0;
   case scTruncate: {
     const SCEVTruncateExpr *T = cast<SCEVTruncateExpr>(S);
     return std::min(GetMinTrailingZeros(T->getOperand()),
@@ -6544,6 +6523,7 @@ ScalarEvolution::getRangeRefIter(const SCEV *S,
         break;
       [[fallthrough]];
     case scConstant:
+    case scVScale:
    case scTruncate:
    case scZeroExtend:
@@ -6647,6 +6627,8 @@ const ConstantRange &ScalarEvolution::getRangeRef(
   switch (S->getSCEVType()) {
   case scConstant:
    llvm_unreachable("Already handled above.");
+  case scVScale:
+    return setRange(S, SignHint, getVScaleRange(&F, BitWidth));
   case scTruncate: {
     const SCEVTruncateExpr *Trunc = cast<SCEVTruncateExpr>(S);
     ConstantRange X = getRangeRef(Trunc->getOperand(), SignHint, Depth + 1);
@@ -8057,6 +8039,8 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
       // A start_loop_iterations or llvm.annotation or llvm.prt.annotation is
      // just eqivalent to the first operand for SCEV purposes.
      return getSCEV(II->getArgOperand(0));
+    case Intrinsic::vscale:
+      return getVScale(II->getType());
    default:
      break;
    }
@@ -9762,6 +9746,7 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) {
   switch (V->getSCEVType()) {
   case scCouldNotCompute:
   case scAddRecExpr:
+  case scVScale:
     return nullptr;
   case scConstant:
     return cast<SCEVConstant>(V)->getValue();
@@ -9845,6 +9830,7 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) {
 const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
   switch (V->getSCEVType()) {
   case scConstant:
+  case scVScale:
     return V;
   case scAddRecExpr: {
     // If this is a loop recurrence for a loop that does not contain L, then we
@@ -9943,6 +9929,7 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
   case scSequentialUMinExpr:
     return getSequentialMinMaxExpr(V->getSCEVType(), NewOps);
   case scConstant:
+  case scVScale:
   case scAddRecExpr:
   case scUnknown:
   case scCouldNotCompute:
@@ -13705,6 +13692,7 @@ ScalarEvolution::LoopDisposition
 ScalarEvolution::computeLoopDisposition(const SCEV *S, const Loop *L) {
   switch (S->getSCEVType()) {
   case scConstant:
+  case scVScale:
     return LoopInvariant;
   case scAddRecExpr: {
     const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(S);
@@ -13803,6 +13791,7 @@ ScalarEvolution::BlockDisposition
 ScalarEvolution::computeBlockDisposition(const SCEV *S, const BasicBlock *BB) {
   switch (S->getSCEVType()) {
   case scConstant:
+  case scVScale:
     return ProperlyDominatesBlock;
   case scAddRecExpr: {
     // This uses a "dominates" query instead of "properly dominates" query
diff --git a/llvm/lib/Analysis/ScalarEvolutionDivision.cpp b/llvm/lib/Analysis/ScalarEvolutionDivision.cpp
index 0619569bf8168f87969b9f6c50189948830e196d..e1dd834cfb100d19015ab08c00fd2c330a5185a3 100644
--- a/llvm/lib/Analysis/ScalarEvolutionDivision.cpp
+++ b/llvm/lib/Analysis/ScalarEvolutionDivision.cpp
@@ -126,6 +126,10 @@ void SCEVDivision::visitConstant(const SCEVConstant *Numerator) {
   }
 }
 
+void SCEVDivision::visitVScale(const SCEVVScale *Numerator) {
+  return cannotDivide(Numerator);
+}
cannotDivide(Numerator); +} + void SCEVDivision::visitAddRecExpr(const SCEVAddRecExpr *Numerator) { const SCEV *StartQ, *StartR, *StepQ, *StepR; if (!Numerator->isAffine()) diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index e26c7ad3dd90bf505a7a85645352ef7568363dde..aa182266f17e3d89f02b99e852edde5346dfe987 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -37,6 +37,10 @@ static cl::opt CacheLineSize( cl::desc("Use this to override the target cache line size when " "specified by the user.")); +static cl::opt MinPageSize( + "min-page-size", cl::init(0), cl::Hidden, + cl::desc("Use this to override the target's minimum page size.")); + namespace { /// No-op implementation of the TTI interface using the utility base /// classes. @@ -709,6 +713,11 @@ TargetTransformInfo::getCacheAssociativity(CacheLevel Level) const { return TTIImpl->getCacheAssociativity(Level); } +std::optional TargetTransformInfo::getMinPageSize() const { + return MinPageSize.getNumOccurrences() > 0 ? MinPageSize + : TTIImpl->getMinPageSize(); +} + unsigned TargetTransformInfo::getPrefetchDistance() const { return TTIImpl->getPrefetchDistance(); } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index a13bdade320f98ecda004193b768e15dc833bc49..3bda11f9cca5c0fe7a962496e378256571d3e14e 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -1058,6 +1058,25 @@ static void computeKnownBitsFromShiftOperator( Known.setAllZero(); } +ConstantRange llvm::getVScaleRange(const Function *F, unsigned BitWidth) { + Attribute Attr = F->getFnAttribute(Attribute::VScaleRange); + // Without vscale_range, we only know that vscale is non-zero. + if (!Attr.isValid()) + return ConstantRange(APInt(BitWidth, 1), APInt::getZero(BitWidth)); + + unsigned AttrMin = Attr.getVScaleRangeMin(); + // Minimum is larger than vscale width, result is always poison. + if ((unsigned)llvm::bit_width(AttrMin) > BitWidth) + return ConstantRange::getEmpty(BitWidth); + + APInt Min(BitWidth, AttrMin); + std::optional AttrMax = Attr.getVScaleRangeMax(); + if (!AttrMax || (unsigned)llvm::bit_width(*AttrMax) > BitWidth) + return ConstantRange(Min, APInt::getZero(BitWidth)); + + return ConstantRange(Min, APInt(BitWidth, *AttrMax) + 1); +} + static void computeKnownBitsFromOperator(const Operator *I, const APInt &DemandedElts, KnownBits &Known, unsigned Depth, @@ -1742,31 +1761,10 @@ static void computeKnownBitsFromOperator(const Operator *I, Known.Zero.setBitsFrom(31); break; case Intrinsic::vscale: { - if (!II->getParent() || !II->getFunction() || - !II->getFunction()->hasFnAttribute(Attribute::VScaleRange)) - break; - - auto Attr = II->getFunction()->getFnAttribute(Attribute::VScaleRange); - std::optional VScaleMax = Attr.getVScaleRangeMax(); - - if (!VScaleMax) - break; - - unsigned VScaleMin = Attr.getVScaleRangeMin(); - - // If vscale min = max then we know the exact value at compile time - // and hence we know the exact bits. 
-      if (VScaleMin == VScaleMax) {
-        Known.One = VScaleMin;
-        Known.Zero = VScaleMin;
-        Known.Zero.flipAllBits();
+      if (!II->getParent() || !II->getFunction())
         break;
-      }
-
-      unsigned FirstZeroHighBit = llvm::bit_width(*VScaleMax);
-      if (FirstZeroHighBit < BitWidth)
-        Known.Zero.setBitsFrom(FirstZeroHighBit);
+
+      Known = getVScaleRange(II->getFunction(), BitWidth).toKnownBits();
       break;
     }
     }
@@ -7229,67 +7227,66 @@ static void setLimitsForBinOp(const BinaryOperator &BO, APInt &Lower,
   }
 }
 
-static void setLimitsForIntrinsic(const IntrinsicInst &II, APInt &Lower,
-                                  APInt &Upper) {
-  unsigned Width = Lower.getBitWidth();
+static ConstantRange getRangeForIntrinsic(const IntrinsicInst &II) {
+  unsigned Width = II.getType()->getScalarSizeInBits();
   const APInt *C;
   switch (II.getIntrinsicID()) {
   case Intrinsic::ctpop:
   case Intrinsic::ctlz:
   case Intrinsic::cttz:
     // Maximum of set/clear bits is the bit width.
-    assert(Lower == 0 && "Expected lower bound to be zero");
-    Upper = Width + 1;
-    break;
+    return ConstantRange(APInt::getZero(Width), APInt(Width, Width + 1));
   case Intrinsic::uadd_sat:
     // uadd.sat(x, C) produces [C, UINT_MAX].
     if (match(II.getOperand(0), m_APInt(C)) ||
         match(II.getOperand(1), m_APInt(C)))
-      Lower = *C;
+      return ConstantRange::getNonEmpty(*C, APInt::getZero(Width));
     break;
   case Intrinsic::sadd_sat:
     if (match(II.getOperand(0), m_APInt(C)) ||
         match(II.getOperand(1), m_APInt(C))) {
-      if (C->isNegative()) {
+      if (C->isNegative())
         // sadd.sat(x, -C) produces [SINT_MIN, SINT_MAX + (-C)].
-        Lower = APInt::getSignedMinValue(Width);
-        Upper = APInt::getSignedMaxValue(Width) + *C + 1;
-      } else {
-        // sadd.sat(x, +C) produces [SINT_MIN + C, SINT_MAX].
-        Lower = APInt::getSignedMinValue(Width) + *C;
-        Upper = APInt::getSignedMaxValue(Width) + 1;
-      }
+        return ConstantRange::getNonEmpty(APInt::getSignedMinValue(Width),
+                                          APInt::getSignedMaxValue(Width) + *C +
+                                              1);
+
+      // sadd.sat(x, +C) produces [SINT_MIN + C, SINT_MAX].
+      return ConstantRange::getNonEmpty(APInt::getSignedMinValue(Width) + *C,
+                                        APInt::getSignedMaxValue(Width) + 1);
     }
     break;
   case Intrinsic::usub_sat:
     // usub.sat(C, x) produces [0, C].
     if (match(II.getOperand(0), m_APInt(C)))
-      Upper = *C + 1;
+      return ConstantRange::getNonEmpty(APInt::getZero(Width), *C + 1);
+
     // usub.sat(x, C) produces [0, UINT_MAX - C].
-    else if (match(II.getOperand(1), m_APInt(C)))
-      Upper = APInt::getMaxValue(Width) - *C + 1;
+    if (match(II.getOperand(1), m_APInt(C)))
+      return ConstantRange::getNonEmpty(APInt::getZero(Width),
+                                        APInt::getMaxValue(Width) - *C + 1);
     break;
   case Intrinsic::ssub_sat:
     if (match(II.getOperand(0), m_APInt(C))) {
-      if (C->isNegative()) {
+      if (C->isNegative())
         // ssub.sat(-C, x) produces [SINT_MIN, -SINT_MIN + (-C)].
-        Lower = APInt::getSignedMinValue(Width);
-        Upper = *C - APInt::getSignedMinValue(Width) + 1;
-      } else {
-        // ssub.sat(+C, x) produces [-SINT_MAX + C, SINT_MAX].
-        Lower = *C - APInt::getSignedMaxValue(Width);
-        Upper = APInt::getSignedMaxValue(Width) + 1;
-      }
+        return ConstantRange::getNonEmpty(APInt::getSignedMinValue(Width),
+                                          *C - APInt::getSignedMinValue(Width) +
+                                              1);
+
+      // ssub.sat(+C, x) produces [-SINT_MAX + C, SINT_MAX].
+      return ConstantRange::getNonEmpty(*C - APInt::getSignedMaxValue(Width),
+                                        APInt::getSignedMaxValue(Width) + 1);
     } else if (match(II.getOperand(1), m_APInt(C))) {
-      if (C->isNegative()) {
+      if (C->isNegative())
         // ssub.sat(x, -C) produces [SINT_MIN - (-C), SINT_MAX]:
-        Lower = APInt::getSignedMinValue(Width) - *C;
-        Upper = APInt::getSignedMaxValue(Width) + 1;
-      } else {
-        // ssub.sat(x, +C) produces [SINT_MIN, SINT_MAX - C].
-        Lower = APInt::getSignedMinValue(Width);
-        Upper = APInt::getSignedMaxValue(Width) - *C + 1;
-      }
+        return ConstantRange::getNonEmpty(APInt::getSignedMinValue(Width) - *C,
+                                          APInt::getSignedMaxValue(Width) + 1);
+
+      // ssub.sat(x, +C) produces [SINT_MIN, SINT_MAX - C].
+      return ConstantRange::getNonEmpty(APInt::getSignedMinValue(Width),
+                                        APInt::getSignedMaxValue(Width) - *C +
+                                            1);
     }
     break;
   case Intrinsic::umin:
@@ -7302,19 +7299,15 @@ static void setLimitsForIntrinsic(const IntrinsicInst &II, APInt &Lower,
 
     switch (II.getIntrinsicID()) {
     case Intrinsic::umin:
-      Upper = *C + 1;
-      break;
+      return ConstantRange::getNonEmpty(APInt::getZero(Width), *C + 1);
     case Intrinsic::umax:
-      Lower = *C;
-      break;
+      return ConstantRange::getNonEmpty(*C, APInt::getZero(Width));
    case Intrinsic::smin:
-      Lower = APInt::getSignedMinValue(Width);
-      Upper = *C + 1;
-      break;
+      return ConstantRange::getNonEmpty(APInt::getSignedMinValue(Width),
+                                        *C + 1);
    case Intrinsic::smax:
-      Lower = *C;
-      Upper = APInt::getSignedMaxValue(Width) + 1;
-      break;
+      return ConstantRange::getNonEmpty(*C,
+                                        APInt::getSignedMaxValue(Width) + 1);
    default:
      llvm_unreachable("Must be min/max intrinsic");
    }
@@ -7323,13 +7316,20 @@ static void setLimitsForIntrinsic(const IntrinsicInst &II, APInt &Lower,
     // If abs of SIGNED_MIN is poison, then the result is [0..SIGNED_MAX],
     // otherwise it is [0..SIGNED_MIN], as -SIGNED_MIN == SIGNED_MIN.
     if (match(II.getOperand(1), m_One()))
-      Upper = APInt::getSignedMaxValue(Width) + 1;
-    else
-      Upper = APInt::getSignedMinValue(Width) + 1;
-    break;
+      return ConstantRange(APInt::getZero(Width),
+                           APInt::getSignedMaxValue(Width) + 1);
+
+    return ConstantRange(APInt::getZero(Width),
+                         APInt::getSignedMinValue(Width) + 1);
+  case Intrinsic::vscale:
+    if (!II.getParent() || !II.getFunction())
+      break;
+    return getVScaleRange(II.getFunction(), Width);
   default:
     break;
   }
+
+  return ConstantRange::getFull(Width);
 }
 
 static void setLimitsForSelectPattern(const SelectInst &SI, APInt &Lower,
@@ -7418,18 +7418,28 @@ ConstantRange llvm::computeConstantRange(const Value *V, bool ForSigned,
   InstrInfoQuery IIQ(UseInstrInfo);
   unsigned BitWidth = V->getType()->getScalarSizeInBits();
 
-  APInt Lower = APInt(BitWidth, 0);
-  APInt Upper = APInt(BitWidth, 0);
-  if (auto *BO = dyn_cast<BinaryOperator>(V))
+  ConstantRange CR = ConstantRange::getFull(BitWidth);
+  if (auto *BO = dyn_cast<BinaryOperator>(V)) {
+    APInt Lower = APInt(BitWidth, 0);
+    APInt Upper = APInt(BitWidth, 0);
+    // TODO: Return ConstantRange.
     setLimitsForBinOp(*BO, Lower, Upper, IIQ, ForSigned);
-  else if (auto *II = dyn_cast<IntrinsicInst>(V))
-    setLimitsForIntrinsic(*II, Lower, Upper);
-  else if (auto *SI = dyn_cast<SelectInst>(V))
+    CR = ConstantRange::getNonEmpty(Lower, Upper);
+  } else if (auto *II = dyn_cast<IntrinsicInst>(V))
+    CR = getRangeForIntrinsic(*II);
+  else if (auto *SI = dyn_cast<SelectInst>(V)) {
+    APInt Lower = APInt(BitWidth, 0);
+    APInt Upper = APInt(BitWidth, 0);
+    // TODO: Return ConstantRange.
     setLimitsForSelectPattern(*SI, Lower, Upper, IIQ);
-  else if (isa<FPToSIInst>(V) || isa<FPToUIInst>(V))
+    CR = ConstantRange::getNonEmpty(Lower, Upper);
+  } else if (isa<FPToSIInst>(V) || isa<FPToUIInst>(V)) {
+    APInt Lower = APInt(BitWidth, 0);
+    APInt Upper = APInt(BitWidth, 0);
+    // TODO: Return ConstantRange.
     setLimitForFPToI(cast<Instruction>(V), Lower, Upper);
-
-  ConstantRange CR = ConstantRange::getNonEmpty(Lower, Upper);
+    CR = ConstantRange::getNonEmpty(Lower, Upper);
+  }
 
   if (auto *I = dyn_cast<Instruction>(V))
     if (auto *Range = IIQ.getMetadata(I, LLVMContext::MD_range))
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index c795503b3c7662a0d9ecedcbe8e3c616627040f9..d0090611adf6b13d4540f68305d8b4d53f0d9f88 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -7750,7 +7750,7 @@ static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
   // whereas scalable vectors would have to be shifted by
   // <2log(vscale) + number of bits> in order to store the
   // low/high parts. Bailing out for now.
-  if (isa<ScalableVectorType>(StoreType))
+  if (StoreType->isScalableTy())
     return false;
 
   if (!DL.typeSizeEqualsStoreSize(StoreType) ||
diff --git a/llvm/lib/CodeGen/LowLevelType.cpp b/llvm/lib/CodeGen/LowLevelType.cpp
index b47c96e50831058e8202bdfb2ff883fbb3f052b3..539f56fbc5a96205cb022a91eca452d1a9806560 100644
--- a/llvm/lib/CodeGen/LowLevelType.cpp
+++ b/llvm/lib/CodeGen/LowLevelType.cpp
@@ -31,7 +31,7 @@ LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) {
     return LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
   }
 
-  if (Ty.isSized()) {
+  if (Ty.isSized() && !Ty.isScalableTargetExtTy()) {
     // Aggregates are no different from real scalars as far as GlobalISel is
     // concerned.
     auto SizeInBits = DL.getTypeSizeInBits(&Ty);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d9cde609e5992d4d86d8ab66c93162d1659b5bc0..418e5e4c5cac1f6d3f1b3308c76696485757132b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17311,8 +17311,8 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
   //      2. The store is scalable and the load is fixed width. We could
   //         potentially support a limited number of cases here, but there has been
   //         no cost-benefit analysis to prove it's worth it.
-  bool LdStScalable = LDMemType.isScalableVector();
-  if (LdStScalable != STMemType.isScalableVector())
+  bool LdStScalable = LDMemType.isScalableVT();
+  if (LdStScalable != STMemType.isScalableVT())
     return SDValue();
 
   // If we are dealing with scalable vectors on a big endian platform the
@@ -19467,7 +19467,7 @@ bool DAGCombiner::mergeConsecutiveStores(StoreSDNode *St) {
   // store since we know <vscale x 16 x i8> is exactly twice as large as
   // <vscale x 8 x i8>). Until then, bail out for scalable vectors.
   EVT MemVT = St->getMemoryVT();
-  if (MemVT.isScalableVector())
+  if (MemVT.isScalableVT())
     return false;
   if (!MemVT.isSimple() || MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
     return false;
@@ -26335,7 +26335,7 @@ bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) {
     // BaseIndexOffset assumes that offsets are fixed-size, which
     // is not valid for scalable vectors where the offsets are
     // scaled by `vscale`, so bail out early.
-    if (St->getMemoryVT().isScalableVector())
+    if (St->getMemoryVT().isScalableVT())
       return false;
 
     // Add ST's interval.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 9a3609bc183bfeeb16c845adcd417ee8cd7183bf..0c6cb4b7926afb3e6d28a700edfbd203bbb9b865 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1910,6 +1910,15 @@ SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
   return SDValue(CondCodeNodes[Cond], 0);
 }
 
+SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
+                                      bool ConstantFold) {
+  if (EC.isScalable())
+    return getVScale(DL, VT,
+                     APInt(VT.getSizeInBits(), EC.getKnownMinValue()));
+
+  return getConstant(EC.getKnownMinValue(), DL, VT);
+}
+
 SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT) {
   APInt One(ResVT.getScalarSizeInBits(), 1);
   return getStepVector(DL, ResVT, One);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ad94d0a191e108468189c6dbca625ed5f22f2c12..da2191603c125946897f643aab42a38f1fe0e741 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -495,7 +495,6 @@ getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
     return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V,
                                 CallConv);
 
-  unsigned PartBits = PartVT.getSizeInBits();
   unsigned OrigNumParts = NumParts;
   assert(DAG.getTargetLoweringInfo().isTypeLegal(PartVT) &&
          "Copying to an illegal type!");
@@ -511,6 +510,7 @@ getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
     return;
   }
 
+  unsigned PartBits = PartVT.getSizeInBits();
   if (NumParts * PartBits > ValueVT.getSizeInBits()) {
     // If the parts cover more bits than the value has, promote the value.
     if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
@@ -7286,6 +7286,96 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     setValue(&I, SetCC);
     return;
   }
+  case Intrinsic::experimental_get_vector_length: {
+    assert(cast<ConstantInt>(I.getOperand(1))->getSExtValue() > 0 &&
+           "Expected positive VF");
+    unsigned VF = cast<ConstantInt>(I.getOperand(1))->getZExtValue();
+    bool IsScalable = cast<ConstantInt>(I.getOperand(2))->isOne();
+
+    SDValue Count = getValue(I.getOperand(0));
+    EVT CountVT = Count.getValueType();
+
+    if (!TLI.shouldExpandGetVectorLength(CountVT, VF, IsScalable)) {
+      visitTargetIntrinsic(I, Intrinsic);
+      return;
+    }
+
+    // Expand to a umin between the trip count and the maximum elements the
+    // type can hold.
+    EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+    // Extend the trip count to at least the result VT.
+    if (CountVT.bitsLT(VT)) {
+      Count = DAG.getNode(ISD::ZERO_EXTEND, sdl, VT, Count);
+      CountVT = VT;
+    }
+
+    SDValue MaxEVL = DAG.getElementCount(sdl, CountVT,
+                                         ElementCount::get(VF, IsScalable));
+
+    SDValue UMin = DAG.getNode(ISD::UMIN, sdl, CountVT, Count, MaxEVL);
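+    // Illustrative note (concrete values assumed for the example only): with
+    // %cnt = 10, %vf = 4 and %scalable = false, MaxEVL is 4, so the first
+    // two iterations of a stripmined loop would get umin(10, 4) = 4 elements
+    // and the final iteration the remaining 2.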
+    // Clip to the result type if needed.
+    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, sdl, VT, UMin);
+
+    setValue(&I, Trunc);
+    return;
+  }
+  case Intrinsic::experimental_cttz_elts: {
+    auto DL = getCurSDLoc();
+    SDValue Op = getValue(I.getOperand(0));
+    EVT OpVT = Op.getValueType();
+
+    if (!TLI.shouldExpandCttzElements(OpVT)) {
+      visitTargetIntrinsic(I, Intrinsic);
+      return;
+    }
+
+    if (OpVT.getScalarType() != MVT::i1) {
+      // Compare the input vector elements to zero & use to count trailing
+      // zeros
+      SDValue AllZero = DAG.getConstant(0, DL, OpVT);
+      OpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                              OpVT.getVectorElementCount());
+      Op = DAG.getSetCC(DL, OpVT, Op, AllZero, ISD::SETNE);
+    }
+
+    // Find the smallest "sensible" element type to use for the expansion.
+    ConstantRange CR(
+        APInt(64, OpVT.getVectorElementCount().getKnownMinValue()));
+    if (OpVT.isScalableVT())
+      CR = CR.umul_sat(getVScaleRange(I.getCaller(), 64));
+
+    // If the zero-is-poison flag is set, we can assume the upper limit
+    // of the result is VF-1.
+    if (!cast<ConstantSDNode>(getValue(I.getOperand(1)))->isZero())
+      CR = CR.subtract(APInt(64, 1));
+
+    unsigned EltWidth = I.getType()->getScalarSizeInBits();
+    EltWidth = std::min(EltWidth, (unsigned)CR.getActiveBits());
+    EltWidth = std::max(llvm::bit_ceil(EltWidth), (unsigned)8);
+
+    MVT NewEltTy = MVT::getIntegerVT(EltWidth);
+
+    // Create the new vector type & get the vector length
+    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), NewEltTy,
+                                 OpVT.getVectorElementCount());
+
+    SDValue VL =
+        DAG.getElementCount(DL, NewEltTy, OpVT.getVectorElementCount());
+
+    SDValue StepVec = DAG.getStepVector(DL, NewVT);
+    SDValue SplatVL = DAG.getSplat(NewVT, DL, VL);
+    SDValue StepVL = DAG.getNode(ISD::SUB, DL, NewVT, SplatVL, StepVec);
+    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, Op);
+    SDValue And = DAG.getNode(ISD::AND, DL, NewVT, StepVL, Ext);
+    SDValue Max = DAG.getNode(ISD::VECREDUCE_UMAX, DL, NewEltTy, And);
+    SDValue Sub = DAG.getNode(ISD::SUB, DL, NewEltTy, VL, Max);
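+    // Illustrative walkthrough (an assumed example, not a requirement): for
+    // an 8-element i1 mask whose lowest set element is at index 2, StepVL is
+    // <8,7,6,...,1>, And keeps 8 - i for each set index i, VECREDUCE_UMAX
+    // yields 8 - 2 = 6, and Sub computes 8 - 6 = 2, the number of trailing
+    // zero elements.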
+
+    EVT RetTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
+    SDValue Ret = DAG.getZExtOrTrunc(Sub, DL, RetTy);
+
+    setValue(&I, Ret);
+    return;
+  }
   case Intrinsic::vector_insert: {
     SDValue Vec = getValue(I.getOperand(0));
     SDValue SubVec = getValue(I.getOperand(1));
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index 608434800bc394fbee91cd7520053a7097d7c8c8..f65da6dd1cb0cb8e2603fcf6306e6cee05859868 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -173,6 +173,8 @@ std::string EVT::getEVTString() const {
   case MVT::Untyped: return "Untyped";
   case MVT::funcref: return "funcref";
   case MVT::externref: return "externref";
+  case MVT::aarch64svcount:
+    return "aarch64svcount";
   }
 }
 
@@ -202,6 +204,8 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
   case MVT::f128:    return Type::getFP128Ty(Context);
   case MVT::ppcf128: return Type::getPPC_FP128Ty(Context);
   case MVT::x86mmx:  return Type::getX86_MMXTy(Context);
+  case MVT::aarch64svcount:
+    return TargetExtType::get(Context, "aarch64.svcount");
   case MVT::x86amx:  return Type::getX86_AMXTy(Context);
   case MVT::i64x8:   return IntegerType::get(Context, 512);
   case MVT::externref:
@@ -575,6 +579,12 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){
   case Type::DoubleTyID:    return MVT(MVT::f64);
   case Type::X86_FP80TyID:  return MVT(MVT::f80);
   case Type::X86_MMXTyID:   return MVT(MVT::x86mmx);
+  case Type::TargetExtTyID:
+    if (cast<TargetExtType>(Ty)->getName() == "aarch64.svcount")
+      return MVT(MVT::aarch64svcount);
+    if (HandleUnknown)
+      return MVT(MVT::Other);
+    llvm_unreachable("Unknown target ext type!");
   case Type::X86_AMXTyID:   return MVT(MVT::x86amx);
   case Type::FP128TyID:     return MVT(MVT::f128);
   case Type::PPC_FP128TyID: return MVT(MVT::ppcf128);
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index 8bb8c9d29a62457744abb38f332bae6897a799ac..d94545c25b2c07f2df6c41b54e70b8784ce79318 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -80,6 +80,12 @@ bool Type::isIEEE() const {
   return APFloat::getZero(getFltSemantics()).isIEEE();
 }
 
+bool Type::isScalableTargetExtTy() const {
+  if (auto *TT = dyn_cast<TargetExtType>(this))
+    return isa<ScalableVectorType>(TT->getLayoutType());
+  return false;
+}
+
 Type *Type::getFloatingPointTy(LLVMContext &C, const fltSemantics &S) {
   Type *Ty;
   if (&S == &APFloat::IEEEhalf())
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 48e4b7e70d98f537ebb5558950cbf25c82bc476d..b21f15c83a4d0f1e06a31822dfd7429db4f08689 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5408,6 +5408,12 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
           Call);
     break;
   }
+  case Intrinsic::experimental_get_vector_length: {
+    ConstantInt *VF = cast<ConstantInt>(Call.getArgOperand(1));
+    Check(!VF->isNegative() && !VF->isZero(),
+          "get_vector_length: VF must be positive", Call);
+    break;
+  }
   case Intrinsic::masked_load: {
     Check(Call.getType()->isVectorTy(), "masked_load: must return a vector",
           Call);
diff --git a/llvm/lib/Support/LowLevelType.cpp b/llvm/lib/Support/LowLevelType.cpp
index 0282cd9bd79edb9ca7af5244fe1b31950d1855b7..e394f98fa62e3e5a150176d1d41a9ee7aa972fd7 100644
--- a/llvm/lib/Support/LowLevelType.cpp
+++ b/llvm/lib/Support/LowLevelType.cpp
@@ -21,7 +21,7 @@ LLT::LLT(MVT VT) {
     init(/*IsPointer=*/false, asVector, /*IsScalar=*/!asVector,
         VT.getVectorElementCount(), VT.getVectorElementType().getSizeInBits(),
         /*AddressSpace=*/0);
-  } else if (VT.isValid()) {
+  } else if (VT.isValid() && !VT.isScalableTargetExtVT()) {
    // Aggregates are no different from real scalars as far as GlobalISel is
    // concerned.
     init(/*IsPointer=*/false, /*IsVector=*/false, /*IsScalar=*/true,
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 6ef0c804ede366ad0a7090a48b78e5646fc15454..d338fd0e01939a930d905d5609f4e83a411732f3 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -86,6 +86,7 @@ void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
 void initializeAArch64ExpandPseudoPass(PassRegistry &);
 void initializeAArch64KCFIPass(PassRegistry &);
 void initializeAArch64LoadStoreOptPass(PassRegistry&);
+void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
 void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
 void initializeAArch64MIPeepholeOptPass(PassRegistry &);
 void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &);
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index e53f573de66c1f796be8643e66f327403117bbdf..853975c6193d988a8462618334eebc26f01923f9 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -82,9 +82,9 @@ def CC_AArch64_AAPCS : CallingConv<[
            nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
            CCPassIndirect<i64>>,
 
-  CCIfType<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+  CCIfType<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1, aarch64svcount],
            CCAssignToReg<[P0, P1, P2, P3]>>,
-  CCIfType<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+  CCIfType<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1, aarch64svcount],
            CCPassIndirect<i64>>,
 
   // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
@@ -149,7 +149,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[
            nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
            CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
 
-  CCIfType<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+  CCIfType<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1, aarch64svcount],
            CCAssignToReg<[P0, P1, P2, P3]>>
 ]>;
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dc0293a93069f7950f7ee4a9e0dfb0d5d50e6d1e..ea85893dcc708b702e8f0b5a33c9e21d698546ea 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -403,6 +403,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     }
   }
 
+  if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
+    addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
+    setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
+    setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
+
+    setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
+  }
+
   // Compute derived properties from the register classes
   computeRegisterProperties(Subtarget->getRegisterInfo());
 
@@ -1802,6 +1811,10 @@ void AArch64TargetLowering::addTypeForStreamingSVE(MVT VT) {
   setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
 }
 
+bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
+  return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
+}
+
 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
 
@@ -2590,6 +2603,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::CALL_BTI)
     MAKE_CASE(AArch64ISD::MRRS)
     MAKE_CASE(AArch64ISD::MSRR)
+    MAKE_CASE(AArch64ISD::CTTZ_ELTS)
   }
 #undef MAKE_CASE
   return nullptr;
@@ -5287,6 +5301,12 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
                        Op.getOperand(1), Op.getOperand(2));
   }
+  case Intrinsic::experimental_cttz_elts: {
+    SDValue NewCttzElts =
+        DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
+
+    return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
+  }
   }
 }
 
@@ -6394,6 +6414,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
         RegVT.getVectorElementType() == MVT::i1) {
       FuncInfo->setIsSVECC(true);
       RC = &AArch64::PPRRegClass;
+    } else if (RegVT == MVT::aarch64svcount) {
+      FuncInfo->setIsSVECC(true);
+      RC = &AArch64::PPRRegClass;
     } else if (RegVT.isScalableVector()) {
       FuncInfo->setIsSVECC(true);
       RC = &AArch64::ZPRRegClass;
@@ -6428,9 +6451,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
       case CCValAssign::Full:
         break;
       case CCValAssign::Indirect:
-        assert((VA.getValVT().isScalableVector() ||
-                Subtarget->isWindowsArm64EC()) &&
-               "Indirect arguments should be scalable on most subtargets");
+        assert(
+            (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
+            "Indirect arguments should be scalable on most subtargets");
         break;
       case CCValAssign::BCvt:
         ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
@@ -6509,9 +6532,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
     }
 
     if (VA.getLocInfo() == CCValAssign::Indirect) {
-      assert(
-          (VA.getValVT().isScalableVector() || Subtarget->isWindowsArm64EC()) &&
-          "Indirect arguments should be scalable on most subtargets");
+      assert((VA.getValVT().isScalableVT() ||
+              Subtarget->isWindowsArm64EC()) &&
+             "Indirect arguments should be scalable on most subtargets");
 
       uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
       unsigned NumParts = 1;
@@ -7368,7 +7391,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
       break;
     case CCValAssign::Indirect:
-      bool isScalable = VA.getValVT().isScalableVector();
+      bool isScalable = VA.getValVT().isScalableVT();
       assert((isScalable || Subtarget->isWindowsArm64EC()) &&
              "Indirect arguments should be scalable on most subtargets");
 
@@ -9258,10 +9281,17 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
   SDLoc DL(Op);
   EVT Ty = Op.getValueType();
 
+  if (Ty == MVT::aarch64svcount) {
+    TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
+    FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
+    SDValue Sel =
+        DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
+    return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
+  }
+
   if (Ty.isScalableVector()) {
-    SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
     MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
-    SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
+    SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
     return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
   }
 
@@ -14845,6 +14875,9 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
     return false;
 
   // FIXME: Update this method to support scalable addressing modes.
+ if (Ty->isScalableTargetExtTy()) + return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale; + if (isa<ScalableVectorType>(Ty)) { uint64_t VecElemNumBytes = DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8; @@ -20642,7 +20675,7 @@ static SDValue performSelectCombine(SDNode *N, if (N0.getOpcode() != ISD::SETCC) return SDValue(); - if (ResVT.isScalableVector()) + if (ResVT.isScalableVT()) return SDValue(); // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered @@ -22843,15 +22876,15 @@ bool AArch64TargetLowering::shouldLocalize( } bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { - if (isa<ScalableVectorType>(Inst.getType())) + if (Inst.getType()->isScalableTy()) return true; for (unsigned i = 0; i < Inst.getNumOperands(); ++i) - if (isa<ScalableVectorType>(Inst.getOperand(i)->getType())) + if (Inst.getOperand(i)->getType()->isScalableTy()) return true; if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) { - if (isa<ScalableVectorType>(AI->getAllocatedType())) + if (AI->getAllocatedType()->isScalableTy()) return true; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 92619f7e4c5a396805e44ad6972608c439f7c21c..efb9014e60a8f3bb20ea9366d67fe83acdd89163 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -339,6 +339,8 @@ enum NodeType : unsigned { PTEST_ANY, PTRUE, + CTTZ_ELTS, + BITREVERSE_MERGE_PASSTHRU, BSWAP_MERGE_PASSTHRU, REVH_MERGE_PASSTHRU, @@ -905,6 +907,8 @@ public: bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override; + bool shouldExpandCttzElements(EVT VT) const override; + /// If a change in streaming mode is required on entry to/return from a /// function call it emits and returns the corresponding SMSTART or SMSTOP node. /// \p Entry tells whether this is before/after the Call, which is necessary diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 57545466271526b9c89d0d65415407ccea643032..8dc2e1c3f613adad1804659f0b9d3b0658262d7a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -799,6 +799,9 @@ def AArch64mrs : SDNode<"AArch64ISD::MRS", SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>, [SDNPHasChain, SDNPOutGlue]>; +def AArch64CttzElts : SDNode<"AArch64ISD::CTTZ_ELTS", SDTypeProfile<1, 1, + [SDTCisInt<0>, SDTCisVec<1>]>, []>; + // Match an add node, and also treat an 'or' node as an 'add' if the or'ed operands // have no common bits. def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs), diff --git a/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp b/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6c6cd120b035449b2a235fc9c7f5158d40219bf7 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp @@ -0,0 +1,828 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass recognizes certain loop idioms and transforms them into more +// optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win.
+// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +// if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===----------------------------------------------------------------------===// +// +// TODO List: +// +// * Add support for the inverse case where we scan for a matching element. +// * Permit 64-bit induction variable types. +// * Recognize loops that increment the IV *after* comparing bytes. +// * Allow 32-bit sign-extends of the IV used by the GEP. +// +//===----------------------------------------------------------------------===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt<bool> + DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(false), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt<bool> DisableByteCmp( + "disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), + cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt<bool> VerifyLoops( + "aarch64-lit-verify", cl::Hidden, cl::init(false), + cl::desc("Verify loops generated by the AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl<BasicBlock *> &ExitBlocks); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU, + GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, + Instruction *Index, Value *Start, Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, + PHINode *IndPhi, Value *MaxLen, Instruction *Index, + Value *Start, bool IncIdx, BasicBlock *FoundBB, + BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { + initializeAArch64LoopIdiomTransformLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Transform AArch64-specific loop idioms"; + } + + void
getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, + LPPassManager &LPM) { + + if (skipLoop(L)) + return false; + + auto *DT = &getAnalysis().getDomTree(); + auto *LI = &getAnalysis().getLoopInfo(); + auto &TTI = getAnalysis().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char AArch64LoopIdiomTransformLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN( + AArch64LoopIdiomTransformLegacyPass, "aarch64-lit", + "Transform specific loop idioms into optimized vector forms", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END( + AArch64LoopIdiomTransformLegacyPass, "aarch64-lit", + "Transform specific loop idioms into optimized vector forms", false, false) + +Pass *llvm::createAArch64LoopIdiomTransformPass() { + return new AArch64LoopIdiomTransformLegacyPass(); +} + +PreservedAnalyses +AArch64LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + if (DisableAll) + return PreservedAnalyses::all(); + + const auto *DL = &L.getHeader()->getModule()->getDataLayout(); + + AArch64LoopIdiomTransform LIT(&AR.DT, &AR.LI, &AR.TTI, DL); + if (!LIT.run(&L)) + return PreservedAnalyses::all(); + + return PreservedAnalyses::none(); +} + +//===----------------------------------------------------------------------===// +// +// Implementation of AArch64LoopIdiomTransform +// +//===----------------------------------------------------------------------===// + +bool AArch64LoopIdiomTransform::run(Loop *L) { + CurLoop = L; + + if (DisableAll || L->getHeader()->getParent()->hasOptSize()) + return false; + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!L->getLoopPreheader()) + return false; + + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" + << CurLoop->getHeader()->getParent()->getName() + << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); + + return recognizeByteCompare(); +} + +bool AArch64LoopIdiomTransform::recognizeByteCompare() { + // Currently the transformation only works on scalable vector types, although + // there is no fundamental reason why it cannot be made to work for fixed + // width too. + + // We also need to know the minimum page size for the target in order to + // generate runtime memory checks to ensure the vector version won't fault. + if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() || + DisableByteCmp) + return false; + + BasicBlock *Header = CurLoop->getHeader(); + + // In AArch64LoopIdiomTransform::run we have already checked that the loop + // has a preheader so we can assume it's in a canonical form. + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2) + return false; + + PHINode *PN = dyn_cast(&Header->front()); + if (!PN || PN->getNumIncomingValues() != 2) + return false; + + auto LoopBlocks = CurLoop->getBlocks(); + // The first block in the loop should contain only 4 instructions, e.g. 
+ // + // while.cond: + // %res.phi = phi i32 [ %start, %ph ], [ %inc, %while.body ] + // %inc = add i32 %res.phi, 1 + // %cmp.not = icmp eq i32 %inc, %n + // br i1 %cmp.not, label %while.end, label %while.body + // + auto CondBBInsts = LoopBlocks[0]->instructionsWithoutDebug(); + if (std::distance(CondBBInsts.begin(), CondBBInsts.end()) > 4) + return false; + + // The second block should contain 7 instructions, e.g. + // + // while.body: + // %idx = zext i32 %inc to i64 + // %idx.a = getelementptr inbounds i8, ptr %a, i64 %idx + // %load.a = load i8, ptr %idx.a + // %idx.b = getelementptr inbounds i8, ptr %b, i64 %idx + // %load.b = load i8, ptr %idx.b + // %cmp.not.ld = icmp eq i8 %load.a, %load.b + // br i1 %cmp.not.ld, label %while.cond, label %while.end + // + auto LoopBBInsts = LoopBlocks[1]->instructionsWithoutDebug(); + if (std::distance(LoopBBInsts.begin(), LoopBBInsts.end()) > 7) + return false; + + // The incoming value to the PHI node from the loop should be an add of 1. + Value *StartIdx = nullptr; + Instruction *Index = nullptr; + if (!CurLoop->contains(PN->getIncomingBlock(0))) { + StartIdx = PN->getIncomingValue(0); + Index = dyn_cast(PN->getIncomingValue(1)); + } else { + StartIdx = PN->getIncomingValue(1); + Index = dyn_cast(PN->getIncomingValue(0)); + } + + // Limit to 32-bit types for now + if (!Index || !Index->getType()->isIntegerTy(32) || + !match(Index, m_c_Add(m_Specific(PN), m_One()))) + return false; + + // If we match the pattern, PN and Index will be replaced with the result of + // the cttz.elts intrinsic. If any other instructions are used outside of + // the loop, we cannot replace it. + for (BasicBlock *BB : LoopBlocks) + for (Instruction &I : *BB) + if (&I != PN && &I != Index) + for (User *U : I.users()) + if (!CurLoop->contains(cast(U))) + return false; + + // Match the branch instruction for the header + ICmpInst::Predicate Pred; + Value *MaxLen; + BasicBlock *EndBB, *WhileBB; + if (!match(Header->getTerminator(), + m_Br(m_ICmp(Pred, m_Specific(Index), m_Value(MaxLen)), + m_BasicBlock(EndBB), m_BasicBlock(WhileBB))) || + Pred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(WhileBB)) + return false; + + // WhileBB should contain the pattern of load & compare instructions. Match + // the pattern and find the GEP instructions used by the loads. 
+ ICmpInst::Predicate WhilePred; + BasicBlock *FoundBB; + BasicBlock *TrueBB; + Value *LoadA, *LoadB; + if (!match(WhileBB->getTerminator(), + m_Br(m_ICmp(WhilePred, m_Value(LoadA), m_Value(LoadB)), + m_BasicBlock(TrueBB), m_BasicBlock(FoundBB))) || + WhilePred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(TrueBB)) + return false; + + Value *A, *B; + if (!match(LoadA, m_Load(m_Value(A))) || !match(LoadB, m_Load(m_Value(B)))) + return false; + + LoadInst *LoadAI = cast(LoadA); + LoadInst *LoadBI = cast(LoadB); + if (!LoadAI->isSimple() || !LoadBI->isSimple()) + return false; + + GetElementPtrInst *GEPA = dyn_cast(A); + GetElementPtrInst *GEPB = dyn_cast(B); + + if (!GEPA || !GEPB) + return false; + + Value *PtrA = GEPA->getPointerOperand(); + Value *PtrB = GEPB->getPointerOperand(); + + // Check we are loading i8 values from two loop invariant pointers + if (!CurLoop->isLoopInvariant(PtrA) || !CurLoop->isLoopInvariant(PtrB) || + !GEPA->getResultElementType()->isIntegerTy(8) || + !GEPB->getResultElementType()->isIntegerTy(8) || + !LoadAI->getType()->isIntegerTy(8) || + !LoadBI->getType()->isIntegerTy(8) || PtrA == PtrB) + return false; + + // Check that the index to the GEPs is the index we found earlier + if (GEPA->getNumIndices() > 1 || GEPB->getNumIndices() > 1) + return false; + + Value *IdxA = GEPA->getOperand(GEPA->getNumIndices()); + Value *IdxB = GEPB->getOperand(GEPB->getNumIndices()); + if (IdxA != IdxB || !match(IdxA, m_ZExt(m_Specific(Index)))) + return false; + + // We only ever expect the pre-incremented index value to be used inside the + // loop. + if (!PN->hasOneUse()) + return false; + + // Ensure that when the Found and End blocks are identical the PHIs have the + // supported format. We don't currently allow cases like this: + // while.cond: + // ... + // br i1 %cmp.not, label %while.end, label %while.body + // + // while.body: + // ... + // br i1 %cmp.not2, label %while.cond, label %while.end + // + // while.end: + // %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ] + // + // Where the incoming values for %final_ptr are unique and from each of the + // loop blocks, but not actually defined in the loop. This requires extra + // work setting up the byte.compare block, i.e. by introducing a select to + // choose the correct value. + // TODO: We could add support for this in future. + if (FoundBB == EndBB) { + for (PHINode &EndPN : EndBB->phis()) { + Value *WhileCondVal = EndPN.getIncomingValueForBlock(Header); + Value *WhileBodyVal = EndPN.getIncomingValueForBlock(WhileBB); + + // The value of the index when leaving the while.cond block is always the + // same as the end value (MaxLen) so we permit either. Otherwise for any + // other value defined outside the loop we only allow values that are the + // same as the exit value for while.body. + if (WhileCondVal != Index && WhileCondVal != MaxLen && + WhileCondVal != WhileBodyVal) + return false; + } + } + + LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n" + << *(EndBB->getParent()) << "\n\n"); + + // The index is incremented before the GEP/Load pair so we need to + // add 1 to the start value. 
+ transformByteCompare(GEPA, GEPB, PN, MaxLen, Index, StartIdx, /*IncIdx=*/true, + FoundBB, EndBB); + return true; +} + +Value *AArch64LoopIdiomTransform::expandFindMismatch( + IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA, + GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) { + Value *PtrA = GEPA->getPointerOperand(); + Value *PtrB = GEPB->getPointerOperand(); + + // Get the arguments and types for the intrinsic. + BasicBlock *Preheader = CurLoop->getLoopPreheader(); + BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator()); + LLVMContext &Ctx = PHBranch->getContext(); + Type *LoadType = Type::getInt8Ty(Ctx); + Type *ResType = Builder.getInt32Ty(); + + // Split block in the original loop preheader. + BasicBlock *EndBlock = + SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end"); + + // Create the blocks that we're going to need: + // 1. A block for checking that the zero-extended length exceeds 0. + // 2. A block to check that the start and end addresses of a given array + // lie on the same page. + // 3. The SVE loop preheader. + // 4. The first SVE loop block. + // 5. The SVE loop increment block. + // 6. A block we can jump to from the SVE loop when a mismatch is found. + // 7. The first block of the scalar loop itself, containing PHIs, loads + // and cmps. + // 8. A scalar loop increment block to increment the PHIs and go back + // around the loop. + + BasicBlock *MinItCheckBlock = BasicBlock::Create( + Ctx, "mismatch_min_it_check", EndBlock->getParent(), EndBlock); + + // Update the terminator added by SplitBlock to branch to the first block. + Preheader->getTerminator()->setSuccessor(0, MinItCheckBlock); + + BasicBlock *MemCheckBlock = BasicBlock::Create( + Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock); + + BasicBlock *SVELoopPreheaderBlock = BasicBlock::Create( + Ctx, "mismatch_sve_loop_preheader", EndBlock->getParent(), EndBlock); + + BasicBlock *SVELoopStartBlock = BasicBlock::Create( + Ctx, "mismatch_sve_loop", EndBlock->getParent(), EndBlock); + + BasicBlock *SVELoopIncBlock = BasicBlock::Create( + Ctx, "mismatch_sve_loop_inc", EndBlock->getParent(), EndBlock); + + BasicBlock *SVELoopMismatchBlock = BasicBlock::Create( + Ctx, "mismatch_sve_loop_found", EndBlock->getParent(), EndBlock); + + BasicBlock *LoopPreHeaderBlock = BasicBlock::Create( + Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock); + + BasicBlock *LoopStartBlock = + BasicBlock::Create(Ctx, "mismatch_loop", EndBlock->getParent(), EndBlock); + + BasicBlock *LoopIncBlock = BasicBlock::Create( + Ctx, "mismatch_loop_inc", EndBlock->getParent(), EndBlock); + + DTU.applyUpdates({{DominatorTree::Insert, Preheader, MinItCheckBlock}, + {DominatorTree::Delete, Preheader, EndBlock}}); + + // Update LoopInfo with the new SVE & scalar loops. + auto SVELoop = LI->AllocateLoop(); + auto ScalarLoop = LI->AllocateLoop(); + + if (CurLoop->getParentLoop()) { + CurLoop->getParentLoop()->addBasicBlockToLoop(MinItCheckBlock, *LI); + CurLoop->getParentLoop()->addBasicBlockToLoop(MemCheckBlock, *LI); + CurLoop->getParentLoop()->addBasicBlockToLoop(SVELoopPreheaderBlock, *LI); + CurLoop->getParentLoop()->addChildLoop(SVELoop); + CurLoop->getParentLoop()->addBasicBlockToLoop(SVELoopMismatchBlock, *LI); + CurLoop->getParentLoop()->addBasicBlockToLoop(LoopPreHeaderBlock, *LI); + CurLoop->getParentLoop()->addChildLoop(ScalarLoop); + } else { + LI->addTopLevelLoop(SVELoop); + LI->addTopLevelLoop(ScalarLoop); + } + + // Add the new basic blocks to their associated loops.
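+ // Registering the new vector and scalar loops with LoopInfo up front keeps + // the analysis consistent for any enclosing loop and for the optional + // -aarch64-lit-verify checks performed once expansion is complete.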
+ SVELoop->addBasicBlockToLoop(SVELoopStartBlock, *LI); + SVELoop->addBasicBlockToLoop(SVELoopIncBlock, *LI); + + ScalarLoop->addBasicBlockToLoop(LoopStartBlock, *LI); + ScalarLoop->addBasicBlockToLoop(LoopIncBlock, *LI); + + // Set up some types and constants that we intend to reuse. + Type *I64Type = Builder.getInt64Ty(); + + // Check the zero-extended iteration count > 0 + Builder.SetInsertPoint(MinItCheckBlock); + Value *ExtStart = Builder.CreateZExt(Start, I64Type); + Value *ExtEnd = Builder.CreateZExt(MaxLen, I64Type); + // This check doesn't really cost us very much. + + Value *LimitCheck = Builder.CreateICmpULE(Start, MaxLen); + BranchInst *MinItCheckBr = + BranchInst::Create(MemCheckBlock, LoopPreHeaderBlock, LimitCheck); + MinItCheckBr->setMetadata( + LLVMContext::MD_prof, + MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1)); + Builder.Insert(MinItCheckBr); + + DTU.applyUpdates( + {{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock}, + {DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}}); + + // For each of the arrays, check the start/end addresses are on the same + // page. + Builder.SetInsertPoint(MemCheckBlock); + + // The early exit in the original loop means that when performing vector + // loads we are potentially reading ahead of the early exit. So we could + // fault if crossing a page boundary. Therefore, we create runtime memory + // checks based on the minimum page size as follows: + // 1. Calculate the addresses of the first memory accesses in the loop, + // i.e. LhsStart and RhsStart. + // 2. Get the last accessed addresses in the loop, i.e. LhsEnd and RhsEnd. + // 3. Determine which pages correspond to all the memory accesses, i.e + // LhsStartPage, LhsEndPage, RhsStartPage, RhsEndPage. + // 4. If LhsStartPage == LhsEndPage and RhsStartPage == RhsEndPage, then + // we know we won't cross any page boundaries in the loop so we can + // enter the vector loop! Otherwise we fall back on the scalar loop. 
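+ // For example, with the 4096-byte minimum page size reported by the AArch64 + // TTI hook, two addresses lie on the same page if and only if + // (AddrA >> 12) == (AddrB >> 12), since Log2_64(4096) == 12.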
+ Value *LhsStartGEP = Builder.CreateGEP(LoadType, PtrA, ExtStart); + Value *RhsStartGEP = Builder.CreateGEP(LoadType, PtrB, ExtStart); + Value *RhsStart = Builder.CreatePtrToInt(RhsStartGEP, I64Type); + Value *LhsStart = Builder.CreatePtrToInt(LhsStartGEP, I64Type); + Value *LhsEndGEP = Builder.CreateGEP(LoadType, PtrA, ExtEnd); + Value *RhsEndGEP = Builder.CreateGEP(LoadType, PtrB, ExtEnd); + Value *LhsEnd = Builder.CreatePtrToInt(LhsEndGEP, I64Type); + Value *RhsEnd = Builder.CreatePtrToInt(RhsEndGEP, I64Type); + + const uint64_t MinPageSize = TTI->getMinPageSize().value(); + const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize); + Value *LhsStartPage = Builder.CreateLShr(LhsStart, AddrShiftAmt); + Value *LhsEndPage = Builder.CreateLShr(LhsEnd, AddrShiftAmt); + Value *RhsStartPage = Builder.CreateLShr(RhsStart, AddrShiftAmt); + Value *RhsEndPage = Builder.CreateLShr(RhsEnd, AddrShiftAmt); + Value *LhsPageCmp = Builder.CreateICmpNE(LhsStartPage, LhsEndPage); + Value *RhsPageCmp = Builder.CreateICmpNE(RhsStartPage, RhsEndPage); + + Value *CombinedPageCmp = Builder.CreateOr(LhsPageCmp, RhsPageCmp); + BranchInst *CombinedPageCmpCmpBr = BranchInst::Create( + LoopPreHeaderBlock, SVELoopPreheaderBlock, CombinedPageCmp); + CombinedPageCmpCmpBr->setMetadata( + LLVMContext::MD_prof, MDBuilder(CombinedPageCmpCmpBr->getContext()) + .createBranchWeights(10, 90)); + Builder.Insert(CombinedPageCmpCmpBr); + + DTU.applyUpdates( + {{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock}, + {DominatorTree::Insert, MemCheckBlock, SVELoopPreheaderBlock}}); + + // Set up the SVE loop preheader, i.e. calculate initial loop predicate, + // zero-extend MaxLen to 64-bits, determine the number of vector elements + // processed in each iteration, etc. + Builder.SetInsertPoint(SVELoopPreheaderBlock); + + // At this point we know two things must be true: + // 1. Start <= End + // 2. ExtMaxLen <= MinPageSize due to the page checks. + // Therefore, we know that we can use a 64-bit induction variable that + // starts from 0 -> ExtMaxLen and it will not overflow. + ScalableVectorType *PredVTy = + ScalableVectorType::get(Builder.getInt1Ty(), 16); + + Value *InitialPred = Builder.CreateIntrinsic( + Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd}); + + Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {}); + VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "", + /*HasNUW=*/true, /*HasNSW=*/true); + + Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(), + Builder.getInt1(false)); + + BranchInst *JumpToSVELoop = BranchInst::Create(SVELoopStartBlock); + Builder.Insert(JumpToSVELoop); + + DTU.applyUpdates( + {{DominatorTree::Insert, SVELoopPreheaderBlock, SVELoopStartBlock}}); + + // Set up the first SVE loop block by creating the PHIs, doing the vector + // loads and comparing the vectors. 
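+ // In IR terms the block built below looks roughly like this (value names + // are illustrative only): + // + // mismatch_sve_loop: + // %pred = phi <vscale x 16 x i1> [ %init.pred, %preheader ], [ %next.pred, %inc ] + // %index = phi i64 [ %ext.start, %preheader ], [ %index.next, %inc ] + // %lhs = masked.load(%a + %index, %pred) + // %rhs = masked.load(%b + %index, %pred) + // %match = select %pred, (icmp ne %lhs, %rhs), zeroinitializer + // br i1 (vector.reduce.or %match), label %mismatch_sve_loop_found, + // label %mismatch_sve_loop_inc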
+ Builder.SetInsertPoint(SVELoopStartBlock); + PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_sve_loop_pred"); + LoopPred->addIncoming(InitialPred, SVELoopPreheaderBlock); + PHINode *SVEIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_sve_index"); + SVEIndexPhi->addIncoming(ExtStart, SVELoopPreheaderBlock); + Type *SVELoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16); + Value *Passthru = ConstantInt::getNullValue(SVELoadType); + + Value *SVELhsGep = Builder.CreateGEP(LoadType, PtrA, SVEIndexPhi); + if (GEPA->isInBounds()) + cast(SVELhsGep)->setIsInBounds(true); + Value *SVELhsLoad = Builder.CreateMaskedLoad(SVELoadType, SVELhsGep, Align(1), + LoopPred, Passthru); + + Value *SVERhsGep = Builder.CreateGEP(LoadType, PtrB, SVEIndexPhi); + if (GEPB->isInBounds()) + cast(SVERhsGep)->setIsInBounds(true); + Value *SVERhsLoad = Builder.CreateMaskedLoad(SVELoadType, SVERhsGep, Align(1), + LoopPred, Passthru); + + Value *SVEMatchCmp = Builder.CreateICmpNE(SVELhsLoad, SVERhsLoad); + SVEMatchCmp = Builder.CreateSelect(LoopPred, SVEMatchCmp, PFalse); + Value *SVEMatchHasActiveLanes = Builder.CreateOrReduce(SVEMatchCmp); + BranchInst *SVEEarlyExit = BranchInst::Create( + SVELoopMismatchBlock, SVELoopIncBlock, SVEMatchHasActiveLanes); + Builder.Insert(SVEEarlyExit); + + DTU.applyUpdates( + {{DominatorTree::Insert, SVELoopStartBlock, SVELoopMismatchBlock}, + {DominatorTree::Insert, SVELoopStartBlock, SVELoopIncBlock}}); + + // Increment the index counter and calculate the predicate for the next + // iteration of the loop. We branch back to the start of the loop if there + // is at least one active lane. + Builder.SetInsertPoint(SVELoopIncBlock); + Value *NewSVEIndexPhi = Builder.CreateAdd(SVEIndexPhi, VecLen, "", + /*HasNUW=*/true, /*HasNSW=*/true); + SVEIndexPhi->addIncoming(NewSVEIndexPhi, SVELoopIncBlock); + Value *NewPred = + Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, + {PredVTy, I64Type}, {NewSVEIndexPhi, ExtEnd}); + LoopPred->addIncoming(NewPred, SVELoopIncBlock); + + Value *PredHasActiveLanes = + Builder.CreateExtractElement(NewPred, uint64_t(0)); + BranchInst *SVELoopBranchBack = + BranchInst::Create(SVELoopStartBlock, EndBlock, PredHasActiveLanes); + Builder.Insert(SVELoopBranchBack); + + DTU.applyUpdates({{DominatorTree::Insert, SVELoopIncBlock, SVELoopStartBlock}, + {DominatorTree::Insert, SVELoopIncBlock, EndBlock}}); + + // If we found a mismatch then we need to calculate which lane in the vector + // had a mismatch and add that on to the current loop index. 
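+ // Only lanes that were both active in the loop predicate (LastLoopPred) and + // compared not-equal (FoundPred) are genuine mismatches, so the lane of the + // first mismatch is the number of trailing zero elements in their + // conjunction, which is exactly what llvm.experimental.cttz.elts computes.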
+ Builder.SetInsertPoint(SVELoopMismatchBlock); + PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_sve_found_pred"); + FoundPred->addIncoming(SVEMatchCmp, SVELoopStartBlock); + PHINode *LastLoopPred = + Builder.CreatePHI(PredVTy, 1, "mismatch_sve_last_loop_pred"); + LastLoopPred->addIncoming(LoopPred, SVELoopStartBlock); + PHINode *SVEFoundIndex = + Builder.CreatePHI(I64Type, 1, "mismatch_sve_found_index"); + SVEFoundIndex->addIncoming(SVEIndexPhi, SVELoopStartBlock); + + Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred); + Value *Ctz = Builder.CreateIntrinsic( + Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()}, + {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)}); + Ctz = Builder.CreateZExt(Ctz, I64Type); + Value *SVELoopRes64 = Builder.CreateAdd(SVEFoundIndex, Ctz, "", + /*HasNUW=*/true, /*HasNSW=*/true); + Value *SVELoopRes = Builder.CreateTrunc(SVELoopRes64, ResType); + + Builder.Insert(BranchInst::Create(EndBlock)); + + DTU.applyUpdates({{DominatorTree::Insert, SVELoopMismatchBlock, EndBlock}}); + + // Generate code for scalar loop. + Builder.SetInsertPoint(LoopPreHeaderBlock); + Builder.Insert(BranchInst::Create(LoopStartBlock)); + + DTU.applyUpdates( + {{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}}); + + Builder.SetInsertPoint(LoopStartBlock); + PHINode *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index"); + IndexPhi->addIncoming(Start, LoopPreHeaderBlock); + + // Load bytes from each array and compare them. + Value *GepOffset = Builder.CreateZExt(IndexPhi, I64Type); + + Value *LhsGep = Builder.CreateGEP(LoadType, PtrA, GepOffset); + if (GEPA->isInBounds()) + cast<GetElementPtrInst>(LhsGep)->setIsInBounds(true); + Value *LhsLoad = Builder.CreateLoad(LoadType, LhsGep); + + Value *RhsGep = Builder.CreateGEP(LoadType, PtrB, GepOffset); + if (GEPB->isInBounds()) + cast<GetElementPtrInst>(RhsGep)->setIsInBounds(true); + Value *RhsLoad = Builder.CreateLoad(LoadType, RhsGep); + + Value *MatchCmp = Builder.CreateICmpEQ(LhsLoad, RhsLoad); + // If we have a mismatch then exit the loop. + BranchInst *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp); + Builder.Insert(MatchCmpBr); + + DTU.applyUpdates({{DominatorTree::Insert, LoopStartBlock, LoopIncBlock}, + {DominatorTree::Insert, LoopStartBlock, EndBlock}}); + + // Have we reached the maximum permitted length for the loop? + Builder.SetInsertPoint(LoopIncBlock); + Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1), "", + /*HasNUW=*/Index->hasNoUnsignedWrap(), + /*HasNSW=*/Index->hasNoSignedWrap()); + IndexPhi->addIncoming(PhiInc, LoopIncBlock); + Value *IVCmp = Builder.CreateICmpEQ(PhiInc, MaxLen); + BranchInst *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp); + Builder.Insert(IVCmpBr); + + DTU.applyUpdates({{DominatorTree::Insert, LoopIncBlock, EndBlock}, + {DominatorTree::Insert, LoopIncBlock, LoopStartBlock}}); + + // In the end block we need to insert a PHI node to deal with four cases: + // 1. We didn't find a mismatch in the scalar loop, so we return MaxLen. + // 2. We exited the scalar loop early due to a mismatch and need to return + // the index that we found. + // 3. We didn't find a mismatch in the SVE loop, so we return MaxLen. + // 4. We exited the SVE loop early due to a mismatch and need to return + // the index that we found.
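+ // In the final IR this becomes something like (value names illustrative): + // + // %mismatch_result = phi i32 [ %maxlen, %mismatch_loop_inc ], + // [ %index, %mismatch_loop ], + // [ %maxlen, %mismatch_sve_loop_inc ], + // [ %sve_res, %mismatch_sve_loop_found ]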
+ Builder.SetInsertPoint(EndBlock, EndBlock->getFirstInsertionPt()); + PHINode *ResPhi = Builder.CreatePHI(ResType, 4, "mismatch_result"); + ResPhi->addIncoming(MaxLen, LoopIncBlock); + ResPhi->addIncoming(IndexPhi, LoopStartBlock); + ResPhi->addIncoming(MaxLen, SVELoopIncBlock); + ResPhi->addIncoming(SVELoopRes, SVELoopMismatchBlock); + + Value *FinalRes = Builder.CreateTrunc(ResPhi, ResType); + + if (VerifyLoops) { + ScalarLoop->verifyLoop(); + SVELoop->verifyLoop(); + if (!SVELoop->isRecursivelyLCSSAForm(*DT, *LI)) + report_fatal_error("Loops must remain in LCSSA form!"); + if (!ScalarLoop->isRecursivelyLCSSAForm(*DT, *LI)) + report_fatal_error("Loops must remain in LCSSA form!"); + } + + return FinalRes; +} + +void AArch64LoopIdiomTransform::transformByteCompare( + GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, PHINode *IndPhi, + Value *MaxLen, Instruction *Index, Value *Start, bool IncIdx, + BasicBlock *FoundBB, BasicBlock *EndBB) { + + // Insert the byte compare code at the end of the preheader block. + BasicBlock *Preheader = CurLoop->getLoopPreheader(); + BasicBlock *Header = CurLoop->getHeader(); + BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator()); + IRBuilder<> Builder(PHBranch); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc()); + + // Increment the start value if the index was incremented before the loads + // in the loop. + if (IncIdx) + Start = Builder.CreateAdd(Start, ConstantInt::get(Start->getType(), 1)); + + Value *ByteCmpRes = + expandFindMismatch(Builder, DTU, GEPA, GEPB, Index, Start, MaxLen); + + // Replace uses of the index & induction Phi with the intrinsic (we already + // checked that the first instruction of Header is the Phi above). + assert(IndPhi->hasOneUse() && "Index phi node has more than one use!"); + Index->replaceAllUsesWith(ByteCmpRes); + + assert(PHBranch->isUnconditional() && + "Expected preheader to terminate with an unconditional branch."); + + // If no mismatch was found, we can jump to the end block. Create a + // new basic block for the compare instruction. + auto *CmpBB = BasicBlock::Create(Preheader->getContext(), "byte.compare", + Preheader->getParent()); + CmpBB->moveBefore(EndBB); + + // Replace the branch in the preheader with an always-true conditional branch. + // This ensures there is still a reference to the original loop. + Builder.CreateCondBr(Builder.getTrue(), CmpBB, Header); + PHBranch->eraseFromParent(); + + BasicBlock *MismatchEnd = cast<Instruction>(ByteCmpRes)->getParent(); + DTU.applyUpdates({{DominatorTree::Insert, MismatchEnd, CmpBB}}); + + // Create the branch to either the end or found block depending on the value + // returned by the intrinsic. + Builder.SetInsertPoint(CmpBB); + if (FoundBB != EndBB) { + Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen); + Builder.CreateCondBr(FoundCmp, EndBB, FoundBB); + DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}, + {DominatorTree::Insert, CmpBB, EndBB}}); + + } else { + Builder.CreateBr(FoundBB); + DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}}); + } + + auto fixSuccessorPhis = [&](BasicBlock *SuccBB) { + for (PHINode &PN : SuccBB->phis()) { + // At this point we've already replaced all uses of the result from the + // loop with ByteCmp. Look through the incoming values to find ByteCmp, + // meaning this is a Phi collecting the results of the byte compare.
bool ResPhi = false; + for (Value *Op : PN.incoming_values()) + if (Op == ByteCmpRes) { + ResPhi = true; + break; + } + + // Any PHI that depended upon the result of the byte compare needs a new + // incoming value from CmpBB. This is because the original loop will get + // deleted. + if (ResPhi) + PN.addIncoming(ByteCmpRes, CmpBB); + else { + // There should be no other outside uses of other values in the + // original loop. Any incoming values should either: + // 1. Be for blocks outside the loop, which aren't interesting, or + // 2. Be from blocks in the loop with values defined outside the + // loop, in which case we should add a similar incoming value + // from CmpBB. + for (BasicBlock *BB : PN.blocks()) + if (CurLoop->contains(BB)) { + PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB); + break; + } + } + } + }; + + // Ensure all Phis in the successors of CmpBB have an incoming value from it. + fixSuccessorPhis(EndBB); + if (EndBB != FoundBB) + fixSuccessorPhis(FoundBB); + + // The new CmpBB block isn't part of the loop, but will need to be added to + // the outer loop if there is one. + if (!CurLoop->isOutermost()) + CurLoop->getParentLoop()->addBasicBlockToLoop(CmpBB, *LI); + + if (VerifyLoops && CurLoop->getParentLoop()) { + CurLoop->getParentLoop()->verifyLoop(); + if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI)) + report_fatal_error("Loops must remain in LCSSA form!"); + } +} diff --git a/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h b/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h new file mode 100644 index 0000000000000000000000000000000000000000..cc68425bb68b581483f66848a0c102ad6573df92 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h @@ -0,0 +1,25 @@ +//===- AArch64LoopIdiomTransform.h --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" + +namespace llvm { + +struct AArch64LoopIdiomTransformPass + : PassInfoMixin { + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 358f7f6c1656315c0ad514fdbf3fc907f64a3761..9f26bd732609e0cdc58191c2745f73cb18c38f28 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -891,7 +891,7 @@ class ZPRRegOp : RegisterClass< "AArch64", - [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1 ], 16, + [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1, aarch64svcount ], 16, (sequence "P%u", firstreg, lastreg)> { let Size = 16; } diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 90496d597e1d7e0ee5c14442ac3ec5f84a168b1d..4d6e575477e89c5c69c79a8116d1f8fa4beb71bf 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1950,6 +1950,11 @@ let Predicates = [HasSVEorSME] in { defm CNTW_XPiI : sve_int_count<0b100, "cntw", int_aarch64_sve_cntw>; defm CNTD_XPiI : sve_int_count<0b110, "cntd", int_aarch64_sve_cntd>; defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp", int_aarch64_sve_cntp>; + + def : Pat<(i64 (AArch64CttzElts nxv16i1:$Op1)), + (i64 (!cast(CNTP_XPP_B) + (nxv16i1 (!cast(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op1)), + (nxv16i1 (!cast(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op1))))>; } defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb", add, int_aarch64_sve_cntb>; @@ -2035,6 +2040,17 @@ let Predicates = [HasSVEorSME] in { defm INCP_ZP : sve_int_count_v<0b10000, "incp">; defm DECP_ZP : sve_int_count_v<0b10100, "decp">; + def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv16i1:$Op2)))), + (i64 (!cast(INCP_XP_B) + (nxv16i1 (!cast(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op2)), + GPR64:$Op1))>; + + def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv16i1:$Op2))))), + (i32 (EXTRACT_SUBREG (i64 (!cast(INCP_XP_B) + (nxv16i1 (!cast(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op2)), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Op1, sub_32))), + sub_32))>; + defm INDEX_RR : sve_int_index_rr<"index", AArch64mul_p_oneuse>; defm INDEX_IR : sve_int_index_ir<"index", AArch64mul_p, AArch64mul_p_oneuse>; defm INDEX_RI : sve_int_index_ri<"index">; @@ -2523,6 +2539,9 @@ let Predicates = [HasSVEorSME] in { def : Pat<(nxv8f16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8f16 ZPR:$src)>; def : Pat<(nxv4f32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4f32 ZPR:$src)>; def : Pat<(nxv2f64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2f64 ZPR:$src)>; + + def : Pat<(nxv16i1 (bitconvert (aarch64svcount PPR:$src))), (nxv16i1 PPR:$src)>; + def : Pat<(aarch64svcount (bitconvert (nxv16i1 PPR:$src))), (aarch64svcount PPR:$src)>; } // These allow casting from/to unpacked predicate types. 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 98b990fdce8c77ac599ecc6267fd4bafb1d37c07..601cc4b19fa13247d571e9ef0a42123a8fac93e9 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -11,6 +11,7 @@ #include "AArch64TargetMachine.h" #include "AArch64.h" +#include "AArch64LoopIdiomTransform.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64MachineScheduler.h" #include "AArch64MacroFusion.h" @@ -43,6 +44,7 @@ #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Pass.h" +#include "llvm/Passes/PassBuilder.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -217,6 +219,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { initializeAArch64ExpandPseudoPass(*PR); initializeAArch64KCFIPass(*PR); initializeAArch64LoadStoreOptPass(*PR); + initializeAArch64LoopIdiomTransformLegacyPassPass(*PR); initializeAArch64MIPeepholeOptPass(*PR); initializeAArch64SIMDInstrOptPass(*PR); initializeAArch64O0PreLegalizerCombinerPass(*PR); @@ -525,6 +528,13 @@ public: } // end anonymous namespace +void AArch64TargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { + PB.registerLateLoopOptimizationsEPCallback( + [=](LoopPassManager &LPM, OptimizationLevel Level) { + LPM.addPass(AArch64LoopIdiomTransformPass()); + }); +} + TargetTransformInfo AArch64TargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(AArch64TTIImpl(this, F)); diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h index e9b5f4820b79c1a9ce2f5652c23e0fb1d075dcaa..16671aa2ab545346476b7259537b5354c3231831 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -14,6 +14,7 @@ #define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H #include "AArch64InstrInfo.h" +#include "AArch64LoopIdiomTransform.h" #include "AArch64Subtarget.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" @@ -43,6 +44,8 @@ public: // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + void registerPassBuilderCallbacks(PassBuilder &PB) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; TargetLoweringObjectFile* getObjFileLowering() const override { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 0f4a4d7dc80460651365486d2ca8b1bd0651ef81..9d92dce4232339a11d8fcbebf5e98702d41870c1 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -394,6 +394,8 @@ public: /// @} bool enableSelectOptimize() { return ST->enableSelectOptimize(); } + + std::optional getMinPageSize() const { return 4096; } }; } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index f42ab1f4211aa8bd29c8cfa7c693a432443570b6..4035f439a69a41e680e2f8e31cbaa31614548eba 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -64,6 +64,7 @@ add_llvm_target(AArch64CodeGen AArch64InstrInfo.cpp AArch64KCFI.cpp AArch64LoadStoreOptimizer.cpp + AArch64LoopIdiomTransform.cpp AArch64LowerHomogeneousPrologEpilog.cpp AArch64MachineFunctionInfo.cpp 
AArch64MachineScheduler.cpp diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 9fcae88b794fdc37cd1f80c27e3907c83b19002f..873c08ee2bb505b2817d1eda62937a1bc9aa6082 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -526,10 +526,9 @@ static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder, bool AArch64CallLowering::fallBackToDAGISel(const MachineFunction &MF) const { auto &F = MF.getFunction(); - if (isa(F.getReturnType())) - return true; - if (llvm::any_of(F.args(), [](const Argument &A) { - return isa(A.getType()); + if (F.getReturnType()->isScalableTy() || + llvm::any_of(F.args(), [](const Argument &A) { + return A.getType()->isScalableTy(); })) return true; const auto &ST = MF.getSubtarget(); diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 4c89f947d7fc0c80bfaeccdd599c4e9748b195a1..2064955a65efd0f1042e6925975bf060cefcb5f8 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -976,6 +976,7 @@ static bool isHighCostExpansion(const SCEV *S, switch (S->getSCEVType()) { case scUnknown: case scConstant: + case scVScale: return false; case scTruncate: return isHighCostExpansion(cast(S)->getOperand(), @@ -2812,9 +2813,10 @@ static bool isCompatibleIVType(Value *LVal, Value *RVal) { /// SCEVUnknown, we simply return the rightmost SCEV operand. static const SCEV *getExprBase(const SCEV *S) { switch (S->getSCEVType()) { - default: // uncluding scUnknown. + default: // including scUnknown. return S; case scConstant: + case scVScale: return nullptr; case scTruncate: return getExprBase(cast(S)->getOperand()); diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 24f1966edd37aa31bcc77c1120b4940cd6614142..902eee26a45671cf58db7fb085d7b0270f787bb2 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -680,6 +680,7 @@ const Loop *SCEVExpander::getRelevantLoop(const SCEV *S) { switch (S->getSCEVType()) { case scConstant: + case scVScale: return nullptr; // A constant has no relevant loops. case scTruncate: case scZeroExtend: @@ -1744,6 +1745,10 @@ Value *SCEVExpander::visitSequentialUMinExpr(const SCEVSequentialUMinExpr *S) { return expandMinMaxExpr(S, Intrinsic::umin, "umin", /*IsSequential*/true); } +Value *SCEVExpander::visitVScale(const SCEVVScale *S) { + return Builder.CreateVScale(ConstantInt::get(S->getType(), 1)); +} + Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, Instruction *IP) { setInsertPoint(IP); @@ -2124,6 +2129,7 @@ template static InstructionCost costAndCollectOperands( llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); case scUnknown: case scConstant: + case scVScale: return 0; case scPtrToInt: Cost = CastCost(Instruction::PtrToInt); @@ -2260,6 +2266,7 @@ bool SCEVExpander::isHighCostExpansionHelper( case scCouldNotCompute: llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); case scUnknown: + case scVScale: // Assume to be zero-cost. 
return false; case scConstant: { diff --git a/llvm/test/Analysis/ScalarEvolution/scalable-vector.ll b/llvm/test/Analysis/ScalarEvolution/scalable-vector.ll index 798a36023235d7750844e670a0b42f9e9a8ceecd..b5bd724cb269c13b655a30f3040ce5d646268c32 100644 --- a/llvm/test/Analysis/ScalarEvolution/scalable-vector.ll +++ b/llvm/test/Analysis/ScalarEvolution/scalable-vector.ll @@ -5,9 +5,9 @@ define void @a(ptr %p) { ; CHECK-LABEL: 'a' ; CHECK-NEXT: Classifying expressions for: @a ; CHECK-NEXT: %1 = getelementptr , ptr null, i32 3 -; CHECK-NEXT: --> ((3 * sizeof()) + null) U: [0,-15) S: [-9223372036854775808,9223372036854775793) +; CHECK-NEXT: --> ((48 * vscale) + null) U: [0,-15) S: [-9223372036854775808,9223372036854775793) ; CHECK-NEXT: %2 = getelementptr , ptr %p, i32 1 -; CHECK-NEXT: --> (sizeof() + %p) U: full-set S: full-set +; CHECK-NEXT: --> ((8 * vscale) + %p) U: full-set S: full-set ; CHECK-NEXT: Determining loop execution counts for: @a ; getelementptr , ptr null, i32 3 diff --git a/llvm/test/CodeGen/AArch64/get_vector_length.ll b/llvm/test/CodeGen/AArch64/get_vector_length.ll new file mode 100644 index 0000000000000000000000000000000000000000..3a83edb339b8481b5c26be3caa342d8fa095fc71 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/get_vector_length.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s + +declare i32 @llvm.experimental.get.vector.length.i16(i16, i32, i1) +declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i1) +declare i32 @llvm.experimental.get.vector.length.i64(i64, i32, i1) + +define i32 @vector_length_i16(i16 zeroext %tc) { +; CHECK-LABEL: vector_length_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: csel w0, w0, w8, lo +; CHECK-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 2, i1 true) + ret i32 %a +} + +define i32 @vector_length_i32(i32 zeroext %tc) { +; CHECK-LABEL: vector_length_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: csel w0, w0, w8, lo +; CHECK-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 2, i1 true) + ret i32 %a +} + +define i32 @vector_length_i64(i64 %tc) { +; CHECK-LABEL: vector_length_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: cmp x0, x8 +; CHECK-NEXT: csel x0, x0, x8, lo +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.i64(i64 %tc, i32 2, i1 true) + ret i32 %a +} diff --git a/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll new file mode 100644 index 0000000000000000000000000000000000000000..b002cbc6cd4d55e0df39d38e9d4d118b53259492 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll @@ -0,0 +1,130 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64 + +declare i32 @llvm.experimental.get.vector.length.i16(i16, i32, i1) +declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i1) +declare i32 @llvm.experimental.get.vector.length.i64(i64, i32, i1) + +define i32 @vector_length_i16(i16 
zeroext %tc) { +; CHECK-LABEL: vector_length_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: bltu a0, a1, .LBB0_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 2, i1 true) + ret i32 %a +} + +define i32 @vector_length_i32(i32 zeroext %tc) { +; RV32-LABEL: vector_length_i32: +; RV32: # %bb.0: +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: bltu a0, a1, .LBB1_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: .LBB1_2: +; RV32-NEXT: ret +; +; RV64-LABEL: vector_length_i32: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: srli a1, a1, 2 +; RV64-NEXT: bltu a0, a1, .LBB1_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: .LBB1_2: +; RV64-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 2, i1 true) + ret i32 %a +} + +define i32 @vector_length_XLen(iXLen zeroext %tc) { +; RV32-LABEL: vector_length_XLen: +; RV32: # %bb.0: +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: bltu a0, a1, .LBB2_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: .LBB2_2: +; RV32-NEXT: ret +; +; RV64-LABEL: vector_length_XLen: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: srli a1, a1, 2 +; RV64-NEXT: bltu a0, a1, .LBB2_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: .LBB2_2: +; RV64-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 2, i1 true) + ret i32 %a +} + +define i32 @vector_length_i16_fixed(i16 zeroext %tc) { +; CHECK-LABEL: vector_length_i16_fixed: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 2 +; CHECK-NEXT: bltu a0, a1, .LBB3_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 2, i1 false) + ret i32 %a +} + +define i32 @vector_length_i32_fixed(i32 zeroext %tc) { +; RV32-LABEL: vector_length_i32_fixed: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 2 +; RV32-NEXT: bltu a0, a1, .LBB4_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a0, 2 +; RV32-NEXT: .LBB4_2: +; RV32-NEXT: ret +; +; RV64-LABEL: vector_length_i32_fixed: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: li a1, 2 +; RV64-NEXT: bltu a0, a1, .LBB4_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a0, 2 +; RV64-NEXT: .LBB4_2: +; RV64-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 2, i1 false) + ret i32 %a +} + +define i32 @vector_length_XLen_fixed(iXLen zeroext %tc) { +; RV32-LABEL: vector_length_XLen_fixed: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 2 +; RV32-NEXT: bltu a0, a1, .LBB5_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a0, 2 +; RV32-NEXT: .LBB5_2: +; RV32-NEXT: ret +; +; RV64-LABEL: vector_length_XLen_fixed: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: li a1, 2 +; RV64-NEXT: bltu a0, a1, .LBB5_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a0, 2 +; RV64-NEXT: .LBB5_2: +; RV64-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 2, i1 false) + ret i32 %a +} diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll b/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll new file mode 100644 index 0000000000000000000000000000000000000000..40ac60156aa54d43d0e67980865f933ba4a4dc14 --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll @@ 
-0,0 +1,1940 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -aarch64-lit -aarch64-lit-verify -verify-dom-info -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s
+; RUN: opt -aarch64-lit -simplifycfg -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=LOOP-DEL
+; RUN: opt -aarch64-lit -mtriple aarch64-unknown-linux-gnu -S < %s | FileCheck %s --check-prefix=NO-TRANSFORM
+
+define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %extra, i32 %n) {
+; CHECK-LABEL: @compare_bytes_simple(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN:%.*]], 1
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N:%.*]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP10]], 12
+; CHECK-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP6]], 12
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: mismatch_sve_loop_preheader:
+; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
+; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
+; CHECK: mismatch_sve_loop:
+; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]]
+; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]]
+; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
+; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
+; CHECK: mismatch_sve_loop_inc:
+; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]]
+; CHECK-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
+; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_sve_loop_found:
+; CHECK-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
+; CHECK-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
+; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]]
+; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]]
+; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
+; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]]
+; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1
+; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]]
+; CHECK-NEXT: br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP43]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
+; CHECK-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP45]], [[TMP46]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: byte.compare:
+; CHECK-NEXT: br label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: [[EXTRA_PHI:%.*]] = phi i32 [ [[EXTRA:%.*]], [[WHILE_BODY]] ], [ [[EXTRA]], [[WHILE_COND]] ], [ [[EXTRA]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: [[RES:%.*]] = add i32 [[INC_LCSSA]], [[EXTRA_PHI]]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+; LOOP-DEL-LABEL: @compare_bytes_simple(
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN:%.*]], 1
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N:%.*]] to i64
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]]
+; LOOP-DEL: mismatch_mem_check:
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP10]], 12
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP6]], 12
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]]
+; LOOP-DEL: mismatch_sve_loop_preheader:
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
+; LOOP-DEL-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
+; LOOP-DEL: mismatch_sve_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]]
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
+; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
+; LOOP-DEL: mismatch_sve_loop_inc:
+; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]]
+; LOOP-DEL-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
+; LOOP-DEL-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
+; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[WHILE_END:%.*]]
+; LOOP-DEL: mismatch_sve_loop_found:
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; LOOP-DEL-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; LOOP-DEL-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
+; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]]
+; LOOP-DEL-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
+; LOOP-DEL-NEXT: br label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_pre:
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]]
+; LOOP-DEL-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
+; LOOP-DEL-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]]
+; LOOP-DEL-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1
+; LOOP-DEL-NEXT: [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]]
+; LOOP-DEL-NEXT: br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_inc:
+; LOOP-DEL-NEXT: [[TMP43]] = add i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP44]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: [[RES:%.*]] = add i32 [[MISMATCH_RESULT]], [[EXTRA:%.*]]
+; LOOP-DEL-NEXT: ret i32 [[RES]]
+;
+; NO-TRANSFORM-LABEL: @compare_bytes_simple(
+; NO-TRANSFORM-NEXT: entry:
+; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM: while.cond:
+; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM: while.body:
+; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; NO-TRANSFORM: while.end:
+; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT: [[EXTRA_PHI:%.*]] = phi i32 [ [[EXTRA:%.*]], [[WHILE_BODY]] ], [ [[EXTRA]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT: [[RES:%.*]] = add i32 [[INC_LCSSA]], [[EXTRA_PHI]]
+; NO-TRANSFORM-NEXT: ret i32 [[RES]]
+;
+entry:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+ %extra.phi = phi i32 [ %extra, %while.body ], [ %extra, %while.cond ]
+ %res = add i32 %inc.lcssa, %extra.phi
+ ret i32 %res
+}
+
+
+define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) {
+; CHECK-LABEL: @compare_bytes_signed_wrap(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN:%.*]], 1
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N:%.*]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP10]], 12
+; CHECK-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP6]], 12
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_sve_loop_preheader:
+; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
+; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
+; CHECK: mismatch_sve_loop:
+; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]]
+; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]]
+; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
+; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
+; CHECK: mismatch_sve_loop_inc:
+; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]]
+; CHECK-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
+; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_sve_loop_found:
+; CHECK-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
+; CHECK-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
+; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]]
+; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]]
+; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
+; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]]
+; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1
+; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]]
+; CHECK-NEXT: br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP43]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
+; CHECK-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP45]], [[TMP46]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: byte.compare:
+; CHECK-NEXT: br label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: @compare_bytes_signed_wrap(
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN:%.*]], 1
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N:%.*]] to i64
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL: mismatch_mem_check:
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP10]], 12
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP6]], 12
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LOOP-DEL: mismatch_sve_loop_preheader:
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
+; LOOP-DEL-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
+; LOOP-DEL: mismatch_sve_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]]
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
+; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
+; LOOP-DEL: mismatch_sve_loop_inc:
+; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]]
+; LOOP-DEL-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
+; LOOP-DEL-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
+; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[WHILE_END:%.*]]
+; LOOP-DEL: mismatch_sve_loop_found:
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; LOOP-DEL-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; LOOP-DEL-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
+; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]]
+; LOOP-DEL-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
+; LOOP-DEL-NEXT: br label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_pre:
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]]
+; LOOP-DEL-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
+; LOOP-DEL-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]]
+; LOOP-DEL-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1
+; LOOP-DEL-NEXT: [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]]
+; LOOP-DEL-NEXT: br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]]
+; LOOP-DEL: mismatch_loop_inc:
+; LOOP-DEL-NEXT: [[TMP43]] = add nsw i32 [[MISMATCH_INDEX]], 1
+; LOOP-DEL-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP44]], label [[WHILE_END]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: while.end:
+; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: ret i32 [[MISMATCH_RESULT]]
+;
+; NO-TRANSFORM-LABEL: @compare_bytes_signed_wrap(
+; NO-TRANSFORM-NEXT: entry:
+; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]]
+; NO-TRANSFORM: while.cond:
+; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
+; NO-TRANSFORM-NEXT: [[INC]] = add nsw i32 [[LEN_ADDR]], 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; NO-TRANSFORM: while.body:
+; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
+; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]]
+; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; NO-TRANSFORM: while.end:
+; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ]
+; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]]
+;
+entry:
+ br label %while.cond
+
+while.cond:
+ %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ]
+ %inc = add nsw i32 %len.addr, 1
+ %cmp.not = icmp eq i32 %inc, %n
+ br i1 %cmp.not, label %while.end, label %while.body
+
+while.body:
+ %idxprom = zext i32 %inc to i64
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom
+ %0 = load i8, ptr %arrayidx
+ %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
+ %1 = load i8, ptr %arrayidx2
+ %cmp.not2 = icmp eq i8 %0, %1
+ br i1 %cmp.not2, label %while.cond, label %while.end
+
+while.end:
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ]
+ ret i32 %inc.lcssa
+}
+
+
+define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32 %n) {
+; CHECK-LABEL: @compare_bytes_simple_end_ne_found(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN:%.*]], 1
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N:%.*]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP10]], 12
+; CHECK-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP6]], 12
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_sve_loop_preheader:
+; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
+; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
+; CHECK: mismatch_sve_loop:
+; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]]
+; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]]
+; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
+; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
+; CHECK: mismatch_sve_loop_inc:
+; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]]
+; CHECK-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
+; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_sve_loop_found:
+; CHECK-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
+; CHECK-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
+; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]]
+; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]]
+; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
+; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]]
+; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1
+; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]]
+; CHECK-NEXT: br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP43]] = add i32 [[MISMATCH_INDEX3]], 1
+; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
+; CHECK-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP45]], [[TMP46]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]]
+; CHECK: while.found:
+; CHECK-NEXT: [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: [[FOUND_PTR:%.*]] = phi ptr [ [[C:%.*]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: br label [[END:%.*]]
+; CHECK: byte.compare:
+; CHECK-NEXT: [[TMP47:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP47]], label [[WHILE_END]], label [[WHILE_FOUND]]
+; CHECK: while.end:
+; CHECK-NEXT: [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: [[END_PTR:%.*]] = phi ptr [ [[D:%.*]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ]
+; CHECK-NEXT: [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ]
+; CHECK-NEXT: store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4
+; CHECK-NEXT: ret i32 [[MISMATCH_INDEX]]
+;
+; LOOP-DEL-LABEL: @compare_bytes_simple_end_ne_found(
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN:%.*]], 1
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N:%.*]] to i64
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL: mismatch_mem_check:
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP10]], 12
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP6]], 12
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LOOP-DEL: mismatch_sve_loop_preheader:
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
+; LOOP-DEL-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
+; LOOP-DEL: mismatch_sve_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]]
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
+; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
+; LOOP-DEL: mismatch_sve_loop_inc:
+; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]]
+; LOOP-DEL-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
+; LOOP-DEL-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
+; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[BYTE_COMPARE:%.*]]
+; LOOP-DEL: mismatch_sve_loop_found:
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; LOOP-DEL-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; LOOP-DEL-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
+; LOOP-DEL-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]]
+; LOOP-DEL-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
+; LOOP-DEL-NEXT: br label [[BYTE_COMPARE]]
+; LOOP-DEL: mismatch_loop_pre:
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64
+; LOOP-DEL-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]]
+; LOOP-DEL-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
+; LOOP-DEL-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]]
+; LOOP-DEL-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1
+; LOOP-DEL-NEXT: [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]]
+; LOOP-DEL-NEXT: br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[BYTE_COMPARE]]
+; LOOP-DEL: mismatch_loop_inc:
+; LOOP-DEL-NEXT: [[TMP43]] = add i32 [[MISMATCH_INDEX3]], 1
+; LOOP-DEL-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP44]], label [[BYTE_COMPARE]], label [[MISMATCH_LOOP]]
+; LOOP-DEL: byte.compare:
+; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; LOOP-DEL-NEXT: [[TMP45:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; LOOP-DEL-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP45]], i32 [[N]], i32 [[MISMATCH_RESULT]]
+; LOOP-DEL-NEXT: [[SPEC_SELECT4:%.*]] = select i1 [[TMP45]], ptr [[D:%.*]], ptr [[C:%.*]]
+; 
LOOP-DEL-NEXT: store i32 [[SPEC_SELECT]], ptr [[SPEC_SELECT4]], align 4 +; LOOP-DEL-NEXT: ret i32 [[SPEC_SELECT]] +; +; NO-TRANSFORM-LABEL: @compare_bytes_simple_end_ne_found( +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]] +; NO-TRANSFORM: while.found: +; NO-TRANSFORM-NEXT: [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ] +; NO-TRANSFORM-NEXT: [[FOUND_PTR:%.*]] = phi ptr [ [[C:%.*]], [[WHILE_BODY]] ] +; NO-TRANSFORM-NEXT: br label [[END:%.*]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: [[END_PTR:%.*]] = phi ptr [ [[D:%.*]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: br label [[END]] +; NO-TRANSFORM: end: +; NO-TRANSFORM-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ] +; NO-TRANSFORM-NEXT: [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ] +; NO-TRANSFORM-NEXT: store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4 +; NO-TRANSFORM-NEXT: ret i32 [[MISMATCH_INDEX]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.found + +while.found: + %mismatch_index1 = phi i32 [ %inc, %while.body ] + %found_ptr = phi ptr [ %c, %while.body ] + br label %end + +while.end: + %mismatch_index2 = phi i32 [ %n, %while.cond ] + %end_ptr = phi ptr [ %d, %while.cond ] + br label %end + +end: + %mismatch_index = phi i32 [ %mismatch_index1, %while.found ], [ %mismatch_index2, %while.end ] + %store_ptr = phi ptr [ %end_ptr, %while.end ], [ %found_ptr, %while.found ] + store i32 %mismatch_index, ptr %store_ptr + ret i32 %mismatch_index +} + + + +define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) { +; CHECK-LABEL: @compare_bytes_extra_cmp( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N:%.*]], [[X:%.*]] +; CHECK-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]] +; CHECK: ph: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN:%.*]], 1 +; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]] +; 
CHECK: mismatch_min_it_check:
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP10]], 12
+; CHECK-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP6]], 12
+; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP11]], 12
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_sve_loop_preheader:
+; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
+; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
+; CHECK: mismatch_sve_loop:
+; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]]
+; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]]
+; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
+; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
+; CHECK-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
+; CHECK: mismatch_sve_loop_inc:
+; CHECK-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]]
+; CHECK-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
+; CHECK-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_sve_loop_found:
+; CHECK-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
+; CHECK-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; CHECK-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
+; CHECK-NEXT: [[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]]
+; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]]
+; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
+; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]]
+; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1
+; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]]
+; CHECK-NEXT: br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP43]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]]
+; CHECK-NEXT: br i1 [[TMP44]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP45]], [[TMP46]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK: byte.compare:
+; CHECK-NEXT: br label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[X]], [[ENTRY:%.*]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: ret i32 [[INC_LCSSA]]
+;
+; LOOP-DEL-LABEL: @compare_bytes_extra_cmp(
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N:%.*]], [[X:%.*]]
+; LOOP-DEL-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]]
+; LOOP-DEL: ph:
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN:%.*]], 1
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]]
+; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; LOOP-DEL: mismatch_mem_check:
+; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP1]]
+; LOOP-DEL-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; LOOP-DEL-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; LOOP-DEL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; LOOP-DEL-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP8]] to i64
+; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP9]] to i64
+; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP7]], 12
+; LOOP-DEL-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP10]], 12
+; LOOP-DEL-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP6]], 12
+; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP11]], 12
+; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP12]], [[TMP13]]
+; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]]
+; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]]
+; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; LOOP-DEL: mismatch_sve_loop_preheader:
+; LOOP-DEL-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[TMP2]])
+; LOOP-DEL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; LOOP-DEL-NEXT: [[TMP21:%.*]] = mul nuw nsw i64 [[TMP20]], 16
+; LOOP-DEL-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
+; LOOP-DEL: mismatch_sve_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
+; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_SVE_INDEX]]
+; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_SVE_INDEX]]
+; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]]
+; LOOP-DEL-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer
+; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]])
+; LOOP-DEL-NEXT: br i1 [[TMP28]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
+; LOOP-DEL: mismatch_sve_loop_inc:
+; LOOP-DEL-NEXT: [[TMP29]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP21]]
+; LOOP-DEL-NEXT: [[TMP30]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP29]], i64 [[TMP2]])
+; LOOP-DEL-NEXT: [[TMP31:%.*]] = extractelement <vscale x 16 x i1> [[TMP30]], i64 0
+; LOOP-DEL-NEXT: br i1 [[TMP31]], label [[MISMATCH_SVE_LOOP]], label [[WHILE_END]]
+; LOOP-DEL: mismatch_sve_loop_found:
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP27]], [[MISMATCH_SVE_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
+; LOOP-DEL-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP32:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; LOOP-DEL-NEXT: [[TMP33:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
+; LOOP-DEL-NEXT: [[TMP34:%.*]] = zext i32 [[TMP33]] to i64
+; LOOP-DEL-NEXT: 
[[TMP35:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP34]] +; LOOP-DEL-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP35]] to i32 +; LOOP-DEL-NEXT: br label [[WHILE_END]] +; LOOP-DEL: mismatch_loop_pre: +; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]] +; LOOP-DEL: mismatch_loop: +; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP43:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; LOOP-DEL-NEXT: [[TMP37:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; LOOP-DEL-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP37]] +; LOOP-DEL-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 +; LOOP-DEL-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP37]] +; LOOP-DEL-NEXT: [[TMP41:%.*]] = load i8, ptr [[TMP40]], align 1 +; LOOP-DEL-NEXT: [[TMP42:%.*]] = icmp eq i8 [[TMP39]], [[TMP41]] +; LOOP-DEL-NEXT: br i1 [[TMP42]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]] +; LOOP-DEL: mismatch_loop_inc: +; LOOP-DEL-NEXT: [[TMP43]] = add i32 [[MISMATCH_INDEX]], 1 +; LOOP-DEL-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP43]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP44]], label [[WHILE_END]], label [[MISMATCH_LOOP]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP36]], [[MISMATCH_SVE_LOOP_FOUND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: @compare_bytes_extra_cmp( +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N:%.*]], [[X:%.*]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]] +; NO-TRANSFORM: ph: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[PH]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ], [ [[X]], [[ENTRY:%.*]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + %cmp.x = icmp ult i32 %n, %x + br i1 %cmp.x, label %ph, label %while.end + +ph: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %ph ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + 
+while.end:
+ %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ], [ %x, %entry ]
+ ret i32 %inc.lcssa
+}
+
+define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) {
+; CHECK-LABEL: @compare_bytes_cleanup_block(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]]
+; CHECK: mismatch_min_it_check:
+; CHECK-NEXT: br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]]
+; CHECK: mismatch_mem_check:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1:%.*]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[SRC2:%.*]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP4]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP5]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP3]], 12
+; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP6]], 12
+; CHECK-NEXT: [[TMP10:%.*]] = lshr i64 [[TMP2]], 12
+; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP7]], 12
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_SVE_LOOP_PREHEADER:%.*]], !prof [[PROF1]]
+; CHECK: mismatch_sve_loop_preheader:
+; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 1, i64 0)
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP17:%.*]] = mul nuw nsw i64 [[TMP16]], 16
+; CHECK-NEXT: br label [[MISMATCH_SVE_LOOP:%.*]]
+; CHECK: mismatch_sve_loop:
+; CHECK-NEXT: [[MISMATCH_SVE_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP15]], [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP26:%.*]], [[MISMATCH_SVE_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[MISMATCH_SVE_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_SVE_LOOP_PREHEADER]] ], [ [[TMP25:%.*]], [[MISMATCH_SVE_LOOP_INC]] ]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_SVE_INDEX]]
+; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_SVE_INDEX]]
+; CHECK-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP20]], i32 1, <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <vscale x 16 x i8> [[TMP19]], [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = select <vscale x 16 x i1> [[MISMATCH_SVE_LOOP_PRED]], <vscale x 16 x i1> [[TMP22]], <vscale x 16 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP23]])
+; CHECK-NEXT: br i1 [[TMP24]], label [[MISMATCH_SVE_LOOP_FOUND:%.*]], label [[MISMATCH_SVE_LOOP_INC]]
+; CHECK: mismatch_sve_loop_inc:
+; CHECK-NEXT: [[TMP25]] = add nuw nsw i64 [[MISMATCH_SVE_INDEX]], [[TMP17]]
+; CHECK-NEXT: [[TMP26]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP25]], i64 0)
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <vscale x 16 x i1> [[TMP26]], i64 0
+; CHECK-NEXT: br i1 [[TMP27]], label [[MISMATCH_SVE_LOOP]], label [[MISMATCH_END:%.*]]
+; CHECK: mismatch_sve_loop_found:
+; CHECK-NEXT: [[MISMATCH_SVE_FOUND_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP23]], [[MISMATCH_SVE_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_SVE_LAST_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[MISMATCH_SVE_LOOP_PRED]], [[MISMATCH_SVE_LOOP]] ]
+; CHECK-NEXT: [[MISMATCH_SVE_FOUND_INDEX:%.*]] = phi i64 [ [[MISMATCH_SVE_INDEX]], [[MISMATCH_SVE_LOOP]] ]
+; CHECK-NEXT: [[TMP28:%.*]] = and <vscale x 16 x i1> [[MISMATCH_SVE_LAST_LOOP_PRED]], [[MISMATCH_SVE_FOUND_PRED]]
+; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> [[TMP28]], i1 true)
+; CHECK-NEXT: [[TMP30:%.*]] = zext i32 [[TMP29]] to i64
+; CHECK-NEXT: [[TMP31:%.*]] = add nuw nsw i64 [[MISMATCH_SVE_FOUND_INDEX]], [[TMP30]]
+; CHECK-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
+; CHECK-NEXT: br label [[MISMATCH_END]]
+; CHECK: mismatch_loop_pre:
+; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; CHECK: mismatch_loop:
+; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_LOOP_PRE]] ], [ [[TMP39:%.*]], [[MISMATCH_LOOP_INC:%.*]] ]
+; CHECK-NEXT: [[TMP33:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP33]]
+; CHECK-NEXT: [[TMP35:%.*]] = load i8, ptr [[TMP34]], align 1
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP33]]
+; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1
+; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i8 [[TMP35]], [[TMP37]]
+; CHECK-NEXT: br i1 [[TMP38]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]]
+; CHECK: mismatch_loop_inc:
+; CHECK-NEXT: [[TMP39]] = add i32 [[MISMATCH_INDEX]], 1
+; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i32 [[TMP39]], 0
+; CHECK-NEXT: br i1 [[TMP40]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]]
+; CHECK: mismatch_end:
+; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_SVE_LOOP_INC]] ], [ [[TMP32]], [[MISMATCH_SVE_LOOP_FOUND]] ]
+; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: while.cond:
+; CHECK-NEXT: [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ]
+; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN]], 1
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]]
+; CHECK: while.body:
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]]
+; CHECK: byte.compare:
+; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0
+; CHECK-NEXT: br i1 [[TMP43]], label [[CLEANUP_THREAD]], label [[IF_END]]
+; CHECK: cleanup.thread:
+; CHECK-NEXT: ret void
+; CHECK: if.end:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: ret void
+;
+; LOOP-DEL-LABEL: @compare_bytes_cleanup_block(
+; LOOP-DEL-NEXT: entry:
+; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]]
+; LOOP-DEL: mismatch_loop:
+; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[MISMATCH_LOOP]] ]
+; LOOP-DEL-NEXT: [[TMP0:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64
+; LOOP-DEL-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[SRC1:%.*]], i64 [[TMP0]]
+; LOOP-DEL-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1
+; LOOP-DEL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC2:%.*]], i64 [[TMP0]]
+; LOOP-DEL-NEXT: 
[[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 +; LOOP-DEL-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP2]], [[TMP4]] +; LOOP-DEL-NEXT: [[TMP6]] = add i32 [[MISMATCH_INDEX]], 1 +; LOOP-DEL-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 +; LOOP-DEL-NEXT: [[OR_COND:%.*]] = select i1 [[TMP5]], i1 true, i1 [[TMP7]] +; LOOP-DEL-NEXT: br i1 [[OR_COND]], label [[COMMON_RET:%.*]], label [[MISMATCH_LOOP]] +; LOOP-DEL: common.ret: +; LOOP-DEL-NEXT: ret void +; +; NO-TRANSFORM-LABEL: @compare_bytes_cleanup_block( +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], 0 +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]] +; NO-TRANSFORM: cleanup.thread: +; NO-TRANSFORM-NEXT: ret void +; NO-TRANSFORM: if.end: +; NO-TRANSFORM-NEXT: [[RES:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ] +; NO-TRANSFORM-NEXT: ret void +; +entry: + br label %while.cond + +while.cond: + %len = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %inc = add i32 %len, 1 + %cmp.not = icmp eq i32 %inc, 0 + br i1 %cmp.not, label %cleanup.thread, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr i8, ptr %src1, i64 %idxprom + %0 = load i8, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr i8, ptr %src2, i64 %idxprom + %1 = load i8, ptr %arrayidx2, align 1 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %if.end + +cleanup.thread: + ret void + +if.end: + %res = phi i32 [ %inc, %while.body ] + ret void +} + +; +; NEGATIVE TESTS +; + + +; Similar to @compare_bytes_simple, except in the while.end block we have an extra PHI +; with unique values for each incoming block from the loop. 
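+; Because those incoming values differ, the extra PHI cannot be folded into a
+; single value derived from the mismatch index, so the idiom is not recognized
+; and the loop below is left unchanged by all run lines.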
+define i32 @compare_bytes_simple2(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32 %n) { +; CHECK-LABEL: @compare_bytes_simple2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: [[FINAL_PTR:%.*]] = phi ptr [ [[C:%.*]], [[WHILE_BODY]] ], [ [[D:%.*]], [[WHILE_COND]] ] +; CHECK-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4 +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: @compare_bytes_simple2( +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: [[FINAL_PTR:%.*]] = phi ptr [ [[C:%.*]], [[WHILE_BODY]] ], [ [[D:%.*]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4 +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: @compare_bytes_simple2( +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr 
inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: [[FINAL_PTR:%.*]] = phi ptr [ [[C:%.*]], [[WHILE_BODY]] ], [ [[D:%.*]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4 +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ] + store i32 %inc.lcssa, ptr %final_ptr + ret i32 %inc.lcssa +} + + +define i32 @compare_bytes_sign_ext(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: @compare_bytes_sign_ext( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: @compare_bytes_sign_ext( +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = sext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: 
br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: @compare_bytes_sign_ext( +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = sext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = sext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i32 %inc.lcssa +} + + +define i32 @compare_bytes_outside_uses(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: @compare_bytes_outside_uses( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[LEN:%.*]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[RES:%.*]] = phi i1 [ [[CMP_NOT2]], [[WHILE_BODY]] ], [ [[CMP_NOT]], [[WHILE_COND]] ] +; CHECK-NEXT: [[EXT_RES:%.*]] = zext i1 [[RES]] to i32 +; CHECK-NEXT: ret i32 [[EXT_RES]] +; +; LOOP-DEL-LABEL: @compare_bytes_outside_uses( +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; 
LOOP-DEL-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[IV]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[LEN:%.*]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[RES:%.*]] = phi i1 [ [[CMP_NOT2]], [[WHILE_BODY]] ], [ [[CMP_NOT]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: [[EXT_RES:%.*]] = zext i1 [[RES]] to i32 +; LOOP-DEL-NEXT: ret i32 [[EXT_RES]] +; +; NO-TRANSFORM-LABEL: @compare_bytes_outside_uses( +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[IV]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[LEN:%.*]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[RES:%.*]] = phi i1 [ [[CMP_NOT2]], [[WHILE_BODY]] ], [ [[CMP_NOT]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: [[EXT_RES:%.*]] = zext i1 [[RES]] to i32 +; NO-TRANSFORM-NEXT: ret i32 [[EXT_RES]] +; +entry: + br label %while.cond + +while.cond: + %iv = phi i32 [ 0, %entry ], [ %inc, %while.body ] + %inc = add i32 %iv, 1 + %cmp.not = icmp eq i32 %inc, %len + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %res = phi i1 [ %cmp.not2, %while.body ], [ %cmp.not, %while.cond ] + %ext_res = zext i1 %res to i32 + ret i32 %ext_res +} + +define i64 @compare_bytes_i64_index(ptr %a, ptr %b, i64 %len, i64 %n) { +; CHECK-LABEL: @compare_bytes_i64_index( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i64 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i64 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label 
[[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INC]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[INC]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i64 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i64 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: @compare_bytes_i64_index( +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i64 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i64 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], [[N:%.*]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INC]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[INC]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i64 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: ret i64 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: @compare_bytes_i64_index( +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i64 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i64 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INC]], [[N:%.*]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INC]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[INC]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i64 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i64 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i64 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i64 %len.addr, 1 + %cmp.not = icmp eq i64 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %inc + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %inc + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i64 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i64 
%inc.lcssa +} + +define i32 @compare_bytes_simple_wrong_icmp1(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: @compare_bytes_simple_wrong_icmp1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp ne i32 [[INC]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: @compare_bytes_simple_wrong_icmp1( +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp ne i32 [[INC]], [[N:%.*]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: @compare_bytes_simple_wrong_icmp1( +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp ne i32 [[INC]], [[N:%.*]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; 
NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp ne i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i32 %inc.lcssa +} + +define i32 @compare_bytes_simple_wrong_icmp2(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: @compare_bytes_simple_wrong_icmp2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_BODY]], label [[WHILE_END:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: @compare_bytes_simple_wrong_icmp2( +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_BODY]], label [[WHILE_END:%.*]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: @compare_bytes_simple_wrong_icmp2( +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 
[[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_BODY]], label [[WHILE_END:%.*]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.body, label %while.end + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i32 %inc.lcssa +} + +define i32 @compare_bytes_simple_wrong_icmp3(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: @compare_bytes_simple_wrong_icmp3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp ne i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: @compare_bytes_simple_wrong_icmp3( +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: 
[[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp ne i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: @compare_bytes_simple_wrong_icmp3( +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp ne i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp ne i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i32 %inc.lcssa +} + +define i32 @compare_bytes_simple_wrong_icmp4(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: @compare_bytes_simple_wrong_icmp4( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_END]], label [[WHILE_COND]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], 
[[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: @compare_bytes_simple_wrong_icmp4( +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_END]], label [[WHILE_COND]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: @compare_bytes_simple_wrong_icmp4( +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_END]], label [[WHILE_COND]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.end, label %while.cond + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i32 %inc.lcssa +} + +define i32 @compare_bytes_bad_load_type(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: @compare_bytes_bad_load_type( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; 
CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i16 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: @compare_bytes_bad_load_type( +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i16 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: @compare_bytes_bad_load_type( +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i16 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom 
= zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i16, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i16, ptr %arrayidx2 + %cmp.not2 = icmp eq i16 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i32 %inc.lcssa +} + + +define i32 @compare_bytes_simple_optsize(ptr %a, ptr %b, i32 %len, i32 %extra, i32 %n) optsize { +; CHECK-LABEL: @compare_bytes_simple_optsize( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: [[EXTRA_PHI:%.*]] = phi i32 [ [[EXTRA:%.*]], [[WHILE_BODY]] ], [ [[EXTRA]], [[WHILE_COND]] ] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[INC_LCSSA]], [[EXTRA_PHI]] +; CHECK-NEXT: ret i32 [[RES]] +; +; LOOP-DEL-LABEL: @compare_bytes_simple_optsize( +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: [[EXTRA_PHI:%.*]] = phi i32 [ [[EXTRA:%.*]], [[WHILE_BODY]] ], [ [[EXTRA]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: [[RES:%.*]] = add i32 [[INC_LCSSA]], [[EXTRA_PHI]] +; LOOP-DEL-NEXT: ret i32 [[RES]] +; +; NO-TRANSFORM-LABEL: @compare_bytes_simple_optsize( +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN:%.*]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp 
eq i32 [[INC]], [[N:%.*]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: [[EXTRA_PHI:%.*]] = phi i32 [ [[EXTRA:%.*]], [[WHILE_BODY]] ], [ [[EXTRA]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: [[RES:%.*]] = add i32 [[INC_LCSSA]], [[EXTRA_PHI]] +; NO-TRANSFORM-NEXT: ret i32 [[RES]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + %extra.phi = phi i32 [ %extra, %while.body ], [ %extra, %while.cond ] + %res = add i32 %inc.lcssa, %extra.phi + ret i32 %res +} + diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/lit.local.cfg b/llvm/test/Transforms/LoopIdiom/AArch64/lit.local.cfg new file mode 100644 index 0000000000000000000000000000000000000000..10d4a0e953ed47f5514135baf286a9c3697ad4c0 --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/AArch64/lit.local.cfg @@ -0,0 +1,2 @@ +if not "AArch64" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/Verifier/get_vector_length.ll b/llvm/test/Verifier/get_vector_length.ll new file mode 100644 index 0000000000000000000000000000000000000000..2fb2e089cd69f757c4e93aa11aa568af58c076e4 --- /dev/null +++ b/llvm/test/Verifier/get_vector_length.ll @@ -0,0 +1,17 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i1) + +define i32 @vector_length_negative_vf(i32 zeroext %tc) { + ; CHECK: get_vector_length: VF must be positive + ; CHECK-NEXT: %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 -1, i1 true) + %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 -1, i1 true) + ret i32 %a +} + +define i32 @vector_length_zero_vf(i32 zeroext %tc) { + ; CHECK: get_vector_length: VF must be positive + ; CHECK-NEXT: %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 0, i1 true) + %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 0, i1 true) + ret i32 %a +} diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp index b7240f01300ce10ea030ac4820f14d8b4c46d442..f0f4ef5da91678f16c05ab17a876a182a72fac70 100644 --- a/llvm/utils/TableGen/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/CodeGenTarget.cpp @@ -77,6 +77,7 @@ StringRef 
llvm::getEnumName(MVT::SimpleValueType T) { case MVT::ppcf128: return "MVT::ppcf128"; case MVT::x86mmx: return "MVT::x86mmx"; case MVT::x86amx: return "MVT::x86amx"; + case MVT::aarch64svcount: return "MVT::aarch64svcount"; case MVT::i64x8: return "MVT::i64x8"; case MVT::Glue: return "MVT::Glue"; case MVT::isVoid: return "MVT::isVoid"; diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn index b3400b527abff5ef544f3f5e0832f9119603da40..3006278f9e9868624cea042eca01e5f375577422 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn @@ -129,6 +129,7 @@ static_library("LLVMAArch64CodeGen") { "AArch64InstrInfo.cpp", "AArch64KCFI.cpp", "AArch64LoadStoreOptimizer.cpp", + "AArch64LoopIdiomTransform.cpp", "AArch64LowerHomogeneousPrologEpilog.cpp", "AArch64MCInstLower.cpp", "AArch64MIPeepholeOpt.cpp",