From 2a9856d86f916a740d2cf77a99da6e752e3bc887 Mon Sep 17 00:00:00 2001 From: Alfred Huang Date: Sat, 30 Oct 2021 23:40:24 -0700 Subject: [PATCH] Added vmovn_high/vmovl/vmovl_high Neon intrinsics and instructions for -O3 reduction variable vectorization --- .../include/cg/aarch64/aarch64_cgfunc.h | 4 +- .../include/cg/aarch64/aarch64_md.def | 9 ++- src/mapleall/maple_be/include/cg/cgfunc.h | 4 +- .../src/cg/aarch64/aarch64_cgfunc.cpp | 46 ++++++++++--- src/mapleall/maple_be/src/cg/cgfunc.cpp | 31 ++++++++- .../maple_ir/include/intrinsic_vector.def | 68 ++++++++++++++++++- 6 files changed, 143 insertions(+), 19 deletions(-) diff --git a/src/mapleall/maple_be/include/cg/aarch64/aarch64_cgfunc.h b/src/mapleall/maple_be/include/cg/aarch64/aarch64_cgfunc.h index 2ce8326e1d..aec9d74065 100644 --- a/src/mapleall/maple_be/include/cg/aarch64/aarch64_cgfunc.h +++ b/src/mapleall/maple_be/include/cg/aarch64/aarch64_cgfunc.h @@ -284,7 +284,8 @@ class AArch64CGFunc : public CGFunc { Operand *o3, PrimType oTyp3) override; RegOperand *SelectVectorMerge(PrimType rTyp, Operand *o1, Operand *o2, int32 iNum) override; RegOperand *SelectVectorMull(PrimType rType, Operand *o1, PrimType oTyp1, Operand *o2, PrimType oTyp2) override; - RegOperand *SelectVectorNarrow(PrimType rType, Operand *o1, PrimType otyp, bool isLow) override; + RegOperand *SelectVectorNarrow(PrimType rType, Operand *o1, PrimType otyp) override; + RegOperand *SelectVectorNarrow2(PrimType rType, Operand *o1, PrimType oty1, Operand *o2, PrimType oty2) override; RegOperand *SelectVectorNeg(PrimType rType, Operand *o1) override; RegOperand *SelectVectorNot(PrimType rType, Operand *o1) override; RegOperand *SelectVectorPairwiseAdd(PrimType rType, Operand *src, PrimType sType) override; @@ -295,6 +296,7 @@ class AArch64CGFunc : public CGFunc { RegOperand *SelectVectorShiftRNarrow(PrimType rType, Operand *o1, PrimType oTyp, Operand *o2, bool isLow) override; RegOperand *SelectVectorSum(PrimType rtype, Operand *o1, PrimType 
oType) override; RegOperand *SelectVectorTableLookup(PrimType rType, Operand *o1, Operand *o2) override; + RegOperand *SelectVectorWiden(PrimType rType, Operand *o1, PrimType otyp, bool isLow) override; void SelectVectorCvt(Operand *res, PrimType rType, Operand *o1, PrimType oType); void SelectVectorZip(PrimType rType, Operand *o1, Operand *o2); diff --git a/src/mapleall/maple_be/include/cg/aarch64/aarch64_md.def b/src/mapleall/maple_be/include/cg/aarch64/aarch64_md.def index a23168f813..6a8b2f359f 100644 --- a/src/mapleall/maple_be/include/cg/aarch64/aarch64_md.def +++ b/src/mapleall/maple_be/include/cg/aarch64/aarch64_md.def @@ -772,7 +772,6 @@ DEFINE_MOP(MOP_vbaddvrv,{mopdReg8FD,mopdReg128VS},ISVECTOR,kLtFpalu,"addv","0,1" DEFINE_MOP(MOP_vhaddvrv,{mopdReg16FD,mopdReg128VS},ISVECTOR,kLtFpalu,"addv","0,1",1) DEFINE_MOP(MOP_vsaddvrv,{mopdReg32FD,mopdReg128VS},ISVECTOR,kLtFpalu,"addv","0,1",1) DEFINE_MOP(MOP_vdaddvrv,{mopdReg64FD,mopdReg128VS},ISVECTOR,kLtFpalu,"addp","0,1",1) -DEFINE_MOP(MOP_vxtnuv, {mopdReg64VD,mopdReg128VS},ISVECTOR,kLtFpalu,"xtn","0,1",1) DEFINE_MOP(MOP_vzcmequu,{mopdReg64VD,mopdReg64VS,mopdImm8},ISVECTOR,kLtFpalu,"cmeq","0,1,2",1) DEFINE_MOP(MOP_vzcmgtuu,{mopdReg64VD,mopdReg64VS,mopdImm8},ISVECTOR,kLtFpalu,"cmgt","0,1,2",1) @@ -807,8 +806,12 @@ DEFINE_MOP(MOP_vushrvvi,{mopdReg128VD,mopdReg128VS,mopdImm8},ISVECTOR,kLtFpalu," DEFINE_MOP(MOP_vshllvvi,{mopdReg128VD,mopdReg64VS,mopdImm8},ISVECTOR,kLtFpalu,"shll","0,1,2",1) DEFINE_MOP(MOP_vushllvvi,{mopdReg128VD,mopdReg64VS,mopdImm8},ISVECTOR,kLtFpalu,"ushll","0,1,2",1) -DEFINE_MOP(MOP_vsqxtnvv,{mopdReg64VD,mopdReg128VS},ISVECTOR,kLtFpalu,"sqxtn","0,1",1) -DEFINE_MOP(MOP_vuqxtnvv,{mopdReg64VD,mopdReg128VS},ISVECTOR,kLtFpalu,"uqxtn","0,1",1) +DEFINE_MOP(MOP_vxtnuv, {mopdReg64VD,mopdReg128VS},ISVECTOR,kLtFpalu,"xtn","0,1",1) +DEFINE_MOP(MOP_vsxtlvu, {mopdReg128VD,mopdReg64VS},ISVECTOR,kLtFpalu,"sxtl","0,1",1) +DEFINE_MOP(MOP_vuxtlvu, {mopdReg128VD,mopdReg64VS},ISVECTOR,kLtFpalu,"uxtl","0,1",1) 
+DEFINE_MOP(MOP_vxtn2uv, {mopdReg64VDS,mopdReg128VS},ISPARTDEF|ISVECTOR,kLtFpalu,"xtn2","0,1",1) +DEFINE_MOP(MOP_vsxtl2vv,{mopdReg128VD,mopdReg128VS},ISVECTOR,kLtFpalu,"sxtl2","0,1",1) +DEFINE_MOP(MOP_vuxtl2vv,{mopdReg128VD,mopdReg128VS},ISVECTOR,kLtFpalu,"uxtl2","0,1",1) DEFINE_MOP(MOP_vshruui, {mopdReg64VD,mopdReg64VS,mopdImm8},ISVECTOR,kLtFpalu,"sshr","0,1,2",1) DEFINE_MOP(MOP_vshrvvi, {mopdReg128VD,mopdReg128VS,mopdImm8},ISVECTOR,kLtFpalu,"sshr","0,1,2",1) diff --git a/src/mapleall/maple_be/include/cg/cgfunc.h b/src/mapleall/maple_be/include/cg/cgfunc.h index 44c36d9134..b529bb318e 100644 --- a/src/mapleall/maple_be/include/cg/cgfunc.h +++ b/src/mapleall/maple_be/include/cg/cgfunc.h @@ -309,7 +309,8 @@ class CGFunc { PrimType oTyp3) = 0; virtual RegOperand *SelectVectorMerge(PrimType rTyp, Operand *o1, Operand *o2, int32 iNum) = 0; virtual RegOperand *SelectVectorMull(PrimType rType, Operand *o1, PrimType oTyp1, Operand *o2, PrimType oTyp2) = 0; - virtual RegOperand *SelectVectorNarrow(PrimType rType, Operand *o1, PrimType otyp, bool isLow) = 0; + virtual RegOperand *SelectVectorNarrow(PrimType rType, Operand *o1, PrimType otyp) = 0; + virtual RegOperand *SelectVectorNarrow2(PrimType rType, Operand *o1, PrimType oty1, Operand *o2, PrimType oty2) = 0; virtual RegOperand *SelectVectorNeg(PrimType rType, Operand *o1) = 0; virtual RegOperand *SelectVectorNot(PrimType rType, Operand *o1) = 0; virtual RegOperand *SelectVectorPairwiseAdd(PrimType rType, Operand *src, PrimType sType) = 0; @@ -321,6 +322,7 @@ class CGFunc { Operand *o2, bool isLow) = 0; virtual RegOperand *SelectVectorSum(PrimType rtype, Operand *o1, PrimType oType) = 0; virtual RegOperand *SelectVectorTableLookup(PrimType rType, Operand *o1, Operand *o2) = 0; + virtual RegOperand *SelectVectorWiden(PrimType rType, Operand *o1, PrimType otyp, bool isLow) = 0; /* For ebo issue. 
*/ virtual Operand *GetTrueOpnd() { diff --git a/src/mapleall/maple_be/src/cg/aarch64/aarch64_cgfunc.cpp b/src/mapleall/maple_be/src/cg/aarch64/aarch64_cgfunc.cpp index aea08847c3..da3c2b487e 100644 --- a/src/mapleall/maple_be/src/cg/aarch64/aarch64_cgfunc.cpp +++ b/src/mapleall/maple_be/src/cg/aarch64/aarch64_cgfunc.cpp @@ -9645,8 +9645,7 @@ void AArch64CGFunc::SelectVectorCvt(Operand *res, PrimType rType, Operand *o1, P insn = &GetCG()->BuildInstruction(mOp, *res, *o1, *imm); } else if (GetPrimTypeSize(rType) < GetPrimTypeSize(oType)) { /* extract, similar to vqmovn_XX() intrinsics */ - mOp = (IsUnsignedInteger(rType)) ? MOP_vuqxtnvv : MOP_vsqxtnvv; - insn = &GetCG()->BuildInstruction(mOp, *res, *o1); + insn = &GetCG()->BuildInstruction(MOP_vxtnuv, *res, *o1); } else { CHECK_FATAL(0, "Invalid cvt between 2 operands of the same size"); } @@ -9965,24 +9964,31 @@ RegOperand *AArch64CGFunc::SelectVectorBitwiseOp(PrimType rType, Operand *o1, Pr return res; } -RegOperand *AArch64CGFunc::SelectVectorNarrow(PrimType rType, Operand *o1, PrimType otyp, bool isLow) { +RegOperand *AArch64CGFunc::SelectVectorNarrow(PrimType rType, Operand *o1, PrimType otyp) { RegOperand *res = &CreateRegisterOperandOfType(rType); /* result operand */ VectorRegSpec *vecSpecDest = GetMemoryPool()->New(rType); - VectorRegSpec *vecSpec1 = GetMemoryPool()->New(otyp); /* vector operand 1 */ + VectorRegSpec *vecSpec1 = GetMemoryPool()->New(otyp); /* vector operand */ - MOperator mOp; - if (isLow) { - mOp = MOP_vxtnuv; - } else { - CHECK_FATAL(0, "NYI: vmovn_high_"); - } - Insn *insn = &GetCG()->BuildInstruction(mOp, *res, *o1); + Insn *insn = &GetCG()->BuildInstruction(MOP_vxtnuv, *res, *o1); static_cast(insn)->PushRegSpecEntry(vecSpecDest); static_cast(insn)->PushRegSpecEntry(vecSpec1); GetCurBB()->AppendInsn(*insn); return res; } +RegOperand *AArch64CGFunc::SelectVectorNarrow2(PrimType rType, Operand *o1, PrimType oty1, Operand *o2, PrimType oty2) { + (void)oty1; /* 1st opnd was loaded already, 
type no longer needed */ + RegOperand *res = static_cast(o1); /* o1 is also the result */ + VectorRegSpec *vecSpecDest = GetMemoryPool()->New(rType); + VectorRegSpec *vecSpec2 = GetMemoryPool()->New(oty2); /* vector opnd2 */ + + Insn *insn = &GetCG()->BuildInstruction(MOP_vxtn2uv, *res, *o2); + static_cast(insn)->PushRegSpecEntry(vecSpecDest); + static_cast(insn)->PushRegSpecEntry(vecSpec2); + GetCurBB()->AppendInsn(*insn); + return res; +} + RegOperand *AArch64CGFunc::SelectVectorNot(PrimType rType, Operand *o1) { RegOperand *res = &CreateRegisterOperandOfType(rType); /* result operand */ VectorRegSpec *vecSpecDest = GetMemoryPool()->New(rType); @@ -10056,4 +10062,22 @@ void AArch64CGFunc::SelectVectorZip(PrimType rType, Operand *o1, Operand *o2) { } } +RegOperand *AArch64CGFunc::SelectVectorWiden(PrimType rType, Operand *o1, PrimType otyp, bool isLow) { + RegOperand *res = &CreateRegisterOperandOfType(rType); /* result operand */ + VectorRegSpec *vecSpecDest = GetMemoryPool()->New(rType); + VectorRegSpec *vecSpec1 = GetMemoryPool()->New(otyp); /* vector operand */ + + MOperator mOp; + if (isLow) { + mOp = IsPrimitiveUnSignedVector(rType) ? MOP_vuxtlvu : MOP_vsxtlvu; + } else { + mOp = IsPrimitiveUnSignedVector(rType) ? 
MOP_vuxtl2vv : MOP_vsxtl2vv; + } + Insn *insn = &GetCG()->BuildInstruction(mOp, *res, *o1); + static_cast(insn)->PushRegSpecEntry(vecSpecDest); + static_cast(insn)->PushRegSpecEntry(vecSpec1); + GetCurBB()->AppendInsn(*insn); + return res; +} + } /* namespace maplebe */ diff --git a/src/mapleall/maple_be/src/cg/cgfunc.cpp b/src/mapleall/maple_be/src/cg/cgfunc.cpp index 67a3a19d5e..1fbdcd66ce 100644 --- a/src/mapleall/maple_be/src/cg/cgfunc.cpp +++ b/src/mapleall/maple_be/src/cg/cgfunc.cpp @@ -480,11 +480,23 @@ Operand *HandleVectorMull(IntrinsicopNode &intrnNode, CGFunc &cgFunc) { Operand *HandleVectorNarrow(IntrinsicopNode &intrnNode, CGFunc &cgFunc, bool isLow) { PrimType rType = intrnNode.GetPrimType(); /* result operand */ - Operand *opnd1 = cgFunc.HandleExpr(intrnNode, *intrnNode.Opnd(0)); /* vector operand 1 */ - return cgFunc.SelectVectorNarrow(rType, opnd1, intrnNode.Opnd(0)->GetPrimType(), isLow); + Operand *opnd1 = cgFunc.HandleExpr(intrnNode, *intrnNode.Opnd(0)); /* vector opnd 1 */ + if (isLow) { + return cgFunc.SelectVectorNarrow(rType, opnd1, intrnNode.Opnd(0)->GetPrimType()); + } else { + Operand *opnd2 = cgFunc.HandleExpr(intrnNode, *intrnNode.Opnd(1)); /* vector opnd 2 */ + return cgFunc.SelectVectorNarrow2(rType, opnd1, intrnNode.Opnd(0)->GetPrimType(), opnd2, intrnNode.Opnd(1)->GetPrimType()); + } +} + +Operand *HandleVectorWiden(IntrinsicopNode &intrnNode, CGFunc &cgFunc, bool isLow) { + PrimType rType = intrnNode.GetPrimType(); /* result operand */ + Operand *opnd1 = cgFunc.HandleExpr(intrnNode, *intrnNode.Opnd(0)); /* vector opnd 1 */ + return cgFunc.SelectVectorWiden(rType, opnd1, intrnNode.Opnd(0)->GetPrimType(), isLow); } Operand *HandleIntrinOp(const BaseNode &parent, BaseNode &expr, CGFunc &cgFunc) { +// (void)parent; auto &intrinsicopNode = static_cast(expr); switch (intrinsicopNode.GetIntrinsic()) { case INTRN_MPL_READ_OVTABLE_ENTRY_LAZY: { @@ -708,6 +720,11 @@ Operand *HandleIntrinOp(const BaseNode &parent, BaseNode &expr, CGFunc 
&cgFunc) case INTRN_vector_narrow_low_v2u64: case INTRN_vector_narrow_low_v2i64: return HandleVectorNarrow(intrinsicopNode, cgFunc, true); + case INTRN_vector_narrow_high_v8u16: case INTRN_vector_narrow_high_v8i16: + case INTRN_vector_narrow_high_v4u32: case INTRN_vector_narrow_high_v4i32: + case INTRN_vector_narrow_high_v2u64: case INTRN_vector_narrow_high_v2i64: + return HandleVectorNarrow(intrinsicopNode, cgFunc, false); + case INTRN_vector_reverse_v8u8: case INTRN_vector_reverse_v8i8: case INTRN_vector_reverse_v4u16: case INTRN_vector_reverse_v4i16: case INTRN_vector_reverse_v16u8: case INTRN_vector_reverse_v16i8: @@ -723,6 +740,16 @@ Operand *HandleIntrinOp(const BaseNode &parent, BaseNode &expr, CGFunc &cgFunc) case INTRN_vector_table_lookup_v16u8: case INTRN_vector_table_lookup_v16i8: return HandleVectorTableLookup(intrinsicopNode, cgFunc); + case INTRN_vector_widen_low_v8u8: case INTRN_vector_widen_low_v8i8: + case INTRN_vector_widen_low_v4u16: case INTRN_vector_widen_low_v4i16: + case INTRN_vector_widen_low_v2u32: case INTRN_vector_widen_low_v2i32: + return HandleVectorWiden(intrinsicopNode, cgFunc, true); + + case INTRN_vector_widen_high_v8u8: case INTRN_vector_widen_high_v8i8: + case INTRN_vector_widen_high_v4u16: case INTRN_vector_widen_high_v4i16: + case INTRN_vector_widen_high_v2u32: case INTRN_vector_widen_high_v2i32: + return HandleVectorWiden(intrinsicopNode, cgFunc, false); + default: ASSERT(false, "Should not reach here."); return nullptr; diff --git a/src/mapleall/maple_ir/include/intrinsic_vector.def b/src/mapleall/maple_ir/include/intrinsic_vector.def index 9f2042d8ef..0e07c76083 100644 --- a/src/mapleall/maple_ir/include/intrinsic_vector.def +++ b/src/mapleall/maple_ir/include/intrinsic_vector.def @@ -419,8 +419,52 @@ DEF_MIR_INTRINSIC(vector_set_element_v2f32, "vector_set_element_v2f32", INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV2F32, kArgTyF32, kArgTyV2F32, kArgTyI32) +// vecTy2 vector_widen_low(vecTy1 src) +// Widen each 
element of the 64-bit source vector to twice the +// original width, producing a 128-bit destination vector. +DEF_MIR_INTRINSIC(vector_widen_low_v2i32, "vector_widen_low_v2i32", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV2I64, + kArgTyV2I32) +DEF_MIR_INTRINSIC(vector_widen_low_v4i16, "vector_widen_low_v4i16", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV4I32, + kArgTyV4I16) +DEF_MIR_INTRINSIC(vector_widen_low_v8i8, "vector_widen_low_v8i8", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV8I16, + kArgTyV8I8) +DEF_MIR_INTRINSIC(vector_widen_low_v2u32, "vector_widen_low_v2u32", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV2U64, + kArgTyV2U32) +DEF_MIR_INTRINSIC(vector_widen_low_v4u16, "vector_widen_low_v4u16", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV4U32, + kArgTyV4U16) +DEF_MIR_INTRINSIC(vector_widen_low_v8u8, "vector_widen_low_v8u8", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV8U16, + kArgTyV8U8) + +// vecTy2 vector_widen_high(vecTy1 src) +// Widen each element in the upper half of the 128-bit source vector to twice +// the original width, producing a 128-bit destination vector. 
+DEF_MIR_INTRINSIC(vector_widen_high_v2i32, "vector_widen_high_v2i32", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV2I64, + kArgTyV4I32) +DEF_MIR_INTRINSIC(vector_widen_high_v4i16, "vector_widen_high_v4i16", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV4I32, + kArgTyV8I16) +DEF_MIR_INTRINSIC(vector_widen_high_v8i8, "vector_widen_high_v8i8", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV8I16, + kArgTyV16I8) +DEF_MIR_INTRINSIC(vector_widen_high_v2u32, "vector_widen_high_v2u32", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV2U64, + kArgTyV4U32) +DEF_MIR_INTRINSIC(vector_widen_high_v4u16, "vector_widen_high_v4u16", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV4U32, + kArgTyV8U16) +DEF_MIR_INTRINSIC(vector_widen_high_v8u8, "vector_widen_high_v8u8", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV8U16, + kArgTyV16U8) + // vecTy2 vector_narrow_low(vecTy1 src) -// Narrow each element of the source vector to half of the original width, +// Narrow each element of the 128-bit source vector to half of the original width, // then write it to the lower half of the destination vector. DEF_MIR_INTRINSIC(vector_narrow_low_v2i64, "vector_narrow_low_v2i64", INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV2I32, @@ -441,6 +485,28 @@ DEF_MIR_INTRINSIC(vector_narrow_low_v8u16, "vector_narrow_low_v8u16", INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV8U8, kArgTyV8U16) +// vecTy3 vector_narrow_high(vecTy1 src1, vecTy2 src2) +// Narrow each element of the 128-bit src2 to half of the original width and +// write it to the upper half of the 128-bit destination; the lower half is src1. 
+DEF_MIR_INTRINSIC(vector_narrow_high_v2i64, "vector_narrow_high_v2i64", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV4I32, + kArgTyV2I32, kArgTyV2I64) +DEF_MIR_INTRINSIC(vector_narrow_high_v4i32, "vector_narrow_high_v4i32", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV8I16, + kArgTyV4I16, kArgTyV4I32) +DEF_MIR_INTRINSIC(vector_narrow_high_v8i16, "vector_narrow_high_v8i16", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV16I8, + kArgTyV8I8, kArgTyV8I16) +DEF_MIR_INTRINSIC(vector_narrow_high_v2u64, "vector_narrow_high_v2u64", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV4U32, + kArgTyV2U32, kArgTyV2U64) +DEF_MIR_INTRINSIC(vector_narrow_high_v4u32, "vector_narrow_high_v4u32", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV8U16, + kArgTyV4U16, kArgTyV4U32) +DEF_MIR_INTRINSIC(vector_narrow_high_v8u16, "vector_narrow_high_v8u16", + INTRNISVECTOR | INTRNISPURE | INTRNNOSIDEEFFECT, kArgTyV16U8, + kArgTyV8U8, kArgTyV8U16) + // vecTy2 vector_pairwise_add(vecTy1 src) // Add pairs of elements from the source vector and put the result into the // destination vector, whose element size is twice and the number of -- Gitee