From 4198cc60e9f42182aae6e62984a8c7dae5e543f2 Mon Sep 17 00:00:00 2001
From: fye
Date: Wed, 13 Oct 2021 18:20:38 -0700
Subject: [PATCH 1/2] ME: simdization on memcpy and memset

---
Review notes (free-form text in this position is not part of the commit):
 - This series was rebuilt from a whitespace-collapsed copy. Line breaks follow
   the diff markers and match the original hunk counts; indentation and the
   position of blank context lines are best effort. Template arguments that had
   been stripped (static_cast<...>, MapleMap<...>) were re-inferred from the
   declared types and must be checked against the tree before applying. The
   "index" blob hashes are the ones from the original submission.
 - me_merge_stmts.cpp: the new OP_intrinsiccall case now pushes the nullptr
   separator itself. Before this patch every intrinsic call reached the default
   case, which pushes it; without the push, assign candidates on both sides of
   a call end up in the same group.
 - me_merge_stmts.cpp: the constant length is range-checked as int64 and only
   narrowed afterwards.
 - me_merge_stmts.cpp: the 8-byte tail of simdMemset no longer shadows the
   16-byte locals; the disabled "future" block now refers to setLength.
 - aarch64_strldr.cpp: dropped the repeated IsAtomic() test (the preceding
   context lines already return for atomic insns).
 - Open question: both NaryMeExpr nodes are created with a plain `new` instead
   of going through the IR map; confirm that this is intended.

 .../src/cg/aarch64/aarch64_strldr.cpp         |   5 +
 .../maple_me/include/me_merge_stmts.h         |   8 +-
 src/mapleall/maple_me/src/me_merge_stmts.cpp  | 201 +++++++++++++++++-
 3 files changed, 211 insertions(+), 3 deletions(-)

diff --git a/src/mapleall/maple_be/src/cg/aarch64/aarch64_strldr.cpp b/src/mapleall/maple_be/src/cg/aarch64/aarch64_strldr.cpp
index 467d1ab89c..e6d84f2a5c 100644
--- a/src/mapleall/maple_be/src/cg/aarch64/aarch64_strldr.cpp
+++ b/src/mapleall/maple_be/src/cg/aarch64/aarch64_strldr.cpp
@@ -677,6 +677,11 @@ bool AArch64StoreLoadOpt::CanDoMemProp(Insn *insn) {
   if (insn->IsAtomic()) {
     return false;
   }
+  // It is not desired to propagate on 128bit reg with immediate offset
+  // which may cause linker to issue misalignment error
+  if (insn->GetOperand(0).GetSize() == k128BitSize) {
+    return false;
+  }
   AArch64MemOperand *currMemOpnd = static_cast<AArch64MemOperand*>(insn->GetMemOpnd());
   return currMemOpnd != nullptr;
 }
diff --git a/src/mapleall/maple_me/include/me_merge_stmts.h b/src/mapleall/maple_me/include/me_merge_stmts.h
index 4c91ca61dc..7302ca594b 100644
--- a/src/mapleall/maple_me/include/me_merge_stmts.h
+++ b/src/mapleall/maple_me/include/me_merge_stmts.h
@@ -16,7 +16,9 @@
 #define MAPLE_ME_INCLUDE_ME_MERGE_STMTS_H
 #include "me_function.h"
 
-// Merge smaller stores into larger one
+// 1. Merge smaller stores into larger one
+// 2. Simdize intrinsic
+
 namespace maple {
 class MergeStmts {
  public:
@@ -31,6 +33,10 @@ class MergeStmts {
   void mergeIassigns(vOffsetStmt& iassignCandidates);
   void mergeDassigns(vOffsetStmt& dassignCandidates);
   uint32 GetPointedTypeBitSize(TyIdx ptrTypeIdx);
+  IassignMeStmt *genSimdIassign(int32 offset, IvarMeExpr iVar1, IvarMeExpr iVar2, MapleMap<OStIdx, ChiMeNode*> &stmtChi, TyIdx ptrTypeIdx);
+  IassignMeStmt *genSimdIassign(int32 offset, IvarMeExpr iVar, RegMeExpr& regVal, MapleMap<OStIdx, ChiMeNode*> &stmtChi, TyIdx ptrTypeIdx);
+  void simdMemcpy(IntrinsiccallMeStmt* memcpyCallStmt);
+  void simdMemset(IntrinsiccallMeStmt* memsetCallStmt);
  private:
   MeFunction &func;
 };
diff --git a/src/mapleall/maple_me/src/me_merge_stmts.cpp b/src/mapleall/maple_me/src/me_merge_stmts.cpp
index 6dfef0b49d..d2e513dbeb 100644
--- a/src/mapleall/maple_me/src/me_merge_stmts.cpp
+++ b/src/mapleall/maple_me/src/me_merge_stmts.cpp
@@ -243,7 +243,189 @@ void MergeStmts::mergeDassigns(vOffsetStmt& dassignCandidates) {
   }
 }
 
+// Build the iassign "*(dst + offset) = *(src + offset)" typed by ptrTypeIdx.
+// iVar1/iVar2 are taken by value so each call can set its own offset before hashing them into the IR map.
+IassignMeStmt *MergeStmts::genSimdIassign(int32 offset, IvarMeExpr iVar1, IvarMeExpr iVar2,
+                                          MapleMap<OStIdx, ChiMeNode*> &stmtChi, TyIdx ptrTypeIdx) {
+  MeIRMap *irMap = func.GetIRMap();
+  iVar1.SetOffset(offset);
+  IvarMeExpr *dstIvar = static_cast<IvarMeExpr*>(irMap->HashMeExpr(iVar1));
+  iVar2.SetOffset(offset);
+  IvarMeExpr *srcIvar = static_cast<IvarMeExpr*>(irMap->HashMeExpr(iVar2));
+  IassignMeStmt *xIassignStmt = irMap->CreateIassignMeStmt(ptrTypeIdx, *dstIvar, *srcIvar, stmtChi);
+  return xIassignStmt;
+}
+
+// Same as the overload above, except that the stored value is the register valMeExpr instead of a load.
+IassignMeStmt *MergeStmts::genSimdIassign(int32 offset, IvarMeExpr iVar, RegMeExpr& valMeExpr,
+                                          MapleMap<OStIdx, ChiMeNode*> &stmtChi, TyIdx ptrTypeIdx) {
+  MeIRMap *irMap = func.GetIRMap();
+  iVar.SetOffset(offset);
+  IvarMeExpr *dstIvar = static_cast<IvarMeExpr*>(irMap->HashMeExpr(iVar));
+  IassignMeStmt *xIassignStmt = irMap->CreateIassignMeStmt(ptrTypeIdx, *dstIvar, valMeExpr, stmtChi);
+  return xIassignStmt;
+}
+
+// Longest constant length, in bytes, that simdMemcpy/simdMemset expand inline
+const uint32 simdThreshold = 128;
+// Replace a memcpy intrinsic call whose length is an integer constant multiple of 8 in (0, simdThreshold]
+// by 16-byte vector copies plus at most one trailing 8-byte copy, then remove the call; other calls are kept.
+void MergeStmts::simdMemcpy(IntrinsiccallMeStmt* memcpyCallStmt) {
+  ASSERT(memcpyCallStmt->GetIntrinsic() == INTRN_C_memcpy, "The stmt is NOT intrinsic memcpy");
+
+  ConstMeExpr *lengthExpr = static_cast<ConstMeExpr*>(memcpyCallStmt->GetOpnd(2));
+  if (!lengthExpr || lengthExpr->GetMeOp() != kMeOpConst ||
+      lengthExpr->GetConstVal()->GetKind() != kConstInt) {
+    return;
+  }
+  // Range-check before narrowing to int32 so that an oversized length cannot wrap into the accepted range
+  int64 copyLength = lengthExpr->GetIntValue();
+  if (copyLength <= 0 || copyLength > simdThreshold || copyLength % 8 != 0) {
+    return;
+  }
+
+  int32 numOf16Byte = static_cast<int32>(copyLength / 16);
+  int32 numOf8Byte = static_cast<int32>((copyLength % 16) / 8);
+  int32 offset8Byte = static_cast<int32>(copyLength - (copyLength % 16));
+  /* Leave following cases for future
+  int32 numOf4Byte = (copyLength % 8) / 4;
+  int32 offset4Byte = copyLength - (copyLength % 8);
+  int32 numOf2Byte = (copyLength % 4) / 2;
+  int32 offset2Byte = copyLength - (copyLength % 4);
+  int32 numOf1Byte = (copyLength % 2);
+  int32 offset1Byte = copyLength - (copyLength % 2);
+  */
+  MeExpr *dstMeExpr = memcpyCallStmt->GetOpnd(0);
+  MeExpr *srcMeExpr = memcpyCallStmt->GetOpnd(1);
+  MapleMap<OStIdx, ChiMeNode*> *memcpyCallStmtChi = memcpyCallStmt->GetChiList();
+  MIRType *v16uint8MirType = GlobalTables::GetTypeTable().GetV16UInt8();
+  MIRType *v16uint8PtrType = GlobalTables::GetTypeTable().GetOrCreatePointerType(*v16uint8MirType, PTY_ptr);
+
+  IvarMeExpr tmpIvar1(kInvalidExprID, PTY_v16u8, v16uint8PtrType->GetTypeIndex(), 0);
+  if (dstMeExpr->GetOp() != OP_regread) {
+    RegMeExpr *addrRegMeExpr = func.GetIRMap()->CreateRegMeExpr(PTY_a64);
+    MeStmt *addrRegAssignMeStmt = func.GetIRMap()->CreateAssignMeStmt(*addrRegMeExpr, *dstMeExpr, *memcpyCallStmt->GetBB());
+    memcpyCallStmt->GetBB()->InsertMeStmtBefore(memcpyCallStmt, addrRegAssignMeStmt);
+    dstMeExpr = addrRegMeExpr;
+  }
+  tmpIvar1.SetBase(dstMeExpr);
+  IvarMeExpr tmpIvar2(kInvalidExprID, PTY_v16u8, v16uint8PtrType->GetTypeIndex(), 0);
+  if (srcMeExpr->GetOp() != OP_regread) {
+    RegMeExpr *addrRegMeExpr = func.GetIRMap()->CreateRegMeExpr(PTY_a64);
+    MeStmt *addrRegAssignMeStmt = func.GetIRMap()->CreateAssignMeStmt(*addrRegMeExpr, *srcMeExpr, *memcpyCallStmt->GetBB());
+    memcpyCallStmt->GetBB()->InsertMeStmtBefore(memcpyCallStmt, addrRegAssignMeStmt);
+    srcMeExpr = addrRegMeExpr;
+  }
+  tmpIvar2.SetBase(srcMeExpr);
+
+  for (int32 i = 0; i < numOf16Byte; i++) {
+    IassignMeStmt *xIassignStmt = genSimdIassign(16 * i, tmpIvar1, tmpIvar2, *memcpyCallStmtChi,
+                                                 v16uint8PtrType->GetTypeIndex());
+    memcpyCallStmt->GetBB()->InsertMeStmtBefore(memcpyCallStmt, xIassignStmt);
+  }
+
+  if (numOf8Byte != 0) {
+    MIRType *v8uint8MirType = GlobalTables::GetTypeTable().GetV8UInt8();
+    MIRType *v8uint8PtrType = GlobalTables::GetTypeTable().GetOrCreatePointerType(*v8uint8MirType, PTY_ptr);
+    IvarMeExpr tmpIvar3(kInvalidExprID, PTY_v8u8, v8uint8PtrType->GetTypeIndex(), 0);
+    tmpIvar3.SetBase(dstMeExpr);
+    IvarMeExpr tmpIvar4(kInvalidExprID, PTY_v8u8, v8uint8PtrType->GetTypeIndex(), 0);
+    tmpIvar4.SetBase(srcMeExpr);
+    IassignMeStmt *xIassignStmt = genSimdIassign(offset8Byte, tmpIvar3, tmpIvar4, *memcpyCallStmtChi,
+                                                 v8uint8PtrType->GetTypeIndex());
+    memcpyCallStmt->GetBB()->InsertMeStmtBefore(memcpyCallStmt, xIassignStmt);
+  }
+
+  // Remove memcpy stmt
+  if (numOf8Byte != 0 || numOf16Byte != 0) {
+    BB *bb = memcpyCallStmt->GetBB();
+    bb->RemoveMeStmt(memcpyCallStmt);
+  }
+}
+
+// Replace a memset intrinsic call under the same length conditions as simdMemcpy: the fill value is turned
+// into a vector with vector_from_scalar, stored 16 bytes at a time plus at most one trailing 8-byte store.
+void MergeStmts::simdMemset(IntrinsiccallMeStmt* memsetCallStmt) {
+  ASSERT(memsetCallStmt->GetIntrinsic() == INTRN_C_memset, "The stmt is NOT intrinsic memset");
+
+  ConstMeExpr *numExpr = static_cast<ConstMeExpr*>(memsetCallStmt->GetOpnd(2));
+  if (!numExpr || numExpr->GetMeOp() != kMeOpConst ||
+      numExpr->GetConstVal()->GetKind() != kConstInt) {
+    return;
+  }
+  // Range-check before narrowing to int32 so that an oversized length cannot wrap into the accepted range
+  int64 setLength = numExpr->GetIntValue();
+  if (setLength <= 0 || setLength > simdThreshold || setLength % 8 != 0) {
+    return;
+  }
+
+  int32 numOf16Byte = static_cast<int32>(setLength / 16);
+  int32 numOf8Byte = static_cast<int32>((setLength % 16) / 8);
+  int32 offset8Byte = static_cast<int32>(setLength - (setLength % 16));
+  /* Leave following cases for future
+  int32 numOf4Byte = (setLength % 8) / 4;
+  int32 offset4Byte = setLength - (setLength % 8);
+  int32 numOf2Byte = (setLength % 4) / 2;
+  int32 offset2Byte = setLength - (setLength % 4);
+  int32 numOf1Byte = (setLength % 2);
+  int32 offset1Byte = setLength - (setLength % 2);
+  */
+  MeExpr *dstMeExpr = memsetCallStmt->GetOpnd(0);
+  MeExpr *fillValMeExpr = memsetCallStmt->GetOpnd(1);
+  MapleMap<OStIdx, ChiMeNode*> *memsetCallStmtChi = memsetCallStmt->GetChiList();
+  MIRType *v16u8MirType = GlobalTables::GetTypeTable().GetV16UInt8();
+  MIRType *v16u8PtrType = GlobalTables::GetTypeTable().GetOrCreatePointerType(*v16u8MirType, PTY_ptr);
+
+  IvarMeExpr tmpIvar(kInvalidExprID, PTY_v16u8, v16u8PtrType->GetTypeIndex(), 0);
+  if (dstMeExpr->GetOp() != OP_regread) {
+    RegMeExpr *addrRegMeExpr = func.GetIRMap()->CreateRegMeExpr(PTY_a64);
+    MeStmt *addrRegAssignMeStmt = func.GetIRMap()->CreateAssignMeStmt(*addrRegMeExpr, *dstMeExpr, *memsetCallStmt->GetBB());
+    memsetCallStmt->GetBB()->InsertMeStmtBefore(memsetCallStmt, addrRegAssignMeStmt);
+    dstMeExpr = addrRegMeExpr;
+  }
+  tmpIvar.SetBase(dstMeExpr);
+
+  RegMeExpr *dupRegMeExpr = func.GetIRMap()->CreateRegMeExpr(PTY_v16u8);
+  NaryMeExpr *dupValMeExpr = new NaryMeExpr(&func.GetIRMap()->GetIRMapAlloc(), kInvalidExprID, OP_intrinsicop, PTY_v16u8,
+                                            1, TyIdx(0), INTRN_vector_from_scalar_v16u8, false);
+  dupValMeExpr->PushOpnd(fillValMeExpr);
+  MeStmt *dupRegAssignMeStmt = func.GetIRMap()->CreateAssignMeStmt(*dupRegMeExpr, *dupValMeExpr, *memsetCallStmt->GetBB());
+  memsetCallStmt->GetBB()->InsertMeStmtBefore(memsetCallStmt, dupRegAssignMeStmt);
+
+  for (int32 i = 0; i < numOf16Byte; i++) {
+    IassignMeStmt *xIassignStmt = genSimdIassign(16 * i, tmpIvar, *dupRegMeExpr, *memsetCallStmtChi,
+                                                 v16u8PtrType->GetTypeIndex());
+    memsetCallStmt->GetBB()->InsertMeStmtBefore(memsetCallStmt, xIassignStmt);
+  }
+
+  if (numOf8Byte != 0) {
+    MIRType *v8u8MirType = GlobalTables::GetTypeTable().GetV8UInt8();
+    MIRType *v8u8PtrType = GlobalTables::GetTypeTable().GetOrCreatePointerType(*v8u8MirType, PTY_ptr);
+    IvarMeExpr tmpIvar8(kInvalidExprID, PTY_v8u8, v8u8PtrType->GetTypeIndex(), 0);
+    tmpIvar8.SetBase(dstMeExpr);
+
+    // Consider Reuse of dstMeExpr ?
+    // RegMeExpr *dupRegMeExpr = static_cast<RegMeExpr*>(func.GetIRMap()->CreateMeExprTypeCvt(PTY_v8u8, PTY_v16u8, *dstMeExpr));
+    RegMeExpr *dup8RegMeExpr = func.GetIRMap()->CreateRegMeExpr(PTY_v8u8);
+    NaryMeExpr *dup8ValMeExpr = new NaryMeExpr(&func.GetIRMap()->GetIRMapAlloc(), kInvalidExprID, OP_intrinsicop, PTY_v8u8,
+                                               1, TyIdx(0), INTRN_vector_from_scalar_v8u8, false);
+    dup8ValMeExpr->PushOpnd(fillValMeExpr);
+    MeStmt *dup8RegAssignMeStmt = func.GetIRMap()->CreateAssignMeStmt(*dup8RegMeExpr, *dup8ValMeExpr, *memsetCallStmt->GetBB());
+    memsetCallStmt->GetBB()->InsertMeStmtBefore(memsetCallStmt, dup8RegAssignMeStmt);
+    IassignMeStmt *xIassignStmt = genSimdIassign(offset8Byte, tmpIvar8, *dup8RegMeExpr, *memsetCallStmtChi,
+                                                 v8u8PtrType->GetTypeIndex());
+    memsetCallStmt->GetBB()->InsertMeStmtBefore(memsetCallStmt, xIassignStmt);
+  }
+
+  // Remove memset stmt
+  if (numOf8Byte != 0 || numOf16Byte != 0) {
+    BB *bb = memsetCallStmt->GetBB();
+    bb->RemoveMeStmt(memsetCallStmt);
+  }
+}
+
 // Merge assigns on consecutive struct fields into one assignoff
+// Or Simdize memset/memcpy
 void MergeStmts::MergeMeStmts() {
   auto layoutBBs = func.GetLaidOutBBs();
 
@@ -253,8 +435,8 @@ void MergeStmts::MergeMeStmts() {
 
     // Identify consecutive (I/D)assign stmts
     // Candiates of (I/D)assignment are grouped together and saparated by nullptr
-    auto &meStmts = bb->GetMeStmts();
-    for (auto &meStmt : meStmts) {
+    MeStmts &meStmts = bb->GetMeStmts();
+    for (MeStmt &meStmt : meStmts) {
       Opcode op = meStmt.GetOp();
       switch (op) {
         case OP_iassign: {
@@ -325,6 +507,21 @@ void MergeStmts::MergeMeStmts() {
           }
           break;
         }
+        // Simdize intrinsic. SIMD should really be handled in CG
+        case OP_intrinsiccall: {
+          // An intrinsic call must still end the current run of merge candidates, as the default case does
+          candidateStmts.push(nullptr);
+          IntrinsiccallMeStmt *intrinsicCallStmt = static_cast<IntrinsiccallMeStmt*>(&meStmt);
+          MIRIntrinsicID intrinsicCallID = intrinsicCallStmt->GetIntrinsic();
+          if (intrinsicCallID == INTRN_C_memcpy) {
+            simdMemcpy(intrinsicCallStmt);
+          } else if (intrinsicCallID == INTRN_C_memset) {
+            simdMemset(intrinsicCallStmt);
+          } else {
+            // More to come
+          }
+          break;
+        }
         default: {
           candidateStmts.push(nullptr);
           break;
-- 
Gitee

From d35f2874d9354200fe7f81428e7e7f96b824cd75 Mon Sep 17 00:00:00 2001
From: fye
Date: Fri, 15 Oct 2021 18:06:20 -0700
Subject: [PATCH 2/2] ME: simdization on memcpy and memset - avoid using offset for mem operand in ldr for q reg

---
 src/mapleall/maple_be/src/cg/aarch64/aarch64_peep.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/mapleall/maple_be/src/cg/aarch64/aarch64_peep.cpp b/src/mapleall/maple_be/src/cg/aarch64/aarch64_peep.cpp
index 9565e08864..2ddde4aed3 100644
--- a/src/mapleall/maple_be/src/cg/aarch64/aarch64_peep.cpp
+++ b/src/mapleall/maple_be/src/cg/aarch64/aarch64_peep.cpp
@@ -2585,6 +2585,11 @@ void ComplexMemOperandAArch64::Run(BB &bb, Insn &insn) {
 
   auto &regOpnd = static_cast<RegOperand&>(insn.GetOperand(kInsnFirstOpnd));
 
+  /* Avoid linking issues when object is not 16byte aligned */
+  if (memOpnd->GetSize() == k128BitSize) {
+    return;
+  }
+
   /* Check if dest operand of insn is idential with base register of nextInsn. */
   if (memOpnd->GetBaseRegister() != &regOpnd) {
     return;
-- 
Gitee