From 116b486cfc1fca5ad43193f5c475175dcc28497b Mon Sep 17 00:00:00 2001 From: Yarovoy Danil WX1195294 Date: Tue, 2 May 2023 13:18:45 +0300 Subject: [PATCH 1/7] CRC32 pattern matching. --- gcc/combine.c | 282 ++++++++++++++++++++- gcc/common.opt | 4 + gcc/config/aarch64/aarch64.c | 5 + gcc/doc/tm.texi | 4 + gcc/doc/tm.texi.in | 2 + gcc/target.def | 6 + gcc/testsuite/gcc.target/aarch64/crc32-1.c | 110 ++++++++ gcc/testsuite/gcc.target/aarch64/crc32-2.c | 97 +++++++ 8 files changed, 509 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/crc32-1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/crc32-2.c diff --git a/gcc/combine.c b/gcc/combine.c index 497e53289ca..afa4787292d 100644 --- a/gcc/combine.c +++ b/gcc/combine.c @@ -1,5 +1,5 @@ /* Optimize by combining instructions for GNU compiler. - Copyright (C) 1987-2020 Free Software Foundation, Inc. + Copyright (C) 1987-2023 Free Software Foundation, Inc. This file is part of GCC. @@ -2651,6 +2651,279 @@ count_auto_inc (rtx, rtx, rtx, rtx, rtx, void *arg) return 0; } +struct crc32_combine_info +{ + rtx dest; + rtx src1; + rtx src2; +}; + +static unsigned int +calc_crc32 (unsigned char c) +{ + int i; + unsigned int POLY = 0xedb88320; + unsigned int crc = c; + for (i = 0; i < 8; i++) + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + return crc; +} + +static rtx_insn * +replace_32bit_case (rtx dest, rtx src1, rtx src2, rtx_insn *loc) +{ + gcc_assert (targetm.gen_crc32b); + + rtx pattern = targetm.gen_crc32b (dest, src1, src2); + + start_sequence (); + rtx_insn *crc32 = emit_insn (pattern); + + if (recog_memoized (crc32) < 0) + { + end_sequence (); + return 0; + } + + rtx_insn *seq = get_insns (); + end_sequence (); + emit_insn_after (seq, loc); + + return NEXT_INSN (crc32); +} + +/* We try to replace + dest = crc32_table[(src1 ^ src2) & 0xff] ^ (src1 >> 8); + with one crc32 instruction. 
If src1 has 64 bit size we cannot + just replace with the instruction because crc32 has 32 bit input + and 32 bit output. To save user code behavior we need to insert + some bitwise operations in case of (src1 > 0xffffffff). */ + +static rtx_insn * +replace_64bit_case (rtx dest, rtx src1, rtx src2, rtx_insn *loc) +{ + gcc_assert (targetm.gen_crc32b); + + rtx reg1 = gen_reg_rtx (DImode); + rtx reg2 = gen_reg_rtx (DImode); + rtx reg3 = gen_reg_rtx (DImode); + + auto_vec patterns; + + const unsigned int crc32_shift_imm = 8; + rtx imm = gen_rtx_CONST_INT (DImode, crc32_shift_imm); + patterns.safe_push (gen_rtx_SET (reg1, + gen_rtx_LSHIFTRT (DImode, + gen_lowpart (DImode, src1), + imm))); + + rtx mask = gen_rtx_CONST_INT (DImode, 0xffffffff000000); + patterns.safe_push (gen_rtx_SET (reg2, + gen_rtx_AND (DImode, reg1, mask))); + + patterns.safe_push (targetm.gen_crc32b (gen_lowpart (SImode, reg3), + src1, src2)); + + patterns.safe_push (gen_rtx_SET (gen_lowpart (DImode, dest), + gen_rtx_XOR (DImode, reg3, reg2))); + + rtx_insn *insn; + rtx pattern; + int i; + + start_sequence (); + FOR_EACH_VEC_ELT (patterns, i, pattern) + { + insn = emit_insn (pattern); + if (recog_memoized (insn) < 0) + { + end_sequence (); + return 0; + } + } + + rtx_insn *seq = get_insns (); + end_sequence (); + emit_insn_after (seq, loc); + + return NEXT_INSN (insn); +} + +static bool +crc32_table_ref (rtx mem, rtx idx) +{ + rtx addr = XEXP (mem, 0); + if (GET_CODE (addr) != PLUS) + return false; + + struct address_info info; + decompose_mem_address (&info, mem); + + rtx base = *info.base; + rtx offset = *info.index; + + /* Check if base is reg which holds anchor address. 
*/ + if (!REG_P (base)) + return false; + + /* Check if offset = idx * sizeof (crc_table[0]) */ + int size = GET_MODE_SIZE (GET_MODE (mem)).to_constant (); + if (GET_CODE (offset) != MULT + || !REG_P (XEXP (offset, 0)) + || !reg_overlap_mentioned_p (idx, offset) + || GET_MODE (XEXP (offset, 0)) != GET_MODE (idx) + || !CONST_INT_P (XEXP (offset, 1)) + || INTVAL (XEXP (offset, 1)) != size) + return false; + + tree expr = MEM_EXPR (mem); + if (!expr || TREE_CODE (expr) != ARRAY_REF) + return false; + + tree decl = TREE_OPERAND (expr, 0); + if (!decl || !DECL_P (decl) || !TREE_READONLY (decl)) + return false; + + const unsigned int crc_table_nelts = 256; + tree ctor = DECL_INITIAL (decl); + if (!ctor || CONSTRUCTOR_NELTS (ctor) != crc_table_nelts) + return false; + + unsigned int ix; + tree val; + FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (ctor), ix, val) + { + unsigned HOST_WIDE_INT ival = TREE_INT_CST_LOW (val); + unsigned HOST_WIDE_INT crc = calc_crc32 (ix); + if (ival != crc) + return false; + } + return true; +} + +/* Check pattern: + dest = crc32_table[(src1 ^ src2) & 0xff] ^ (src1 >> 8); */ + +static bool +crc32_pattern (rtx i3, rtx i2, rtx i1, rtx i0, struct crc32_combine_info *info) +{ + /* Check that i0: { i0dest = op1 ^ op2 } */ + rtx i0src = SET_SRC (i0); + rtx i0dest = SET_DEST (i0); + if (GET_CODE (i0src) != XOR || !REG_P (i0dest)) + return false; + + rtx i0op1 = XEXP (i0src, 0); + rtx i0op2 = XEXP (i0src, 1); + if ((!REG_P (i0op1) && !SUBREG_P (i0op1)) + || (!REG_P (i0op2) && !SUBREG_P (i0op2))) + return false; + + /* Check that i1: { i1dest = zero_extend ((unsigned char)i0dest) } */ + rtx i1src = SET_SRC (i1); + rtx i1dest = SET_DEST (i1); + if (!REG_P (i1dest) + || GET_CODE (i1src) != ZERO_EXTEND + || !SUBREG_P (XEXP (i1src, 0)) + || GET_MODE (XEXP (i1src, 0)) != QImode + || !reg_overlap_mentioned_p (i0dest, i1src)) + return false; + + /* Check that i2: { i2dest = crc32_table[idest1] } */ + rtx i2src = SET_SRC (i2); + rtx i2dest = SET_DEST (i2); + 
if (!REG_P (i2dest)) + return false; + + rtx mem; + if (GET_CODE (i2src) == MEM) + mem = i2src; + else if (GET_CODE (i2src) == ZERO_EXTEND + && GET_CODE (XEXP (i2src, 0)) == MEM) + mem = XEXP (i2src, 0); + else + mem = NULL_RTX; + + if (!mem || !crc32_table_ref (mem, i1dest)) + return false; + + /* Check that i3: { i3dest = i2dest ^ (op[1|2] >> 8) } */ + rtx i3src = SET_SRC (i3); + rtx i3dest = SET_DEST (i3); + + if (GET_MODE (i3dest) != DImode + && GET_MODE (i3dest) != SImode) + return false; + + if (!REG_P (i3dest) || GET_CODE (i3src) != XOR + || GET_CODE (XEXP (i3src, 0)) != LSHIFTRT) + return false; + + rtx i3op1 = XEXP (XEXP (i3src, 0), 0); + rtx i3op2 = XEXP (XEXP (i3src, 0), 1); + rtx i3op3 = XEXP (i3src, 1); + if (!REG_P (i3op1) + || (!REG_P (i3op3) && !SUBREG_P (i3op3)) + || !CONST_INT_P (i3op2) || INTVAL (i3op2) != 8 + || !reg_overlap_mentioned_p (i2dest, i3op3)) + return false; + + bool is_op1 = reg_overlap_mentioned_p (i0op1, i3op1); + bool is_op2 = reg_overlap_mentioned_p (i0op2, i3op1); + if (!is_op1 && !is_op2) + return false; + + info->dest = i3dest; + info->src1 = is_op1 ? i0op1 : i0op2; + info->src2 = is_op1 ? 
i0op2 : i0op1; + + return true; +} + +static rtx_insn* +try_combine_crc32 (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0) +{ + struct crc32_combine_info info; + + rtx i3set = single_set (i3); + rtx i2set = single_set (i2); + rtx i1set = single_set (i1); + rtx i0set = single_set (i0); + + if (!i3set || !i2set || !i1set || !i0set) + return 0; + + if (!crc32_pattern (i3set, i2set, i1set, i0set, &info)) + return 0; + + if (!dead_or_set_p (i1, SET_DEST (i0set)) + || !dead_or_set_p (i2, SET_DEST (i1set)) + || !dead_or_set_p (i3, SET_DEST (i2set))) + return 0; + + rtx dest = gen_lowpart (SImode, info.dest); + rtx src1 = gen_lowpart (SImode, info.src1); + rtx src2 = gen_lowpart (QImode, info.src2); + + rtx_insn *next = NULL; + if (GET_MODE (info.dest) == DImode) + next = replace_64bit_case (dest, src1, src2, i3); + else if (GET_MODE (info.dest) == SImode) + next = replace_32bit_case (dest, src1, src2, i3); + else + gcc_unreachable (); + + if (!next) + return 0; + + SET_INSN_DELETED (i0); + SET_INSN_DELETED (i1); + SET_INSN_DELETED (i2); + SET_INSN_DELETED (i3); + + return next; +} + /* Try to combine the insns I0, I1 and I2 into I3. Here I0, I1 and I2 appear earlier than I3. I0 and I1 can be zero; then we combine just I2 into I3, or I1 and I2 into @@ -2742,6 +3015,13 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0, int nshift = 0; rtx set0, set3; + if (flag_crypto_accel && targetm.gen_crc32b) + { + rtx_insn *next_comb = try_combine_crc32 (i3, i2, i1, i0); + if (next_comb) + return next_comb; + } + if (!flag_expensive_optimizations) return 0; diff --git a/gcc/common.opt b/gcc/common.opt index 6f0ed7cea59..2d24f3494f9 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1060,6 +1060,10 @@ fasynchronous-unwind-tables Common Report Var(flag_asynchronous_unwind_tables) Optimization Generate unwind tables that are exact at each instruction boundary. 
+fcrypto-accel +Common Report Var(flag_crypto_accel) Init(0) Optimization +Perform crypto acceleration pattern matching. + farray-widen-compare Common Report Var(flag_array_widen_compare) Optimization Extends types for pointers to arrays to improve array comparsion performance. diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index cbdde11b07b..2e08a63a3ef 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -23827,6 +23827,11 @@ aarch64_run_selftests (void) #endif /* #if CHECKING_P */ +#ifdef TARGET_CRC32 +#undef TARGET_GEN_CRC32B +#define TARGET_GEN_CRC32B gen_aarch64_crc32b +#endif + #undef TARGET_STACK_PROTECT_GUARD #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 0508fce57a7..d98e6c5d7c6 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -11840,6 +11840,10 @@ object files that are not referenced from @code{main} and uses export lists. @end defmac +@deftypefn {Target Hook} rtx TARGET_GEN_CRC32B (rtx @var{dest}, rtx @var{src1}, rtx @var{src2}) +This function generate the crc32 instruction if target supports this. +@end deftypefn + @deftypefn {Target Hook} bool TARGET_CANNOT_MODIFY_JUMPS_P (void) This target hook returns @code{true} past the point in which new jump instructions could be created. On machines that require a register for diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 3b70ea4841a..c26729404d6 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -8002,6 +8002,8 @@ object files that are not referenced from @code{main} and uses export lists. 
@end defmac +@hook TARGET_GEN_CRC32B + @hook TARGET_CANNOT_MODIFY_JUMPS_P @hook TARGET_HAVE_CONDITIONAL_EXECUTION diff --git a/gcc/target.def b/gcc/target.def index 2020564118b..bfc5e4e59c9 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -2682,6 +2682,12 @@ modes and they have different conditional execution capability, such as ARM.", bool, (void), default_have_conditional_execution) +DEFHOOK +(gen_crc32b, + "This function generate the crc32 instruction if target supports this.", + rtx, (rtx dest, rtx src1, rtx src2), + NULL) + DEFHOOK (gen_ccmp_first, "This function prepares to emit a comparison insn for the first compare in a\n\ diff --git a/gcc/testsuite/gcc.target/aarch64/crc32-1.c b/gcc/testsuite/gcc.target/aarch64/crc32-1.c new file mode 100644 index 00000000000..2719af3f3e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/crc32-1.c @@ -0,0 +1,110 @@ +/* { dg-do run } */ +/* { dg-options "-O3 -fno-inline --save-temps -fcrypto-accel -march=armv8.2-a" } */ + +#include + +static const unsigned long crc32_tab[] = { + 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, + 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, + 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, + 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, + 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, + 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, + 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, + 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, + 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, + 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, + 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, + 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, + 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, + 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, + 0x9fbfe4a5L, 
0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, + 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, + 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, + 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, + 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, + 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, + 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, + 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, + 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, + 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, + 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, + 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, + 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, + 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, + 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, + 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, + 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, + 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, + 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, + 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, + 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, + 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, + 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, + 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, + 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, + 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, + 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, + 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, + 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, + 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, + 
0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, + 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, + 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, + 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, + 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, + 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, + 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, + 0x2d02ef8dL +}; + +unsigned long +long_calc_crc_1 (unsigned char c, unsigned long init) +{ + unsigned long crc = init; + crc = crc32_tab[((int)crc ^ c) & 0xff] ^ (crc >> 8); + return crc; +} + +unsigned long +long_calc_crc_2 (unsigned char c, unsigned long init) +{ + unsigned long crc = init; + crc = crc32_tab[(crc ^ c) & 0xff] ^ (crc >> 8); + return crc; +} + + +unsigned int +int_calc_crc (unsigned char c, unsigned int init) +{ + unsigned int crc = init; + crc = crc32_tab[(crc ^ c) & 0xff] ^ (crc >> 8); + return crc; +} + +int +main (int argc, char **argv) +{ + unsigned int ans = 0x6722b533; + unsigned long crc1 = long_calc_crc_1 (100, 0xffffffff); + if (crc1 != ans) + abort(); + + unsigned long overflow_ans = 0x1234561f22b533; + unsigned long crc2 = long_calc_crc_1 (100, 0x12345678ffffffff); + if (crc2 != overflow_ans) + abort(); + + unsigned long crc3 = long_calc_crc_2 (100, 0xffffffff); + if (crc3 != ans) + abort(); + + unsigned int crc4 = int_calc_crc (100, 0xffffffff); + if (crc4 != ans) + abort(); + + return 0; +} + +/* { dg-final { scan-assembler-times "crc32b" 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/crc32-2.c b/gcc/testsuite/gcc.target/aarch64/crc32-2.c new file mode 100644 index 00000000000..99bd9037f9e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/crc32-2.c @@ -0,0 +1,97 @@ +/* { dg-do run } */ +/* { dg-options "-O3 -fno-inline -fcrypto-accel --save-temps -march=armv8.2-a" } */ + +#include + +static const unsigned crc32_tab[] = { + 0x00000000, 0x77073096, 0xee0e612c, 
0x990951ba, 0x076dc419, + 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, + 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, + 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, + 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856, + 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, + 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, + 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, + 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a, + 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, + 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, + 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, + 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e, + 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, + 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, + 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, + 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, + 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, + 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, + 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010, + 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, + 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, + 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, + 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, + 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344, + 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, + 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, + 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 
0xd1bb67f1, + 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c, + 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, + 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, + 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, + 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c, + 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, + 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, + 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, + 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, + 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278, + 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, + 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66, + 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, + 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, + 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, + 0x2d02ef8d +}; + +unsigned long +long_calc_crc (unsigned char c, unsigned long init) +{ + unsigned long crc = init; + crc = crc32_tab[(crc ^ c) & 0xff] ^ (crc >> 8); + return crc; +} + +unsigned int +int_calc_crc (unsigned char c, unsigned int init) +{ + unsigned int crc = init; + crc = crc32_tab[(crc ^ c) & 0xff] ^ (crc >> 8); + return crc; +} + +int +main (int argc, char **argv) +{ + unsigned int ans = 0x6722b533; + unsigned long crc1 = long_calc_crc (100, 0xffffffff); + if (crc1 != ans) + abort(); + + unsigned long overflow_ans = 0x1234561f22b533; + unsigned long crc2 = long_calc_crc (100, 0x12345678ffffffff); + if (crc2 != overflow_ans) + abort(); + + unsigned int crc3 = int_calc_crc (100, 0xffffffff); + if (crc3 != ans) + abort(); + + return 0; +} + +/* { dg-final { scan-assembler-times "crc32b" 2 } } */ -- Gitee From 
7331308268a780dcc50528cdba5bd27584f0df9e Mon Sep 17 00:00:00 2001 From: Yarovoy Danil Date: Wed, 2 Aug 2023 20:50:24 +0800 Subject: [PATCH 2/7] Add special case in crc32 pattern matching --- gcc/combine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/combine.c b/gcc/combine.c index afa4787292d..0f3c0955c99 100644 --- a/gcc/combine.c +++ b/gcc/combine.c @@ -2862,7 +2862,7 @@ crc32_pattern (rtx i3, rtx i2, rtx i1, rtx i0, struct crc32_combine_info *info) rtx i3op1 = XEXP (XEXP (i3src, 0), 0); rtx i3op2 = XEXP (XEXP (i3src, 0), 1); rtx i3op3 = XEXP (i3src, 1); - if (!REG_P (i3op1) + if (!REG_P (i3op1) && !SUBREG_P (i3op1) || (!REG_P (i3op3) && !SUBREG_P (i3op3)) || !CONST_INT_P (i3op2) || INTVAL (i3op2) != 8 || !reg_overlap_mentioned_p (i2dest, i3op3)) -- Gitee From 0f0c9f0ca7d00942c1a9b7dfeb21e6880eb05f60 Mon Sep 17 00:00:00 2001 From: Yarovoy Danil WX1195294 Date: Fri, 11 Aug 2023 15:32:39 +0300 Subject: [PATCH 3/7] Implement value range analysis enhancement. --- gcc/common.opt | 4 + gcc/gimple-ssa-evrp-analyze.c | 344 +++++++++++++++++++++++++++++- gcc/gimple-ssa-evrp-analyze.h | 18 ++ gcc/gimple-ssa-evrp.c | 14 ++ gcc/testsuite/gcc.dg/evrp-mem-1.c | 24 +++ gcc/testsuite/gcc.dg/evrp-mem-2.c | 36 ++++ gcc/tree-vrp.c | 58 ++++- gcc/tree-vrp.h | 1 + gcc/vr-values.c | 84 ++++++++ gcc/vr-values.h | 5 + 10 files changed, 586 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/evrp-mem-1.c create mode 100644 gcc/testsuite/gcc.dg/evrp-mem-2.c diff --git a/gcc/common.opt b/gcc/common.opt index 2d24f3494f9..2ed6fa6ddab 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1064,6 +1064,10 @@ fcrypto-accel Common Report Var(flag_crypto_accel) Init(0) Optimization Perform crypto acceleration pattern matching. +fvrp-mem +Common Report Var(flag_vrp_mem) Init(0) Optimization +Perform value range analysis for static storages. 
+ farray-widen-compare Common Report Var(flag_array_widen_compare) Optimization Extends types for pointers to arrays to improve array comparsion performance. diff --git a/gcc/gimple-ssa-evrp-analyze.c b/gcc/gimple-ssa-evrp-analyze.c index 9f8ce5575a2..fff2a8293ad 100644 --- a/gcc/gimple-ssa-evrp-analyze.c +++ b/gcc/gimple-ssa-evrp-analyze.c @@ -284,6 +284,36 @@ evrp_range_analyzer::record_ranges_from_phis (basic_block bb) } } +/* Select candidates for static var assignment. + Candidates are integer assignment + from constant static variables or arrays + Floating point types are unsupported. */ +static bool +static_var_ref_vrp_candidate (gimple *stmt) +{ + if (!gimple_vuse (stmt) + || gimple_code (stmt) != GIMPLE_ASSIGN) + return false; + + tree lhs = gimple_get_lhs (stmt); + tree rhs = gimple_assign_rhs1 (stmt); + tree type = TREE_TYPE (lhs); + + if (TREE_CODE (type) != INTEGER_TYPE + || !TYPE_UNSIGNED (type)) + return false; + + if (!rhs || !VAR_P (rhs) || !DECL_P (rhs) + || !TREE_STATIC (rhs)) + return false; + + tree init = DECL_INITIAL (rhs); + if (!init || TREE_CODE (init) != INTEGER_CST) + return false; + + return true; +} + /* Record ranges from STMT into our VR_VALUES class. If TEMPORARY is true, then this is a temporary equivalence and should be recorded into the unwind table. Othewise record the equivalence into the @@ -343,7 +373,12 @@ evrp_range_analyzer::record_ranges_from_stmt (gimple *stmt, bool temporary) vr_values->set_defs_to_varying (stmt); } else - vr_values->set_defs_to_varying (stmt); + { + if (flag_vrp_mem && m_update_global_ranges + && static_var_ref_vrp_candidate (stmt)) + visit_assignment_static_var (stmt); + vr_values->set_defs_to_varying (stmt); + } /* See if we can derive a range for any of STMT's operands. */ tree op; @@ -394,6 +429,299 @@ evrp_range_analyzer::record_ranges_from_stmt (gimple *stmt, bool temporary) } } +/* Analyze static variable reference. 
Check if its initial value is fitted + into smaller type and there is only one store to this static storage. + Save the appropriate static variable for the futher analysis. */ + +void +evrp_range_analyzer::visit_assignment_static_var (gimple *stmt) +{ + gimple *store = NULL; + tree vuse = gimple_vuse (stmt); + tree lhs = gimple_get_lhs (stmt); + tree rhs = gimple_assign_rhs1 (stmt); + + imm_use_iterator it; + use_operand_p use_p; + FOR_EACH_IMM_USE_FAST (use_p, it, vuse) + { + gimple *use = USE_STMT (use_p); + if (gimple_code (use) != GIMPLE_ASSIGN) + continue; + + tree use_lhs = gimple_get_lhs (use); + tree use_rhs = gimple_assign_rhs1 (use); + + if (use_lhs && use_rhs && VAR_P (use_lhs) + && TREE_STATIC (use_lhs) && use_lhs == rhs) + { + if (store) + return; + store = use; + } + } + + if (!store) + return; + + tree init = DECL_INITIAL (rhs); + tree min_type = get_min_fitted_type (init); + tree type = TREE_TYPE (lhs); + if (!min_type + || TYPE_PRECISION (min_type) >= TYPE_PRECISION (type)) + return; + + struct static_var var = {stmt, store, min_type}; + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Found static variable for refining: "); + print_gimple_stmt (dump_file, stmt, 0); + } + + refine_candidates.safe_push (var); +} + +/* Collect all uses of static variable and check if it is possible + to refine their value range info. */ + +bool +evrp_range_analyzer::may_refine (gimple *stmt, + tree type, + auto_vec &may_be_refined) +{ + imm_use_iterator it; + use_operand_p use_p; + + hash_set visited; + hash_set uses_set; + hash_set phi_args; + + /* BFS traversal to find all dependent uses. */ + unsigned vc = 0; + tree cur = gimple_get_lhs (stmt); + while (true) + { + uses_set.add (cur); + gimple *def = SSA_NAME_DEF_STMT (cur); + + gimple_set_visited (def, false); + + /* If use is phi then save all phi args. 
*/ + if (gimple_code (def) == GIMPLE_PHI) + { + for (unsigned i = 0; i < gimple_phi_num_args (def); i++) + { + tree arg = gimple_phi_arg_def (def, i); + phi_args.add (arg); + } + } + + FOR_EACH_IMM_USE_FAST (use_p, it, cur) + { + gimple *use = USE_STMT (use_p); + tree lhs = gimple_get_lhs (use); + + if (!lhs || is_gimple_debug (use) + || !stmt_interesting_for_vrp (use) + || gimple_vuse (use) + || visited.add (use)) + continue; + + may_be_refined.safe_push (lhs); + } + + if (vc == may_be_refined.length ()) + break; + cur = may_be_refined[vc++]; + } + + tree max = TYPE_MAX_VALUE (type); + + /* Check if all phi args are met in refining candidates. + Otherwise we cannot guarantee that after refining we have + valid value ranges. */ + for (hash_set::iterator it = phi_args.begin (), + end = phi_args.end (); + it != end; ++it) + { + if (TREE_CODE (*it) == SSA_NAME + && !uses_set.contains (*it)) + return false; + if (TREE_CODE (*it) == INTEGER_CST + && tree_int_cst_compare (*it, max) == 1) + return false; + } + + return true; +} + +/* If phi function was faced it has to be processed separetely + because in term phi (arg1, arg2) + arg1/arg2 could have different ranges + so result range has to be the union of both ranges. 
*/ +void +evrp_range_analyzer::refine_phi (gimple *phi, + value_range_equiv *vr) +{ + bool first = true; + tree lhs = PHI_RESULT (phi); + for (unsigned i = 0; i < gimple_phi_num_args (phi); i++) + { + tree arg = gimple_phi_arg_def (phi, i); + if (TREE_CODE (arg) == SSA_NAME + && gimple_visited_p (SSA_NAME_DEF_STMT (arg))) + { + if (first) + vr->deep_copy (get_value_range (arg)); + else + vr->union_ (get_value_range (arg)); + first = false; + } + else + to_validate.safe_push (std::make_pair (lhs, arg)); + } +} + +bool +evrp_range_analyzer::validate_after_refining (gimple *store, tree type) +{ + bool valid = true; + while (!to_validate.is_empty ()) + { + std::pair v = to_validate.pop (); + if (!valid) + continue; + + value_range_equiv tem; + const value_range_equiv *vr1 = get_value_range (v.first); + if (TREE_CODE (v.second) != SSA_NAME) + continue; + const value_range_equiv *vr2 = get_value_range (v.second); + + tem.deep_copy (vr1); + if (vr2) + tem.union_ (vr2); + if (!vr1->equal_p (tem, true)) + valid = false; + } + if (!valid) + return false; + + tree max = TYPE_MAX_VALUE (type); + tree rhs = gimple_assign_rhs1 (store); + if (TREE_CODE (rhs) == SSA_NAME) + { + const value_range_equiv *vr = get_value_range (rhs); + if (!vr->constant_p () + || tree_int_cst_compare (vr->max (), max) == 1) + valid = false; + } + else if (TREE_CODE (rhs) == INTEGER_CST) + { + if (tree_int_cst_compare (rhs, max) == 1) + valid = false; + } + else + valid = false; + + return valid; +} + +/* Process each static varibale + and refine range for its users. 
*/ +bool +evrp_range_analyzer::refine_range (struct static_var svar) +{ + int i; + tree var; + hash_set uses_set; + auto_vec may_be_refined; + auto_vec > to_update; + + gimple *load = svar.load; + gimple *store = svar.store; + tree min_type = svar.min_type; + + if (!may_refine (load, min_type, may_be_refined)) + return false; + + push_marker (); + gimple_set_visited (load, true); + + tree lhs = gimple_get_lhs (load); + value_range_equiv *new_vr = vr_values->allocate_value_range_equiv (); + vr_values->set_range_with_type (TREE_TYPE (lhs), min_type, new_vr); + + to_update.safe_push (std::make_pair (lhs, new_vr)); + push_value_range (lhs, new_vr); + + FOR_EACH_VEC_ELT (may_be_refined, i, var) + { + edge taken_edge; + tree output; + + gimple *def = SSA_NAME_DEF_STMT (var); + value_range_equiv *new_vr = vr_values->allocate_value_range_equiv (); + if (gimple_code (def) != GIMPLE_PHI) + { + vr_values->extract_range_from_stmt (def, &taken_edge, + &output, new_vr); + to_update.safe_push (std::make_pair (var, new_vr)); + } + else + refine_phi (def, new_vr); + + push_value_range (var, new_vr); + gimple_set_visited (def, true); + } + + if (validate_after_refining (store, min_type)) + { + for (unsigned i = 0; i < to_update.length (); i++) + set_ssa_range_info (to_update[i].first, to_update[i].second); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nSuccessful refining for static variable: "); + print_gimple_stmt (dump_file, load, 0); + } + + unwind_to_marker (); + } + else + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nFailed refining for static variable: "); + print_gimple_stmt (dump_file, load, 0); + } + + pop_to_marker (); + } + + return true; +} + +/* Main function for ranges refining. 
*/ +bool +evrp_range_analyzer::refine_ranges () +{ + if (!m_update_global_ranges) + return false; + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "\nNumber of refining candidates: %u\n", + refine_candidates.length ()); + } + bool refined = false; + for (unsigned i = 0; i < refine_candidates.length (); i++) + refined |= refine_range (refine_candidates[i]); + + return refined; +} + /* Unwind recorded ranges to their most recent state. */ void @@ -405,6 +733,20 @@ evrp_range_analyzer::pop_to_marker (void) stack.pop (); } +/* Unwind ranges without recovering recent state. */ + +void +evrp_range_analyzer::unwind_to_marker (void) +{ + gcc_checking_assert (!stack.is_empty ()); + while (stack.last ().first != NULL_TREE) + { + std::pair e = stack.pop (); + vr_values->free_value_range (e.second); + } + stack.pop (); +} + /* Restore/pop VRs valid only for BB when we leave BB. */ void diff --git a/gcc/gimple-ssa-evrp-analyze.h b/gcc/gimple-ssa-evrp-analyze.h index d16279f89b9..42c5a263076 100644 --- a/gcc/gimple-ssa-evrp-analyze.h +++ b/gcc/gimple-ssa-evrp-analyze.h @@ -20,6 +20,13 @@ along with GCC; see the file COPYING3. If not see #ifndef GCC_GIMPLE_SSA_EVRP_ANALYZE_H #define GCC_GIMPLE_SSA_EVRP_ANALYZE_H +struct static_var +{ + gimple *load; + gimple *store; + tree min_type; +}; + class evrp_range_analyzer { public: @@ -33,9 +40,11 @@ class evrp_range_analyzer void enter (basic_block); void push_marker (void); void pop_to_marker (void); + void unwind_to_marker (void); void leave (basic_block); void record_ranges_from_stmt (gimple *, bool); + bool refine_ranges (); /* Main interface to retrieve range information. 
*/ const value_range_equiv *get_value_range (const_tree op) { return vr_values->get_value_range (op); } @@ -62,6 +71,7 @@ class evrp_range_analyzer DISABLE_COPY_AND_ASSIGN (evrp_range_analyzer); class vr_values *vr_values; + bool may_refine (gimple *, tree, auto_vec &); void pop_value_range (); value_range_equiv *try_find_new_range (tree, tree op, tree_code code, tree limit); @@ -69,9 +79,17 @@ class evrp_range_analyzer void record_ranges_from_phis (basic_block); void set_ssa_range_info (tree, value_range_equiv *); + void visit_assignment_static_var (gimple *); + bool refine_range (struct static_var); + void refine_phi (gimple *, value_range_equiv *); + bool validate_after_refining (gimple *, tree); + /* STACK holds the old VR. */ auto_vec > stack; + auto_vec refine_candidates; + auto_vec > to_validate; + /* True if we are updating global ranges, false otherwise. */ bool m_update_global_ranges; }; diff --git a/gcc/gimple-ssa-evrp.c b/gcc/gimple-ssa-evrp.c index 599e1459f00..3dd0aa20a7d 100644 --- a/gcc/gimple-ssa-evrp.c +++ b/gcc/gimple-ssa-evrp.c @@ -273,6 +273,20 @@ evrp_dom_walker::cleanup (void) fprintf (dump_file, "\n"); } + if (flag_vrp_mem) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\nTry to refine values ranges"); + + bool refined = evrp_range_analyzer.refine_ranges (); + + if (dump_file && (dump_flags & TDF_DETAILS) && refined) + { + fprintf (dump_file, "\nValue ranges after refining:\n\n"); + evrp_range_analyzer.dump_all_value_ranges (dump_file); + fprintf (dump_file, "\n"); + } + } /* Remove stmts in reverse order to make debug stmt creation possible. */ while (! 
stmts_to_remove.is_empty ()) { diff --git a/gcc/testsuite/gcc.dg/evrp-mem-1.c b/gcc/testsuite/gcc.dg/evrp-mem-1.c new file mode 100644 index 00000000000..c62ec6c4e09 --- /dev/null +++ b/gcc/testsuite/gcc.dg/evrp-mem-1.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fdump-tree-evrp-all -fvrp-mem" } */ + +const static unsigned long long arr[] = {0x0 , 0x88888888, 0xffffffff }; +const static unsigned long long overflow_arr[] = {0x0 , 0xfffffffff , 0xffffffffffffffff }; +static unsigned long long non_const_arr[] = {0x0 , 0x88888888, 0xffffffff }; + +unsigned long long test1 (int i) +{ + return arr[i]; +} + +unsigned long long test2 (int i) +{ + return overflow_arr[i]; +} + +unsigned long long test3 (int i) +{ + return non_const_arr[i]; +} + +/* { dg-final { scan-tree-dump-times "Const array vuse:" 2 "evrp" } } */ +/* { dg-final { scan-tree-dump-times "Set value range with smaller type: long long unsigned int \\\[0, 4294967295\\\]" 1 "evrp" } } */ diff --git a/gcc/testsuite/gcc.dg/evrp-mem-2.c b/gcc/testsuite/gcc.dg/evrp-mem-2.c new file mode 100644 index 00000000000..0c657252ed5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/evrp-mem-2.c @@ -0,0 +1,36 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fdump-tree-evrp-all -fvrp-mem" } */ + + +unsigned long long test1 (int flg) +{ + static unsigned long long val = 0xffffffff; + + if (flg) + val >>= 8; + return val; +} + +unsigned long long test2 (int flg) +{ + static unsigned long long val = 0xffffffff; + + if (flg) + val <<= 8; + return val; +} + +unsigned long long test3 (int flg, unsigned long long in) +{ + static unsigned long long val = 0xffffffff; + + if (flg) + val = in + val; + return val; +} + +/* { dg-final { scan-tree-dump-times "Number of refining candidates: 1" 3 "evrp" } } */ +/* { dg-final { scan-tree-dump-times "pushing new range for .*: long long unsigned int \\\[0, 4294967295\\\]" 1 "evrp" } } */ +/* { dg-final { scan-tree-dump-times "pushing new range for .*: long long unsigned int \\\[0, 
16777215\\\]" 1 "evrp" } } */ +/* { dg-final { scan-tree-dump-times "Successful refining for static variable" 1 "evrp" } } */ +/* { dg-final { scan-tree-dump-times "Failed refining for static variable" 2 "evrp" } } */ diff --git a/gcc/tree-vrp.c b/gcc/tree-vrp.c index ee077e7c841..54078074a51 100644 --- a/gcc/tree-vrp.c +++ b/gcc/tree-vrp.c @@ -1811,6 +1811,28 @@ overflow_comparison_p (tree_code code, tree name, tree val, use_equiv_p, true, new_cst); } +/* For current value get minimum integer type + that this value can be fitted into. */ +tree +get_min_fitted_type (tree val) +{ + gcc_assert (TREE_CODE (val) == INTEGER_CST); + gcc_assert (TYPE_UNSIGNED (TREE_TYPE (val))); + + tree uint16_max = TYPE_MAX_VALUE (uint16_type_node); + tree uint32_max = TYPE_MAX_VALUE (uint32_type_node); + tree uint64_max = TYPE_MAX_VALUE (uint64_type_node); + + if (tree_int_cst_compare (val, uint16_max) != 1) + return uint16_type_node; + if (tree_int_cst_compare (val, uint32_max) != 1) + return uint32_type_node; + if (tree_int_cst_compare (val, uint64_max) != 1) + return uint64_type_node; + + return NULL_TREE; +} + /* Try to register an edge assertion for SSA name NAME on edge E for the condition COND contributing to the conditional jump pointed to by BSI. @@ -4320,6 +4342,40 @@ remove_range_assertions (void) } } +/* Return true if this statement is an assignment + from constant static array. 
*/ +static bool +const_array_with_ctor_load (gimple *stmt) +{ + if (!gimple_vuse (stmt) + || gimple_code (stmt) != GIMPLE_ASSIGN) + return false; + + if (gimple_num_ops (stmt) != 2) + return false; + + tree lhs = gimple_get_lhs (stmt); + tree rhs = gimple_assign_rhs1 (stmt); + tree type = TREE_TYPE (lhs); + + if (TREE_CODE (type) != INTEGER_TYPE + || !TYPE_UNSIGNED (type) + || TREE_CODE (rhs) != ARRAY_REF) + return false; + + tree arr = TREE_OPERAND (rhs, 0); + if (!arr || !DECL_P (arr) || !VAR_P (arr) + || !TREE_STATIC (arr) || !TREE_READONLY (arr) + || may_be_aliased (arr)) + return false; + + tree ctor = DECL_INITIAL (arr); + if (!ctor || TREE_CODE (ctor) != CONSTRUCTOR) + return false; + + return true; +} + /* Return true if STMT is interesting for VRP. */ bool @@ -4343,7 +4399,7 @@ stmt_interesting_for_vrp (gimple *stmt) && (INTEGRAL_TYPE_P (TREE_TYPE (lhs)) || POINTER_TYPE_P (TREE_TYPE (lhs))) && (is_gimple_call (stmt) - || !gimple_vuse (stmt))) + || !gimple_vuse (stmt) || const_array_with_ctor_load (stmt))) return true; else if (is_gimple_call (stmt) && gimple_call_internal_p (stmt)) switch (gimple_call_internal_fn (stmt)) diff --git a/gcc/tree-vrp.h b/gcc/tree-vrp.h index aa8201f7359..d79ec52f97d 100644 --- a/gcc/tree-vrp.h +++ b/gcc/tree-vrp.h @@ -107,6 +107,7 @@ struct assert_info tree expr; }; +extern tree get_min_fitted_type (tree); extern void register_edge_assert_for (tree, edge, enum tree_code, tree, tree, vec &); extern bool stmt_interesting_for_vrp (gimple *); diff --git a/gcc/vr-values.c b/gcc/vr-values.c index 2e3a0788710..59bd4aeeb5c 100644 --- a/gcc/vr-values.c +++ b/gcc/vr-values.c @@ -50,6 +50,7 @@ along with GCC; see the file COPYING3. If not see #include "vr-values.h" #include "cfghooks.h" #include "range-op.h" +#include "tree-vrp.h" /* Set value range VR to a non-negative range of type TYPE. */ @@ -2046,6 +2047,87 @@ get_output_for_vrp (gimple *stmt) return NULL_TREE; } +/* Set maximum and minimum variable range according to its type. 
*/ +void +vr_values::set_range_with_type (tree type, tree min_type, value_range_equiv *vr) +{ + tree min = wide_int_to_tree (type, + wi::to_wide (TYPE_MIN_VALUE (min_type))); + tree max = wide_int_to_tree (type, + wi::to_wide (TYPE_MAX_VALUE (min_type))); + vr->set (min, max); +} + +/* Get maximum possible element of static constructor. + Maximum element is equal to maximum range of the + constructor intermediate users. */ +static tree +get_max_ctor_val (tree ctor, tree type) +{ + tree val; + unsigned int ix; + + tree max = TYPE_MIN_VALUE (type); + FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (ctor), ix, val) + { + if (TREE_CODE (val) != INTEGER_CST) + return NULL_TREE; + + if (tree_int_cst_compare (val, max) == 1) + max = val; + } + return max; +} + +void +vr_values::vrp_visit_assignment_const_array (gimple *stmt, tree *output_p, + value_range_equiv *vr) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Const array vuse: "); + print_gimple_stmt (dump_file, stmt, 0); + } + tree lhs = gimple_get_lhs (stmt); + tree rhs = gimple_assign_rhs1 (stmt); + + tree arr = TREE_OPERAND (rhs, 0); + tree ctor = DECL_INITIAL (arr); + + tree type = TREE_TYPE (lhs); + tree max = get_max_ctor_val (ctor, type); + if (!max) + return; + tree min_type = get_min_fitted_type (max); + + if (!min_type + || TYPE_PRECISION (min_type) >= TYPE_PRECISION (type)) + return; + + set_range_with_type (type, min_type, vr); + *output_p = lhs; + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Set value range with smaller type: "); + vr->dump (dump_file); + fprintf (dump_file, "\n"); + } + +} + +void +vr_values::vrp_visit_assignment_vuse (gimple *stmt, tree *output_p, + value_range_equiv *vr) +{ + if (!flag_vrp_mem) + return; + + tree rhs = gimple_assign_rhs1 (stmt); + if (TREE_CODE (rhs) == ARRAY_REF) + vrp_visit_assignment_const_array (stmt, output_p, vr); +} + /* Visit assignment STMT. 
If it produces an interesting range, record the range in VR and set LHS to OUTPUT_P. */ @@ -2805,6 +2887,8 @@ vr_values::extract_range_from_stmt (gimple *stmt, edge *taken_edge_p, if (!stmt_interesting_for_vrp (stmt)) gcc_assert (stmt_ends_bb_p (stmt)); + else if (is_gimple_assign (stmt) && gimple_vuse (stmt)) + vrp_visit_assignment_vuse (stmt, output_p, vr); else if (is_gimple_assign (stmt) || is_gimple_call (stmt)) vrp_visit_assignment_or_call (stmt, output_p, vr); else if (gimple_code (stmt) == GIMPLE_COND) diff --git a/gcc/vr-values.h b/gcc/vr-values.h index b4ab4e6f5b8..5019496ed65 100644 --- a/gcc/vr-values.h +++ b/gcc/vr-values.h @@ -77,6 +77,8 @@ class vr_values /* */ void cleanup_edges_and_switches (void); + void set_range_with_type (tree, tree, value_range_equiv *); + private: value_range_equiv *get_lattice_entry (const_tree); bool vrp_stmt_computes_nonzero (gimple *); @@ -102,6 +104,8 @@ class vr_values void extract_range_from_cond_expr (value_range_equiv *, gassign *); void extract_range_from_comparison (value_range_equiv *, enum tree_code, tree, tree, tree); + void vrp_visit_assignment_vuse (gimple *, tree *, value_range_equiv *); + void vrp_visit_assignment_const_array (gimple *, tree *, value_range_equiv *); void vrp_visit_assignment_or_call (gimple*, tree *, value_range_equiv *); void vrp_visit_switch_stmt (gswitch *, edge *); bool simplify_truth_ops_using_ranges (gimple_stmt_iterator *, gimple *); @@ -115,6 +119,7 @@ class vr_values gimple *); bool simplify_internal_call_using_ranges (gimple_stmt_iterator *, gimple *); + bool may_refine (gimple *, tree, auto_vec &, hash_set &); /* Allocation pools for value_range objects. */ object_allocator vrp_value_range_pool; -- Gitee From b56d58e366a5e34fb658c8e084873c21e72f56ca Mon Sep 17 00:00:00 2001 From: Yarovoy Danil WX1195294 Date: Fri, 18 Aug 2023 15:19:20 +0300 Subject: [PATCH 4/7] Implement type lowering pass. 
If all possible values of a variable obviously fit in a lower type, try to replace the wider type (int64 as example) with the lower type (int32) --- gcc/Makefile.in | 1 + gcc/common.opt | 4 + gcc/gimple-ssa-type-lowering.c | 537 +++++++++++++++++++++++++++ gcc/params.opt | 4 + gcc/passes.def | 1 + gcc/testsuite/gcc.dg/type-lowering.c | 24 ++ gcc/timevar.def | 1 + gcc/tree-pass.h | 1 + 8 files changed, 573 insertions(+) create mode 100644 gcc/gimple-ssa-type-lowering.c create mode 100644 gcc/testsuite/gcc.dg/type-lowering.c diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 3f06b8907ce..2678b559bcb 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1356,6 +1356,7 @@ OBJS = \ gimple-pretty-print.o \ gimple-ssa-backprop.o \ gimple-ssa-evrp.o \ + gimple-ssa-type-lowering.o \ gimple-ssa-evrp-analyze.o \ gimple-ssa-isolate-paths.o \ gimple-ssa-nonnull-compare.o \ diff --git a/gcc/common.opt b/gcc/common.opt index 2ed6fa6ddab..0e5681c20ab 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1064,6 +1064,10 @@ fcrypto-accel Common Report Var(flag_crypto_accel) Init(0) Optimization Perform crypto acceleration pattern matching. +ftype-lowering +Common Report Var(flag_type_lowering) Init(0) Optimization +Perform type lowering using range analysis info. + fvrp-mem Common Report Var(flag_vrp_mem) Init(0) Optimization Perform value range analysis for static storages. diff --git a/gcc/gimple-ssa-type-lowering.c b/gcc/gimple-ssa-type-lowering.c new file mode 100644 index 00000000000..da112168578 --- /dev/null +++ b/gcc/gimple-ssa-type-lowering.c @@ -0,0 +1,537 @@ +/* Type lowering based on value range analysis. + Copyright (C) 2023 Free Software Foundation, Inc. + Contributed by Yarovoy Danil, Huawei + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. 
+ +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +/* This optimization tries to lower a variable's type + based on value range analysis. + Only integer types are supported. + The pass makes a reverse tree traversal and looks for statements + that can be fitted into a smaller integer type based on + value range information. After that, def chains are collected, + and if the chain length is more than type_lowering_min_seq_size + then the sequence is transformed. In the beginning and in the end of the chain + casts are inserted. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "tree.h" +#include "gimple.h" +#include "tree-pass.h" +#include "ssa.h" +#include "gimple-pretty-print.h" +#include "gimple-fold.h" +#include "gimple-iterator.h" +#include "tree-cfg.h" + + +class type_lower +{ +public: + void init (gimple *stmt, basic_block bb, tree low_type); + bool valuable_sequence () const; + bool gen_low_seq (); + +private: + void collect_lowering_sequence (gimple *stmt); + void form_input_ssa_names (); + bool suitable_stmt (gimple *stmt) const; + bool suitable_phi (gimple *stmt) const; + bool fitted_in_low_type (tree val) const; + void add_stmt_to_seq (gimple *stmt); + + gimple* gimple_copy_low (gimple *stmt); + tree build_and_insert_cast (tree val) const; + tree make_new_low_ssa () const; + tree make_new_low_cst (tree val) const; + +private: + tree low_type_; + basic_block bb_; + + auto_vec to_lower_; + hash_set input_ssa_names_; + hash_set defined_ssa_names_; + hash_map new_ssa_; +}; + +/* Shifts on a non-constant value can't be candidates. 
*/ +static bool +binary_candidate (gimple *stmt) +{ + enum tree_code code = gimple_assign_rhs_code (stmt); + if (code == RSHIFT_EXPR + || code == LSHIFT_EXPR) + { + tree op = gimple_assign_rhs2 (stmt); + if (TREE_CODE (op) != INTEGER_CST) + return false; + } + return true; +} + +/* Select candidates for lowering. + gimple assign consider as a candidate if + 1. rhs is non binary shift on non const value. + 2. lhs is ssa name. + 3. lhs has integer type. */ +static bool +candidate_for_lowering (gimple *stmt) +{ + if (!is_gimple_assign (stmt)) + return false; + + if ((gimple_assign_rhs_class (stmt) != GIMPLE_BINARY_RHS + || !binary_candidate (stmt)) + && !gimple_vuse (stmt)) + return false; + + tree lhs = gimple_get_lhs (stmt); + if (TREE_CODE (lhs) != SSA_NAME) + return false; + + tree type = TREE_TYPE (lhs); + if (TREE_CODE (type) != INTEGER_TYPE) + return false; + + return true; +} + +/* Get minimum unsigned integer type + that this value can be fitted into. */ +static tree +get_min_fitted_utype (tree val) +{ + gcc_assert (TYPE_UNSIGNED (TREE_TYPE (val))); + + tree uint32_max = TYPE_MAX_VALUE (uint32_type_node); + tree uint64_max = TYPE_MAX_VALUE (uint64_type_node); + + if (tree_int_cst_compare (val, uint32_max) != 1) + return uint32_type_node; + if (tree_int_cst_compare (val, uint64_max) != 1) + return uint64_type_node; + + return NULL_TREE; +} + +/* Get minimum signed integer type + that this value can be fitted into. 
*/ +static tree +get_min_fitted_stype (tree min, tree max) +{ + gcc_assert (!TYPE_UNSIGNED (TREE_TYPE (min))); + gcc_assert (!TYPE_UNSIGNED (TREE_TYPE (max))); + + tree int_max = TYPE_MAX_VALUE (integer_type_node); + tree int_min = TYPE_MIN_VALUE (integer_type_node); + + tree long_max = TYPE_MAX_VALUE (long_integer_type_node); + tree long_min = TYPE_MIN_VALUE (long_integer_type_node); + + tree long_long_max = TYPE_MAX_VALUE (long_long_integer_type_node); + tree long_long_min = TYPE_MIN_VALUE (long_long_integer_type_node); + + if (tree_int_cst_compare (max, int_max) != 1 + && tree_int_cst_compare (min, int_min) != -1) + return integer_type_node; + if (tree_int_cst_compare (max, long_max) != 1 + && tree_int_cst_compare (min, long_min) != -1) + return long_integer_type_node; + if (tree_int_cst_compare (max, long_long_max) != 1 + && tree_int_cst_compare (min, long_long_min) != -1) + return long_long_integer_type_node; + + return NULL_TREE; +} + +/* Get minimum integer type + that this value range can be fitted into. */ +static tree +get_min_fitted_int_type (value_range vr) +{ + gcc_assert (TREE_CODE (vr.min ()) == INTEGER_CST); + gcc_assert (TREE_CODE (vr.max ()) == INTEGER_CST); + gcc_assert (TREE_TYPE (vr.min ()) == TREE_TYPE (vr.max ())); + + if (TYPE_UNSIGNED (TREE_TYPE (vr.max ()))) + return get_min_fitted_utype (vr.max ()); + else + return get_min_fitted_stype (vr.min (), vr.max ()); +} + +/* Return possible lowest type for current statement. 
*/ +static tree +get_low_type (gimple *stmt) +{ + tree lhs = gimple_get_lhs (stmt); + + value_range vr; + if (get_range_info (lhs, vr) != VR_RANGE) + return NULL_TREE; + + if (!vr.constant_p ()) + return NULL_TREE; + + tree orig_type = TREE_TYPE (lhs); + tree low_type = get_min_fitted_int_type (vr); + + if (!low_type + || TYPE_PRECISION (low_type) >= TYPE_PRECISION (orig_type)) + return NULL_TREE; + + return low_type; +} +/* Use global param type_lowering_min_seq_size + to check whether current instruction sequence + is valuable for lowering. */ +bool +type_lower::valuable_sequence () const +{ + unsigned min_size = type_lowering_min_seq_size; + return (to_lower_.length () >= min_size); +} + +void +type_lower::add_stmt_to_seq (gimple *stmt) +{ + to_lower_.safe_push (stmt); +} + +void +type_lower::form_input_ssa_names () +{ + for (hash_set::iterator it = defined_ssa_names_.begin (), + end = defined_ssa_names_.end (); + it != end; ++it) + input_ssa_names_.remove (*it); +} +/* Check whether given statement is suitable for lowering. */ +bool +type_lower::suitable_stmt (gimple *stmt) const +{ + if (!candidate_for_lowering (stmt)) + return false; + + tree type = get_low_type (stmt); + if (!type + || TYPE_PRECISION (type) > TYPE_PRECISION (low_type_)) + return false; + + return true; +} +/* For phi stmt all possible values has to be checked + for fitting in smaller type. + Return false if at least one phi argument + can't be fitted in low type. */ +bool +type_lower::suitable_phi (gimple *phi) const +{ + for (unsigned i = 0; i < gimple_phi_num_args (phi); i++) + { + tree arg = gimple_phi_arg_def (phi, i); + if (TREE_CODE (arg) == SSA_NAME) + { + gimple *def = SSA_NAME_DEF_STMT (arg); + if (!suitable_stmt (def)) + return false; + } + else if (TREE_CODE (arg) == INTEGER_CST) + { + if (!fitted_in_low_type (arg)) + return false; + } + else + gcc_unreachable(); + } + return true; +} + +/* Check that integer constant is fitted in low type. 
*/ +bool +type_lower::fitted_in_low_type (tree val) const +{ + gcc_assert (TREE_CODE (val) == INTEGER_CST); + + tree max = TYPE_MAX_VALUE (low_type_); + if (tree_int_cst_compare (val, max) == 1) + return false; + + return true; +} + +/* Collect a sequence of possible statements from use to def. */ +void +type_lower::collect_lowering_sequence (gimple *stmt) +{ + tree op; + ssa_op_iter op_iter; + + if (gimple_visited_p (stmt) + || gimple_bb (stmt) != bb_) + return; + + gimple_set_visited (stmt, true); + + if (!is_gimple_assign (stmt) + || gimple_assign_rhs_class (stmt) != GIMPLE_BINARY_RHS + || gimple_vuse (stmt)) + return; + + FOR_EACH_SSA_TREE_OPERAND (op, stmt, op_iter, SSA_OP_USE) + { + gimple *op_def = SSA_NAME_DEF_STMT (op); + if (gimple_code (op_def) == GIMPLE_PHI) + { + if (!suitable_phi (op_def)) + return; + } + else + { + if (!suitable_stmt (op_def)) + return; + } + } + + defined_ssa_names_.add (gimple_get_lhs (stmt)); + + FOR_EACH_SSA_TREE_OPERAND (op, stmt, op_iter, SSA_OP_USE) + { + gimple *op_def = SSA_NAME_DEF_STMT (op); + input_ssa_names_.add (op); + collect_lowering_sequence (op_def); + } + + add_stmt_to_seq (stmt); +} + +/* Optimization analyzer main function. */ +void +type_lower::init (gimple *stmt, basic_block bb, tree low_type) +{ + bb_ = bb; + low_type_ = low_type; + + gimple_set_visited (stmt, false); + + collect_lowering_sequence (stmt); + form_input_ssa_names (); + + if (dump_file && (dump_flags & TDF_DETAILS) + && valuable_sequence ()) + { + fprintf (dump_file, "Found valuable sequence for lowering:\n"); + + int i = 0; + gimple *stmt = NULL; + FOR_EACH_VEC_ELT (to_lower_, i, stmt) + print_gimple_stmt (dump_file, stmt, 0); + + fprintf (dump_file, "\n"); + } +} + +/* In the beginning and in the end of the transformed sequence + casts to the original type have to be inserted. 
*/ +tree +type_lower::build_and_insert_cast (tree op) const +{ + gimple_stmt_iterator gsi; + + gimple *def = SSA_NAME_DEF_STMT (op); + + tree new_ssa = make_ssa_name (low_type_); + gassign *cast_stmt = gimple_build_assign (new_ssa, NOP_EXPR, op); + + if (gimple_code (def) == GIMPLE_PHI) + { + gsi = gsi_after_labels (bb_); + gsi_insert_before (&gsi, cast_stmt, GSI_SAME_STMT); + } + else + { + gsi = gsi_for_stmt (def); + gsi_insert_after (&gsi, cast_stmt, GSI_SAME_STMT); + } + + return new_ssa; +} + +tree +type_lower::make_new_low_ssa () const +{ + return make_ssa_name (low_type_); +} + +tree +type_lower::make_new_low_cst (tree op) const +{ + return wide_int_to_tree (low_type_, wi::to_poly_wide (op)); +} + +gimple* +type_lower::gimple_copy_low (gimple *stmt) +{ + gcc_assert (is_gimple_assign (stmt)); + gcc_assert (gimple_num_ops (stmt) == 3); + + for (unsigned i = 0; i < gimple_num_ops (stmt) ; i++) + { + tree op = gimple_op (stmt, i); + if (new_ssa_.get (op)) + continue; + + tree new_op; + if (input_ssa_names_.contains (op)) + new_op = build_and_insert_cast (op); + else if (TREE_CODE (op) == SSA_NAME) + new_op = make_new_low_ssa (); + else if (TREE_CODE (op) == INTEGER_CST) + new_op = make_new_low_cst (op); + else + gcc_unreachable (); + + new_ssa_.put (op, new_op); + } + + enum tree_code code = gimple_assign_rhs_code (stmt); + + tree ops[3]; + for (int i = 0; i < 3; i++) + ops[i] = *new_ssa_.get (gimple_op (stmt, i)); + + gimple *new_assign = gimple_build_assign (ops[0], code, ops[1], ops[2]); + + return new_assign; +} + +/* Generate sequence of lowered statements. 
*/ +bool +type_lower::gen_low_seq () +{ + gimple_stmt_iterator gsi; + + int i = 0; + gimple *stmt = NULL; + gimple *final_stmt = NULL; + FOR_EACH_VEC_ELT (to_lower_, i, stmt) + { + gimple *copy = gimple_copy_low (stmt); + + gsi = gsi_for_stmt (stmt); + gsi_insert_before (&gsi, copy, GSI_SAME_STMT); + final_stmt = stmt; + } + if (!final_stmt) + return false; + + tree lhs = gimple_get_lhs (final_stmt); + gassign *cast_stmt = gimple_build_assign (lhs, NOP_EXPR, *new_ssa_.get (lhs)); + + gsi = gsi_for_stmt (final_stmt); + gsi_replace (&gsi, cast_stmt, true); + SSA_NAME_DEF_STMT (lhs) = cast_stmt; + return true; +} + +static void +process_bb (basic_block bb) +{ + for (gimple_stmt_iterator gsi = gsi_start_bb (bb); + !gsi_end_p (gsi); gsi_next (&gsi)) + gimple_set_visited (gsi_stmt (gsi), false); + + for (gimple_stmt_iterator gsi = gsi_last_bb (bb); + !gsi_end_p (gsi); gsi_prev (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + if (gimple_visited_p (stmt)) + continue; + + gimple_set_visited (stmt, true); + + if (!candidate_for_lowering (stmt)) + continue; + + tree low_type = get_low_type (stmt); + if (!low_type) + continue; + + type_lower lower; + lower.init (stmt, bb, low_type); + if (!lower.valuable_sequence ()) + continue; + + lower.gen_low_seq (); + } +} + +static unsigned int +execute_pass_type_lowering () +{ + basic_block bb; + FOR_EACH_BB_FN (bb, cfun) + process_bb (bb); + + return 0; +} + +namespace { + +const pass_data pass_data_type_lowering = +{ + GIMPLE_PASS, /* type */ + "type_lowering", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_TREE_TYPE_LOWERING, /* tv_id */ + PROP_ssa, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + ( TODO_update_ssa | TODO_verify_all ), +}; + +class pass_type_lowering : public gimple_opt_pass +{ +public: + pass_type_lowering (gcc::context *ctxt) + : gimple_opt_pass (pass_data_type_lowering, ctxt) + {} + + /* opt_pass methods: */ + opt_pass * clone () { return 
new pass_type_lowering (m_ctxt); } + virtual bool gate (function *) + { + return (flag_type_lowering != 0); + } + virtual unsigned int execute (function *) + { + return execute_pass_type_lowering (); + } + +}; // class pass_vrp +} // anon namespace + +gimple_opt_pass * +make_pass_type_lowering (gcc::context *ctxt) +{ + return new pass_type_lowering (ctxt); +} + diff --git a/gcc/params.opt b/gcc/params.opt index 83fd705eed7..47e127abd80 100644 --- a/gcc/params.opt +++ b/gcc/params.opt @@ -912,6 +912,10 @@ Stop reverse growth if the reverse probability of best edge is less than this th Common Joined UInteger Var(param_tree_reassoc_width) Param Optimization Set the maximum number of instructions executed in parallel in reassociated tree. If 0, use the target dependent heuristic. +-param=type-lowering-min-seq-size= +Common Joined UInteger Var(type_lowering_min_seq_size) Init(2) IntegerRange(1, 32) Param Optimization +Set the minimum valuable sequence size for type lowering. + -param=uninit-control-dep-attempts= Common Joined UInteger Var(param_uninit_control_dep_attempts) Init(1000) IntegerRange(1, 65536) Param Optimization Maximum number of nested calls to search for control dependencies during uninitialized variable analysis. diff --git a/gcc/passes.def b/gcc/passes.def index ea50db08623..8da29aa9ed0 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -87,6 +87,7 @@ along with GCC; see the file COPYING3. 
If not see NEXT_PASS (pass_build_ealias); NEXT_PASS (pass_fre, true /* may_iterate */); NEXT_PASS (pass_early_vrp); + NEXT_PASS (pass_type_lowering); NEXT_PASS (pass_merge_phi); NEXT_PASS (pass_dse); NEXT_PASS (pass_cd_dce); diff --git a/gcc/testsuite/gcc.dg/type-lowering.c b/gcc/testsuite/gcc.dg/type-lowering.c new file mode 100644 index 00000000000..b89267909f4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/type-lowering.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fdump-tree-type_lowering-all -ftype-lowering" } */ + +unsigned long long +test1 (unsigned long long a, unsigned long long b, int flg) +{ + unsigned long long sa; + unsigned long long sb; + if (flg) + { + sa = a >> 48; + sb = b >> 32; + } + else + { + sa = a >> 32; + sb = b >> 48; + } + + unsigned long long res = ((sa | 1) ^ (sb | 1)); + return res; +} + +/* { dg-final { scan-tree-dump "Found valuable sequence for lowering" "type_lowering" } } */ diff --git a/gcc/timevar.def b/gcc/timevar.def index 2814b14f21c..f3b0d3f9034 100644 --- a/gcc/timevar.def +++ b/gcc/timevar.def @@ -158,6 +158,7 @@ DEFTIMEVAR (TV_TREE_CLEANUP_CFG , "tree CFG cleanup") DEFTIMEVAR (TV_TREE_TAIL_MERGE , "tree tail merge") DEFTIMEVAR (TV_TREE_VRP , "tree VRP") DEFTIMEVAR (TV_TREE_EARLY_VRP , "tree Early VRP") +DEFTIMEVAR (TV_TREE_TYPE_LOWERING , "tree type lowering") DEFTIMEVAR (TV_TREE_COPY_PROP , "tree copy propagation") DEFTIMEVAR (TV_FIND_REFERENCED_VARS , "tree find ref. 
vars") DEFTIMEVAR (TV_TREE_PTA , "tree PTA") diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 3cdc124668b..d73ab825bf2 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -447,6 +447,7 @@ extern gimple_opt_pass *make_pass_check_data_deps (gcc::context *ctxt); extern gimple_opt_pass *make_pass_copy_prop (gcc::context *ctxt); extern gimple_opt_pass *make_pass_isolate_erroneous_paths (gcc::context *ctxt); extern gimple_opt_pass *make_pass_early_vrp (gcc::context *ctxt); +extern gimple_opt_pass *make_pass_type_lowering (gcc::context *ctxt); extern gimple_opt_pass *make_pass_vrp (gcc::context *ctxt); extern gimple_opt_pass *make_pass_uncprop (gcc::context *ctxt); extern gimple_opt_pass *make_pass_return_slot (gcc::context *ctxt); -- Gitee From 3fd3b80b925b96594adc49cba0f58a7da1649572 Mon Sep 17 00:00:00 2001 From: Agrachev Andrey WX1228450 Date: Mon, 21 Aug 2023 12:35:19 +0300 Subject: [PATCH 5/7] Add split-complex-instructions pass Add option -fsplit-ldp-stp Add functionality to detect and split depended from store LDP instructions. Add -param=param-ldp-dependency-search-range= to configure ldp dependency search range --- gcc/common.opt | 5 + gcc/config/aarch64/aarch64.c | 40 ++ gcc/doc/tm.texi | 8 + gcc/doc/tm.texi.in | 4 + gcc/params.opt | 7 + gcc/passes.def | 1 + gcc/sched-rgn.c | 625 +++++++++++++++++- gcc/target.def | 10 + .../gcc.dg/rtl/aarch64/test-ldp-dont-split.c | 73 ++ .../gcc.dg/rtl/aarch64/test-ldp-split.c | 73 ++ gcc/timevar.def | 1 + gcc/tree-pass.h | 1 + 12 files changed, 847 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c diff --git a/gcc/common.opt b/gcc/common.opt index 0e5681c20ab..ffd6f7cebff 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1705,6 +1705,11 @@ floop-nest-optimize Common Report Var(flag_loop_nest_optimize) Optimization Enable the loop nest optimizer. 
+fsplit-ldp-stp +Common Report Var(flag_split_ldp_stp) Optimization +Split load/store pair instructions into separate load/store operations +for better performance. + fstrict-volatile-bitfields Common Report Var(flag_strict_volatile_bitfields) Init(-1) Optimization Force bitfield accesses to match their type width. diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 2e08a63a3ef..52a78d2b18b 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -23827,6 +23827,46 @@ aarch64_run_selftests (void) #endif /* #if CHECKING_P */ +bool +is_aarch64_ldp_insn (int icode) +{ + if (icode >= CODE_FOR_load_pair_sw_sisi + && icode <= CODE_FOR_load_pair_dw_tftf + || icode >= CODE_FOR_loadwb_pairsi_si + && icode <= CODE_FOR_loadwb_pairtf_di + || icode >= CODE_FOR_load_pairv8qiv8qi + && icode <= CODE_FOR_load_pairdfdf + || icode >= CODE_FOR_load_pairv16qiv16qi + && icode <= CODE_FOR_load_pairv8bfv2df + || icode >= CODE_FOR_load_pair_lanesv8qi + && icode <= CODE_FOR_load_pair_lanesdf) + return true; + return false; +} + +bool +is_aarch64_stp_insn (int icode) +{ + if (icode >= CODE_FOR_store_pair_sw_sisi + && icode <= CODE_FOR_store_pair_dw_tftf + || icode >= CODE_FOR_storewb_pairsi_si + && icode <= CODE_FOR_storewb_pairtf_di + || icode >= CODE_FOR_vec_store_pairv8qiv8qi + && icode <= CODE_FOR_vec_store_pairdfdf + || icode >= CODE_FOR_vec_store_pairv16qiv16qi + && icode <= CODE_FOR_vec_store_pairv8bfv2df + || icode >= CODE_FOR_store_pair_lanesv8qi + && icode <= CODE_FOR_store_pair_lanesdf) + return true; + return false; +} + +#undef TARGET_IS_LDP_INSN +#define TARGET_IS_LDP_INSN is_aarch64_ldp_insn + +#undef TARGET_IS_STP_INSN +#define TARGET_IS_STP_INSN is_aarch64_stp_insn + #ifdef TARGET_CRC32 #undef TARGET_GEN_CRC32B #define TARGET_GEN_CRC32B gen_aarch64_crc32b diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index d98e6c5d7c6..9a4660ddf6c 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -11840,6 +11840,14 @@ object files that
are not referenced from @code{main} and uses export lists. @end defmac +@deftypefn {Target Hook} bool TARGET_IS_LDP_INSN (int @var{icode}) +Return true if icode is corresponding to any of the LDP instruction types. +@end deftypefn + +@deftypefn {Target Hook} bool TARGET_IS_STP_INSN (int @var{icode}) +Return true if icode is corresponding to any of the STP instruction types. +@end deftypefn + @deftypefn {Target Hook} rtx TARGET_GEN_CRC32B (rtx @var{dest}, rtx @var{src1}, rtx @var{src2}) This function generate the crc32 instruction if target supports this. @end deftypefn diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index c26729404d6..e0d064e79e8 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -8002,6 +8002,10 @@ object files that are not referenced from @code{main} and uses export lists. @end defmac +@hook TARGET_IS_LDP_INSN + +@hook TARGET_IS_STP_INSN + @hook TARGET_GEN_CRC32B @hook TARGET_CANNOT_MODIFY_JUMPS_P diff --git a/gcc/params.opt b/gcc/params.opt index 47e127abd80..c2475517026 100644 --- a/gcc/params.opt +++ b/gcc/params.opt @@ -996,4 +996,11 @@ Target size of compressed pointer, which should be 8, 16 or 32. Common Joined UInteger Var(semi_relayout_level) Init(13) IntegerRange(11, 15) Param Optimization Set capacity of each bucket to semi-relayout to (1 << semi-relayout-level) / 8 . +-param=param-ldp-dependency-search-range= +Common Joined UInteger Var(param_ldp_dependency_search_range) Init(16) IntegerRange(1, 32) Param Optimization +Range for depended ldp search in split-ldp-stp path. + +-param=param-relayout-bucket-num= +Common Joined UInteger Var(param_relayout_bucket_num) Init(15) IntegerRange(1, 20) Param Optimization +Relayout num of bucket. ; This comment is to ensure we retain the blank line above. diff --git a/gcc/passes.def b/gcc/passes.def index 8da29aa9ed0..02c83aeed0f 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -485,6 +485,7 @@ along with GCC; see the file COPYING3. 
If not see NEXT_PASS (pass_reorder_blocks); NEXT_PASS (pass_leaf_regs); NEXT_PASS (pass_split_before_sched2); + NEXT_PASS (pass_split_complex_instructions); NEXT_PASS (pass_sched2); NEXT_PASS (pass_stack_regs); PUSH_INSERT_PASSES_WITHIN (pass_stack_regs) diff --git a/gcc/sched-rgn.c b/gcc/sched-rgn.c index 7f5dfdb3db4..50375dbfd0a 100644 --- a/gcc/sched-rgn.c +++ b/gcc/sched-rgn.c @@ -44,6 +44,8 @@ along with GCC; see the file COPYING3. If not see are actually scheduled. */ #include "config.h" +#define INCLUDE_SET +#define INCLUDE_VECTOR #include "system.h" #include "coretypes.h" #include "backend.h" @@ -65,6 +67,7 @@ along with GCC; see the file COPYING3. If not see #include "dbgcnt.h" #include "pretty-print.h" #include "print-rtl.h" +#include "cfgrtl.h" /* Disable warnings about quoting issues in the pp_xxx calls below that (intentionally) don't follow GCC diagnostic conventions. */ @@ -3955,6 +3958,626 @@ make_pass_sched_fusion (gcc::context *ctxt) return new pass_sched_fusion (ctxt); } +namespace +{ + +/* Def-use analisys special functions implementation. */ + +static struct df_link * +get_defs (rtx_insn *insn, rtx reg, vec *dest) +{ + df_ref use; + struct df_link *ref_chain, *ref_link; + + FOR_EACH_INSN_USE (use, insn) + { + if (GET_CODE (DF_REF_REG (use)) == SUBREG) + return NULL; + if (REGNO (DF_REF_REG (use)) == REGNO (reg)) + break; + } + + gcc_assert (use != NULL); + + ref_chain = DF_REF_CHAIN (use); + + for (ref_link = ref_chain; ref_link; ref_link = ref_link->next) + { + /* Problem getting some definition for this instruction. */ + if (ref_link->ref == NULL) + return NULL; + if (DF_REF_INSN_INFO (ref_link->ref) == NULL) + return NULL; + /* As global regs are assumed to be defined at each function call + dataflow can report a call_insn as being a definition of REG. + But we can't do anything with that in this pass so proceed only + if the instruction really sets REG in a way that can be deduced + from the RTL structure. 
*/ + if (global_regs[REGNO (reg)] + && !set_of (reg, DF_REF_INSN (ref_link->ref))) + return NULL; + } + + if (dest) + for (ref_link = ref_chain; ref_link; ref_link = ref_link->next) + dest->safe_push (DF_REF_INSN (ref_link->ref)); + + return ref_chain; +} + +static struct df_link * +get_uses (rtx_insn *insn, rtx reg) +{ + df_ref def; + struct df_link *ref_chain, *ref_link; + + FOR_EACH_INSN_DEF (def, insn) + if (REGNO (DF_REF_REG (def)) == REGNO (reg)) + break; + + gcc_assert (def != NULL && "Broken def-use analisys chain."); + + ref_chain = DF_REF_CHAIN (def); + + for (ref_link = ref_chain; ref_link; ref_link = ref_link->next) + { + /* Problem getting some use for this instruction. */ + if (ref_link->ref == NULL) + return NULL; + } + + return ref_chain; +} + +const pass_data pass_data_split_complex_instructions = { + RTL_PASS, /* Type. */ + "split_complex_instructions", /* Name. */ + OPTGROUP_NONE, /* Optinfo_flags. */ + TV_SPLIT_CMP_INS, /* Tv_id. */ + 0, /* Properties_required. */ + 0, /* Properties_provided. */ + 0, /* Properties_destroyed. */ + 0, /* Todo_flags_start. */ + (TODO_df_verify | TODO_df_finish), /* Todo_flags_finish. 
*/ +}; + +class pass_split_complex_instructions : public rtl_opt_pass +{ +private: + enum complex_instructions_t + { + UNDEFINED, + LDP, + LDP_TI, + STP, + STR + }; + + void split_complex_insn (rtx_insn *insn); + void split_ldp_ti (rtx_insn *insn); + void split_ldp_stp (rtx_insn *insn); + complex_instructions_t get_insn_type (rtx_insn *insn); + + basic_block bb; + rtx_insn *insn; + std::set dependent_stores_candidates; + std::vector ldp_to_split_list; + + complex_instructions_t complex_insn_type = UNDEFINED; + bool is_store_insn (rtx_insn *insn); + bool is_ldp_dependent_on_store (rtx_insn *ldp_insn, basic_block bb); + bool bfs_for_reg_dependent_store (rtx_insn *ldp_insn, basic_block search_bb, + rtx_insn *search_insn, + int search_range + = param_ldp_dependency_search_range); + bool is_store_reg_dependent (rtx_insn *ldp_insn, rtx_insn *str_insn); + void init_df (); + void find_dependent_stores_candidates (rtx_insn *ldp_insn); + int get_insn_offset (rtx_insn *insn, complex_instructions_t insn_type, + int *arith_operation_ptr = NULL); + +public: + pass_split_complex_instructions (gcc::context *ctxt) + : rtl_opt_pass (pass_data_split_complex_instructions, ctxt) + { + } + /* opt_pass methods: */ + virtual bool gate (function *); + + virtual unsigned int + execute (function *) + { + init_df (); + ldp_to_split_list.clear (); + FOR_EACH_BB_FN (bb, cfun) + { + FOR_BB_INSNS (bb, insn) + { + complex_instructions_t insn_type = get_insn_type (insn); + if (insn_type != LDP && insn_type != LDP_TI) + continue; + + if (is_ldp_dependent_on_store (insn, bb)) + { + ldp_to_split_list.push_back (insn); + } + } + } + + for (std::vector::iterator i = ldp_to_split_list.begin (); + i != ldp_to_split_list.end (); ++i) + split_complex_insn (*i); + + return 0; + } +}; // class pass_split_complex_instructions + +bool +pass_split_complex_instructions::is_ldp_dependent_on_store (rtx_insn *ldp_insn, + basic_block bb) +{ + find_dependent_stores_candidates (ldp_insn); + return 
bfs_for_reg_dependent_store (ldp_insn, bb, ldp_insn); +} + +bool +pass_split_complex_instructions::bfs_for_reg_dependent_store ( + rtx_insn *ldp_insn, basic_block search_bb, rtx_insn *search_insn, + int search_range) +{ + rtx_insn *current_search_insn = search_insn; + + for (int i = search_range; i > 0; --i) + { + if (!current_search_insn) + return false; + bool checking_result + = is_store_reg_dependent (ldp_insn, current_search_insn); + if (checking_result) + { + if (dump_file) + { + fprintf (dump_file, "LDP to split:\n"); + print_rtl_single (dump_file, ldp_insn); + fprintf (dump_file, "Found STR:\n"); + print_rtl_single (dump_file, current_search_insn); + } + return true; + } + if (current_search_insn == BB_HEAD (search_bb)) + { + /* Search in all parent BBs for the reg_dependent store. */ + edge_iterator ei; + edge e; + bool found_reg_dependent_store = false; + + FOR_EACH_EDGE (e, ei, search_bb->preds) + if (e->src->index != 0) + found_reg_dependent_store + = found_reg_dependent_store + || bfs_for_reg_dependent_store (ldp_insn, e->src, + BB_END (e->src), i); + return found_reg_dependent_store; + } + else + { + if (!active_insn_p (current_search_insn)) + i++; + current_search_insn = PREV_INSN (current_search_insn); + } + } + return false; +} + +void +pass_split_complex_instructions::init_df () +{ + df_set_flags (DF_RD_PRUNE_DEAD_DEFS); + df_chain_add_problem (DF_UD_CHAIN + DF_DU_CHAIN); + df_mir_add_problem (); + df_live_add_problem (); + df_live_set_all_dirty (); + df_analyze (); + df_set_flags (DF_DEFER_INSN_RESCAN); +} + +void +pass_split_complex_instructions::find_dependent_stores_candidates ( + rtx_insn *ldp_insn) +{ + dependent_stores_candidates.clear (); + df_ref use; + + FOR_EACH_INSN_USE (use, ldp_insn) + { + df_link *defs = get_defs (ldp_insn, DF_REF_REG (use), NULL); + if (!defs) + return; + + for (df_link *def = defs; def; def = def->next) + { + df_link *uses + = get_uses (DF_REF_INSN (def->ref), DF_REF_REG (def->ref)); + if (!uses) + continue; + + 
for (df_link *use = uses; use; use = use->next) + { + if (DF_REF_CLASS (use->ref) == DF_REF_REGULAR + && is_store_insn (DF_REF_INSN (use->ref))) + + dependent_stores_candidates.insert (DF_REF_INSN (use->ref)); + } + } + } +} + +bool +pass_split_complex_instructions::is_store_reg_dependent (rtx_insn *ldp_insn, + rtx_insn *str_insn) +{ + if (!is_store_insn (str_insn) + || dependent_stores_candidates.find (str_insn) + == dependent_stores_candidates.end ()) + return false; + + int ldp_offset_sign = UNDEFINED; + int ldp_offset + = get_insn_offset (ldp_insn, get_insn_type (ldp_insn), &ldp_offset_sign); + if (ldp_offset_sign == MINUS) + ldp_offset = -ldp_offset; + + int str_offset_sign = UNDEFINED; + int str_offset = get_insn_offset (str_insn, STR, &str_offset_sign); + if (str_offset_sign == MINUS) + str_offset = -str_offset; + + if (str_offset == ldp_offset || str_offset == ldp_offset + 8) + return true; + + return false; +} + +bool +pass_split_complex_instructions::is_store_insn (rtx_insn *insn) +{ + if (!insn) + return false; + rtx_def *sset_b = single_set (insn); + if (sset_b && MEM_P (SET_DEST (sset_b)) + && GET_MODE (XEXP (sset_b, 0)) != BLKmode + && GET_CODE (XEXP (PATTERN (insn), 0)) == MEM + && GET_RTX_CLASS (GET_CODE (XEXP (XEXP (PATTERN (insn), 0), 0))) + == RTX_COMM_ARITH) + return true; + + return false; +} + +int +pass_split_complex_instructions::get_insn_offset ( + rtx_insn *insn, complex_instructions_t insn_type, int *arith_operation_ptr) +{ + rtx insn_pat = PATTERN (insn); + int returned_offset = 0; + + rtx_def *offset_expr = NULL; + rtx_def *offset_value_expr = NULL; + + switch (insn_type) + { + case LDP: + { + int number_of_sub_insns = XVECLEN (insn_pat, 0); + + /* Calculate it's own ofsset of first load insn. 
*/ + rtx_insn *first_load_insn = NULL; + if (number_of_sub_insns == 2) + { + first_load_insn + = make_insn_raw (copy_rtx (XVECEXP (insn_pat, 0, 0))); + arith_operation_ptr = NULL; + + offset_expr = XEXP (XEXP (PATTERN (first_load_insn), 1), 0); + if (GET_CODE (offset_expr) == PLUS + || GET_CODE (offset_expr) == MINUS) + offset_value_expr + = XEXP (XEXP (XEXP (PATTERN (first_load_insn), 1), 0), 1); + else + offset_expr = NULL; + } + else if (number_of_sub_insns == 3) + { + rtx_insn *offset_sub_insn + = make_insn_raw (copy_rtx (XVECEXP (insn_pat, 0, 0))); + + offset_expr = XEXP (PATTERN (offset_sub_insn), 1); + offset_value_expr = XEXP (XEXP (PATTERN (offset_sub_insn), 1), 1); + } + else + { + gcc_assert (false + && "Wrong number of elements in the ldp_insn vector"); + } + break; + } + case LDP_TI: + { + offset_expr = XEXP (XEXP (PATTERN (insn), 1), 0); + if (GET_CODE (offset_expr) != PLUS && GET_CODE (offset_expr) != MINUS) + return 0; + offset_value_expr = XEXP (XEXP (XEXP (PATTERN (insn), 1), 0), 1); + break; + } + case STR: + { + offset_expr = XEXP (XEXP (PATTERN (insn), 0), 0); + offset_value_expr = XEXP (XEXP (XEXP (PATTERN (insn), 0), 0), 1); + break; + } + default: + { + debug (insn); + gcc_assert (false && "Unsupported instruction type"); + break; + } + } + + if (offset_expr != NULL && offset_value_expr + && GET_CODE (offset_value_expr) == CONST_INT) + returned_offset = XINT (offset_value_expr, 0); + + if (arith_operation_ptr != NULL) + { + *arith_operation_ptr = GET_CODE (offset_expr); + gcc_assert ( + (*arith_operation_ptr == MINUS || *arith_operation_ptr == PLUS) + && "Unexpected arithmetic opearion in offset realted sub_insn"); + } + + return returned_offset; +} + +void +pass_split_complex_instructions::split_ldp_stp (rtx_insn *insn) +{ + rtx_insn *prev_insn = PREV_INSN (insn); + rtx pat = PATTERN (insn); + int number_of_sub_insns = XVECLEN (pat, 0); + + start_sequence (); + + if (number_of_sub_insns == 2) + { + for (int i = 0; i < number_of_sub_insns; 
i++) + { + rtx_def *sub_insn = XVECEXP (pat, 0, i); + rtx_insn *copy_of_sub_insn = make_insn_raw (copy_rtx (sub_insn)); + int sub_insn_code + = recog (PATTERN (copy_of_sub_insn), copy_of_sub_insn, 0); + rtx_insn *emited_insn = emit_insn (PATTERN (copy_of_sub_insn)); + INSN_CODE (emited_insn) = sub_insn_code; + } + } + + bool post_index = true; + + rtx_insn offset_insn; + rtx_insn mem_insn_1; + rtx_insn mem_insn_2; + + int offset_insn_code; + int mem_insn_1_code = -1; + int mem_insn_2_code = -1; + + /* If load/store pair insn has an offset. */ + if (number_of_sub_insns == 3) + { + int offset = 0; + int arith_operation = UNDEFINED; + + for (int i = 0; i < number_of_sub_insns; i++) + { + rtx_def *sub_insn = XVECEXP (pat, 0, i); + rtx_insn *copy_of_sub_insn = make_insn_raw (copy_rtx (sub_insn)); + int sub_insn_code + = recog (PATTERN (copy_of_sub_insn), copy_of_sub_insn, 0); + + /* If sub_insn is offset related. */ + if (GET_RTX_CLASS (sub_insn_code) == RTX_UNARY) + { + offset_insn = *copy_of_sub_insn; + offset_insn_code = sub_insn_code; + gcc_assert (i == 0 + && "Offset related insn must be the first " + "element of a parallel insn vector"); + + offset = get_insn_offset (insn, LDP, &arith_operation); + } + else + { + if (GET_CODE (XEXP (PATTERN (copy_of_sub_insn), 0)) != REG) + { + rtx_def *&offset_expr = XEXP ( + XEXP (XEXP (PATTERN (copy_of_sub_insn), 0), 0), 1); + if (GET_CODE (offset_expr) == CONST_INT) + { + int local_offset = XINT (offset_expr, 0); + offset = (arith_operation == PLUS ? 
offset : -offset); + + offset_expr = GEN_INT (local_offset + offset); + + gcc_assert ( + (arith_operation == MINUS || arith_operation == PLUS) + && "Unexpected arithmetic opearion in offset realted " + "sub_insn"); + + if (i == 1) + post_index = false; + } + else + { + post_index = true; + } + } + } + if (i == 1) + { + mem_insn_1 = *copy_of_sub_insn; + mem_insn_1_code = sub_insn_code; + } + if (i == 2) + { + mem_insn_2 = *copy_of_sub_insn; + mem_insn_2_code = sub_insn_code; + } + } + gcc_assert (mem_insn_1_code != -1 && mem_insn_2_code != -1 + && "Uninitialized memory insns"); + + rtx_insn *emited_offset_insn; + if (post_index) + { + emited_offset_insn = emit_insn (PATTERN (&offset_insn)); + INSN_CODE (emited_offset_insn) = offset_insn_code; + } + + rtx_insn *emited_mem_insn_1 = emit_insn (PATTERN (&mem_insn_1)); + rtx_insn *emited_mem_insn_2 = emit_insn (PATTERN (&mem_insn_2)); + INSN_CODE (emited_mem_insn_1) = mem_insn_1_code; + INSN_CODE (emited_mem_insn_2) = mem_insn_2_code; + + if (post_index) + { + emited_offset_insn = emit_insn (PATTERN (&offset_insn)); + INSN_CODE (emited_offset_insn) = offset_insn_code; + } + } + else + { + // gcc_assert (false && "Broken complex insn vector"); + } + + rtx_insn *seq = get_insns (); + unshare_all_rtl_in_chain (seq); + end_sequence (); + + emit_insn_after_setloc (seq, prev_insn, INSN_LOCATION (insn)); + delete_insn_and_edges (insn); +} + +void +pass_split_complex_instructions::split_ldp_ti (rtx_insn *insn) +{ + rtx_insn *prev_insn = PREV_INSN (insn); + rtx_insn *load_insn_1 = make_insn_raw (copy_rtx (PATTERN (insn))); + rtx_insn *load_insn_2 = make_insn_raw (copy_rtx (PATTERN (insn))); + + rtx_def *reg_insn_1 = XEXP (PATTERN (load_insn_1), 0); + rtx_def *mem_insn_1 = XEXP (PATTERN (load_insn_1), 1); + rtx_def *mem_insn_2 = XEXP (PATTERN (load_insn_2), 1); + + PUT_MODE (mem_insn_1, DImode); + PUT_MODE (mem_insn_2, DImode); + + int reg_no_1 = REGNO (reg_insn_1); + + XEXP (PATTERN (load_insn_1), 0) = gen_rtx_REG (DImode, 
reg_no_1); + XEXP (PATTERN (load_insn_2), 0) = gen_rtx_REG (DImode, reg_no_1 + 1); + + rtx_def *load_insn_2_plus_expr = XEXP (XEXP (PATTERN (load_insn_2), 1), 0); + if (GET_CODE (load_insn_2_plus_expr) == REG) + return; + + rtx_def *load_insn_2_offset_expr + = XEXP (XEXP (XEXP (PATTERN (load_insn_2), 1), 0), 1); + + if (load_insn_2_offset_expr == NULL) + return; + + if (GET_CODE (load_insn_2_offset_expr) == CONST_INT) + { + int load_insn_2_offset = XINT (load_insn_2_offset_expr, 0); + XEXP (XEXP (XEXP (PATTERN (load_insn_2), 1), 0), 1) + = GEN_INT (load_insn_2_offset + GET_MODE_SIZE (DImode)); + } + + start_sequence (); + + rtx_insn *emited_load_insn_1 = emit_insn (PATTERN (load_insn_1)); + rtx_insn *emited_load_insn_2 = emit_insn (PATTERN (load_insn_2)); + + INSN_CODE (emited_load_insn_1) + = recog (PATTERN (emited_load_insn_1), emited_load_insn_1, 0); + INSN_CODE (emited_load_insn_2) + = recog (PATTERN (emited_load_insn_2), emited_load_insn_2, 0); + + rtx_insn *seq = get_insns (); + unshare_all_rtl_in_chain (seq); + end_sequence (); + + emit_insn_after_setloc (seq, prev_insn, INSN_LOCATION (insn)); + delete_insn_and_edges (insn); +} + +void +pass_split_complex_instructions::split_complex_insn (rtx_insn *insn) +{ + complex_instructions_t insn_type = get_insn_type (insn); + if (insn_type == LDP || insn_type == STP) + split_ldp_stp (insn); + else if (insn_type == LDP_TI) + split_ldp_ti (insn); + else + gcc_assert (false && "Unsupported type of insn to split"); +} + +pass_split_complex_instructions::complex_instructions_t +pass_split_complex_instructions::get_insn_type (rtx_insn *insn) +{ + if (!INSN_P (insn)) + return UNDEFINED; + + rtx pat = PATTERN (insn); + int icode = recog (PATTERN (insn), insn, NULL); + + if (GET_CODE (pat) == PARALLEL) + { + if (targetm.is_ldp_insn (icode)) + { + return LDP; + } + if (targetm.is_stp_insn (icode)) + { + return STP; + } + else + { + return UNDEFINED; + } + } + rtx_def *set_insn = single_set (insn); + if (set_insn && GET_CODE 
(XEXP (single_set (insn), 1)) == MEM + && GET_MODE (XEXP (single_set (insn), 1)) == E_TImode) + return LDP_TI; + + return UNDEFINED; +} + +bool +pass_split_complex_instructions::gate (function *) +{ + return targetm.is_ldp_insn && targetm.is_stp_insn && flag_split_ldp_stp > 0; +} + +} // anon namespace + +rtl_opt_pass * +make_pass_split_complex_instructions (gcc::context *ctxt) +{ + return new pass_split_complex_instructions (ctxt); +} + #if __GNUC__ >= 10 -# pragma GCC diagnostic pop +#pragma GCC diagnostic pop #endif diff --git a/gcc/target.def b/gcc/target.def index bfc5e4e59c9..6d8fa055c3d 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -2682,6 +2682,16 @@ modes and they have different conditional execution capability, such as ARM.", bool, (void), default_have_conditional_execution) +DEFHOOK +(is_ldp_insn, + "Return true if icode is corresponding to any of the LDP instruction types.", + bool, (int icode), NULL) + +DEFHOOK +(is_stp_insn, + "Return true if icode is corresponding to any of the STP instruction types.", + bool, (int icode), NULL) + DEFHOOK (gen_crc32b, "This function generate the crc32 instruction if target supports this.", diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c new file mode 100644 index 00000000000..e826674e2c9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c @@ -0,0 +1,73 @@ +/* { dg-do compile { target aarch64-*-* } } */ +/* { dg-additional-options "-fsplit-ldp-stp" } */ +/* + * Tests are: + * Patterns where LDP insns should NOT be split + * */ + +int __RTL (startwith ("split_complex_instructions")) +simple_ldp_after_store () +{ +(function "simple_ldp_after_store" + (insn-chain + (block 2 + (edge-from entry (flags "FALLTHRU")) + (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK) + (cinsn 228 (set (reg/i:DI sp) + (reg/i:DI x0))) + (cinsn 101 (set (mem/c:DI + (plus:DI (reg/f:DI sp) + (const_int 32))[1 S4 A32])(reg:DI x0))) + (cinsn 10 (parallel 
[ + (set (reg:DI x29) + (mem:DI (plus:DI (reg/f:DI sp) (const_int 8)) [1 S4 A32])) + (set (reg:DI x30) + (mem:DI (plus:DI (reg/f:DI sp) + (const_int 16)) [1 S4 A32]))])) + (cinsn 11 (use (reg/i:DI sp))) + (cinsn 12 (use (reg/i:DI cc))) + (cinsn 13 (use (reg/i:DI x29))) + (cinsn 14 (use (reg/i:DI x30))) + (cinsn 15 (use (reg/i:DI x0))) + (edge-to exit (flags "FALLTHRU")) + ) ;; block 2 + ) ;; insn-chain +) ;; function "simple_ldp_after_store" +} + +int __RTL (startwith ("split_complex_instructions")) +ldp_after_store_in_different_bb () +{ +(function "ldp_after_store_in_different_bb" + (insn-chain + (block 2 + (edge-from entry (flags "FALLTHRU")) + (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK) + (cinsn 228 (set (reg/i:DI sp) + (reg/i:DI x0))) + (cinsn 101 (set (mem/c:DI + (plus:DI (reg/f:DI sp) + (const_int 32))[1 S4 A32])(reg:DI x0))) + (edge-to 3 (flags "FALLTHRU")) + ) ;; block 2 + (block 3 + (cinsn 10 (parallel [ + (set (reg:DI x29) + (mem:DI (plus:DI (reg/f:DI sp) (const_int 8)) [1 S4 A32])) + (set (reg:DI x30) + (mem:DI (plus:DI (reg/f:DI sp) + (const_int 16)) [1 S4 A32]))])) + (edge-from 2 (flags "FALLTHRU")) + (cinsn 11 (use (reg/i:DI sp))) + (cinsn 12 (use (reg/i:DI cc))) + (cinsn 13 (use (reg/i:DI x29))) + (cinsn 14 (use (reg/i:DI x30))) + (cinsn 15 (use (reg/i:DI x0))) + (edge-to exit (flags "FALLTHRU")) + ) ;; block 3 + ) ;; insn-chain +) ;; function "ldp_after_store_in_different_bb" +} + +/* Verify that the output code contains exactly 2 ldp. 
*/ +/* { dg-final { scan-assembler-times {ldp\t} 2 } } */ \ No newline at end of file diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c new file mode 100644 index 00000000000..4af69d07d3e --- /dev/null +++ b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c @@ -0,0 +1,73 @@ +/* { dg-do compile { target aarch64-*-* } } */ +/* { dg-additional-options "-fsplit-ldp-stp" } */ +/* + * Tests are: + * Patterns where LDP insns should be split + * */ + +int __RTL (startwith ("split_complex_instructions")) +simple_ldp_after_store () +{ +(function "simple_ldp_after_store" + (insn-chain + (block 2 + (edge-from entry (flags "FALLTHRU")) + (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK) + (cinsn 228 (set (reg/i:DI sp) + (reg/i:DI x0))) + (cinsn 101 (set (mem/c:DI + (plus:DI (reg/f:DI sp) + (const_int 8))[1 S4 A32])(reg:DI x0))) + (cinsn 10 (parallel [ + (set (reg:DI x29) + (mem:DI (plus:DI (reg/f:DI sp) (const_int 8)) [1 S4 A32])) + (set (reg:DI x30) + (mem:DI (plus:DI (reg/f:DI sp) + (const_int 16)) [1 S4 A32]))])) + (cinsn 11 (use (reg/i:DI sp))) + (cinsn 12 (use (reg/i:DI cc))) + (cinsn 13 (use (reg/i:DI x29))) + (cinsn 14 (use (reg/i:DI x30))) + (cinsn 15 (use (reg/i:DI x0))) + (edge-to exit (flags "FALLTHRU")) + ) ;; block 2 + ) ;; insn-chain +) ;; function "simple_ldp_after_store" +} + +int __RTL (startwith ("split_complex_instructions")) +ldp_after_store_in_different_bb () +{ +(function "ldp_after_store_in_different_bb" + (insn-chain + (block 2 + (edge-from entry (flags "FALLTHRU")) + (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK) + (cinsn 228 (set (reg/i:DI sp) + (reg/i:DI x0))) + (cinsn 101 (set (mem/c:DI + (plus:DI (reg/f:DI sp) + (const_int 8))[1 S4 A32])(reg:DI x0))) + (edge-to 3 (flags "FALLTHRU")) + ) ;; block 2 + (block 3 + (cinsn 10 (parallel [ + (set (reg:DI x29) + (mem:DI (plus:DI (reg/f:DI sp) (const_int 8)) [1 S4 A32])) + (set (reg:DI x30) + (mem:DI (plus:DI (reg/f:DI sp) + (const_int 16)) [1 S4 A32]))])) + 
(edge-from 2 (flags "FALLTHRU")) + (cinsn 11 (use (reg/i:DI sp))) + (cinsn 12 (use (reg/i:DI cc))) + (cinsn 13 (use (reg/i:DI x29))) + (cinsn 14 (use (reg/i:DI x30))) + (cinsn 15 (use (reg/i:DI x0))) + (edge-to exit (flags "FALLTHRU")) + ) ;; block 3 + ) ;; insn-chain +) ;; function "ldp_after_store_in_different_bb" +} + +/* Verify that the output code doesn't contain ldp. */ +/* { dg-final { scan-assembler-not {ldp\t} } } */ \ No newline at end of file diff --git a/gcc/timevar.def b/gcc/timevar.def index f3b0d3f9034..fc74b88618d 100644 --- a/gcc/timevar.def +++ b/gcc/timevar.def @@ -275,6 +275,7 @@ DEFTIMEVAR (TV_RELOAD_CSE_REGS , "reload CSE regs") DEFTIMEVAR (TV_GCSE_AFTER_RELOAD , "load CSE after reload") DEFTIMEVAR (TV_REE , "ree") DEFTIMEVAR (TV_THREAD_PROLOGUE_AND_EPILOGUE, "thread pro- & epilogue") +DEFTIMEVAR (TV_SPLIT_CMP_INS , "split complex instructions") DEFTIMEVAR (TV_IFCVT2 , "if-conversion 2") DEFTIMEVAR (TV_SPLIT_PATHS , "split paths") DEFTIMEVAR (TV_COMBINE_STACK_ADJUST , "combine stack adjustments") diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index d73ab825bf2..17ad08ac35a 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -596,6 +596,7 @@ extern rtl_opt_pass *make_pass_gcse2 (gcc::context *ctxt); extern rtl_opt_pass *make_pass_split_after_reload (gcc::context *ctxt); extern rtl_opt_pass *make_pass_thread_prologue_and_epilogue (gcc::context *ctxt); +extern rtl_opt_pass *make_pass_split_complex_instructions (gcc::context *ctxt); extern rtl_opt_pass *make_pass_stack_adjustments (gcc::context *ctxt); extern rtl_opt_pass *make_pass_sched_fusion (gcc::context *ctxt); extern rtl_opt_pass *make_pass_peephole2 (gcc::context *ctxt); -- Gitee From 112c913f474bd38a62e0f468953ed9e618aef33d Mon Sep 17 00:00:00 2001 From: Agrachev Andrey WX1228450 Date: Wed, 23 Aug 2023 15:02:33 +0300 Subject: [PATCH 6/7] fix for split-ldp optimization --- gcc/config/aarch64/aarch64.c | 42 +++++++++++++++++++----------------- gcc/sched-rgn.c | 29 
+++++++++++-------------- 2 files changed, 35 insertions(+), 36 deletions(-) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 52a78d2b18b..6d7644e0e18 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -23827,19 +23827,21 @@ aarch64_run_selftests (void) #endif /* #if CHECKING_P */ +/* TODO: refuse to use ranges intead of full list of an instruction codes. */ + bool is_aarch64_ldp_insn (int icode) { - if (icode >= CODE_FOR_load_pair_sw_sisi - && icode <= CODE_FOR_load_pair_dw_tftf - || icode >= CODE_FOR_loadwb_pairsi_si - && icode <= CODE_FOR_loadwb_pairtf_di - || icode >= CODE_FOR_load_pairv8qiv8qi - && icode <= CODE_FOR_load_pairdfdf - || icode >= CODE_FOR_load_pairv16qiv16qi - && icode <= CODE_FOR_load_pairv8bfv2df - || icode >= CODE_FOR_load_pair_lanesv8qi - && icode <= CODE_FOR_load_pair_lanesdf) + if ((icode >= CODE_FOR_load_pair_sw_sisi + && icode <= CODE_FOR_load_pair_dw_tftf) + || (icode >= CODE_FOR_loadwb_pairsi_si + && icode <= CODE_FOR_loadwb_pairtf_di) + || (icode >= CODE_FOR_load_pairv8qiv8qi + && icode <= CODE_FOR_load_pairdfdf) + || (icode >= CODE_FOR_load_pairv16qiv16qi + && icode <= CODE_FOR_load_pairv8bfv2df) + || (icode >= CODE_FOR_load_pair_lanesv8qi + && icode <= CODE_FOR_load_pair_lanesdf)) return true; return false; } @@ -23847,16 +23849,16 @@ is_aarch64_ldp_insn (int icode) bool is_aarch64_stp_insn (int icode) { - if (icode >= CODE_FOR_store_pair_sw_sisi - && icode <= CODE_FOR_store_pair_dw_tftf - || icode >= CODE_FOR_storewb_pairsi_si - && icode <= CODE_FOR_storewb_pairtf_di - || icode >= CODE_FOR_vec_store_pairv8qiv8qi - && icode <= CODE_FOR_vec_store_pairdfdf - || icode >= CODE_FOR_vec_store_pairv16qiv16qi - && icode <= CODE_FOR_vec_store_pairv8bfv2df - || icode >= CODE_FOR_store_pair_lanesv8qi - && icode <= CODE_FOR_store_pair_lanesdf) + if ((icode >= CODE_FOR_store_pair_sw_sisi + && icode <= CODE_FOR_store_pair_dw_tftf) + || (icode >= CODE_FOR_storewb_pairsi_si + && icode <= 
CODE_FOR_storewb_pairtf_di) + || (icode >= CODE_FOR_vec_store_pairv8qiv8qi + && icode <= CODE_FOR_vec_store_pairdfdf) + || (icode >= CODE_FOR_vec_store_pairv16qiv16qi + && icode <= CODE_FOR_vec_store_pairv8bfv2df) + || (icode >= CODE_FOR_store_pair_lanesv8qi + && icode <= CODE_FOR_store_pair_lanesdf)) return true; return false; } diff --git a/gcc/sched-rgn.c b/gcc/sched-rgn.c index 50375dbfd0a..a229d2196ae 100644 --- a/gcc/sched-rgn.c +++ b/gcc/sched-rgn.c @@ -3958,8 +3958,7 @@ make_pass_sched_fusion (gcc::context *ctxt) return new pass_sched_fusion (ctxt); } -namespace -{ +namespace { /* Def-use analisys special functions implementation. */ @@ -4346,6 +4345,16 @@ pass_split_complex_instructions::split_ldp_stp (rtx_insn *insn) int number_of_sub_insns = XVECLEN (pat, 0); start_sequence (); + + bool post_index = true; + + rtx_insn offset_insn; + rtx_insn mem_insn_1; + rtx_insn mem_insn_2; + + int offset_insn_code; + int mem_insn_1_code = -1; + int mem_insn_2_code = -1; if (number_of_sub_insns == 2) { @@ -4359,19 +4368,7 @@ pass_split_complex_instructions::split_ldp_stp (rtx_insn *insn) INSN_CODE (emited_insn) = sub_insn_code; } } - - bool post_index = true; - - rtx_insn offset_insn; - rtx_insn mem_insn_1; - rtx_insn mem_insn_2; - - int offset_insn_code; - int mem_insn_1_code = -1; - int mem_insn_2_code = -1; - - /* If load/store pair insn has an offset. */ - if (number_of_sub_insns == 3) + else if (number_of_sub_insns == 3) /* If load/store pair insn has an offset. 
*/ { int offset = 0; int arith_operation = UNDEFINED; @@ -4436,7 +4433,7 @@ pass_split_complex_instructions::split_ldp_stp (rtx_insn *insn) && "Uninitialized memory insns"); rtx_insn *emited_offset_insn; - if (post_index) + if (!post_index) { emited_offset_insn = emit_insn (PATTERN (&offset_insn)); INSN_CODE (emited_offset_insn) = offset_insn_code; -- Gitee From bb87154e2d9baad7e5165e97b8b488fe4357430d Mon Sep 17 00:00:00 2001 From: Chernonog Vyacheslav 00812786 Date: Sun, 3 Sep 2023 10:38:52 +0300 Subject: [PATCH 7/7] Fix cornercase ldp bug for same src and dest regisers change splitted instruction order for cases when the source register matches the destination one --- gcc/sched-rgn.c | 231 +++++++++++------- .../rtl/aarch64/test-ldp-split-rearrange.c | 40 +++ 2 files changed, 183 insertions(+), 88 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c diff --git a/gcc/sched-rgn.c b/gcc/sched-rgn.c index a229d2196ae..da3967413d5 100644 --- a/gcc/sched-rgn.c +++ b/gcc/sched-rgn.c @@ -4054,13 +4054,15 @@ private: void split_complex_insn (rtx_insn *insn); void split_ldp_ti (rtx_insn *insn); + void split_ldp_with_offset (rtx_insn *ldp_insn); + void split_simple_ldp (rtx_insn *ldp_insn); void split_ldp_stp (rtx_insn *insn); complex_instructions_t get_insn_type (rtx_insn *insn); basic_block bb; rtx_insn *insn; std::set dependent_stores_candidates; - std::vector ldp_to_split_list; + std::set ldp_to_split_list; complex_instructions_t complex_insn_type = UNDEFINED; bool is_store_insn (rtx_insn *insn); @@ -4098,12 +4100,12 @@ public: if (is_ldp_dependent_on_store (insn, bb)) { - ldp_to_split_list.push_back (insn); + ldp_to_split_list.insert (insn); } } } - for (std::vector::iterator i = ldp_to_split_list.begin (); + for (std::set::iterator i = ldp_to_split_list.begin (); i != ldp_to_split_list.end (); ++i) split_complex_insn (*i); @@ -4204,7 +4206,6 @@ pass_split_complex_instructions::find_dependent_stores_candidates ( { if 
(DF_REF_CLASS (use->ref) == DF_REF_REGULAR && is_store_insn (DF_REF_INSN (use->ref))) - dependent_stores_candidates.insert (DF_REF_INSN (use->ref)); } } @@ -4338,14 +4339,43 @@ pass_split_complex_instructions::get_insn_offset ( } void -pass_split_complex_instructions::split_ldp_stp (rtx_insn *insn) +pass_split_complex_instructions::split_simple_ldp (rtx_insn *ldp_insn) { - rtx_insn *prev_insn = PREV_INSN (insn); - rtx pat = PATTERN (insn); - int number_of_sub_insns = XVECLEN (pat, 0); + rtx pat = PATTERN (ldp_insn); - start_sequence (); - + rtx_insn *mem_insn_1 = make_insn_raw (copy_rtx (XVECEXP (pat, 0, 0))); + rtx_insn *mem_insn_2 = make_insn_raw (copy_rtx (XVECEXP (pat, 0, 1))); + + int dest_regno = REGNO (SET_DEST (PATTERN (mem_insn_1))); + int src_regno; + + rtx_def *srs_reg_insn = XEXP (SET_SRC (PATTERN (mem_insn_1)), 0); + + if (GET_CODE (srs_reg_insn) == REG) + src_regno = REGNO (srs_reg_insn); + else + src_regno = REGNO (XEXP (srs_reg_insn, 0)); + + rtx_insn *emited_insn_1, *emited_insn_2; + + /* in cases like ldp r1,r2,[r1] we emit ldr r2,[r1] first. 
*/ + if (src_regno == dest_regno) + std::swap (mem_insn_1, mem_insn_2); + + emited_insn_1 = emit_insn (PATTERN (mem_insn_1)); + emited_insn_2 = emit_insn (PATTERN (mem_insn_2)); + + int sub_insn_1_code = recog (PATTERN (mem_insn_1), mem_insn_1, 0); + int sub_insn_2_code = recog (PATTERN (mem_insn_2), mem_insn_2, 0); + + INSN_CODE (emited_insn_1) = sub_insn_1_code; + INSN_CODE (emited_insn_2) = sub_insn_2_code; +} + +void +pass_split_complex_instructions::split_ldp_with_offset (rtx_insn *ldp_insn) +{ + rtx pat = PATTERN (ldp_insn); bool post_index = true; rtx_insn offset_insn; @@ -4356,104 +4386,117 @@ pass_split_complex_instructions::split_ldp_stp (rtx_insn *insn) int mem_insn_1_code = -1; int mem_insn_2_code = -1; - if (number_of_sub_insns == 2) - { - for (int i = 0; i < number_of_sub_insns; i++) - { - rtx_def *sub_insn = XVECEXP (pat, 0, i); - rtx_insn *copy_of_sub_insn = make_insn_raw (copy_rtx (sub_insn)); - int sub_insn_code - = recog (PATTERN (copy_of_sub_insn), copy_of_sub_insn, 0); - rtx_insn *emited_insn = emit_insn (PATTERN (copy_of_sub_insn)); - INSN_CODE (emited_insn) = sub_insn_code; - } - } - else if (number_of_sub_insns == 3) /* If load/store pair insn has an offset. */ + int offset = 0; + int arith_operation = UNDEFINED; + + for (int i = 0; i < 3; i++) { - int offset = 0; - int arith_operation = UNDEFINED; + rtx_def *sub_insn = XVECEXP (pat, 0, i); + rtx_insn *copy_of_sub_insn = make_insn_raw (copy_rtx (sub_insn)); + int sub_insn_code + = recog (PATTERN (copy_of_sub_insn), copy_of_sub_insn, 0); - for (int i = 0; i < number_of_sub_insns; i++) + /* If sub_insn is offset related. */ + if (GET_RTX_CLASS (sub_insn_code) == RTX_UNARY) { - rtx_def *sub_insn = XVECEXP (pat, 0, i); - rtx_insn *copy_of_sub_insn = make_insn_raw (copy_rtx (sub_insn)); - int sub_insn_code - = recog (PATTERN (copy_of_sub_insn), copy_of_sub_insn, 0); - - /* If sub_insn is offset related. 
*/ - if (GET_RTX_CLASS (sub_insn_code) == RTX_UNARY) - { - offset_insn = *copy_of_sub_insn; - offset_insn_code = sub_insn_code; - gcc_assert (i == 0 - && "Offset related insn must be the first " - "element of a parallel insn vector"); + offset_insn = *copy_of_sub_insn; + offset_insn_code = sub_insn_code; + gcc_assert (i == 0 + && "Offset related insn must be the first " + "element of a parallel insn vector"); - offset = get_insn_offset (insn, LDP, &arith_operation); - } - else + offset = get_insn_offset (ldp_insn, LDP, &arith_operation); + } + else + { + if (GET_CODE (XEXP (PATTERN (copy_of_sub_insn), 0)) != REG) { - if (GET_CODE (XEXP (PATTERN (copy_of_sub_insn), 0)) != REG) + rtx_def *&offset_expr + = XEXP (XEXP (XEXP (PATTERN (copy_of_sub_insn), 0), 0), 1); + if (GET_CODE (offset_expr) == CONST_INT) { - rtx_def *&offset_expr = XEXP ( - XEXP (XEXP (PATTERN (copy_of_sub_insn), 0), 0), 1); - if (GET_CODE (offset_expr) == CONST_INT) - { - int local_offset = XINT (offset_expr, 0); - offset = (arith_operation == PLUS ? offset : -offset); + int local_offset = XINT (offset_expr, 0); + offset = (arith_operation == PLUS ? 
offset : -offset); - offset_expr = GEN_INT (local_offset + offset); + offset_expr = GEN_INT (local_offset + offset); - gcc_assert ( - (arith_operation == MINUS || arith_operation == PLUS) - && "Unexpected arithmetic opearion in offset realted " - "sub_insn"); + gcc_assert ( + (arith_operation == MINUS || arith_operation == PLUS) + && "Unexpected arithmetic opearion in offset realted " + "sub_insn"); - if (i == 1) - post_index = false; - } - else - { - post_index = true; - } + if (i == 1) + post_index = false; + } + else + { + post_index = true; } - } - if (i == 1) - { - mem_insn_1 = *copy_of_sub_insn; - mem_insn_1_code = sub_insn_code; - } - if (i == 2) - { - mem_insn_2 = *copy_of_sub_insn; - mem_insn_2_code = sub_insn_code; } } - gcc_assert (mem_insn_1_code != -1 && mem_insn_2_code != -1 - && "Uninitialized memory insns"); - - rtx_insn *emited_offset_insn; - if (!post_index) + if (i == 1) { - emited_offset_insn = emit_insn (PATTERN (&offset_insn)); - INSN_CODE (emited_offset_insn) = offset_insn_code; + mem_insn_1 = *copy_of_sub_insn; + mem_insn_1_code = sub_insn_code; } - - rtx_insn *emited_mem_insn_1 = emit_insn (PATTERN (&mem_insn_1)); - rtx_insn *emited_mem_insn_2 = emit_insn (PATTERN (&mem_insn_2)); - INSN_CODE (emited_mem_insn_1) = mem_insn_1_code; - INSN_CODE (emited_mem_insn_2) = mem_insn_2_code; - - if (post_index) + if (i == 2) { - emited_offset_insn = emit_insn (PATTERN (&offset_insn)); - INSN_CODE (emited_offset_insn) = offset_insn_code; + mem_insn_2 = *copy_of_sub_insn; + mem_insn_2_code = sub_insn_code; } } + gcc_assert (mem_insn_1_code != -1 && mem_insn_2_code != -1 + && "Uninitialized memory insns"); + + int dest_regno = REGNO (SET_DEST (PATTERN (&mem_insn_1))); + int src_regno; + + rtx_def *srs_reg_insn = XEXP (SET_SRC (PATTERN (&mem_insn_1)), 0); + + if (GET_CODE (srs_reg_insn) == REG) + src_regno = REGNO (srs_reg_insn); else + src_regno = REGNO (XEXP (srs_reg_insn, 0)); + + /* Don't split such weird LDP. 
*/ + if (src_regno == dest_regno) + return; + + rtx_insn *emited_offset_insn; + if (!post_index) + { + emited_offset_insn = emit_insn (PATTERN (&offset_insn)); + INSN_CODE (emited_offset_insn) = offset_insn_code; + } + + rtx_insn *emited_insn_1 = emit_insn (PATTERN (&mem_insn_1)); + rtx_insn *emited_insn_2 = emit_insn (PATTERN (&mem_insn_2)); + + + INSN_CODE (emited_insn_1) = mem_insn_1_code; + INSN_CODE (emited_insn_2) = mem_insn_2_code; + + if (post_index) { - // gcc_assert (false && "Broken complex insn vector"); + emited_offset_insn = emit_insn (PATTERN (&offset_insn)); + INSN_CODE (emited_offset_insn) = offset_insn_code; } +} + +void +pass_split_complex_instructions::split_ldp_stp (rtx_insn *insn) +{ + rtx_insn *prev_insn = PREV_INSN (insn); + int number_of_sub_insns = XVECLEN (PATTERN (insn), 0); + + start_sequence (); + + if (number_of_sub_insns == 2) + split_simple_ldp (insn); + else if (number_of_sub_insns == 3) + split_ldp_with_offset (insn); + else + gcc_assert (false && "Broken complex insn vector"); rtx_insn *seq = get_insns (); unshare_all_rtl_in_chain (seq); @@ -4501,6 +4544,18 @@ pass_split_complex_instructions::split_ldp_ti (rtx_insn *insn) start_sequence (); + int src_regno; + rtx_def *srs_reg_insn = XEXP (XEXP (PATTERN (load_insn_1), 1), 0); + + if (GET_CODE (srs_reg_insn) == REG) + src_regno = REGNO (srs_reg_insn); + else + src_regno = REGNO (XEXP (srs_reg_insn, 0)); + + /* in cases like ldp r1,r2,[r1] we emit ldr r2,[r1] first. 
*/ + if (src_regno == reg_no_1) + std::swap (load_insn_1, load_insn_2); + rtx_insn *emited_load_insn_1 = emit_insn (PATTERN (load_insn_1)); rtx_insn *emited_load_insn_2 = emit_insn (PATTERN (load_insn_2)); diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c new file mode 100644 index 00000000000..8c035c3e10e --- /dev/null +++ b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c @@ -0,0 +1,40 @@ +/* { dg-do compile { target aarch64-*-* } } */ +/* { dg-additional-options "-fsplit-ldp-stp" } */ +/* + * Test is: + * Pattern where LDP insns should be split with rearrangement in order + * to deal with data dependecy betwen subinstruction. + * */ + +int __RTL (startwith ("split_complex_instructions")) +simple_ldp_after_store () +{ +(function "ldp_equal_registers" + (insn-chain + (block 2 + (edge-from entry (flags "FALLTHRU")) + (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK) + (cinsn 228 (set (reg/i:DI x1) + (reg/i:DI x0))) + (cinsn 101 (set (mem/c:DI + (plus:DI (reg/f:DI x1) + (const_int 8))[1 S4 A32])(reg:DI x0))) + (cinsn 10 (parallel [ + (set (reg:DI x1) + (mem:DI (plus:DI (reg/f:DI x1) (const_int 8)) [1 S4 A32])) + (set (reg:DI x2) + (mem:DI (plus:DI (reg/f:DI x1) + (const_int 16)) [1 S4 A32]))])) + (cinsn 11 (use (reg/i:DI sp))) + (cinsn 12 (use (reg/i:DI cc))) + (cinsn 13 (use (reg/i:DI x0))) + (cinsn 14 (use (reg/i:DI x1))) + (cinsn 15 (use (reg/i:DI x2))) + (edge-to exit (flags "FALLTHRU")) + ) ;; block 2 + ) ;; insn-chain +) ;; function "ldp_equal_registers" +} + +/* Verify that the output code doesn't contain ldp. */ +/* { dg-final { scan-assembler-times ".*ldr.*x2.*x1,.*16.*ldr.*x1.*x1.*8" 1 } } */ \ No newline at end of file -- Gitee