From f64bd58071630989f75900023d72de5d715a6c44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=91=E6=99=A8=E5=8D=89?= Date: Tue, 28 May 2024 11:44:49 +0800 Subject: [PATCH] Sync patch from openeuler/gcc. --- 0142-crc-loop-optimization-initial.patch | 2332 ++++++++ ...y-if-conversion-of-simple-arithmetic.patch | 109 + ...low-matching-uaddsub-overflow-for-wi.patch | 236 + 0145-Match-double-sized-mul-pattern.patch | 488 ++ ...rc32-Optimization-in-Gzip-For-crc32-.patch | 2354 ++++++++ ...nd-correct-costs-for-cmlt-generation.patch | 194 + 0148-Introduce-RTL-ifcvt-enhancements.patch | 502 ++ ...e-check-for-pointer-aliasing-during-.patch | 239 + ...ropagation-of-permutations-in-fwprop.patch | 1050 ++++ ...Fix-bugs-and-add-tests-for-RTL-ifcvt.patch | 381 ++ 0152-Add-LLC-Allocation-Pass.patch | 4905 +++++++++++++++++ 0153-LLC-add-extending-outer-loop.patch | 1285 +++++ ...-null-on-pointers-and-solving-coding.patch | 1772 ++++++ 0155-Add-maxmin-and-uzp1-uzp2-combining.patch | 477 ++ 0156-add-icp-optimization.patch | 2387 ++++++++ ...-Add-split-complex-instructions-pass.patch | 1241 +++++ ...-Implement-IPA-prefetch-optimization.patch | 2072 +++++++ 0159-Implement-AES-pattern-matching.patch | 233 + 0160-AES-Add-lost-files.patch | 3746 +++++++++++++ ...st-ftree-fold-phiopt-option-in-tests.patch | 51 + ...ee-dominance-info-before-cleanup_cfg.patch | 25 + ...-the-problem-of-insufficient-CRC-tab.patch | 42 + ...ix-some-bugs-and-remove-variable-pre.patch | 924 ++++ ...-change-def-selection-logic-in-noce_.patch | 30 + ...-Bugfix-Check-that-the-arithmetic-op.patch | 57 + ...-Bugfix-Fix-shll-shll2-patterns-for-.patch | 62 + ...ugfix-Terminate-kernel-filtering-for.patch | 175 + 0169-Struct-Reorg-Fix-several-bugs.patch | 183 + 0170-DFE-Add-escape-check.patch | 104 + ...-Add-ftree-fold-phiopt-option-to-5-t.patch | 80 + ...minmax-Move-minmax-pattern-to-gimple.patch | 323 ++ 0173-IPA-Fix-test-completion-1.c.patch | 24 + ...-checked-build-and-comments-from-rev.patch | 71 + ...tending-and-refactoring-of-pass_spli.patch | 1426 +++++ ...-ICP-src-openEuler-gcc-I8PYBF-I8PYLL.patch | 61 + 0177-Fix-sqlite-build.patch | 168 + 0178-Fix-freetype-build.patch | 52 + ...-to-rename-def-in-the-last-instructi.patch | 29 + ...zation-level-requirement-to-the-gate.patch | 25 + 0181-Fix-issue-I8QD9H.patch | 115 + ...bugs-in-ICP-src-openEuler-gcc-I8RKFJ.patch | 47 + ...fail-in-ICP-src-openEuler-gcc-I8RP4H.patch | 26 + ...PA-prefetch-src-openEuler-gcc-I8RURA.patch | 45 + ...PA-prefetch-src-openEuler-gcc-I8RV7T.patch | 26 + ...-the-problem-of-insufficient-CRC-tab.patch | 26 + 0187-Add-IPA-prefetch-test.patch | 1862 +++++++ ...-for-src-openEuler-gcc-I90P7M-I91CZ8.patch | 32 + 0189-Add-hip11-CPU-pipeline-scheduling.patch | 739 +++ gcc.spec | 104 +- 49 files changed, 32936 insertions(+), 1 deletion(-) create mode 100644 0142-crc-loop-optimization-initial.patch create mode 100644 0143-Perform-early-if-conversion-of-simple-arithmetic.patch create mode 100644 0144-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch create mode 100644 0145-Match-double-sized-mul-pattern.patch create mode 100644 0146-LOOP-CRC32-Add-Crc32-Optimization-in-Gzip-For-crc32-.patch create mode 100644 0147-add-insn-defs-and-correct-costs-for-cmlt-generation.patch create mode 100644 0148-Introduce-RTL-ifcvt-enhancements.patch create mode 100644 0149-Add-more-flexible-check-for-pointer-aliasing-during-.patch create mode 100644 0150-Implement-propagation-of-permutations-in-fwprop.patch create mode 100644 0151-Fix-bugs-and-add-tests-for-RTL-ifcvt.patch create mode 
100644 0152-Add-LLC-Allocation-Pass.patch create mode 100644 0153-LLC-add-extending-outer-loop.patch create mode 100644 0154-Loop-CRC32-Judge-null-on-pointers-and-solving-coding.patch create mode 100644 0155-Add-maxmin-and-uzp1-uzp2-combining.patch create mode 100644 0156-add-icp-optimization.patch create mode 100644 0157-Add-split-complex-instructions-pass.patch create mode 100644 0158-Implement-IPA-prefetch-optimization.patch create mode 100644 0159-Implement-AES-pattern-matching.patch create mode 100644 0160-AES-Add-lost-files.patch create mode 100644 0161-Fix-lost-ftree-fold-phiopt-option-in-tests.patch create mode 100644 0162-rtl-ifcvt-free-dominance-info-before-cleanup_cfg.patch create mode 100644 0163-Loop-CRC-Solving-the-problem-of-insufficient-CRC-tab.patch create mode 100644 0164-LLC-Allocation-Fix-some-bugs-and-remove-variable-pre.patch create mode 100644 0165-rtl-ifcvt-BugFix-change-def-selection-logic-in-noce_.patch create mode 100644 0166-perm-propagation-Bugfix-Check-that-the-arithmetic-op.patch create mode 100644 0167-perm-propagation-Bugfix-Fix-shll-shll2-patterns-for-.patch create mode 100644 0168-LLC-Allocation-Bugfix-Terminate-kernel-filtering-for.patch create mode 100644 0169-Struct-Reorg-Fix-several-bugs.patch create mode 100644 0170-DFE-Add-escape-check.patch create mode 100644 0171-phiopt-testsuite-Add-ftree-fold-phiopt-option-to-5-t.patch create mode 100644 0172-minmax-Move-minmax-pattern-to-gimple.patch create mode 100644 0173-IPA-Fix-test-completion-1.c.patch create mode 100644 0174-IPA-Fix-fails-on-checked-build-and-comments-from-rev.patch create mode 100644 0175-split-ldp-stp-Extending-and-refactoring-of-pass_spli.patch create mode 100644 0176-Fix-bugs-in-ICP-src-openEuler-gcc-I8PYBF-I8PYLL.patch create mode 100644 0177-Fix-sqlite-build.patch create mode 100644 0178-Fix-freetype-build.patch create mode 100644 0179-rtl-ifcvt-refuse-to-rename-def-in-the-last-instructi.patch create mode 100644 0180-add-optimization-level-requirement-to-the-gate.patch create mode 100644 0181-Fix-issue-I8QD9H.patch create mode 100644 0182-Fix-bugs-in-ICP-src-openEuler-gcc-I8RKFJ.patch create mode 100644 0183-Fix-fail-in-ICP-src-openEuler-gcc-I8RP4H.patch create mode 100644 0184-Fix-fail-in-IPA-prefetch-src-openEuler-gcc-I8RURA.patch create mode 100644 0185-Fix-fail-in-IPA-prefetch-src-openEuler-gcc-I8RV7T.patch create mode 100644 0186-Loop-CRC-Solving-the-problem-of-insufficient-CRC-tab.patch create mode 100644 0187-Add-IPA-prefetch-test.patch create mode 100644 0188-Fix-fails-in-ICP-for-src-openEuler-gcc-I90P7M-I91CZ8.patch create mode 100644 0189-Add-hip11-CPU-pipeline-scheduling.patch diff --git a/0142-crc-loop-optimization-initial.patch b/0142-crc-loop-optimization-initial.patch new file mode 100644 index 0000000..61d5ae5 --- /dev/null +++ b/0142-crc-loop-optimization-initial.patch @@ -0,0 +1,2332 @@ +From 2716abb1a4de2a4edf06d2f1877d9b76a88e5807 Mon Sep 17 00:00:00 2001 +From: bule +Date: Thu, 15 Dec 2022 14:34:16 +0800 +Subject: [PATCH 05/33] crc loop optimization initial + +--- + gcc/Makefile.in | 1 + + gcc/common.opt | 4 + + gcc/doc/invoke.texi | 6 +- + gcc/match.pd | 169 +++++ + gcc/passes.def | 1 + + gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c | 85 +++ + .../tree-ssa/loop-crc-1.c.042t.loop_crc | 90 +++ + gcc/testsuite/gcc.dg/tree-ssa/loop-crc-2.c | 88 +++ + gcc/testsuite/gcc.dg/tree-ssa/loop-crc-3.c | 85 +++ + gcc/testsuite/gcc.dg/tree-ssa/loop-crc-4.c | 89 +++ + .../tree-ssa/loop-crc-4.c.042t.loop_crc | 0 + .../loop-crc-calculation-check-fail.c | 156 +++++ + 
...crc-calculation-check-fail.c.042t.loop_crc | 64 ++
+ .../loop-crc-calculation-check-fail.s | 329 +++++++++
+ .../gcc.dg/tree-ssa/loop-crc-loop-form-fail.c | 111 +++
+ .../gcc.dg/tree-ssa/loop-crc-sucess.c | 84 +++
+ .../tree-ssa/loop-crc-table-check-fail.c | 113 +++
+ gcc/timevar.def | 1 +
+ gcc/tree-pass.h | 1 +
+ gcc/tree-ssa-loop-crc.c | 644 ++++++++++++++++++
+ 20 files changed, 2120 insertions(+), 1 deletion(-)
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c.042t.loop_crc
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-2.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-3.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-4.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-4.c.042t.loop_crc
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.c.042t.loop_crc
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.s
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-sucess.c
+ create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-table-check-fail.c
+ create mode 100644 gcc/tree-ssa-loop-crc.c
+
+diff --git a/gcc/Makefile.in b/gcc/Makefile.in
+index 3f06b8907..2a59acfbe 100644
+--- a/gcc/Makefile.in
++++ b/gcc/Makefile.in
+@@ -1592,6 +1592,7 @@ OBJS = \
+ tree-ssa-loop-manip.o \
+ tree-ssa-loop-niter.o \
+ tree-ssa-loop-array-widen-compare.o \
++ tree-ssa-loop-crc.o \
+ tree-ssa-loop-prefetch.o \
+ tree-ssa-loop-split.o \
+ tree-ssa-loop-unswitch.o \
+diff --git a/gcc/common.opt b/gcc/common.opt
+index 6f0ed7cea..a286a2628 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -1065,6 +1065,10 @@ Common Report Var(flag_array_widen_compare) Optimization
+ Extends types for pointers to arrays to improve array comparsion performance.
+ In some extreme situations this may result in unsafe behavior.
+
++floop-crc
++Common Report Var(flag_loop_crc) Optimization
++Do the loop CRC conversion.
++
+ fauto-inc-dec
+ Common Report Var(flag_auto_inc_dec) Init(1) Optimization
+ Generate auto-inc/dec instructions.
+diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
+index 7498758b0..52018617a 100644
+--- a/gcc/doc/invoke.texi
++++ b/gcc/doc/invoke.texi
+@@ -460,7 +460,7 @@ Objective-C and Objective-C++ Dialects}.
+ -fno-allocation-dce -fallow-store-data-races @gol
+ -fassociative-math -fauto-profile -fauto-profile[=@var{path}] @gol
+ -farray-widen-compare -fauto-inc-dec -fbranch-probabilities @gol
+--fcaller-saves @gol
++-fcaller-saves -floop-crc @gol
+ -fcombine-stack-adjustments -fconserve-stack @gol
+ -fcompare-elim -fcprop-registers -fcrossjumping @gol
+ -fcse-follow-jumps -fcse-skip-blocks -fcx-fortran-rules @gol
+@@ -9722,6 +9722,10 @@ extreme situations this may result in unsafe behavior.
+ This option may generate better or worse code; results are highly dependent
+ on the structure of loops within the source code.
+
++@item -floop-crc
++@opindex floop-crc
++Do the loop CRC conversion.
++
+ @item -fdce
+ @opindex fdce
+ Perform dead code elimination (DCE) on RTL@.
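What -floop-crc looks for, concretely, is the byte-at-a-time, table-driven CRC32 kernel from gzip's updcrc(), as exercised by the new tests below. A minimal standalone sketch of that kernel (the function name is illustrative; crc_32_tab is the standard 256-entry CRC-32 table):

/* One table lookup plus a shift and an XOR per input byte: the loop
   body the new pass tries to recognize (cf. the updcrc tests below).  */
extern const unsigned long crc_32_tab[256];

unsigned long
crc32_bytewise (unsigned long c, const unsigned char *s, unsigned n)
{
  do
    c = crc_32_tab[(c ^ *s++) & 0xff] ^ (c >> 8);
  while (--n);
  return c;
}

The loop-form, calculation and table checks named in the dump messages below all gate whether a candidate loop really has this shape; presumably the conversion then replaces the loop with a hardware CRC32 sequence, which is why the tests are restricted to aarch64*-*-linux* targets.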
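The match.pd hunk that follows also adds the "Mul64" patterns. They recognize a 64x64->128 highpart multiply that has been open-coded from 32-bit halves and collapse it into a single widening multiply. A before/after sketch using the comment's own definitions (function names are illustrative; __uint128_t is GCC's 128-bit integer type on 64-bit targets):

/* Open-coded form the patterns match: ResHi of Mul64 reconstructed
   from 32-bit halves, with the two carries recovered via unsigned
   compares, exactly as in the match.pd comment below.  */
unsigned long
mul64_hi_by_halves (unsigned long in0, unsigned long in1)
{
  unsigned long in0lo = in0 & 4294967295UL, in0hi = in0 >> 32;
  unsigned long in1lo = in1 & 4294967295UL, in1hi = in1 >> 32;
  unsigned long mull_01 = in0hi * in1lo;
  unsigned long addc = in0lo * in1hi + mull_01;  /* may wrap */
  unsigned long addc32 = addc << 32;
  unsigned long reslo = in0lo * in1lo + addc32;  /* may wrap */
  return (unsigned long) (addc32 > reslo)            /* carry out of ResLo */
         + ((unsigned long) (mull_01 > addc) << 32)  /* carry out of Addc  */
         + (addc >> 32) + in0hi * in1hi;
}

/* What the simplification rewrites it to: one 128-bit multiply and a
   64-bit right shift.  */
unsigned long
mul64_hi_direct (unsigned long in0, unsigned long in1)
{
  return (unsigned long) (((__uint128_t) in0 * in1) >> 64);
}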
+diff --git a/gcc/match.pd b/gcc/match.pd
+index 01f81b063..87b316953 100644
+--- a/gcc/match.pd
++++ b/gcc/match.pd
+@@ -3487,6 +3487,175 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
+ )
+ #endif
+
++
++#if GIMPLE
++(if (canonicalize_math_p ())
++/* These patterns are mostly used by PHIOPT to move some operations outside of
++ the if statements. They should be done late because doing so gives jump
++ threading and a few other passes a chance to reduce what is going on. */
++/* a ? x op POW2 : x -> x op (a ? POW2 : 0). */
++ (for op (plus minus bit_ior bit_xor lshift rshift lrotate rrotate)
++ (simplify
++ (cond @0 (op:s @1 INTEGER_CST@2) @1)
++ /* powerof2cst */
++ (if (INTEGRAL_TYPE_P (type) && integer_pow2p (@2))
++ (with {
++ tree shift = build_int_cst (integer_type_node, tree_log2 (@2));
++ }
++ (op @1 (lshift (convert (convert:boolean_type_node @0)) { shift; })))
++ )
++ )
++ )
++)
++#endif
++
++#if GIMPLE
++/* These patterns are mostly used by FORWPROP to move some operations outside of
++ the if statements. They should be done late because doing so gives jump
++ threading and a few other passes a chance to reduce what is going on. */
++/* Mul64 is a multiplication algorithm that computes the 128-bit product
++ of two 64-bit integers:
++ (i64 ResLo, i64 ResHi) = Mul64(i64 In0, i64 In1) {
++ In0Lo = In0(D) & 4294967295;
++ In0Hi = In0(D) >> 32;
++ In1Lo = In1(D) & 4294967295;
++ In1Hi = In1(D) >> 32;
++ Mull_01 = In0Hi * In1Lo;
++ Addc = In0Lo * In1Hi + Mull_01;
++ addc32 = Addc << 32;
++ ResLo = In0Lo * In1Lo + addc32;
++ ResHi = ((long unsigned int) (addc32 > ResLo)) +
++ (((long unsigned int) (Mull_01 > Addc)) << 32) + (Addc >> 32) + In0Hi * In1Hi;
++ } */
++ (simplify
++ (plus
++ (plus
++ (convert
++ (gt @10
++ (plus
++ (mult @4 @6)
++ (lshift@10 @9 @3))))
++ (lshift
++ (convert
++ (gt @8 @9)) @3))
++ (plus@11
++ (rshift
++ (plus@9
++ (mult (bit_and@4 SSA_NAME@0 @2) @7)
++ (mult@8 @5 (bit_and@6 SSA_NAME@1 INTEGER_CST@2))) @3)
++ (mult (rshift@5 SSA_NAME@0 @3)
++ (rshift@7 SSA_NAME@1 INTEGER_CST@3))))
++ (if (INTEGRAL_TYPE_P (type) && INTEGRAL_TYPE_P (TREE_TYPE (@0)) && types_match (@0, @1) &&
++ TYPE_PRECISION (type) == 64)
++ (with {
++ tree i128_type = build_nonstandard_integer_type (128, TYPE_UNSIGNED (type));
++ tree shift = build_int_cst (integer_type_node, 64);
++ //direct_internal_fn_supported_p (UMULH, type, OPTIMIZE_FOR_BOTH)
++ }
++ (convert:type (rshift
++ (mult (convert:i128_type @0) (convert:i128_type @1)) { shift; })))
++ )
++ )
++
++ /* As above, but ResLo is computed with a direct 64-bit multiply:
++ (i64 ResLo, i64 ResHi) = Mul64(i64 In0, i64 In1) {
++ In0Lo = In0(D) & 4294967295;
++ In0Hi = In0(D) >> 32;
++ In1Lo = In1(D) & 4294967295;
++ In1Hi = In1(D) >> 32;
++ Mull_01 = In0Hi * In1Lo;
++ Addc = In0Lo * In1Hi + Mull_01;
++ addc32 = Addc << 32;
++ ResLo = In0(D) * In1(D);
++ ResHi = ((long unsigned int) (addc32 > ResLo)) +
++ (((long unsigned int) (Mull_01 > Addc)) << 32) + (Addc >> 32) + In0Hi * In1Hi;
++ } */
++ (simplify
++ (plus
++ (plus
++ (convert
++ (gt (lshift@10 @9 @3)
++ (mult @0 @1)))
++ (lshift
++ (convert
++ (gt @8 @9)) @3))
++ (plus@11
++ (rshift
++ (plus@9
++ (mult (bit_and@4 SSA_NAME@0 @2) @7)
++ (mult@8 @5 (bit_and@6 SSA_NAME@1 INTEGER_CST@2))) @3)
++ (mult (rshift@5 SSA_NAME@0 @3)
++ (rshift@7 SSA_NAME@1 INTEGER_CST@3))))
++ (if (INTEGRAL_TYPE_P (type) && INTEGRAL_TYPE_P (TREE_TYPE (@0)) && types_match (@0, @1) &&
++ TYPE_PRECISION (type) == 64)
++ (with {
++ tree i128_type = build_nonstandard_integer_type (128, TYPE_UNSIGNED (type));
++ tree shift = build_int_cst (integer_type_node, 64);
++ //direct_internal_fn_supported_p (UMULH, type, OPTIMIZE_FOR_BOTH)
++ }
++ (convert:type (rshift
++ (mult (convert:i128_type @0) (convert:i128_type @1)) { shift; })))
++ )
++ )
++#endif
++
++#if GIMPLE
++/* These patterns are mostly used by FORWPROP to move some operations outside of
++ the if statements. They should be done late because doing so gives jump
++ threading and a few other passes a chance to reduce what is going on. */
++ /* The low half of Mul64, built from 32-bit halves:
++ In0Lo = In0(D) & 4294967295;
++ In0Hi = In0(D) >> 32;
++ In1Lo = In1(D) & 4294967295;
++ In1Hi = In1(D) >> 32;
++ Addc = In0Lo * In1Hi + In0Hi * In1Lo;
++ addc32 = Addc << 32;
++ ResLo = In0Lo * In1Lo + addc32
++ */
++ (simplify
++ (plus (mult @4 @5)
++ (lshift
++ (plus
++ (mult (bit_and@4 SSA_NAME@0 @2) (rshift SSA_NAME@1 @3))
++ (mult (rshift SSA_NAME@0 @3) (bit_and@5 SSA_NAME@1 INTEGER_CST@2))) INTEGER_CST@3))
++ (if (INTEGRAL_TYPE_P (type) && INTEGRAL_TYPE_P (TREE_TYPE (@0)) && types_match (@0, @1) &&
++ TYPE_PRECISION (type) == 64)
++ (with {
++ tree i128_type = build_nonstandard_integer_type (128, TYPE_UNSIGNED (type));
++ tree shift = build_int_cst (integer_type_node, 64);
++ //direct_internal_fn_supported_p (UMULH, type, OPTIMIZE_FOR_BOTH)
++ }
++ (mult (convert:type @0) (convert:type @1)))
++ )
++ )
++#endif
++
++
++#if GIMPLE
++/* Try to match the CRC table-index computation: */
++ /*
++_4 = (int) _3; //NOP_EXPR (SSA_NAME @2)
++_5 = _4 ^ c_10; //BIT_XOR_EXPR (SSA_NAME@1, SSA_NAME)
++_6 = _5 & 255; //BIT_AND_EXPR (SSA_NAME, INTEGER_CST@3)
++ */
++(match (crc_match_index @1 @2 @3)
++ (bit_and (bit_xor (nop SSA_NAME@2) SSA_NAME@1) INTEGER_CST@3)
++ (if (INTEGRAL_TYPE_P (type) && tree_to_uhwi(@3) == 255))
++)
++
++#endif
++
++#if GIMPLE
++/* Try to match the CRC shift-and-xor step: */
++ /*
++_8 = c_12 >> 8; // RSHIFT_EXPR (SSA_NAME @1, INTEGER_CST @2)
++c_19 = _7 ^ _8; // BIT_XOR_EXPR (SSA_NAME@3, SSA_NAME)
++ */
++(match (crc_match_res @1 @2 @3)
++ (bit_xor SSA_NAME@3 (rshift SSA_NAME@1 INTEGER_CST@2))
++ (if (INTEGRAL_TYPE_P (type) && tree_to_uhwi(@2) == 8))
++)
++
++#endif
++
+ /* Simplification moved from fold_cond_expr_with_comparison. It may also
+ be extended. */
+ /* This pattern implements two kinds simplification:
+diff --git a/gcc/passes.def b/gcc/passes.def
+index ea50db086..7abd946ce 100644
+--- a/gcc/passes.def
++++ b/gcc/passes.def
+@@ -92,6 +92,7 @@ along with GCC; see the file COPYING3.
If not see + NEXT_PASS (pass_cd_dce); + NEXT_PASS (pass_phiopt, true /* early_p */); + NEXT_PASS (pass_array_widen_compare); ++ NEXT_PASS (pass_loop_crc); + NEXT_PASS (pass_tail_recursion); + NEXT_PASS (pass_convert_switch); + NEXT_PASS (pass_cleanup_eh); +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c +new file mode 100644 +index 000000000..07f9e01ec +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c +@@ -0,0 +1,85 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details" } */ ++ ++#include ++#include ++typedef unsigned long ulg; ++typedef unsigned char uch; ++ ++static const ulg crc_32_tab[] = { ++ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, ++ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, ++ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, ++ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, ++ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, ++ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, ++ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, ++ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, ++ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, ++ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, ++ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, ++ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, ++ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, ++ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, ++ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, ++ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, ++ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, ++ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, ++ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, ++ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, ++ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, ++ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, ++ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, ++ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, ++ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, ++ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, ++ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, ++ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, ++ 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, ++ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, ++ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, ++ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, ++ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, ++ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, ++ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, ++ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, ++ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, ++ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, ++ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, ++ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, ++ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, ++ 
0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, ++ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, ++ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, ++ 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, ++ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, ++ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, ++ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, ++ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, ++ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, ++ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, ++ 0x2d02ef8dL ++}; ++ ++ulg updcrc(s, n) ++ uch *s; /* pointer to bytes to pump through */ ++ unsigned n; /* number of bytes in s[] */ ++{ ++ register ulg c; /* temporary variable */ ++ ++ static ulg crc = (ulg)0xffffffffL; /* shift register contents */ ++ ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ if (n) ++ do { ++ c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8); ++ } while (--n); ++ } ++ crc = c; ++ return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ ++} ++/* { dg-final { scan-tree-dump-times "Processing loop" 1 "loop_crc"} } */ ++/* { dg-final { scan-tree-dump-times "the loop can be optimized" 1 "loop_crc"} } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c.042t.loop_crc b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c.042t.loop_crc +new file mode 100644 +index 000000000..c726059f3 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c.042t.loop_crc +@@ -0,0 +1,90 @@ ++ ++;; Function updcrc (updcrc, funcdef_no=0, decl_uid=3687, cgraph_uid=1, symbol_order=1) ++ ++;; 2 loops found ++;; ++;; Loop 0 ++;; header 0, latch 1 ++;; depth 0, outer -1 ++;; nodes: 0 1 2 3 6 4 7 5 ++;; ++;; Loop 1 ++;; header 4, latch 7 ++;; depth 1, outer 0 ++;; nodes: 4 7 ++;; 2 succs { 5 3 } ++;; 3 succs { 6 5 } ++;; 6 succs { 4 } ++;; 4 succs { 7 5 } ++;; 7 succs { 4 } ++;; 5 succs { 1 } ++ ++Starting the loop_crc pass ++====================================== ++Processing loop 1: ++====================================== ++;; ++;; Loop 1 ++;; header 4, latch 7 ++;; depth 1, outer 0 ++;; nodes: 4 7 ++ ++ ++The 1th loop form is success matched,and the loop can be optimized. 
++updcrc (uch * s, unsigned int n) ++{ ++ static ulg crc = 4294967295; ++ register ulg c; ++ unsigned char _2; ++ long unsigned int _3; ++ long unsigned int _4; ++ long unsigned int _5; ++ long unsigned int _6; ++ long unsigned int _7; ++ ulg _21; ++ ++ : ++ if (s_12(D) == 0B) ++ goto ; [INV] ++ else ++ goto ; [INV] ++ ++ : ++ c_14 = crc; ++ if (n_15(D) != 0) ++ goto ; [INV] ++ else ++ goto ; [INV] ++ ++ : ++ ++ : ++ # s_8 = PHI ++ # n_9 = PHI ++ # c_10 = PHI ++ s_16 = s_8 + 1; ++ _2 = *s_8; ++ _3 = (long unsigned int) _2; ++ _4 = _3 ^ c_10; ++ _5 = _4 & 255; ++ _6 = crc_32_tab[_5]; ++ _7 = c_10 >> 8; ++ c_17 = _6 ^ _7; ++ n_18 = n_9 + 4294967295; ++ if (n_18 != 0) ++ goto ; [INV] ++ else ++ goto ; [INV] ++ ++ : ++ goto ; [100.00%] ++ ++ : ++ # c_11 = PHI <4294967295(2), c_14(3), c_17(4)> ++ crc = c_11; ++ _21 = c_11 ^ 4294967295; ++ return _21; ++ ++} ++ ++ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-2.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-2.c +new file mode 100644 +index 000000000..f73c4d550 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-2.c +@@ -0,0 +1,88 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details" } */ ++ ++#include ++#include ++typedef unsigned long ulg; ++typedef unsigned char uch; ++ ++static const ulg crc_32_tab[] = { ++ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, ++ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, ++ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, ++ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, ++ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, ++ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, ++ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, ++ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, ++ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, ++ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, ++ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, ++ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, ++ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, ++ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, ++ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, ++ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, ++ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, ++ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, ++ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, ++ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, ++ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, ++ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, ++ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, ++ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, ++ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, ++ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, ++ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, ++ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, ++ 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, ++ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, ++ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, ++ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, 
++ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, ++ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, ++ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, ++ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, ++ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, ++ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, ++ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, ++ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, ++ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, ++ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, ++ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, ++ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, ++ 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, ++ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, ++ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, ++ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, ++ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, ++ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, ++ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, ++ 0x2d02ef8dL ++}; ++ ++ulg updcrc(s, n) ++ uch *s; /* pointer to bytes to pump through */ ++ unsigned n; /* number of bytes in s[] */ ++{ ++ register ulg c; /* temporary variable */ ++ ++ static ulg crc = (ulg)0xffffffffL; /* shift register contents */ ++ ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ if (n) ++ do { ++ c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8); ++ for (int i = 0; i < 5; i++) { ++ c++; ++ } ++ ++ } while (--n); ++ } ++ crc = c; ++ return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ ++} ++/* { dg-final { scan-tree-dump-times "Wrong loop form for crc matching." 
1 "loop_crc"} } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-3.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-3.c +new file mode 100644 +index 000000000..70eb1b814 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-3.c +@@ -0,0 +1,85 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details" } */ ++ ++#include ++#include ++typedef unsigned long ulg; ++typedef unsigned char uch; ++ ++static const ulg crc_32_tab[] = { ++ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, ++ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, ++ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, ++ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, ++ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, ++ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, ++ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, ++ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, ++ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, ++ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, ++ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, ++ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, ++ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, ++ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, ++ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, ++ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, ++ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, ++ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, ++ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, ++ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, ++ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, ++ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, ++ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, ++ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, ++ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, ++ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, ++ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, ++ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, ++ 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, ++ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, ++ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, ++ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, ++ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, ++ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, ++ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, ++ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, ++ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, ++ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, ++ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, ++ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, ++ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, ++ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, ++ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, ++ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, ++ 
0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, ++ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, ++ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, ++ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, ++ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, ++ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, ++ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, ++ 0x2d02ef8dL ++}; ++ ++ulg updcrc(s, n) ++ uch *s; /* pointer to bytes to pump through */ ++ unsigned n; /* number of bytes in s[] */ ++{ ++ register ulg c; /* temporary variable */ ++ ++ static ulg crc = (ulg)0xffffffffL; /* shift register contents */ ++ ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ if (n) ++ do { ++ c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8); ++ } while (--n || c != 0) ; ++ } ++ crc = c; ++exit1: ++ return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ ++} ++/* { dg-final { scan-tree-dump-times "Wrong loop form for crc matching." 1 "loop_crc"} } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-4.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-4.c +new file mode 100644 +index 000000000..1d7e0a319 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-4.c +@@ -0,0 +1,89 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details" } */ ++ ++#include ++#include ++typedef unsigned long ulg; ++typedef unsigned char uch; ++ ++static const ulg crc_32_tab[] = { ++ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, ++ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, ++ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, ++ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, ++ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, ++ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, ++ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, ++ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, ++ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, ++ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, ++ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, ++ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, ++ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, ++ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, ++ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, ++ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, ++ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, ++ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, ++ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, ++ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, ++ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, ++ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, ++ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, ++ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, ++ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, ++ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, ++ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, ++ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, ++ 0xe40ecf0bL, 0x9309ff9dL, 
0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, ++ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, ++ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, ++ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, ++ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, ++ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, ++ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, ++ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, ++ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, ++ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, ++ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, ++ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, ++ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, ++ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, ++ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, ++ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, ++ 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, ++ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, ++ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, ++ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, ++ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, ++ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, ++ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, ++ 0x2d02ef8dL ++}; ++int test[5] = {0}; ++ ++ulg updcrc(s, n) ++ uch *s; /* pointer to bytes to pump through */ ++ unsigned n; /* number of bytes in s[] */ ++{ ++ register ulg c; /* temporary variable */ ++ ++ static ulg crc = (ulg)0xffffffffL; /* shift register contents */ ++ ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ if (n) ++ do { ++ c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8) * test[c%5]; ++ } while (--n) ; ++ } ++ do { ++ c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8); ++ test[c%5] = c; ++ } while (--n) ; ++ crc = c; ++ return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ ++} ++/* { dg-final { scan-tree-dump-times "Table check fail. not only single array is read." 
2 "loop_crc"} } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-4.c.042t.loop_crc b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-4.c.042t.loop_crc +new file mode 100644 +index 000000000..e69de29bb +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.c +new file mode 100644 +index 000000000..71b25f537 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.c +@@ -0,0 +1,156 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details" } */ ++ ++#include ++#include ++typedef unsigned long ulg; ++typedef unsigned char uch; ++ ++static const ulg crc_32_tab[] = { ++ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, ++ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, ++ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, ++ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, ++ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, ++ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, ++ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, ++ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, ++ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, ++ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, ++ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, ++ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, ++ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, ++ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, ++ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, ++ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, ++ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, ++ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, ++ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, ++ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, ++ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, ++ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, ++ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, ++ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, ++ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, ++ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, ++ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, ++ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, ++ 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, ++ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, ++ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, ++ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, ++ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, ++ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, ++ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, ++ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, ++ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, ++ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, ++ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, ++ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, ++ 0x95bf4a82L, 0xe2b87a14L, 
0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, ++ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, ++ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, ++ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, ++ 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, ++ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, ++ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, ++ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, ++ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, ++ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, ++ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, ++ 0x2d02ef8dL ++}; ++ ++int test[5] = {0}; ++ ++ulg updcrc(s, n) ++ uch *s; ++ unsigned n; ++{ ++ register ulg c; ++ ++ static ulg crc = (ulg)0xffffffffL; ++ int a = 0; ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ if (n) ++ do { ++ a++; ++ c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8) ; ++ } while (--n) ; ++ } ++ crc = c; ++ return c ^ 0xffffffffL*a; ++} ++ ++ulg updcrc1(s, n) ++ uch *s; ++ unsigned n; ++{ ++ register ulg c; ++ ++ static ulg crc = (ulg)0xffffffffL; ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ unsigned n_back = n; ++ if (n) ++ do { ++ c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8) ; ++ n = n - 2; ++ } while (n != 0) ; ++ } ++ ++ crc = c; ++ return c ^ 0xffffffffL; ++} ++ ++ulg updcrc2(s, n) ++ uch *s; ++ unsigned n; ++{ ++ register ulg c; ++ ++ static ulg crc = (ulg)0xffffffffL; ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ unsigned n_back = n; ++ if (n) ++ do { ++ c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8) + 1; ++ } while (--n) ; ++ } ++ ++ crc = c; ++ return c ^ 0xffffffffL; ++} ++/* ++ulg updcrc3(s, n) ++ uch *s; ++ int n; ++{ ++ register ulg c; ++ ++ static ulg crc = (ulg)0xffffffffL; ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ do { ++ c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8); ++ --n; ++ } while (n ) ; ++ } ++ ++ crc = c; ++ return c ^ 0xffffffffL; ++}*/ ++/* { dg-final { scan-tree-dump-times "num of phi noeds check failed." 1 "loop_crc"} } */ ++/* { dg-final { scan-tree-dump-times "evolution pattern check failed." 1 "loop_crc"} } */ ++/* { dg-final { scan-tree-dump-times "calculation pattern check failed." 1 "loop_crc"} } */ ++ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.c.042t.loop_crc b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.c.042t.loop_crc +new file mode 100644 +index 000000000..6d52a8684 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.c.042t.loop_crc +@@ -0,0 +1,64 @@ ++ ++;; Function updcrc3 (updcrc3, funcdef_no=0, decl_uid=3687, cgraph_uid=1, symbol_order=1) ++ ++;; 2 loops found ++;; ++;; Loop 0 ++;; header 0, latch 1 ++;; depth 0, outer -1 ++;; nodes: 0 1 2 3 4 5 ++;; ++;; Loop 1 ++;; header 4, latch 4 ++;; depth 1, outer 0 ++;; nodes: 4 ++;; 2 succs { 5 3 } ++;; 3 succs { 4 5 } ++;; 4 succs { 4 } ++;; 5 succs { 1 } ++ ++Starting the loop_crc pass ++====================================== ++Processing loop 1: ++====================================== ++;; ++;; Loop 1 ++;; header 4, latch 4 ++;; depth 1, outer 0 ++;; nodes: 4 ++ ++ ++ ++Wrong loop form for crc matching. 
++updcrc3 (uch * s, unsigned int n) ++{ ++ unsigned int n_back; ++ static ulg crc = 4294967295; ++ register ulg c; ++ ulg _22; ++ ++ : ++ if (s_12(D) == 0B) ++ goto ; [INV] ++ else ++ goto ; [INV] ++ ++ : ++ c_14 = crc; ++ if (n_15(D) != 0) ++ goto ; [INV] ++ else ++ goto ; [INV] ++ ++ : ++ goto ; [100.00%] ++ ++ : ++ # c_11 = PHI <4294967295(2), c_14(3)> ++ crc = c_11; ++ _22 = c_11 ^ 4294967295; ++ return _22; ++ ++} ++ ++ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.s b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.s +new file mode 100644 +index 000000000..cae934bfe +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.s +@@ -0,0 +1,329 @@ ++ .arch armv8-a ++ .file "loop-crc-calculation-check-fail.c" ++ .text ++ .section .rodata ++ .align 3 ++ .type crc_32_tab, %object ++ .size crc_32_tab, 2048 ++crc_32_tab: ++ .xword 0 ++ .xword 1996959894 ++ .xword 3993919788 ++ .xword 2567524794 ++ .xword 124634137 ++ .xword 1886057615 ++ .xword 3915621685 ++ .xword 2657392035 ++ .xword 249268274 ++ .xword 2044508324 ++ .xword 3772115230 ++ .xword 2547177864 ++ .xword 162941995 ++ .xword 2125561021 ++ .xword 3887607047 ++ .xword 2428444049 ++ .xword 498536548 ++ .xword 1789927666 ++ .xword 4089016648 ++ .xword 2227061214 ++ .xword 450548861 ++ .xword 1843258603 ++ .xword 4107580753 ++ .xword 2211677639 ++ .xword 325883990 ++ .xword 1684777152 ++ .xword 4251122042 ++ .xword 2321926636 ++ .xword 335633487 ++ .xword 1661365465 ++ .xword 4195302755 ++ .xword 2366115317 ++ .xword 997073096 ++ .xword 1281953886 ++ .xword 3579855332 ++ .xword 2724688242 ++ .xword 1006888145 ++ .xword 1258607687 ++ .xword 3524101629 ++ .xword 2768942443 ++ .xword 901097722 ++ .xword 1119000684 ++ .xword 3686517206 ++ .xword 2898065728 ++ .xword 853044451 ++ .xword 1172266101 ++ .xword 3705015759 ++ .xword 2882616665 ++ .xword 651767980 ++ .xword 1373503546 ++ .xword 3369554304 ++ .xword 3218104598 ++ .xword 565507253 ++ .xword 1454621731 ++ .xword 3485111705 ++ .xword 3099436303 ++ .xword 671266974 ++ .xword 1594198024 ++ .xword 3322730930 ++ .xword 2970347812 ++ .xword 795835527 ++ .xword 1483230225 ++ .xword 3244367275 ++ .xword 3060149565 ++ .xword 1994146192 ++ .xword 31158534 ++ .xword 2563907772 ++ .xword 4023717930 ++ .xword 1907459465 ++ .xword 112637215 ++ .xword 2680153253 ++ .xword 3904427059 ++ .xword 2013776290 ++ .xword 251722036 ++ .xword 2517215374 ++ .xword 3775830040 ++ .xword 2137656763 ++ .xword 141376813 ++ .xword 2439277719 ++ .xword 3865271297 ++ .xword 1802195444 ++ .xword 476864866 ++ .xword 2238001368 ++ .xword 4066508878 ++ .xword 1812370925 ++ .xword 453092731 ++ .xword 2181625025 ++ .xword 4111451223 ++ .xword 1706088902 ++ .xword 314042704 ++ .xword 2344532202 ++ .xword 4240017532 ++ .xword 1658658271 ++ .xword 366619977 ++ .xword 2362670323 ++ .xword 4224994405 ++ .xword 1303535960 ++ .xword 984961486 ++ .xword 2747007092 ++ .xword 3569037538 ++ .xword 1256170817 ++ .xword 1037604311 ++ .xword 2765210733 ++ .xword 3554079995 ++ .xword 1131014506 ++ .xword 879679996 ++ .xword 2909243462 ++ .xword 3663771856 ++ .xword 1141124467 ++ .xword 855842277 ++ .xword 2852801631 ++ .xword 3708648649 ++ .xword 1342533948 ++ .xword 654459306 ++ .xword 3188396048 ++ .xword 3373015174 ++ .xword 1466479909 ++ .xword 544179635 ++ .xword 3110523913 ++ .xword 3462522015 ++ .xword 1591671054 ++ .xword 702138776 ++ .xword 2966460450 ++ .xword 3352799412 ++ .xword 1504918807 ++ .xword 783551873 ++ .xword 3082640443 ++ .xword 3233442989 ++ 
.xword 3988292384 ++ .xword 2596254646 ++ .xword 62317068 ++ .xword 1957810842 ++ .xword 3939845945 ++ .xword 2647816111 ++ .xword 81470997 ++ .xword 1943803523 ++ .xword 3814918930 ++ .xword 2489596804 ++ .xword 225274430 ++ .xword 2053790376 ++ .xword 3826175755 ++ .xword 2466906013 ++ .xword 167816743 ++ .xword 2097651377 ++ .xword 4027552580 ++ .xword 2265490386 ++ .xword 503444072 ++ .xword 1762050814 ++ .xword 4150417245 ++ .xword 2154129355 ++ .xword 426522225 ++ .xword 1852507879 ++ .xword 4275313526 ++ .xword 2312317920 ++ .xword 282753626 ++ .xword 1742555852 ++ .xword 4189708143 ++ .xword 2394877945 ++ .xword 397917763 ++ .xword 1622183637 ++ .xword 3604390888 ++ .xword 2714866558 ++ .xword 953729732 ++ .xword 1340076626 ++ .xword 3518719985 ++ .xword 2797360999 ++ .xword 1068828381 ++ .xword 1219638859 ++ .xword 3624741850 ++ .xword 2936675148 ++ .xword 906185462 ++ .xword 1090812512 ++ .xword 3747672003 ++ .xword 2825379669 ++ .xword 829329135 ++ .xword 1181335161 ++ .xword 3412177804 ++ .xword 3160834842 ++ .xword 628085408 ++ .xword 1382605366 ++ .xword 3423369109 ++ .xword 3138078467 ++ .xword 570562233 ++ .xword 1426400815 ++ .xword 3317316542 ++ .xword 2998733608 ++ .xword 733239954 ++ .xword 1555261956 ++ .xword 3268935591 ++ .xword 3050360625 ++ .xword 752459403 ++ .xword 1541320221 ++ .xword 2607071920 ++ .xword 3965973030 ++ .xword 1969922972 ++ .xword 40735498 ++ .xword 2617837225 ++ .xword 3943577151 ++ .xword 1913087877 ++ .xword 83908371 ++ .xword 2512341634 ++ .xword 3803740692 ++ .xword 2075208622 ++ .xword 213261112 ++ .xword 2463272603 ++ .xword 3855990285 ++ .xword 2094854071 ++ .xword 198958881 ++ .xword 2262029012 ++ .xword 4057260610 ++ .xword 1759359992 ++ .xword 534414190 ++ .xword 2176718541 ++ .xword 4139329115 ++ .xword 1873836001 ++ .xword 414664567 ++ .xword 2282248934 ++ .xword 4279200368 ++ .xword 1711684554 ++ .xword 285281116 ++ .xword 2405801727 ++ .xword 4167216745 ++ .xword 1634467795 ++ .xword 376229701 ++ .xword 2685067896 ++ .xword 3608007406 ++ .xword 1308918612 ++ .xword 956543938 ++ .xword 2808555105 ++ .xword 3495958263 ++ .xword 1231636301 ++ .xword 1047427035 ++ .xword 2932959818 ++ .xword 3654703836 ++ .xword 1088359270 ++ .xword 936918000 ++ .xword 2847714899 ++ .xword 3736837829 ++ .xword 1202900863 ++ .xword 817233897 ++ .xword 3183342108 ++ .xword 3401237130 ++ .xword 1404277552 ++ .xword 615818150 ++ .xword 3134207493 ++ .xword 3453421203 ++ .xword 1423857449 ++ .xword 601450431 ++ .xword 3009837614 ++ .xword 3294710456 ++ .xword 1567103746 ++ .xword 711928724 ++ .xword 3020668471 ++ .xword 3272380065 ++ .xword 1510334235 ++ .xword 755167117 ++ .text ++ .align 2 ++ .global updcrc3 ++ .type updcrc3, %function ++updcrc3: ++.LFB0: ++ .cfi_startproc ++ str x19, [sp, -48]! 
++ .cfi_def_cfa_offset 48 ++ .cfi_offset 19, -48 ++ str x0, [sp, 24] ++ str w1, [sp, 20] ++ ldr x0, [sp, 24] ++ cmp x0, 0 ++ bne .L2 ++ mov x19, 4294967295 ++ b .L3 ++.L2: ++ adrp x0, crc.0 ++ add x0, x0, :lo12:crc.0 ++ ldr x19, [x0] ++ ldr w0, [sp, 20] ++ str w0, [sp, 44] ++ ldr w0, [sp, 20] ++ cmp w0, 0 ++ beq .L3 ++.L4: ++ ldr x0, [sp, 24] ++ add x1, x0, 1 ++ str x1, [sp, 24] ++ ldrb w0, [x0] ++ and x0, x0, 255 ++ eor x0, x19, x0 ++ and x1, x0, 255 ++ adrp x0, crc_32_tab ++ add x0, x0, :lo12:crc_32_tab ++ ldr x1, [x0, x1, lsl 3] ++ lsr x0, x19, 8 ++ eor x19, x1, x0 ++ ldr w0, [sp, 20] ++ sub w0, w0, #1 ++ str w0, [sp, 20] ++ ldr w0, [sp, 20] ++ cmp w0, 999 ++ bls .L4 ++.L3: ++ adrp x0, crc.0 ++ add x0, x0, :lo12:crc.0 ++ str x19, [x0] ++ eor x0, x19, 4294967295 ++ ldr x19, [sp], 48 ++ .cfi_restore 19 ++ .cfi_def_cfa_offset 0 ++ ret ++ .cfi_endproc ++.LFE0: ++ .size updcrc3, .-updcrc3 ++ .data ++ .align 3 ++ .type crc.0, %object ++ .size crc.0, 8 ++crc.0: ++ .xword 4294967295 ++ .ident "GCC: (Kunpeng gcc 10.3.1-2.3.0.b006) 10.3.1" ++ .section .note.GNU-stack,"",@progbits +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail.c +new file mode 100644 +index 000000000..b59704e31 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail.c +@@ -0,0 +1,111 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details" } */ ++ ++#include ++#include ++typedef unsigned long ulg; ++typedef unsigned char uch; ++ ++static const ulg crc_32_tab[] = { ++ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, ++ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, ++ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, ++ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, ++ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, ++ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, ++ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, ++ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, ++ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, ++ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, ++ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, ++ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, ++ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, ++ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, ++ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, ++ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, ++ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, ++ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, ++ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, ++ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, ++ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, ++ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, ++ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, ++ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, ++ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, ++ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, ++ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, ++ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, ++ 
0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, ++ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, ++ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, ++ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, ++ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, ++ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, ++ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, ++ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, ++ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, ++ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, ++ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, ++ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, ++ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, ++ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, ++ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, ++ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, ++ 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, ++ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, ++ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, ++ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, ++ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, ++ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, ++ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, ++ 0x2d02ef8dL ++}; ++ ++/* check when the loop have a innor loop, should fail. */ ++ulg updcrc(s, n) ++ uch *s; /* pointer to bytes to pump through */ ++ unsigned n; /* number of bytes in s[] */ ++{ ++ register ulg c; /* temporary variable */ ++ ++ static ulg crc = (ulg)0xffffffffL; /* shift register contents */ ++ ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ if (n) ++ do { ++ c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8); ++ for (int i = 0; i < 5; i++) { ++ c++; ++ } ++ ++ } while (--n); ++ } ++ crc = c; ++ return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ ++} ++ ++/* check when the loop have a second backedge, should fail. */ ++ulg updcrc1(s, n) ++ uch *s; /* pointer to bytes to pump through */ ++ unsigned n; /* number of bytes in s[] */ ++{ ++ register ulg c; /* temporary variable */ ++ ++ static ulg crc = (ulg)0xffffffffL; /* shift register contents */ ++ ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ if (n) ++ do { ++ c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8); ++ } while (--n || c != 0) ; ++ } ++ crc = c; ++ return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ ++} ++/* { dg-final { scan-tree-dump-times "Wrong loop form for crc matching." 
2 "loop_crc"} } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-sucess.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-sucess.c +new file mode 100644 +index 000000000..e1e16eaf2 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-sucess.c +@@ -0,0 +1,84 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details" } */ ++ ++#include ++#include ++typedef unsigned long ulg; ++typedef unsigned char uch; ++ ++static const ulg crc_32_tab[] = { ++ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, ++ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, ++ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, ++ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, ++ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, ++ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, ++ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, ++ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, ++ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, ++ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, ++ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, ++ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, ++ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, ++ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, ++ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, ++ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, ++ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, ++ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, ++ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, ++ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, ++ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, ++ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, ++ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, ++ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, ++ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, ++ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, ++ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, ++ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, ++ 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, ++ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, ++ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, ++ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, ++ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, ++ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, ++ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, ++ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, ++ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, ++ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, ++ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, ++ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, ++ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, ++ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, ++ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, ++ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 
0x11010b5cL, ++ 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, ++ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, ++ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, ++ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, ++ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, ++ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, ++ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, ++ 0x2d02ef8dL ++}; ++ ++ulg updcrc(s, n) ++ uch *s; /* pointer to bytes to pump through */ ++ unsigned n; /* number of bytes in s[] */ ++{ ++ register ulg c; /* temporary variable */ ++ ++ static ulg crc = (ulg)0xffffffffL; /* shift register contents */ ++ ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ if (n) ++ do { ++ c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8); ++ } while (--n); ++ } ++ crc = c; ++ return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ ++} ++/* { dg-final { scan-tree-dump-times "the loop can be optimized" 1 "loop_crc"} } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-table-check-fail.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-table-check-fail.c +new file mode 100644 +index 000000000..f03a4fa82 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-table-check-fail.c +@@ -0,0 +1,113 @@ ++/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ ++/* { dg-options "-O3 -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details" } */ ++ ++#include ++#include ++typedef unsigned long ulg; ++typedef unsigned char uch; ++ ++static const ulg crc_32_tab[] = { ++ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, ++ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, ++ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, ++ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, ++ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, ++ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, ++ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, ++ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, ++ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, ++ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, ++ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, ++ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, ++ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, ++ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, ++ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, ++ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, ++ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, ++ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, ++ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, ++ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, ++ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, ++ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, ++ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, ++ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, ++ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, ++ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, ++ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, ++ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 
0x7a6a5aa8L, ++ 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, ++ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, ++ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, ++ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, ++ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, ++ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, ++ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, ++ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, ++ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, ++ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, ++ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, ++ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, ++ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, ++ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, ++ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, ++ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, ++ 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, ++ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, ++ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, ++ 0x37d83bf1L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, ++ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, ++ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, ++ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, ++ 0x2d02ef8dL ++}; ++int test[5] = {0}; ++ ++/* check when the loop is doing more then 1 array read or writing an array, both should fail. */ ++ulg updcrc(s, n) ++ uch *s; /* pointer to bytes to pump through */ ++ unsigned n; /* number of bytes in s[] */ ++{ ++ register ulg c; /* temporary variable */ ++ ++ static ulg crc = (ulg)0xffffffffL; /* shift register contents */ ++ ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ if (n) ++ do { ++ c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8) * test[c%5]; ++ } while (--n) ; ++ } ++ do { ++ c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8); ++ test[c%5] = c; ++ } while (--n) ; ++ crc = c; ++ return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ ++} ++ ++/* check when the loop is not working on a correct crc_table. should fail. */ ++ulg updcrc1(s, n) ++ uch *s; /* pointer to bytes to pump through */ ++ unsigned n; /* number of bytes in s[] */ ++{ ++ register ulg c; /* temporary variable */ ++ ++ static ulg crc = (ulg)0xffffffffL; /* shift register contents */ ++ ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ if (n) ++ do { ++ c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8); ++ } while (--n) ; ++ } ++ crc = c; ++ return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ ++} ++/* { dg-final { scan-tree-dump-times "Table check fail. not only single array is read." 2 "loop_crc"} } */ ++/* { dg-final { scan-tree-dump-times "Table check fail. Table not matching." 
1 "loop_crc"} } */ +\ No newline at end of file +diff --git a/gcc/timevar.def b/gcc/timevar.def +index 2814b14f2..ba86a1b7b 100644 +--- a/gcc/timevar.def ++++ b/gcc/timevar.def +@@ -215,6 +215,7 @@ DEFTIMEVAR (TV_TREE_COPY_RENAME , "tree rename SSA copies") + DEFTIMEVAR (TV_TREE_SSA_VERIFY , "tree SSA verifier") + DEFTIMEVAR (TV_TREE_STMT_VERIFY , "tree STMT verifier") + DEFTIMEVAR (TV_TREE_ARRAY_WIDEN_COMPARE, "tree array widen compare") ++DEFTIMEVAR (TV_TREE_LOOP_CRC, "tree loop crc") + DEFTIMEVAR (TV_TREE_SWITCH_CONVERSION, "tree switch conversion") + DEFTIMEVAR (TV_TREE_SWITCH_LOWERING, "tree switch lowering") + DEFTIMEVAR (TV_TREE_RECIP , "gimple CSE reciprocals") +diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h +index 3cdc12466..027f8992d 100644 +--- a/gcc/tree-pass.h ++++ b/gcc/tree-pass.h +@@ -437,6 +437,7 @@ extern gimple_opt_pass *make_pass_phiopt (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_forwprop (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_phiprop (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_array_widen_compare (gcc::context *ctxt); ++extern gimple_opt_pass *make_pass_loop_crc (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_tree_ifcombine (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_dse (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_nrv (gcc::context *ctxt); +diff --git a/gcc/tree-ssa-loop-crc.c b/gcc/tree-ssa-loop-crc.c +new file mode 100644 +index 000000000..4982384c6 +--- /dev/null ++++ b/gcc/tree-ssa-loop-crc.c +@@ -0,0 +1,644 @@ ++/* Array widen compare. ++ Copyright (C) 2022-2022 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it ++under the terms of the GNU General Public License as published by the ++Free Software Foundation; either version 3, or (at your option) any ++later version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#include "config.h" ++#include "system.h" ++#include "coretypes.h" ++#include "backend.h" ++#include "target.h" ++#include "tree.h" ++#include "gimple.h" ++#include "tree-pass.h" ++#include "gimple-ssa.h" ++#include "tree-pretty-print.h" ++#include "fold-const.h" ++#include "gimplify.h" ++#include "gimple-iterator.h" ++#include "tree-ssa-loop-manip.h" ++#include "tree-ssa-loop.h" ++#include "ssa.h" ++#include "tree-into-ssa.h" ++#include "cfganal.h" ++#include "cfgloop.h" ++#include "gimple-pretty-print.h" ++#include "tree-cfg.h" ++#include "cgraph.h" ++#include "print-tree.h" ++#include "cfghooks.h" ++#include "gimple-fold.h" ++ ++/* Match.pd function to match the ctz expression. */ ++extern bool gimple_crc_match_index (tree, tree *, tree (*)(tree)); ++extern bool gimple_crc_match_res (tree, tree *, tree (*)(tree)); ++ ++static gimple *crc_table_read_stmt = NULL; ++ ++ ++/* The loop form check will check the entire loop control flow ++ It should be a loop that: ++ 1. a do-while loop with header and latch only with no other control flow inside the loop ++ 2. have only one exiting edge ++ 3. 
have only one back edge and one entry edge ++*/ ++static bool ++crc_loop_form_check (class loop *loop) ++{ ++ if (loop->num_nodes > 2 || loop->inner) ++ return false; ++ // should only have 1 exit edge ++ vec edges; ++ edges = get_loop_exit_edges (loop); ++ if (edges.length() != 1) ++ return false; ++ ++ // The header should have only 2 incoming edges ++ // One of them is the preheader edge and the other is the backedge from the latch ++ if (EDGE_COUNT (loop->header->preds) != 2) ++ return false; ++ edge e1 = EDGE_PRED (loop->header, 0); ++ edge e2 = EDGE_PRED (loop->header, 1); ++ ++ if ((e1->src == loop->latch && e2->src->loop_father != loop) ++ || (e2->src == loop->latch && e1->src->loop_father != loop)) ++ return true; ++ ++ return false; ++} ++ ++/* Check there is only one array is read in the loop. ++ Return the only array as crc_table. */ ++static bool ++only_one_array_read (class loop *loop, tree &crc_table) ++{ ++ gimple_stmt_iterator gsi; ++ gimple *stmt; ++ bool res = false; ++ for (gsi = gsi_start_bb (loop->header); !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ stmt = gsi_stmt (gsi); ++ if (stmt == NULL) ++ return false; ++ ++ if (gimple_code (stmt) == GIMPLE_ASSIGN && ++ TREE_CODE(gimple_assign_lhs (stmt)) == ARRAY_REF ) ++ return false; ++ ++ if (gimple_code (stmt) == GIMPLE_ASSIGN && ++ TREE_CODE(gimple_assign_rhs1 (stmt)) == ARRAY_REF) ++ { ++ if (crc_table == NULL) ++ { ++ crc_table = gimple_assign_rhs1 (stmt); ++ crc_table_read_stmt = stmt; ++ res = true; ++ } ++ else ++ return false; ++ } ++ } ++ return res; ++} ++ ++static const unsigned HOST_WIDE_INT crc_32_tab[] = { ++ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, ++ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, ++ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, ++ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, ++ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, ++ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, ++ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, ++ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, ++ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, ++ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, ++ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, ++ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, ++ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, ++ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, ++ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, ++ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, ++ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, ++ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, ++ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, ++ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, ++ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, ++ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, ++ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, ++ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, ++ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, ++ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, ++ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, ++ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, ++ 0xe40ecf0bL, 0x9309ff9dL, 
0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, ++ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, ++ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, ++ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, ++ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, ++ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, ++ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, ++ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, ++ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, ++ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, ++ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, ++ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, ++ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, ++ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, ++ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, ++ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, ++ 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, ++ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, ++ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, ++ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, ++ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, ++ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, ++ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, ++ 0x2d02ef8dL ++}; ++ ++/* Check the content of the array. */ ++static bool ++match_crc_table (tree crc_table) ++{ ++ unsigned HOST_WIDE_INT lb = tree_to_uhwi (array_ref_low_bound (crc_table)); ++ unsigned HOST_WIDE_INT ub = tree_to_uhwi (array_ref_up_bound (crc_table)); ++ unsigned HOST_WIDE_INT es = tree_to_uhwi (array_ref_element_size (crc_table)); ++ if (lb != 0 || ub != 255 || es != 8) ++ return false; ++ ++ tree decl = TREE_OPERAND (crc_table, 0); ++ tree ctor = ctor_for_folding(decl); ++ for (int i = 0; i < 255; i++) { ++ unsigned HOST_WIDE_INT val = tree_to_uhwi (CONSTRUCTOR_ELT (ctor,i)->value); ++ if (crc_32_tab[i] != val) ++ return false; ++ } ++ return true; ++} ++ ++ ++/* Check the crc table. The loop should have only one data reference. ++ And match the data reference with the predefined array. */ ++static bool ++crc_table_check (class loop *loop) ++{ ++ tree crc_table = NULL; ++ if (!only_one_array_read (loop, crc_table)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nTable check fail. not only single array is read.\n"); ++ return false; ++ } ++ if (!match_crc_table (crc_table)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nTable check fail. 
Table not matching.\n");
++      return false;
++    }
++  return true;
++}
++
++/* Check whether the evolution pattern of PHI is phi = SSA_NAME + TARGET.  */
++static bool
++evolution_pattern_plus_with_p (class loop* loop, gphi *phi, unsigned HOST_WIDE_INT target)
++{
++  edge backedge = find_edge (loop->latch, loop->header);
++  if (backedge == NULL)
++    return false;
++  tree evolution_node = PHI_ARG_DEF_FROM_EDGE (phi, backedge);
++  gimple *evolution_expr = SSA_NAME_DEF_STMT (evolution_node);
++
++  if (evolution_expr && (gimple_assign_rhs_code (evolution_expr) == PLUS_EXPR
++			 || gimple_assign_rhs_code (evolution_expr) == POINTER_PLUS_EXPR))
++    {
++      tree rhs1 = gimple_assign_rhs1 (evolution_expr);
++      tree rhs2 = gimple_assign_rhs2 (evolution_expr);
++      if (TREE_CODE (rhs1) == SSA_NAME && TREE_CODE (rhs2) == INTEGER_CST
++	  && tree_to_uhwi (rhs2) == target)
++	return true;
++    }
++  return false;
++}
++
++/* Check whether there are only 3 phi nodes in the header block.
++   Return the 3 phi nodes in CAPTURE.  */
++static bool
++check_num_of_phi (basic_block header, gphi *capture[])
++{
++  gphi *phi;
++  gphi_iterator gsi;
++  int num_of_phi = 0;
++
++  for (gsi = gsi_start_phis (header); !gsi_end_p (gsi); gsi_next (&gsi))
++    {
++      phi = gsi.phi ();
++      if (phi) num_of_phi++;
++      if (num_of_phi > 3)
++	return false;
++      capture[num_of_phi - 1] = phi;
++    }
++  /* There should be exactly 3 phi nodes.  */
++  return num_of_phi == 3;
++}
++
++/* Check the evolution pattern of the three phi nodes.
++   One of the nodes should be increased by 1 every iteration (s), one
++   should be decreased by 1 every iteration (n), and the 3rd one should do
++   neither (c).  Return the 3 phi nodes in CAPTURE in the order s, n, c.  */
++static bool
++check_evolution_pattern (class loop* loop, gphi *capture[])
++{
++  gphi *s = NULL;
++  gphi *n = NULL;
++  gphi *c = NULL;
++
++  for (int i = 0; i < 3; i++)
++    {
++      if (evolution_pattern_plus_with_p (loop, capture[i], 1))
++	{
++	  if (s != NULL)
++	    return false;
++	  s = capture[i];
++	}
++      else if (evolution_pattern_plus_with_p (loop, capture[i], 4294967295))
++	{
++	  if (n != NULL)
++	    return false;
++	  n = capture[i];
++	}
++      else
++	{
++	  if (c != NULL)
++	    return false;
++	  c = capture[i];
++	}
++    }
++
++  // Some evolution pattern could not be found.
++  if (!n || !s || !c)
++    return false;
++
++  capture[0] = s;
++  capture[1] = n;
++  capture[2] = c;
++  return true;
++}
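++/* Illustration (a sketch, not used by the pass): in the canonical
++   gzip-style CRC loop
++
++     do
++       {
++	 c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8);
++       }
++     while (--n);
++
++   the three PHI nodes classified above correspond to the source variables
++   s (+1 every iteration), n (-1 every iteration) and c (neither).  The -1
++   step is matched as an addition of 4294967295, i.e. the unsigned
++   wrap-around of -1 in the 32-bit counter n.  */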
++/* Check the calculation pattern before and after the crc_table array read
++   stmt:
++     _7 = crc_32_tab[_6];
++   The calculation of index _6 should be the result of a sequence of
++   calculations on s and c, and the result of the array read, _7, should be
++   used to calculate the new c.  */
++static bool
++check_calculation_pattern (class loop* loop, gphi *capture[])
++{
++  gphi *s = capture[0];
++  gphi *c = capture[2];
++  tree res_ops[3];
++  tree index = TREE_OPERAND (gimple_assign_rhs1 (crc_table_read_stmt), 1);
++
++  /* Try to match
++     _4 = (int) _3;   //NOP_EXPR (SSA_NAME @2)
++     _5 = _4 ^ c_10;  //BIT_XOR_EXPR (SSA_NAME, PHI @1)
++     _6 = _5 & 255;   //BIT_AND_EXPR (SSA_NAME, INTEGER_CST@3)
++  */
++
++  if (!gimple_crc_match_index (index, res_ops, NULL))
++    return false;
++  gimple *s_res_stmt = SSA_NAME_DEF_STMT (res_ops[1]);
++  tree s_res = TREE_OPERAND (gimple_assign_rhs1 (s_res_stmt), 0);
++  if (res_ops[0] != gimple_phi_result (c)
++      || s_res != gimple_phi_result (s))
++    return false;
++
++  /* Try to match
++     _8 = c_12 >> 8;  // RSHIFT_EXPR (SSA_NAME @1, INTEGER_CST @2)
++     c_19 = _7 ^ _8;  // BIT_XOR_EXPR (SSA_NAME@3, SSA_NAME)
++  */
++  edge backedge = find_edge (loop->latch, loop->header);
++  tree updated_c = PHI_ARG_DEF_FROM_EDGE (c, backedge);
++  if (!gimple_crc_match_res (updated_c, res_ops, NULL))
++    return false;
++  if (res_ops[0] != gimple_phi_result (c)
++      || res_ops[2] != gimple_assign_lhs (crc_table_read_stmt))
++    return false;
++
++  return true;
++}
++
++/* Check that the exit condition is n != 0.  */
++static bool
++check_exit_condition (class loop* loop, gphi *n)
++{
++  edge backedge = find_edge (loop->latch, loop->header);
++  gimple *cond_stmt = gsi_stmt (gsi_last_bb (loop->header));
++  if (!cond_stmt || gimple_code (cond_stmt) != GIMPLE_COND
++      || gimple_cond_code (cond_stmt) != NE_EXPR
++      || gimple_cond_lhs (cond_stmt) != PHI_ARG_DEF_FROM_EDGE (n, backedge)
++      || tree_to_uhwi (gimple_cond_rhs (cond_stmt)) != 0)
++    return false;
++
++  return true;
++}
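++/* Illustration (a sketch under assumptions): gimple_crc_match_index and
++   gimple_crc_match_res, declared at the top of this file, are the
++   genmatch-generated entry points for patterns this patch adds to
++   gcc/match.pd; the exact captures are defined there.  In source terms
++   they are meant to cover the two halves of one CRC step:
++
++     index = (c ^ (int) *s) & 0xff;           // gimple_crc_match_index
++     c_new = crc_32_tab[index] ^ (c >> 8);    // gimple_crc_match_res
++
++   check_calculation_pattern then verifies that the captured operands
++   really are the c and s PHIs and the table load.  */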
++
++/* Check the loop body.  The loop body we are trying to match is
++
++# s_10 = PHI <...>
++# n_11 = PHI <...>
++# c_12 = PHI <...>
++_1 = (int) c_12;
++s_18 = s_10 + 1;
++_3 = *s_10;
++_4 = (int) _3;
++_5 = _1 ^ _4;
++_6 = _5 & 255;
++_7 = crc_32_tab[_6];
++_8 = c_12 >> 8;
++c_19 = _7 ^ _8;
++n_20 = n_11 + 4294967295;
++if (n_20 != 0)
++  goto <bb ...>; [INV]
++else
++  goto <bb ...>; [INV]
++
++which is doing a very simple calculation
++do {
++  c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8);
++} while (--n);
++
++In this case, we don't want this loop to have any other operation inside,
++so the matching conditions are
++1. There are only 3 loop variants during each iteration, namely s, c and n,
++   which is enforced by requiring that the loop has exactly 3 phi nodes.
++2. The 3 loop variants should have an evolution pattern in which one of the
++   3 nodes is increased by 1 every iteration, one is decreased by 1 every
++   iteration, and the 3rd one is neither.  These three SSA values are
++   captured for the later arithmetic pattern matching.
++3. Pattern matching for the index of crc_table.
++4. Pattern matching for the result of the c calculation after the read from
++   crc_table.
++5. The exit condition matching.
++ */
++static bool
++crc_loop_body_check (class loop *loop)
++{
++  basic_block header = loop->header;
++  gphi *capture[3];
++  if (!check_num_of_phi (header, capture))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\n num of phi nodes check failed.\n");
++      return false;
++    }
++  if (!check_evolution_pattern (loop, capture))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\n evolution pattern check failed.\n");
++      return false;
++    }
++  if (!check_calculation_pattern (loop, capture))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\n calculation pattern check failed.\n");
++      return false;
++    }
++  if (!check_exit_condition (loop, capture[1] /* n */))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\n exit condition check failed.\n");
++      return false;
++    }
++  return true;
++/* gphi *phi;
++  gphi_iterator gsi;
++  int num_of_phi = 0;
++  //s, n, c;
++  // Only 3 phi nodes are there; every one of the phi nodes comes from 2
++  // edges only, one from the preheader and one from the latch.
++  // s increases by 1 every iteration
++  // n decreases by 1 every iteration
++  // The final one is c, which is the result and should be used as the
++  // start of the later pattern matching
++  for (gsi = gsi_start_phis(loop->header); !gsi_end_p(gsi); gsi_next(&gsi))
++    {
++      phi = gsi.phi();
++
++      if (phi) num_of_phi++;
++      if (num_of_phi > 3) return false; // more than 3 phi nodes
++      if (gimple_phi_num_args(phi) > 2) // more than 2 edges other than one backedge and one preheader edge
++	return false;
++      //capture[num_of_phi - 1] = gimple_phi_result(phi);
++      capture[num_of_phi - 1] = phi;
++    }
++  if (num_of_phi != 3) return false; // phi nodes should be exactly 3 */
++  // Find the evolution pattern for s and n, try to match the identity of these variables.
++/* gphi *s=NULL;
++  gphi *n=NULL;
++  gphi *c=NULL;
++
++  for (int i = 0; i < 3; i++)
++    {
++      if (evolution_pattern_plus_with_p(loop, capture[i], 1))
++	{
++	  if(s != NULL)
++	    return false;
++	  s = capture[i];
++	}
++      else if (evolution_pattern_plus_with_p(loop, capture[i], 4294967295))
++	{
++	  if(n != NULL)
++	    return false;
++	  n = capture[i];
++	}
++      else
++	{
++	  if(c != NULL)
++	    return false;
++	  c = capture[i];
++	}
++    }
++
++  // Some evolution pattern could not be found.
++  if (!n || !s || !c)
++    return false;
++  gphi *s=capture[0];
++  gphi *n=capture[1];
++  gphi *c=capture[2];
++  tree res_ops[3];
++  tree index = TREE_OPERAND (gimple_assign_rhs1 (crc_table_read_stmt), 1);
++
++  /* Try to match
++     _1 = (int) c_12;  //NOP_EXPR (SSA_NAME @1)
++     _4 = (int) _3;    //NOP_EXPR (SSA_NAME @2)
++     _5 = _1 ^ _4;     //BIT_XOR_EXPR (SSA_NAME, SSA_NAME)
++     _6 = _5 & 255;    //BIT_AND_EXPR (SSA_NAME, INTEGER_CST@3)
++
++
++  if (!gimple_crc_match_index(index, res_ops, NULL))
++    return false;
++  gimple *s_res_stmt = SSA_NAME_DEF_STMT(res_ops[1]);
++  tree s_res = TREE_OPERAND(gimple_assign_rhs1(s_res_stmt),0);
++  if (res_ops[0] != gimple_phi_result (c) ||
++      s_res != gimple_phi_result (s))
++    return false;
++
++  /*
++_8 = c_12 >> 8;  // RSHIFT_EXPR (SSA_NAME @1, INTEGER_CST @2)
++c_19 = _7 ^ _8;  // BIT_XOR_EXPR (SSA_NAME@3, SSA_NAME)
++
++  edge backedge = find_edge(loop->latch, loop->header);
++  tree updated_c = PHI_ARG_DEF_FROM_EDGE (c, backedge);
++  if (!gimple_crc_match_res(updated_c, res_ops, NULL))
++    return false;
++  if (res_ops[0] != gimple_phi_result (c)
++      || res_ops[2] != gimple_assign_lhs(crc_table_read_stmt))
++    return false;
++
++  // try to match n as the induction variable
++  // The proceed condition for the back edge is n != 0
++  gimple *cond_stmt = gsi_stmt (gsi_last_bb (loop->header));
++  if (!cond_stmt || gimple_code (cond_stmt) != GIMPLE_COND || gimple_cond_code (cond_stmt) != NE_EXPR
++      || gimple_cond_lhs (cond_stmt) != PHI_ARG_DEF_FROM_EDGE (n, backedge)
++      || tree_to_uhwi(gimple_cond_rhs (cond_stmt)) != 0)
++    return false;
++
++  return true;
++ */
++}
++
++static bool
++match_crc_loop (class loop *loop)
++{
++  if (!crc_loop_form_check (loop))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\nWrong loop form for crc matching.\n");
++      return false;
++    }
++  if (!crc_table_check (loop))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\nWrong crc table for crc matching.\n");
++      return false;
++    }
++  if (!crc_loop_body_check (loop))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\nWrong loop body for crc matching.\n");
++      return false;
++    }
++  return true;
++}
++
++/* The main entry of the loop CRC optimization.  */
++static unsigned int
++tree_ssa_loop_crc ()
++{
++  unsigned int todo = 0;
++  class loop *loop;
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      flow_loops_dump (dump_file, NULL, 1);
++      fprintf (dump_file, "\nStarting the loop_crc pass\n");
++    }
++
++  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	{
++	  fprintf (dump_file, "======================================\n");
++	  fprintf (dump_file, "Processing loop %d:\n", loop->num);
++	  fprintf (dump_file, "======================================\n");
++	  flow_loop_dump (loop, dump_file, NULL, 1);
++	  fprintf (dump_file, "\n\n");
++	}
++
++      if (match_crc_loop (loop))
++	{
++	  if (dump_file && (dump_flags & TDF_DETAILS))
++	    {
++	      fprintf (dump_file, "Loop %d was successfully matched, "
++		       "and the loop can be optimized.\n",
++		       loop->num);
++	    }
++
++	  convert_to_new_loop (loop);
++	}
++    }
++
++  todo |= (TODO_update_ssa);
++  return todo;
++}
++
++/* Loop CRC pass definition.  */
++
++namespace {
++
++const pass_data pass_data_tree_loop_crc =
++{
++  GIMPLE_PASS,
++  "loop_crc",
++  OPTGROUP_LOOP,
++  TV_TREE_LOOP_CRC,
++  (PROP_cfg | PROP_ssa),
++  0,
++  0,
++  0,
++  (TODO_update_ssa | TODO_verify_all)
++};
++
++class pass_loop_crc : public gimple_opt_pass
++{
++public:
++  pass_loop_crc (gcc::context *ctxt)
++    : gimple_opt_pass (pass_data_tree_loop_crc, ctxt)
++  {}
++
++  /* opt_pass methods: */
++  virtual bool gate (function *);
++  virtual unsigned int execute (function *);
++
++}; // class pass_loop_crc
++
++bool
++pass_loop_crc::gate (function *)
++{
++  return (flag_loop_crc > 0 && optimize >= 3);
++}
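++/* Illustration (a sketch based on the tests added by this patch): with the
++   gate above, the pass only runs at -O3 with -floop-crc.  A typical way to
++   exercise it on AArch64/LP64, taken from the dg-options of the new tests,
++   is
++
++     gcc -O3 -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details file.c
++
++   where file.c contains the gzip-style loop shown earlier; the loop_crc
++   dump then reports "the loop can be optimized" for a matched loop.  */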
++unsigned int
++pass_loop_crc::execute (function *fun)
++{
++  if (number_of_loops (fun) <= 1)
++    return 0;
++
++  /* Only supports the LP64 data mode.  */
++  if (TYPE_PRECISION (long_integer_type_node) != 64
++      || POINTER_SIZE != 64 || TYPE_PRECISION (integer_type_node) != 32)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "The current data mode is not supported, "
++		 "only the LP64 data mode is supported.\n");
++      return 0;
++    }
++
++  return tree_ssa_loop_crc ();
++}
++
++} // anon namespace
++
++gimple_opt_pass *
++make_pass_loop_crc (gcc::context *ctxt)
++{
++  return new pass_loop_crc (ctxt);
++}
+\ No newline at end of file
+-- 
+2.33.0
+

diff --git a/0143-Perform-early-if-conversion-of-simple-arithmetic.patch b/0143-Perform-early-if-conversion-of-simple-arithmetic.patch
new file mode 100644
index 0000000..6965a9b
--- /dev/null
+++ b/0143-Perform-early-if-conversion-of-simple-arithmetic.patch
@@ -0,0 +1,109 @@
+From 7acb88ae27eb3e1af0da866d433968143c7754bd Mon Sep 17 00:00:00 2001
+From: Pronin Alexander 00812787
+Date: Thu, 12 Jan 2023 14:52:49 +0300
+Subject: [PATCH 20/33] Perform early if-conversion of simple arithmetic
+
+---
+ gcc/common.opt                      |  4 ++++
+ gcc/match.pd                        | 25 +++++++++++++++++++
+ gcc/testsuite/gcc.dg/ifcvt-gimple.c | 37 +++++++++++++++++++++++++++++
+ 3 files changed, 66 insertions(+)
+ create mode 100644 gcc/testsuite/gcc.dg/ifcvt-gimple.c
+
+diff --git a/gcc/common.opt b/gcc/common.opt
+index 6f0ed7cea..6950756fd 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -1721,6 +1721,10 @@ fif-conversion2
+ Common Report Var(flag_if_conversion2) Optimization
+ Perform conversion of conditional jumps to conditional execution.
+ 
++fif-conversion-gimple
++Common Report Var(flag_if_conversion_gimple) Optimization
++Perform conversion of conditional jumps to branchless equivalents during gimple transformations.
++
+ fstack-reuse=
+ Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization
+ -fstack-reuse=[all|named_vars|none] Set stack reuse level for local variables.
+diff --git a/gcc/match.pd b/gcc/match.pd
+index 01f81b063..e98cd02e0 100644
+--- a/gcc/match.pd
++++ b/gcc/match.pd
+@@ -3402,6 +3402,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
+  )
+ )
+)
++
++(if (flag_if_conversion_gimple)
++ (for simple_op (plus minus bit_and bit_ior bit_xor)
++  (simplify
++   (cond @0 (simple_op @1 INTEGER_CST@2) @1)
++   (switch
++    /* a = cond ? a + 1 : a -> a = a + ((int) cond) */
++    (if (integer_onep (@2))
++     (simple_op @1 (convert (convert:boolean_type_node @0))))
++    /* a = cond ? 
a + powerof2cst : a -> ++ a = a + ((int) cond) << log2 (powerof2cst) */ ++ (if (INTEGRAL_TYPE_P (type) && integer_pow2p (@2)) ++ (with ++ { ++ tree shift = build_int_cst (integer_type_node, tree_log2 (@2)); ++ } ++ (simple_op @1 (lshift (convert (convert:boolean_type_node @0)) ++ { shift; }) ++ ) ++ ) ++ ) ++ ) ++ ) ++ ) ++) + #endif + + #if GIMPLE +diff --git a/gcc/testsuite/gcc.dg/ifcvt-gimple.c b/gcc/testsuite/gcc.dg/ifcvt-gimple.c +new file mode 100644 +index 000000000..0f7c87e5c +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/ifcvt-gimple.c +@@ -0,0 +1,37 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fif-conversion-gimple -fdump-tree-optimized" } */ ++ ++int test_int (int optimizable_int) { ++ if (optimizable_int > 5) ++ ++optimizable_int; ++ return optimizable_int; ++} ++ ++int test_int_pow2 (int optimizable_int_pow2) { ++ if (optimizable_int_pow2 <= 4) ++ optimizable_int_pow2 += 1024; ++ return optimizable_int_pow2; ++} ++ ++int test_int_non_pow2 (int not_optimizable_int_non_pow2) { ++ if (not_optimizable_int_non_pow2 == 1) ++ not_optimizable_int_non_pow2 += 513; ++ return not_optimizable_int_non_pow2; ++} ++ ++float test_float (float not_optimizable_float) { ++ if (not_optimizable_float > 5) ++ not_optimizable_float += 1; ++ return not_optimizable_float; ++} ++ ++/* Expecting if-else block in test_float and test_int_non_pow2 only. */ ++/* { dg-final { scan-tree-dump-not "if \\(optimizable" "optimized" } } */ ++/* { dg-final { scan-tree-dump "if \\(not_optimizable_int_non_pow2" "optimized" } } */ ++/* { dg-final { scan-tree-dump "if \\(not_optimizable_float" "optimized" } } */ ++/* { dg-final { scan-tree-dump-times "if " 2 "optimized" } } */ ++/* { dg-final { scan-tree-dump-times "else" 2 "optimized" } } */ ++ ++/* Expecting shifted result only for optimizable_int_pow2. */ ++/* { dg-final { scan-tree-dump-times " << " 1 "optimized" } } */ ++/* { dg-final { scan-tree-dump " << 10;" "optimized" } } */ +-- +2.33.0 + diff --git a/0144-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch b/0144-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch new file mode 100644 index 0000000..e5ea737 --- /dev/null +++ b/0144-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch @@ -0,0 +1,236 @@ +From f788555b23b0b676729bb695af96954fe083e354 Mon Sep 17 00:00:00 2001 +From: Pronin Alexander 00812787 +Date: Tue, 24 Jan 2023 16:43:40 +0300 +Subject: [PATCH 21/33] Add option to allow matching uaddsub overflow for widen + ops too. + +--- + gcc/common.opt | 5 ++ + gcc/testsuite/gcc.dg/uaddsub.c | 143 +++++++++++++++++++++++++++++++++ + gcc/tree-ssa-math-opts.c | 35 +++++++- + 3 files changed, 179 insertions(+), 4 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/uaddsub.c + +diff --git a/gcc/common.opt b/gcc/common.opt +index 6950756fd..c2f01bbc0 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -2989,6 +2989,11 @@ freciprocal-math + Common Report Var(flag_reciprocal_math) SetByCombined Optimization + Same as -fassociative-math for expressions which include division. + ++fuaddsub-overflow-match-all ++Common Report Var(flag_uaddsub_overflow_match_all) ++Match unsigned add/sub overflow even if the target does not support ++the corresponding instruction. ++ + ; Nonzero means that unsafe floating-point math optimizations are allowed + ; for the sake of speed. 
IEEE compliance is not guaranteed, and operations + ; are allowed to assume that their arguments and results are "normal" +diff --git a/gcc/testsuite/gcc.dg/uaddsub.c b/gcc/testsuite/gcc.dg/uaddsub.c +new file mode 100644 +index 000000000..96c26d308 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/uaddsub.c +@@ -0,0 +1,143 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -fuaddsub-overflow-match-all -fdump-tree-optimized" } */ ++#include ++ ++typedef unsigned __int128 uint128_t; ++typedef struct uint256_t ++{ ++ uint128_t lo; ++ uint128_t hi; ++} uint256_t; ++ ++uint16_t add16 (uint8_t a, uint8_t b) ++{ ++ uint8_t tmp = a + b; ++ uint8_t overflow = 0; ++ if (tmp < a) ++ overflow = 1; ++ ++ uint16_t res = overflow; ++ res <<= 8; ++ res += tmp; ++ return res; ++} ++ ++uint32_t add32 (uint16_t a, uint16_t b) ++{ ++ uint16_t tmp = a + b; ++ uint16_t overflow = 0; ++ if (tmp < a) ++ overflow = 1; ++ ++ uint32_t res = overflow; ++ res <<= 16; ++ res += tmp; ++ return res; ++} ++ ++uint64_t add64 (uint32_t a, uint32_t b) ++{ ++ uint32_t tmp = a + b; ++ uint32_t overflow = 0; ++ if (tmp < a) ++ overflow = 1; ++ ++ uint64_t res = overflow; ++ res <<= 32; ++ res += tmp; ++ return res; ++} ++ ++uint128_t add128 (uint64_t a, uint64_t b) ++{ ++ uint64_t tmp = a + b; ++ uint64_t overflow = 0; ++ if (tmp < a) ++ overflow = 1; ++ ++ uint128_t res = overflow; ++ res <<= 64; ++ res += tmp; ++ return res; ++} ++ ++uint256_t add256 (uint128_t a, uint128_t b) ++{ ++ uint128_t tmp = a + b; ++ uint128_t overflow = 0; ++ if (tmp < a) ++ overflow = 1; ++ ++ uint256_t res; ++ res.hi = overflow; ++ res.lo = tmp; ++ return res; ++} ++ ++uint16_t sub16 (uint8_t a, uint8_t b) ++{ ++ uint8_t tmp = a - b; ++ uint8_t overflow = 0; ++ if (tmp > a) ++ overflow = -1; ++ ++ uint16_t res = overflow; ++ res <<= 8; ++ res += tmp; ++ return res; ++} ++ ++uint32_t sub32 (uint16_t a, uint16_t b) ++{ ++ uint16_t tmp = a - b; ++ uint16_t overflow = 0; ++ if (tmp > a) ++ overflow = -1; ++ ++ uint32_t res = overflow; ++ res <<= 16; ++ res += tmp; ++ return res; ++} ++ ++uint64_t sub64 (uint32_t a, uint32_t b) ++{ ++ uint32_t tmp = a - b; ++ uint32_t overflow = 0; ++ if (tmp > a) ++ overflow = -1; ++ ++ uint64_t res = overflow; ++ res <<= 32; ++ res += tmp; ++ return res; ++} ++ ++uint128_t sub128 (uint64_t a, uint64_t b) ++{ ++ uint64_t tmp = a - b; ++ uint64_t overflow = 0; ++ if (tmp > a) ++ overflow = -1; ++ ++ uint128_t res = overflow; ++ res <<= 64; ++ res += tmp; ++ return res; ++} ++ ++uint256_t sub256 (uint128_t a, uint128_t b) ++{ ++ uint128_t tmp = a - b; ++ uint128_t overflow = 0; ++ if (tmp > a) ++ overflow = -1; ++ ++ uint256_t res; ++ res.hi = overflow; ++ res.lo = tmp; ++ return res; ++} ++ ++/* { dg-final { scan-tree-dump-times "= .ADD_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */ ++/* { dg-final { scan-tree-dump-times "= .SUB_OVERFLOW \\(a_\[0-9\]+\\(D\\), b_\[0-9\]+\\(D\\)\\)" 5 "optimized" } } */ +diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c +index 4c89fddcf..716bf9e35 100644 +--- a/gcc/tree-ssa-math-opts.c ++++ b/gcc/tree-ssa-math-opts.c +@@ -3290,6 +3290,27 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2, + } + } + ++/* Check if the corresponding operation has wider equivalent on the target. 
*/ ++ ++static bool ++wider_optab_check_p (optab op, machine_mode mode, int unsignedp) ++{ ++ machine_mode wider_mode; ++ FOR_EACH_WIDER_MODE (wider_mode, mode) ++ { ++ machine_mode next_mode; ++ if (optab_handler (op, wider_mode) != CODE_FOR_nothing ++ || (op == smul_optab ++ && GET_MODE_WIDER_MODE (wider_mode).exists (&next_mode) ++ && (find_widening_optab_handler ((unsignedp ++ ? umul_widen_optab ++ : smul_widen_optab), ++ next_mode, mode)))) ++ return true; ++ } ++ ++ return false; ++} + + /* Helper function of match_uaddsub_overflow. Return 1 + if USE_STMT is unsigned overflow check ovf != 0 for +@@ -3390,12 +3411,18 @@ match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt, + gimple *use_stmt; + + gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR); ++ optab op = code == PLUS_EXPR ? uaddv4_optab : usubv4_optab; ++ machine_mode mode = TYPE_MODE (type); ++ int unsignedp = TYPE_UNSIGNED (type); + if (!INTEGRAL_TYPE_P (type) +- || !TYPE_UNSIGNED (type) ++ || !unsignedp + || has_zero_uses (lhs) +- || has_single_use (lhs) +- || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab, +- TYPE_MODE (type)) == CODE_FOR_nothing) ++ || has_single_use (lhs)) ++ return false; ++ ++ if (optab_handler (op, mode) == CODE_FOR_nothing ++ && (!flag_uaddsub_overflow_match_all ++ || !wider_optab_check_p (op, mode, unsignedp))) + return false; + + FOR_EACH_IMM_USE_FAST (use_p, iter, lhs) +-- +2.33.0 + diff --git a/0145-Match-double-sized-mul-pattern.patch b/0145-Match-double-sized-mul-pattern.patch new file mode 100644 index 0000000..f04d6ce --- /dev/null +++ b/0145-Match-double-sized-mul-pattern.patch @@ -0,0 +1,488 @@ +From 3be7a26a08772d014f54f7b1a0555ccca91115d6 Mon Sep 17 00:00:00 2001 +From: Pronin Alexander 00812787 +Date: Wed, 25 Jan 2023 15:04:07 +0300 +Subject: [PATCH 22/33] Match double sized mul pattern + +--- + gcc/match.pd | 136 +++++++++++++++++++++ + gcc/testsuite/gcc.dg/double_sized_mul-1.c | 141 ++++++++++++++++++++++ + gcc/testsuite/gcc.dg/double_sized_mul-2.c | 62 ++++++++++ + gcc/tree-ssa-math-opts.c | 80 ++++++++++++ + 4 files changed, 419 insertions(+) + create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-1.c + create mode 100644 gcc/testsuite/gcc.dg/double_sized_mul-2.c + +diff --git a/gcc/match.pd b/gcc/match.pd +index e98cd02e0..74f8ab999 100644 +--- a/gcc/match.pd ++++ b/gcc/match.pd +@@ -6390,3 +6390,139 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + to the number of trailing zeroes. */ + (match (ctz_table_index @1 @2 @3) + (rshift (mult (bit_and:c (negate @1) @1) INTEGER_CST@2) INTEGER_CST@3)) ++ ++/* Match multiplication with double sized result. ++ ++ Consider the following calculations: ++ arg0 * arg1 = (2^(bit_size/2) * arg0_hi + arg0_lo) ++ * (2^(bit_size/2) * arg1_hi + arg1_lo) ++ arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi ++ + 2^(bit_size/2) * (arg0_hi * arg1_lo + arg0_lo * arg1_hi) ++ + arg0_lo * arg1_lo ++ ++ The products of high and low parts fits in bit_size values, thus they are ++ placed in high and low parts of result respectively. ++ ++ The sum of the mixed products may overflow, so we need a detection for that. ++ Also it has a bit_size/2 offset, thus it intersects with both high and low ++ parts of result. Overflow detection constant is bit_size/2 due to this. 
++ ++ With this info: ++ arg0 * arg1 = 2^bit_size * arg0_hi * arg1_hi ++ + 2^(bit_size/2) * middle ++ + 2^bit_size * possible_middle_overflow ++ + arg0_lo * arg1_lo ++ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow) ++ + 2^(bit_size/2) * (2^(bit_size/2) * middle_hi + middle_lo) ++ + arg0_lo * arg1_lo ++ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + middle_hi ++ + possible_middle_overflow) ++ + 2^(bit_size/2) * middle_lo ++ + arg0_lo * arg1_lo ++ ++ The last sum can produce overflow for the high result part. With this: ++ arg0 * arg1 = 2^bit_size * (arg0_hi * arg1_hi + possible_middle_overflow ++ + possible_res_lo_overflow + middle_hi) ++ + res_lo ++ = res_hi + res_lo ++ ++ This formula is quite big to fit into one match pattern with all of the ++ combinations of terms inside it. There are many helpers for better code ++ readability. ++ ++ The simplification basis is res_hi: assuming that res_lo only is not ++ real practical case for such calculations. ++ ++ Overflow handling is done via matching complex calculations: ++ the realpart and imagpart are quite handy here. */ ++/* Match low and high parts of the argument. */ ++(match (double_size_mul_arg_lo @0 @1) ++ (bit_and @0 INTEGER_CST@1) ++ (if (wi::to_wide (@1) ++ == wi::mask (TYPE_PRECISION (type) / 2, false, TYPE_PRECISION (type))))) ++(match (double_size_mul_arg_hi @0 @1) ++ (rshift @0 INTEGER_CST@1) ++ (if (wi::to_wide (@1) == TYPE_PRECISION (type) / 2))) ++ ++/* Match various argument parts products. */ ++(match (double_size_mul_lolo @0 @1) ++ (mult@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_lo @1 @3)) ++ (if (single_use (@4)))) ++(match (double_size_mul_hihi @0 @1) ++ (mult@4 (double_size_mul_arg_hi @0 @2) (double_size_mul_arg_hi @1 @3)) ++ (if (single_use (@4)))) ++(match (double_size_mul_lohi @0 @1) ++ (mult:c@4 (double_size_mul_arg_lo @0 @2) (double_size_mul_arg_hi @1 @3)) ++ (if (single_use (@4)))) ++ ++/* Match complex middle sum. */ ++(match (double_size_mul_middle_complex @0 @1) ++ (IFN_ADD_OVERFLOW@2 (double_size_mul_lohi @0 @1) (double_size_mul_lohi @1 @0)) ++ (if (num_imm_uses (@2) == 2))) ++ ++/* Match real middle results. */ ++(match (double_size_mul_middle @0 @1) ++ (realpart@2 (double_size_mul_middle_complex @0 @1)) ++ (if (num_imm_uses (@2) == 2))) ++(match (double_size_mul_middleres_lo @0 @1) ++ (lshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2) ++ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2 ++ && single_use (@3)))) ++(match (double_size_mul_middleres_hi @0 @1) ++ (rshift@3 (double_size_mul_middle @0 @1) INTEGER_CST@2) ++ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2 ++ && single_use (@3)))) ++ ++/* Match low result part. */ ++/* Number of uses may be < 2 in case when we are interested in ++ high part only. */ ++(match (double_size_mul_res_lo_complex @0 @1) ++ (IFN_ADD_OVERFLOW:c@2 ++ (double_size_mul_lolo:c @0 @1) (double_size_mul_middleres_lo @0 @1)) ++ (if (num_imm_uses (@2) <= 2))) ++(match (double_size_mul_res_lo @0 @1) ++ (realpart (double_size_mul_res_lo_complex @0 @1))) ++ ++/* Match overflow terms. 
*/ ++(match (double_size_mul_overflow_check_lo @0 @1 @5) ++ (convert@4 (ne@3 ++ (imagpart@2 (double_size_mul_res_lo_complex@5 @0 @1)) integer_zerop)) ++ (if (single_use (@2) && single_use (@3) && single_use (@4)))) ++(match (double_size_mul_overflow_check_hi @0 @1) ++ (lshift@6 (convert@5 (ne@4 ++ (imagpart@3 (double_size_mul_middle_complex @0 @1)) integer_zerop)) ++ INTEGER_CST@2) ++ (if (wi::to_wide (@2) == TYPE_PRECISION (type) / 2 ++ && single_use (@3) && single_use (@4) && single_use (@5) ++ && single_use (@6)))) ++ ++/* Match all possible permutations for high result part calculations. */ ++(for op1 (double_size_mul_hihi ++ double_size_mul_overflow_check_hi ++ double_size_mul_middleres_hi) ++ op2 (double_size_mul_overflow_check_hi ++ double_size_mul_middleres_hi ++ double_size_mul_hihi) ++ op3 (double_size_mul_middleres_hi ++ double_size_mul_hihi ++ double_size_mul_overflow_check_hi) ++ (match (double_size_mul_candidate @0 @1 @2 @3) ++ (plus:c@2 ++ (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3) (op1:c @0 @1)) ++ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1))) ++ (if (single_use (@4) && single_use (@5)))) ++ (match (double_size_mul_candidate @0 @1 @2 @3) ++ (plus:c@2 (double_size_mul_overflow_check_lo @0 @1 @3) ++ (plus:c@4 (op1:c @0 @1) ++ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1)))) ++ (if (single_use (@4) && single_use (@5)))) ++ (match (double_size_mul_candidate @0 @1 @2 @3) ++ (plus:c@2 (op1:c @0 @1) ++ (plus:c@4 (double_size_mul_overflow_check_lo @0 @1 @3) ++ (plus:c@5 (op2:c @0 @1) (op3:c @0 @1)))) ++ (if (single_use (@4) && single_use (@5)))) ++ (match (double_size_mul_candidate @0 @1 @2 @3) ++ (plus:c@2 (op1:c @0 @1) ++ (plus:c@4 (op2:c @0 @1) ++ (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1)))) ++ (if (single_use (@4) && single_use (@5))))) +diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-1.c b/gcc/testsuite/gcc.dg/double_sized_mul-1.c +new file mode 100644 +index 000000000..4d475cc8a +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/double_sized_mul-1.c +@@ -0,0 +1,141 @@ ++/* { dg-do compile } */ ++/* fif-conversion-gimple and fuaddsub-overflow-match-all are required for ++ proper overflow detection in some cases. */ ++/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */ ++#include ++ ++typedef unsigned __int128 uint128_t; ++ ++uint16_t mul16 (uint8_t a, uint8_t b) ++{ ++ uint8_t a_lo = a & 0xF; ++ uint8_t b_lo = b & 0xF; ++ uint8_t a_hi = a >> 4; ++ uint8_t b_hi = b >> 4; ++ uint8_t lolo = a_lo * b_lo; ++ uint8_t lohi = a_lo * b_hi; ++ uint8_t hilo = a_hi * b_lo; ++ uint8_t hihi = a_hi * b_hi; ++ uint8_t middle = hilo + lohi; ++ uint8_t middle_hi = middle >> 4; ++ uint8_t middle_lo = middle << 4; ++ uint8_t res_lo = lolo + middle_lo; ++ uint8_t res_hi = hihi + middle_hi; ++ res_hi += (res_lo < middle_lo ? 1 : 0); ++ res_hi += (middle < hilo ? 0x10 : 0); ++ uint16_t res = ((uint16_t) res_hi) << 8; ++ res += res_lo; ++ return res; ++} ++ ++uint32_t mul32 (uint16_t a, uint16_t b) ++{ ++ uint16_t a_lo = a & 0xFF; ++ uint16_t b_lo = b & 0xFF; ++ uint16_t a_hi = a >> 8; ++ uint16_t b_hi = b >> 8; ++ uint16_t lolo = a_lo * b_lo; ++ uint16_t lohi = a_lo * b_hi; ++ uint16_t hilo = a_hi * b_lo; ++ uint16_t hihi = a_hi * b_hi; ++ uint16_t middle = hilo + lohi; ++ uint16_t middle_hi = middle >> 8; ++ uint16_t middle_lo = middle << 8; ++ uint16_t res_lo = lolo + middle_lo; ++ uint16_t res_hi = hihi + middle_hi; ++ res_hi += (res_lo < middle_lo ? 1 : 0); ++ res_hi += (middle < hilo ? 
0x100 : 0); ++ uint32_t res = ((uint32_t) res_hi) << 16; ++ res += res_lo; ++ return res; ++} ++ ++uint64_t mul64 (uint32_t a, uint32_t b) ++{ ++ uint32_t a_lo = a & 0xFFFF; ++ uint32_t b_lo = b & 0xFFFF; ++ uint32_t a_hi = a >> 16; ++ uint32_t b_hi = b >> 16; ++ uint32_t lolo = a_lo * b_lo; ++ uint32_t lohi = a_lo * b_hi; ++ uint32_t hilo = a_hi * b_lo; ++ uint32_t hihi = a_hi * b_hi; ++ uint32_t middle = hilo + lohi; ++ uint32_t middle_hi = middle >> 16; ++ uint32_t middle_lo = middle << 16; ++ uint32_t res_lo = lolo + middle_lo; ++ uint32_t res_hi = hihi + middle_hi; ++ res_hi += (res_lo < middle_lo ? 1 : 0); ++ res_hi += (middle < hilo ? 0x10000 : 0); ++ uint64_t res = ((uint64_t) res_hi) << 32; ++ res += res_lo; ++ return res; ++} ++ ++uint128_t mul128 (uint64_t a, uint64_t b) ++{ ++ uint64_t a_lo = a & 0xFFFFFFFF; ++ uint64_t b_lo = b & 0xFFFFFFFF; ++ uint64_t a_hi = a >> 32; ++ uint64_t b_hi = b >> 32; ++ uint64_t lolo = a_lo * b_lo; ++ uint64_t lohi = a_lo * b_hi; ++ uint64_t hilo = a_hi * b_lo; ++ uint64_t hihi = a_hi * b_hi; ++ uint64_t middle = hilo + lohi; ++ uint64_t middle_hi = middle >> 32; ++ uint64_t middle_lo = middle << 32; ++ uint64_t res_lo = lolo + middle_lo; ++ uint64_t res_hi = hihi + middle_hi; ++ res_hi += (res_lo < middle_lo ? 1 : 0); ++ res_hi += (middle < hilo ? 0x100000000 : 0); ++ uint128_t res = ((uint128_t) res_hi) << 64; ++ res += res_lo; ++ return res; ++} ++ ++uint64_t mul64_perm (uint32_t a, uint32_t b) ++{ ++ uint32_t a_lo = a & 0xFFFF; ++ uint32_t b_lo = b & 0xFFFF; ++ uint32_t a_hi = a >> 16; ++ uint32_t b_hi = b >> 16; ++ uint32_t lolo = a_lo * b_lo; ++ uint32_t lohi = a_lo * b_hi; ++ uint32_t hilo = a_hi * b_lo; ++ uint32_t hihi = a_hi * b_hi; ++ uint32_t middle = hilo + lohi; ++ uint32_t middle_hi = middle >> 16; ++ uint32_t middle_lo = middle << 16; ++ uint32_t res_lo = lolo + middle_lo; ++ uint32_t res_hi = hihi + middle_hi; ++ res_hi = res_lo < middle_lo ? res_hi + 1 : res_hi; ++ res_hi = middle < hilo ? res_hi + 0x10000 : res_hi; ++ uint64_t res = ((uint64_t) res_hi) << 32; ++ res += res_lo; ++ return res; ++} ++ ++uint128_t mul128_perm (uint64_t a, uint64_t b) ++{ ++ uint64_t a_lo = a & 0xFFFFFFFF; ++ uint64_t b_lo = b & 0xFFFFFFFF; ++ uint64_t a_hi = a >> 32; ++ uint64_t b_hi = b >> 32; ++ uint64_t lolo = a_lo * b_lo; ++ uint64_t lohi = a_lo * b_hi; ++ uint64_t hilo = a_hi * b_lo; ++ uint64_t hihi = a_hi * b_hi; ++ uint64_t middle = hilo + lohi; ++ uint64_t middle_hi = middle >> 32; ++ uint64_t middle_lo = middle << 32; ++ uint64_t res_lo = lolo + middle_lo; ++ uint64_t res_hi = hihi + middle_hi; ++ res_hi = res_lo < middle_lo ? res_hi + 1 : res_hi; ++ res_hi = middle < hilo ? res_hi + 0x100000000 : res_hi; ++ uint128_t res = ((uint128_t) res_hi) << 64; ++ res += res_lo; ++ return res; ++} ++ ++/* { dg-final { scan-tree-dump-times "double sized mul optimized: 1" 6 "widening_mul" } } */ +diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-2.c b/gcc/testsuite/gcc.dg/double_sized_mul-2.c +new file mode 100644 +index 000000000..cc6e5af25 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/double_sized_mul-2.c +@@ -0,0 +1,62 @@ ++/* { dg-do compile } */ ++/* fif-conversion-gimple is required for proper overflow detection ++ in some cases. 
*/ ++/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */ ++#include ++ ++typedef unsigned __int128 uint128_t; ++typedef struct uint256_t ++{ ++ uint128_t lo; ++ uint128_t hi; ++} uint256_t; ++ ++uint64_t mul64_double_use (uint32_t a, uint32_t b) ++{ ++ uint32_t a_lo = a & 0xFFFF; ++ uint32_t b_lo = b & 0xFFFF; ++ uint32_t a_hi = a >> 16; ++ uint32_t b_hi = b >> 16; ++ uint32_t lolo = a_lo * b_lo; ++ uint32_t lohi = a_lo * b_hi; ++ uint32_t hilo = a_hi * b_lo; ++ uint32_t hihi = a_hi * b_hi; ++ uint32_t middle = hilo + lohi; ++ uint32_t middle_hi = middle >> 16; ++ uint32_t middle_lo = middle << 16; ++ uint32_t res_lo = lolo + middle_lo; ++ uint32_t res_hi = hihi + middle_hi; ++ res_hi += (res_lo < middle_lo ? 1 : 0); ++ res_hi += (middle < hilo ? 0x10000 : 0); ++ uint64_t res = ((uint64_t) res_hi) << 32; ++ res += res_lo; ++ return res + lolo; ++} ++ ++uint256_t mul256 (uint128_t a, uint128_t b) ++{ ++ uint128_t a_lo = a & 0xFFFFFFFFFFFFFFFF; ++ uint128_t b_lo = b & 0xFFFFFFFFFFFFFFFF; ++ uint128_t a_hi = a >> 64; ++ uint128_t b_hi = b >> 64; ++ uint128_t lolo = a_lo * b_lo; ++ uint128_t lohi = a_lo * b_hi; ++ uint128_t hilo = a_hi * b_lo; ++ uint128_t hihi = a_hi * b_hi; ++ uint128_t middle = hilo + lohi; ++ uint128_t middle_hi = middle >> 64; ++ uint128_t middle_lo = middle << 64; ++ uint128_t res_lo = lolo + middle_lo; ++ uint128_t res_hi = hihi + middle_hi; ++ res_hi += (res_lo < middle_lo ? 1 : 0); ++ /* Constant is to big warning WA */ ++ uint128_t overflow_tmp = (middle < hilo ? 1 : 0); ++ overflow_tmp <<= 64; ++ res_hi += overflow_tmp; ++ uint256_t res; ++ res.lo = res_lo; ++ res.hi = res_hi; ++ return res; ++} ++ ++/* { dg-final { scan-tree-dump-not "double sized mul optimized" "widening_mul" } } */ +diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c +index 716bf9e35..a81d7501c 100644 +--- a/gcc/tree-ssa-math-opts.c ++++ b/gcc/tree-ssa-math-opts.c +@@ -182,6 +182,9 @@ static struct + + /* Number of divmod calls inserted. */ + int divmod_calls_inserted; ++ ++ /* Number of optimized double sized multiplications. */ ++ int double_sized_mul_optimized; + } widen_mul_stats; + + /* The instance of "struct occurrence" representing the highest +@@ -3708,6 +3711,78 @@ convert_to_divmod (gassign *stmt) + return true; + } + ++/* Pattern matcher for double sized multiplication defined in match.pd. */ ++extern bool gimple_double_size_mul_candidate (tree, tree*, tree (*)(tree)); ++ ++static bool ++convert_double_size_mul (gimple_stmt_iterator *gsi, gimple *stmt) ++{ ++ gimple *use_stmt, *complex_res_lo; ++ gimple_stmt_iterator insert_before; ++ imm_use_iterator use_iter; ++ tree match[4]; // arg0, arg1, res_hi, complex_res_lo ++ tree arg0, arg1, widen_mult, new_type, tmp; ++ tree lhs = gimple_assign_lhs (stmt); ++ location_t loc = UNKNOWN_LOCATION; ++ machine_mode mode; ++ ++ if (!gimple_double_size_mul_candidate (lhs, match, NULL)) ++ return false; ++ ++ new_type = build_nonstandard_integer_type ( ++ TYPE_PRECISION (TREE_TYPE (match[0])) * 2, 1); ++ mode = TYPE_MODE (new_type); ++ ++ /* Early return if the target multiplication doesn't exist on target. */ ++ if (optab_handler (smul_optab, mode) == CODE_FOR_nothing ++ && !wider_optab_check_p (smul_optab, mode, 1)) ++ return false; ++ ++ /* Determine the point where the wide multiplication ++ should be inserted. Complex low res is OK since it is required ++ by both high and low part getters, thus it dominates both of them. 
*/ ++ complex_res_lo = SSA_NAME_DEF_STMT (match[3]); ++ insert_before = gsi_for_stmt (complex_res_lo); ++ gsi_next (&insert_before); ++ ++ /* Create the widen multiplication. */ ++ arg0 = build_and_insert_cast (&insert_before, loc, new_type, match[0]); ++ arg1 = build_and_insert_cast (&insert_before, loc, new_type, match[1]); ++ widen_mult = build_and_insert_binop (&insert_before, loc, "widen_mult", ++ MULT_EXPR, arg0, arg1); ++ ++ /* Find the mult low part getter. */ ++ FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, match[3]) ++ if (gimple_assign_rhs_code (use_stmt) == REALPART_EXPR) ++ BREAK_FROM_IMM_USE_STMT (use_iter); ++ ++ /* Create high and low (if needed) parts extractors. */ ++ /* Low part. */ ++ if (use_stmt) ++ { ++ loc = gimple_location (use_stmt); ++ tmp = build_and_insert_cast (&insert_before, loc, ++ TREE_TYPE (gimple_get_lhs (use_stmt)), ++ widen_mult); ++ gassign *new_stmt = gimple_build_assign (gimple_get_lhs (use_stmt), ++ NOP_EXPR, tmp); ++ gsi_replace (&insert_before, new_stmt, true); ++ } ++ ++ /* High part. */ ++ loc = gimple_location (stmt); ++ tmp = build_and_insert_binop (gsi, loc, "widen_mult_hi", ++ RSHIFT_EXPR, widen_mult, ++ build_int_cst (new_type, ++ TYPE_PRECISION (new_type) / 2)); ++ tmp = build_and_insert_cast (gsi, loc, TREE_TYPE (lhs), tmp); ++ gassign *new_stmt = gimple_build_assign (lhs, NOP_EXPR, tmp); ++ gsi_replace (gsi, new_stmt, true); ++ ++ widen_mul_stats.double_sized_mul_optimized++; ++ return true; ++} ++ + /* Find integer multiplications where the operands are extended from + smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR + where appropriate. */ +@@ -3801,6 +3876,9 @@ math_opts_dom_walker::after_dom_children (basic_block bb) + break; + + case PLUS_EXPR: ++ if (convert_double_size_mul (&gsi, stmt)) ++ break; ++ __attribute__ ((fallthrough)); + case MINUS_EXPR: + if (!convert_plusminus_to_widen (&gsi, stmt, code)) + match_uaddsub_overflow (&gsi, stmt, code); +@@ -3892,6 +3970,8 @@ pass_optimize_widening_mul::execute (function *fun) + widen_mul_stats.fmas_inserted); + statistics_counter_event (fun, "divmod calls inserted", + widen_mul_stats.divmod_calls_inserted); ++ statistics_counter_event (fun, "double sized mul optimized", ++ widen_mul_stats.double_sized_mul_optimized); + + return cfg_changed ? TODO_cleanup_cfg : 0; + } +-- +2.33.0 + diff --git a/0146-LOOP-CRC32-Add-Crc32-Optimization-in-Gzip-For-crc32-.patch b/0146-LOOP-CRC32-Add-Crc32-Optimization-in-Gzip-For-crc32-.patch new file mode 100644 index 0000000..a9a8e94 --- /dev/null +++ b/0146-LOOP-CRC32-Add-Crc32-Optimization-in-Gzip-For-crc32-.patch @@ -0,0 +1,2354 @@ +From 179412c66d0cdd6a48ef1c29acae90908102a1c9 Mon Sep 17 00:00:00 2001 +From: xingyushuai +Date: Mon, 24 Apr 2023 09:34:35 +0800 +Subject: [PATCH 08/13] [LOOP CRC32]Add Crc32 Optimization in Gzip For crc32 + algorithm in APBC int_gzip. Match crc32 lookup table algorithm. 
An example
+ for the crc32 lookup table algorithm:
+
+```c
+do {
+  c = crc_32_tab[((int)c ^ (*s++)) & 0xff] ^ (c >> 8);
+} while (--n);
+```
+
+Usage: `gcc -O3 -march=armv8.1-a -floop-crc yourfile.c`
+Note: the CPU you use needs to support the crc32 instructions.
+---
+ gcc/config/aarch64/aarch64-builtins.c         |  29 +
+ gcc/config/aarch64/aarch64-protos.h           |   1 +
+ gcc/config/aarch64/aarch64.c                  |  12 +
+ gcc/doc/tm.texi                               |   9 +
+ gcc/doc/tm.texi.in                            |   2 +
+ gcc/match.pd                                  | 146 +--
+ gcc/passes.def                                |   2 +-
+ gcc/target.def                                |  14 +
+ gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c    |  85 --
+ .../tree-ssa/loop-crc-1.c.042t.loop_crc       |  90 --
+ gcc/testsuite/gcc.dg/tree-ssa/loop-crc-2.c    |  88 --
+ .../tree-ssa/loop-crc-4.c.042t.loop_crc       |   0
+ .../loop-crc-calculation-check-fail.c         | 156 ---
+ ...crc-calculation-check-fail.c.042t.loop_crc |  64 --
+ .../loop-crc-calculation-check-fail.s         | 329 -------
+ ...crc-3.c => loop-crc-loop-condition-fail.c} |   6 +-
+ ...op-crc-4.c => loop-crc-loop-form-fail-2.c} |   7 +-
+ .../gcc.dg/tree-ssa/loop-crc-loop-form-fail.c |   3 +-
+ .../gcc.dg/tree-ssa/loop-crc-sucess.c         |   7 +-
+ .../tree-ssa/loop-crc-table-check-fail.c      |   3 +-
+ gcc/tree-ssa-loop-crc.c                       | 903 +++++++++++++++---
+ 21 files changed, 873 insertions(+), 1083 deletions(-)
+ delete mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c
+ delete mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c.042t.loop_crc
+ delete mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-2.c
+ delete mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-4.c.042t.loop_crc
+ delete mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.c
+ delete mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.c.042t.loop_crc
+ delete mode 100644 gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.s
+ rename gcc/testsuite/gcc.dg/tree-ssa/{loop-crc-3.c => loop-crc-loop-condition-fail.c} (97%)
+ rename gcc/testsuite/gcc.dg/tree-ssa/{loop-crc-4.c => loop-crc-loop-form-fail-2.c} (95%)
+
+diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
+index d92157dff..1e8b046da 100644
+--- a/gcc/config/aarch64/aarch64-builtins.c
++++ b/gcc/config/aarch64/aarch64-builtins.c
+@@ -441,6 +441,12 @@ typedef struct
+ #define VAR1(T, N, MAP, A) \
+   AARCH64_SIMD_BUILTIN_##T##_##N##A,
+ 
++enum aarch64_crc_builtins{
++  AARCH64_BUILTIN_CRC32B,
++  AARCH64_BUILTIN_CRC32H,
++  AARCH64_BUILTIN_CRC32W,
++};
++
+ enum aarch64_builtins
+ {
+   AARCH64_BUILTIN_MIN,
+@@ -1321,6 +1327,29 @@ aarch64_general_builtin_decl (unsigned code, bool)
+ 
+   return aarch64_builtin_decls[code];
+ }
++/* Implement TARGET_GET_CRC_BUILTIN_CODE.  */
++unsigned
++get_crc_builtin_code(unsigned code, bool)
++{
++  if (code > AARCH64_BUILTIN_CRC32W)
++    return AARCH64_BUILTIN_MIN;
++
++  unsigned res = AARCH64_BUILTIN_MIN;
++  switch (code) {
++  case AARCH64_BUILTIN_CRC32B:
++    res = AARCH64_BUILTIN_crc32b;
++    break;
++  case AARCH64_BUILTIN_CRC32H:
++    res = AARCH64_BUILTIN_crc32h;
++    break;
++  case AARCH64_BUILTIN_CRC32W:
++    res = AARCH64_BUILTIN_crc32w;
++    break;
++  default:
++    break;
++  }
++  return res;
++}
+
+diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
+index 9b6d309a7..a0ca662bc 100644
+--- a/gcc/config/aarch64/aarch64-protos.h
++++ b/gcc/config/aarch64/aarch64-protos.h
+@@ -723,6 +723,7 @@ tree aarch64_general_fold_builtin (unsigned int, tree, unsigned int, tree *);
+ gimple *aarch64_general_gimple_fold_builtin (unsigned int, gcall *);
+ rtx aarch64_general_expand_builtin (unsigned int, tree, rtx, 
int); + tree aarch64_general_builtin_decl (unsigned, bool); ++unsigned get_crc_builtin_code(unsigned , bool); + tree aarch64_general_builtin_rsqrt (unsigned int); + tree aarch64_builtin_vectorized_function (unsigned int, tree, tree); + +diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c +index cbdde11b0..b8407c612 100644 +--- a/gcc/config/aarch64/aarch64.c ++++ b/gcc/config/aarch64/aarch64.c +@@ -13735,6 +13735,15 @@ aarch64_builtin_decl (unsigned int code, bool initialize_p) + gcc_unreachable (); + } + ++/* Implement TARGET_GET_CRC_BUILTIN_CODE. */ ++static unsigned ++aarch64_get_crc_builtin_code(unsigned code, bool initialize_p) ++{ ++ unsigned subcode = get_crc_builtin_code(code,initialize_p); ++ unsigned res = subcode << AARCH64_BUILTIN_SHIFT; ++ return res; ++} ++ + /* Return true if it is safe and beneficial to use the approximate rsqrt optabs + to optimize 1.0/sqrt. */ + +@@ -23911,6 +23920,9 @@ aarch64_run_selftests (void) + #undef TARGET_BUILTIN_DECL + #define TARGET_BUILTIN_DECL aarch64_builtin_decl + ++#undef TARGET_GET_CRC_BUILTIN_CODE ++#define TARGET_GET_CRC_BUILTIN_CODE aarch64_get_crc_builtin_code ++ + #undef TARGET_BUILTIN_RECIPROCAL + #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal + +diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi +index 0508fce57..b46418d0b 100644 +--- a/gcc/doc/tm.texi ++++ b/gcc/doc/tm.texi +@@ -11610,6 +11610,15 @@ If @var{code} is out of range the function should return + @code{error_mark_node}. + @end deftypefn + ++@deftypefn {Target Hook} unsigned TARGET_GET_CRC_BUILTIN_CODE (unsigned @var{code}, bool @var{initialize_p}) ++Define this hook to get crc32 builtin code. It should be a function that ++returns the crc32 builtin function code @var{code}. ++If there is no such builtin and it cannot be initialized at this time ++if @var{initialize_p} is true the function should return @code{NULL_TREE}. ++If @var{code} is out of range the function should return ++@code{error_mark_node}. ++@end deftypefn ++ + @deftypefn {Target Hook} rtx TARGET_EXPAND_BUILTIN (tree @var{exp}, rtx @var{target}, rtx @var{subtarget}, machine_mode @var{mode}, int @var{ignore}) + + Expand a call to a machine specific built-in function that was set up by +diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in +index 3b70ea484..2663547c7 100644 +--- a/gcc/doc/tm.texi.in ++++ b/gcc/doc/tm.texi.in +@@ -7941,6 +7941,8 @@ to by @var{ce_info}. + + @hook TARGET_BUILTIN_DECL + ++@hook TARGET_GET_CRC_BUILTIN_CODE ++ + @hook TARGET_EXPAND_BUILTIN + + @hook TARGET_RESOLVE_OVERLOADED_BUILTIN +diff --git a/gcc/match.pd b/gcc/match.pd +index 87b316953..0f92003f7 100644 +--- a/gcc/match.pd ++++ b/gcc/match.pd +@@ -3487,160 +3487,17 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + ) + #endif + +- +-#if GIMPLE +-(if (canonicalize_math_p ()) +-/* These patterns are mostly used by PHIOPT to move some operations outside of +- the if statements. They should be done late because it gives jump threading +- and few other passes to reduce what is going on. */ +-/* a ? x op POW2 : x -> x op (a ? POW2 : 0). 
*/ +- (for op (plus minus bit_ior bit_xor lshift rshift lrotate rrotate) +- (simplify +- (cond @0 (op:s @1 INTEGER_CST@2) @1) +- /* powerof2cst */ +- (if (INTEGRAL_TYPE_P (type) && integer_pow2p (@2)) +- (with { +- tree shift = build_int_cst (integer_type_node, tree_log2 (@2)); +- } +- (op @1 (lshift (convert (convert:boolean_type_node @0)) { shift; }))) +- ) +- ) +- ) +-) +-#endif +- +-#if GIMPLE +-/* These patterns are mostly used by FORWPROP to move some operations outside of +- the if statements. They should be done late because it gives jump threading +- and few other passes to reduce what is going on. */ +-/* Mul64 is defined as a multiplication algorithm which compute two 64-bit integers to one 128-bit integer +- (i64 ResLo, i64 ResHi) = Mul64(i64 In0, i64 In1) { +- In0Lo = In0(D) & 4294967295; +- In0Hi = In0(D) >> 32; +- In1Lo = In1(D) & 4294967295; +- In1Hi = In1(D) >> 32; +- Mull_01 = In0Hi * In1Lo; +- Addc = In0Lo * In1Hi + Mull_01; +- addc32 = Addc << 32; +- ResLo = In0Lo * In1Lo + addc32; +- ResHi = ((long unsigned int) (addc32 > ResLo)) + +- (((long unsigned int) (Mull_01 > Addc)) << 32) + (Addc >> 32) + In0Hi * In1Hi; +- } */ +- (simplify +- (plus +- (plus +- (convert +- (gt @10 +- (plus +- (mult @4 @6) +- (lshift@10 @9 @3)))) +- (lshift +- (convert +- (gt @8 @9)) @3)) +- (plus@11 +- (rshift +- (plus@9 +- (mult (bit_and@4 SSA_NAME@0 @2) @7) +- (mult@8 @5 (bit_and@6 SSA_NAME@1 INTEGER_CST@2))) @3) +- (mult (rshift@5 SSA_NAME@0 @3) +- (rshift@7 SSA_NAME@1 INTEGER_CST@3)))) +- (if (INTEGRAL_TYPE_P (type) && INTEGRAL_TYPE_P (TREE_TYPE (@0)) && types_match (@0, @1) && +- TYPE_PRECISION (type) == 64) +- (with { +- tree i128_type = build_nonstandard_integer_type (128, TYPE_UNSIGNED (type)); +- tree shift = build_int_cst (integer_type_node, 64); +- //direct_internal_fn_supported_p (UMULH, type, OPTIMIZE_FOR_BOTH) +- } +- (convert:type (rshift +- (mult (convert:i128_type @0) (convert:i128_type @1)) { shift; }))) +- ) +- ) +- +- /* (i64 ResLo, i64 ResHi) = Mul64(i64 In0, i64 In1) { +- In0Lo = In0(D) & 4294967295; +- In0Hi = In0(D) >> 32; +- In1Lo = In1(D) & 4294967295; +- In1Hi = In1(D) >> 32; +- Mull_01 = In0Hi * In1Lo; +- Addc = In0Lo * In1Hi + Mull_01; +- addc32 = Addc << 32; +- ResLo = In0(D) * In1(D); +- ResHi = ((long unsigned int) (addc32 > ResLo)) + +- (((long unsigned int) (Mull_01 > Addc)) << 32) + (Addc >> 32) + In0Hi * In1Hi; +- } */ +- (simplify +- (plus +- (plus +- (convert +- (gt (lshift@10 @9 @3) +- (mult @0 @1))) +- (lshift +- (convert +- (gt @8 @9)) @3)) +- (plus@11 +- (rshift +- (plus@9 +- (mult (bit_and@4 SSA_NAME@0 @2) @7) +- (mult@8 @5 (bit_and@6 SSA_NAME@1 INTEGER_CST@2))) @3) +- (mult (rshift@5 SSA_NAME@0 @3) +- (rshift@7 SSA_NAME@1 INTEGER_CST@3)))) +- (if (INTEGRAL_TYPE_P (type) && INTEGRAL_TYPE_P (TREE_TYPE (@0)) && types_match (@0, @1) && +- TYPE_PRECISION (type) == 64) +- (with { +- tree i128_type = build_nonstandard_integer_type (128, TYPE_UNSIGNED (type)); +- tree shift = build_int_cst (integer_type_node, 64); +- //direct_internal_fn_supported_p (UMULH, type, OPTIMIZE_FOR_BOTH) +- } +- (convert:type (rshift +- (mult (convert:i128_type @0) (convert:i128_type @1)) { shift; }))) +- ) +- ) +-#endif +- +-#if GIMPLE +-/* These patterns are mostly used by FORWPROP to move some operations outside of +- the if statements. They should be done late because it gives jump threading +- and few other passes to reduce what is going on. 
*/ +- /* +- In0Lo = In0(D) & 4294967295; +- In0Hi = In0(D) >> 32; +- In1Lo = In1(D) & 4294967295; +- In1Hi = In1(D) >> 32; +- Addc = In0Lo * In1Hi + In0Hi * In1Lo; +- addc32 = Addc << 32; +- ResLo = In0Lo * In1Lo + addc32 +- */ +- (simplify +- (plus (mult @4 @5) +- (lshift +- (plus +- (mult (bit_and@4 SSA_NAME@0 @2) (rshift SSA_NAME@1 @3)) +- (mult (rshift SSA_NAME@0 @3) (bit_and@5 SSA_NAME@1 INTEGER_CST@2))) INTEGER_CST@3)) +- (if (INTEGRAL_TYPE_P (type) && INTEGRAL_TYPE_P (TREE_TYPE (@0)) && types_match (@0, @1) && +- TYPE_PRECISION (type) == 64) +- (with { +- tree i128_type = build_nonstandard_integer_type (128, TYPE_UNSIGNED (type)); +- tree shift = build_int_cst (integer_type_node, 64); +- //direct_internal_fn_supported_p (UMULH, type, OPTIMIZE_FOR_BOTH) +- } +- (mult (convert:type @0) (convert:type @1))) +- ) +- ) +-#endif +- +- + #if GIMPLE + /* Try to match */ + /* + _4 = (int) _3; //NOP_EXPR (SSA_NAME @2) + _5 = _4 ^ c_10; //BIT_XOR_EXPR (SSA_NAME@1, SSA_NAME) +-_6 = _5 & 255; //BIT_XOR_EXPR (SSA_NAME, INTEGER_CST@3) ++_6 = _5 & 255; //BIT_AND_EXPR (SSA_NAME, INTEGER_CST@3) + */ + (match (crc_match_index @1 @2 @3) + (bit_and (bit_xor (nop SSA_NAME@2) SSA_NAME@1) INTEGER_CST@3) + (if (INTEGRAL_TYPE_P (type) && tree_to_uhwi(@3) == 255)) + ) +- + #endif + + #if GIMPLE +@@ -3653,7 +3510,6 @@ c_19 = _7 ^ _8; // BIT_XOR_EXPR (SSA_NAME@3, SSA_NAME) + (bit_xor SSA_NAME@3 (rshift SSA_NAME@1 INTEGER_CST@2)) + (if (INTEGRAL_TYPE_P (type) && tree_to_uhwi(@2) == 8)) + ) +- + #endif + + /* Simplification moved from fold_cond_expr_with_comparison. It may also +diff --git a/gcc/passes.def b/gcc/passes.def +index 7abd946ce..df7d65733 100644 +--- a/gcc/passes.def ++++ b/gcc/passes.def +@@ -92,7 +92,7 @@ along with GCC; see the file COPYING3. If not see + NEXT_PASS (pass_cd_dce); + NEXT_PASS (pass_phiopt, true /* early_p */); + NEXT_PASS (pass_array_widen_compare); +- NEXT_PASS (pass_loop_crc); ++ NEXT_PASS (pass_loop_crc); + NEXT_PASS (pass_tail_recursion); + NEXT_PASS (pass_convert_switch); + NEXT_PASS (pass_cleanup_eh); +diff --git a/gcc/target.def b/gcc/target.def +index 202056411..34d3561bd 100644 +--- a/gcc/target.def ++++ b/gcc/target.def +@@ -2421,6 +2421,20 @@ If @var{code} is out of range the function should return\n\ + @code{error_mark_node}.", + tree, (unsigned code, bool initialize_p), NULL) + ++/* Initialize (if INITIALIZE_P is true) and return the real code of ++ target-specific built-in function . ++ Return NULL if that is not possible. Return error_mark_node if CODE ++ is outside of the range of valid crc32 codes. */ ++DEFHOOK ++(get_crc_builtin_code, ++ "Define this hook to get crc32 builtin code. It should be a function that\n\ ++returns the crc32 builtin function code @var{code}.\n\ ++If there is no such builtin and it cannot be initialized at this time\n\ ++if @var{initialize_p} is true the function should return @code{NULL_TREE}.\n\ ++If @var{code} is out of range the function should return\n\ ++@code{error_mark_node}.", ++ unsigned , (unsigned code, bool initialize_p), NULL) ++ + /* Expand a target-specific builtin. 
*/ + DEFHOOK + (expand_builtin, +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c +deleted file mode 100644 +index 07f9e01ec..000000000 +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c ++++ /dev/null +@@ -1,85 +0,0 @@ +-/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ +-/* { dg-options "-O3 -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details" } */ +- +-#include +-#include +-typedef unsigned long ulg; +-typedef unsigned char uch; +- +-static const ulg crc_32_tab[] = { +- 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, +- 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, +- 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, +- 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, +- 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, +- 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, +- 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, +- 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, +- 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, +- 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, +- 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, +- 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, +- 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, +- 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, +- 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, +- 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, +- 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, +- 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, +- 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, +- 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, +- 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, +- 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, +- 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, +- 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, +- 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, +- 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, +- 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, +- 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, +- 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, +- 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, +- 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, +- 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, +- 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, +- 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, +- 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, +- 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, +- 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, +- 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, +- 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, +- 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, +- 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, +- 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, +- 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, +- 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, +- 0x8f659effL, 
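+-   /* For reference: the constants in this table are the standard
+-      reflected CRC-32 table for polynomial 0xEDB88320.  A minimal
+-      sketch of a generator for one entry (the helper name
+-      crc32_table_entry is hypothetical, not part of this patch):
+-
+-        static unsigned long
+-        crc32_table_entry (unsigned n)
+-        {
+-          unsigned long c = n;
+-          for (int k = 0; k < 8; k++)
+-            c = (c & 1) ? 0xedb88320UL ^ (c >> 1) : (c >> 1);
+-          return c;
+-        }
+-
+-      crc32_table_entry (i) should equal crc_32_tab[i] for i in 0..255.  */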
0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, +- 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, +- 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, +- 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, +- 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, +- 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, +- 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, +- 0x2d02ef8dL +-}; +- +-ulg updcrc(s, n) +- uch *s; /* pointer to bytes to pump through */ +- unsigned n; /* number of bytes in s[] */ +-{ +- register ulg c; /* temporary variable */ +- +- static ulg crc = (ulg)0xffffffffL; /* shift register contents */ +- +- if (s == NULL) { +- c = 0xffffffffL; +- } else { +- c = crc; +- if (n) +- do { +- c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8); +- } while (--n); +- } +- crc = c; +- return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ +-} +-/* { dg-final { scan-tree-dump-times "Processing loop" 1 "loop_crc"} } */ +-/* { dg-final { scan-tree-dump-times "the loop can be optimized" 1 "loop_crc"} } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c.042t.loop_crc b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c.042t.loop_crc +deleted file mode 100644 +index c726059f3..000000000 +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-1.c.042t.loop_crc ++++ /dev/null +@@ -1,90 +0,0 @@ +- +-;; Function updcrc (updcrc, funcdef_no=0, decl_uid=3687, cgraph_uid=1, symbol_order=1) +- +-;; 2 loops found +-;; +-;; Loop 0 +-;; header 0, latch 1 +-;; depth 0, outer -1 +-;; nodes: 0 1 2 3 6 4 7 5 +-;; +-;; Loop 1 +-;; header 4, latch 7 +-;; depth 1, outer 0 +-;; nodes: 4 7 +-;; 2 succs { 5 3 } +-;; 3 succs { 6 5 } +-;; 6 succs { 4 } +-;; 4 succs { 7 5 } +-;; 7 succs { 4 } +-;; 5 succs { 1 } +- +-Starting the loop_crc pass +-====================================== +-Processing loop 1: +-====================================== +-;; +-;; Loop 1 +-;; header 4, latch 7 +-;; depth 1, outer 0 +-;; nodes: 4 7 +- +- +-The 1th loop form is success matched,and the loop can be optimized. 
+-updcrc (uch * s, unsigned int n) +-{ +- static ulg crc = 4294967295; +- register ulg c; +- unsigned char _2; +- long unsigned int _3; +- long unsigned int _4; +- long unsigned int _5; +- long unsigned int _6; +- long unsigned int _7; +- ulg _21; +- +- : +- if (s_12(D) == 0B) +- goto ; [INV] +- else +- goto ; [INV] +- +- : +- c_14 = crc; +- if (n_15(D) != 0) +- goto ; [INV] +- else +- goto ; [INV] +- +- : +- +- : +- # s_8 = PHI +- # n_9 = PHI +- # c_10 = PHI +- s_16 = s_8 + 1; +- _2 = *s_8; +- _3 = (long unsigned int) _2; +- _4 = _3 ^ c_10; +- _5 = _4 & 255; +- _6 = crc_32_tab[_5]; +- _7 = c_10 >> 8; +- c_17 = _6 ^ _7; +- n_18 = n_9 + 4294967295; +- if (n_18 != 0) +- goto ; [INV] +- else +- goto ; [INV] +- +- : +- goto ; [100.00%] +- +- : +- # c_11 = PHI <4294967295(2), c_14(3), c_17(4)> +- crc = c_11; +- _21 = c_11 ^ 4294967295; +- return _21; +- +-} +- +- +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-2.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-2.c +deleted file mode 100644 +index f73c4d550..000000000 +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-2.c ++++ /dev/null +@@ -1,88 +0,0 @@ +-/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ +-/* { dg-options "-O3 -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details" } */ +- +-#include +-#include +-typedef unsigned long ulg; +-typedef unsigned char uch; +- +-static const ulg crc_32_tab[] = { +- 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, +- 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, +- 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, +- 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, +- 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, +- 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, +- 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, +- 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, +- 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, +- 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, +- 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, +- 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, +- 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, +- 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, +- 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, +- 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, +- 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, +- 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, +- 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, +- 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, +- 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, +- 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, +- 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, +- 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, +- 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, +- 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, +- 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, +- 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, +- 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, +- 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, +- 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, +- 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 
0x60b08ed5L, +- 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, +- 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, +- 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, +- 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, +- 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, +- 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, +- 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, +- 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, +- 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, +- 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, +- 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, +- 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, +- 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, +- 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, +- 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, +- 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, +- 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, +- 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, +- 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, +- 0x2d02ef8dL +-}; +- +-ulg updcrc(s, n) +- uch *s; /* pointer to bytes to pump through */ +- unsigned n; /* number of bytes in s[] */ +-{ +- register ulg c; /* temporary variable */ +- +- static ulg crc = (ulg)0xffffffffL; /* shift register contents */ +- +- if (s == NULL) { +- c = 0xffffffffL; +- } else { +- c = crc; +- if (n) +- do { +- c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8); +- for (int i = 0; i < 5; i++) { +- c++; +- } +- +- } while (--n); +- } +- crc = c; +- return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ +-} +-/* { dg-final { scan-tree-dump-times "Wrong loop form for crc matching." 
1 "loop_crc"} } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-4.c.042t.loop_crc b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-4.c.042t.loop_crc +deleted file mode 100644 +index e69de29bb..000000000 +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.c +deleted file mode 100644 +index 71b25f537..000000000 +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.c ++++ /dev/null +@@ -1,156 +0,0 @@ +-/* { dg-do compile { target {{ aarch64*-*-linux* } && lp64 } } } */ +-/* { dg-options "-O3 -mabi=lp64 -floop-crc -fdump-tree-loop_crc-details" } */ +- +-#include +-#include +-typedef unsigned long ulg; +-typedef unsigned char uch; +- +-static const ulg crc_32_tab[] = { +- 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, +- 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, +- 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, +- 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, +- 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, +- 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, +- 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, +- 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, +- 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, +- 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, +- 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, +- 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, +- 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, +- 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, +- 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, +- 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, +- 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, +- 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, +- 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, +- 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, +- 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, +- 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, +- 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, +- 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, +- 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, +- 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, +- 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, +- 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, +- 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, +- 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, +- 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, +- 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, +- 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, +- 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, +- 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, +- 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, +- 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, +- 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, +- 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, +- 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, +- 0x95bf4a82L, 
0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, +- 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, +- 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, +- 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, +- 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, +- 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, +- 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, +- 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, +- 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, +- 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, +- 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, +- 0x2d02ef8dL +-}; +- +-int test[5] = {0}; +- +-ulg updcrc(s, n) +- uch *s; +- unsigned n; +-{ +- register ulg c; +- +- static ulg crc = (ulg)0xffffffffL; +- int a = 0; +- if (s == NULL) { +- c = 0xffffffffL; +- } else { +- c = crc; +- if (n) +- do { +- a++; +- c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8) ; +- } while (--n) ; +- } +- crc = c; +- return c ^ 0xffffffffL*a; +-} +- +-ulg updcrc1(s, n) +- uch *s; +- unsigned n; +-{ +- register ulg c; +- +- static ulg crc = (ulg)0xffffffffL; +- if (s == NULL) { +- c = 0xffffffffL; +- } else { +- c = crc; +- unsigned n_back = n; +- if (n) +- do { +- c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8) ; +- n = n - 2; +- } while (n != 0) ; +- } +- +- crc = c; +- return c ^ 0xffffffffL; +-} +- +-ulg updcrc2(s, n) +- uch *s; +- unsigned n; +-{ +- register ulg c; +- +- static ulg crc = (ulg)0xffffffffL; +- if (s == NULL) { +- c = 0xffffffffL; +- } else { +- c = crc; +- unsigned n_back = n; +- if (n) +- do { +- c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8) + 1; +- } while (--n) ; +- } +- +- crc = c; +- return c ^ 0xffffffffL; +-} +-/* +-ulg updcrc3(s, n) +- uch *s; +- int n; +-{ +- register ulg c; +- +- static ulg crc = (ulg)0xffffffffL; +- if (s == NULL) { +- c = 0xffffffffL; +- } else { +- c = crc; +- do { +- c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8); +- --n; +- } while (n ) ; +- } +- +- crc = c; +- return c ^ 0xffffffffL; +-}*/ +-/* { dg-final { scan-tree-dump-times "num of phi noeds check failed." 1 "loop_crc"} } */ +-/* { dg-final { scan-tree-dump-times "evolution pattern check failed." 1 "loop_crc"} } */ +-/* { dg-final { scan-tree-dump-times "calculation pattern check failed." 1 "loop_crc"} } */ +- +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.c.042t.loop_crc b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.c.042t.loop_crc +deleted file mode 100644 +index 6d52a8684..000000000 +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.c.042t.loop_crc ++++ /dev/null +@@ -1,64 +0,0 @@ +- +-;; Function updcrc3 (updcrc3, funcdef_no=0, decl_uid=3687, cgraph_uid=1, symbol_order=1) +- +-;; 2 loops found +-;; +-;; Loop 0 +-;; header 0, latch 1 +-;; depth 0, outer -1 +-;; nodes: 0 1 2 3 4 5 +-;; +-;; Loop 1 +-;; header 4, latch 4 +-;; depth 1, outer 0 +-;; nodes: 4 +-;; 2 succs { 5 3 } +-;; 3 succs { 4 5 } +-;; 4 succs { 4 } +-;; 5 succs { 1 } +- +-Starting the loop_crc pass +-====================================== +-Processing loop 1: +-====================================== +-;; +-;; Loop 1 +-;; header 4, latch 4 +-;; depth 1, outer 0 +-;; nodes: 4 +- +- +- +-Wrong loop form for crc matching. 
+-updcrc3 (uch * s, unsigned int n) +-{ +- unsigned int n_back; +- static ulg crc = 4294967295; +- register ulg c; +- ulg _22; +- +- : +- if (s_12(D) == 0B) +- goto ; [INV] +- else +- goto ; [INV] +- +- : +- c_14 = crc; +- if (n_15(D) != 0) +- goto ; [INV] +- else +- goto ; [INV] +- +- : +- goto ; [100.00%] +- +- : +- # c_11 = PHI <4294967295(2), c_14(3)> +- crc = c_11; +- _22 = c_11 ^ 4294967295; +- return _22; +- +-} +- +- +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.s b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.s +deleted file mode 100644 +index cae934bfe..000000000 +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-calculation-check-fail.s ++++ /dev/null +@@ -1,329 +0,0 @@ +- .arch armv8-a +- .file "loop-crc-calculation-check-fail.c" +- .text +- .section .rodata +- .align 3 +- .type crc_32_tab, %object +- .size crc_32_tab, 2048 +-crc_32_tab: +- .xword 0 +- .xword 1996959894 +- .xword 3993919788 +- .xword 2567524794 +- .xword 124634137 +- .xword 1886057615 +- .xword 3915621685 +- .xword 2657392035 +- .xword 249268274 +- .xword 2044508324 +- .xword 3772115230 +- .xword 2547177864 +- .xword 162941995 +- .xword 2125561021 +- .xword 3887607047 +- .xword 2428444049 +- .xword 498536548 +- .xword 1789927666 +- .xword 4089016648 +- .xword 2227061214 +- .xword 450548861 +- .xword 1843258603 +- .xword 4107580753 +- .xword 2211677639 +- .xword 325883990 +- .xword 1684777152 +- .xword 4251122042 +- .xword 2321926636 +- .xword 335633487 +- .xword 1661365465 +- .xword 4195302755 +- .xword 2366115317 +- .xword 997073096 +- .xword 1281953886 +- .xword 3579855332 +- .xword 2724688242 +- .xword 1006888145 +- .xword 1258607687 +- .xword 3524101629 +- .xword 2768942443 +- .xword 901097722 +- .xword 1119000684 +- .xword 3686517206 +- .xword 2898065728 +- .xword 853044451 +- .xword 1172266101 +- .xword 3705015759 +- .xword 2882616665 +- .xword 651767980 +- .xword 1373503546 +- .xword 3369554304 +- .xword 3218104598 +- .xword 565507253 +- .xword 1454621731 +- .xword 3485111705 +- .xword 3099436303 +- .xword 671266974 +- .xword 1594198024 +- .xword 3322730930 +- .xword 2970347812 +- .xword 795835527 +- .xword 1483230225 +- .xword 3244367275 +- .xword 3060149565 +- .xword 1994146192 +- .xword 31158534 +- .xword 2563907772 +- .xword 4023717930 +- .xword 1907459465 +- .xword 112637215 +- .xword 2680153253 +- .xword 3904427059 +- .xword 2013776290 +- .xword 251722036 +- .xword 2517215374 +- .xword 3775830040 +- .xword 2137656763 +- .xword 141376813 +- .xword 2439277719 +- .xword 3865271297 +- .xword 1802195444 +- .xword 476864866 +- .xword 2238001368 +- .xword 4066508878 +- .xword 1812370925 +- .xword 453092731 +- .xword 2181625025 +- .xword 4111451223 +- .xword 1706088902 +- .xword 314042704 +- .xword 2344532202 +- .xword 4240017532 +- .xword 1658658271 +- .xword 366619977 +- .xword 2362670323 +- .xword 4224994405 +- .xword 1303535960 +- .xword 984961486 +- .xword 2747007092 +- .xword 3569037538 +- .xword 1256170817 +- .xword 1037604311 +- .xword 2765210733 +- .xword 3554079995 +- .xword 1131014506 +- .xword 879679996 +- .xword 2909243462 +- .xword 3663771856 +- .xword 1141124467 +- .xword 855842277 +- .xword 2852801631 +- .xword 3708648649 +- .xword 1342533948 +- .xword 654459306 +- .xword 3188396048 +- .xword 3373015174 +- .xword 1466479909 +- .xword 544179635 +- .xword 3110523913 +- .xword 3462522015 +- .xword 1591671054 +- .xword 702138776 +- .xword 2966460450 +- .xword 3352799412 +- .xword 1504918807 +- .xword 783551873 +- .xword 3082640443 +- .xword 3233442989 
+- .xword 3988292384 +- .xword 2596254646 +- .xword 62317068 +- .xword 1957810842 +- .xword 3939845945 +- .xword 2647816111 +- .xword 81470997 +- .xword 1943803523 +- .xword 3814918930 +- .xword 2489596804 +- .xword 225274430 +- .xword 2053790376 +- .xword 3826175755 +- .xword 2466906013 +- .xword 167816743 +- .xword 2097651377 +- .xword 4027552580 +- .xword 2265490386 +- .xword 503444072 +- .xword 1762050814 +- .xword 4150417245 +- .xword 2154129355 +- .xword 426522225 +- .xword 1852507879 +- .xword 4275313526 +- .xword 2312317920 +- .xword 282753626 +- .xword 1742555852 +- .xword 4189708143 +- .xword 2394877945 +- .xword 397917763 +- .xword 1622183637 +- .xword 3604390888 +- .xword 2714866558 +- .xword 953729732 +- .xword 1340076626 +- .xword 3518719985 +- .xword 2797360999 +- .xword 1068828381 +- .xword 1219638859 +- .xword 3624741850 +- .xword 2936675148 +- .xword 906185462 +- .xword 1090812512 +- .xword 3747672003 +- .xword 2825379669 +- .xword 829329135 +- .xword 1181335161 +- .xword 3412177804 +- .xword 3160834842 +- .xword 628085408 +- .xword 1382605366 +- .xword 3423369109 +- .xword 3138078467 +- .xword 570562233 +- .xword 1426400815 +- .xword 3317316542 +- .xword 2998733608 +- .xword 733239954 +- .xword 1555261956 +- .xword 3268935591 +- .xword 3050360625 +- .xword 752459403 +- .xword 1541320221 +- .xword 2607071920 +- .xword 3965973030 +- .xword 1969922972 +- .xword 40735498 +- .xword 2617837225 +- .xword 3943577151 +- .xword 1913087877 +- .xword 83908371 +- .xword 2512341634 +- .xword 3803740692 +- .xword 2075208622 +- .xword 213261112 +- .xword 2463272603 +- .xword 3855990285 +- .xword 2094854071 +- .xword 198958881 +- .xword 2262029012 +- .xword 4057260610 +- .xword 1759359992 +- .xword 534414190 +- .xword 2176718541 +- .xword 4139329115 +- .xword 1873836001 +- .xword 414664567 +- .xword 2282248934 +- .xword 4279200368 +- .xword 1711684554 +- .xword 285281116 +- .xword 2405801727 +- .xword 4167216745 +- .xword 1634467795 +- .xword 376229701 +- .xword 2685067896 +- .xword 3608007406 +- .xword 1308918612 +- .xword 956543938 +- .xword 2808555105 +- .xword 3495958263 +- .xword 1231636301 +- .xword 1047427035 +- .xword 2932959818 +- .xword 3654703836 +- .xword 1088359270 +- .xword 936918000 +- .xword 2847714899 +- .xword 3736837829 +- .xword 1202900863 +- .xword 817233897 +- .xword 3183342108 +- .xword 3401237130 +- .xword 1404277552 +- .xword 615818150 +- .xword 3134207493 +- .xword 3453421203 +- .xword 1423857449 +- .xword 601450431 +- .xword 3009837614 +- .xword 3294710456 +- .xword 1567103746 +- .xword 711928724 +- .xword 3020668471 +- .xword 3272380065 +- .xword 1510334235 +- .xword 755167117 +- .text +- .align 2 +- .global updcrc3 +- .type updcrc3, %function +-updcrc3: +-.LFB0: +- .cfi_startproc +- str x19, [sp, -48]! 
+- .cfi_def_cfa_offset 48 +- .cfi_offset 19, -48 +- str x0, [sp, 24] +- str w1, [sp, 20] +- ldr x0, [sp, 24] +- cmp x0, 0 +- bne .L2 +- mov x19, 4294967295 +- b .L3 +-.L2: +- adrp x0, crc.0 +- add x0, x0, :lo12:crc.0 +- ldr x19, [x0] +- ldr w0, [sp, 20] +- str w0, [sp, 44] +- ldr w0, [sp, 20] +- cmp w0, 0 +- beq .L3 +-.L4: +- ldr x0, [sp, 24] +- add x1, x0, 1 +- str x1, [sp, 24] +- ldrb w0, [x0] +- and x0, x0, 255 +- eor x0, x19, x0 +- and x1, x0, 255 +- adrp x0, crc_32_tab +- add x0, x0, :lo12:crc_32_tab +- ldr x1, [x0, x1, lsl 3] +- lsr x0, x19, 8 +- eor x19, x1, x0 +- ldr w0, [sp, 20] +- sub w0, w0, #1 +- str w0, [sp, 20] +- ldr w0, [sp, 20] +- cmp w0, 999 +- bls .L4 +-.L3: +- adrp x0, crc.0 +- add x0, x0, :lo12:crc.0 +- str x19, [x0] +- eor x0, x19, 4294967295 +- ldr x19, [sp], 48 +- .cfi_restore 19 +- .cfi_def_cfa_offset 0 +- ret +- .cfi_endproc +-.LFE0: +- .size updcrc3, .-updcrc3 +- .data +- .align 3 +- .type crc.0, %object +- .size crc.0, 8 +-crc.0: +- .xword 4294967295 +- .ident "GCC: (Kunpeng gcc 10.3.1-2.3.0.b006) 10.3.1" +- .section .note.GNU-stack,"",@progbits +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-3.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-condition-fail.c +similarity index 97% +rename from gcc/testsuite/gcc.dg/tree-ssa/loop-crc-3.c +rename to gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-condition-fail.c +index 70eb1b814..fefa949f9 100644 +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-3.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-condition-fail.c +@@ -74,12 +74,12 @@ ulg updcrc(s, n) + } else { + c = crc; + if (n) +- do { +- c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8); ++ if (n) do { ++ c = crc_32_tab[((int)c ^ (*s++)) & 0xff] ^ (c >> 8); + } while (--n || c != 0) ; + } + crc = c; + exit1: + return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ + } +-/* { dg-final { scan-tree-dump-times "Wrong loop form for crc matching." 1 "loop_crc"} } */ +\ No newline at end of file ++/* { dg-final { scan-tree-dump-times "Wrong loop form for crc matching." 1 "loop_crc"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-4.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail-2.c +similarity index 95% +rename from gcc/testsuite/gcc.dg/tree-ssa/loop-crc-4.c +rename to gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail-2.c +index 1d7e0a319..b37446ec5 100644 +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-4.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail-2.c +@@ -75,8 +75,8 @@ ulg updcrc(s, n) + } else { + c = crc; + if (n) +- do { +- c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8) * test[c%5]; ++ if (n) do { ++ c = crc_32_tab[((int)c ^ (*s++)) & 0xff] ^ (c >> 8); + } while (--n) ; + } + do { +@@ -86,4 +86,5 @@ ulg updcrc(s, n) + crc = c; + return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ + } +-/* { dg-final { scan-tree-dump-times "Table check fail. not only single array is read." 2 "loop_crc"} } */ +\ No newline at end of file ++/* { dg-final { scan-tree-dump-times "Table check fail. not only single array is read." 2 "loop_crc"} } */ ++/* { dg-final { scan-tree-dump-times "Wrong crc table for crc matching." 
1 "loop_crc"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail.c +index b59704e31..3dc500a46 100644 +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail.c +@@ -108,4 +108,5 @@ ulg updcrc1(s, n) + crc = c; + return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ + } +-/* { dg-final { scan-tree-dump-times "Wrong loop form for crc matching." 2 "loop_crc"} } */ +\ No newline at end of file ++/* { dg-final { scan-tree-dump-times "Table check fail. not only single array is read." 1 "loop_crc"} } */ ++/* { dg-final { scan-tree-dump-times "Wrong crc table for crc matching." 2 "loop_crc"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-sucess.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-sucess.c +index e1e16eaf2..8b556efc8 100644 +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-sucess.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-sucess.c +@@ -73,12 +73,11 @@ ulg updcrc(s, n) + c = 0xffffffffL; + } else { + c = crc; +- if (n) +- do { +- c = crc_32_tab[(c ^ (*s++)) & 0xff] ^ (c >> 8); ++ if (n) do { ++ c = crc_32_tab[((int)c ^ (*s++)) & 0xff] ^ (c >> 8); + } while (--n); + } + crc = c; + return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ + } +-/* { dg-final { scan-tree-dump-times "the loop can be optimized" 1 "loop_crc"} } */ +\ No newline at end of file ++/* { dg-final { scan-tree-dump-times "The 1th loop form is successmatched,and the loop can be optimized." 1 "loop_crc"} } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-table-check-fail.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-table-check-fail.c +index f03a4fa82..de21f4553 100644 +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-table-check-fail.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-table-check-fail.c +@@ -110,4 +110,5 @@ ulg updcrc1(s, n) + return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */ + } + /* { dg-final { scan-tree-dump-times "Table check fail. not only single array is read." 2 "loop_crc"} } */ +-/* { dg-final { scan-tree-dump-times "Table check fail. Table not matching." 1 "loop_crc"} } */ +\ No newline at end of file ++/* { dg-final { scan-tree-dump-times "Wrong crc table for crc matching." 3 "loop_crc"} } */ ++/* { dg-final { scan-tree-dump-times "Table check fail. Table not matching." 1 "loop_crc"} } */ +diff --git a/gcc/tree-ssa-loop-crc.c b/gcc/tree-ssa-loop-crc.c +index 4982384c6..8225c2fa5 100644 +--- a/gcc/tree-ssa-loop-crc.c ++++ b/gcc/tree-ssa-loop-crc.c +@@ -1,5 +1,5 @@ +-/* Array widen compare. +- Copyright (C) 2022-2022 Free Software Foundation, Inc. ++/* loop crc. ++ Copyright (C) 2023-2023 Free Software Foundation, Inc. + + This file is part of GCC. + +@@ -42,13 +42,235 @@ along with GCC; see the file COPYING3. 
If not see + #include "print-tree.h" + #include "cfghooks.h" + #include "gimple-fold.h" ++#include "diagnostic-core.h" ++ ++/* This pass handles scenarios similar to the following: ++ulg updcrc(s, n) ++ uch *s; ++ unsigned n; ++{ ++ register ulg c; ++ ++ static ulg crc = (ulg)0xffffffffL; ++ ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ if (n) do { ++ c = crc_32_tab[((int)c ^ (*s++)) & 0xff] ^ (c >> 8); ++ } while (--n); ++ } ++ crc = c; ++ return c ^ 0xffffffffL; ++} ++ ++If the hardware supports the crc instruction, then the pass completes the ++conversion of the above scenario into: ++ ++#define SIZE_U32 sizeof(uint32_t) ++unsigned long updcrc(s, n) ++ unsigned char *s; ++ unsigned n; ++{ ++ register unsigned long c; ++ ++ static unsigned long crc = (unsigned long)0xffffffffL; ++ ++ if (s == NULL) { ++ c = 0xffffffffL; ++ } else { ++ c = crc; ++ if (n) ++ { ++ uint32_t nn = n/SIZE_U32; ++ do{ ++ c = __crc32w(c,*((uint32_t *)s)); ++ s += SIZE_U32; ++ }while(--nn); ++ if (n & sizeof(uint16_t)) { ++ c = __crc32h(c, *((uint16_t *)s)); ++ s += sizeof(uint16_t); ++ } ++ if (n & sizeof(uint8_t)) ++ c = __crc32b(c, *s); ++ } ++ } ++ crc = c; ++ return c ^ 0xffffffffL; ++} ++ ++This pass is to complete the conversion of such scenarios from the internal ++perspective of the compiler: ++1)match_crc_loop:The function completes the screening of such scenarios; ++2)convert_to_new_loop:The function completes the conversion of ++ origin_loop to new loops, and removes origin_loop; ++3)origin_loop_info: The structure is used to record important information ++ of origin_loop: such as loop exit, initial value of induction ++ variable, etc; ++4) create_new_loops: The function is used as the key content of the pass ++ to complete the creation of new loops. */ + +-/* Match.pd function to match the ctz expression. */ + extern bool gimple_crc_match_index (tree, tree *, tree (*)(tree)); + extern bool gimple_crc_match_res (tree, tree *, tree (*)(tree)); + + static gimple *crc_table_read_stmt = NULL; + ++static gphi* phi_s = NULL; ++static gphi* phi_c = NULL; ++static tree nn_tree = NULL; ++ ++enum aarch64_crc_builtins ++{ ++ AARCH64_BUILTIN_CRC32B, ++ AARCH64_BUILTIN_CRC32H, ++ AARCH64_BUILTIN_CRC32W, ++}; ++ ++/* The useful information of origin loop. */ ++struct origin_loop_info ++{ ++ tree limit; /* The limit index of the array in the old loop. */ ++ tree base_n; /* The initial value of the old loop. */ ++ tree base_s; /* The initial value of the old loop. */ ++ tree base_c; /* The initial value of the old loop. */ ++ edge entry_edge; /* The edge into the old loop. */ ++ edge exit_edge; /* The edge outto the old loop. */ ++ basic_block exit_bb; ++}; ++ ++typedef struct origin_loop_info origin_loop_info; ++ ++static origin_loop_info origin_loop; ++hash_map n_map; ++hash_map nn_map; ++hash_map s_map; ++hash_map c_map; ++hash_map crc_map; ++ ++/* Initialize the origin_loop structure. */ ++static void ++init_origin_loop_structure () ++{ ++ origin_loop.entry_edge = NULL; ++ origin_loop.exit_edge = NULL; ++ origin_loop.exit_bb = NULL; ++ origin_loop.limit = NULL; ++ origin_loop.base_n = NULL; ++ origin_loop.base_s = NULL; ++ origin_loop.base_c = NULL; ++} ++ ++/* Get the edge that first entered the loop. */ ++static edge ++get_loop_preheader_edge (class loop *loop) ++{ ++ edge e; ++ edge_iterator ei; ++ ++ FOR_EACH_EDGE (e, ei, loop->header->preds) ++ if (e->src != loop->latch) ++ break; ++ ++ return e; ++} ++ ++/* Returns true if t is SSA_NAME and user variable exists. 
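++   (That is, the SSA name is associated with a source-level variable;
++   anonymous compiler-generated temporaries have no SSA_NAME_VAR.)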
*/ ++ ++static bool ++ssa_name_var_p (tree t) ++{ ++ if (!t || TREE_CODE (t) != SSA_NAME) ++ return false; ++ if (SSA_NAME_VAR (t)) ++ return true; ++ return false; ++} ++ ++/* Returns true if t1 and t2 are SSA_NAME and belong to the same variable. */ ++ ++static bool ++same_ssa_name_var_p (tree t1, tree t2) ++{ ++ if (!ssa_name_var_p (t1) || !ssa_name_var_p (t2)) ++ return false; ++ if (SSA_NAME_VAR (t1) == SSA_NAME_VAR (t2)) ++ return true; ++ return false; ++} ++ ++/* Get origin loop induction variable upper bound. */ ++ ++static bool ++get_iv_upper_bound (gimple *stmt) ++{ ++ if (origin_loop.limit != NULL || origin_loop.base_n != NULL) ++ return false; ++ ++ tree lhs = gimple_cond_lhs (stmt); ++ tree rhs = gimple_cond_rhs (stmt); ++ ++ if (TREE_CODE (TREE_TYPE (lhs)) != INTEGER_TYPE ++ || TREE_CODE (TREE_TYPE (rhs)) != INTEGER_TYPE) ++ return false; ++ ++ /* TODO: Currently, the input restrictions on lhs and rhs are implemented ++ through PARM_DECL. We may consider relax the restrictions later, and ++ we need to consider the overall adaptation scenario and adding test ++ cases. */ ++ if (ssa_name_var_p (lhs) && TREE_CODE (SSA_NAME_VAR (lhs)) == PARM_DECL) ++ { ++ origin_loop.limit = rhs; ++ origin_loop.base_n = lhs; ++ } ++ else ++ return false; ++ ++ if (origin_loop.limit != NULL && origin_loop.base_n != NULL) ++ return true; ++ ++ return false; ++} ++ ++/* Get origin loop info. */ ++static bool ++get_origin_loop_info(class loop *loop) ++{ ++ vec edges; ++ edges = get_loop_exit_edges (loop); ++ origin_loop.exit_edge = edges[0]; ++ origin_loop.exit_bb = origin_loop.exit_edge->dest; ++ origin_loop.entry_edge = get_loop_preheader_edge(loop); ++ origin_loop.base_s = PHI_ARG_DEF_FROM_EDGE(phi_s,origin_loop.entry_edge); ++ origin_loop.base_c = PHI_ARG_DEF_FROM_EDGE(phi_c,origin_loop.entry_edge); ++ ++ basic_block preheader_bb; ++ preheader_bb = origin_loop.entry_edge->src; ++ ++ if(preheader_bb->preds->length() != 1) ++ return false; ++ ++ edge entry_pre_bb_edge; ++ entry_pre_bb_edge = EDGE_PRED (preheader_bb, 0); ++ ++ basic_block pre_preheader_bb; ++ pre_preheader_bb = entry_pre_bb_edge->src; ++ ++ gimple_stmt_iterator gsi; ++ gimple *stmt; ++ bool get_upper_bound = false; ++ for (gsi = gsi_start_bb (pre_preheader_bb); !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ stmt = gsi_stmt (gsi); ++ if (stmt && gimple_code (stmt) == GIMPLE_COND ++ && get_iv_upper_bound (stmt)) { ++ get_upper_bound = true; ++ break; ++ } ++ } ++ ++ return get_upper_bound; ++} + + /* The loop form check will check the entire loop control flow + It should be a loop that: +@@ -102,7 +324,8 @@ only_one_array_read (class loop *loop, tree &crc_table) + if (gimple_code (stmt) == GIMPLE_ASSIGN && + TREE_CODE(gimple_assign_rhs1 (stmt)) == ARRAY_REF) + { +- if (crc_table == NULL) ++ if (crc_table == NULL && ++ gimple_assign_rhs1 (stmt)->base.readonly_flag) + { + crc_table = gimple_assign_rhs1 (stmt); + crc_table_read_stmt = stmt; +@@ -174,15 +397,18 @@ static const unsigned HOST_WIDE_INT crc_32_tab[] = { + static bool + match_crc_table (tree crc_table) + { ++ const unsigned LOW_BOUND = 0; ++ const unsigned UP_BOUND = 255; ++ const unsigned ELEMENT_SIZE = 8; + unsigned HOST_WIDE_INT lb = tree_to_uhwi (array_ref_low_bound (crc_table)); + unsigned HOST_WIDE_INT ub = tree_to_uhwi (array_ref_up_bound (crc_table)); + unsigned HOST_WIDE_INT es = tree_to_uhwi (array_ref_element_size (crc_table)); +- if (lb != 0 || ub != 255 || es != 8) ++ if (lb != LOW_BOUND || ub != UP_BOUND || es != ELEMENT_SIZE) + return false; + + tree decl = 
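++  /* Operand 0 of the ARRAY_REF is the array decl itself; ctor_for_folding
++     below yields its constant initializer for the element-by-element
++     comparison against the reference table.  */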
TREE_OPERAND (crc_table, 0); + tree ctor = ctor_for_folding(decl); +- for (int i = 0; i < 255; i++) { ++ for (int i = lb; i <= ub; i++) { + unsigned HOST_WIDE_INT val = tree_to_uhwi (CONSTRUCTOR_ELT (ctor,i)->value); + if (crc_32_tab[i] != val) + return false; +@@ -273,6 +499,7 @@ check_evolution_pattern (class loop* loop, gphi *capture[]) + if (s != NULL) + return false; + s = capture[i]; ++ phi_s = s; + } + else if (evolution_pattern_plus_with_p(loop, capture[i], 4294967295)) + { +@@ -285,6 +512,7 @@ check_evolution_pattern (class loop* loop, gphi *capture[]) + if (c != NULL) + return false; + c = capture[i]; ++ phi_c = c; + } + } + +@@ -314,14 +542,19 @@ check_calculation_pattern (class loop* loop, gphi *capture[]) + _5 = _4 ^ c_10; //BIT_XOR_EXPR (SSA_NAME, PHI @1) + _6 = _5 & 255; //BIT_XOR_EXPR (SSA_NAME, INTEGER_CST@3) + */ +- + if (!gimple_crc_match_index(index, res_ops, NULL)) + return false; +- gimple *s_res_stmt = SSA_NAME_DEF_STMT(res_ops[1]); +- tree s_res = TREE_OPERAND(gimple_assign_rhs1(s_res_stmt),0); +- if (res_ops[0] != gimple_phi_result (c) || +- s_res != gimple_phi_result (s)) ++ gimple *s_res_stmt = SSA_NAME_DEF_STMT (res_ops[0]); ++ if (!s_res_stmt) ++ return false; ++ gimple *s_def_stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (s_res_stmt)); ++ if (!s_def_stmt) + return false; ++ tree s_res = TREE_OPERAND (gimple_assign_rhs1 (s_def_stmt), 0); ++ if (res_ops[1] != gimple_phi_result (c) || s_res != gimple_phi_result (s)) ++ { ++ return false; ++ } + + /* Try to match + _8 = c_12 >> 8; // RSHIFT_EXPR (SSA_NAME @1, INTEGER_CST @2) +@@ -333,7 +566,11 @@ check_calculation_pattern (class loop* loop, gphi *capture[]) + return false; + if (res_ops[0] != gimple_phi_result (c) + || res_ops[2] != gimple_assign_lhs(crc_table_read_stmt)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\n gimple_crc_match_res pattern check failed.\n"); + return false; ++ } + + return true; + } +@@ -419,101 +656,91 @@ crc_loop_body_check (class loop *loop) + return false; + } + return true; +-/* gphi *phi; +- gphi_iterator gsi; +- int num_of_phi = 0; +- //s, n, c; +- //only 3 phi nodes are there, every one of the phi nodes comming from 2 edge only, one from preheader, one from latch +- // s increase by 1 every itoration +- // n decrease by 1 every itoration +- // The final one is c, which is the result, should be used for the start of the later pattern matching +- for (gsi = gsi_start_phis(loop->header); !gsi_end_p(gsi); gsi_next(&gsi)) +- { +- phi = gsi.phi(); ++} + +- if (phi) num_of_phi++; +- if (num_of_phi > 3) return false; // more then 3 phi node +- if (gimple_phi_num_args(phi) > 2) // more than 2 edges other then one backedge and one preheader edge +- return false; +- //capture[num_of_phi - 1] = gimple_phi_result(phi); +- capture[num_of_phi - 1] = phi; +- } +- if (num_of_phi != 3) return false; // phi node should be 3 */ +- // Find the envolution pattern for s and n, try to match the identity of these variable +-/* gphi *s=NULL; +- gphi *n=NULL; +- gphi *c=NULL; ++/* Check the prev_bb of prev_bb of loop header. 
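++   (Note: prev_bb follows basic-block layout order, not CFG edges, so this
++   assumes the guard blocks are laid out immediately before the header.)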
The prev_bb we are trying to match is + +- for (int i = 0; i < 3; i++) +- { +- if (evolution_pattern_plus_with_p(loop, capture[i], 1)) +- { +- if(s != NULL) +- return false; +- s = capture[i]; +- } +- else if (evolution_pattern_plus_with_p(loop, capture[i], 4294967295)) +- { +- if(n != NULL) +- return false; +- n = capture[i]; +- } +- else +- { +- if(c != NULL) +- return false; +- c = capture[i]; +- } +- } ++c_15 = crc; ++if (n_16(D) != 0) ++ goto ; [INV] ++else ++ goto ; [INV] + +- // some envolution pattern cannot find +- if (!n || !s || !c) +- return false; +- gphi *s=capture[0]; +- gphi *n=capture[1]; +- gphi *c=capture[2]; +- tree res_ops[3]; +- tree index = TREE_OPERAND (gimple_assign_rhs1 (crc_table_read_stmt), 1); ++ In this case , we must be sure that the n is not zero. ++ so the match condition is ++ 1、the n is not zero. + +- /* Try to match +- _1 = (int) c_12; //NOP_EXPR (SSA_NAME @1) +- _4 = (int) _3; //NOP_EXPR (SSA_NAME @2) +- _5 = _1 ^ _4; //BIT_XOR_EXPR (SSA_NAME, SSA_NAME) +- _6 = _5 & 255; //BIT_XOR_EXPR (SSA_NAME, INTEGER_CST@3) ++ : ++if (s_13(D) == 0B) ++ goto ; [INV] ++else ++ goto ; [INV] + +- +- if (!gimple_crc_match_index(index, res_ops, NULL)) ++ In this case, we must be sure the s is not NULL. ++ so the match condition is ++ 1、the s is not NULL. ++*/ ++static bool ++crc_prev_bb_of_loop_header_check(class loop *loop) ++{ ++ basic_block header = loop->header; ++ basic_block prev_header_bb = header->prev_bb; ++ if(NULL == prev_header_bb) ++ { + return false; +- gimple *s_res_stmt = SSA_NAME_DEF_STMT(res_ops[1]); +- tree s_res = TREE_OPERAND(gimple_assign_rhs1(s_res_stmt),0); +- if (res_ops[0] != gimple_phi_result (c) || +- s_res != gimple_phi_result (s)) ++ } ++ ++ basic_block prev_prev_header_bb = prev_header_bb->prev_bb; ++ if(NULL == prev_prev_header_bb) ++ { + return false; ++ } ++ ++ gimple_stmt_iterator gsi; ++ gimple *stmt; ++ bool res = false; ++ for (gsi = gsi_start_bb (prev_prev_header_bb); !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ stmt = gsi_stmt (gsi); ++ if (stmt == NULL) ++ return false; + +- /* +-_8 = c_12 >> 8; // RSHIFT_EXPR (SSA_NAME @1, INTEGER_CST @2) +-c_19 = _7 ^ _8; // BIT_XOR_EXPR (SSA_NAME@3, SSA_NAME) ++ if (gimple_code (stmt) == GIMPLE_COND && ++ gimple_cond_code(stmt) == NE_EXPR && ++ TREE_CODE(gimple_cond_rhs (stmt)) == INTEGER_CST && ++ tree_int_cst_sgn(gimple_cond_rhs (stmt)) == 0 ) ++ { ++ res = true; ++ break; ++ } ++ } + +- edge backedge = find_edge(loop->latch, loop->header); +- tree updated_c = PHI_ARG_DEF_FROM_EDGE (c, backedge); +- if (!gimple_crc_match_res(updated_c, res_ops, NULL)) +- return false; +- if (res_ops[0] != gimple_phi_result (c) +- || res_ops[2] != gimple_assign_lhs(crc_table_read_stmt)) ++ if(!res) ++ { + return false; ++ } + +- // try match n as the induction variable +- // The proceed condition for back edge is n != 0 +- gimple *cond_stmt = gsi_stmt (gsi_last_bb (loop->header)); +- if (!cond_stmt || gimple_code (cond_stmt) != GIMPLE_COND || gimple_cond_code (cond_stmt) != NE_EXPR +- || gimple_cond_lhs (cond_stmt) != PHI_ARG_DEF_FROM_EDGE (n, backedge) +- || tree_to_uhwi(gimple_cond_rhs (cond_stmt)) != 0) ++ basic_block first_bb = prev_prev_header_bb->prev_bb; ++ if(NULL == first_bb) + return false; +- +- return true; +- */ +-} + ++ for (gsi = gsi_start_bb (first_bb); !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ stmt = gsi_stmt (gsi); ++ if (stmt == NULL) ++ return false; ++ ++ if (gimple_code (stmt) == GIMPLE_COND && ++ gimple_cond_code(stmt) == EQ_EXPR && ++ TREE_CODE(gimple_cond_rhs (stmt)) == INTEGER_CST && ++ 
tree_int_cst_sgn(gimple_cond_rhs (stmt)) == 0 )
++	{
++	  return true;
++	}
++    }
++
++  return false;
++}
+
+ static bool
+ match_crc_loop (class loop *loop)
+@@ -536,13 +763,463 @@
+ 	fprintf (dump_file, "\nWrong loop body for crc matching.\n");
+       return false;
+     }
++  if(!crc_prev_bb_of_loop_header_check(loop))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "\nWrong prev basic_blocks of loop header for crc matching.\n");
++      return false;
++    }
++
++  init_origin_loop_structure();
++  if(!get_origin_loop_info(loop))
++    return false;
++
+   return true;
+ }
+
++static void
++create_new_bb (basic_block &new_bb, basic_block after_bb,
++	       basic_block dominator_bb, class loop *outer)
++{
++  new_bb = create_empty_bb (after_bb);
++  add_bb_to_loop (new_bb, outer);
++  set_immediate_dominator (CDI_DOMINATORS, new_bb, dominator_bb);
++}
++
++static void
++change_preheader_bb(edge entry_edge)
++{
++  gimple_seq stmts = NULL;
++  gimple_stmt_iterator gsi;
++  gimple* g;
++  tree lhs1;
++
++  lhs1 = create_tmp_var(TREE_TYPE(origin_loop.base_n),"nn");
++  lhs1 = make_ssa_name(lhs1);
++  gsi = gsi_last_bb (entry_edge->src);
++  g = gimple_build_assign(lhs1,RSHIFT_EXPR,origin_loop.base_n,
++			  build_int_cst (TREE_TYPE (origin_loop.base_n), 2));
++  gimple_seq_add_stmt(&stmts,g);
++  gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
++  nn_tree = lhs1;
++  set_current_def(nn_tree, lhs1);
++  nn_map.put (entry_edge->src, lhs1);
++}
++
++static gphi*
++create_phi_node_for_bb(tree old_name, basic_block bb)
++{
++  gphi *phi = create_phi_node(NULL_TREE, bb);
++  create_new_def_for(old_name, phi, gimple_phi_result_ptr(phi));
++  return phi;
++}
++
++static gimple*
++call_builtin_fun(int code,tree& lhs, tree arg1, tree arg2)
++{
++  unsigned int builtin_code = targetm.get_crc_builtin_code(code, true); // map CODE to the correct target builtin code
++  tree fn = targetm.builtin_decl(builtin_code,true); // get the decl of __builtin_aarch64_crc32w
++  if (!fn || fn == error_mark_node)
++    fatal_error (input_location,
++		 "target specific builtin not available");
++  gimple* call_builtin = gimple_build_call(fn, 2, arg1, arg2); // _40 = __builtin_aarch64_crc32* (_1, _2);
++  lhs = make_ssa_name (unsigned_type_node);
++  gimple_call_set_lhs(call_builtin,lhs);
++
++  return call_builtin;
++}
++
++/* Create loop_header and loop_latch for new loop
++   :
++   # s_14 = PHI
++   # c_16 = PHI
++   # nn_19 = PHI
++   _1 = (unsigned int) c_16;
++   _2 = MEM[(uint32_t *)s_14];
++   _40 = __builtin_aarch64_crc32w (_1, _2);
++   c_29 = (long unsigned int) _40;
++   s_30 = s_14 + 4;
++   nn_31 = nn_19 + 4294967295;
++   if (nn_31 != 0)
++   The IR of bb is as above.
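++   Note: nn_19 counts the remaining 4-byte words (the preheader computes
++   nn = n >> 2), and adding 4294967295 is an unsigned decrement by one.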
*/ ++static void ++create_loop_bb(basic_block& loop_bb, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer, edge entry_edge) ++{ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gimple* g; ++ gphi* phi_s_loop; ++ gphi* phi_c_loop; ++ gphi* phi_nn_loop; ++ ++ create_new_bb(loop_bb, after_bb, dominator_bb, outer); ++ redirect_edge_and_branch(entry_edge, loop_bb); ++ gsi = gsi_last_bb(loop_bb); ++ tree entry_nn = get_current_def(nn_tree); ++ phi_s_loop = create_phi_node_for_bb(origin_loop.base_s, loop_bb); ++ phi_c_loop = create_phi_node_for_bb(origin_loop.base_c, loop_bb); ++ phi_nn_loop = create_phi_node_for_bb(entry_nn, loop_bb); ++ ++ tree res_s = gimple_phi_result(phi_s_loop); ++ tree res_nn = gimple_phi_result(phi_nn_loop); ++ tree lhs1 = gimple_build(&stmts, NOP_EXPR, unsigned_type_node, ++ gimple_phi_result(phi_c_loop)); ++ g = gimple_build_assign(make_ssa_name(unsigned_type_node), ++ fold_build2(MEM_REF,unsigned_type_node,res_s, ++ build_int_cst (build_pointer_type (unsigned_type_node), 0))); ++ gimple_seq_add_stmt(&stmts, g); ++ tree lhs2 = gimple_assign_lhs(g); // _2 = MEM[(uint32_t *)s_14]; ++ unsigned int code = AARCH64_BUILTIN_CRC32W; ++ tree lhs3; ++ gimple* build_crc32w = call_builtin_fun(code,lhs3, lhs1, lhs2); ++ crc_map.put(loop_bb, lhs3); ++ gimple_seq_add_stmt(&stmts,build_crc32w); ++ ++ tree lhs4 = copy_ssa_name(origin_loop.base_c); ++ g = gimple_build_assign(lhs4, NOP_EXPR, lhs3); ++ gimple_seq_add_stmt(&stmts, g); ++ c_map.put(loop_bb, lhs4); ++ ++ tree lhs5 = copy_ssa_name(origin_loop.base_s); ++ g = gimple_build_assign(lhs5, POINTER_PLUS_EXPR, res_s, ++ build_int_cst (sizetype, 4)); ++ gimple_seq_add_stmt(&stmts, g); ++ s_map.put(loop_bb, lhs5); ++ ++ tree lhs6 = copy_ssa_name(nn_tree); ++ g = gimple_build_assign(lhs6, PLUS_EXPR, res_nn, ++ build_int_cst (TREE_TYPE (res_nn), 4294967295)); ++ gimple_seq_add_stmt(&stmts,g); ++ nn_map.put(loop_bb, lhs6); ++ ++ gcond* cond_stmt = gimple_build_cond (NE_EXPR, lhs6, origin_loop.limit, ++ NULL_TREE, NULL_TREE); ++ gimple_seq_add_stmt (&stmts, cond_stmt); ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++} ++ ++/* : ++ # c_6 = PHI ++ # s_46 = PHI ++ _44 = n_26(D) & 2; ++ if (_44 != 0) ++ The IR of bb is as above. 
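++   Here _44 = n & 2 tests whether a 2-byte tail is left for the
++   crc32h step after the word loop.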
*/ ++static void ++create_cond_bb(basic_block& cond_bb, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer){ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gphi* phi_s_loop; ++ gphi* phi_c_loop; ++ ++ create_new_bb(cond_bb, after_bb, dominator_bb, outer); ++ gsi = gsi_last_bb(cond_bb); ++ tree entry_nn = get_current_def(nn_tree); ++ phi_s_loop = create_phi_node_for_bb(origin_loop.base_s, cond_bb); ++ phi_c_loop = create_phi_node_for_bb(origin_loop.base_c, cond_bb); ++ tree res_s = gimple_phi_result(phi_s_loop); ++ set_current_def(origin_loop.base_s, res_s); ++ s_map.put(cond_bb, res_s); ++ tree res_c = gimple_phi_result(phi_c_loop); ++ set_current_def(origin_loop.base_c, res_c); ++ c_map.put(cond_bb, res_c); ++ ++ tree lhs1 = gimple_build(&stmts, BIT_AND_EXPR, TREE_TYPE(origin_loop.base_n), ++ origin_loop.base_n, build_int_cst (TREE_TYPE (origin_loop.base_n), 2)); ++ gcond* cond_stmt = gimple_build_cond (NE_EXPR, lhs1, origin_loop.limit, ++ NULL_TREE, NULL_TREE); ++ gimple_seq_add_stmt (&stmts, cond_stmt); ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++} ++ ++/* : ++ _7 = MEM[(uint16_t *)s_46]; ++ _41 = __builtin_aarch64_crc32h (_8, _7); ++ c_33 = (long unsigned int) _41; ++ s_34 = s_30 + 2; ++ The IR of bb is as above.*/ ++static void ++create_cond_true_bb(basic_block& cond_true_bb, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer){ ++ gimple_seq stmts = NULL; ++ gimple* g; ++ gimple_stmt_iterator gsi; ++ ++ create_new_bb(cond_true_bb, after_bb, dominator_bb, outer); ++ gsi = gsi_last_bb(cond_true_bb); ++ tree s_46 = *(s_map.get(after_bb)); ++ g = gimple_build_assign(make_ssa_name(short_unsigned_type_node), ++ fold_build2(MEM_REF,short_unsigned_type_node,s_46, ++ build_int_cst (build_pointer_type (short_unsigned_type_node), 0))); ++ gimple_seq_add_stmt(&stmts,g); ++ tree lhs1 = gimple_assign_lhs(g); // _7 = MEM[(uint16_t *)s_46]; ++ unsigned int code = AARCH64_BUILTIN_CRC32H; ++ tree lhs2; ++ gimple* call_builtin = call_builtin_fun(code, lhs2,*(crc_map.get(cond_true_bb->prev_bb->prev_bb)),lhs1); ++ crc_map.put(cond_true_bb,lhs2); ++ gimple_seq_add_stmt(&stmts, call_builtin); ++ ++ tree lhs3 = copy_ssa_name(origin_loop.base_c); ++ g = gimple_build_assign(lhs3, NOP_EXPR, lhs2); ++ gimple_seq_add_stmt(&stmts, g); ++ c_map.put(cond_true_bb, lhs3); ++ ++ tree lhs5 = copy_ssa_name(s_46); ++ g = gimple_build_assign(lhs5, POINTER_PLUS_EXPR, s_46, ++ build_int_cst (sizetype, 2)); // s_30 + 2; ++ gimple_seq_add_stmt(&stmts, g); ++ s_map.put(cond_true_bb, lhs5); ++ ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++ s_map.put(cond_true_bb, lhs5); ++} ++ ++/* : ++ # s_15 = PHI ++ # c_17 = PHI ++ _3 = n_26(D) & 1; ++ if (_3 != 0) ++ The IR of bb is as above.*/ ++static void ++create_cond_false_bb(basic_block& cond_false_bb, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer) ++{ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gphi* phi_s_cond_true_bb; ++ gphi* phi_c_cond_true_bb; ++ ++ create_new_bb(cond_false_bb, after_bb, dominator_bb, outer); ++ make_single_succ_edge(after_bb, cond_false_bb, EDGE_FALLTHRU); ++ ++ tree entry_s = get_current_def(origin_loop.base_s); ++ phi_s_cond_true_bb = create_phi_node_for_bb(entry_s, cond_false_bb); ++ tree entry_c = get_current_def(origin_loop.base_c); ++ phi_c_cond_true_bb = create_phi_node_for_bb(entry_c, cond_false_bb); ++ tree res_s = gimple_phi_result(phi_s_cond_true_bb); ++ set_current_def(origin_loop.base_s, res_s); ++ s_map.put(cond_false_bb, res_s); ++ tree 
res_c = gimple_phi_result(phi_c_cond_true_bb); ++ set_current_def(origin_loop.base_c, res_c); ++ c_map.put(cond_false_bb, res_c); ++ ++ gsi = gsi_last_bb(cond_false_bb); ++ tree lhs1 = gimple_build(&stmts, BIT_AND_EXPR, TREE_TYPE(origin_loop.base_n), ++ origin_loop.base_n, build_int_cst (TREE_TYPE (origin_loop.base_n), 1)); ++ gcond* cond_stmt = gimple_build_cond (NE_EXPR, lhs1, origin_loop.limit, ++ NULL_TREE, NULL_TREE); ++ gimple_seq_add_stmt (&stmts, cond_stmt); ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++} ++ ++/* : ++ _11 = (unsigned int) c_17; ++ _12 = *s_15; ++ _42 = __builtin_aarch64_crc32b (_11, _12); ++ c_36 = (long unsigned int) _42; ++ The IR of bb is as above. */ ++static void ++create_lastcond_true_bb(basic_block& new_bb, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer){ ++ gimple_seq stmts = NULL; ++ gimple_stmt_iterator gsi; ++ gimple* g; ++ ++ create_new_bb(new_bb, after_bb, dominator_bb, outer); ++ gsi = gsi_last_bb(new_bb); ++ ++ tree lhs1 = gimple_build(&stmts, NOP_EXPR, unsigned_type_node, ++ get_current_def(origin_loop.base_c)); ++ tree lhs2; ++ tree s_15 = get_current_def(origin_loop.base_s); ++ g = gimple_build_assign (make_ssa_name (unsigned_char_type_node), ++ fold_build2 (MEM_REF, unsigned_char_type_node, s_15, ++ build_int_cst (TREE_TYPE(s_15), 0))); ++ gimple_seq_add_stmt (&stmts, g); ++ lhs2 = gimple_assign_lhs (g); ++ ++ unsigned int code = AARCH64_BUILTIN_CRC32B; ++ tree lhs3; ++ gimple* call_builtin = call_builtin_fun(code, lhs3, lhs1, lhs2); ++ crc_map.put(new_bb,lhs3); ++ gimple_seq_add_stmt(&stmts,call_builtin); ++ ++ tree lhs4 = copy_ssa_name(origin_loop.base_c); ++ g = gimple_build_assign(lhs4, NOP_EXPR, lhs3); ++ gimple_seq_add_stmt(&stmts, g); ++ c_map.put(new_bb, lhs4); ++ ++ gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); ++} ++ ++static bool ++optional_add_phi_arg(gphi * phi, tree phi_res, tree phi_arg, edge e) ++{ ++ location_t loc; ++ if (same_ssa_name_var_p (phi_arg, phi_res)) ++ { ++ if (virtual_operand_p (phi_arg)) ++ loc = UNKNOWN_LOCATION; ++ else ++ loc = gimple_location (SSA_NAME_DEF_STMT (phi_arg)); ++ add_phi_arg (phi, phi_arg, e, loc); ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Add phi_arg for bb with phi node. 
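++   For every incoming edge that still lacks an argument we look up the
++   value recorded for the predecessor block in c_map/nn_map/s_map and
++   fall back to the original loop's base value when there is no entry.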
*/ ++static void ++update_phi_nodes (basic_block bb) ++{ ++ edge e; ++ edge_iterator ei; ++ gphi *phi; ++ gphi_iterator gsi; ++ tree res; ++ ++ for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ phi = gsi.phi (); ++ res = gimple_phi_result (phi); ++ ++ FOR_EACH_EDGE (e, ei, bb->preds) ++ { ++ if (PHI_ARG_DEF_FROM_EDGE (phi, e)) ++ continue; ++ tree var_c; ++ tree* ptr_var_c = c_map.get (e->src); ++ if(ptr_var_c == NULL) ++ { ++ var_c = origin_loop.base_c; ++ } else { ++ var_c = *ptr_var_c; ++ } ++ if(optional_add_phi_arg(phi, res, var_c, e)) ++ continue; ++ ++ tree var_nn; ++ tree* ptr_var_nn = nn_map.get (e->src); ++ if(ptr_var_nn == NULL) ++ { ++ var_nn = nn_tree; ++ } else { ++ var_nn = *ptr_var_nn; ++ } ++ if(optional_add_phi_arg(phi, res, var_nn, e)) ++ continue; ++ ++ tree var_s; ++ tree* ptr_var_s = s_map.get (e->src); ++ if(ptr_var_s == NULL) ++ { ++ var_s = origin_loop.base_s; ++ } else { ++ var_s = *ptr_var_s; ++ } ++ if(optional_add_phi_arg(phi, res, var_s, e)) ++ continue; ++ } ++ } ++} ++ ++static void ++create_new_loops(edge entry_edge) ++{ ++ class loop* new_loop = NULL; ++ basic_block loop_bb, cond_bb, cond_true_bb, cond_false_bb, lastcond_true_bb; ++ class loop *outer = entry_edge->src->loop_father; ++ change_preheader_bb(entry_edge); ++ ++ create_loop_bb(loop_bb, entry_edge->src, entry_edge->src, outer, entry_edge); ++ create_cond_bb(cond_bb, loop_bb, loop_bb, outer); ++ make_edge(loop_bb, loop_bb, EDGE_TRUE_VALUE); ++ make_edge(loop_bb, cond_bb, EDGE_FALSE_VALUE); ++ update_phi_nodes(loop_bb); ++ ++ new_loop = alloc_loop (); ++ new_loop->header = loop_bb; ++ new_loop->latch = loop_bb; ++ add_loop (new_loop, outer); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nPrint byte new loop %d:\n", new_loop->num); ++ flow_loop_dump (new_loop, dump_file, NULL, 1); ++ fprintf (dump_file, "\n\n"); ++ } ++ ++ create_cond_true_bb(cond_true_bb, cond_bb, cond_bb, outer); ++ make_edge(cond_bb, cond_true_bb, EDGE_TRUE_VALUE); ++ create_cond_false_bb(cond_false_bb, cond_true_bb, cond_bb, outer); ++ make_edge(cond_bb, cond_false_bb, EDGE_FALSE_VALUE); ++ update_phi_nodes(cond_bb); ++ update_phi_nodes(cond_false_bb); ++ create_lastcond_true_bb(lastcond_true_bb, cond_false_bb, cond_false_bb, outer); ++ make_edge(cond_false_bb, lastcond_true_bb, EDGE_TRUE_VALUE); ++ make_edge(cond_false_bb, origin_loop.exit_bb, EDGE_FALSE_VALUE); ++ make_single_succ_edge(lastcond_true_bb, origin_loop.exit_bb, EDGE_FALLTHRU); ++ ++ update_phi_nodes(origin_loop.exit_bb); ++ remove_edge(origin_loop.exit_edge); ++} ++ ++/* Clear information about the original loop. */ ++static void ++remove_origin_loop(class loop* loop) ++{ ++ basic_block* body = get_loop_body_in_dom_order(loop); ++ unsigned n = loop->num_nodes; ++ for(int i = 0; i < n; ++i) ++ { ++ delete_basic_block(body[i]); ++ } ++ free(body); ++ delete_loop(loop); ++} ++ ++/* Make sure that the dominance relationship of the newly inserted cfg ++ is not missing. */ ++static void ++update_loop_dominator(cdi_direction dir) ++{ ++ gcc_assert (dom_info_available_p (dir)); ++ ++ basic_block bb; ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ basic_block imm_bb = get_immediate_dominator (dir, bb); ++ if (!imm_bb || bb == origin_loop.exit_bb) ++ { ++ set_immediate_dominator (CDI_DOMINATORS, bb, ++ recompute_dominator (CDI_DOMINATORS, bb)); ++ continue; ++ } ++ } ++} ++ ++/* Perform the conversion of origin_loop to new_loop. 
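++   The steps are: build the new CFG starting from the entry edge, delete
++   the original loop body, recompute the immediate dominators invalidated
++   by the surgery, and finally rewrite the result into valid SSA form.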
*/
++static void
++convert_to_new_loop (class loop *loop)
++{
++  create_new_loops (origin_loop.entry_edge);
++  remove_origin_loop (loop);
++  update_loop_dominator (CDI_DOMINATORS);
++  update_ssa (TODO_update_ssa);
++}
++
+ /* The main entry of loop crc optimizes. */
+ static unsigned int
+ tree_ssa_loop_crc ()
+ {
++  if (TARGET_CRC32 == false)
++    {
++      warning (OPT____, "The loop-crc optimization is not working. "\
++	       "You should make sure that the specified architecture supports"\
++	       " crc: -march=armv8.1-a");
++      return 0;
++    }
+   unsigned int todo = 0;
+   class loop *loop;
+
+@@ -553,28 +1230,28 @@ tree_ssa_loop_crc ()
+     }
+
+   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
++  {
++    if (dump_file && (dump_flags & TDF_DETAILS))
++      {
++	fprintf (dump_file, "======================================\n");
++	fprintf (dump_file, "Processing loop %d:\n", loop->num);
++	fprintf (dump_file, "======================================\n");
++	flow_loop_dump (loop, dump_file, NULL, 1);
++	fprintf (dump_file, "\n\n");
++      }
++
++    if (match_crc_loop (loop))
+     {
+       if (dump_file && (dump_flags & TDF_DETAILS))
+-	{
+-	  fprintf (dump_file, "======================================\n");
+-	  fprintf (dump_file, "Processing loop %d:\n", loop->num);
+-	  fprintf (dump_file, "======================================\n");
+-	  flow_loop_dump (loop, dump_file, NULL, 1);
+-	  fprintf (dump_file, "\n\n");
+-	}
+-
+-      if (match_crc_loop (loop))
+-	{
+-	  if (dump_file && (dump_flags & TDF_DETAILS))
+-	    {
+-	      fprintf (dump_file, "The %dth loop form is success matched,"
+-		       "and the loop can be optimized.\n",
+-		       loop->num);
+-	    }
+-
+-	  convert_to_new_loop (loop);
+-	}
++      {
++	fprintf (dump_file, "The %dth loop form is successfully matched, "
++		 "and the loop can be optimized.\n",
++		 loop->num);
++      }
++
++      convert_to_new_loop (loop);
+     }
++  }
+
+   todo |= (TODO_update_ssa);
+   return todo;
+@@ -641,4 +1318,4 @@ gimple_opt_pass *
+ make_pass_loop_crc (gcc::context *ctxt)
+ {
+   return new pass_loop_crc (ctxt);
+-}
+\ No newline at end of file
++}
+--
+2.33.0
+
diff --git a/0147-add-insn-defs-and-correct-costs-for-cmlt-generation.patch b/0147-add-insn-defs-and-correct-costs-for-cmlt-generation.patch
new file mode 100644
index 0000000..b03ad48
--- /dev/null
+++ b/0147-add-insn-defs-and-correct-costs-for-cmlt-generation.patch
@@ -0,0 +1,194 @@
+From 80b7de670da46d8921118799904cba4a0753bb72 Mon Sep 17 00:00:00 2001
+From: Diachkov Ilia WX1215920
+Date: Wed, 23 Aug 2023 15:03:00 +0300
+Subject: [PATCH 09/13] add insn defs and correct costs for cmlt generation
+
+---
+ gcc/config/aarch64/aarch64-simd.md  | 48 +++++++++++++++++++++++++++++
+ gcc/config/aarch64/aarch64.c        | 15 +++++++++
+ gcc/config/aarch64/aarch64.opt      |  4 +++
+ gcc/config/aarch64/iterators.md     |  3 +-
+ gcc/config/aarch64/predicates.md    | 25 +++++++++++++++
+ gcc/testsuite/gcc.dg/combine-cmlt.c | 20 ++++++++++++
+ 6 files changed, 114 insertions(+), 1 deletion(-)
+ create mode 100755 gcc/testsuite/gcc.dg/combine-cmlt.c
+
+diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
+index 6049adc3f..f4213fd62 100644
+--- a/gcc/config/aarch64/aarch64-simd.md
++++ b/gcc/config/aarch64/aarch64-simd.md
+@@ -4719,6 +4719,54 @@
+   [(set_attr "type" "neon_compare, neon_compare_zero")]
+ )
+
++;; Use cmlt to replace vector arithmetic operations like this (SImode example):
++;; B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001)
++;; TODO: maybe extend to scalar operations or other cm** instructions.
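++;; For each 16-bit lane the masked value is the lane's sign bit moved to
++;; the lane base, so the shift/subtract pair broadcasts it: with only the
++;; low lane negative, x = 0x00000001 and B = 0x00010000 - 0x00000001 =
++;; 0x0000ffff, which is exactly the all-ones mask cmlt #0 produces.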
++
++(define_insn "*aarch64_cmlt_as_arith"
++  [(set (match_operand:VDQHSD 0 "register_operand" "=w")
++	(minus:VDQHSD
++	  (ashift:VDQHSD
++	    (and:VDQHSD
++	      (lshiftrt:VDQHSD
++		(match_operand:VDQHSD 1 "register_operand" "w")
++		(match_operand:VDQHSD 2 "half_size_minus_one_operand"))
++	      (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))
++	    (match_operand:VDQHSD 4 "half_size_operand"))
++	  (and:VDQHSD
++	    (lshiftrt:VDQHSD
++	      (match_dup 1)
++	      (match_dup 2))
++	    (match_dup 3))))]
++  "TARGET_SIMD && flag_cmlt_arith"
++  "cmlt\t%0.<V2ntype>, %1.<V2ntype>, #0"
++  [(set_attr "type" "neon_compare_zero")]
++)
++
++;; The helper definition that allows combiner to use the previous pattern.
++
++(define_insn_and_split "*aarch64_cmlt_tmp"
++  [(set (match_operand:VDQHSD 0 "register_operand" "=w")
++	(and:VDQHSD
++	  (lshiftrt:VDQHSD
++	    (match_operand:VDQHSD 1 "register_operand" "w")
++	    (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
++	  (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
++  "TARGET_SIMD && flag_cmlt_arith"
++  "#"
++  "&& reload_completed"
++  [(set (match_operand:VDQHSD 0 "register_operand")
++	(lshiftrt:VDQHSD
++	  (match_operand:VDQHSD 1 "register_operand")
++	  (match_operand:VDQHSD 2 "half_size_minus_one_operand")))
++   (set (match_dup 0)
++	(and:VDQHSD
++	  (match_dup 0)
++	  (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
++  ""
++  [(set_attr "type" "neon_compare_zero")]
++)
++
+ (define_insn_and_split "aarch64_cm<optab>di"
+   [(set (match_operand:DI 0 "register_operand" "=w,w,r")
+	(neg:DI
+diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
+index cbdde11b0..7a00a0817 100644
+--- a/gcc/config/aarch64/aarch64.c
++++ b/gcc/config/aarch64/aarch64.c
+@@ -12659,6 +12659,21 @@ cost_minus:
+	  return true;
+	}
+
++      /* Detect aarch64_cmlt_as_arith instruction. Now only this pattern
++	 matches the condition. The costs of cmlt and sub instructions
++	 are comparable, so we are not increasing the cost here. */
++      if (flag_cmlt_arith && GET_CODE (op0) == ASHIFT
++	  && GET_CODE (op1) == AND)
++	{
++	  rtx op0_subop0 = XEXP (op0, 0);
++	  if (rtx_equal_p (op0_subop0, op1))
++	    {
++	      rtx lshrt_op = XEXP (op0_subop0, 0);
++	      if (GET_CODE (lshrt_op) == LSHIFTRT)
++		return true;
++	    }
++	}
++
+       /* Look for SUB (extended register). */
+       if (is_a <scalar_int_mode> (mode, &int_mode)
+	  && aarch64_rtx_arith_op_extract_p (op1, int_mode))
+diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
+index bb888461a..c42494036 100644
+--- a/gcc/config/aarch64/aarch64.opt
++++ b/gcc/config/aarch64/aarch64.opt
+@@ -273,6 +273,10 @@ Use an immediate to offset from the stack protector guard register, sp_el0.
+ This option is for use with fstack-protector-strong and not for use in
+ user-land code.
+
++mcmlt-arith
++Target Report Var(flag_cmlt_arith) Optimization Init(0)
++Use SIMD cmlt instruction to perform some arithmetic/logic calculations.
++
+ TargetVariable
+ long aarch64_stack_protector_guard_offset = 0
+
+diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
+index 0a7145281..d3be06c6f 100644
+--- a/gcc/config/aarch64/iterators.md
++++ b/gcc/config/aarch64/iterators.md
+@@ -1228,7 +1228,8 @@
+			   (V2DI "2s")])
+
+ ;; Register suffix narrowed modes for VQN.
+-(define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
++(define_mode_attr V2ntype [(V4HI "8b") (V2SI "4h")
++			   (V8HI "16b") (V4SI "8h")
+			   (V2DI "4s")])
+
+ ;; Widened modes of vector modes.
+diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md +index 1754b1eff..de58562a7 100644 +--- a/gcc/config/aarch64/predicates.md ++++ b/gcc/config/aarch64/predicates.md +@@ -47,6 +47,31 @@ + return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3); + }) + ++(define_predicate "half_size_minus_one_operand" ++ (match_code "const_vector") ++{ ++ op = unwrap_const_vec_duplicate (op); ++ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2; ++ return CONST_INT_P (op) && (UINTVAL (op) == size - 1); ++}) ++ ++(define_predicate "half_size_operand" ++ (match_code "const_vector") ++{ ++ op = unwrap_const_vec_duplicate (op); ++ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2; ++ return CONST_INT_P (op) && (UINTVAL (op) == size); ++}) ++ ++(define_predicate "cmlt_arith_mask_operand" ++ (match_code "const_vector") ++{ ++ op = unwrap_const_vec_duplicate (op); ++ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2; ++ unsigned long long mask = ((unsigned long long) 1 << size) | 1; ++ return CONST_INT_P (op) && (UINTVAL (op) == mask); ++}) ++ + (define_predicate "subreg_lowpart_operator" + (ior (match_code "truncate") + (and (match_code "subreg") +diff --git a/gcc/testsuite/gcc.dg/combine-cmlt.c b/gcc/testsuite/gcc.dg/combine-cmlt.c +new file mode 100755 +index 000000000..b4c9a37ff +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/combine-cmlt.c +@@ -0,0 +1,20 @@ ++/* { dg-do compile { target aarch64-*-* } } */ ++/* { dg-options "-O3 -mcmlt-arith" } */ ++ ++/* The test checks usage of cmlt insns for arithmetic/logic calculations ++ * in foo (). It's inspired by sources of x264 codec. */ ++ ++typedef unsigned short int uint16_t; ++typedef unsigned int uint32_t; ++ ++void foo( uint32_t *a, uint32_t *b) ++{ ++ for (unsigned i = 0; i < 4; i++) ++ { ++ uint32_t s = ((a[i]>>((8 * sizeof(uint16_t))-1)) ++ &(((uint32_t)1<<(8 * sizeof(uint16_t)))+1))*((uint16_t)-1); ++ b[i] = (a[i]+s)^s; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {cmlt\t} 1 } } */ +-- +2.33.0 + diff --git a/0148-Introduce-RTL-ifcvt-enhancements.patch b/0148-Introduce-RTL-ifcvt-enhancements.patch new file mode 100644 index 0000000..1bc1aca --- /dev/null +++ b/0148-Introduce-RTL-ifcvt-enhancements.patch @@ -0,0 +1,502 @@ +From df68d120a049049671e44f6cda51e96a9a82c613 Mon Sep 17 00:00:00 2001 +From: Chernonog Vyacheslav 00812786 +Date: Mon, 28 Nov 2022 14:16:48 +0300 +Subject: [PATCH 10/13] Introduce RTL ifcvt enhancements + +It is controlled by option -fifcvt-allow-complicated-cmps, allowing +ifcvt to deal with complicated cmps like + if (cmp) + X = reg1 + else + X = reg2 + reg3 +and + if (cmp) + X = reg1 + reg3 + Y = reg2 + reg4 + Z = reg3 + +Parameter -param=ifcvt-allow-register-renaming=[0,1,2] allows ifcvt to +aggressively rename registers in basic blocks. +* 0: does not allow ifcvt to rename registers +* 1: allows ifcvt to rename registers in then and else bb +* 2: allows to rename registers in condition and else/then bb +--- + gcc/ifcvt.c | 298 ++++++++++++++++++++++++++++++++++++++----------- + gcc/params.opt | 8 ++ + 2 files changed, 240 insertions(+), 66 deletions(-) + +diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c +index 2452f231c..50a73a7ca 100644 +--- a/gcc/ifcvt.c ++++ b/gcc/ifcvt.c +@@ -1,5 +1,5 @@ + /* If-conversion support. +- Copyright (C) 2000-2020 Free Software Foundation, Inc. ++ Copyright (C) 2000-2022 Free Software Foundation, Inc. + + This file is part of GCC. 
+
+@@ -876,7 +876,9 @@ noce_emit_store_flag (struct noce_if_info *if_info, rtx x, int reversep,
+     }
+
+   /* Don't even try if the comparison operands or the mode of X are weird. */
+-  if (cond_complex || !SCALAR_INT_MODE_P (GET_MODE (x)))
++  if (!param_ifcvt_allow_complicated_cmps
++      && (cond_complex
++	  || !SCALAR_INT_MODE_P (GET_MODE (x))))
+     return NULL_RTX;
+
+   return emit_store_flag (x, code, XEXP (cond, 0),
+@@ -1743,8 +1745,9 @@ noce_emit_cmove (struct noce_if_info *if_info, rtx x, enum rtx_code code,
+
+   /* Don't even try if the comparison operands are weird
+      except that the target supports cbranchcc4. */
+-  if (! general_operand (cmp_a, GET_MODE (cmp_a))
+-      || ! general_operand (cmp_b, GET_MODE (cmp_b)))
++  if (! param_ifcvt_allow_complicated_cmps
++      && (! general_operand (cmp_a, GET_MODE (cmp_a))
++	  || ! general_operand (cmp_b, GET_MODE (cmp_b))))
+     {
+       if (!have_cbranchcc4
+	  || GET_MODE_CLASS (GET_MODE (cmp_a)) != MODE_CC
+@@ -1915,19 +1918,6 @@ noce_try_cmove (struct noce_if_info *if_info)
+   return FALSE;
+ }
+
+-/* Return true if X contains a conditional code mode rtx. */
+-
+-static bool
+-contains_ccmode_rtx_p (rtx x)
+-{
+-  subrtx_iterator::array_type array;
+-  FOR_EACH_SUBRTX (iter, array, x, ALL)
+-    if (GET_MODE_CLASS (GET_MODE (*iter)) == MODE_CC)
+-      return true;
+-
+-  return false;
+-}
+-
+ /* Helper for bb_valid_for_noce_process_p. Validate that
+    the rtx insn INSN is a single set that does not set
+    the conditional register CC and is in general valid for
+@@ -1946,7 +1936,6 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
+   /* Currently support only simple single sets in test_bb. */
+   if (!sset
+       || !noce_operand_ok (SET_DEST (sset))
+-      || contains_ccmode_rtx_p (SET_DEST (sset))
+       || !noce_operand_ok (SET_SRC (sset)))
+     return false;
+
+@@ -1960,13 +1949,17 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
+    in this function. */
+
+ static bool
+-bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
++bbs_ok_for_cmove_arith (basic_block bb_a,
++			basic_block bb_b,
++			rtx to_rename,
++			bitmap conflict_regs)
+ {
+   rtx_insn *a_insn;
+   bitmap bba_sets = BITMAP_ALLOC (&reg_obstack);
+-
++  bitmap intersections = BITMAP_ALLOC (&reg_obstack);
+   df_ref def;
+   df_ref use;
++  rtx_insn *last_a = last_active_insn (bb_a, FALSE);
+
+   FOR_BB_INSNS (bb_a, a_insn)
+     {
+@@ -1976,30 +1969,25 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
+       rtx sset_a = single_set (a_insn);
+
+       if (!sset_a)
+-	{
+-	  BITMAP_FREE (bba_sets);
+-	  return false;
+-	}
++	goto end_cmove_arith_check_and_fail;
++      if (a_insn == last_a)
++	continue;
+       /* Record all registers that BB_A sets. */
+       FOR_EACH_INSN_DEF (def, a_insn)
+	if (!(to_rename && DF_REF_REG (def) == to_rename))
+	  bitmap_set_bit (bba_sets, DF_REF_REGNO (def));
+     }
+
++  bitmap_and (intersections, df_get_live_in (bb_b), bba_sets);
+   rtx_insn *b_insn;
+-
+   FOR_BB_INSNS (bb_b, b_insn)
+     {
+       if (!active_insn_p (b_insn))
+	continue;
+-
+       rtx sset_b = single_set (b_insn);
+
+       if (!sset_b)
+-	{
+-	  BITMAP_FREE (bba_sets);
+-	  return false;
+-	}
++	goto end_cmove_arith_check_and_fail;
+
+       /* Make sure this is a REG and not some instance
+	 of ZERO_EXTRACT or SUBREG or other dangerous stuff.
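As a hypothetical illustration of the new conflict_regs parameter (register
names are made up):

    then_bb: a = b + c
    else_bb: d = a + b

Here else_bb reads the a that then_bb sets; the old check gave up on such
blocks, while the reworked one records a in conflict_regs so the renaming
helpers added below can substitute a fresh pseudo for it.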
+@@ -2011,25 +1999,34 @@ bbs_ok_for_cmove_arith (basic_block bb_a, basic_block bb_b, rtx to_rename)
+       if (MEM_P (SET_DEST (sset_b)))
+	gcc_assert (rtx_equal_p (SET_DEST (sset_b), to_rename));
+       else if (!REG_P (SET_DEST (sset_b)))
+-	{
+-	  BITMAP_FREE (bba_sets);
+-	  return false;
+-	}
++	goto end_cmove_arith_check_and_fail;
+
+-      /* If the insn uses a reg set in BB_A return false. */
++      /* If the insn uses a reg set in BB_A return false
++	 or try to collect register list for renaming. */
+       FOR_EACH_INSN_USE (use, b_insn)
+	{
+-	  if (bitmap_bit_p (bba_sets, DF_REF_REGNO (use)))
++	  if (bitmap_bit_p (intersections, DF_REF_REGNO (use)))
+	    {
+-	      BITMAP_FREE (bba_sets);
+-	      return false;
++	      if (param_ifcvt_allow_register_renaming < 1)
++		goto end_cmove_arith_check_and_fail;
++
++	      /* Those regs should be renamed. We can't rename CC reg, but
++		 possibly we can provide combined comparison in the future. */
++	      if (GET_MODE_CLASS (GET_MODE (DF_REF_REG (use))) == MODE_CC)
++		goto end_cmove_arith_check_and_fail;
++	      bitmap_set_bit (conflict_regs, DF_REF_REGNO (use));
+	    }
+	}
+-
+     }
+
+   BITMAP_FREE (bba_sets);
++  BITMAP_FREE (intersections);
+   return true;
++
++end_cmove_arith_check_and_fail:
++  BITMAP_FREE (bba_sets);
++  BITMAP_FREE (intersections);
++  return false;
+ }
+
+ /* Emit copies of all the active instructions in BB except the last.
+@@ -2084,6 +2081,134 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple)
+   return true;
+ }
+
++/* This function tries to rename regs that the condition uses and the
++   considered bb sets. */
++
++static bool
++noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
++{
++  bool success = true;
++  if (bitmap_empty_p (cond_rename_regs))
++    return true;
++  if (param_ifcvt_allow_register_renaming < 2)
++    return false;
++  df_ref use;
++  rtx_insn* cmp_insn = if_info->cond_earliest;
++  /* A jump instruction as a condition is currently unsupported. */
++  if (JUMP_P (cmp_insn))
++    return false;
++  rtx_insn* before_cmp = PREV_INSN (cmp_insn);
++  start_sequence ();
++  rtx_insn *copy_of_cmp = as_a <rtx_insn *> (copy_rtx (cmp_insn));
++  basic_block cmp_block = BLOCK_FOR_INSN (cmp_insn);
++  FOR_EACH_INSN_USE (use, cmp_insn)
++    {
++      if (bitmap_bit_p (cond_rename_regs, DF_REF_REGNO (use)))
++	{
++	  rtx use_reg = DF_REF_REG (use);
++	  rtx tmp = gen_reg_rtx (GET_MODE (use_reg));
++	  if (!validate_replace_rtx (use_reg, tmp, copy_of_cmp))
++	    {
++	      end_sequence ();
++	      return false;
++	    }
++	  noce_emit_move_insn (tmp, use_reg);
++	}
++    }
++
++  emit_insn (PATTERN (copy_of_cmp));
++  rtx_insn *seq = get_insns ();
++  unshare_all_rtl_in_chain (seq);
++  end_sequence ();
++
++  emit_insn_after_setloc (seq, before_cmp, INSN_LOCATION (cmp_insn));
++  delete_insn_and_edges (cmp_insn);
++  rtx_insn* insn;
++  FOR_BB_INSNS (cmp_block, insn)
++    df_insn_rescan (insn);
++
++  if_info->cond = noce_get_condition (if_info->jump,
++				      &copy_of_cmp,
++				      if_info->then_else_reversed);
++  if_info->cond_earliest = copy_of_cmp;
++  if_info->rev_cond = NULL_RTX;
++
++  return success;
++}
++
++/* This function tries to rename the regs that are set in the considered
++   bb. */
++static bool
++noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs)
++{
++  if (bitmap_empty_p (rename_regs))
++    return true;
++  rtx_insn* insn;
++  rtx_insn *last_insn = last_active_insn (test_bb, FALSE);
++  bool res = true;
++  start_sequence ();
++  FOR_BB_INSNS (test_bb, insn)
++    {
++      if (!active_insn_p (insn))
++	continue;
++      /* Only ssets are supported for now. */
++      rtx sset = single_set (insn);
++      gcc_assert (sset);
++      rtx x = SET_DEST (sset);
++      if (!REG_P (x) || bitmap_bit_p (rename_regs, REGNO (x)))
++	continue;
++
++      machine_mode mode = GET_MODE (x);
++      rtx tmp = gen_reg_rtx (mode);
++      if (!validate_replace_rtx_part (x, tmp, &SET_DEST (sset), insn))
++	{
++	  gcc_assert (insn != last_insn);
++	  /* We could generate an additional move for such a case, but it
++	     would increase register pressure. For now just stop the
++	     transformation. */
++	  rtx result_rtx = SET_DEST (single_set (last_insn));
++	  if (REG_P (result_rtx) && (x != result_rtx))
++	    {
++	      res = false;
++	      break;
++	    }
++	  if (!validate_replace_rtx (x, tmp, insn))
++	    gcc_unreachable ();
++	  noce_emit_move_insn (tmp, x);
++	}
++      set_used_flags (insn);
++      rtx_insn* rename_candidate;
++      for (rename_candidate = NEXT_INSN (insn);
++	   rename_candidate && rename_candidate != NEXT_INSN (BB_END (test_bb));
++	   rename_candidate = NEXT_INSN (rename_candidate))
++	{
++	  if (!reg_overlap_mentioned_p (x, rename_candidate))
++	    continue;
++
++	  int replace_res = TRUE;
++	  if (rename_candidate == last_insn)
++	    {
++	      validate_replace_src_group (x, tmp, rename_candidate);
++	      replace_res = apply_change_group ();
++	    }
++	  else
++	    replace_res = validate_replace_rtx (x, tmp, rename_candidate);
++	  gcc_assert (replace_res);
++	  set_used_flags (rename_candidate);
++
++	}
++      set_used_flags (x);
++      set_used_flags (tmp);
++
++    }
++  rtx_insn *seq = get_insns ();
++  unshare_all_rtl_in_chain (seq);
++  end_sequence ();
++  emit_insn_before_setloc (seq, first_active_insn (test_bb),
++			   INSN_LOCATION (first_active_insn (test_bb)));
++  FOR_BB_INSNS (test_bb, insn)
++    df_insn_rescan (insn);
++  return res;
++}
++
+ /* Try more complex cases involving conditional_move. */
+
+ static int
+@@ -2166,11 +2291,29 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
+	  std::swap (then_bb, else_bb);
+	}
+     }
+-
++  bitmap else_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
++  bitmap then_bb_rename_regs = BITMAP_ALLOC (&reg_obstack);
+   if (then_bb && else_bb
+-      && (!bbs_ok_for_cmove_arith (then_bb, else_bb, if_info->orig_x)
+-	  || !bbs_ok_for_cmove_arith (else_bb, then_bb, if_info->orig_x)))
+-    return FALSE;
++      && (!bbs_ok_for_cmove_arith (then_bb, else_bb,
++				   if_info->orig_x,
++				   then_bb_rename_regs)
++	  || !bbs_ok_for_cmove_arith (else_bb, then_bb,
++				      if_info->orig_x,
++				      else_bb_rename_regs)))
++    {
++      BITMAP_FREE (then_bb_rename_regs);
++      BITMAP_FREE (else_bb_rename_regs);
++      return FALSE;
++    }
++  bool prepass_renaming = true;
++  prepass_renaming &= noce_rename_regs_in_bb (then_bb, then_bb_rename_regs);
++  prepass_renaming &= noce_rename_regs_in_bb (else_bb, else_bb_rename_regs);
++
++  BITMAP_FREE (then_bb_rename_regs);
++  BITMAP_FREE (else_bb_rename_regs);
++
++  if (!prepass_renaming)
++    return FALSE;
+
+   start_sequence ();
+
+   /* If one of the blocks is empty then the corresponding B or A value
+      came from the test block. The non-empty complex block that we will
+      emit might clobber the register used by B or A, so move it to a pseudo
+      first. */
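++  /* For instance, if A is register r and the emitted block itself writes
++     r, we first copy r into a fresh pseudo and feed that copy to the
++     conditional move. */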
+-
+   rtx tmp_a = NULL_RTX;
+   rtx tmp_b = NULL_RTX;
+
+@@ -3052,7 +3194,8 @@ noce_operand_ok (const_rtx op)
+
+ static bool
+ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
+-			     unsigned int *cost, bool *simple_p)
++			     unsigned int *cost, bool *simple_p,
++			     bitmap cond_rename_regs)
+ {
+   if (!test_bb)
+     return false;
+@@ -3086,10 +3229,10 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
+   rtx_insn *prev_last_insn = PREV_INSN (last_insn);
+   gcc_assert (prev_last_insn);
+
+-  /* For now, disallow setting x multiple times in test_bb. */
+-  if (REG_P (x) && reg_set_between_p (x, first_insn, prev_last_insn))
++  if (REG_P (x)
++      && reg_set_between_p (x, first_insn, prev_last_insn)
++      && param_ifcvt_allow_register_renaming < 1)
+     return false;
+-
+   bitmap test_bb_temps = BITMAP_ALLOC (&reg_obstack);
+
+   /* The regs that are live out of test_bb. */
+@@ -3099,25 +3242,35 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond,
+   rtx_insn *insn;
+   FOR_BB_INSNS (test_bb, insn)
+     {
+-      if (insn != last_insn)
+-	{
+-	  if (!active_insn_p (insn))
+-	    continue;
++      if (insn == last_insn)
++	continue;
++      if (!active_insn_p (insn))
++	continue;
+
+-	  if (!insn_valid_noce_process_p (insn, cc))
+-	    goto free_bitmap_and_fail;
++      if (!insn_valid_noce_process_p (insn, cc))
++	goto free_bitmap_and_fail;
+
+-	  rtx sset = single_set (insn);
+-	  gcc_assert (sset);
++      rtx sset = single_set (insn);
++      gcc_assert (sset);
+
+-	  if (contains_mem_rtx_p (SET_SRC (sset))
+-	      || !REG_P (SET_DEST (sset))
+-	      || reg_overlap_mentioned_p (SET_DEST (sset), cond))
+-	    goto free_bitmap_and_fail;
++      if (contains_mem_rtx_p (SET_SRC (sset))
++	  || !REG_P (SET_DEST (sset)))
++	goto free_bitmap_and_fail;
+
+-	  potential_cost += pattern_cost (sset, speed_p);
+-	  bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
++      if (reg_overlap_mentioned_p (SET_DEST (sset), cond))
++	{
++	  if (param_ifcvt_allow_register_renaming < 1)
++	    goto free_bitmap_and_fail;
++	  rtx sset_dest = SET_DEST (sset);
++	  if (REG_P (sset_dest)
++	      && (GET_MODE_CLASS (GET_MODE (sset_dest)) != MODE_CC))
++	    bitmap_set_bit (cond_rename_regs, REGNO (sset_dest));
++	  else
++	    goto free_bitmap_and_fail;
+	}
++      potential_cost += pattern_cost (sset, speed_p);
++      if (SET_DEST (sset) != SET_DEST (last_set))
++	bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset)));
+     }
+
+   /* If any of the intermediate results in test_bb are live after test_bb
+@@ -3475,14 +3628,27 @@ noce_process_if_block (struct noce_if_info *if_info)
+
+   bool speed_p = optimize_bb_for_speed_p (test_bb);
+   unsigned int then_cost = 0, else_cost = 0;
++  bitmap cond_rename_regs = BITMAP_ALLOC (&reg_obstack);
+   if (!bb_valid_for_noce_process_p (then_bb, cond, &then_cost,
+-				    &if_info->then_simple))
+-    return false;
++				    &if_info->then_simple, cond_rename_regs))
++    {
++      BITMAP_FREE (cond_rename_regs);
++      return false;
++    }
+
+   if (else_bb
+       && !bb_valid_for_noce_process_p (else_bb, cond, &else_cost,
+-				       &if_info->else_simple))
++				       &if_info->else_simple, cond_rename_regs))
++    {
++      BITMAP_FREE (cond_rename_regs);
++      return false;
++    }
++
++  if (!noce_rename_regs_in_cond (if_info, cond_rename_regs))
+     return false;
++  cond = if_info->cond;
++
++  BITMAP_FREE (cond_rename_regs);
+
+   if (speed_p)
+     if_info->original_cost += average_cost (then_cost, else_cost,
+@@ -5426,7 +5592,7 @@ if_convert (bool after_combine)
+ {
+   basic_block bb;
+   int pass;
+-
++  cleanup_cfg (CLEANUP_EXPENSIVE);
+   if (optimize == 1)
+     {
+       df_live_add_problem ();
+diff --git a/gcc/params.opt b/gcc/params.opt
+index 83fd705ee..345f9b3ff 100644
+--- a/gcc/params.opt
++++ b/gcc/params.opt
+@@ -574,6 +574,14 @@ Maximum permissible cost for the sequence that would be generated by the RTL if-
+ Common Joined UInteger Var(param_max_rtl_if_conversion_unpredictable_cost) Init(40) IntegerRange(0, 200) Param Optimization
+ Maximum permissible cost for the sequence that would be generated by the RTL if-conversion pass for a branch that is considered unpredictable.
+
++-param=ifcvt-allow-complicated-cmps=
++Common Joined UInteger Var(param_ifcvt_allow_complicated_cmps) IntegerRange(0, 1) Param Optimization
++Allow RTL if-conversion pass to deal with complicated cmps (can increase compilation time).
++
++-param=ifcvt-allow-register-renaming=
++Common Joined UInteger Var(param_ifcvt_allow_register_renaming) IntegerRange(0, 2) Param Optimization
++Allow RTL if-conversion pass to aggressively rename registers in basic blocks. Sometimes additional moves will be created.
++
+ -param=max-sched-extend-regions-iters=
+ Common Joined UInteger Var(param_max_sched_extend_regions_iters) Param Optimization
+ The maximum number of iterations through CFG to extend regions.
+--
+2.33.0
+
diff --git a/0149-Add-more-flexible-check-for-pointer-aliasing-during-.patch b/0149-Add-more-flexible-check-for-pointer-aliasing-during-.patch
new file mode 100644
index 0000000..2d53e18
--- /dev/null
+++ b/0149-Add-more-flexible-check-for-pointer-aliasing-during-.patch
@@ -0,0 +1,239 @@
+From f43bdfbdcfdeb425a0bd303f4787a13323fd2934 Mon Sep 17 00:00:00 2001
+From: vchernon
+Date: Wed, 27 Sep 2023 11:07:29 +0800
+Subject: [PATCH 11/13] Add more flexible check for pointer aliasing during
+ vectorization
+
+It takes the minimum of the iteration count and the segment length, which
+helps to speed up loops with a small number of iterations where only the
+tail can be vectorized.
+---
+ gcc/params.opt                                |  5 ++
+ .../sve/var_stride_flexible_segment_len_1.c   | 23 +++++++
+ gcc/tree-data-ref.c                           | 68 +++++++++++++------
+ gcc/tree-data-ref.h                           | 11 ++-
+ gcc/tree-vect-data-refs.c                     | 14 +++-
+ 5 files changed, 95 insertions(+), 26 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
+
+diff --git a/gcc/params.opt b/gcc/params.opt
+index 83fd705ee..7f335a94b 100644
+--- a/gcc/params.opt
++++ b/gcc/params.opt
+@@ -964,6 +964,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop.
+ Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization
+ Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check.
+
++-param=vect-alias-flexible-segment-len=
++Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization
++Use the minimum of the candidate segment lengths. Currently the minimum of
++the iteration count and the vectorization length is chosen by this param.
++
+ -param=vect-max-version-for-alignment-checks=
+ Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
+ Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
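A minimal C model of the check this parameter enables (illustrative code,
not the GCC internals): the vector version may be taken when the accesses
do not overlap for either the VF-based or the trip-count-based segment
length, i.e. effectively for the minimum of the two.

    /* Illustrative only: simplified byte-based form of the OR-ed condition.  */
    int vector_version_ok (char *a, char *b, long step, long vf, long niters)
    {
      long seg_vf = vf * step;      /* segment for full vector iterations */
      long seg_n  = niters * step;  /* segment bounded by the trip count  */
      return !(a < b + seg_vf && b < a + seg_vf)   /* no overlap, VF length   */
	  || !(a < b + seg_n && b < a + seg_n);    /* no overlap, iter length */
    }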
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c +new file mode 100644 +index 000000000..894f075f3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c +@@ -0,0 +1,23 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */ ++ ++#define TYPE int ++#define SIZE 257 ++ ++void __attribute__ ((weak)) ++f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused))) ++{ ++ for (int i = 0; i < SIZE; ++i) ++ x[i * n] += y[i * n]; ++} ++ ++/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */ ++/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */ ++/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */ ++/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */ ++/* Should use a WAR check that multiplies by (VF-2)*4 rather than ++ an overlap check that multiplies by (257-1)*4. */ ++/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */ ++/* One range check and a check for n being zero. */ ++/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */ ++/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */ +diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c +index 2cb54def8..8c5f1048c 100644 +--- a/gcc/tree-data-ref.c ++++ b/gcc/tree-data-ref.c +@@ -2071,31 +2071,14 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr, + same arguments. Try to optimize cases in which the second access + is a write and in which some overlap is valid. */ + +-static bool +-create_waw_or_war_checks (tree *cond_expr, ++static void ++create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a, + const dr_with_seg_len_pair_t &alias_pair) + { + const dr_with_seg_len& dr_a = alias_pair.first; + const dr_with_seg_len& dr_b = alias_pair.second; + +- /* Check for cases in which: +- +- (a) DR_B is always a write; +- (b) the accesses are well-ordered in both the original and new code +- (see the comment above the DR_ALIAS_* flags for details); and +- (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */ +- if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW)) +- return false; +- +- /* Check for equal (but possibly variable) steps. */ + tree step = DR_STEP (dr_a.dr); +- if (!operand_equal_p (step, DR_STEP (dr_b.dr))) +- return false; +- +- /* Make sure that we can operate on sizetype without loss of precision. */ +- tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr)); +- if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype)) +- return false; + + /* All addresses involved are known to have a common alignment ALIGN. + We can therefore subtract ALIGN from an exclusive endpoint to get +@@ -2112,9 +2095,6 @@ create_waw_or_war_checks (tree *cond_expr, + fold_convert (ssizetype, indicator), + ssize_int (0)); + +- /* Get lengths in sizetype. */ +- tree seg_len_a +- = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len)); + step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step)); + + /* Each access has the following pattern: +@@ -2221,6 +2201,50 @@ create_waw_or_war_checks (tree *cond_expr, + *cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit); + if (dump_enabled_p ()) + dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n"); ++} ++ ++/* This is a wrapper function for create_waw_or_war_checks2. 
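++   It first builds the usual VF-based check and, when
++   param_flexible_seg_len is set and the second segment length differs,
++   ORs in the same check computed with the trip-count-based length,
++   accepting the vector loop if either one passes.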
*/ ++static bool ++create_waw_or_war_checks (tree *cond_expr, ++ const dr_with_seg_len_pair_t &alias_pair) ++{ ++ const dr_with_seg_len& dr_a = alias_pair.first; ++ const dr_with_seg_len& dr_b = alias_pair.second; ++ ++ /* Check for cases in which: ++ ++ (a) DR_B is always a write; ++ (b) the accesses are well-ordered in both the original and new code ++ (see the comment above the DR_ALIAS_* flags for details); and ++ (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */ ++ if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW)) ++ return false; ++ ++ /* Check for equal (but possibly variable) steps. */ ++ tree step = DR_STEP (dr_a.dr); ++ if (!operand_equal_p (step, DR_STEP (dr_b.dr))) ++ return false; ++ ++ /* Make sure that we can operate on sizetype without loss of precision. */ ++ tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr)); ++ if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype)) ++ return false; ++ ++ /* Get lengths in sizetype. */ ++ tree seg_len_a ++ = fold_convert (sizetype, ++ rewrite_to_non_trapping_overflow (dr_a.seg_len)); ++ create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair); ++ if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2) ++ { ++ tree seg_len2_a ++ = fold_convert (sizetype, ++ rewrite_to_non_trapping_overflow (dr_a.seg_len2)); ++ tree cond_expr2; ++ create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair); ++ *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, ++ *cond_expr, cond_expr2); ++ } + return true; + } + +diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h +index 771d20fbb..5903ce66a 100644 +--- a/gcc/tree-data-ref.h ++++ b/gcc/tree-data-ref.h +@@ -208,12 +208,19 @@ class dr_with_seg_len + public: + dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size, + unsigned int a) +- : dr (d), seg_len (len), access_size (size), align (a) {} +- ++ : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a) ++ {} ++ dr_with_seg_len (data_reference_p d, tree len, tree len2, ++ unsigned HOST_WIDE_INT size, unsigned int a) ++ : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a) ++ {} + data_reference_p dr; + /* The offset of the last access that needs to be checked minus + the offset of the first. */ + tree seg_len; ++ /* The second version of segment length. Currently this is used to ++ soften checks for a small number of iterations. */ ++ tree seg_len2; + /* A value that, when added to abs (SEG_LEN), gives the total number of + bytes in the segment. 
*/
+   poly_uint64 access_size;
+diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
+index e4466a4f3..1b8a03c9c 100644
+--- a/gcc/tree-vect-data-refs.c
++++ b/gcc/tree-vect-data-refs.c
+@@ -3498,6 +3498,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
+     {
+       poly_uint64 lower_bound;
+       tree segment_length_a, segment_length_b;
++      tree segment_length2_a, segment_length2_b;
+       unsigned HOST_WIDE_INT access_size_a, access_size_b;
+       unsigned int align_a, align_b;
+
+@@ -3598,6 +3599,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
+	{
+	  segment_length_a = size_zero_node;
+	  segment_length_b = size_zero_node;
++	  segment_length2_a = size_zero_node;
++	  segment_length2_b = size_zero_node;
+	}
+       else
+	{
+@@ -3606,8 +3609,15 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
+	    length_factor = scalar_loop_iters;
+	  else
+	    length_factor = size_int (vect_factor);
++	  /* In any case we should remember scalar_loop_iters; this helps
++	     to create a flexible aliasing check for a small number of
++	     iterations. */
+	  segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
+	  segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
++	  segment_length2_a
++	    = vect_vfa_segment_size (dr_info_a, scalar_loop_iters);
++	  segment_length2_b
++	    = vect_vfa_segment_size (dr_info_b, scalar_loop_iters);
+	}
+       access_size_a = vect_vfa_access_size (dr_info_a);
+       access_size_b = vect_vfa_access_size (dr_info_b);
+@@ -3652,9 +3662,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
+	}
+
+       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
+-			    access_size_a, align_a);
++			    segment_length2_a, access_size_a, align_a);
+       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
+-			    access_size_b, align_b);
++			    segment_length2_b, access_size_b, align_b);
+       /* Canonicalize the order to be the one that's needed for accurate
+	  RAW, WAR and WAW flags, in cases where the data references are
+	  well-ordered. The order doesn't really matter otherwise,
+--
+2.33.0
+
diff --git a/0150-Implement-propagation-of-permutations-in-fwprop.patch b/0150-Implement-propagation-of-permutations-in-fwprop.patch
new file mode 100644
index 0000000..005730e
--- /dev/null
+++ b/0150-Implement-propagation-of-permutations-in-fwprop.patch
@@ -0,0 +1,1050 @@
+From 07aa5f889dc8bc3e642affe21dcfc197ad7d8b3b Mon Sep 17 00:00:00 2001
+From: Diachkov Ilia
+Date: Sun, 3 Sep 2023 05:52:32 +0800
+Subject: [PATCH 12/13] Implement propagation of permutations in fwprop
+
+It is an implementation of permutation forward propagation, which is a
+transformation designed to decrease the number of vector permutation
+instructions in vectorized code, moving the permutations over arithmetic
+operations.
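+In GIMPLE terms the rewrite is roughly (schematic, with P a permutation
+mask and a1/a2, b1/b2 vectorized operands):
+
+  v1 = VEC_PERM_EXPR <a1, a2, P>;          t1 = a1 + b1;
+  v2 = VEC_PERM_EXPR <b1, b2, P>;    =>    t2 = a2 + b2;
+  c  = v1 + v2;                            c  = VEC_PERM_EXPR <t1, t2, P>;
+
+so one permutation survives where two were needed before.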
+--- + gcc/config/aarch64/aarch64-simd.md | 26 + + gcc/params.opt | 4 + + gcc/testsuite/gcc.dg/vect/transpose-9.c | 56 ++ + gcc/tree-ssa-forwprop.c | 891 ++++++++++++++++++++++++ + 4 files changed, 977 insertions(+) + create mode 100755 gcc/testsuite/gcc.dg/vect/transpose-9.c + +diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md +index 6049adc3f..af6d3ebf6 100644 +--- a/gcc/config/aarch64/aarch64-simd.md ++++ b/gcc/config/aarch64/aarch64-simd.md +@@ -4615,6 +4615,19 @@ + [(set_attr "type" "neon_shift_imm_long")] + ) + ++(define_insn "*aarch64_simd_vec_unpacks_lo_shiftsi" ++ [(set (match_operand:V4SI 0 "register_operand" "=w") ++ (ashift:V4SI ++ (sign_extend:V4SI ++ (vec_select:V4HI ++ (match_operand:V8HI 1 "register_operand" "w") ++ (match_operand:V8HI 2 "vect_par_cnst_lo_half" ""))) ++ (match_operand:V4SI 3 "aarch64_simd_rshift_imm" "Dr")))] ++ "TARGET_SIMD" ++ "shll\t%0.4s, %1.4h, #%3" ++ [(set_attr "type" "neon_compare_zero")] ++) ++ + ;; vshll_high_n + + (define_insn "aarch64_shll2_n" +@@ -4632,6 +4645,19 @@ + [(set_attr "type" "neon_shift_imm_long")] + ) + ++(define_insn "*aarch64_simd_vec_unpacks_hi_shiftsi" ++ [(set (match_operand:V4SI 0 "register_operand" "=w") ++ (ashift:V4SI ++ (sign_extend:V4SI ++ (vec_select:V4HI ++ (match_operand:V8HI 1 "register_operand" "w") ++ (match_operand:V8HI 2 "vect_par_cnst_hi_half" ""))) ++ (match_operand:V4SI 3 "aarch64_simd_rshift_imm" "Dr")))] ++ "TARGET_SIMD" ++ "shll2\t%0.4s, %1.8h, #%3" ++ [(set_attr "type" "neon_compare_zero")] ++) ++ + ;; vrshr_n + + (define_insn "aarch64_shr_n" +diff --git a/gcc/params.opt b/gcc/params.opt +index 83fd705ee..a87f6f00a 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -852,6 +852,10 @@ Maximum size, in storage units, of an aggregate which should be considered for s + Common Joined UInteger Var(param_sra_max_propagations) Param Optimization Init(32) + Maximum number of artificial accesses to enable forward propagation that Scalar Replacement of Aggregates will keep for one local variable. + ++-param=tree-forwprop-perm= ++Common Joined UInteger Var(param_tree_forwprop_perm) Param Optimization Init(0) ++Propagate permutations in vectorized code on tree forward propagation. ++ + -param=ssa-name-def-chain-limit= + Common Joined UInteger Var(param_ssa_name_def_chain_limit) Init(512) Param Optimization + The maximum number of SSA_NAME assignments to follow in determining a value. 
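A scalar model of what the combined patterns above compute (illustrative
code, not part of the patch): shll widens each 16-bit lane of the selected
half and shifts it left in a single instruction.

    /* Illustrative equivalent of shll dst.4s, src.4h, #n per lane.  */
    void shll_model (const short *src, int *dst, int n)
    {
      for (int i = 0; i < 4; i++)
	dst[i] = ((int) src[i]) << n;  /* sign-extend, then shift left */
    }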
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-9.c b/gcc/testsuite/gcc.dg/vect/transpose-9.c
+new file mode 100755
+index 000000000..f20a67c6e
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-9.c
+@@ -0,0 +1,56 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-additional-options "-mtune=tsv110 --param=tree-forwprop-perm=1 -fdump-tree-forwprop-details" } */
++/* { dg-require-effective-target vect_int } */
++#include <stdio.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++typedef unsigned short int sum_t;
++typedef unsigned int sum2_t;
++typedef long int intptr_t;
++typedef unsigned char data;
++#define BITS_PER_SUM (8 * sizeof(sum_t))
++
++static sum2_t bar(sum2_t a )
++{
++  sum2_t s = ((a>>(BITS_PER_SUM-1))&(((sum2_t)1<<BITS_PER_SUM)+1))*((sum_t)-1);
++  return (a+s)^s;
++}
++ [...]
++/* { dg-final { scan-tree-dump "Initial permutations were reduced:" "forwprop4" } } */
++/* { dg-final { scan-tree-dump "Permutations were moved through binary operations:" "forwprop4" } } */
++
+diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c
+index ba0b55f4a..92ef5d036 100644
+--- a/gcc/tree-ssa-forwprop.c
++++ b/gcc/tree-ssa-forwprop.c
+@@ -2225,6 +2225,893 @@ simplify_permutation (gimple_stmt_iterator *gsi)
+   return 0;
+ }
+
++/* Compare the UID of two gimple stmts for sorting in ascending order. */
++
++static int
++gimple_uid_cmp (const void *ptr0, const void *ptr1)
++{
++  const gimple *stmt0 = *(gimple * const *) ptr0;
++  const gimple *stmt1 = *(gimple * const *) ptr1;
++
++  if (gimple_uid (stmt0) < gimple_uid (stmt1))
++    return -1;
++  else if (gimple_uid (stmt0) > gimple_uid (stmt1))
++    return 1;
++  return 0;
++}
++
++/* Find a source permutation statement in backward direction through a chain of
++   unary, single or binary operations. In the last case only one variable
++   operand is allowed. If it's found, return true and save the statement in
++   perm_stmts, otherwise return false. */
++
++static bool
++find_src_perm_stmt (tree op, auto_vec<gimple *> &perm_stmts)
++{
++  gimple *stmt;
++  while ((stmt = get_prop_source_stmt (op, false, NULL)))
++    {
++      if (!can_propagate_from (stmt))
++	return false;
++
++      if (gimple_assign_rhs_code (stmt) == VEC_PERM_EXPR)
++	{
++	  perm_stmts.safe_push (stmt);
++	  return true;
++	}
++
++      /* TODO: check vector length and element size. */
++      enum tree_code code = gimple_assign_rhs_code (stmt);
++      switch (get_gimple_rhs_class (code))
++	{
++	case GIMPLE_TERNARY_RHS:
++	  return false;
++	case GIMPLE_BINARY_RHS:
++	  {
++	    tree op1 = gimple_assign_rhs1 (stmt);
++	    tree op2 = gimple_assign_rhs2 (stmt);
++	    bool is_cst_op1 = is_gimple_constant (op1);
++	    bool is_cst_op2 = is_gimple_constant (op2);
++	    if ((is_cst_op1 && is_cst_op2) || (!is_cst_op1 && !is_cst_op2))
++	      return false;
++	    op = !is_cst_op1 && is_cst_op2 ? op1 : op2;
++	    break;
++	  }
++	case GIMPLE_UNARY_RHS:
++	case GIMPLE_SINGLE_RHS:
++	  op = gimple_assign_rhs1 (stmt);
++	  break;
++	default:
++	  gcc_unreachable ();
++	}
++      if (TREE_CODE (op) != SSA_NAME)
++	return false;
++    }
++  return false;
++}
++
++/* Check that the stmt is a binary operation and find initial permutations
++   for both of its sources. */
++
++static bool
++find_initial_permutations (gimple_stmt_iterator *gsi, tree &type,
++			   auto_vec<gimple *> &perm_stmts)
++{
++  gimple *stmt = gsi_stmt (*gsi);
++  enum tree_code code = gimple_assign_rhs_code (stmt);
++
++  // TODO: support other initial binary operations.
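++  /* E.g. for c = a + b, where each of a and b is reachable backwards
++     from a VEC_PERM_EXPR through casts or binops with one constant
++     operand, both source permutations end up in perm_stmts. */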
++  gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
++
++  type = TREE_TYPE (gimple_assign_lhs (stmt));
++  if (!VECTOR_TYPE_P (type))
++    return false;
++  tree op1 = gimple_assign_rhs1 (stmt);
++  tree op2 = gimple_assign_rhs2 (stmt);
++  if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME
++      || TREE_TYPE (op1) != type || TREE_TYPE (op2) != type || op1 == op2)
++    return false;
++
++  if (find_src_perm_stmt (op1, perm_stmts)
++      && find_src_perm_stmt (op2, perm_stmts))
++    return true;
++  return false;
++}
++
++/* Check if the permutation statement is suitable for the transformation. */
++
++static bool
++check_perm_stmt (gimple *stmt, tree type, vec<gimple *> *perm_stmts,
++		 vec<tree> *src_vects)
++{
++  if (!stmt || !can_propagate_from (stmt))
++    return false;
++
++  enum tree_code code = gimple_assign_rhs_code (stmt);
++  if (code != VEC_PERM_EXPR)
++    return false;
++
++  tree op3 = gimple_assign_rhs3 (stmt);
++  tree op1 = gimple_assign_rhs1 (stmt);
++  tree op2 = gimple_assign_rhs2 (stmt);
++  if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME
++      || TREE_CODE (op3) != VECTOR_CST)
++    return false;
++  if (type != NULL_TREE && (TREE_TYPE (op1) != type
++			    || TREE_TYPE (op2) != type))
++    return false;
++  if (perm_stmts)
++    perm_stmts->safe_push (stmt);
++  if (src_vects)
++    {
++      src_vects->safe_push (op1);
++      src_vects->safe_push (op2);
++    }
++  return true;
++}
++
++/* Collect permutation stmts preceding the given stmt. */
++
++static bool
++find_perm_set (gimple *stmt, tree type, vec<gimple *> &perm_stmts,
++	       vec<tree> &src_vects)
++{
++  auto_vec<tree> ops;
++  if (!check_perm_stmt (stmt, NULL, NULL, &ops))
++    return false;
++
++  unsigned i;
++  tree op;
++  bool single_use_op = false;
++  FOR_EACH_VEC_ELT (ops, i, op)
++    {
++      /* Skip if we already processed the same operand. */
++      if (i > 0 && ops[i] == ops[i - 1])
++	continue;
++      /* Find one permutation stmt. */
++      gimple *def_stmt = get_prop_source_stmt (op, false, &single_use_op);
++      if (!check_perm_stmt (def_stmt, type, &perm_stmts, &src_vects))
++	return false;
++      if (single_use_op || src_vects.length () <= 1)
++	return false;
++      unsigned last_i = src_vects.length () - 1;
++      unsigned before_last_i = src_vects.length () - 2;
++
++      /* Find one more permutation stmt. */
++      gimple *use_stmt;
++      imm_use_iterator iter;
++      FOR_EACH_IMM_USE_STMT (use_stmt, iter, src_vects[before_last_i])
++	if (use_stmt != def_stmt)
++	  BREAK_FROM_IMM_USE_STMT (iter);
++      if (!use_stmt || use_stmt == def_stmt
++	  || gimple_assign_rhs_code (use_stmt) != VEC_PERM_EXPR
++	  || src_vects[before_last_i] != gimple_assign_rhs1 (use_stmt)
++	  || src_vects[last_i] != gimple_assign_rhs2 (use_stmt))
++	return false;
++      perm_stmts.safe_push (use_stmt);
++    }
++  return true;
++}
++
++/* Walk permutation pattern and make a vector of permutation indices. */
++
++static bool
++make_vec_of_indices (vec<tree> &perm_pattern, vec<unsigned> &perm_indices)
++{
++  unsigned i, j;
++  tree tree_it;
++  FOR_EACH_VEC_ELT (perm_pattern, i, tree_it)
++    {
++      unsigned HOST_WIDE_INT nelts;
++      if (!VECTOR_CST_NELTS (tree_it).is_constant (&nelts))
++	return false;
++      for (j = 0; j < nelts; j++)
++	{
++	  tree val = VECTOR_CST_ELT (tree_it, j);
++	  gcc_checking_assert (TREE_CODE (val) == INTEGER_CST);
++	  perm_indices.safe_push (TREE_INT_CST_LOW (val));
++	}
++    }
++  return true;
++}
++
++/* Check or collect a permutation pattern in the provided perm_stmts depending
++   on the passed parameters. If collect_pattern is true, collect permutation
++   vectors to pattern. Otherwise, check that the pattern suits perm_stmts.
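++   For a 4x4 transposition the collected pattern is e.g.
++   { {0, 4, 1, 5}, {2, 6, 3, 7} } and the stmts must cycle through the
++   pattern entries in that order.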
*/
++
++static bool
++check_or_collect_perm_pattern (vec<gimple *> &perm_stmts, vec<tree> &pattern,
++			       bool collect_pattern)
++{
++  unsigned i, j;
++  gimple *stmt_it;
++  tree tree_it;
++  FOR_EACH_VEC_ELT (perm_stmts, i, stmt_it)
++    {
++      gcc_assert (gimple_assign_rhs_code (stmt_it) == VEC_PERM_EXPR);
++      tree perm_vec = gimple_assign_rhs3 (stmt_it);
++      bool found = false;
++      FOR_EACH_VEC_ELT (pattern, j, tree_it)
++	if (operand_equal_p (tree_it, perm_vec))
++	  {
++	    found = true;
++	    break;
++	  }
++      if (collect_pattern && !found)
++	pattern.safe_push (perm_vec);
++      else
++	gcc_assert (found);
++      if (i % pattern.length () != j)
++	return false;
++    }
++  return true;
++}
++
++/* Identify the permutation pattern and check it. For now, we are checking
++   only transposition permutations with no more than two distinct masks in
++   their patterns. Collect permutation const vectors and the second
++   permutation stmts. */
++
++static bool
++check_perm_pattern (vec<gimple *> &first_perm_stmts, vec<tree> &perm_pattern,
++		    vec<gimple *> &second_perm_stmts)
++{
++  unsigned i, j;
++  gimple *stmt_it;
++  if (!check_or_collect_perm_pattern (first_perm_stmts, perm_pattern, true))
++    return false;
++
++  if (perm_pattern.length () == 0 || perm_pattern.length () > 2)
++    return false;
++
++  /* Find the second permutation stmts. */
++  hash_set<gimple *> visited;
++  FOR_EACH_VEC_ELT (first_perm_stmts, i, stmt_it)
++    {
++      tree dst = gimple_assign_lhs (stmt_it);
++      use_operand_p use_p;
++      imm_use_iterator iter;
++      FOR_EACH_IMM_USE_FAST (use_p, iter, dst)
++	{
++	  gimple *stmt_it2 = USE_STMT (use_p);
++	  if (visited.contains (stmt_it2))
++	    continue;
++	  second_perm_stmts.safe_push (stmt_it2);
++	  visited.add (stmt_it2);
++	}
++    }
++  second_perm_stmts.qsort (gimple_uid_cmp);
++
++  if (first_perm_stmts.length () != second_perm_stmts.length ())
++    return false;
++
++  /* Check that all second_perm_stmts are VEC_PERM_EXPR. */
++  FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it)
++    if (gimple_assign_rhs_code (stmt_it) != VEC_PERM_EXPR)
++      return false;
++
++  /* Check permutation pattern on the second permutation stmts. */
++  if (!check_or_collect_perm_pattern (second_perm_stmts, perm_pattern, false))
++    return false;
++
++  /* Check values of permutation indices. */
++  auto_vec<unsigned> perm_indices (vector_cst_encoded_nelts (perm_pattern[0])
++				   * perm_pattern.length ());
++  if (!make_vec_of_indices (perm_pattern, perm_indices))
++    return false;
++
++  unsigned val, half_len = perm_indices.length () / 2;
++  FOR_EACH_VEC_ELT (perm_indices, j, val)
++    if (val != (j % 2 ? half_len + j / 2 : j / 2))
++      return false;
++
++  /* Check the correspondence of defs in first_perm_stmts and uses in
++     second_perm_stmts. */
++  tree type1 = TREE_TYPE (gimple_assign_lhs (first_perm_stmts[0]));
++  tree type2 = TREE_TYPE (gimple_assign_lhs (second_perm_stmts[0]));
++  if (type1 != type2)
++    return false;
++
++  unsigned HOST_WIDE_INT len = TYPE_VECTOR_SUBPARTS (type1).to_constant ();
++  FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it)
++    {
++      /* Vectors of first/second perm stmts consist of blocks, each block
++	 transposes its own set of input vectors. J is the index of the
++	 first stmt of such a block in the vector. */
++      unsigned j = (i / len) * len;
++      gimple *src_stmt1 = first_perm_stmts[j + (i - j) / 2];
++      gimple *src_stmt2 = first_perm_stmts[j + (i - j) / 2 + len / 2];
++      if (gimple_assign_rhs1 (stmt_it) != gimple_assign_lhs (src_stmt1)
++	  || gimple_assign_rhs2 (stmt_it) != gimple_assign_lhs (src_stmt2))
++	return false;
++    }
++  return true;
++}
++
++/* For the given vector of stmts find all immediate def or use stmts.
++ It uses SSA and doesn't go through loads/stores. */
++
++static bool
++find_next_stmts (auto_vec &stmts, auto_vec &next_stmts,
++ bool is_forward, bool skip_perms)
++{
++ unsigned i;
++ gimple *stmt_it;
++ hash_set new_stmt_set;
++ FOR_EACH_VEC_ELT (stmts, i, stmt_it)
++ {
++ if (is_forward)
++ {
++ tree lhs = gimple_assign_lhs (stmt_it);
++ if (!lhs || TREE_CODE (lhs) != SSA_NAME)
++ continue;
++ imm_use_iterator iter;
++ gimple *use_stmt;
++ FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
++ if (!new_stmt_set.contains (use_stmt))
++ {
++ new_stmt_set.add (use_stmt);
++ if (!skip_perms
++ || gimple_assign_rhs_code (use_stmt) != VEC_PERM_EXPR)
++ next_stmts.safe_push (use_stmt);
++ }
++ }
++ else
++ {
++ tree rhs;
++ auto_vec rhs_vec (3);
++ if ((rhs = gimple_assign_rhs1 (stmt_it)))
++ rhs_vec.quick_push (rhs);
++ if ((rhs = gimple_assign_rhs2 (stmt_it)))
++ rhs_vec.quick_push (rhs);
++ if ((rhs = gimple_assign_rhs3 (stmt_it)))
++ rhs_vec.quick_push (rhs);
++ unsigned j;
++ FOR_EACH_VEC_ELT (rhs_vec, j, rhs)
++ {
++ if (TREE_CODE (rhs) == VIEW_CONVERT_EXPR)
++ rhs = TREE_OPERAND (rhs, 0);
++ if (TREE_CODE (rhs) != SSA_NAME)
++ continue;
++ gimple *def_stmt = get_prop_source_stmt (rhs, false, NULL);
++ if (!def_stmt)
++ return false;
++ if (new_stmt_set.contains (def_stmt))
++ continue;
++ new_stmt_set.add (def_stmt);
++ if (!skip_perms
++ || gimple_assign_rhs_code (def_stmt) != VEC_PERM_EXPR)
++ next_stmts.safe_push (def_stmt);
++ }
++ }
++ }
++ return true;
++}
++
++/* Check if stmts in the vector have similar code and type. Process only
++ assign stmts. */
++
++static bool
++check_stmts_similarity (auto_vec &stmts, enum tree_code &code)
++{
++ code = NOP_EXPR;
++ tree type = NULL_TREE;
++ unsigned i;
++ gimple *stmt_it;
++ FOR_EACH_VEC_ELT (stmts, i, stmt_it)
++ {
++ if (!is_gimple_assign (stmt_it))
++ return false;
++ tree lhs = gimple_assign_lhs (stmt_it);
++ enum tree_code code2 = gimple_assign_rhs_code (stmt_it);
++ if (type != NULL_TREE)
++ {
++ /* Unpack lo/hi are the same for the analysis. */
++ if (((code2 != VEC_UNPACK_LO_EXPR && code2 != VEC_UNPACK_HI_EXPR)
++ || (code != VEC_UNPACK_LO_EXPR && code != VEC_UNPACK_HI_EXPR))
++ && (!lhs || type != TREE_TYPE (lhs)
++ || (code != NOP_EXPR && code != code2)))
++ return false;
++ }
++ else if (lhs)
++ type = TREE_TYPE (lhs);
++ if (code == NOP_EXPR)
++ code = code2;
++ }
++ return true;
++}
++
++/* Check that the order of definitions of first_stmts and uses of second_stmts
++ is the same. */
++
++static bool
++check_def_use_order (vec &first_stmts, vec &second_stmts)
++{
++ first_stmts.qsort (gimple_uid_cmp);
++ second_stmts.qsort (gimple_uid_cmp);
++ unsigned len1 = first_stmts.length ();
++ unsigned len2 = second_stmts.length ();
++
++ /* Skip if one of the blocks is empty or the second block is permutations.
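++ Otherwise require a fixed correspondence, e.g. (illustration only):
++ with len1 * 2 == len2 the def of first_stmts[I] must feed
++ second_stmts[2 * I] via VEC_UNPACK_LO_EXPR and second_stmts[2 * I + 1]
++ via VEC_UNPACK_HI_EXPR; with equal lengths first_stmts[I] must feed
++ second_stmts[I].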
*/
++ if (!len1 || !len2
++ || gimple_assign_rhs_code (second_stmts[0]) == VEC_PERM_EXPR)
++ return true;
++
++ unsigned i;
++ gimple *stmt_it;
++ FOR_EACH_VEC_ELT (first_stmts, i, stmt_it)
++ {
++ tree op = gimple_assign_lhs (stmt_it);
++ imm_use_iterator iter;
++ gimple *stmt;
++ FOR_EACH_IMM_USE_STMT (stmt, iter, op)
++ {
++ if ((len1 == len2 && stmt != second_stmts[i])
++ || (len1 == len2 * 2 && stmt != second_stmts[i % len2]))
++ RETURN_FROM_IMM_USE_STMT (iter, false);
++ enum tree_code code = gimple_assign_rhs_code (stmt);
++ if ((len1 * 2 == len2)
++ && ((code == VEC_UNPACK_LO_EXPR && stmt != second_stmts[2 * i])
++ || (code == VEC_UNPACK_HI_EXPR
++ && stmt != second_stmts[2 * i + 1])))
++ RETURN_FROM_IMM_USE_STMT (iter, false);
++ }
++ }
++ return true;
++}
++
++/* Check similarity of stmts in the block of arithmetic operations. */
++
++static bool
++check_arithmetic_block (vec &initial_perm_stmts, unsigned nstmts)
++{
++ auto_vec next_stmts (nstmts);
++ auto_vec prev_stmts (nstmts);
++
++ enum tree_code code;
++ unsigned i;
++ gimple *stmt_it;
++ FOR_EACH_VEC_ELT (initial_perm_stmts, i, stmt_it)
++ prev_stmts.quick_push (stmt_it);
++
++ do
++ {
++ next_stmts.block_remove (0, next_stmts.length ());
++ if (!find_next_stmts (prev_stmts, next_stmts, false, true))
++ return false;
++
++ /* Check that types and codes of all stmts in the list are the same. */
++ if (!check_stmts_similarity (next_stmts, code))
++ return false;
++ /* Check that the order of all operands is the same. */
++ if (!check_def_use_order (next_stmts, prev_stmts))
++ return false;
++ prev_stmts.block_remove (0, prev_stmts.length ());
++
++ FOR_EACH_VEC_ELT (next_stmts, i, stmt_it)
++ prev_stmts.safe_push (stmt_it);
++ }
++ while (code != NOP_EXPR);
++
++ return true;
++}
++
++/* Find two blocks of permutations on two sets of input vectors which are
++ used in the same vectorized arithmetic operations after the permutation:
++ Va1...VaN = PERM{P1} (Sa1...SaN)
++ Vb1...VbN = PERM{P1} (Sb1...SbN)
++ Vc1...VcN = binops (Va1...VaN, Vb1...VbN)
++ The goal of the transformation is to execute the block of permutations
++ only once on the result of the arithmetic operations:
++ Va1...VaN = binops (Sa1...SaN, Sb1...SbN)
++ Vc1...VcN = PERM{P1} (Va1...VaN)
++
++ Currently the analysis looks for transposition permutations that consist
++ of two layers of statements e.g.:
++ Vt1 = PERM { 0, 4, 1, 5 } Sa1, Sa2 // the first
++ Vt2 = PERM { 2, 6, 3, 7 } Sa1, Sa2
++ Vt3 = PERM { 0, 4, 1, 5 } Sa3, Sa4
++ Vt4 = PERM { 2, 6, 3, 7 } Sa3, Sa4
++ Va1 = PERM { 0, 4, 1, 5 } Vt1, Vt3 // the second
++ Va2 = PERM { 2, 6, 3, 7 } Vt1, Vt3
++ Va3 = PERM { 0, 4, 1, 5 } Vt2, Vt4
++ Va4 = PERM { 2, 6, 3, 7 } Vt2, Vt4
++ Permutation stmts are collected in first_perm_stmts and second_perm_stmts
++ vectors correspondingly.
++
++ Arithmetic operations may contain several stmts for one pair of input source
++ vectors e.g.:
++ Vtmp1 = unop (Va1)
++ Vtmp2 = binop (Vb1, const)
++ Vc1 = binop (Vtmp1, Vtmp2)
++ The last stmts of each sequence in the arithmetic block are collected
++ in final_arith_stmts.
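++
++ After the transformation (a sketch, not literal GIMPLE output), the
++ binops are applied to Sa1...SaN and Sb1...SbN directly and the same
++ two-layer transposition is re-emitted only once, after the arithmetic
++ block.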
*/
++
++static bool
++analyze_perm_fwprop (tree type, unsigned HOST_WIDE_INT nelts,
++ vec &stmts, auto_vec &src_vects,
++ auto_vec &perm_pattern,
++ auto_vec &final_arith_stmts,
++ auto_vec &second_perm_stmts)
++{
++ gcc_checking_assert (stmts.length () == 2);
++ auto_vec first_perm_stmts (nelts * 2);
++ if (!find_perm_set (stmts[0], type, first_perm_stmts, src_vects)
++ || !find_perm_set (stmts[1], type, first_perm_stmts, src_vects))
++ return false;
++ first_perm_stmts.qsort (gimple_uid_cmp);
++
++ /* Determine permutation pattern. */
++ if (!check_perm_pattern (first_perm_stmts, perm_pattern, second_perm_stmts))
++ return false;
++
++ /* Find all arithmetic stmts. */
++ unsigned i;
++ gimple *stmt_it;
++ auto_vec all_arith_stmts (nelts * 2);
++ hash_set visited;
++ FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it)
++ {
++ tree dst = gimple_assign_lhs (stmt_it);
++ use_operand_p use_p;
++ gimple *use_stmt;
++ if (!single_imm_use (dst, &use_p, &use_stmt))
++ return false;
++ all_arith_stmts.quick_push (use_stmt);
++ visited.add (use_stmt);
++ }
++
++ /* Select final arithmetic stmts. */
++ FOR_EACH_VEC_ELT (all_arith_stmts, i, stmt_it)
++ {
++ tree dst = gimple_assign_lhs (stmt_it);
++ use_operand_p use_p;
++ imm_use_iterator iter;
++ bool use_only_outside_arith_stmts = true;
++ FOR_EACH_IMM_USE_FAST (use_p, iter, dst)
++ if (visited.contains (USE_STMT (use_p)))
++ {
++ use_only_outside_arith_stmts = false;
++ break;
++ }
++ if (use_only_outside_arith_stmts)
++ final_arith_stmts.quick_push (stmt_it);
++ }
++
++ /* Check that all results have the same arithmetic patterns. */
++ if (!check_arithmetic_block (final_arith_stmts, nelts))
++ return false;
++
++ if (final_arith_stmts.length () < nelts)
++ return false;
++ return true;
++}
++
++/* Substitute uses of stmts' results by new_uses. */
++
++static void
++substitute_uses (vec &stmts, vec &new_uses)
++{
++ gcc_checking_assert (stmts.length () == new_uses.length ());
++ unsigned i;
++ gimple *stmt_it;
++ FOR_EACH_VEC_ELT (stmts, i, stmt_it)
++ {
++ tree op = gimple_assign_lhs (stmt_it);
++ imm_use_iterator iter;
++ gimple *use_stmt;
++ FOR_EACH_IMM_USE_STMT (use_stmt, iter, op)
++ {
++ use_operand_p use_p;
++ FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
++ SET_USE (use_p, new_uses[i]);
++ update_stmt (use_stmt);
++ }
++ }
++}
++
++/* Propagate permutations through the block of arithmetic operations. */
++
++static void
++fwprop_perms (tree type, auto_vec &src_vects,
++ auto_vec &perm_pattern,
++ auto_vec &final_arith_stmts,
++ auto_vec &second_perm_stmts)
++{
++ /* Build new permutation stmts after the block of arithmetic stmts.
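++ E.g. (illustration only) with perm_block_size == 4 and a two-mask
++ pattern, the first new layer combines final_arith_stmts[0] with [2]
++ (once per mask) and [1] with [3], mirroring the operand layout of the
++ original first permutation layer.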
*/ ++ gimple_seq new_stmts = NULL; ++ unsigned perm_block_size = final_arith_stmts.length (); ++ auto_vec new_first_perm_vals (perm_block_size); ++ hash_set new_stmts_set; ++ unsigned i, perm_pattern_size = perm_pattern.length (); ++ for (i = 0; i < perm_block_size; i++) ++ { ++ tree op0 = gimple_assign_lhs (final_arith_stmts[i / 2]); ++ unsigned idx = i / 2 + perm_block_size / 2; ++ tree op1 = gimple_assign_lhs (final_arith_stmts[idx]); ++ tree res = gimple_build (&new_stmts, VEC_PERM_EXPR, type, op0, op1, ++ perm_pattern[i % perm_pattern_size]); ++ new_first_perm_vals.quick_push (res); ++ new_stmts_set.add (gimple_seq_last (new_stmts)); ++ } ++ auto_vec new_second_perm_vals (perm_block_size); ++ for (i = 0; i < perm_block_size; i++) ++ { ++ tree op0 = new_first_perm_vals[i / 2]; ++ tree op1 = new_first_perm_vals[i / 2 + perm_block_size/ 2]; ++ tree res = gimple_build (&new_stmts, VEC_PERM_EXPR, type, op0, op1, ++ perm_pattern[i % perm_pattern_size]); ++ new_second_perm_vals.quick_push (res); ++ new_stmts_set.add (gimple_seq_last (new_stmts)); ++ } ++ ++ gimple_stmt_iterator g = gsi_for_stmt (final_arith_stmts.last ()); ++ gsi_insert_seq_after (&g, new_stmts, GSI_SAME_STMT); ++ ++ /* Replace old uses of the arithmetic block results by destinations of ++ the new permutation block. */ ++ gimple *stmt_it; ++ FOR_EACH_VEC_ELT (final_arith_stmts, i, stmt_it) ++ { ++ tree op0 = gimple_assign_lhs (final_arith_stmts[i]); ++ imm_use_iterator iter; ++ gimple *use_stmt; ++ use_operand_p use_p; ++ FOR_EACH_IMM_USE_STMT (use_stmt, iter, op0) ++ { ++ if (new_stmts_set.contains (use_stmt)) ++ continue; ++ FOR_EACH_IMM_USE_ON_STMT (use_p, iter) ++ SET_USE (use_p, new_second_perm_vals[i]); ++ update_stmt (use_stmt); ++ } ++ } ++ ++ /* Disconnect the old permutation stmts. */ ++ substitute_uses (second_perm_stmts, src_vects); ++} ++ ++/* Find the permutation stmts in the forward or backward direction (in terms of ++ def/use graph) starting from the vector of initial stmts. Count reduction ++ stmts (i.e. binary operations) if they can change the number of processed ++ elements. */ ++ ++static bool ++find_perm_stmts (vec &initial_stmts, unsigned nstmts, ++ vec &final_perm_stmts, bool is_forward, ++ unsigned &nreduct) ++{ ++ auto_vec next_stmts (nstmts); ++ auto_vec prev_stmts (nstmts); ++ ++ nreduct = 0; ++ enum tree_code code; ++ unsigned i; ++ gimple *stmt_it; ++ FOR_EACH_VEC_ELT (initial_stmts, i, stmt_it) ++ prev_stmts.quick_push (stmt_it); ++ ++ do ++ { ++ next_stmts.block_remove (0, next_stmts.length ()); ++ if (!find_next_stmts (prev_stmts, next_stmts, is_forward, false)) ++ return false; ++ ++ /* Check that types and codes of all stmts in the list are the same. */ ++ if (!check_stmts_similarity (next_stmts, code)) ++ return false; ++ ++ /* TODO: don't take into account binary operations with constants. */ ++ if (TREE_CODE_CLASS (code) == tcc_binary) ++ nreduct += 1; ++ ++ if (is_forward ? !check_def_use_order (prev_stmts, next_stmts) ++ : !check_def_use_order (next_stmts, prev_stmts)) ++ return false; ++ ++ prev_stmts.block_remove (0, prev_stmts.length ()); ++ ++ FOR_EACH_VEC_ELT (next_stmts, i, stmt_it) ++ prev_stmts.safe_push (stmt_it); ++ } ++ while (code != NOP_EXPR && code != VEC_PERM_EXPR); ++ ++ if (code != VEC_PERM_EXPR) ++ return false; ++ ++ FOR_EACH_VEC_ELT (next_stmts, i, stmt_it) ++ final_perm_stmts.safe_push (stmt_it); ++ final_perm_stmts.qsort (gimple_uid_cmp); ++ return true; ++} ++ ++/* Check if the initial and the final permutations can be optimized i.e. 
++ the initial permutation can be removed with the modification of
++ the final one. */
++
++static bool
++can_reduce_permutations (unsigned init_nelts, vec &perm_pattern,
++ vec &init_perm_stmts)
++{
++ auto_vec perm_indices (init_nelts);
++ if (!make_vec_of_indices (perm_pattern, perm_indices))
++ return false;
++ unsigned i, j;
++ gimple *stmt_it;
++ unsigned perm_vec_size = perm_indices.length ();
++ FOR_EACH_VEC_ELT (init_perm_stmts, i, stmt_it)
++ {
++ gcc_assert (gimple_assign_rhs_code (stmt_it) == VEC_PERM_EXPR);
++ tree perm_vec2 = gimple_assign_rhs3 (stmt_it);
++ unsigned HOST_WIDE_INT mask_elts;
++ if (!VECTOR_CST_NELTS (perm_vec2).is_constant (&mask_elts))
++ return false;
++ for (j = 0; j < mask_elts; j++)
++ {
++ tree val = VECTOR_CST_ELT (perm_vec2, j);
++ gcc_assert (TREE_CODE (val) == INTEGER_CST);
++ unsigned HOST_WIDE_INT int_val = TREE_INT_CST_LOW (val);
++ if (int_val != perm_indices[j % perm_vec_size]
++ + (j / perm_vec_size) * perm_vec_size)
++ return false;
++ }
++ }
++ return true;
++}
++
++/* Find permutation blocks before and after arithmetic operations and decide
++ if the number of permutations can be reduced, e.g.:
++ Va1...VaN = PERM{P1} (Sa1...SaN)
++ Vb1...VbM = some operations (Va1...VaN)
++ Vb1...VbM = PERM{P2} (Sb1...SbM)
++ can be transformed to:
++ Vb1...VbM = some operations (Va1...VaN)
++ Vb1...VbM = PERM{P3} (Sb1...SbM)
++
++ Currently it supports initial permutations like this:
++ Va1 = PERM { 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15} Sa1
++ and transposition permutations with two layers of permutation stmts as
++ final permutations.
++
++ Operations between permutations can include unary and binary arithmetic,
++ element conversions and vector packing/unpacking. */
++
++static bool
++analyze_perm_reduction (unsigned HOST_WIDE_INT nelts,
++ vec &perm_stmts,
++ vec &init_perm_stmts,
++ vec &second_perm_stmts)
++{
++ auto_vec first_perm_stmts (nelts * 2);
++ if (!check_perm_stmt (perm_stmts[0], NULL_TREE, &first_perm_stmts, NULL)
++ || !check_perm_stmt (perm_stmts[1], NULL_TREE, &first_perm_stmts, NULL))
++ return false;
++
++ unsigned nreduct;
++ auto_vec final_perm_stmts (nelts * 2);
++ if (!find_perm_stmts (first_perm_stmts, nelts, final_perm_stmts, true,
++ nreduct))
++ return false;
++
++ if (!find_perm_stmts (final_perm_stmts, nelts, init_perm_stmts, false,
++ nreduct))
++ return false;
++
++ /* Check the number of elements in the initial and final data block. */
++ tree init_elem_type = TREE_TYPE (gimple_assign_lhs (init_perm_stmts[0]));
++ unsigned init_nelts = TYPE_VECTOR_SUBPARTS (init_elem_type).to_constant ()
++ * init_perm_stmts.length ();
++ tree final_elem_type = TREE_TYPE (gimple_assign_lhs (final_perm_stmts[0]));
++ unsigned final_nelts = TYPE_VECTOR_SUBPARTS (final_elem_type).to_constant ()
++ * final_perm_stmts.length ();
++ if (init_nelts != final_nelts * (1 + nreduct))
++ return false;
++
++ /* Check the final permutations and detect their pattern. */
++ auto_vec perm_pattern (nelts);
++ if (!check_perm_pattern (final_perm_stmts, perm_pattern, second_perm_stmts))
++ return false;
++
++ return can_reduce_permutations (init_nelts, perm_pattern, init_perm_stmts);
++}
++
++/* Do the optimization: skip the initial permutation and change the order
++ of destinations after the second layer of permutation statements in
++ the final permutation block.
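++ E.g. (illustration only) for four second-layer stmts and half == 2 the
++ index mapping below yields the lhs order { lhs0, lhs2, lhs1, lhs3 }, so
++ the destinations of even and odd positions are interleaved.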
*/ ++ ++static void ++reduce_perms (vec &init_perm_stmts, vec &second_perm_stmts) ++{ ++ unsigned i; ++ gimple *stmt_it; ++ auto_vec new_srcs (init_perm_stmts.length ()); ++ FOR_EACH_VEC_ELT (init_perm_stmts, i, stmt_it) ++ new_srcs.quick_push (gimple_assign_rhs1 (stmt_it)); ++ substitute_uses (init_perm_stmts, new_srcs); ++ ++ unsigned half = second_perm_stmts.length () / 2; ++ auto_vec new_dsts (second_perm_stmts.length ()); ++ FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it) ++ { ++ unsigned idx = i < half ? i << 1 : ((i - half) << 1) + 1; ++ new_dsts.quick_push (gimple_assign_lhs (second_perm_stmts[idx])); ++ } ++ ++ FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it) ++ { ++ gimple_assign_set_lhs (stmt_it, new_dsts[i]); ++ update_stmt (stmt_it); ++ } ++} ++ ++/* Optimize permutations in the following two cases: ++ 1. Recognize the same permutations of two sets of vectors with subsequent ++ binary arithmetic operations on them: ++ V1 = PERM{1} (S1); ++ V2 = PERM{1} (S2); ++ V3 = V1 binop V2; ++ then move the permutation after the operations: ++ V0 = S1 binop S2; ++ V3 = PERM{1} V0; ++ 2. Detect the first permutation before some operations on a set of vectors ++ and the second one after the operations: ++ V1 = PERM{1} (S1) ++ V2 = set of operations (V1) ++ V3 = PERM{2} (V2) ++ try to reduce them: ++ V2 = set of operations (S1) ++ V3 = PERM{3} (V2) ++ Return true if the optimization is successful. */ ++ ++static bool ++propagate_permutations (gimple_stmt_iterator *gsi) ++{ ++ tree type; ++ auto_vec perm_stmts (2); ++ ++ if (!find_initial_permutations (gsi, type, perm_stmts)) ++ return false; ++ ++ unsigned HOST_WIDE_INT nelts = TYPE_VECTOR_SUBPARTS (type).to_constant (); ++ auto_vec final_arith_stmts (nelts * 2); ++ auto_vec second_perm_stmts (nelts * 2); ++ auto_vec src_vects (nelts * 2); ++ auto_vec perm_pattern (nelts); ++ if (analyze_perm_fwprop (type, nelts, perm_stmts, src_vects, perm_pattern, ++ final_arith_stmts, second_perm_stmts)) ++ { ++ fwprop_perms (type, src_vects, perm_pattern, final_arith_stmts, ++ second_perm_stmts); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ unsigned i; ++ gimple *stmt_it; ++ fprintf (dump_file, "Permutations were moved through " ++ "binary operations:\n"); ++ FOR_EACH_VEC_ELT (second_perm_stmts, i, stmt_it) ++ print_gimple_stmt (dump_file, stmt_it, 0); ++ } ++ return true; ++ } ++ ++ auto_vec init_perm_stmts (nelts * 2); ++ auto_vec final_perm_stmts (nelts * 2); ++ if (analyze_perm_reduction (nelts, perm_stmts, init_perm_stmts, ++ final_perm_stmts)) ++ { ++ reduce_perms (init_perm_stmts, final_perm_stmts); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ unsigned i; ++ gimple *stmt_it; ++ fprintf (dump_file, "Initial permutations were reduced:\n"); ++ FOR_EACH_VEC_ELT (init_perm_stmts, i, stmt_it) ++ print_gimple_stmt (dump_file, stmt_it, 0); ++ } ++ return true; ++ } ++ return false; ++} ++ + /* Get the BIT_FIELD_REF definition of VAL, if any, looking through + conversions with code CONV_CODE or update it if still ERROR_MARK. + Return NULL_TREE if no such matching def was found. 
*/
+@@ -3155,6 +4042,10 @@ pass_forwprop::execute (function *fun)
+ || code == BIT_XOR_EXPR)
+ && simplify_rotate (&gsi))
+ changed = true;
++ else if ((code == PLUS_EXPR || code == MINUS_EXPR)
++ && param_tree_forwprop_perm
++ && propagate_permutations (&gsi))
++ changed = true;
+ else if (code == VEC_PERM_EXPR)
+ {
+ int did_something = simplify_permutation (&gsi);
+--
+2.33.0
+
diff --git a/0151-Fix-bugs-and-add-tests-for-RTL-ifcvt.patch b/0151-Fix-bugs-and-add-tests-for-RTL-ifcvt.patch
new file mode 100644
index 0000000..b228e2e
--- /dev/null
+++ b/0151-Fix-bugs-and-add-tests-for-RTL-ifcvt.patch
@@ -0,0 +1,381 @@
+From 4bcb19923cdcb042d66057766d661ef68bf70e92 Mon Sep 17 00:00:00 2001
+From: Chernonog Vyacheslav 00812786
+Date: Wed, 29 Mar 2023 05:22:17 +0300
+Subject: [PATCH 13/13] Fix bugs and add tests for RTL ifcvt
+
+1. Fix a bug in RTL ifcvt that ran the pass despite a renaming failure.
+2. Fix a bug that prevented the final set register from being renamed.
+3. Clean up dominance info before running cleanup_cfg to avoid fixing up
+ invalid dominance info.
+4. Remove duplicated cleanup_cfg.
+5. Add tests.
+---
+ gcc/common.opt | 4 +
+ gcc/ifcvt.c | 88 ++++++++++++-------
+ gcc/params.opt | 4 -
+ .../gcc.c-torture/execute/ifcvt-renaming-1.c | 38 ++++++++
+ gcc/testsuite/gcc.dg/ifcvt-6.c | 29 ++++++
+ 5 files changed, 128 insertions(+), 35 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c
+ create mode 100644 gcc/testsuite/gcc.dg/ifcvt-6.c
+
+diff --git a/gcc/common.opt b/gcc/common.opt
+index 6f0ed7cea..92d3a1986 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -3534,4 +3534,8 @@ fipa-ra
+ Common Report Var(flag_ipa_ra) Optimization
+ Use caller save register across calls if possible.
+
++fifcvt-allow-complicated-cmps
++Common Report Var(flag_ifcvt_allow_complicated_cmps) Optimization
++Allow RTL if-conversion pass to deal with complicated cmps (can increase compilation time).
++
+ ; This comment is to ensure we retain the blank line above.
+diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c
+index 50a73a7ca..209987ebc 100644
+--- a/gcc/ifcvt.c
++++ b/gcc/ifcvt.c
+@@ -876,7 +876,7 @@ noce_emit_store_flag (struct noce_if_info *if_info, rtx x, int reversep,
+ }
+
+ /* Don't even try if the comparison operands or the mode of X are weird. */
+- if (!param_ifcvt_allow_complicated_cmps
++ if (!flag_ifcvt_allow_complicated_cmps
+ && (cond_complex
+ || !SCALAR_INT_MODE_P (GET_MODE (x))))
+ return NULL_RTX;
+@@ -1745,7 +1745,7 @@ noce_emit_cmove (struct noce_if_info *if_info, rtx x, enum rtx_code code,
+
+ /* Don't even try if the comparison operands are weird
+ except that the target supports cbranchcc4. */
+- if (! param_ifcvt_allow_complicated_cmps
++ if (! flag_ifcvt_allow_complicated_cmps
+ && (! general_operand (cmp_a, GET_MODE (cmp_a))
+ || ! general_operand (cmp_b, GET_MODE (cmp_b))))
+ {
+@@ -1918,6 +1918,19 @@ noce_try_cmove (struct noce_if_info *if_info)
+ return FALSE;
+ }
+
++/* Return true if X contains a conditional code mode rtx. */
++
++static bool
++contains_ccmode_rtx_p (rtx x)
++{
++ subrtx_iterator::array_type array;
++ FOR_EACH_SUBRTX (iter, array, x, ALL)
++ if (GET_MODE_CLASS (GET_MODE (*iter)) == MODE_CC)
++ return true;
++
++ return false;
++}
++
+ /* Helper for bb_valid_for_noce_process_p. Validate that
+ the rtx insn INSN is a single set that does not set
+ the conditional register CC and is in general valid for
+@@ -1936,6 +1949,8 @@ insn_valid_noce_process_p (rtx_insn *insn, rtx cc)
+ /* Currently support only simple single sets in test_bb.
*/
+ if (!sset
+ || !noce_operand_ok (SET_DEST (sset))
++ || (!flag_ifcvt_allow_complicated_cmps
++ && contains_ccmode_rtx_p (SET_DEST (sset)))
+ || !noce_operand_ok (SET_SRC (sset)))
+ return false;
+
+@@ -1974,8 +1989,7 @@ bbs_ok_for_cmove_arith (basic_block bb_a,
+ continue;
+ /* Record all registers that BB_A sets. */
+ FOR_EACH_INSN_DEF (def, a_insn)
+- if (!(to_rename && DF_REF_REG (def) == to_rename))
+- bitmap_set_bit (bba_sets, DF_REF_REGNO (def));
++ bitmap_set_bit (bba_sets, DF_REF_REGNO (def));
+ }
+
+ bitmap_and (intersections, df_get_live_in (bb_b), bba_sets);
+@@ -1984,6 +1998,7 @@
+ {
+ if (!active_insn_p (b_insn))
+ continue;
++
+ rtx sset_b = single_set (b_insn);
+
+ if (!sset_b)
+@@ -2081,7 +2096,12 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple)
+ return true;
+ }
+
+-/* This function tries to rename regs that intersect with considered bb. */
++/* This function tries to rename regs that intersect with considered bb
++ inside the condition expression. The condition expression will be moved
++ down if the optimization is applied, so it is essential to be sure that
++ all intersected registers will be renamed, otherwise the transformation
++ can't be applied. The function returns true if renaming was successful
++ and the optimization can proceed further. */
+
+ static bool
+ noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
+@@ -2092,11 +2112,11 @@ noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
+ if (param_ifcvt_allow_register_renaming < 2)
+ return false;
+ df_ref use;
+- rtx_insn* cmp_insn = if_info->cond_earliest;
++ rtx_insn *cmp_insn = if_info->cond_earliest;
+ /* Jump instruction as a condion currently unsupported. */
+ if (JUMP_P (cmp_insn))
+ return false;
+- rtx_insn* before_cmp = PREV_INSN (cmp_insn);
++ rtx_insn *before_cmp = PREV_INSN (cmp_insn);
+ start_sequence ();
+ rtx_insn *copy_of_cmp = as_a (copy_rtx (cmp_insn));
+ basic_block cmp_block = BLOCK_FOR_INSN (cmp_insn);
+@@ -2122,7 +2142,7 @@ noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
+
+ emit_insn_after_setloc (seq, before_cmp, INSN_LOCATION (cmp_insn));
+ delete_insn_and_edges (cmp_insn);
+- rtx_insn* insn;
++ rtx_insn *insn;
+ FOR_BB_INSNS (cmp_block, insn)
+ df_insn_rescan (insn);
+
+@@ -2135,13 +2155,15 @@ noce_rename_regs_in_cond (struct noce_if_info *if_info, bitmap cond_rename_regs)
+ return success;
+ }
+
+-/* This function tries to rename regs that intersect with considered bb. */
++/* This function tries to rename regs that intersect with considered bb.
++ Return true if the renaming was successful and the optimization can
++ proceed further, false otherwise.
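++ A simplified sketch of the mechanism (not a literal dump): each
++ destination register X found in RENAME_REGS gets a fresh pseudo TMP;
++ TMP is initialized from X ahead of the block and the references to X
++ that follow its definition inside test_bb are rewritten to TMP via
++ validate_replace_rtx.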
*/ + static bool + noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs) + { + if (bitmap_empty_p (rename_regs)) + return true; +- rtx_insn* insn; ++ rtx_insn *insn; + rtx_insn *last_insn = last_active_insn (test_bb, FALSE); + bool res = true; + start_sequence (); +@@ -2153,7 +2175,7 @@ noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs) + rtx sset = single_set (insn); + gcc_assert (sset); + rtx x = SET_DEST (sset); +- if (!REG_P (x) || bitmap_bit_p (rename_regs, REGNO (x))) ++ if (!REG_P (x) || !bitmap_bit_p (rename_regs, REGNO (x))) + continue; + + machine_mode mode = GET_MODE (x); +@@ -2175,7 +2197,7 @@ noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs) + noce_emit_move_insn (tmp,x); + } + set_used_flags (insn); +- rtx_insn* rename_candidate; ++ rtx_insn *rename_candidate; + for (rename_candidate = NEXT_INSN (insn); + rename_candidate && rename_candidate!= NEXT_INSN (BB_END (test_bb)); + rename_candidate = NEXT_INSN (rename_candidate)) +@@ -2193,17 +2215,16 @@ noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs) + replace_res = validate_replace_rtx (x, tmp, rename_candidate); + gcc_assert (replace_res); + set_used_flags (rename_candidate); +- + } + set_used_flags (x); + set_used_flags (tmp); +- + } +- rtx_insn *seq = get_insns (); +- unshare_all_rtl_in_chain (seq); +- end_sequence (); +- emit_insn_before_setloc (seq, first_active_insn (test_bb), +- INSN_LOCATION (first_active_insn (test_bb))); ++ ++ rtx_insn *seq = get_insns (); ++ unshare_all_rtl_in_chain (seq); ++ end_sequence (); ++ emit_insn_before_setloc (seq, first_active_insn (test_bb), ++ INSN_LOCATION (first_active_insn (test_bb))); + FOR_BB_INSNS (test_bb, insn) + df_insn_rescan (insn); + return res; +@@ -2305,9 +2326,10 @@ noce_try_cmove_arith (struct noce_if_info *if_info) + BITMAP_FREE (else_bb_rename_regs); + return FALSE; + } +- bool prepass_renaming = true; +- prepass_renaming |= noce_rename_regs_in_bb (then_bb, then_bb_rename_regs); +- prepass_renaming |= noce_rename_regs_in_bb (else_bb, else_bb_rename_regs); ++ bool prepass_renaming = noce_rename_regs_in_bb (then_bb, ++ then_bb_rename_regs) ++ && noce_rename_regs_in_bb (else_bb, ++ else_bb_rename_regs); + + BITMAP_FREE (then_bb_rename_regs); + BITMAP_FREE (else_bb_rename_regs); +@@ -2321,6 +2343,7 @@ noce_try_cmove_arith (struct noce_if_info *if_info) + came from the test block. The non-empty complex block that we will + emit might clobber the register used by B or A, so move it to a pseudo + first. */ ++ + rtx tmp_a = NULL_RTX; + rtx tmp_b = NULL_RTX; + +@@ -3233,6 +3256,7 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond, + && reg_set_between_p (x, first_insn, prev_last_insn) + && param_ifcvt_allow_register_renaming < 1) + return false; ++ + bitmap test_bb_temps = BITMAP_ALLOC (®_obstack); + + /* The regs that are live out of test_bb. 
*/ +@@ -3268,9 +3292,10 @@ bb_valid_for_noce_process_p (basic_block test_bb, rtx cond, + else + goto free_bitmap_and_fail; + } +- potential_cost += pattern_cost (sset, speed_p); +- if (SET_DEST (sset) != SET_DEST (last_set)) +- bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset))); ++ ++ potential_cost += pattern_cost (sset, speed_p); ++ if (SET_DEST (sset) != SET_DEST (last_set)) ++ bitmap_set_bit (test_bb_temps, REGNO (SET_DEST (sset))); + } + + /* If any of the intermediate results in test_bb are live after test_bb +@@ -3645,11 +3670,12 @@ noce_process_if_block (struct noce_if_info *if_info) + } + + if (!noce_rename_regs_in_cond (if_info, cond_rename_regs)) +- return false; +- cond = if_info->cond; +- ++ { ++ BITMAP_FREE (cond_rename_regs); ++ return false; ++ } + BITMAP_FREE (cond_rename_regs); +- ++ cond = if_info->cond; + if (speed_p) + if_info->original_cost += average_cost (then_cost, else_cost, + find_edge (test_bb, then_bb)); +@@ -5592,12 +5618,13 @@ if_convert (bool after_combine) + { + basic_block bb; + int pass; +- cleanup_cfg (CLEANUP_EXPENSIVE); ++ + if (optimize == 1) + { + df_live_add_problem (); + df_live_set_all_dirty (); + } ++ cleanup_cfg (CLEANUP_EXPENSIVE); + + /* Record whether we are after combine pass. */ + ifcvt_after_combine = after_combine; +@@ -5702,7 +5729,6 @@ rest_of_handle_if_conversion (void) + dump_reg_info (dump_file); + dump_flow_info (dump_file, dump_flags); + } +- cleanup_cfg (CLEANUP_EXPENSIVE); + if_convert (false); + if (num_updated_if_blocks) + /* Get rid of any dead CC-related instructions. */ +diff --git a/gcc/params.opt b/gcc/params.opt +index 345f9b3ff..272a0eb2b 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -574,10 +574,6 @@ Maximum permissible cost for the sequence that would be generated by the RTL if- + Common Joined UInteger Var(param_max_rtl_if_conversion_unpredictable_cost) Init(40) IntegerRange(0, 200) Param Optimization + Maximum permissible cost for the sequence that would be generated by the RTL if-conversion pass for a branch that is considered unpredictable. + +--param=ifcvt-allow-complicated-cmps= +-Common Joined UInteger Var(param_ifcvt_allow_complicated_cmps) IntegerRange(0, 1) Param Optimization +-Allow RTL if-conversion pass to deal with complicated cmps (can increase compilation time). +- + -param=ifcvt-allow-register-renaming= + Common Joined UInteger Var(param_ifcvt_allow_register_renaming) IntegerRange(0, 2) Param Optimization + Allow RTL if-conversion pass to aggressively rename registers in basic blocks. Sometimes additional moves will be created. 
+diff --git a/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c b/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c +new file mode 100644 +index 000000000..761c8ab7e +--- /dev/null ++++ b/gcc/testsuite/gcc.c-torture/execute/ifcvt-renaming-1.c +@@ -0,0 +1,38 @@ ++ ++extern void abort(void); ++ ++__attribute__ ((noinline)) ++int foo (int x, int y, int z, int a, int b) ++{ ++ if (a < 2) ++ { ++ if (a == 0) ++ { ++ if (x - y < 0) ++ x = x - y + z; ++ else ++ x = x - y; ++ } ++ else ++ { ++ if (x + y >= z) ++ x = x + y - z; ++ else ++ x = x + y; ++ } ++ } ++ return x; ++} ++ ++int main(void) ++{ ++ if (foo (5,10,7,0,1) != 2) // x - y + z = -5 + 7 = 2 ++ abort (); ++ if (foo (50,10,7,0,1) != 40) // x - y = 40 ++ abort (); ++ if (foo (5,10,7,1,1) != 8) // x + y - z = 5 + 10 - 7 = 8 ++ abort (); ++ if (foo (5,10,70,1,1) != 15) // x + y = 15 ++ abort (); ++ return 0; ++} +diff --git a/gcc/testsuite/gcc.dg/ifcvt-6.c b/gcc/testsuite/gcc.dg/ifcvt-6.c +new file mode 100644 +index 000000000..7d2a8d58b +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/ifcvt-6.c +@@ -0,0 +1,29 @@ ++/* { dg-do compile { target { aarch64*-*-* } } } */ ++/* { dg-options "-fdump-rtl-ce1 -O2 -fifcvt-allow-complicated-cmps --param max-rtl-if-conversion-unpredictable-cost=100 --param max-rtl-if-conversion-predictable-cost=100 --param=ifcvt-allow-register-renaming=2 " } */ ++ ++typedef unsigned int uint16_t; ++ ++uint16_t ++foo (uint16_t x, uint16_t y, uint16_t z, uint16_t a, ++ uint16_t b, uint16_t c, uint16_t d) ++{ ++ int i = 1; ++ int j = 1; ++ if (a > b) ++ { ++ j = x; ++ if (b > c) ++ i = y; ++ else ++ i = z; ++ } ++ else ++ { ++ j = y; ++ if (c > d) ++ i = z; ++ } ++ return i * j; ++} ++ ++/* { dg-final { scan-rtl-dump "7 true changes made" "ce1" } } */ +-- +2.33.0 + diff --git a/0152-Add-LLC-Allocation-Pass.patch b/0152-Add-LLC-Allocation-Pass.patch new file mode 100644 index 0000000..0a19214 --- /dev/null +++ b/0152-Add-LLC-Allocation-Pass.patch @@ -0,0 +1,4905 @@ +From e0e139bf642398d1e1b8cfd803ee6ce276404991 Mon Sep 17 00:00:00 2001 +From: huangxiaoquan +Date: Wed, 6 Dec 2023 17:51:11 +0800 +Subject: [PATCH] Add LLC-Allocation Pass LLC allocation allows the compiler to + identify frequently-used data in the program and strengthens the ability to + prefetch and distribute it to the last level cache (LLC) through memory + accesses of the corresponding data variables. Add flag -fllc-allocate to + enable LLC allocation. 
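+
+As an illustration of the kind of code the pass targets (a sketch only,
+modelled on the new llc-2.c test below): for an indirect access pattern
+such as
+
+ for (i = 0; i < num_rows; i++)
+ for (j = A_i[i]; j < A_i[i+1]; j++)
+ temp += A_data[j] * x_data[A_j[j]];
+
+the pass can issue SVE gather-prefetch hints (the dumps show "insert
+svprfd_gather") for x_data, so the indirectly addressed data is pulled
+into the LLC ahead of its use.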
+ +--- + gcc/Makefile.in | 1 + + gcc/cfgloop.h | 3 + + gcc/common.opt | 4 + + gcc/config/aarch64/aarch64-sve.md | 48 +- + gcc/config/aarch64/aarch64.c | 18 + + gcc/doc/tm.texi | 21 + + gcc/doc/tm.texi.in | 6 + + gcc/internal-fn.c | 115 + + gcc/internal-fn.def | 4 + + gcc/optabs.def | 2 + + gcc/params.opt | 53 + + gcc/passes.def | 1 + + gcc/target.def | 31 + + gcc/testsuite/gcc.dg/llc-allocate/llc-1.c | 61 + + gcc/testsuite/gcc.dg/llc-allocate/llc-2.c | 54 + + .../gcc.dg/llc-allocate/llc-allocate.exp | 27 + + .../llc-allocate/llc-issue-builtin-prefetch.c | 48 + + .../gcc.dg/llc-allocate/llc-nonzero-offset.c | 50 + + .../gcc.dg/llc-allocate/llc-ref-trace.c | 62 + + .../llc-allocate/llc-tool-insertion-1.c | 48 + + .../llc-allocate/llc-tool-insertion-2.c | 48 + + .../llc-allocate/llc-tool-insertion-3.c | 48 + + .../llc-allocate/llc-tool-insertion-4.c | 47 + + .../llc-allocate/llc-tool-insertion-5.c | 48 + + .../llc-allocate/llc-tool-insertion-6.c | 47 + + .../llc-tool-insertion-7-null-var-name.c | 52 + + .../llc-tool-insertion-8-tmp-var-name.c | 54 + + .../gfortran.dg/llc-allocate/llc-3.f90 | 213 ++ + .../gfortran.dg/llc-allocate/llc-allocate.exp | 29 + + .../llc-trace-multiple-base-var.f90 | 63 + + .../llc-unknown-type-size-unit.f90 | 58 + + gcc/timevar.def | 1 + + gcc/tree-cfg.c | 11 + + gcc/tree-cfg.h | 1 + + gcc/tree-pass.h | 1 + + gcc/tree-scalar-evolution.c | 8 +- + gcc/tree-scalar-evolution.h | 3 +- + gcc/tree-ssa-llc-allocate.c | 2898 +++++++++++++++++ + gcc/tree-ssa-loop-niter.c | 38 +- + gcc/tree-ssa-loop-niter.h | 3 +- + 40 files changed, 4297 insertions(+), 31 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-1.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-2.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-nonzero-offset.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-2.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-3.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-4.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-5.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-6.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-7-null-var-name.c + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 + create mode 100644 gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 + create mode 100644 gcc/tree-ssa-llc-allocate.c + +diff --git a/gcc/Makefile.in b/gcc/Makefile.in +index 2a59acfbe..31bf2cde2 100644 +--- a/gcc/Makefile.in ++++ b/gcc/Makefile.in +@@ -1594,6 +1594,7 @@ OBJS = \ + tree-ssa-loop-array-widen-compare.o \ + tree-ssa-loop-crc.o \ + tree-ssa-loop-prefetch.o \ ++ tree-ssa-llc-allocate.o \ + tree-ssa-loop-split.o \ + tree-ssa-loop-unswitch.o \ + tree-ssa-loop.o \ +diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h +index 18b404e29..e3ecf5076 100644 +--- a/gcc/cfgloop.h ++++ b/gcc/cfgloop.h +@@ 
-272,6 +272,9 @@ public: + the basic-block from being collected but its index can still be + reused. */ + basic_block former_header; ++ ++ /* Number of latch executions from vectorization. */ ++ tree vec_nb_iterations; + }; + + /* Set if the loop is known to be infinite. */ +diff --git a/gcc/common.opt b/gcc/common.opt +index 4db061b44..2dde0f673 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -2233,6 +2233,10 @@ Common Joined RejectNegative UInteger Var(prefetch_level) Init(0) IntegerRange(0 + Generate prefetch instructions, if available, for arrays in loops. The prefetch + level can control the optimize level to array prefetch. + ++fllc-allocate ++Common Report Var(flag_llc_allocate) Init(-1) Optimization ++Generate LLC hint instructions. ++ + fprofile + Common Report Var(profile_flag) + Enable basic program profiling code. +diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md +index d17a77706..c5b99b6c4 100644 +--- a/gcc/config/aarch64/aarch64-sve.md ++++ b/gcc/config/aarch64/aarch64-sve.md +@@ -1940,7 +1940,7 @@ + (define_insn "@aarch64_sve_prefetch" + [(prefetch (unspec:DI + [(match_operand: 0 "register_operand" "Upl") +- (match_operand:SVE_FULL_I 1 "aarch64_sve_prefetch_operand" "UP") ++ (match_operand:SVE_FULL 1 "aarch64_sve_prefetch_operand" "UP") + (match_operand:DI 2 "const_int_operand")] + UNSPEC_SVE_PREFETCH) + (match_operand:DI 3 "const_int_operand") +@@ -1973,14 +1973,14 @@ + ;; 6: the prefetch operator (an svprfop) + ;; 7: the normal RTL prefetch rw flag + ;; 8: the normal RTL prefetch locality value +-(define_insn "@aarch64_sve_gather_prefetch" ++(define_insn "@aarch64_sve_gather_prefetch" + [(prefetch (unspec:DI + [(match_operand:VNx4BI 0 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl") +- (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") ++ (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk") + (match_operand:VNx4SI_ONLY 2 "register_operand" "w, w, w, w, w, w") + (match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1") +- (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") +- (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i") ++ (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") + (match_operand:DI 6 "const_int_operand")] + UNSPEC_SVE_PREFETCH_GATHER) + (match_operand:DI 7 "const_int_operand") +@@ -1988,12 +1988,12 @@ + "TARGET_SVE" + { + static const char *const insns[][2] = { +- "prf", "%0, [%2.s]", +- "prf", "%0, [%2.s, #%1]", ++ "prf", "%0, [%2.s]", ++ "prf", "%0, [%2.s, #%1]", + "prfb", "%0, [%1, %2.s, sxtw]", + "prfb", "%0, [%1, %2.s, uxtw]", +- "prf", "%0, [%1, %2.s, sxtw %p4]", +- "prf", "%0, [%1, %2.s, uxtw %p4]" ++ "prf", "%0, [%1, %2.s, sxtw %p4]", ++ "prf", "%0, [%1, %2.s, uxtw %p4]" + }; + const char *const *parts = insns[which_alternative]; + return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); +@@ -2002,14 +2002,14 @@ + + ;; Predicated gather prefetches for 64-bit elements. The value of operand 3 + ;; doesn't matter in this case. 
+-(define_insn "@aarch64_sve_gather_prefetch" ++(define_insn "@aarch64_sve_gather_prefetch" + [(prefetch (unspec:DI + [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl, Upl, Upl") +- (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") ++ (match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk") + (match_operand:VNx2DI_ONLY 2 "register_operand" "w, w, w, w") + (match_operand:DI 3 "const_int_operand") +- (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") +- (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i") ++ (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") + (match_operand:DI 6 "const_int_operand")] + UNSPEC_SVE_PREFETCH_GATHER) + (match_operand:DI 7 "const_int_operand") +@@ -2017,10 +2017,10 @@ + "TARGET_SVE" + { + static const char *const insns[][2] = { +- "prf", "%0, [%2.d]", +- "prf", "%0, [%2.d, #%1]", ++ "prf", "%0, [%2.d]", ++ "prf", "%0, [%2.d, #%1]", + "prfb", "%0, [%1, %2.d]", +- "prf", "%0, [%1, %2.d, lsl %p4]" ++ "prf", "%0, [%1, %2.d, lsl %p4]" + }; + const char *const *parts = insns[which_alternative]; + return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); +@@ -2028,7 +2028,7 @@ + ) + + ;; Likewise, but with the offset being sign-extended from 32 bits. +-(define_insn_and_rewrite "*aarch64_sve_gather_prefetch_sxtw" ++(define_insn_and_rewrite "*aarch64_sve_gather_prefetch_sxtw" + [(prefetch (unspec:DI + [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl") + (match_operand:DI 1 "register_operand" "rk, rk") +@@ -2039,8 +2039,8 @@ + (match_operand:VNx2DI 2 "register_operand" "w, w")))] + UNSPEC_PRED_X) + (match_operand:DI 3 "const_int_operand") +- (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") +- (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") ++ (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") + (match_operand:DI 6 "const_int_operand")] + UNSPEC_SVE_PREFETCH_GATHER) + (match_operand:DI 7 "const_int_operand") +@@ -2049,7 +2049,7 @@ + { + static const char *const insns[][2] = { + "prfb", "%0, [%1, %2.d, sxtw]", +- "prf", "%0, [%1, %2.d, sxtw %p4]" ++ "prf", "%0, [%1, %2.d, sxtw %p4]" + }; + const char *const *parts = insns[which_alternative]; + return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); +@@ -2061,7 +2061,7 @@ + ) + + ;; Likewise, but with the offset being zero-extended from 32 bits. 
+-(define_insn "*aarch64_sve_gather_prefetch_uxtw" ++(define_insn "*aarch64_sve_gather_prefetch_uxtw" + [(prefetch (unspec:DI + [(match_operand:VNx2BI 0 "register_operand" "Upl, Upl") + (match_operand:DI 1 "register_operand" "rk, rk") +@@ -2069,8 +2069,8 @@ + (match_operand:VNx2DI 2 "register_operand" "w, w") + (match_operand:VNx2DI 9 "aarch64_sve_uxtw_immediate")) + (match_operand:DI 3 "const_int_operand") +- (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") +- (match_operand:SVE_FULL_I 5 "aarch64_simd_imm_zero") ++ (match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i") ++ (match_operand:SVE_FULL 5 "aarch64_simd_imm_zero") + (match_operand:DI 6 "const_int_operand")] + UNSPEC_SVE_PREFETCH_GATHER) + (match_operand:DI 7 "const_int_operand") +@@ -2079,7 +2079,7 @@ + { + static const char *const insns[][2] = { + "prfb", "%0, [%1, %2.d, uxtw]", +- "prf", "%0, [%1, %2.d, uxtw %p4]" ++ "prf", "%0, [%1, %2.d, uxtw %p4]" + }; + const char *const *parts = insns[which_alternative]; + return aarch64_output_sve_prefetch (parts[0], operands[6], parts[1]); +diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c +index dbdc6dffb..aa077ec0a 100644 +--- a/gcc/config/aarch64/aarch64.c ++++ b/gcc/config/aarch64/aarch64.c +@@ -2367,6 +2367,13 @@ aarch64_sve_data_mode_p (machine_mode mode) + return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA; + } + ++/* Return true if MODE is an full SVE data vector mode. */ ++static bool ++aarch64_full_sve_data_mode_p (machine_mode mode) ++{ ++ return aarch64_classify_vector_mode (mode) == VEC_SVE_DATA; ++} ++ + /* Return the number of defined bytes in one constituent vector of + SVE mode MODE, which has vector flags VEC_FLAGS. */ + static poly_int64 +@@ -24370,6 +24377,17 @@ aarch64_libgcc_floating_mode_supported_p + #undef TARGET_ASM_FUNCTION_EPILOGUE + #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks + ++#undef TARGET_VECTORIZE_CODE_FOR_PREFETCH ++#define TARGET_VECTORIZE_CODE_FOR_PREFETCH code_for_aarch64_sve_prefetch ++ ++#undef TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH ++#define TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH \ ++ code_for_aarch64_sve_gather_prefetch ++ ++#undef TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P ++#define TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P \ ++ aarch64_full_sve_data_mode_p ++ + struct gcc_target targetm = TARGET_INITIALIZER; + + #include "gt-aarch64.h" +diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi +index b46418d0b..ef3566510 100644 +--- a/gcc/doc/tm.texi ++++ b/gcc/doc/tm.texi +@@ -6122,6 +6122,27 @@ The default is @code{NULL_TREE} which means to not vectorize scatter + stores. + @end deftypefn + ++@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_PREFETCH (machine_mode @var{arg}) ++This hook should return the decl of a function that implements the ++vectorized variant of the function with the @code{combined_fn} code ++@var{code} or @code{NULL_TREE} if such a function is not available. ++The return type of the vectorized function shall be of vector type ++@var{vec_type_out} and the argument types should be @var{vec_type_in}. ++@end deftypefn ++ ++@deftypefn {Target Hook} insn_code TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH (machine_mode @var{mode_to}, machine_mode @var{mode_form}) ++This hook should return the decl of a function that implements the ++vectorized variant of the function with the @code{combined_fn} code ++@var{code} or @code{NULL_TREE} if such a function is not available. 
++The return type of the vectorized function shall be of vector type ++@var{vec_type_out} and the argument types should be @var{vec_type_in}. ++@end deftypefn ++ ++@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P (machine_mode @var{arg}) ++This hook should return true if the target hardware architecture ++supports a full SVE data vector mode. ++@end deftypefn ++ + @deftypefn {Target Hook} int TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN (struct cgraph_node *@var{}, struct cgraph_simd_clone *@var{}, @var{tree}, @var{int}) + This hook should set @var{vecsize_mangle}, @var{vecsize_int}, @var{vecsize_float} + fields in @var{simd_clone} structure pointed by @var{clone_info} argument and also +diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in +index 2663547c7..945d0f696 100644 +--- a/gcc/doc/tm.texi.in ++++ b/gcc/doc/tm.texi.in +@@ -4195,6 +4195,12 @@ address; but often a machine-dependent strategy can generate better code. + + @hook TARGET_VECTORIZE_BUILTIN_SCATTER + ++@hook TARGET_VECTORIZE_CODE_FOR_PREFETCH ++ ++@hook TARGET_VECTORIZE_CODE_FOR_GATHER_PREFETCH ++ ++@hook TARGET_VECTORIZE_PREFETCH_HANDLEABLE_MODE_P ++ + @hook TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN + + @hook TARGET_SIMD_CLONE_ADJUST +diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c +index 644f234e0..e8a3bb654 100644 +--- a/gcc/internal-fn.c ++++ b/gcc/internal-fn.c +@@ -102,10 +102,12 @@ init_internal_fns () + direct_internal_fn. */ + #define not_direct { -2, -2, false } + #define mask_load_direct { -1, 2, false } ++#define mask_prefetch_direct { -1, 2, false } + #define load_lanes_direct { -1, -1, false } + #define mask_load_lanes_direct { -1, -1, false } + #define gather_load_direct { 3, 1, false } + #define mask_store_direct { 3, 2, false } ++#define gather_prefetch_direct { 3, 1, false } + #define store_lanes_direct { 0, 0, false } + #define mask_store_lanes_direct { 0, 0, false } + #define vec_cond_mask_direct { 0, 0, false } +@@ -2520,6 +2522,53 @@ expand_mask_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab) + + #define expand_mask_load_lanes_optab_fn expand_mask_load_optab_fn + ++/* Expand MASK_PREFETCH call STMT using optab OPTAB. ++ .MASK_STORE (_5, 64B, loop_mask_98, vect__8.10_102); ++ .MASK_PREFETCH (_68, 64B, loop_mask_98, vect__8.10_102, 4); ++*/ ++ ++static void ++expand_mask_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab) ++{ ++ if (targetm.vectorize.code_for_prefetch == NULL ++ || targetm.vectorize.prefetch_handleable_mode_p == NULL) ++ return; ++ ++ tree base = gimple_call_arg (stmt, 0); ++ if (base == NULL_TREE) ++ return; ++ ++ tree maskt = gimple_call_arg (stmt, 2); ++ tree target = gimple_call_arg (stmt, 3); ++ tree prfop = gimple_call_arg (stmt, 4); ++ HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop); ++ /* Bit 3 of the prfop selects stores over loads. */ ++ HOST_WIDE_INT access = prfop_int & 8; ++ /* Bits 1 and 2 specify the locality; 0-based for svprfop but ++ 1-based for PREFETCH. */ ++ HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1; ++ ++ machine_mode m_mode = TYPE_MODE (TREE_TYPE (target)); ++ if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode)) ++ return; ++ insn_code icode = targetm.vectorize.code_for_prefetch (m_mode); ++ ++ rtx mask = expand_normal (maskt); ++ rtx base_rtx = expand_normal (base); ++ /* Convert ptr_mode value X to Pmode. 
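++ E.g. on an ILP32 ABI ptr_mode can be SImode while Pmode stays DImode;
++ the zero-extension below covers that case (illustrative note).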
*/ ++ if (ptr_mode == SImode) ++ base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode); ++ ++ unsigned i = 0; ++ class expand_operand ops[5]; ++ create_input_operand (&ops[i++], mask, TYPE_MODE (TREE_TYPE (maskt))); ++ create_address_operand (&ops[i++], base_rtx); ++ create_integer_operand (&ops[i++], prfop_int); ++ create_integer_operand (&ops[i++], access); ++ create_integer_operand (&ops[i++], locality); ++ expand_insn (icode, i, ops); ++} ++ + /* Expand MASK_STORE{,_LANES} call STMT using optab OPTAB. */ + + static void +@@ -2920,6 +2969,70 @@ expand_gather_load_optab_fn (internal_fn, gcall *stmt, direct_optab optab) + emit_move_insn (lhs_rtx, ops[0].value); + } + ++/* Expand {MASK_,}GATHER_PREFETCH call CALL using optab OPTAB. ++ vect_patt_97.14_77 = .MASK_GATHER_LOAD (_78, vect__14.13_79, 8, { 0.0, ... }, loop_mask_87); ++ .MASK_GATHER_PREFETCH (_45, vect__14.13_79, 8, { 0.0, ... }, loop_mask_87, vect_patt_97.14_77, 4); ++*/ ++ ++static void ++expand_gather_prefetch_optab_fn (internal_fn, gcall *stmt, direct_optab optab) ++{ ++ if (targetm.vectorize.code_for_gather_prefetch == NULL ++ || targetm.vectorize.prefetch_handleable_mode_p == NULL) ++ return; ++ ++ /* Extracting tree nodes, only expand for scalar base and vector index. */ ++ tree base = gimple_call_arg (stmt, 0); ++ if (VECTOR_TYPE_P (TREE_TYPE (base))) ++ return; ++ tree offset = gimple_call_arg (stmt, 1); ++ if (VECTOR_TYPE_P (TREE_TYPE (offset)) == false) ++ return; ++ ++ tree scale = gimple_call_arg (stmt, 2); ++ tree mask = gimple_call_arg (stmt, 4); ++ tree target = gimple_call_arg (stmt, 5); ++ tree prfop = gimple_call_arg (stmt, 6); ++ ++ /* Convert to the rtx node. */ ++ rtx base_rtx = expand_normal (base); ++ /* Convert ptr_mode value X to Pmode. */ ++ if (ptr_mode == SImode) ++ base_rtx = simplify_gen_unary (ZERO_EXTEND, DImode, base_rtx, SImode); ++ rtx offset_rtx = expand_normal (offset); ++ rtx const_rtx = CONST0_RTX (TYPE_MODE (TREE_TYPE (target))); ++ rtx mask_rtx = expand_normal (mask); ++ HOST_WIDE_INT scale_int = tree_to_shwi (scale); ++ HOST_WIDE_INT prfop_int = tree_to_uhwi (prfop); ++ /* Bit 3 of the prfop selects stores over loads. */ ++ HOST_WIDE_INT access = prfop_int & 8; ++ /* Bits 1 and 2 specify the locality; 0-based for svprfop but ++ 1-based for PREFETCH. */ ++ HOST_WIDE_INT locality = ((prfop_int >> 1) & 3) + 1; ++ ++ /* add operand. */ ++ unsigned int i = 0; ++ class expand_operand ops[9]; ++ create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask))); ++ create_address_operand (&ops[i++], base_rtx); ++ create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset))); ++ /* Check whether the index has unsigned. */ ++ create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset))); ++ create_integer_operand (&ops[i++], scale_int); ++ create_input_operand (&ops[i++], const_rtx, GET_MODE (const_rtx)); ++ create_integer_operand (&ops[i++], prfop_int); ++ create_integer_operand (&ops[i++], access); ++ create_integer_operand (&ops[i++], locality); ++ ++ machine_mode reg_mode = GET_MODE (offset_rtx); ++ machine_mode m_mode = TYPE_MODE (TREE_TYPE (target)); ++ if (!targetm.vectorize.prefetch_handleable_mode_p (m_mode)) ++ return; ++ insn_code icode = targetm.vectorize.code_for_gather_prefetch ++ (m_mode, reg_mode); ++ expand_insn (icode, i, ops); ++} ++ + /* Expand DIVMOD() using: + a) optab handler for udivmod/sdivmod if it is available. 
+ b) If optab_handler doesn't exist, generate call to +@@ -3210,9 +3323,11 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, + #define direct_cond_binary_optab_supported_p direct_optab_supported_p + #define direct_cond_ternary_optab_supported_p direct_optab_supported_p + #define direct_mask_load_optab_supported_p direct_optab_supported_p ++#define direct_mask_prefetch_optab_supported_p direct_optab_supported_p + #define direct_load_lanes_optab_supported_p multi_vector_optab_supported_p + #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p + #define direct_gather_load_optab_supported_p convert_optab_supported_p ++#define direct_gather_prefetch_optab_supported_p direct_optab_supported_p + #define direct_mask_store_optab_supported_p direct_optab_supported_p + #define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p + #define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p +diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def +index 0c6fc3711..cc0f42b98 100644 +--- a/gcc/internal-fn.def ++++ b/gcc/internal-fn.def +@@ -119,6 +119,8 @@ along with GCC; see the file COPYING3. If not see + #endif + + DEF_INTERNAL_OPTAB_FN (MASK_LOAD, ECF_PURE, maskload, mask_load) ++DEF_INTERNAL_OPTAB_FN (MASK_PREFETCH, ECF_NOVOPS | ECF_LEAF, ++ maskprefetch, mask_prefetch) + DEF_INTERNAL_OPTAB_FN (LOAD_LANES, ECF_CONST, vec_load_lanes, load_lanes) + DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, + vec_mask_load_lanes, mask_load_lanes) +@@ -126,6 +128,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, + DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load) + DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE, + mask_gather_load, gather_load) ++DEF_INTERNAL_OPTAB_FN (MASK_GATHER_PREFETCH, ECF_NOVOPS | ECF_LEAF, ++ mask_gather_prefetch, gather_prefetch) + + DEF_INTERNAL_OPTAB_FN (SCATTER_STORE, 0, scatter_store, scatter_store) + DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0, +diff --git a/gcc/optabs.def b/gcc/optabs.def +index 0c64eb52a..ee25bc3f7 100644 +--- a/gcc/optabs.def ++++ b/gcc/optabs.def +@@ -90,9 +90,11 @@ OPTAB_CD(vec_cmp_optab, "vec_cmp$a$b") + OPTAB_CD(vec_cmpu_optab, "vec_cmpu$a$b") + OPTAB_CD(vec_cmpeq_optab, "vec_cmpeq$a$b") + OPTAB_CD(maskload_optab, "maskload$a$b") ++OPTAB_CD(maskprefetch_optab, "maskprefetch$a$b") + OPTAB_CD(maskstore_optab, "maskstore$a$b") + OPTAB_CD(gather_load_optab, "gather_load$a$b") + OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b") ++OPTAB_CD(mask_gather_prefetch_optab, "mask_gather_prefetch$a$b") + OPTAB_CD(scatter_store_optab, "scatter_store$a$b") + OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b") + OPTAB_CD(vec_extract_optab, "vec_extract$a$b") +diff --git a/gcc/params.opt b/gcc/params.opt +index 2044524a3..c429359e3 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -1005,4 +1005,57 @@ Target size of compressed pointer, which should be 8, 16 or 32. + Common Joined UInteger Var(semi_relayout_level) Init(13) IntegerRange(11, 15) Param Optimization + Set capacity of each bucket to semi-relayout to (1 << semi-relayout-level) / 8 . + ++-param=mem-access-ratio= ++Common Joined UInteger Var(param_mem_access_ratio) Init(20) IntegerRange(0, 100) Param Optimization ++Memory access ratio (in percent). ++ ++-param=mem-access-num= ++Common Joined UInteger Var(param_mem_access_num) Init(3) Param Optimization ++Memory access num. 
++ ++-param=prefetch-offset= ++Common Joined UInteger Var(param_prefetch_offset) Init(1024) ++IntegerRange(1, 999999) Param Optimization ++Prefetch Offset, which is usually a power of two due to cache line size. ++ ++-param=branch-prob-threshold= ++Common Joined UInteger Var(param_branch_prob_threshold) Init(80) IntegerRange(50, 100) ++Param Optimization ++High Execution Rate Branch Threshold. ++ ++-param=issue-topn= ++Common Joined UInteger Var(param_issue_topn) Init(1) Param Optimization ++Issue topn LLC mem_ref hint. ++ ++-param=force-issue= ++Common Joined UInteger Var(param_force_issue) Init(0) IntegerRange(0, 1) Param ++Force issue the topn LLC mem_ref hint, without generating dynamic multi-branches. ++ ++-param=llc-capacity-per-core= ++Common Joined UInteger Var(param_llc_capacity_per_core) Init(114) IntegerRange(0, 999999) Param ++LLC capacity per core. ++ ++-param=target-variables= ++Common Joined Var(param_target_variables) Init("") Param Optimization ++--param=target-variables=[,,...] Target variables for prefetching, separated by comma, ++without space. The representation of a variable can be complex and containing space, please surround ++it by quotation marks and escape special characters in Linux. The input length should be no more ++than 512 characters. ++ ++-param=use-ref-group-index= ++Common Joined UInteger Var(param_use_ref_group_index) Init(0) IntegerRange(0, 1) Param Optimization ++Prefetch the target variables by their indices in sorted ref_groups, use together with parameter ++target-variables. ++ ++-param=mem-ref-index= ++Common Joined Var(param_mem_ref_index) Init("") Param Optimization ++--param=mem-ref-index=[,,...] Prefetch the target variable at the memory reference ++location with the index of customized order, separated by comma, without space. The input length ++should be no more than 512 characters. ++ ++-param=filter-kernels= ++Common Joined UInteger Var(param_filter_kernels) Init(1) IntegerRange(0, 1) Param ++Allow LLC allocate pass to greedily filter kernels by traversing the corresponding basic blocks ++through edges with branch probability no less than param_branch_prob_threshold. ++ + ; This comment is to ensure we retain the blank line above. +diff --git a/gcc/passes.def b/gcc/passes.def +index df7d65733..ea59fc8ca 100644 +--- a/gcc/passes.def ++++ b/gcc/passes.def +@@ -303,6 +303,7 @@ along with GCC; see the file COPYING3. If not see + /* Run IVOPTs after the last pass that uses data-reference analysis + as that doesn't handle TARGET_MEM_REFs. */ + NEXT_PASS (pass_iv_optimize); ++ NEXT_PASS (pass_llc_allocate); + NEXT_PASS (pass_lim); + NEXT_PASS (pass_tree_loop_done); + POP_INSERT_PASSES () +diff --git a/gcc/target.def b/gcc/target.def +index 34d3561bd..351c94c37 100644 +--- a/gcc/target.def ++++ b/gcc/target.def +@@ -2072,6 +2072,37 @@ DEFHOOK + (void *data), + default_destroy_cost_data) + ++/* Function for vector prefetch operation. */ ++DEFHOOK ++(code_for_prefetch, ++ "This hook should return the decl of a function that implements the\n\ ++vectorized variant of the function with the @code{combined_fn} code\n\ ++@var{code} or @code{NULL_TREE} if such a function is not available.\n\ ++The return type of the vectorized function shall be of vector type\n\ ++@var{vec_type_out} and the argument types should be @var{vec_type_in}.", ++ insn_code, (machine_mode arg), ++ NULL) ++ ++/* Function for vector gather prefetch operation. 
*/ ++DEFHOOK ++(code_for_gather_prefetch, ++ "This hook should return the decl of a function that implements the\n\ ++vectorized variant of the function with the @code{combined_fn} code\n\ ++@var{code} or @code{NULL_TREE} if such a function is not available.\n\ ++The return type of the vectorized function shall be of vector type\n\ ++@var{vec_type_out} and the argument types should be @var{vec_type_in}.", ++ insn_code, (machine_mode mode_to, machine_mode mode_form), ++ NULL) ++ ++/* Function to check whether the target hardware architecture supports ++ a full SVE data vector mode. */ ++DEFHOOK ++(prefetch_handleable_mode_p, ++ "This hook should return true if the target hardware architecture\n\ ++supports a full SVE data vector mode.", ++ bool, (machine_mode arg), ++ NULL) ++ + HOOK_VECTOR_END (vectorize) + + #undef HOOK_PREFIX +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +new file mode 100644 +index 000000000..a4828eaab +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-1.c +@@ -0,0 +1,61 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param issue-topn=2" } */ ++ ++#include ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++#define N 100000 ++ ++int A_i[N]; ++int A_j[N]; ++double A_data[N]; ++double x_data[N]; ++double y_data[N]; ++int num_rows = N; ++ ++void ++MatMult (int *A_i, int *A_j, double *A_data, double *x_data, ++ int num_rows, double *y_data) ++{ ++ int i = 0; ++ int j = 0; ++ double temp = 0; ++ for (i = 0; i < num_rows; i++) ++ { ++ temp = y_data[i]; ++ for (j = A_i[i]; j < A_i[i+1]; j++) ++ temp += A_data[j] * x_data[A_j[j]]; ++ y_data[i] = temp; ++ } ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int testIter = 2; ++ ++ for (int i = 0; i < testIter; i++) ++ MatMult (A_i, A_j, A_data, x_data, num_rows, y_data); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "Tracing succeeded" 6 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 4 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times ", size: 0\.000000" 6 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d x_data \\(0.000000, 1, 0\\) : 3" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d A_j \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\\d A_data \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "insert svprfd_gather" 2 "llc_allocate" } } */ +diff --git 
a/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..4f34e722f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,27 @@ ++# Copyright (C) 2022-2023 Free Software Foundation, Inc. ++ ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . ++ ++load_lib gcc-dg.exp ++load_lib target-supports.exp ++ ++# Initialize `dg'. ++dg-init ++ ++dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \ ++ "" "-fllc-allocate" ++ ++# All done. ++dg-finish +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c +new file mode 100644 +index 000000000..2a58c501f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c +@@ -0,0 +1,48 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param target-variables=uPtr" } */ ++ ++#include ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++typedef struct stack_def ++{ ++ int top; /* index to top stack element */ ++ unsigned long reg_set; /* set of live registers */ ++ unsigned char reg[128]; /* register - stack mapping */ ++} *stack; ++ ++typedef struct block_info_def ++{ ++ struct stack_def stack_in; /* Input stack configuration. */ ++ struct stack_def stack_out; /* Output stack configuration. */ ++ unsigned long out_reg_set; /* Stack regs live on output. */ ++ int done; /* True if block already converted. */ ++ int predecessors; /* Number of predecessors that need ++ to be visited. 
*/ ++} *block_info; ++ ++typedef struct basic_block_def ++{ ++ void *aux; ++} *basic_block; ++ ++unsigned char ++convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) ++{ ++ stack output_stack; ++ ++ output_stack = &(((block_info) bb->aux)->stack_in); ++ if (value_reg_low == -1) ++ output_stack->top = -1; ++ else ++ { ++ int reg; ++ output_stack->top = value_reg_high - value_reg_low; ++ for (reg = value_reg_low; reg <= value_reg_high; ++reg) ++ { ++ (output_stack->reg + 16)[value_reg_high - reg] = reg; ++ output_stack->reg_set |= (unsigned long) 1 << reg; ++ } ++ } ++ return output_stack->reg[0]; ++} ++ ++/* { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +new file mode 100644 +index 000000000..27cd574cf +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-ref-trace.c +@@ -0,0 +1,62 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0" } */ ++ ++#include ++#include ++ ++#define N 1000 ++ ++long a[N] = {0}; ++long b[N] = {0}; ++long c[N] = {0}; ++ ++double ++referenceTrace (double *psiPtr, int *lPtr, int *uPtr, int nCells) ++{ ++ double sum; ++ for (int cell = 0; cell < nCells; cell++) ++ { ++ // Multi-layer pointer ++ sum += psiPtr[lPtr[cell]]; ++ psiPtr[uPtr[cell]] = sum; ++ ++ // Outer pointer, inner array ++ sum += psiPtr[b[cell]]; ++ psiPtr[a[cell]] = sum; ++ ++ // Multi-layer array, currently failed tracing at b[cell] and a[cell] ++ sum += a[b[cell]]; ++ c[a[cell]] = sum; ++ ++ // Outer array, inner pointer, currently failed tracing at lPtr[cell] ++ sum += a[lPtr[cell]]; ++ c[lPtr[cell]] = sum; ++ } ++ return sum; ++} ++ ++int ++main (int argc, char *argv[]) ++{ ++ int testIter = 2; ++ ++ double *psiPtr = NULL; ++ int *lPtr = NULL; ++ int *uPtr = NULL; ++ psiPtr = (double *) calloc (N, sizeof(double)); ++ lPtr = (int *) calloc (N, sizeof(int)); ++ uPtr = (int *) calloc (N, sizeof(int)); ++ ++ for (int i = 0; i < testIter; i++) ++ referenceTrace (psiPtr, lPtr, uPtr, N); ++ ++ free (psiPtr); ++ free (lPtr); ++ free (uPtr); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "Tracing succeeded" 16 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "Tracing failed" 8 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "unhandled issue scene" 2 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c +new file mode 100644 +index 000000000..276781c4f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c +@@ -0,0 +1,48 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param target-variables=lPtr" } */ ++ ++#include ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double 
psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cell ++ ++#define N 131590 ++#define F 384477 ++ ++double diagPtr[N]; ++double psiPtr[N]; ++double ApsiPtr[N]; ++int lPtr[F]; ++int uPtr[F]; ++double lowerPtr[F]; ++double upperPtr[F]; ++ ++void ++AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, ++ int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) ++{ ++ for (int cell=0; cellaux\"" } */ ++ ++#include ++ ++typedef struct stack_def ++{ ++ int top; /* index to top stack element */ ++ unsigned long reg_set; /* set of live registers */ ++ unsigned char reg[128]; /* register - stack mapping */ ++} *stack; ++ ++typedef struct block_info_def ++{ ++ struct stack_def stack_in; /* Input stack configuration. */ ++ struct stack_def stack_out; /* Output stack configuration. */ ++ unsigned long out_reg_set; /* Stack regs live on output. */ ++ int done; /* True if block already converted. */ ++ int predecessors; /* Number of predecessors that need ++ to be visited. 
*/ ++} *block_info; ++ ++typedef struct basic_block_def ++{ ++ void *aux; ++} *basic_block; ++ ++unsigned char ++convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) ++{ ++ stack output_stack; ++ ++ output_stack = &(((block_info) bb->aux)->stack_in); ++ if (value_reg_low == -1) ++ output_stack->top = -1; ++ else ++ { ++ int reg; ++ output_stack->top = value_reg_high - value_reg_low; ++ for (reg = value_reg_low; reg <= value_reg_high; ++reg) ++ { ++ (output_stack->reg + 16)[value_reg_high - reg] = reg; ++ output_stack->reg_set |= (unsigned long) 1 << reg; ++ } ++ } ++ return output_stack->reg[0]; ++} ++ ++/* { dg-final { scan-tree-dump-not "Unrecognizable variable name" ++ "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "static issue" "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c +new file mode 100644 +index 000000000..09a525ce1 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c +@@ -0,0 +1,54 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -c -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0 --param target-variables=tmp_var_0" } */ ++ ++#include ++ ++typedef struct stack_def ++{ ++ int top; /* index to top stack element */ ++ unsigned long reg_set; /* set of live registers */ ++ unsigned char reg[128]; /* register - stack mapping */ ++} *stack; ++ ++typedef struct block_info_def ++{ ++ struct stack_def stack_in; /* Input stack configuration. */ ++ struct stack_def stack_out; /* Output stack configuration. */ ++ unsigned long out_reg_set; /* Stack regs live on output. */ ++ int done; /* True if block already converted. */ ++ int predecessors; /* Number of predecessors that need ++ to be visited. */ ++} *block_info; ++ ++typedef struct basic_block_def ++{ ++ void *aux; ++} *basic_block; ++ ++unsigned char ++convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) ++{ ++ stack output_stack; ++ ++ output_stack = &(((block_info) bb->aux)->stack_in); ++ if (value_reg_low == -1) ++ output_stack->top = -1; ++ else ++ { ++ int reg; ++ output_stack->top = value_reg_high - value_reg_low; ++ for (reg = value_reg_low; reg <= value_reg_high; ++reg) ++ { ++ (output_stack->reg + 16)[value_reg_high - reg] = reg; ++ output_stack->reg_set |= (unsigned long) 1 << reg; ++ } ++ } ++ return output_stack->reg[0]; ++} ++ ++/* { dg-final { scan-tree-dump-not "Unrecognizable variable name" ++ "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "NOTICE: Prefetching target variable \"" ++ " bb_16(D)->aux \"" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +new file mode 100644 +index 000000000..ec918e144 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-3.f90 +@@ -0,0 +1,213 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! 
{ dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50" } ++ ++program main ++ ++ IMPLICIT NONE ++ INTEGER :: ids,ide, jds,jde, kds,kde ++ INTEGER,parameter :: ims=-4,kms=1,jms=-4 ++ INTEGER,parameter :: ime=210,kme=36,jme=192 ++ INTEGER :: its,ite, jts,jte, kts,kte ++ INTEGER :: number_of_small_timesteps,rk_step, rk_order, step ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme) :: t_1, t_2, c2a, p, ph, pm1, al, alt ++ ++ ++ REAL, DIMENSION(ims:ime, jms:jme) :: mu, muts ++ ++ REAL, DIMENSION(kms:kme) :: dnw, rdnw, znu ++ ++ REAL :: rdx,rdy ++ REAL :: dts, t0, smdiv ++ REAL :: random1,time_begin,time_end,total_time ++ ++ INTEGER :: i, j, k ++ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end ++ INTEGER :: i_endu, j_endv ++ INTEGER :: interval=1 ++ INTEGER :: epoch,iter ++ ++ LOGICAL :: non_hydrostatic ++ ++ data ids, jds, kds, its, jts, kts /6*1/ ++ data ide, ite /2*205/ ++ data jde, jte /2*187/ ++ data kde, kte /2*36/ ++ ++ number_of_small_timesteps = 1 ++ rk_step = 3 ++ rk_order = 1 ++ dts = 1. ++ ++ rdx = 1. ++ rdy = 1. ++ ++ t0 = 0. ++ smdiv = 1. ++ step = 1 ++ non_hydrostatic = .true. ++ ++ call random_number(random1) ++ interval = random1*100 ++ interval=1 ++ ++ call random_seed(put=(/(i,i=1,10000,interval)/)) ++ ++ call random_number(alt) ++ call random_number(c2a) ++ call random_number(ph) ++ call random_number(pm1) ++ call random_number(mu) ++ call random_number(muts) ++ call random_number(dnw) ++ call random_number(rdnw) ++ call random_number(znu) ++ ++ do iter=1,2 ++ call calc_p_rho( al, p, ph, & ++ alt, t_2, t_1, c2a, pm1, & ++ mu, muts, znu, t0, & ++ rdnw, dnw, smdiv, & ++ non_hydrostatic, step, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its,ite, jts,jte, kts,kte ) ++ ++ enddo ++ ++end program ++ ++ ++SUBROUTINE calc_p_rho( al, p, ph, & ++ alt, t_2, t_1, c2a, pm1, & ++ mu, muts, znu, t0, & ++ rdnw, dnw, smdiv, & ++ non_hydrostatic, step, & ++ ids, ide, jds, jde, kds, kde, & ++ ims, ime, jms, jme, kms, kme, & ++ its,ite, jts,jte, kts,kte ) ++ ++ IMPLICIT NONE ! religion first ++ !asb ++! declarations for the stuff coming in ++ ++ INTEGER, INTENT(IN ) :: ids,ide, jds,jde, kds,kde ++ INTEGER, INTENT(IN ) :: ims,ime, jms,jme, kms,kme ++ INTEGER, INTENT(IN ) :: its,ite, jts,jte, kts,kte ++ ++ INTEGER, INTENT(IN ) :: step ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT( OUT) :: al, & ++ p ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(IN ) :: alt, & ++ t_2, & ++ t_1, & ++ c2a ++ ++ REAL, DIMENSION(ims:ime, kms:kme, jms:jme),INTENT(INOUT) :: ph, pm1 ++ ++ REAL, DIMENSION(ims:ime, jms:jme) , INTENT(IN ) :: mu, & ++ muts ++ ++ REAL, DIMENSION(kms:kme) , INTENT(IN ) :: dnw, & ++ rdnw, & ++ znu ++ ++ REAL, INTENT(IN ) :: t0, smdiv ++ ++ LOGICAL, INTENT(IN ) :: non_hydrostatic ++ ++! local variables ++ ++ INTEGER :: i, j, k ++ INTEGER :: i_start, i_end, j_start, j_end, k_start, k_end ++ REAL :: ptmp ++ ++ i_start = its ++ i_end = min(ite,ide-1) ++ j_start = jts ++ j_end = min(jte,jde-1) ++ k_start = kts ++ k_end = min(kte,kde-1) ++ ++ IF (non_hydrostatic) THEN ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ ++! al computation is all dry, so ok with moisture ++ ++ al(i,k,j)=-1./muts(i,j)*(alt(i,k,j)*mu(i,j) & ++ +rdnw(k)*(ph(i,k+1,j)-ph(i,k,j))) ++ ++! 
this is temporally linearized p, no moisture correction needed ++ ++ p(i,k,j)=c2a(i,k,j)*(alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) & ++ /(muts(i,j)*(t0+t_1(i,k,j)))-al (i,k,j)) ++ ++ ENDDO ++ ENDDO ++ ENDDO ++ ++ ELSE ! hydrostatic calculation ++ ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ p(i,k,j)=mu(i,j)*znu(k) ++ al(i,k,j)=alt(i,k,j)*(t_2(i,k,j)-mu(i,j)*t_1(i,k,j)) & ++ /(muts(i,j)*(t0+t_1(i,k,j)))-p(i,k,j)/c2a(i,k,j) ++ ph(i,k+1,j)=ph(i,k,j)-dnw(k)*(muts(i,j)*al (i,k,j) & ++ +mu(i,j)*alt(i,k,j)) ++ ENDDO ++ ENDDO ++ ENDDO ++ ++ END IF ++ ++! divergence damping setup ++ ++ IF (step == 0) then ! we're initializing small timesteps ++ DO j=j_start, j_end ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ pm1(i,k,j)=p(i,k,j) ++ ENDDO ++ ENDDO ++ ENDDO ++ ELSE ! we're in the small timesteps ++ DO j=j_start, j_end ! and adding div damping component ++ DO k=k_start, k_end ++ DO i=i_start, i_end ++ ptmp = p(i,k,j) ++ p(i,k,j) = p(i,k,j) + smdiv*(p(i,k,j)-pm1(i,k,j)) ++ pm1(i,k,j) = ptmp ++ ENDDO ++ ENDDO ++ ENDDO ++ END IF ++ ++END SUBROUTINE calc_p_rho ++ ++! { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 6 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "Tracing succeeded" 48 "llc_allocate" } } ++! { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } ++! { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){2}\}" 3 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){4}\}" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } ++! { dg-final { scan-tree-dump-times ", size: 0\.000000" 28 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d p \\(0.000000, 3, 0\\) : 8" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d pm1 \\(0.000000, 2, 0\\) : 5" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d ph \\(0.000000, 2, 0\\) : 4" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d al \\(0.000000, 1, 0\\) : 3" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d alt \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d t_1 \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d t_2 \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d c2a \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d mu \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "\\d muts \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "runtime issue" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "static issue" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "insert svprfd" 2 "llc_allocate" } } +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..068341784 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,29 @@ ++# Copyright (C) 2022-2023 Free Software Foundation, Inc. 
++ ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . ++ ++# GCC testsuite that uses the `dg.exp' driver. ++ ++load_lib gfortran-dg.exp ++ ++# Initialize `dg'. ++dg-init ++ ++# Main loop. ++gfortran-dg-runtest [lsort \ ++ [glob -nocomplain $srcdir/$subdir/*.\[fF\]{,90,95,03,08} ] ] "" "" ++ ++# All done. ++dg-finish +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 +new file mode 100644 +index 000000000..23e360540 +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-trace-multiple-base-var.f90 +@@ -0,0 +1,63 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno" } ++ ++MODULE INPUT ++ IMPLICIT NONE ++ ++ INTEGER, PARAMETER :: wp = 8, jpi = 25, jpj = 39, jpk = 31, kjpt = 2 ++ ++ INTEGER :: kt = 1, jpkm1 = 30, jpjm1 = 38, fs_jpim1 = 24, fs_2 = 2 ++ REAL(wp), DIMENSION(jpi, jpj) :: e12t ++ REAL(wp), DIMENSION(jpi, jpj, jpk) :: fse3t_n ++ REAL(wp), DIMENSION(jpi, jpj, jpk, kjpt) :: pta ++ ++END MODULE INPUT ++ ++PROGRAM MAIN ++ USE INPUT ++ ++ IMPLICIT NONE ++ ++ INTEGER :: EPOCH ++ ++! Initialize arrays ++ ++ e12t = 1 ++ fse3t_n = 1 ++ pta = 1 ++! ++ ++ DO EPOCH=1,2 ++ CALL tra_ldf_iso ++ ENDDO ++ ++END PROGRAM MAIN ++ ++SUBROUTINE tra_ldf_iso ++ USE INPUT ++ ++ IMPLICIT NONE ++ ! ++ INTEGER :: ji, jj, jk, jn ! dummy loop indices ++ REAL(wp) :: zbtr, ztra ! - - ++ REAL(wp), DIMENSION(jpi, jpj, jpk) :: ztfw ++ ++ DO jn = 1, kjpt ++ ztfw(:, :, 1) = 0.e0; ztfw(:, :, jpk) = 0.e0 ++ ++ DO jk = 1, jpkm1 ++ DO jj = 2, jpjm1 ++ DO ji = fs_2, fs_jpim1 ! vector opt. ++ zbtr = 1.0/(e12t(ji, jj)*fse3t_n(ji, jj, jk)) ++ ztra = (ztfw(ji, jj, jk) - ztfw(ji, jj, jk + 1))*zbtr ++ pta(ji, jj, jk, jn) = pta(ji, jj, jk, jn) + ztra ++ END DO ++ END DO ++ END DO ++ ! ++ END DO ++ ! ++END SUBROUTINE tra_ldf_iso ++ ++! { dg-final { scan-tree-dump-times "Traced variables at vectp_ztfw" 2 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "Tracing unusual number or occurrences of base variables. Choose ztfw." 2 "llc_allocate" } } +diff --git a/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 +new file mode 100644 +index 000000000..d76c75b5b +--- /dev/null ++++ b/gcc/testsuite/gfortran.dg/llc-allocate/llc-unknown-type-size-unit.f90 +@@ -0,0 +1,58 @@ ++! { dg-do compile { target { aarch64*-*-linux* } } } ++! 
{ dg-options "-c -O3 -march=armv8.2-a+sve -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0 --param issue-topn=1 --param mem-access-ratio=5 --param mem-access-num=1" } ++ ++Module module_domain ++ IMPLICIT NONE ++ ++ REAL, PARAMETER :: g = 9.8 ++ TYPE :: grid_type ++ REAL, POINTER :: phb(:,:,:), ph_2(:,:,:), p(:,:,:), pb(:,:,:) ++ REAL, POINTER :: fnm(:), fnp(:) ++ END TYPE ++END Module ++ ++SUBROUTINE calc_p8w(p8w, ix, iy, k_start, k_end) ++ ++ USE module_domain ++ !USE module_model_constants ++ ++ IMPLICIT NONE ++ ++ ++ !TYPE (domain), INTENT(IN) :: grid ++ INTEGER, INTENT(IN) :: k_start, k_end, ix, iy ++ REAL, DIMENSION(k_start:k_end), INTENT(OUT) :: p8w ++ ++ ++ INTEGER :: k ++ REAL :: z0, z1, z2, w1, w2 ++ REAL, DIMENSION(k_start:k_end) :: z_at_w ++ REAL, DIMENSION(k_start:k_end-1) :: z ++ TYPE (grid_type), POINTER :: grid ++ ++ ++ DO k = k_start, k_end ++ z_at_w(k) = (grid%phb(ix,k,iy)+grid%ph_2(ix,k,iy))/g ++ END DO ++ ++ DO k = k_start, k_end-1 ++ z(k) = 0.5*(z_at_w(k) + z_at_w(k+1)) ++ END DO ++ ++ DO k = k_start+1, k_end-1 ++ p8w(k) = grid%fnm(k)*(grid%p(ix,k,iy)+grid%pb(ix,k,iy)) + & ++ grid%fnp(k)*(grid%p(ix,k-1,iy)+grid%pb(ix,k-1,iy)) ++ END DO ++ ++ z0 = z_at_w(k_start) ++ z1 = z(k_start) ++ z2 = z(k_start+1) ++ w1 = (z0 - z2)/(z1 - z2) ++ w2 = 1. - w1 ++ p8w(k_start) = w1*(grid%p(ix,k_start,iy)+grid%pb(ix,k_start,iy)) + & ++ w2*(grid%p(ix,k_start+1,iy)+grid%pb(ix,k_start+1,iy)) ++ ++END SUBROUTINE calc_p8w ++ ++! { dg-final { scan-tree-dump-times "runtime issue" 1 "llc_allocate" } } ++! { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } +diff --git a/gcc/timevar.def b/gcc/timevar.def +index ba86a1b7b..4b643538f 100644 +--- a/gcc/timevar.def ++++ b/gcc/timevar.def +@@ -207,6 +207,7 @@ DEFTIMEVAR (TV_TREE_LOOP_DISTRIBUTION, "tree loop distribution") + DEFTIMEVAR (TV_CHECK_DATA_DEPS , "tree check data dependences") + DEFTIMEVAR (TV_TREE_PREFETCH , "tree prefetching") + DEFTIMEVAR (TV_TREE_LOOP_IVOPTS , "tree iv optimization") ++DEFTIMEVAR (TV_TREE_LLC_ALLOCATE , "tree llc allocation") + DEFTIMEVAR (TV_PREDCOM , "predictive commoning") + DEFTIMEVAR (TV_TREE_CH , "tree copy headers") + DEFTIMEVAR (TV_TREE_SSA_UNCPROP , "tree SSA uncprop") +diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c +index d82fe23d8..9eb173d69 100644 +--- a/gcc/tree-cfg.c ++++ b/gcc/tree-cfg.c +@@ -8365,6 +8365,17 @@ print_loops (FILE *file, int verbosity) + print_loop_and_siblings (file, bb->loop_father, 0, verbosity); + } + ++/* Dump a loop to file. */ ++ ++void ++loop_dump (FILE *file, class loop *loop) ++{ ++ print_loop (file, loop, 0, 0); ++ fprintf (file, "vec_niter = "); ++ print_generic_expr (file, loop->vec_nb_iterations); ++ fprintf (file, "\n"); ++} ++ + /* Dump a loop. 
*/ + + DEBUG_FUNCTION void +diff --git a/gcc/tree-cfg.h b/gcc/tree-cfg.h +index beb4997a6..dad0ca0a6 100644 +--- a/gcc/tree-cfg.h ++++ b/gcc/tree-cfg.h +@@ -83,6 +83,7 @@ extern void dump_function_to_file (tree, FILE *, dump_flags_t); + extern void debug_function (tree, dump_flags_t); + extern void print_loops_bb (FILE *, basic_block, int, int); + extern void print_loops (FILE *, int); ++extern void loop_dump (FILE *file, class loop *loop); + extern void debug (class loop &ref); + extern void debug (class loop *ptr); + extern void debug_verbose (class loop &ref); +diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h +index 027f8992d..a1e215901 100644 +--- a/gcc/tree-pass.h ++++ b/gcc/tree-pass.h +@@ -383,6 +383,7 @@ extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt); ++extern gimple_opt_pass *make_pass_llc_allocate (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_tree_loop_done (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_ch (gcc::context *ctxt); + extern gimple_opt_pass *make_pass_ch_vect (gcc::context *ctxt); +diff --git a/gcc/tree-scalar-evolution.c b/gcc/tree-scalar-evolution.c +index edab77827..73ffa0759 100644 +--- a/gcc/tree-scalar-evolution.c ++++ b/gcc/tree-scalar-evolution.c +@@ -2789,7 +2789,7 @@ resolve_mixers (class loop *loop, tree chrec, bool *folded_casts) + the loop body has been executed 6 times. */ + + tree +-number_of_latch_executions (class loop *loop) ++number_of_latch_executions (class loop *loop, bool guarantee) + { + edge exit; + class tree_niter_desc niter_desc; +@@ -2810,7 +2810,8 @@ number_of_latch_executions (class loop *loop) + res = chrec_dont_know; + exit = single_exit (loop); + +- if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false)) ++ if (exit && number_of_iterations_exit (loop, exit, &niter_desc, false, ++ true, NULL, guarantee)) + { + may_be_zero = niter_desc.may_be_zero; + res = niter_desc.niter; +@@ -2836,7 +2837,8 @@ number_of_latch_executions (class loop *loop) + fprintf (dump_file, "))\n"); + } + +- loop->nb_iterations = res; ++ if (guarantee) ++ loop->nb_iterations = res; + return res; + } + +diff --git a/gcc/tree-scalar-evolution.h b/gcc/tree-scalar-evolution.h +index e2fbfb55b..218155650 100644 +--- a/gcc/tree-scalar-evolution.h ++++ b/gcc/tree-scalar-evolution.h +@@ -21,7 +21,8 @@ along with GCC; see the file COPYING3. If not see + #ifndef GCC_TREE_SCALAR_EVOLUTION_H + #define GCC_TREE_SCALAR_EVOLUTION_H + +-extern tree number_of_latch_executions (class loop *); ++extern tree number_of_latch_executions (class loop *, ++ bool guarantee = true); + extern gcond *get_loop_exit_condition (const class loop *); + + extern void scev_initialize (void); +diff --git a/gcc/tree-ssa-llc-allocate.c b/gcc/tree-ssa-llc-allocate.c +new file mode 100644 +index 000000000..746a1cf95 +--- /dev/null ++++ b/gcc/tree-ssa-llc-allocate.c +@@ -0,0 +1,2898 @@ ++/* LLC allocate. ++ Copyright (C) 2022-2023 Free Software Foundation, Inc. ++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it ++under the terms of the GNU General Public License as published by the ++Free Software Foundation; either version 3, or (at your option) any ++later version. 
++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#include "config.h" ++#define INCLUDE_MAP ++#define INCLUDE_SET ++#define INCLUDE_VECTOR ++#define INCLUDE_LIST ++#define INCLUDE_ALGORITHM ++#define INCLUDE_STRING ++#include "system.h" ++#include "coretypes.h" ++#include "backend.h" ++#include "target.h" ++#include "rtl.h" ++#include "tree.h" ++#include "gimple.h" ++#include "predict.h" ++#include "tree-pass.h" ++#include "gimple-ssa.h" ++#include "optabs-query.h" ++#include "tree-pretty-print.h" ++#include "fold-const.h" ++#include "stor-layout.h" ++#include "gimplify.h" ++#include "gimple-iterator.h" ++#include "gimplify-me.h" ++#include "tree-ssa-loop-ivopts.h" ++#include "tree-ssa-loop-manip.h" ++#include "tree-ssa-loop-niter.h" ++#include "tree-ssa-loop.h" ++#include "ssa.h" ++#include "tree-into-ssa.h" ++#include "cfgloop.h" ++#include "tree-scalar-evolution.h" ++#include "langhooks.h" ++#include "tree-inline.h" ++#include "tree-data-ref.h" ++#include "diagnostic-core.h" ++#include "dbgcnt.h" ++#include "gimple-pretty-print.h" ++#include "internal-fn.h" ++#include "tree-cfg.h" ++#include "profile-count.h" ++ ++/* Number of parallel cores. */ ++const unsigned int PARALLEL_NUM = 288; ++ ++/* Indirect access weight. */ ++const unsigned int INDIRECT_ACCESS_VALUE = 2; ++ ++/* Write memory weight. */ ++const unsigned int WRITE_COST = 2; ++ ++/* Prefetch tool input max length. */ ++#ifndef PREFETCH_TOOL_INPUT_MAX_LEN ++#define PREFETCH_TOOL_INPUT_MAX_LEN 512 ++#endif ++ ++/* Prefetch tool number max length. */ ++#ifndef PREFETCH_TOOL_NUM_MAX_LEN ++#define PREFETCH_TOOL_NUM_MAX_LEN 9 ++#endif ++ ++namespace { ++ ++using namespace std; ++ ++/* loop bound info of the memory reference located. */ ++struct loop_bound ++{ ++ /* iv tree_node. */ ++ tree iv; ++ ++ /* define stmt of iv. */ ++ gimple *def_stmt; ++ ++ /* loop where stmt is located. */ ++ class loop *loop; ++ ++ /* loop unroll factor. */ ++ unsigned int unroll; ++ ++ /* Number of iterations of loop. */ ++ tree niters; ++ ++ loop_bound (tree t, gimple *stmt) ++ { ++ iv = t; ++ def_stmt = stmt; ++ loop = loop_containing_stmt (stmt); ++ unroll = 1; ++ niters = chrec_dont_know; ++ } ++}; ++ ++/* method of calculating the data size. */ ++ ++enum calc_type ++{ ++ UNHANDLE_CALC = 0, ++ RUNTIME_CALC, ++ STATIC_CALC ++}; ++ ++/* Describes a info of a memory reference. */ ++ ++struct data_ref ++{ ++ /* The memory reference. */ ++ tree ref; ++ ++ /* Statement where the ref is located. */ ++ gimple *stmt; ++ ++ /* var_decl or param_decl, used for the ref_group. */ ++ tree var; ++ ++ /* Base of the reference. */ ++ tree base; ++ ++ /* Constant offset of the reference. */ ++ tree offset; ++ ++ /* index of the reference. */ ++ tree index; ++ ++ /* Constant step of the reference. */ ++ tree step; ++ ++ /* loop boundary info of each dimension. */ ++ vector loop_bounds; ++ ++ /* memory data size, Unit: MB. */ ++ double data_size; ++ ++ /* method of calculating the data size. */ ++ calc_type calc_by; ++ ++ /* True if the info of ref is traced, and then record it. */ ++ unsigned int trace_status_p : 1; ++ ++ /* True if the loop is vectorized. */ ++ unsigned int vectorize_p : 1; ++ ++ /* True if the memory reference is shared. 
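++     (Presumably shared among the parallel workers counted by
++     PARALLEL_NUM; the constructor below initializes it to false, and the
++     field is reserved for the parallel analysis mentioned at
++     trace_ref_info.)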
*/
++  unsigned int parallel_p : 1;
++
++  /* True if the memory reference is regular.  */
++  unsigned int regular_p : 1;
++
++  /* True if the memory reference is read.  */
++  unsigned int read_p : 1;
++
++  data_ref ()
++  {
++    ref = NULL_TREE;
++    stmt = NULL;
++    var = NULL_TREE;
++    base = NULL_TREE;
++    offset = NULL_TREE;
++    index = NULL_TREE;
++    step = NULL_TREE;
++    data_size = 0;
++    calc_by = UNHANDLE_CALC;
++    trace_status_p = false;
++    vectorize_p = false;
++    parallel_p = false;
++    regular_p = true;
++    read_p = true;
++  }
++};
++
++/* ================ phase 1 get_dense_memory_kernels ================ */
++
++/* Add a reference node and dump it.  */
++
++void
++add_ref (vector<data_ref> &references, tree op, gimple *stmt,
++         bool vectorize_p, bool read_p)
++{
++  data_ref ref;
++  ref.ref = op;
++  ref.stmt = stmt;
++  ref.vectorize_p = vectorize_p;
++  ref.read_p = read_p;
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      print_generic_expr (dump_file, ref.ref, TDF_LINENO);
++      fprintf (dump_file, "\n");
++    }
++  references.push_back (ref);
++}
++
++/* Get the references from a simple internal call (vectorized form).  */
++
++void
++get_references_in_gimple_call (gimple *stmt, vector<data_ref> &references)
++{
++  if (gimple_code (stmt) != GIMPLE_CALL)
++    return;
++
++  if (gimple_call_internal_p (stmt))
++    {
++      bool read_p = false;
++      switch (gimple_call_internal_fn (stmt))
++        {
++        case IFN_MASK_GATHER_LOAD:
++        case IFN_MASK_LOAD:
++          {
++            if (gimple_call_lhs (stmt) == NULL_TREE)
++              return;
++            read_p = true;
++            // FALLTHRU
++          }
++        case IFN_MASK_STORE:
++          {
++            /* _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B];
++               vect__1.1 = .MASK_LOAD (_1, 64B, loop_mask_4);
++
++               _1 = &MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B];
++               .MASK_STORE (_1, 64B, loop_mask_4, vect__1.2);
++
++               _1 = (sizetype) a_2(D);
++               vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8,
++                                                  { 0.0, ... }, loop_mask_5);
++            */
++            tree op1 = gimple_call_arg (stmt, 0);
++            if (TREE_CODE (op1) != SSA_NAME)
++              {
++                if (dump_file && (dump_flags & TDF_DETAILS))
++                  {
++                    fprintf (dump_file, "get_references_in_gimple_call: ");
++                    fprintf (dump_file, "find base that is not an ssa_name: ");
++                    print_generic_expr (dump_file, op1, TDF_LINENO);
++                    fprintf (dump_file, "\n");
++                  }
++                return;
++              }
++            gimple *op1_def = SSA_NAME_DEF_STMT (op1);
++            if (op1_def != NULL && gimple_code (op1_def) == GIMPLE_ASSIGN)
++              {
++                /* &MEM[base: xx] */
++                tree rhs1 = gimple_assign_rhs1 (op1_def);
++                /* If the definition stmt of the operation is a memory
++                   reference, read it directly.  */
++                if (TREE_CODE (rhs1) == ADDR_EXPR
++                    && TREE_CODE (TREE_OPERAND (rhs1, 0)) == TARGET_MEM_REF)
++                  op1 = TREE_OPERAND (rhs1, 0); /* MEM[base: xx] */
++              }
++
++            add_ref (references, op1, stmt, true, read_p);
++            return;
++          }
++        default:
++          return;
++        }
++    }
++}
++
++/* Stores the locations of memory references in STMT to REFERENCES.
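++   For example (an illustrative sketch), given the assignment
++     _1 = MEM[base: a_2(D), index: ivtmp_3, step: 8, offset: 0B];
++   the rhs is recorded as a read reference, while for
++     MEM[base: b_4(D), index: ivtmp_3, step: 8, offset: 0B] = _1;
++   the lhs is recorded as a write; internal calls such as .MASK_LOAD are
++   delegated to get_references_in_gimple_call above.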
*/
++
++void
++get_references_in_stmt (gimple *stmt, vector<data_ref> &references)
++{
++  if (!gimple_vuse (stmt))
++    return;
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "gimple_vuse: ");
++      print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
++    }
++
++  if (gimple_code (stmt) == GIMPLE_ASSIGN)
++    {
++      tree op0 = gimple_assign_lhs (stmt);
++      tree op1 = gimple_assign_rhs1 (stmt);
++      tree base = NULL_TREE;
++
++      /* _1 = MEM[base: a, index: i, step: 8, offset: 0B];  */
++      if (REFERENCE_CLASS_P (op1) && (base = get_base_address (op1))
++          && TREE_CODE (base) != SSA_NAME && !is_gimple_min_invariant (base))
++        add_ref (references, op1, stmt, false, true);
++
++      if (REFERENCE_CLASS_P (op0) && get_base_address (op0))
++        add_ref (references, op0, stmt, false, false);
++    }
++  else if (gimple_code (stmt) == GIMPLE_CALL)
++    get_references_in_gimple_call (stmt, references);
++
++  return;
++}
++
++/* Flags recording why a loop was filtered out.  */
++
++struct loop_filter_out_flag
++{
++  /* Use external gimple.  */
++  bool use_ext_gimple;
++
++  /* Use external call.  */
++  bool use_ext_call;
++
++  /* Use external node.  */
++  bool use_ext_node;
++
++  /* Use loop defined in macros.  */
++  bool use_macro_loop;
++
++  /* Use conditional function (VEC_COND_EXPR/MIN_EXPR/MAX_EXPR).  */
++  bool use_cond_func;
++};
++
++/* Check whether an external node is used.  */
++
++bool
++use_ext_node_p (const vector<data_ref> &references,
++                unsigned int &start)
++{
++  expanded_location cfun_xloc
++    = expand_location (DECL_SOURCE_LOCATION (current_function_decl));
++
++  unsigned i = start;
++  start = references.size ();
++  for (; i < references.size (); i++)
++    {
++      data_ref ref = references[i];
++      expanded_location xloc = expand_location (ref.stmt->location);
++      if (xloc.file && filename_cmp (cfun_xloc.file, xloc.file))
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            fprintf (dump_file, "use_ext_node\n\n");
++          return true;
++        }
++    }
++  return false;
++}
++
++/* Determine whether to filter out the loop based on STMT.  */
++
++bool
++filter_out_loop_by_stmt_p (loop_filter_out_flag &loop_filter, gimple *stmt,
++                           const vector<data_ref> &references,
++                           unsigned int &start)
++{
++  /* Check use_ext_gimple.  */
++  expanded_location cfun_xloc
++    = expand_location (DECL_SOURCE_LOCATION (current_function_decl));
++  expanded_location xloc = expand_location (stmt->location);
++  if (xloc.file && filename_cmp (cfun_xloc.file, xloc.file))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "use_ext_gimple: ");
++          print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
++        }
++      loop_filter.use_ext_gimple = true;
++      return true;
++    }
++
++  /* Check use_ext_call.  */
++  if (gimple_code (stmt) == GIMPLE_CALL && !gimple_call_internal_p (stmt))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "use_ext_call: ");
++          print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
++        }
++      loop_filter.use_ext_call = true;
++      return true;
++    }
++
++  /* Check use_macro_loop.  */
++  if (xloc.file && xloc.column != 1)
++    loop_filter.use_macro_loop = false;
++
++  /* Check use_cond_func: VEC_COND_EXPR/MIN_EXPR/MAX_EXPR.  */
++  if (gimple_code (stmt) == GIMPLE_ASSIGN)
++    {
++      enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
++      if (rhs_code == VEC_COND_EXPR || rhs_code == MIN_EXPR
++          || rhs_code == MAX_EXPR)
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              fprintf (dump_file, "use_cond_func: ");
++              print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO);
++            }
++          loop_filter.use_cond_func = true;
++          return true;
++        }
++    }
++
++  /* Check use_ext_node.
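++     i.e. whether any reference recorded since START comes from a source
++     file other than the current function's (for instance code inlined
++     from another file).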
*/ ++ if (use_ext_node_p (references, start)) ++ { ++ loop_filter.use_ext_node = true; ++ return true; ++ } ++ ++ return false; ++} ++ ++/* Dump the flag type of the loop is filtered out. */ ++ ++void ++dump_loop_filter_out_flag (loop_filter_out_flag &loop_filter) ++{ ++ if (loop_filter.use_ext_gimple) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_ext_gimple\n"); ++ } ++ if (loop_filter.use_ext_call) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_ext_call\n"); ++ } ++ ++ if (loop_filter.use_ext_node) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_ext_node\n"); ++ } ++ ++ if (loop_filter.use_macro_loop) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_macro_loop\n"); ++ } ++ ++ if (loop_filter.use_cond_func) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: use_cond_func\n"); ++ } ++} ++ ++/* Get references in loop. */ ++ ++bool ++get_references_in_loop (vector &references, ++ loop_filter_out_flag &loop_filter, ++ class loop *loop) ++{ ++ unsigned int start = 0; ++ bool filter_out_loop = true; ++ ++ /* Analyze each bb in the loop. */ ++ basic_block *body = get_loop_body_in_dom_order (loop); ++ for (unsigned i = 0; i < loop->num_nodes; i++) ++ { ++ basic_block bb = body[i]; ++ if (bb->loop_father != loop) ++ continue; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n==== the %dth loop bb body ====\n", i); ++ gimple_dump_bb (dump_file, bb, 0, dump_flags); ++ fprintf (dump_file, "\n"); ++ } ++ ++ gimple_stmt_iterator bsi; ++ for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi)) ++ { ++ gimple *stmt = gsi_stmt (bsi); ++ get_references_in_stmt (stmt, references); ++ filter_out_loop = filter_out_loop_by_stmt_p (loop_filter, stmt, ++ references, start); ++ if (filter_out_loop) ++ break; ++ } ++ if (filter_out_loop) ++ break; ++ } ++ free (body); ++ return !filter_out_loop; ++} ++ ++/* Determine whether the loop is a single path. */ ++ ++bool ++single_path_p (class loop *loop, basic_block bb) ++{ ++ if (bb == NULL) ++ return false; ++ if (bb == loop->latch) ++ return true; ++ ++ gimple *stmt = last_stmt (bb); ++ bool res = false; ++ ++ if (stmt && gimple_code (stmt) == GIMPLE_COND) ++ { ++ gcc_assert (EDGE_COUNT (bb->succs) == 2); ++ edge true_edge = NULL; ++ edge false_edge = NULL; ++ extract_true_false_edges_from_block (bb, &true_edge, &false_edge); ++ ++ /* Returns false, if a branch occurs. */ ++ if (true_edge->dest->loop_father == loop ++ && false_edge->dest->loop_father == loop) ++ return false; ++ ++ if (true_edge->dest->loop_father == loop) ++ res = single_path_p (loop, true_edge->dest); ++ else ++ res = single_path_p (loop, false_edge->dest); ++ } ++ else ++ { ++ edge e = find_fallthru_edge (bb->succs); ++ if (e) ++ res = single_path_p (loop, e->dest); ++ } ++ return res; ++} ++ ++/* Computes an estimated number of insns in LOOP, weighted by WEIGHTS. ++ Assume that the HPC data reading and calculation process does not involve ++ adding branches in loops. Therefore, all bbs of loops are directly used for ++ calculation (excluding embedded loops) without considering branch weighting. 
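++   As a worked example of how the estimate is consumed: a loop body whose
++   stmts sum to ninsns = 10 and which contains ref_count = 3 references
++   gives mem_to_insn_ratio = 0.3 in dense_memory_p below, passing the
++   default thresholds mem-access-ratio = 20 and mem-access-num = 3.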
++*/ ++ ++unsigned ++estimate_loop_insns (class loop *loop, eni_weights *weights) ++{ ++ basic_block *body = get_loop_body (loop); ++ gimple_stmt_iterator gsi; ++ unsigned size = 0, i; ++ ++ for (i = 0; i < loop->num_nodes; i++) ++ { ++ basic_block bb = body[i]; ++ if (bb->loop_father != loop) ++ { ++ continue; ++ } ++ for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi)) ++ size += estimate_num_insns (gsi_stmt (gsi), weights); ++ } ++ free (body); ++ ++ return size; ++} ++ ++/* Check whether the memory access is dense. */ ++ ++bool ++dense_memory_p (const vector &references, class loop *loop) ++{ ++ int ref_count = references.size (); ++ unsigned int ninsns = estimate_loop_insns (loop, &eni_size_weights); ++ float mem_to_insn_ratio = (float)ref_count / (float)ninsns; ++ ++ /* The number of cores to be run and DDR bandwidth information can be ++ transferred to flexibly adjust the threshold. */ ++ bool dense_mem = (mem_to_insn_ratio >= (param_mem_access_ratio / 100.0) ++ && ref_count >= param_mem_access_num); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (cfun->decl)); ++ ++ /* Dump dense memory source code location. */ ++ if (ref_count && references[0].stmt->location) ++ { ++ expanded_location xloc = expand_location ++ (references[0].stmt->location); ++ int fn_start = 0; ++ if (DECL_SOURCE_LOCATION (current_function_decl)) ++ fn_start = expand_location ( ++ DECL_SOURCE_LOCATION (current_function_decl)).line; ++ int fn_end = fn_start; ++ if (cfun->function_end_locus) ++ fn_end = expand_location (cfun->function_end_locus).line; ++ if (xloc.file) ++ fprintf (dump_file, "[%s:%s(%d-%d):%d:%d] ", ++ xloc.file, fn_name, fn_start, fn_end, ++ xloc.line, xloc.column); ++ } ++ ++ /* Dump memory dense information. */ ++ if (dense_mem) ++ fprintf (dump_file, "dense memory access: "); ++ else ++ fprintf (dump_file, "non-dense mem access: "); ++ fprintf (dump_file, ++ "ref_count = %d, ninsns = %d, mem_to_insn_ratio = %f\n\n", ++ ref_count, ninsns, mem_to_insn_ratio); ++ } ++ ++ return dense_mem; ++} ++ ++/* Analyze the inner loop and get the loop with dense memory access. */ ++ ++bool ++get_dense_memory_kernels (vector &kernels, ++ map > &kernels_refs) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 1: get_dense_memory_kernels\n\n"); ++ class loop *loop = NULL; ++ FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) ++ { ++ number_of_latch_executions (loop); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n========== Processing loop %d: ==========\n", ++ loop->num); ++ loop_dump (dump_file, loop); ++ flow_loop_dump (loop, dump_file, NULL, 1); ++ fprintf (dump_file, "loop unroll: %d\n", loop->unroll); ++ } ++ ++ if (get_loop_exit_edges (loop).length () != 1 ++ || !single_path_p (loop, loop->header)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: loop_branching\n"); ++ continue; ++ } ++ ++ vector references; ++ loop_filter_out_flag loop_filter = {false, false, false, true, false}; ++ ++ if (!get_references_in_loop (references, loop_filter, loop)) ++ { ++ dump_loop_filter_out_flag (loop_filter); ++ continue; ++ } ++ ++ if (dense_memory_p (references, loop)) ++ { ++ kernels_refs[loop] = references; ++ kernels.push_back (loop); ++ } ++ } ++ return kernels.size () > 0; ++} ++ ++/* ================ phase 2 trace_data_refs_info ================ */ ++ ++/* Determine whether the declaration is a non-vectorized. 
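++   That is, whether it is a VAR_DECL, PARM_DECL or COMPONENT_REF whose
++   type chain contains no VECTOR_TYPE; e.g. a plain array such as
++   double psiPtr[N] qualifies, while a vectorized temporary does not.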
*/ ++ ++bool ++generic_decl_p (tree expr) ++{ ++ if (expr == NULL_TREE) ++ return false; ++ enum tree_code expr_code = TREE_CODE (expr); ++ if (expr_code != VAR_DECL && expr_code != PARM_DECL ++ && expr_code != COMPONENT_REF) ++ return false; ++ ++ tree type = TREE_TYPE (expr); ++ while (type) ++ { ++ if (TREE_CODE (type) != VECTOR_TYPE) ++ /* TREE_TYPE (NODE) ( ++ CONTAINS_STRUCT_CHECK (NODE, TS_TYPED)->typed.type) */ ++ type = CONTAINS_STRUCT_CHECK (type, TS_TYPED) ? TREE_TYPE (type) : NULL; ++ else ++ return false; ++ } ++ return true; ++} ++ ++/* Initial worklist preparation for source variable tracing. ++ Add different initial node based on different gimple statements. */ ++ ++void ++add_worklist (vector &worklist, set &walked, gimple *def_stmt) ++{ ++ if (gimple_code (def_stmt) == GIMPLE_PHI) ++ { ++ for (unsigned i = 0; i < gimple_phi_num_args (def_stmt); i++) ++ { ++ tree node = gimple_phi_arg_def (def_stmt, i); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ } ++ else if (is_gimple_assign (def_stmt)) ++ { ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ if (rhs_code == POINTER_PLUS_EXPR || rhs_code == NEGATE_EXPR ++ || rhs_code == NOP_EXPR || rhs_code == SSA_NAME ++ || rhs_code == COMPONENT_REF) ++ { ++ tree node = gimple_assign_rhs1 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR) ++ { ++ tree node = gimple_assign_rhs1 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ node = gimple_assign_rhs2 (def_stmt); ++ if (!walked.count (node)) ++ { ++ worklist.push_back (node); ++ walked.insert (node); ++ } ++ } ++ else ++ { ++ /* unhandled assign rhs_code: _219 = _17 * _70; ++ _17 = *grid_56(D).sst.span; ++ _70 = *grid_56(D).sst.dim[0].stride; ++ */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unhandled assign rhs_code: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "unsupported tracing stmt: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ } ++} ++ ++ ++/* Tracing source variables: ++ vectp.1 = a_2(D) + _3; ++ _4 = &MEM[base: vectp.1, index: ivtmp_5, step: 8, offset: 0B]; ++ vect__1.6 = .MASK_LOAD (_4, 64B, loop_mask_7); ++ ++ _1 = (sizetype) b_2(D); ++ vect_patt_3.3 = .MASK_GATHER_LOAD (_1, vect__4.4, 8, { 0.0, ... }, ++ loop_mask_5); ++ ... ++ Due to previous pass optimizations, the current tracing method can find ++ several source variable candidates. We decide to record them in a map and ++ later filter out the true base variable by some criteria. ++*/ ++ ++void ++trace_base_var_helper (tree arg, set &walked, ++ map& base_var_candid) ++{ ++ if (arg == NULL) ++ return; ++ ++ /* Array type. */ ++ tree op0 = NULL; ++ if (TREE_CODE (arg) == ADDR_EXPR ++ && (op0 = TREE_OPERAND (arg, 0)) && generic_decl_p (op0)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "array type\n"); ++ base_var_candid[op0] += 1; ++ return; ++ } ++ ++ /* Pointer type. */ ++ if (TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE && generic_decl_p (arg)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "pointer type\n"); ++ base_var_candid[arg] += 1; ++ return; ++ } ++ ++ /* SSA_NAME type. 
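++     E.g. for an SSA name such as a_2(D), SSA_NAME_VAR recovers the
++     underlying declaration a; failing that, the walk continues through
++     the defining statement via add_worklist.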
*/ ++ if (TREE_CODE (arg) != SSA_NAME) ++ return; ++ ++ tree tmp_var = SSA_NAME_VAR (arg); ++ if (tmp_var && generic_decl_p (tmp_var) ++ && TREE_CODE (TREE_TYPE (arg)) == POINTER_TYPE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "ssa pointer type\n"); ++ base_var_candid[tmp_var] += 1; ++ return; ++ } ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (arg); ++ if (def_stmt == NULL) ++ return; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, arg, TDF_SLIM); ++ fprintf (dump_file, "\t\t: "); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM); ++ } ++ ++ vector worklist; ++ add_worklist (worklist, walked, def_stmt); ++ for (unsigned i = 0; i < worklist.size (); ++i) ++ trace_base_var_helper (worklist[i], walked, base_var_candid); ++} ++ ++/* Identify the base variable traced from base address of memory reference. ++ We recognize that current method could detect several base variable ++ candidates and the temporary criteria for base variable determination ++ is that either one of the following statement is true: ++ 1. The number of base variable candidates is 1; ++ 2. The number of detected gimple statements for some variable is 1. ++ We may use other criteria or relax the current criteria ++ (e.g., criterion 2: 1 -> any odd number). */ ++ ++bool ++trace_base_var (tree &var, tree arg, set &walked) ++{ ++ map base_var_candid; ++ trace_base_var_helper (arg, walked, base_var_candid); ++ bool is_tracing_unusual = false; ++ if (base_var_candid.size () == 1) ++ var = base_var_candid.begin ()->first; ++ else ++ { ++ is_tracing_unusual = true; ++ for (const pair& base_var_count : base_var_candid) ++ if (base_var_count.second == 1) ++ var = base_var_count.first; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Traced variables at "); ++ print_generic_expr (dump_file, arg, TDF_SLIM); ++ fprintf (dump_file, ":\n"); ++ for (const pair& base_var_count : base_var_candid) ++ fprintf (dump_file, "%s:%d, ", get_name (base_var_count.first), ++ base_var_count.second); ++ fprintf (dump_file, "\n"); ++ ++ if (var == NULL_TREE) ++ fprintf (dump_file, "Unhandled scenario for tracing base variable.\n"); ++ else if (is_tracing_unusual && var != NULL_TREE) ++ fprintf (dump_file, "Tracing unusual number or occurrences of base " ++ "variables. Choose %s.\n", get_name (var)); ++ } ++ return var != NULL_TREE; ++} ++ ++/* Tracing direct memory reference information. */ ++ ++bool ++trace_direct_mem_ref (data_ref &mem_ref, set &traced_ref_stmt) ++{ ++ if (TREE_CODE (mem_ref.ref) != TARGET_MEM_REF) ++ return false; ++ ++ /* Direct memory access, regardless of whether it is in vectorized form, ++ can be determined through TARGET_MEM_REF. */ ++ mem_ref.base = TREE_OPERAND (mem_ref.ref, 0); ++ mem_ref.offset = TREE_OPERAND (mem_ref.ref, 1); ++ mem_ref.index = TREE_OPERAND (mem_ref.ref, 2); ++ mem_ref.step = TREE_OPERAND (mem_ref.ref, 3); ++ ++ set walked; ++ if (mem_ref.var == NULL_TREE ++ && !trace_base_var (mem_ref.var, mem_ref.base, walked)) ++ return false; ++ ++ traced_ref_stmt.insert (mem_ref.stmt); ++ return true; ++} ++ ++/* Recursively trace and check whether the definition stmt of the ++ index operand is a recorded stmt in direct access tracing. ++ If true, it is an indirect access. 
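++   E.g. in sum += psiPtr[lPtr[cell]] the load from lPtr[] is recorded
++   while tracing direct accesses, so the index feeding psiPtr[] reaches a
++   recorded stmt and psiPtr's reference is classified as indirect.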
*/ ++ ++bool ++trace_indirect_operand (tree arg, set &traced_ref_stmt) ++{ ++ if (TREE_CODE (arg) != SSA_NAME) ++ return false; ++ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (arg); ++ ++ if (traced_ref_stmt.count (def_stmt)) ++ return true; ++ ++ if (!def_stmt || !is_gimple_assign (def_stmt)) ++ return false; ++ ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ /* Collect a whitelist of gimple_assign_rhs_code for tracing pointer/array ++ type indirect memory access. Please check examples before function ++ trace_indirect_ptr and trace_indirect_array. */ ++ if (rhs_code != MULT_EXPR && rhs_code != NOP_EXPR ++ && rhs_code != CONVERT_EXPR && rhs_code != PLUS_EXPR ++ && rhs_code != ARRAY_REF) ++ return false; ++ ++ tree op = NULL_TREE; ++ ssa_op_iter iter; ++ FOR_EACH_SSA_TREE_OPERAND (op, def_stmt, iter, SSA_OP_USE) ++ { ++ if (trace_indirect_operand (op, traced_ref_stmt)) ++ return true; ++ } ++ return false; ++} ++ ++/* Trace the pointer of the indirect memory access: ++ 1) obtain the base address of the indirect memory access. ++ 2) ensure that the index has been traced in the direct memory access. ++ ++ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; // Traced in ++ direct access ++ _4 = (long unsigned int) _1; ++ _5 = _4 * 8; ++ _6 = p(D) + _5; // get base ++ _7 = *_6; // start tracing ++*/ ++ ++bool ++trace_indirect_ptr (tree &base, tree &index, tree arg, ++ set traced_ref_stmt) ++{ ++ gimple *def_stmt = SSA_NAME_DEF_STMT (arg); ++ ++ if (!def_stmt || !is_gimple_assign (def_stmt)) ++ return false; ++ ++ tree_code rhs_code = gimple_assign_rhs_code (def_stmt); ++ if (rhs_code != POINTER_PLUS_EXPR) ++ return false; ++ ++ /* POINTER_PLUS_EXPR, The first operand is always a pointer/reference type. ++ The second operand is always an unsigned integer type compatible with ++ sizetype. */ ++ base = gimple_assign_rhs1 (def_stmt); ++ index = gimple_assign_rhs2 (def_stmt); ++ ++ return trace_indirect_operand (index, traced_ref_stmt); ++} ++ ++/* Trace the array of the indirect memory access: ++ 1) obtain the base address of the indirect memory access. ++ 2) ensure that the index has been traced in the direct memory access. ++ ++ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; // Traced in ++ direct access ++ _4 = (integer(kind=8)) _1; ++ _5 = _4 + 135; ++ _6 = p[_5]; // start tracing ++*/ ++ ++bool ++trace_indirect_array (tree &base, tree &index, ++ set traced_ref_stmt, tree ref) ++{ ++ if (TREE_CODE (ref) != ARRAY_REF) ++ return false; ++ base = TREE_OPERAND (ref, 0); ++ index = TREE_OPERAND (ref, 1); ++ return trace_indirect_operand (index, traced_ref_stmt); ++} ++ ++/* Tracing indirect memory reference information. ++ Include tracing of base addresses and source variable. ++ _x(ssa name) -> a_2(base addr) -> a(src var) */ ++ ++bool ++trace_indirect_mem_ref (data_ref &mem_ref, ++ set &traced_ref_stmt) ++{ ++ /* Processing of vectorization types. */ ++ if (mem_ref.vectorize_p) ++ { ++ tree op = gimple_call_arg (mem_ref.stmt, 1); ++ if (trace_indirect_operand (op, traced_ref_stmt)) ++ { ++ mem_ref.base = gimple_call_arg (mem_ref.stmt, 0); ++ mem_ref.regular_p = false; ++ set walked; ++ if (mem_ref.var == NULL_TREE ++ && !trace_base_var (mem_ref.var, mem_ref.base, walked)) ++ return false; ++ return true; ++ } ++ return false; ++ } ++ ++ /* Processing of non-vectorized types. 
*/ ++ tree op = NULL_TREE; ++ ssa_op_iter iter; ++ FOR_EACH_SSA_TREE_OPERAND (op, mem_ref.stmt, iter, SSA_OP_USE) ++ { ++ ++ /* Array type: ++ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; ++ _4 = c[_1]; ++ ++ Pointer type: ++ _1 = MEM[base: a_2(D), index: ivtmp.3_3, step: 4, offset: 0B]; ++ _4 = (long unsigned int) _1; ++ _5 = _4 * 8; ++ _6 = p(D) + _5; ++ _7 = *_6; ++ */ ++ tree base = NULL_TREE; ++ tree index = NULL_TREE; ++ if (trace_indirect_array (base, index, traced_ref_stmt, mem_ref.ref) ++ || trace_indirect_ptr (base, index, op, traced_ref_stmt)) ++ { ++ /* ARRAY_REF, The first operand is the array; ++ the second is the index. */ ++ mem_ref.base = base; ++ mem_ref.index = index; ++ mem_ref.regular_p = false; ++ set walked; ++ if (mem_ref.var == NULL_TREE ++ && !trace_base_var (mem_ref.var, mem_ref.base, walked)) ++ return false; ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Trace references base info: ++ 1) Parallel analysis ++ 2) Memory access rule analysis ++ 3) Tracing base address and source variable of memory references ++ We will extend parallel analysis later. ++*/ ++ ++void ++trace_ref_info (data_ref &mem_ref, set &traced_ref_stmt) ++{ ++ enum tree_code ref_code = TREE_CODE (mem_ref.ref); ++ if (/* Vectorized and non-vectorized direct access. */ ++ ref_code != TARGET_MEM_REF ++ /* non-vectorized indirect memory access. */ ++ && ref_code != MEM_REF && ref_code != ARRAY_REF ++ /* vectorized indirect memory access. */ ++ && ref_code != SSA_NAME) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ref is another tree-code: "); ++ fprintf (dump_file, "stmt: "); ++ print_gimple_stmt (dump_file, mem_ref.stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "ref: "); ++ print_generic_expr (dump_file, mem_ref.ref, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ return; ++ } ++ ++ /* 1) Direct and indirect access traces and traces source variables. */ ++ if (!trace_direct_mem_ref (mem_ref, traced_ref_stmt) ++ && !trace_indirect_mem_ref (mem_ref, traced_ref_stmt)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Tracing failed.\n\n"); ++ return; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Tracing succeeded.\n\n"); ++ mem_ref.trace_status_p = true; ++} ++ ++/* Tracing and sorting reference groups. */ ++ ++void ++trace_data_refs_info (vector &kernels, ++ map > &loop_refs) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nPhase 2: trace_all_references_info\n\n"); ++ ++ set traced_ref_stmt; ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ { ++ class loop* loop = kernels[i]; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ for (unsigned j = 0; j < loop_refs[loop].size (); ++j) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "trace_references_base_info %d:\n", j); ++ print_generic_expr (dump_file, loop_refs[loop][j].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ trace_ref_info (loop_refs[loop][j], traced_ref_stmt); ++ } ++ } ++} ++ ++/* ================ phase 3 analyze_nested_kernels ================ */ ++ ++/* Return the inner most type for arrays and pointers of TYPE. */ ++ ++tree ++inner_type (tree type) ++{ ++ while (POINTER_TYPE_P (type) ++ || TREE_CODE (type) == ARRAY_TYPE) ++ type = TREE_TYPE (type); ++ return type; ++} ++ ++/* Check whether the input iv is the loop dimension boundary. 
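++   For example (illustrative), for i_1 = PHI <0(preheader), i_next_7(latch)>
++   the phi has one input from the outer loop and one from the current loop,
++   so i_1 is treated as a bound iv and the outer input is returned in
++   OUTER_LOOP_T.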
*/
++
++bool
++loop_bound_iv_p (tree t, tree &outer_loop_t)
++{
++  if (t == NULL || TREE_CODE (t) != SSA_NAME
++      || TREE_CODE (TREE_TYPE (t)) != INTEGER_TYPE)
++    return false;
++
++  gimple *def_stmt = SSA_NAME_DEF_STMT (t);
++  if (gimple_code (def_stmt) != GIMPLE_PHI)
++    return false;
++
++  /* Only handle scenarios with exactly two phi inputs.  */
++  if (gimple_phi_num_args (def_stmt) != 2)
++    return false;
++
++  gphi *phi_stmt = as_a <gphi *> (def_stmt);
++  basic_block src0 = gimple_phi_arg_edge (phi_stmt, 0)->src;
++  basic_block src1 = gimple_phi_arg_edge (phi_stmt, 1)->src;
++
++  class loop *loop = loop_containing_stmt (def_stmt);
++  bool res = false;
++  /* Two phi inputs, one from the current loop and one from the outer loop.  */
++  if ((src0->loop_father == loop) && (src1->loop_father == loop_outer (loop)))
++    {
++      outer_loop_t = gimple_phi_arg_def (def_stmt, 1);
++      res = true;
++    }
++  else if ((src1->loop_father == loop)
++           && (src0->loop_father == loop_outer (loop)))
++    {
++      outer_loop_t = gimple_phi_arg_def (def_stmt, 0);
++      res = true;
++    }
++
++  if (res)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "===> ");
++          print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM);
++        }
++      return true;
++    }
++  return false;
++}
++
++/* Add NODE to the worklist and the walked set if it has not been seen.  */
++
++void
++add_worklist_walked (vector<tree> &worklist, set<tree> &walked, tree node)
++{
++  if (!walked.count (node))
++    {
++      worklist.push_back (node);
++      /* Avoid introducing phi-node cycles, which would keep the worklist
++         from terminating.  */
++      walked.insert (node);
++    }
++}
++
++/* Check T for a bound iv and extend the worklist from its defining stmt.  */
++
++void
++check_bound_iv_and_add_worklist (vector<tree> &worklist, set<tree> &walked,
++                                 tree t, data_ref &mem_ref)
++{
++  if (TREE_CODE (t) != SSA_NAME)
++    return;
++
++  gimple *def_stmt = SSA_NAME_DEF_STMT (t);
++  if (def_stmt == NULL)
++    return;
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      print_generic_expr (dump_file, t, TDF_SLIM);
++      fprintf (dump_file, "\t\t: ");
++      print_gimple_stmt (dump_file, def_stmt, 0, TDF_SLIM);
++    }
++
++  if (gimple_code (def_stmt) == GIMPLE_PHI)
++    {
++      tree out_loop_t = NULL_TREE;
++      if (loop_bound_iv_p (t, out_loop_t))
++        {
++          mem_ref.loop_bounds.push_back (loop_bound (t, def_stmt));
++          add_worklist_walked (worklist, walked, out_loop_t);
++        }
++    }
++  else if (is_gimple_assign (def_stmt))
++    {
++      tree_code rhs_code = gimple_assign_rhs_code (def_stmt);
++
++      /* unary.  */
++      if (rhs_code == SSA_NAME || rhs_code == NOP_EXPR)
++        add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt));
++      else if (rhs_code == POINTER_PLUS_EXPR)
++        add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt));
++
++      /* binary.  */
++      else if (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR
++               || rhs_code == MULT_EXPR)
++        {
++          add_worklist_walked (worklist, walked, gimple_assign_rhs1 (def_stmt));
++          add_worklist_walked (worklist, walked, gimple_assign_rhs2 (def_stmt));
++        }
++    }
++}
++
++/* DFS-trace the loop bounds of the iv.  */
++
++bool
++trace_loop_bound_iv (data_ref &mem_ref)
++{
++  /* For indirect memory accesses, the size cannot be determined from the
++     loop boundary.  */
++  if (!mem_ref.regular_p)
++    return false;
++
++  /* Determine and record the boundary iv of the current index,
++     but do not trace it.
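++     The index iv contributes only the innermost dimension; the remaining
++     dimensions are discovered by the DFS over the base address below.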
*/
++  tree outer_loop_t = NULL_TREE;
++  if (loop_bound_iv_p (mem_ref.index, outer_loop_t))
++    mem_ref.loop_bounds.push_back (
++      loop_bound (mem_ref.index, SSA_NAME_DEF_STMT (mem_ref.index)));
++
++  vector<tree> worklist;
++  worklist.push_back (mem_ref.base);
++  set<tree> walked;
++
++  while (worklist.size ())
++    {
++      tree t = worklist.back ();
++      worklist.pop_back ();
++
++      /* Extend the worklist.  */
++      check_bound_iv_and_add_worklist (worklist, walked, t, mem_ref);
++    }
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "\nmem_ref access dimension: %ld\n",
++             (long) mem_ref.loop_bounds.size ());
++
++  return mem_ref.loop_bounds.size () > 0;
++}
++
++/* Dump a loop bound.  */
++
++void
++loop_bound_dump (FILE *file, loop_bound &lb)
++{
++  class loop *loop = lb.loop;
++  fprintf (file, "loop_bound: loop_%d (", loop->num);
++  if (loop->header)
++    fprintf (file, "header = %d", loop->header->index);
++  else
++    {
++      fprintf (file, "deleted)\n");
++      return;
++    }
++  if (loop->latch)
++    fprintf (file, ", latch = %d", loop->latch->index);
++  fprintf (file, ", lb_niters = ");
++  print_generic_expr (file, lb.niters);
++  fprintf (file, ")\n");
++}
++
++/* Statically calculate the data size.  */
++
++void
++static_calculate_data_size (data_ref &mem_ref)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "\nstatic_calculate_data_size\n");
++
++  tree size_unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var)));
++  HOST_WIDE_INT type_size = size_unit ? tree_to_uhwi (size_unit) : 0;
++  for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i)
++    {
++      HOST_WIDE_INT est_niter = tree_to_uhwi (mem_ref.loop_bounds[i].niters);
++      unsigned int unroll = mem_ref.loop_bounds[i].unroll;
++      if (i == 0)
++        {
++          /* The unit conversion factor between bytes, kilobytes, and
++             megabytes is 1024.  */
++          mem_ref.data_size = double (type_size
++                                      * est_niter * unroll) / 1024 / 1024;
++        }
++      else
++        mem_ref.data_size *= est_niter * unroll;
++
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "static_data_size: %lf\n", mem_ref.data_size);
++    }
++}
++
++/* Recursive tracing and creating of dominant nodes.
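++   For example (illustrative), if the bound is _7 defined by _7 = _5 * 4
++   inside the loop nest while _5 dominates the outermost header, the
++   expression _5 * 4 is rebuilt so that it can be evaluated before the nest.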
*/
++
++tree
++trace_and_create_dominate_expr (tree expr, class loop *outermost)
++{
++  if (expr == NULL_TREE || is_gimple_constant (expr))
++    return expr;
++
++  if (TREE_CODE (expr) != SSA_NAME)
++    return NULL_TREE;
++
++  if (SSA_NAME_IS_DEFAULT_DEF (expr))
++    return expr;
++
++  gimple *stmt = SSA_NAME_DEF_STMT (expr);
++  basic_block def_bb = gimple_bb (stmt);
++  if (def_bb == NULL || def_bb->loop_father == NULL)
++    return NULL_TREE;
++
++  if (dominated_by_p (CDI_DOMINATORS, outermost->header, def_bb))
++    return expr;
++
++  if (gimple_code (stmt) != GIMPLE_ASSIGN)
++    return NULL_TREE;
++
++  enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
++  tree_code_class code_class = TREE_CODE_CLASS (rhs_code);
++  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
++  tree rhs1 = trace_and_create_dominate_expr
++                (gimple_assign_rhs1 (stmt), outermost);
++  if (rhs1 == NULL_TREE)
++    return NULL_TREE;
++
++  if (code_class == tcc_unary)
++    {
++      tree expr_new = build1 (rhs_code, type, rhs1);
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "expr_new = ");
++          print_generic_expr (dump_file, expr_new, TDF_SLIM);
++          fprintf (dump_file, "\n");
++        }
++      return expr_new;
++    }
++  else if (code_class == tcc_binary)
++    {
++      tree rhs2 = trace_and_create_dominate_expr
++                    (gimple_assign_rhs2 (stmt), outermost);
++      if (rhs2 == NULL_TREE)
++        return NULL_TREE;
++
++      tree expr_new = fold_build2 (rhs_code, type, rhs1, rhs2);
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "expr_new = ");
++          print_generic_expr (dump_file, expr_new, TDF_SLIM);
++          fprintf (dump_file, "\n");
++        }
++      return expr_new;
++    }
++
++  return NULL_TREE;
++}
++
++/* Recursively parse EXPR and create dominant nodes for it.  */
++
++tree
++parse_and_create_expr (tree expr, class loop *outermost)
++{
++  if (expr == NULL_TREE || expr == chrec_dont_know
++      || is_gimple_constant (expr) || TREE_CODE (expr) == ADDR_EXPR)
++    {
++      /* tcc_expression (e.g., &q) situation combined with tcc_unary.  */
++      if (expr != NULL_TREE && TREE_CODE (expr) == ADDR_EXPR && dump_file
++          && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "tcc_expression case in ADDR_EXPR: ");
++          print_generic_expr (dump_file, expr, TDF_SLIM);
++          fprintf (dump_file, "\n");
++        }
++      return expr;
++    }
++
++  if (TREE_CODE (expr) == SSA_NAME)
++    return trace_and_create_dominate_expr (expr, outermost);
++  else if (EXPR_P (expr))
++    {
++      enum tree_code tree_code = TREE_CODE (expr);
++      tree_code_class code_class = TREE_CODE_CLASS (tree_code);
++      tree type = TREE_TYPE (expr);
++      tree op1 = parse_and_create_expr (TREE_OPERAND (expr, 0), outermost);
++      if (op1 == NULL_TREE)
++        return NULL_TREE;
++
++      if (code_class == tcc_unary)
++        {
++          tree expr_new = build1 (tree_code, type, op1);
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              fprintf (dump_file, "expr_new = ");
++              print_generic_expr (dump_file, expr_new, TDF_SLIM);
++              fprintf (dump_file, "\n");
++            }
++          return expr_new;
++        }
++      else if (code_class == tcc_binary)
++        {
++          tree op2 = parse_and_create_expr (TREE_OPERAND (expr, 1), outermost);
++          if (op2 == NULL_TREE)
++            return NULL_TREE;
++
++          tree expr_new = fold_build2 (tree_code, type, op1, op2);
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              fprintf (dump_file, "expr_new = ");
++              print_generic_expr (dump_file, expr_new, TDF_SLIM);
++              fprintf (dump_file, "\n");
++            }
++          return expr_new;
++        }
++    }
++  return NULL_TREE;
++}
++
++/* Trace and create dominant loop bounds.
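++   For example (illustrative), a bound recorded as the COND_EXPR
++   _22 ? _18 : 0 is first reduced to _18 and then re-expressed in terms of
++   values dominating the outermost header.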
*/ ++ ++void ++trace_and_create_dominate_loop_bounds (data_ref &mem_ref) ++{ ++ /* Check whether the niters is a loop dominant. ++ If not, trace and determine whether the result is dominant. If yes, create ++ the expr of the dominant node. ++ */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\ntrace_and_create_dominate_loop_bounds\n"); ++ ++ /* Determine the relationship between the boundary of the innermost loop and ++ the dominant of the outer loop and the processing. */ ++ loop_bound &outermost = mem_ref.loop_bounds.back (); ++ for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) ++ { ++ loop_bound ¤t = mem_ref.loop_bounds[i]; ++ tree &niters = current.niters; ++ if (TREE_CODE (niters) == COND_EXPR) ++ niters = TREE_OPERAND (niters, 1); ++ ++ niters = parse_and_create_expr (niters, outermost.loop); ++ ++ if (niters == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM); ++ fprintf (dump_file, "Tracing loop bound failed at dimension %d", ++ i); ++ } ++ mem_ref.calc_by = UNHANDLE_CALC; ++ break; ++ } ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ loop_bound_dump (dump_file, mem_ref.loop_bounds[i]); ++ } ++} ++ ++/* trace the dimension and corresponding loop bounds of mem_ref. ++ This function is used to supplement the information of mem_ref.loop_bounds. ++*/ ++ ++void ++trace_ref_dimension_and_loop_bounds (data_ref &mem_ref) ++{ ++ /* In the same loop, some memory access dimensions are different. Remove ++ variables with fewer dimensions. ++ Previous cyclic filtering conditions and memory access node records and ++ tracing. ++ The false result is also processed. ++ */ ++ if (dump_file) ++ fprintf (dump_file, "\ncalculate_data_size\n"); ++ ++ /* Trace the loop bound iv of ref to determine the dimension. */ ++ /* Record data from the loop perspective to avoid repeated tracing. */ ++ if (!trace_loop_bound_iv (mem_ref)) ++ return; ++ ++ /* The traced mem_ref may have multiple dimensions, which corresponds to ++ multiple loops. */ ++ /* And in the dimension-by-dimensional analysis, the computable way is ++ continuously reduced. */ ++ mem_ref.calc_by = STATIC_CALC; ++ for (unsigned i = 0; i < mem_ref.loop_bounds.size (); ++i) ++ { ++ class loop *loop = mem_ref.loop_bounds[i].loop; ++ tree &niters = mem_ref.loop_bounds[i].niters; ++ ++ /* Set NULL_TREE to ensure that nb_iterations are retraced and ++ vec_nb_iterations are also extracted. 
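++         (vec_nb_iterations is populated from the IFN_WHILE_ULT bound by
++         number_of_iterations_vect in tree-ssa-loop-niter.c.)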
*/
++      loop->nb_iterations = NULL_TREE;
++      niters = number_of_latch_executions (loop, false);
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        loop_dump (dump_file, loop);
++
++      if (loop->unroll)
++        {
++          if (loop->unroll == USHRT_MAX && dump_file
++              && (dump_flags & TDF_DETAILS))
++            fprintf (dump_file, "loop->unroll = USHRT_MAX = %d", USHRT_MAX);
++          mem_ref.loop_bounds[i].unroll = loop->unroll;
++        }
++
++      if ((niters == chrec_dont_know) && loop->vec_nb_iterations
++          && (loop->vec_nb_iterations != chrec_dont_know))
++        niters = loop->vec_nb_iterations;
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        loop_bound_dump (dump_file, mem_ref.loop_bounds[i]);
++
++      if (niters == NULL_TREE || niters == chrec_dont_know)
++        mem_ref.calc_by = min (mem_ref.calc_by, UNHANDLE_CALC);
++      else if (TREE_CODE (niters) != INTEGER_CST)
++        mem_ref.calc_by = min (mem_ref.calc_by, RUNTIME_CALC);
++      else
++        mem_ref.calc_by = min (mem_ref.calc_by, STATIC_CALC);
++    }
++
++  if (mem_ref.calc_by == RUNTIME_CALC)
++    trace_and_create_dominate_loop_bounds (mem_ref);
++  else if (mem_ref.calc_by == STATIC_CALC)
++    static_calculate_data_size (mem_ref);
++}
++
++/* Analyze nested kernels:
++   1. multidimensional loop analysis;
++   2. extended outer-loop analysis.
++   We will extend the outer-loop analysis later.  */
++
++bool
++analyze_nested_kernels (vector<class loop *> &kernels,
++                        map<class loop *, vector<data_ref>> &loop_refs)
++{
++  if (dump_file)
++    fprintf (dump_file, "\nPhase 3: analyze_nested_kernels\n\n");
++
++  for (unsigned i = 0; i < kernels.size (); ++i)
++    {
++      class loop *loop = kernels[i];
++      if (loop_refs.count (loop) == 0)
++        continue;
++
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "\n\nloop header %d:\n", loop->header->index);
++      for (unsigned j = 0; j < loop_refs[loop].size (); ++j)
++        {
++          if (loop_refs[loop][j].trace_status_p == false)
++            continue;
++
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              fprintf (dump_file, "\ntrace_reference_dimension at mem_ref "
++                       "index %u in loop %d:\n", j, loop->num);
++              print_generic_expr (dump_file, loop_refs[loop][j].ref, TDF_SLIM);
++              fprintf (dump_file, "\n");
++            }
++          trace_ref_dimension_and_loop_bounds (loop_refs[loop][j]);
++        }
++    }
++  return true;
++}
++
++/* ================ phase 4 filter_and_sort_kernels ================ */
++
++/* Get the probability of edge E, clamped below by MINIMUM.  */
++
++float
++get_edge_prob (edge e, float minimum)
++{
++  float fvalue = 0;
++
++  profile_probability probability = e->probability;
++  if (probability.initialized_p ())
++    {
++      fvalue = probability.to_reg_br_prob_base () / float (REG_BR_PROB_BASE);
++      if (fvalue < minimum && probability.to_reg_br_prob_base ())
++        fvalue = minimum;
++    }
++  return fvalue;
++}
++
++/* Get the next bb with a high branch probability.  */
++
++basic_block
++next_high_probability_bb (basic_block bb)
++{
++  if (bb == NULL)
++    return NULL;
++
++  /* Limit the minimum probability value.  */
++  const float MINNUM_PROB = 0.00001f;
++  float minimum = MINNUM_PROB;
++
++  gimple *stmt = last_stmt (bb);
++  if (stmt && gimple_code (stmt) == GIMPLE_COND)
++    {
++      edge true_edge = NULL;
++      edge false_edge = NULL;
++      extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
++
++      float true_edge_prob = get_edge_prob (true_edge, minimum);
++      float false_edge_prob = get_edge_prob (false_edge, minimum);
++      /* If the content of the branch does not include the candidate
++         kernel, the branch probability may not be limited.
*/ ++ /* The edge_prob may have precision error during static prediction, ++ so we need to relax the limit before comparison. */ ++ if ((true_edge_prob >= (param_branch_prob_threshold / 100.0) - minimum) ++ && flow_bb_inside_loop_p (bb->loop_father, true_edge->dest)) ++ return true_edge->dest; ++ else if ((false_edge_prob >= (param_branch_prob_threshold / 100.0) ++ - minimum) && flow_bb_inside_loop_p (bb->loop_father, ++ false_edge->dest)) ++ return false_edge->dest; ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "No high probability bb:"); ++ fprintf (dump_file, "current bb: %d, true: %f, false: %f\n", ++ bb->index, true_edge_prob, false_edge_prob); ++ } ++ return NULL; ++ } ++ } ++ else ++ { ++ edge e = find_fallthru_edge (bb->succs); ++ if (e) ++ return e->dest; ++ } ++ return NULL; ++} ++ ++ ++/* Dump loop header bb. */ ++ ++void ++dump_loop_headers (const char *name, vector &loops) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n\n%s:\n", name); ++ fprintf (dump_file, "{ "); ++ for (unsigned int i = 0; i < loops.size (); i++) ++ fprintf (dump_file, "%d(%d) ", loops[i]->num, loops[i]->header->index); ++ fprintf (dump_file, "}\n\n"); ++ } ++} ++ ++/* Combine and sort candidate loops. */ ++ ++bool ++filter_and_sort_kernels (vector &sorted_kernels, ++ vector &kernels) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nPhase 4: filter_and_sort_kernels:\n\n"); ++ ++ set end_bb; ++ list walked_header_bb; /* Used to record nested loops. */ ++ ++ for (unsigned i = 0; i < kernels.size (); ++i) ++ end_bb.insert (kernels[i]->header); ++ ++ dump_loop_headers ("kernels", kernels); ++ ++ if (!param_filter_kernels) ++ { ++ for (vector::iterator it = kernels.begin (); ++ it != kernels.end (); ++it) ++ sorted_kernels.push_back (*it); ++ } ++ else ++ { ++ basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun); ++ ++ while (bb) ++ { ++ if (bb == NULL) ++ return false; ++ if (bb == EXIT_BLOCK_PTR_FOR_FN (cfun)) ++ break; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d ", bb->index); ++ ++ /* bb is not the head of the loop, go to the next. */ ++ if (bb != bb->loop_father->header) ++ { ++ bb = next_high_probability_bb (bb); ++ continue; ++ } ++ ++ /* bb is the head of the loop. */ ++ if (bb != walked_header_bb.back ()) ++ { ++ if (end_bb.count (bb)) ++ { ++ sorted_kernels.push_back (bb->loop_father); ++ bb = single_exit (bb->loop_father)->dest; ++ continue; ++ } ++ if (loop_outer (bb->loop_father) != NULL ++ && get_loop_exit_edges (bb->loop_father).length () != 1) ++ return false; ++ walked_header_bb.push_back (bb); ++ bb = next_high_probability_bb (bb); ++ continue; ++ } ++ else ++ { ++ walked_header_bb.pop_back (); ++ bb = single_exit (bb->loop_father)->dest; ++ continue; ++ } ++ } ++ } ++ ++ dump_loop_headers ("sorted_kernels", sorted_kernels); ++ return true; ++} ++ ++/* ================ phase 5 record_and_sort_ref_groups ================ */ ++/* Memory reference score, different aspects of one memory reference. */ ++ ++struct ref_score ++{ ++ /* certain memory reference. */ ++ data_ref d_ref; ++ ++ /* local count for bb where memory reference is located. */ ++ gcov_type bb_count; ++ ++ /* line-location of memory reference. */ ++ int line; ++}; ++ ++ ++/* Memory reference group, different reference of the same variable. */ ++ ++struct ref_group ++{ ++ /* source variables. */ ++ tree var; ++ ++ /* variable size, Unit: MB. */ ++ double var_size; ++ ++ /* first ref for insert hint. 
*/ ++ data_ref first_use; ++ ++ /* reuse scores of variables. */ ++ unsigned int reuse_level; ++ ++ /* method of calculating the var size. */ ++ calc_type calc_by; ++ ++ /* memory reference index for specific variable. */ ++ unsigned int mem_ref_index; ++ ++ /* Accessing Reference Records in Different Modes (key_index): ++ 000: write, random, non-parallel ++ 001: write, random, parallel ++ 010: write, regular, non-parallel ++ 011: write, regular, parallel ++ 100: read, random, non-parallel ++ 101: read, random, parallel ++ 110: read, regular, non-parallel ++ 111: read, regular, parallel ++ */ ++ map > ref_use; ++ ++ /* scores for different memory references. */ ++ vector ref_scores; ++ ++ ref_group () ++ { ++ var = NULL_TREE; ++ var_size = 0; ++ reuse_level = 0; ++ calc_by = UNHANDLE_CALC; ++ mem_ref_index = 0; ++ } ++}; ++ ++/* calculate reuse level. */ ++ ++unsigned int ++calculate_reuse_level (map > &var_use) ++{ ++ unsigned int level = 0; ++ for (map >::iterator it = var_use.begin (); ++ it != var_use.end (); ++it) ++ { ++ unsigned int parallel = 1; ++ unsigned int regular = 1; ++ unsigned int cost = 1; ++ ++ if ((*it).second[0].parallel_p) ++ parallel = PARALLEL_NUM; ++ if (!(*it).second[0].regular_p) ++ regular = INDIRECT_ACCESS_VALUE; ++ if (!(*it).second[0].read_p) ++ cost = WRITE_COST; ++ ++ /* In serial reuse, we will later check whether they are in the ++ same cacheline. If yes, delete the reuse. For details, see the ++ reuse analysis of prefetching and eliminate redundancy. */ ++ unsigned int add = parallel * ((*it).second.size () * (cost + regular)); ++ level += add; ++ if (add && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "%d : %d * (%ld * (%d + %d)) = %d\n", ++ (*it).first, parallel, (*it).second.size (), cost, regular, add); ++ } ++ return level; ++} ++ ++/* Comparison of reference reuse level. */ ++ ++bool ++ref_group_reuse_cmp (const ref_group &a, const ref_group &b) ++{ ++ return a.reuse_level > b.reuse_level; ++} ++ ++/* Sort reference groups. */ ++ ++void ++sort_ref_groups (vector &ref_groups, ++ map &ref_groups_map) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nsort_ref_groups_by_reuse_level\n"); ++ ++ for (map::iterator it = ref_groups_map.begin (); ++ it != ref_groups_map.end (); ++it) ++ { ++ (*it).second.reuse_level = calculate_reuse_level ((*it).second.ref_use); ++ ref_groups.push_back ((*it).second); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ print_generic_expr (dump_file, (*it).second.var, TDF_SLIM); ++ fprintf (dump_file, " : %d\n", (*it).second.reuse_level); ++ } ++ } ++ ++ sort (ref_groups.begin (), ref_groups.end (), ref_group_reuse_cmp); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nsorted ref_groups:\n"); ++ fprintf (dump_file, "rank var (data_size, num_of_mem_ref, need_tmp_name):" ++ " reuse_level_score\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d ", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ int need_tmp_name = !get_name (ref_groups[i].var) ? 
1 : 0; ++ fprintf (dump_file, " (%lf, %lu, %d)", ref_groups[i].var_size, ++ ref_groups[i].ref_scores.size (), need_tmp_name); ++ fprintf (dump_file, " : %d\n", ref_groups[i].reuse_level); ++ } ++ fprintf (dump_file, "\n"); ++ ++ fprintf (dump_file, "first_use:\n"); ++ for (unsigned int i = 0; i < ref_groups.size (); ++i) ++ { ++ fprintf (dump_file, "%d ", i); ++ print_generic_expr (dump_file, ref_groups[i].var, TDF_SLIM); ++ fprintf (dump_file, " : "); ++ if (!ref_groups[i].first_use.vectorize_p) ++ print_generic_expr (dump_file, ref_groups[i].first_use.ref, ++ TDF_SLIM); ++ else ++ print_gimple_stmt (dump_file, ref_groups[i].first_use.stmt, ++ TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* Attributes of variable data. */ ++ ++enum data_attribute ++{ ++ DA_PARALLEL = 0, ++ DA_REGULAR, ++ DA_READ ++}; ++ ++/* Record memory reference by use mode. ++ If the reference group is not found, create a group. */ ++ ++void ++record_mem_ref (map &ref_groups, data_ref &mem_ref) ++{ ++ unsigned int index = (mem_ref.parallel_p << DA_PARALLEL) ++ + (mem_ref.regular_p << DA_REGULAR) + (mem_ref.read_p << DA_READ); ++ ++ if (!ref_groups.count (mem_ref.var)) ++ { ++ ref_group ref_group; ++ ref_group.var = mem_ref.var; ++ ref_group.first_use = mem_ref; ++ ref_groups[mem_ref.var] = ref_group; ++ } ++ ++ /* Ref_groups' calc_by depends on the inserted mem_ref's calc_by. ++ Runtime issue requires the specified mem_ref's calc_by to be >= 1. ++ Temporarily modified ref_group's first_use after sorting mem_refs. */ ++ ref_groups[mem_ref.var].calc_by = max (ref_groups[mem_ref.var].calc_by, ++ mem_ref.calc_by); ++ ref_groups[mem_ref.var].var_size = max (ref_groups[mem_ref.var].var_size, ++ mem_ref.data_size); ++ ref_groups[mem_ref.var].ref_use[index].push_back (mem_ref); ++ ++ ref_score ref_level{ mem_ref, ((mem_ref.stmt)->bb->count).to_gcov_type (), ++ expand_location (mem_ref.stmt->location).line }; ++ ref_groups[mem_ref.var].ref_scores.push_back (ref_level); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "recorded in: "); ++ print_generic_expr (dump_file, mem_ref.var, TDF_SLIM); ++ fprintf (dump_file, ":%d:%ld\n", index, ++ ref_groups[mem_ref.var].ref_use[index].size () - 1); ++ ++ fprintf (dump_file, "base: "); ++ print_generic_expr (dump_file, mem_ref.base, TDF_SLIM); ++ ++ fprintf (dump_file, ", index: "); ++ print_generic_expr (dump_file, mem_ref.index, TDF_SLIM); ++ ++ fprintf (dump_file, ", step: "); ++ if (mem_ref.step && cst_and_fits_in_hwi (mem_ref.step)) ++ fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, ++ int_cst_value (mem_ref.step)); ++ else ++ print_generic_expr (dump_file, mem_ref.step, TDF_SLIM); ++ ++ fprintf (dump_file, ", offset: "); ++ if (mem_ref.offset && cst_and_fits_in_hwi (mem_ref.offset)) ++ fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, ++ int_cst_value (mem_ref.offset)); ++ else ++ print_generic_expr (dump_file, mem_ref.offset, TDF_SLIM); ++ fprintf (dump_file, ", %s", mem_ref.read_p ? "read" : "write"); ++ ++ fprintf (dump_file, ", size: %lf", mem_ref.data_size); ++ fprintf (dump_file, "\n\n"); ++ } ++} ++ ++/* Rank data reference index level by the scheme of source code line number. */ ++ ++bool ++data_ref_reuse_cmp (const ref_score &a, const ref_score &b) ++{ ++ return a.line < b.line; ++} ++ ++/* Sort data reference index level within one reference group in non-decreasing ++ order of the customized sorting scheme. 
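++   For example (illustrative), references recorded at source lines
++   {57, 12, 40} are reordered to {12, 40, 57}, so first_use becomes the
++   line-12 reference.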
*/
++
++void
++sort_mem_ref_in_ref_group (map<tree, ref_group> &ref_groups_map)
++{
++  if (dump_file)
++    fprintf (dump_file, "\nsorted data_references:\n");
++  for (map<tree, ref_group>::iterator it = ref_groups_map.begin ();
++       it != ref_groups_map.end (); ++it)
++    {
++      vector<ref_score> &ref_scores = (*it).second.ref_scores;
++      stable_sort (ref_scores.begin (), ref_scores.end (), data_ref_reuse_cmp);
++      /* Update ref_group's first_use and calc_by with the first mem_ref after
++         sorting.  */
++      (*it).second.first_use = (*it).second.ref_scores[0].d_ref;
++      (*it).second.calc_by = (*it).second.first_use.calc_by;
++
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          print_generic_expr (dump_file, (*it).first, TDF_SLIM);
++          fprintf (dump_file, " : %lu\n", (unsigned long) ref_scores.size ());
++          for (unsigned int i = 0; i < ref_scores.size (); ++i)
++            {
++              fprintf (dump_file, "mem_ref_index %u: ", i);
++              print_gimple_stmt (dump_file, ref_scores[i].d_ref.stmt, 0,
++                                 TDF_LINENO);
++            }
++          fprintf (dump_file, "\n\n");
++        }
++    }
++}
++
++/* Record and sort reference groups.  */
++
++bool
++record_and_sort_ref_groups (vector<ref_group> &ref_groups,
++                            vector<class loop *> &kernels,
++                            map<class loop *, vector<data_ref>> &loop_refs)
++{
++  if (dump_file)
++    fprintf (dump_file, "\nPhase 5: trace_all_references_details\n\n");
++
++  map<tree, ref_group> ref_groups_map;
++
++  for (unsigned i = 0; i < kernels.size (); ++i)
++    {
++      class loop *loop = kernels[i];
++      if (loop_refs.count (loop) == 0)
++        continue;
++
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "loop header %d:\n", loop->header->index);
++      for (unsigned j = 0; j < loop_refs[loop].size (); ++j)
++        {
++          if (loop_refs[loop][j].trace_status_p)
++            record_mem_ref (ref_groups_map, loop_refs[loop][j]);
++        }
++    }
++
++  /* Sort the mem_refs within each ref_group by source line (stable sort)
++     and update first_use's data_ref accordingly.  */
++  sort_mem_ref_in_ref_group (ref_groups_map);
++  sort_ref_groups (ref_groups, ref_groups_map);
++
++  return ref_groups.size () > 0;
++}
++
++/* ================ phase 6 issue_llc_hint ================ */
++
++/* Issue a vectorized mask prefetch gimple.  */
++
++void
++issue_mask_prefetch (gimple *stmt)
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "insert svprfd.\n");
++
++  /* vect__1.1 = .MASK_LOAD (_2, 32B, loop_mask_3);
++     .MASK_STORE (_4, 32B, loop_mask_5, vect__6.6);
++  */
++  tree dataref_ptr = gimple_call_arg (stmt, 0);
++  tree scale = gimple_call_arg (stmt, 1);
++  tree final_mask = gimple_call_arg (stmt, 2);
++  tree target = NULL_TREE;
++  if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
++    target = gimple_call_arg (stmt, 3);
++  else if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD)
++    target = gimple_call_lhs (stmt);
++  /* 4: PLDL3KEEP.  */
++  tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4);
++
++  /* Add the offset.  */
++  gimple_stmt_iterator si = gsi_for_stmt (stmt);
++  /* target: vector_type - XXX_type.
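++     For example (illustrative), if TARGET has type vector(4) float, the
++     element size unit is 4 bytes and the prefetch distance below becomes
++     param_prefetch_offset * 4.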
*/ ++ if (target == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "unhandled scene: target vect is null"); ++ return; ++ } ++ HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi ++ (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target)))); ++ tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, ++ NULL, true, GSI_SAME_STMT); ++ ++ gcall *call = gimple_build_call_internal (IFN_MASK_PREFETCH, ++ 5, addr, scale, final_mask, target, prfop); ++ gsi_insert_after (&si, call, GSI_SAME_STMT); ++ update_ssa (TODO_update_ssa_only_virtuals); ++} ++ ++/* Issue vectorized mask gather prefetch gimple. */ ++ ++void ++issue_mask_gather_prefetch (gimple *stmt) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "insert svprfd_gather_uxindex.\n"); ++ ++ /* vect_patt_1.1 = .MASK_GATHER_LOAD (_2, vect__3.3, 8, { 0.0, ... }, ++ loop_mask_4); */ ++ tree dataref_ptr = gimple_call_arg (stmt, 0); ++ tree vec_offset = gimple_call_arg (stmt, 1); ++ tree scale = gimple_call_arg (stmt, 2); ++ tree zero = gimple_call_arg (stmt, 3); ++ tree final_mask = gimple_call_arg (stmt, 4); ++ tree prfop = build_int_cst (TREE_TYPE (integer_zero_node), 4); ++ tree target = gimple_call_lhs (stmt); ++ ++ /* add offset. */ ++ gimple_stmt_iterator si = gsi_for_stmt (stmt); ++ if (target == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "unhandled scene: target vect is null"); ++ return; ++ } ++ HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi ++ (TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (target)))); ++ tree addr = fold_build_pointer_plus_hwi (dataref_ptr, distance); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, ++ NULL, true, GSI_SAME_STMT); ++ ++ gcall *call = gimple_build_call_internal ++ (IFN_MASK_GATHER_PREFETCH, 7, addr, ++ vec_offset, scale, zero, final_mask, target, prfop); ++ gsi_insert_after (&si, call, GSI_SAME_STMT); ++ update_ssa (TODO_update_ssa_only_virtuals); ++} ++ ++/* Issue builtin prefetch gimple. */ ++ ++void ++issue_builtin_prefetch (data_ref &mem_ref) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "insert prfm.\n"); ++ /* MEM[symbol: diagPtr, index: ivtmp_102, step: 8, offset: 0B] */ ++ gimple* stmt = mem_ref.stmt; ++ tree dataref_ptr = mem_ref.base; ++ tree data_idx = mem_ref.index; ++ tree scale = mem_ref.step; ++ tree offset = mem_ref.offset; ++ /* add offset. */ ++ gimple_stmt_iterator si = gsi_for_stmt (stmt); ++ if (scale == NULL_TREE) ++ { ++ /* _190 = (void *) ivtmp.444_221; ++ Cannot detect size unit at (void *). */ ++ scale = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (mem_ref.var))); ++ if (scale == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "ERROR: Unknown size unit for the prefetching " ++ "variable. Stop builtin_prefetch.\n\n"); ++ return; ++ } ++ } ++ ++ data_idx = data_idx ? data_idx : size_zero_node; ++ data_idx = build1 (NOP_EXPR, TREE_TYPE (scale), data_idx); ++ tree displacement = fold_build2 (MULT_EXPR, TREE_TYPE (scale), data_idx, ++ scale); ++ if (offset != NULL_TREE && TREE_CODE (offset) != TREE_CODE (size_zero_node)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "WARNING: offset's TREE_TYPE is not integer_cst: " ++ "%s\nStop builtin_prefetch.\n", ++ get_tree_code_name (TREE_CODE (offset))); ++ return; ++ } ++ offset = offset ? 
offset : size_zero_node; ++ offset = build1 (NOP_EXPR, TREE_TYPE (scale), offset); ++ dataref_ptr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (dataref_ptr), ++ dataref_ptr, offset); ++ tree addr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (dataref_ptr), ++ dataref_ptr, displacement); ++ HOST_WIDE_INT distance = param_prefetch_offset * tree_to_uhwi (scale); ++ ++ addr = fold_build_pointer_plus_hwi (addr, distance); ++ addr = force_gimple_operand_gsi (&si, unshare_expr (addr), true, ++ NULL, true, GSI_SAME_STMT); ++ /* __builtin_prefetch (_68, 0, 1); ++ 1st param: *addr, 2nd param: write/read (1/0), 3rd param: temporal locality ++ (high means strong locality) */ ++ gcall *call = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), ++ 3, addr, integer_zero_node, integer_one_node); ++ gsi_insert_after (&si, call, GSI_SAME_STMT); ++ update_ssa (TODO_update_ssa_only_virtuals); ++} ++ ++/* Retrieve memory reference at the specific index. */ ++ ++data_ref ++get_data_ref_at_idx (ref_group &var_ref_group) ++{ ++ unsigned int mem_ref_size = static_cast( ++ var_ref_group.ref_scores.size ()); ++ if (strlen (param_mem_ref_index) == 0) ++ return var_ref_group.first_use; ++ else ++ { ++ /* Insert prefetch hint at highly-likely-used location with the given ++ index. */ ++ if (var_ref_group.mem_ref_index >= mem_ref_size) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "WARNING: The target data_ref index is out " ++ "of range. Use top index instead!\n"); ++ return var_ref_group.ref_scores[0].d_ref; ++ } ++ return var_ref_group.ref_scores[var_ref_group.mem_ref_index].d_ref; ++ } ++} ++ ++/* Static form insertion and issue instruction. We may check the ++ determination of the ARM SVE architecture before SVE hint insertion. */ ++ ++void ++static_issue (vector &ref_groups, int num_issue_var) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "static issue\n"); ++ ++ for (int i = 0; i < num_issue_var; ++i) ++ { ++ data_ref mem_ref = get_data_ref_at_idx (ref_groups[i]); ++ if (mem_ref.vectorize_p) ++ { ++ enum internal_fn ifn_code = gimple_call_internal_fn ++ (mem_ref.stmt); ++ if (ifn_code == IFN_MASK_STORE || ifn_code == IFN_MASK_LOAD) ++ issue_mask_prefetch (mem_ref.stmt); ++ else if (ifn_code == IFN_MASK_GATHER_LOAD) ++ issue_mask_gather_prefetch (mem_ref.stmt); ++ else ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "other vectorized internal function\n"); ++ } ++ else ++ issue_builtin_prefetch (mem_ref); ++ } ++} ++ ++/* Generate the stmts for calculating the size. Later we will consider nested ++ multi-branches scenarios and check more information of niters when it is ++ a COND_EXPR. */ ++ ++tree ++calc_stmts_gen (vector &ref_groups, gimple_seq &cond_expr_stmt_list, ++ int num_issue_var) ++{ ++ /* Accumulated keep size. */ ++ tree total_size = build_real_from_int_cst ++ (double_type_node, integer_zero_node); ++ for (int i = 0; i < num_issue_var; ++i) ++ { ++ data_ref &mem_ref = ref_groups[i].first_use; ++ tree var = mem_ref.var; ++ for (unsigned j = 0; j < mem_ref.loop_bounds.size (); ++j) ++ { ++ tree niters = mem_ref.loop_bounds[j].niters; ++ ++ /* COND_EXPR. */ ++ if (TREE_CODE (niters) == COND_EXPR) ++ niters = TREE_OPERAND (niters, 1); ++ tree unit = TYPE_SIZE_UNIT (inner_type (TREE_TYPE (var))); ++ /* _190 = (void *) ivtmp.444_221; ++ Cannot detect size unit at (void *). 
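++             In that case a 1-byte unit is assumed below.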
*/ ++ if (unit == NULL_TREE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "WARNING: Cannot detect size unit " ++ "(use 1 byte) for variable %s: ", get_name (var)); ++ print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ unit = size_one_node; ++ } ++ unit = build1 (NOP_EXPR, TREE_TYPE (niters), unit); ++ tree size = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, unit); ++ size = build1 (FLOAT_EXPR, double_type_node, size); ++ total_size = fold_build2 ++ (PLUS_EXPR, double_type_node, total_size, size); ++ } ++ } ++ /* Create a stmt list for size calculation. */ ++ tree div = build_int_cst (TREE_TYPE (integer_zero_node), 1024 * 1024); ++ div = build1 (NOP_EXPR, double_type_node, div); ++ total_size = fold_build2 (RDIV_EXPR, double_type_node, total_size, div); ++ ++ tree threshold = build_int_cst (TREE_TYPE (integer_zero_node), ++ param_llc_capacity_per_core / 2); ++ threshold = build_real_from_int_cst (double_type_node, threshold); ++ tree cond_expr = fold_build2 ++ (LE_EXPR, boolean_type_node, total_size, threshold); ++ ++ /* Convert cond_expr to stmt list. */ ++ cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr), ++ &cond_expr_stmt_list, is_gimple_condexpr, NULL_TREE); ++ return cond_expr; ++} ++ ++/* Runtime form insertion and issue instruction. */ ++ ++void ++runtime_issue (vector &ref_groups, int num_issue_var) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "runtime issue\n"); ++ ++ if (ref_groups.size () == 0) ++ return; ++ data_ref &mem_ref = ref_groups[0].first_use; ++ class loop *loop = mem_ref.loop_bounds.back ().loop; ++ /* Ensure that variables are in the same loop. */ ++ for (int i = 1; i < num_issue_var; ++i) ++ { ++ data_ref &mem_ref = ref_groups[i].first_use; ++ if (loop != mem_ref.loop_bounds.back ().loop) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "topn var are not in the same loop\n"); ++ return; ++ } ++ } ++ if (loop == NULL) ++ return; ++ ++ /* If the exit edge points to bb with multiple inputs, split the exit edge ++ and create a new bb, make the exit edge point to bb only single input. */ ++ edge e = single_exit (loop); ++ if (e == NULL) ++ return; ++ if (!single_pred_p (e->dest)) ++ { ++ split_loop_exit_edge (e, true); ++ if (dump_enabled_p ()) ++ dump_printf (MSG_NOTE, "split exit edge\n"); ++ } ++ ++ gimple_seq cond_expr_stmt_list = NULL; ++ tree cond_expr = calc_stmts_gen (ref_groups, cond_expr_stmt_list, ++ num_issue_var); ++ ++ /* Use the previous cond and generate a new branch and copy loop. */ ++ basic_block condition_bb = NULL; ++ profile_probability prob = profile_probability::likely (); ++ initialize_original_copy_tables (); ++ class loop *nloop = loop_version (loop, cond_expr, &condition_bb, ++ prob, prob.invert (), prob, prob.invert (), true); ++ free_original_copy_tables (); ++ ++ /* Insert the generated stmt list before cond_expr. */ ++ gimple_stmt_iterator cond_exp_gsi; ++ if (cond_expr_stmt_list) ++ { ++ cond_exp_gsi = gsi_last_bb (condition_bb); ++ gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list, ++ GSI_SAME_STMT); ++ } ++ update_ssa (TODO_update_ssa); ++ ++ /* Perform hint issue for branches that meet conditions. */ ++ static_issue (ref_groups, num_issue_var); ++} ++ ++/* Issue llc hints through prefetch instructions. */ ++ ++void ++issue_llc_hint (vector &ref_groups) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "issue_llc_hint:\n"); ++ ++ /* 1. 
If the issue-topn and force-issue options are set, the top N variables
++      are issued unconditionally and no runtime branch is generated.
++   2. If the issue-topn option is set and the size of the top N variables is
++      statically known, the top N are statically allocated and no runtime
++      branch is generated.
++   3. If the issue-topn option is set and the size of the top N variables is
++      statically unknown but computable at run time, the top N are
++      dynamically allocated and runtime branches are generated.  (This also
++      depends on the screening of the innermost variable boundary type.)
++   4. If even the dynamic runtime cannot know the size, e.g., for indirect
++      accesses, the optimization is skipped.
++   */
++  if (ref_groups.size () == 0)
++    return;
++
++  int num_issue_var = min (param_issue_topn,
++                           static_cast<int> (ref_groups.size ()));
++  if (num_issue_var < param_issue_topn
++      && dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "WARNING: Only %d (less than param_issue_topn = %d) "
++               "ref_group(s) is found for llc hint.\n",
++               num_issue_var, param_issue_topn);
++    }
++  if (param_force_issue)
++    {
++      if (strlen (param_target_variables) > 0)
++        static_issue (ref_groups, static_cast<int> (ref_groups.size ()));
++      else
++        static_issue (ref_groups, num_issue_var);
++      return;
++    }
++  calc_type topn_calc_type = STATIC_CALC;
++  for (int i = 0; i < num_issue_var; ++i)
++    topn_calc_type = min (topn_calc_type, ref_groups[i].calc_by);
++
++  if (topn_calc_type == STATIC_CALC)
++    {
++      /* Before static issue, we still need to collect the data sizes of all
++         target variables and compare their sum with the LLC cache size.  */
++      double prefetch_data_size = 0.;
++      for (int i = 0; i < num_issue_var; ++i)
++        prefetch_data_size += ref_groups[i].var_size;
++      if (prefetch_data_size <= (double) param_llc_capacity_per_core * 0.8)
++        static_issue (ref_groups, num_issue_var);
++      else
++        if (dump_file && (dump_flags & TDF_DETAILS))
++          fprintf (dump_file, "static issue: Prefetch size exceeds LLC cache "
++                   "size: %lf > %lf.\n", prefetch_data_size,
++                   (double) param_llc_capacity_per_core * 0.8);
++    }
++  else if (topn_calc_type == RUNTIME_CALC)
++    runtime_issue (ref_groups, num_issue_var);
++  else
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "unhandled issue scene\n");
++    }
++}
++
++/* ==================== phase entry ==================== */
++/* Check whether a string can be converted to an unsigned integer.  */
++
++bool
++is_unsigned_int (const string &s)
++{
++  if (s.empty () || s.size () > PREFETCH_TOOL_NUM_MAX_LEN)
++    return false;
++
++  for (unsigned int i = 0; i < s.size (); ++i)
++    {
++      if (s[i] < '0' || s[i] > '9')
++        return false;
++    }
++  return true;
++}
++
++/* Parse a substring separated by comma.  If the substring is valid and
++   non-empty, store it as a parsed element.  */
++
++bool
++parse_string_helper (const string &substr, vector<string> &str_elts,
++                     bool check_unsigned, size_t start, size_t end)
++{
++  if (substr == "")
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "WARNING: The input string from %lu to %lu is "
++                 "empty.\n", start, end);
++    }
++  else if (check_unsigned && !is_unsigned_int (substr))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "ERROR: not an unsigned integer: %s\n",
++                 substr.c_str ());
++      str_elts.clear ();
++      return false;
++    }
++  else
++    str_elts.push_back (substr);
++  return true;
++}
++
++/* Parse a user input string, separated by comma.
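++   For example (illustrative), "foo,bar,2" parses into {"foo", "bar", "2"};
++   with CHECK_UNSIGNED set, a non-numeric element such as "bar" clears
++   STR_ELTS and aborts the parse.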
*/ ++ ++void ++parse_string (const string &s, vector& str_elts, ++ bool check_unsigned = false) ++{ ++ string delim = ","; ++ size_t start = 0; ++ size_t end = s.find (delim); ++ string substr = s.substr (start, end - start); ++ while (end != string::npos) ++ { ++ if (!parse_string_helper (substr, str_elts, check_unsigned, start, end)) ++ return; ++ start = end + delim.size (); ++ end = s.find (delim, start); ++ substr = s.substr (start, end - start); ++ } ++ parse_string_helper (substr, str_elts, check_unsigned, start, end); ++} ++ ++/* Parse user input of target variables and memory indices and create a map ++ that assigns a target variable to a memory index. */ ++ ++void ++parse_param_inputs (map &var2mem_idx) ++{ ++ /* The user input length should have an input length limit. */ ++ if ((strlen (param_target_variables) >= PREFETCH_TOOL_INPUT_MAX_LEN ++ || strlen (param_mem_ref_index) >= PREFETCH_TOOL_INPUT_MAX_LEN) ++ && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "INVALID INPUT: The user inputs for target variables " ++ "and/or memory reference indices are too long for parsing.\n"); ++ ++ vector var_names; ++ string target_variables = param_target_variables; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Start parsing target variables:\n"); ++ if (param_use_ref_group_index) ++ parse_string (target_variables, var_names, true); ++ else ++ parse_string (target_variables, var_names, false); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Finish parsing target variables.\n\n"); ++ ++ vector var_mem_indices; ++ string mem_indices = param_mem_ref_index; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Start parsing memory reference indices:\n"); ++ parse_string (mem_indices, var_mem_indices, true); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Finish parsing memory reference indices.\n\n"); ++ ++ /* Construct a map of var_name: var_mem_index. */ ++ if (var_names.size () > 0) ++ { ++ if (var_mem_indices.size () < var_names.size ()) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "WARNING: The number of provided memory " ++ "reference indices is less than that of target " ++ "variables.\nUse the top index for all variables " ++ "instead.\n"); ++ for (string& var_name : var_names) ++ var2mem_idx[var_name] = 0; ++ } ++ else ++ { ++ if (var_mem_indices.size () > var_names.size () ++ && dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "WARNING: The number of target variables is " ++ "less than that of memory reference indices.\n"); ++ for (unsigned int i = 0; i < var_names.size (); ++i) ++ { ++ var2mem_idx[var_names[i]] = static_cast( ++ atoi (var_mem_indices[i].c_str ())); ++ } ++ } ++ } ++} ++ ++/* Filter reference groups by only selecting target variables from the user ++ input. There are two options for prefetching target variables: ++ 1. Specify variable name parsed by the pass, which you can double-check at ++ "sorted ref_groups" section in the dump file. ++ 2. Specify variable rank exhibited at "sorted ref_groups" section in the ++ dump file. ++*/ ++ ++void ++prefetch_variables (const vector& ref_groups, ++ vector& reduced_ref_groups) ++{ ++ map ref_group2mem_idx; ++ ++ map var2mem_idx; /* externally defined. */ ++ parse_param_inputs (var2mem_idx); ++ ++ if (param_use_ref_group_index) ++ { ++ /* Use ref_group index at "sorted ref_groups" section to specify ++ variable. 
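++         For example (illustrative), param_target_variables = "0,2" with
++         param_mem_ref_index = "1,0" selects ref_group 0 at mem_ref index 1
++         and ref_group 2 at mem_ref index 0.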
*/
++      /* Collect the variables in "reduced_ref_group" only if their indices
++         show up at "sorted ref_groups" section.  */
++      for (const pair<string, unsigned int> &var_mem_idx : var2mem_idx)
++        {
++          unsigned int var_idx = static_cast<unsigned int> (atoi (
++            var_mem_idx.first.c_str ()));
++          if (var_idx < ref_groups.size ())
++            ref_group2mem_idx[var_idx] = var_mem_idx.second;
++          else if (dump_file && (dump_flags & TDF_DETAILS))
++            fprintf (dump_file, "WARNING: The index \"%u\" does not show "
++                     "up in the ref_groups.\n", var_idx);
++        }
++    }
++  else
++    {
++      /* Use the variable name shown at the "sorted ref_groups" section to
++         specify a variable:
++         var2ref_group_idx + var2mem_idx -> ref_group2mem_idx.  */
++      /* Create a map that assigns the variable name to its corresponding
++         ref_group index.  */
++      map<string, unsigned int> var2ref_group_idx; /* internally detected.  */
++      for (unsigned int i = 0; i < ref_groups.size (); ++i)
++        {
++          const ref_group &curr_ref_group = ref_groups[i];
++          const int UINT_MAX_DIGIT = 11;
++          /* Unrecognizable variable name related to the ref_group.  */
++          if (!get_name (curr_ref_group.var))
++            {
++              /* If the variable name does not have a string representation,
++                 we can rename it as "tmp_var_" + <ref_group index>.  */
++              char group_idx[UINT_MAX_DIGIT];
++              snprintf (group_idx, sizeof (group_idx), "%u", i);
++              string tmp_var_name = "tmp_var_" + std::string (group_idx);
++              if (dump_file && (dump_flags & TDF_DETAILS))
++                {
++                  fprintf (dump_file, "Unrecognizable variable name at "
++                           "ref_group index %u.\nThe tree expression for "
++                           "variable is: ", i);
++                  print_generic_expr (dump_file, curr_ref_group.var, TDF_SLIM);
++                  fprintf (dump_file, "\n");
++                }
++              var2ref_group_idx[tmp_var_name] = i;
++            }
++          else
++            var2ref_group_idx[std::string (get_name (curr_ref_group.var))] = i;
++        }
++      /* Collect the variables in "reduced_ref_group" only if they show up in
++         the ref_groups.  */
++      for (const pair<string, unsigned int> &var_mem_idx : var2mem_idx)
++        {
++          if (var2ref_group_idx.count (var_mem_idx.first))
++            {
++              unsigned int ref_group_idx = var2ref_group_idx[var_mem_idx.first];
++              ref_group2mem_idx[ref_group_idx] = var_mem_idx.second;
++            }
++          else if (dump_file && (dump_flags & TDF_DETAILS))
++            fprintf (dump_file, "WARNING: Target variable \" %s \" does "
++                     "not show up in the ref_groups. Check whether it needs "
++                     "temporary variable name.\n",
++                     var_mem_idx.first.c_str ());
++        }
++    }
++
++  for (const pair<unsigned int, unsigned int> &ref_group_mem_idx :
++       ref_group2mem_idx)
++    {
++      ref_group curr_ref_group = ref_groups[ref_group_mem_idx.first];
++      curr_ref_group.mem_ref_index = ref_group_mem_idx.second;
++      reduced_ref_groups.push_back (curr_ref_group);
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "\nNOTICE: Prefetching target variable \" ");
++          print_generic_expr (dump_file, curr_ref_group.var, TDF_SLIM);
++          fprintf (dump_file, " \" at ref_group index %u and memory location "
++                   "index %u.\n", ref_group_mem_idx.first,
++                   ref_group_mem_idx.second);
++        }
++    }
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "\n\n");
++}
++
++
++/* The LLC intelligent allocation consists of 6 steps.
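++   1) get dense memory kernels; 2) trace memory reference information;
++   3) analyze nested kernels; 4) filter and sort kernels; 5) record and
++   sort reference groups; 6) issue LLC hints.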
*/ ++ ++void ++llc_allocate (void) ++{ ++ map > kernels_refs; ++ vector kernels; ++ if (!get_dense_memory_kernels (kernels, kernels_refs)) ++ return; ++ ++ trace_data_refs_info (kernels, kernels_refs); ++ ++ if (!analyze_nested_kernels (kernels, kernels_refs)) ++ return; ++ ++ vector sorted_kernels; ++ if (!filter_and_sort_kernels (sorted_kernels, kernels)) ++ return; ++ ++ vector ref_groups; ++ if (!record_and_sort_ref_groups (ref_groups, sorted_kernels, kernels_refs)) ++ return; ++ ++ if (strlen (param_target_variables) > 0) ++ { ++ /* If "param_target_variables" is not empty, we will issue parsed target ++ variables compulsorily. */ ++ param_force_issue = true; ++ vector reduced_ref_groups; ++ prefetch_variables (ref_groups, reduced_ref_groups); ++ issue_llc_hint (reduced_ref_groups); ++ } ++ else ++ issue_llc_hint (ref_groups); ++} ++ ++/* Check whether the function is an operator reloading function. */ ++ ++bool ++operator_func_p (function *fn) ++{ ++ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl)); ++ ++ if (fn_name && strncmp (fn_name, "operator", 8) == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "operator_func: %s ", fn_name); ++ ++ return true; ++ } ++ return false; ++} ++ ++/* Check whether the function file location is known. */ ++ ++bool ++func_location_p (function *fn) ++{ ++ expanded_location fn_decl_xloc ++ = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); ++ expanded_location fn_xloc ++ = expand_location (fn->function_start_locus); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "fn->function_start_locus = %d \n", ++ fn->function_start_locus); ++ fprintf (dump_file, "fn_xloc.file = %s \n", ++ fn_xloc.file ? fn_xloc.file : "NULL"); ++ fprintf (dump_file, "fn_decl_xloc.file = %s \n", ++ fn_decl_xloc.file ? fn_decl_xloc.file : "NULL"); ++ fprintf (dump_file, "LOCATION_FILE (input_location) = %s \n", ++ LOCATION_FILE (input_location) ? LOCATION_FILE (input_location) ++ : "NULL"); ++ } ++ if (fn_decl_xloc.file == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Function location unknown, skip analysis \n"); ++ return false; ++ } ++ /* Newly generated functions are filtered out, such as function constant ++ propagation func.constprop (). */ ++ if (LOCATION_FILE (input_location) != fn_decl_xloc.file) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Function location non-local, skip analysis \n"); ++ return false; ++ } ++ return true; ++} ++ ++/* Dump function information. */ ++ ++void ++dump_function_info (function *fn) ++{ ++ const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fn->decl)); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nfn_name: %s\n", fn_name); ++ expanded_location cfun_xloc ++ = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); ++ if (cfun_xloc.line) ++ { ++ if (cfun_xloc.file) ++ fprintf (dump_file, "[%s:%d:%d]\n", ++ cfun_xloc.file, cfun_xloc.line, cfun_xloc.column); ++ } ++ fprintf (dump_file, "\n"); ++ flow_loops_dump (dump_file, NULL, 1); ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++/* dump param. 
*/ ++ ++void ++dump_param (void) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "LLC allocate parameters:\n"); ++ fprintf (dump_file, " block size: %d\n", param_l1_cache_line_size); ++ fprintf (dump_file, " L1 cache size: %d lines, %d kB\n", ++ param_l1_cache_size * 1024 / param_l1_cache_line_size, ++ param_l1_cache_size); ++ fprintf (dump_file, " L1 cache line size: %d\n", ++ param_l1_cache_line_size); ++ fprintf (dump_file, " L2 cache size: %d kB\n", param_l2_cache_size); ++ fprintf (dump_file, " min mem_access_ratio: %d \n", ++ param_mem_access_ratio); ++ fprintf (dump_file, " min mem_access_num: %d \n", ++ param_mem_access_num); ++ fprintf (dump_file, "\n"); ++ } ++} ++ ++const pass_data pass_data_llc_allocate = ++{ ++ GIMPLE_PASS, /* type. */ ++ "llc_allocate", /* name. */ ++ OPTGROUP_LOOP, /* optinfo_flags. */ ++ TV_TREE_PREFETCH, /* tv_id. */ ++ (PROP_cfg | PROP_ssa), /* properties_required. */ ++ 0, /* properties_provided. */ ++ 0, /* properties_destroyed. */ ++ 0, /* todo_flags_start. */ ++ 0, /* todo_flags_finish. */ ++}; ++ ++class pass_llc_allocate : public gimple_opt_pass ++{ ++public: ++ pass_llc_allocate (gcc::context *ctxt) ++ : gimple_opt_pass (pass_data_llc_allocate, ctxt) ++ {} ++ ++ /* opt_pass methods. */ ++ virtual bool gate (function *) ++ { ++ return (optimize >= 2 && flag_llc_allocate > 0); ++ } ++ virtual unsigned int execute (function *); ++ ++}; // class pass_llc_allocate ++ ++unsigned int ++pass_llc_allocate::execute (function *fn) ++{ ++ unsigned int ret = 0; ++ ++ if (!targetm.have_prefetch () ++ || targetm.vectorize.code_for_prefetch == NULL ++ || targetm.vectorize.prefetch_handleable_mode_p == NULL ++ || targetm.vectorize.code_for_gather_prefetch == NULL) ++ return 0; ++ ++ if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH)) ++ { ++ tree type = build_function_type_list (void_type_node, ++ const_ptr_type_node, NULL_TREE); ++ tree decl = add_builtin_function ("__builtin_prefetch", type, ++ BUILT_IN_PREFETCH, BUILT_IN_NORMAL, ++ NULL, NULL_TREE); ++ DECL_IS_NOVOPS (decl) = true; ++ set_builtin_decl (BUILT_IN_PREFETCH, decl, false); ++ } ++ ++ dump_param (); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "llc_allocate: %s\n", ++ IDENTIFIER_POINTER (DECL_NAME (fn->decl))); ++ ++ if (number_of_loops (fn) <= 1 || !func_location_p (fn) ++ || operator_func_p (fn)) ++ return ret; ++ ++ dump_function_info (fn); ++ ++ llc_allocate (); ++ ++ return ret; ++} ++ ++} // anon namespace ++ ++gimple_opt_pass * ++make_pass_llc_allocate (gcc::context *ctxt) ++{ ++ return new pass_llc_allocate (ctxt); ++} +diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c +index 7775bc727..c500d5e20 100644 +--- a/gcc/tree-ssa-loop-niter.c ++++ b/gcc/tree-ssa-loop-niter.c +@@ -2384,6 +2384,37 @@ loop_only_exit_p (const class loop *loop, basic_block *body, const_edge exit) + return true; + } + ++/* Returns whether the number of vectorized iterations for the loop can be ++ estimated from the given IR and update the corresponding loop attribute, ++ e.g., next_mask_114 = .WHILE_ULT (_122, niters.5_75, { 0, ... }); */ ++ ++bool ++number_of_iterations_vect (class loop *loop, tree lhs, tree rhs) ++{ ++ loop->vec_nb_iterations = chrec_dont_know; ++ ++ if ((TREE_CODE (lhs) != SSA_NAME && TREE_CODE (rhs) != SSA_NAME) ++ || (TREE_CODE (lhs) == SSA_NAME && TREE_CODE (rhs) == SSA_NAME)) ++ return false; ++ ++ tree ssa = TREE_CODE (lhs) == SSA_NAME ? 
lhs : rhs;
++  gimple *def_stmt = SSA_NAME_DEF_STMT (ssa);
++
++  if (gimple_code (def_stmt) != GIMPLE_CALL
++      || !gimple_call_internal_p (def_stmt))
++    return false;
++
++  internal_fn ifn = gimple_call_internal_fn (def_stmt);
++  if (ifn != IFN_WHILE_ULT)
++    return false;
++
++  gcall *call = dyn_cast <gcall *> (def_stmt);
++  tree niters = gimple_call_arg (call, 1);
++  loop->vec_nb_iterations = niters;
++
++  return true;
++}
++
+ /* Stores description of number of iterations of LOOP derived from
+    EXIT (an exit edge of the LOOP) in NITER.  Returns true if some useful
+    information could be derived (and fields of NITER have meaning described
+@@ -2454,6 +2485,9 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit,
+   op1 = gimple_cond_rhs (stmt);
+   type = TREE_TYPE (op0);
+ 
++  if (TREE_CODE (type) == VECTOR_TYPE)
++    number_of_iterations_vect (loop, op0, op1);
++
+   if (TREE_CODE (type) != INTEGER_TYPE
+       && !POINTER_TYPE_P (type))
+     return false;
+@@ -2730,14 +2764,14 @@ bool
+ number_of_iterations_exit (class loop *loop, edge exit,
+ 			   class tree_niter_desc *niter,
+ 			   bool warn, bool every_iteration,
+-			   basic_block *body)
++			   basic_block *body, bool guarantee)
+ {
+   gcond *stmt;
+   if (!number_of_iterations_exit_assumptions (loop, exit, niter,
+ 					      &stmt, every_iteration, body))
+     return false;
+ 
+-  if (integer_nonzerop (niter->assumptions))
++  if (integer_nonzerop (niter->assumptions) || guarantee == false)
+     return true;
+ 
+   if (warn && dump_enabled_p ())
+diff --git a/gcc/tree-ssa-loop-niter.h b/gcc/tree-ssa-loop-niter.h
+index eb8d15794..d38472e52 100644
+--- a/gcc/tree-ssa-loop-niter.h
++++ b/gcc/tree-ssa-loop-niter.h
+@@ -27,7 +27,8 @@ extern bool loop_only_exit_p (const class loop *, basic_block *body,
+ extern bool number_of_iterations_exit (class loop *, edge,
+ 				       class tree_niter_desc *niter, bool,
+ 				       bool every_iteration = true,
+-				       basic_block * = NULL);
++				       basic_block * = NULL,
++				       bool guarantee = true);
+ extern bool number_of_iterations_exit_assumptions (class loop *, edge,
+ 						   class tree_niter_desc *,
+ 						   gcond **, bool = true,
+-- 
+2.33.0
+
diff --git a/0153-LLC-add-extending-outer-loop.patch b/0153-LLC-add-extending-outer-loop.patch
new file mode 100644
index 0000000..fef87dc
--- /dev/null
+++ b/0153-LLC-add-extending-outer-loop.patch
@@ -0,0 +1,1285 @@
+From 4a365290cd9563385d32a22f7b1532c50b69e063 Mon Sep 17 00:00:00 2001
+From: zhaoshujian
+Date: Mon, 11 Dec 2023 15:06:28 +0800
+Subject: [PATCH] LLC add extending outer loop
+
+
+diff --git a/gcc/params.opt b/gcc/params.opt
+index c429359e3..227175eef 100644
+--- a/gcc/params.opt
++++ b/gcc/params.opt
+@@ -1058,4 +1058,10 @@ Common Joined UInteger Var(param_filter_kernels) Init(1) IntegerRange(0, 1) Para
+ Allow LLC allocate pass to greedily filter kernels by traversing the corresponding basic blocks
+ through edges with branch probability no less than param_branch_prob_threshold.
+ 
++-param=outer-loop-nums=
++Common Joined UInteger Var(param_outer_loop_num) Init(1) IntegerRange(1, 10) Param
++Maximum number of enclosing loops that the analysis may extend to for loops
++whose inner-loop bounds cannot be recognized.
++
++
+ ; This comment is to ensure we retain the blank line above.
+diff --git a/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp
+new file mode 100644
+index 000000000..9e98191ed
+--- /dev/null
++++ b/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp
+@@ -0,0 +1,27 @@
++# Copyright (C) 1997-2022 Free Software Foundation, Inc.
++
++# This program is free software; you can redistribute it and/or modify
++# it under the terms of the GNU General Public License as published by
++# the Free Software Foundation; either version 3 of the License, or
++# (at your option) any later version.
++#
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++# GNU General Public License for more details.
++#
++# You should have received a copy of the GNU General Public License
++# along with GCC; see the file COPYING3.  If not see
++# <http://www.gnu.org/licenses/>.
++
++load_lib g++-dg.exp
++load_lib target-supports.exp
++
++# Initialize `dg'.
++dg-init
++
++dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.C]] \
++	"" "-fllc-allocate"
++
++# All done.
++dg-finish
+\ No newline at end of file
+diff --git a/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C b/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C
+new file mode 100644
+index 000000000..44a9d7c66
+--- /dev/null
++++ b/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C
+@@ -0,0 +1,53 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50 --param filter-kernels=0 --param mem-access-num=2 --param issue-topn=1 --param force-issue=1" } */
++#include "multidim_array.h"
++
++class Input
++{
++ public:
++    int metadata_offset = 13;
++    int exp_nr_images = 1;
++    MultidimArray<double> exp_Mweight;
++    void convertAllSquaredDifferencesToWeights();
++};
++
++int main()
++{
++    clock_t start = clock();
++    Input input;
++    int testIter = 2;
++
++    for (int i = 0; i < testIter; ++i)
++    {
++        input.convertAllSquaredDifferencesToWeights();
++    }
++    return 0;
++}
++
++void Input::convertAllSquaredDifferencesToWeights()
++{
++    for (int img_id = 0; img_id < exp_nr_images; img_id++)
++    {
++        int my_metadata_offset = metadata_offset + img_id;
++        MultidimArray<double> sorted_weight;
++
++        exp_Mweight.getRow(img_id, sorted_weight);
++        long int np = 0;
++        FOR_ALL_DIRECT_ELEMENTS_IN_MULTIDIMARRAY(sorted_weight)
++        {
++            if (DIRECT_MULTIDIM_ELEM(sorted_weight, n) > 0.)
++            {
++                DIRECT_MULTIDIM_ELEM(sorted_weight, np) = DIRECT_MULTIDIM_ELEM( \
++                    sorted_weight, n);
++                np++;
++            }
++        }
++    }
++}
++
++
++
++/* { dg-final { scan-tree-dump-times "dense memory access" 1 "llc_allocate" } } */
++/* { dg-final { scan-tree-dump-times "__builtin_prefetch" 1 "llc_allocate" } } */
++/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */
++
+diff --git a/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h b/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h
+new file mode 100644
+index 000000000..d65066ebf
+--- /dev/null
++++ b/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h
+@@ -0,0 +1,186 @@
++#ifndef MULTIDIM_ARRAY_H
++#define MULTIDIM_ARRAY_H
++
++#include <stdlib.h>
++
++#define RELION_ALIGNED_MALLOC malloc
++#define RELION_ALIGNED_FREE free
++
++#define STARTINGX(v) ((v).xinit)
++#define STARTINGY(v) ((v).yinit)
++#define NZYXSIZE(v) ((v).nzyxdim)
++
++#define A1D_ELEM(v,i) ((v).data[(i) - STARTINGX(v)])
++#define A2D_ELEM(v,i,j) ((v).data[((i) - STARTINGY(v)) * (v).xdim + ((j) - STARTINGX(v))])
++
++#define DIRECT_MULTIDIM_ELEM(v,n) ((v).data[(n)])
++#define FOR_ALL_DIRECT_ELEMENTS_IN_MULTIDIMARRAY(v) \
++    for (long int n=0; n<NZYXSIZE(v); ++n)
++
++template <typename T>
++class MultidimArray
++{
++public:
++    T* data;
++    bool destroyData;
++    long int ndim;
++    long int zdim;
++    long int ydim;
++    long int xdim;
++    long int yxdim;
++    long int zyxdim;
++    long int nzyxdim;
++    long int zinit;
++    long int yinit;
++    long int xinit;
++    long int nzyxdimAlloc;
++
++public:
++    void clear()
++    {
++        coreDeallocate();
++        coreInit();
++    }
++
++    void coreInit()
++    {
++        xdim=0;
++        yxdim=0;
++        zyxdim=0;
++        nzyxdim=0;
++        ydim=1;
++        zdim=1;
++        ndim=1;
++        zinit=0;
++        yinit=0;
++        xinit=0;
++        data=NULL;
++        nzyxdimAlloc = 0;
++        destroyData=true;
++    }
++
++    void coreAllocate(long int _ndim, long int _zdim, long int _ydim, long int _xdim)
++    {
++        if (_ndim <= 0 || _zdim <= 0 || _ydim<=0 || _xdim<=0)
++        {
++            clear();
++            return;
++        }
++
++        ndim=_ndim;
++        zdim=_zdim;
++        ydim=_ydim;
++        xdim=_xdim;
++        yxdim=ydim*xdim;
++        zyxdim=zdim*yxdim;
++        nzyxdim=ndim*zyxdim;
++
++        coreAllocate();
++    }
++
++    void coreAllocate()
++    {
++        data = (T*)RELION_ALIGNED_MALLOC(sizeof(T) * nzyxdim);
++        nzyxdimAlloc = nzyxdim;
++    }
++
++    void coreDeallocate()
++    {
++        if (data != NULL && destroyData)
++        {
++            RELION_ALIGNED_FREE(data);
++        }
++        data=NULL;
++        nzyxdimAlloc = 0;
++    }
++
++    void resize(long int Ndim, long int Zdim, long int Ydim, long int Xdim)
++    {
++        if (Ndim*Zdim*Ydim*Xdim == nzyxdimAlloc && data != NULL)
++        {
++            ndim = Ndim;
++            xdim = Xdim;
++            ydim = Ydim;
++            zdim = Zdim;
++            yxdim = Ydim * Xdim;
++            zyxdim = Zdim * yxdim;
++            nzyxdim = Ndim * zyxdim;
++            nzyxdimAlloc = nzyxdim;
++            return;
++        }
++
++        if (Xdim <= 0 || Ydim <= 0 || Zdim <= 0 || Ndim <= 0)
++        {
++            clear();
++            return;
++        }
++
++        if (NZYXSIZE(*this) > 0 && data == NULL)
++        {
++            coreAllocate();
++            return;
++        }
++
++        size_t YXdim=Ydim*Xdim;
++        size_t ZYXdim=Zdim*YXdim;
++        size_t NZYXdim=Ndim*ZYXdim;
++
++        T * new_data = (T*)RELION_ALIGNED_MALLOC(sizeof(T) * NZYXdim);
++        for (long int l = 0; l < Ndim; l++)
++            for (long int k = 0; k < Zdim; k++)
++                for (long int i = 0; i < Ydim; i++)
++                    for (long int j = 0; j < Xdim; j++)
++                    {
++                        T val;
++                        new_data[l*ZYXdim + k*YXdim+i*Xdim+j] = val;
++                    }
++        coreDeallocate();
++
++        data = new_data;
++        ndim = Ndim;
++        xdim = Xdim;
++        ydim = Ydim;
++        zdim = Zdim;
++        yxdim = Ydim * Xdim;
++        zyxdim = Zdim * yxdim;
++        nzyxdim = Ndim * zyxdim;
++        nzyxdimAlloc = nzyxdim;
++    }
++
++    void resize(long int Xdim)
++    {
++        resize(1, 1, 1, Xdim);
++    }
++
++    inline T& operator()(long int i, long int j) const
++    {
++        return A2D_ELEM(*this, i, j);
++    }
++
++    inline T& operator()(long int i) const
++    {
++        return
A1D_ELEM(*this, i); ++ } ++ ++ void getRow(long int i, MultidimArray& v) const ++ { ++ if (xdim == 0 || ydim == 0) ++ { ++ v.clear(); ++ return; ++ } ++ ++ v.resize(xdim); ++ for (long int j = 0; j < xdim; j++) ++ v(j) = (*this)(i, j); ++ } ++}; ++ ++#endif /* MULTIDIM_ARRAY_H */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c +index 9bc6cc32b..9f8a5c307 100644 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c +@@ -39,13 +39,13 @@ main (int argc, char *argv[]) + return 0; + } + +-/* { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 2 "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times "Tracing succeeded" 6 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 4 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "Tracing succeeded" 14 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 4 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 2 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times ", size: 0\.000000" 6 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times ", size: 0\.000000" 8 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "\\d x_data \\(0.000000, 1, 0\\) : 3" 2 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "\\d A_j \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "\\d A_data \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp +index 4f34e722f..05a3bf842 100644 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp +@@ -24,4 +24,4 @@ dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \ + "" "-fllc-allocate" + + # All done. 
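The updated scan patterns above key off the statistics the pass dumps for each candidate kernel (ref_count, ninsns, mem_to_insn_ratio). For orientation, a minimal sketch of the kind of testcase these drivers run — hypothetical, not one of the files added by this patch, and assuming only the -fllc-allocate and -fdump-tree-llc_allocate-details-lineno options already used throughout this series — could look like:

    /* { dg-do compile { target { aarch64*-*-linux* } } } */
    /* { dg-options "-O3 -fllc-allocate -fdump-tree-llc_allocate-details-lineno" } */

    #define N (1 << 20)
    double a[N], b[N];

    /* A single-exit innermost loop in which nearly every statement is a
       memory access, so ref_count and mem_to_insn_ratio should clear the
       mem-access-num/mem-access-ratio thresholds checked in phase 1.  */
    void
    stream_scale (double s)
    {
      for (int i = 0; i < N; i++)
        a[i] = b[i] * s;
    }

    /* { dg-final { scan-tree-dump "dense memory access" "llc_allocate" } } */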
+-dg-finish ++dg-finish +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c +new file mode 100644 +index 000000000..9b2b656fd +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c +@@ -0,0 +1,61 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++#include ++#define N 131590 ++#define F 384477 ++ ++int ownStartPtr[F]; ++double bPrimePtr[N]; ++double diagPtr[N]; ++double psiPtr[N]; ++double upperPtr[F]; ++double lowerPtr[F]; ++int uPtr[F]; ++ ++void SMOOTH(int *ownStartPtr, double *bPrimePtr, double *diagPtr, double *psiPtr, int *uPtr, double *lowerPtr, double *upperPtr, int nCells); ++ ++int main(int argc, char *argv[]) ++{ ++ int nCells = N; ++ int nFaces = F; ++ int testIter = 2; ++ for (int i = 0; i < testIter; i++) ++ { ++ SMOOTH(ownStartPtr, bPrimePtr, diagPtr, psiPtr, uPtr, lowerPtr, upperPtr, nCells); ++ } ++ return 0; ++} ++ ++ ++void SMOOTH(int *ownStartPtr, double *bPrimePtr, double *diagPtr, double *psiPtr, int *uPtr, double *lowerPtr, double *upperPtr, int nCells) ++{ ++ double psii; ++ int fStart; ++ int fEnd = ownStartPtr[0]; ++ ++ for (int celli = 0; celli < nCells; celli++) ++ { ++ fStart = fEnd; ++ fEnd = ownStartPtr[celli + 1]; ++ psii = bPrimePtr[celli]; ++ ++ for (int facei = fStart; facei &references) + + struct loop_filter_out_flag + { +- /* Use external gimple. */ +- bool use_ext_gimple; +- + /* Use external call. */ + bool use_ext_call; + +@@ -358,21 +355,7 @@ bool + filter_out_loop_by_stmt_p (loop_filter_out_flag &loop_filter, gimple *stmt, + const vector &references, unsigned int &start) + { +- /* check use_ext_gimple. */ +- expanded_location cfun_xloc +- = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); + expanded_location xloc = expand_location (stmt->location); +- if (xloc.file && filename_cmp (cfun_xloc.file, xloc.file)) +- { +- if (dump_file && (dump_flags & TDF_DETAILS)) +- { +- fprintf (dump_file, "use_ext_gimple: "); +- print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); +- } +- loop_filter.use_ext_gimple = true; +- return true; +- } +- + /* check use_ext_call. */ + if (gimple_code (stmt) == GIMPLE_CALL && !gimple_call_internal_p (stmt)) + { +@@ -421,11 +404,6 @@ filter_out_loop_by_stmt_p (loop_filter_out_flag &loop_filter, gimple *stmt, + void + dump_loop_filter_out_flag (loop_filter_out_flag &loop_filter) + { +- if (loop_filter.use_ext_gimple) +- { +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "non-dense mem access: use_ext_gimple\n"); +- } + if (loop_filter.use_ext_call) + { + if (dump_file && (dump_flags & TDF_DETAILS)) +@@ -493,45 +471,6 @@ get_references_in_loop (vector &references, + return !filter_out_loop; + } + +-/* Determine whether the loop is a single path. */ +- +-bool +-single_path_p (class loop *loop, basic_block bb) +-{ +- if (bb == NULL) +- return false; +- if (bb == loop->latch) +- return true; +- +- gimple *stmt = last_stmt (bb); +- bool res = false; +- +- if (stmt && gimple_code (stmt) == GIMPLE_COND) +- { +- gcc_assert (EDGE_COUNT (bb->succs) == 2); +- edge true_edge = NULL; +- edge false_edge = NULL; +- extract_true_false_edges_from_block (bb, &true_edge, &false_edge); +- +- /* Returns false, if a branch occurs. 
*/ +- if (true_edge->dest->loop_father == loop +- && false_edge->dest->loop_father == loop) +- return false; +- +- if (true_edge->dest->loop_father == loop) +- res = single_path_p (loop, true_edge->dest); +- else +- res = single_path_p (loop, false_edge->dest); +- } +- else +- { +- edge e = find_fallthru_edge (bb->succs); +- if (e) +- res = single_path_p (loop, e->dest); +- } +- return res; +-} +- + /* Computes an estimated number of insns in LOOP, weighted by WEIGHTS. + Assume that the HPC data reading and calculation process does not involve + adding branches in loops. Therefore, all bbs of loops are directly used for +@@ -611,6 +550,45 @@ dense_memory_p (const vector &references, class loop *loop) + + /* Analyze the inner loop and get the loop with dense memory access. */ + ++void ++analyze_loop_dense_memory (vector &kernels, ++ map > &kernels_refs, ++ class loop *loop) ++{ ++ vector references; ++ number_of_latch_executions (loop); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n========== Processing loop %d: ==========\n", ++ loop->num); ++ loop_dump (dump_file, loop); ++ flow_loop_dump (loop, dump_file, NULL, 1); ++ fprintf (dump_file, "loop unroll: %d\n", loop->unroll); ++ } ++ ++ if (get_loop_exit_edges (loop).length () != 1) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: loop_branching\n"); ++ return; ++ } ++ ++ loop_filter_out_flag loop_filter = {false, false, true, false}; ++ ++ if (!get_references_in_loop (references, loop_filter, loop)) ++ { ++ dump_loop_filter_out_flag (loop_filter); ++ return; ++ } ++ ++ if (dense_memory_p (references, loop)) ++ { ++ kernels_refs[loop] = references; ++ kernels.push_back (loop); ++ } ++} ++/* Analyze the inner loop and get the loop with dense memory access. */ ++ + bool + get_dense_memory_kernels (vector &kernels, + map > &kernels_refs) +@@ -619,40 +597,7 @@ get_dense_memory_kernels (vector &kernels, + fprintf (dump_file, "\nPhase 1: get_dense_memory_kernels\n\n"); + class loop *loop = NULL; + FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) +- { +- number_of_latch_executions (loop); +- if (dump_file && (dump_flags & TDF_DETAILS)) +- { +- fprintf (dump_file, "\n========== Processing loop %d: ==========\n", +- loop->num); +- loop_dump (dump_file, loop); +- flow_loop_dump (loop, dump_file, NULL, 1); +- fprintf (dump_file, "loop unroll: %d\n", loop->unroll); +- } +- +- if (get_loop_exit_edges (loop).length () != 1 +- || !single_path_p (loop, loop->header)) +- { +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "non-dense mem access: loop_branching\n"); +- continue; +- } +- +- vector references; +- loop_filter_out_flag loop_filter = {false, false, false, true, false}; +- +- if (!get_references_in_loop (references, loop_filter, loop)) +- { +- dump_loop_filter_out_flag (loop_filter); +- continue; +- } +- +- if (dense_memory_p (references, loop)) +- { +- kernels_refs[loop] = references; +- kernels.push_back (loop); +- } +- } ++ analyze_loop_dense_memory (kernels, kernels_refs, loop); + return kernels.size () > 0; + } + +@@ -1094,33 +1039,41 @@ trace_ref_info (data_ref &mem_ref, set &traced_ref_stmt) + mem_ref.trace_status_p = true; + } + ++/* Trace all references in the loop. 
*/
++
++void
++trace_loop_refs_info (vector<data_ref> &refs, set<gimple *> &traced_ref_stmt)
++{
++  for (unsigned i = 0; i < refs.size (); ++i)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        {
++          fprintf (dump_file, "trace_references_base_info %d:\n", i);
++          print_generic_expr (dump_file, refs[i].ref, TDF_SLIM);
++          fprintf (dump_file, "\n");
++        }
++      trace_ref_info (refs[i], traced_ref_stmt);
++    }
++}
++
+ /* Tracing and sorting reference groups.  */
+ 
+ void
+ trace_data_refs_info (vector<class loop *> &kernels,
+-		      map<class loop *, vector<data_ref> > &loop_refs)
++		      map<class loop *, vector<data_ref> > &loop_refs,
++		      set<gimple *> &traced_ref_stmt)
+ {
+   if (dump_file)
+     fprintf (dump_file, "\nPhase 2: trace_all_references_info\n\n");
+ 
+-  set<gimple *> traced_ref_stmt;
+-
+   for (unsigned i = 0; i < kernels.size (); ++i)
+     {
+-      class loop* loop = kernels[i];
+-
++      class loop *loop = kernels[i];
++      if (loop_refs.count (loop) == 0)
++	continue;
+       if (dump_file && (dump_flags & TDF_DETAILS))
+ 	fprintf (dump_file, "loop header %d:\n", loop->header->index);
+-      for (unsigned j = 0; j < loop_refs[loop].size (); ++j)
+-	{
+-	  if (dump_file && (dump_flags & TDF_DETAILS))
+-	    {
+-	      fprintf (dump_file, "trace_references_base_info %d:\n", j);
+-	      print_generic_expr (dump_file, loop_refs[loop][j].ref, TDF_SLIM);
+-	      fprintf (dump_file, "\n");
+-	    }
+-	  trace_ref_info (loop_refs[loop][j], traced_ref_stmt);
+-	}
++      trace_loop_refs_info (loop_refs[loop], traced_ref_stmt);
+     }
+ }
+ 
+@@ -1205,7 +1158,7 @@ void
+ check_bound_iv_and_add_worklist (vector<tree> &worklist, set<tree> &walked,
+ 				 tree t, data_ref &mem_ref)
+ {
+-  if (TREE_CODE (t) != SSA_NAME)
++  if (t == NULL_TREE || TREE_CODE (t) != SSA_NAME)
+     return;
+ 
+   gimple *def_stmt = SSA_NAME_DEF_STMT (t);
+@@ -1278,8 +1231,13 @@ trace_loop_bound_iv (data_ref &mem_ref)
+     }
+ 
+   if (dump_file && (dump_flags & TDF_DETAILS))
+-    fprintf (dump_file, "\nmem_ref access dimension: %ld\n",
+-	     mem_ref.loop_bounds.size ());
++    {
++      fprintf (dump_file, "\nmem_ref access dimension: %ld\n",
++	       mem_ref.loop_bounds.size ());
++      fprintf (dump_file, "Traced variables: ");
++      print_generic_expr (dump_file, mem_ref.base, TDF_SLIM);
++      fprintf (dump_file, "\n");
++    }
+ 
+   return mem_ref.loop_bounds.size () > 0;
+ }
+@@ -1487,7 +1445,7 @@ trace_and_create_dominate_loop_bounds (data_ref &mem_ref)
+       if (dump_file && (dump_flags & TDF_DETAILS))
+ 	{
+ 	  print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM);
+-	  fprintf (dump_file, "Tracing loop bound failed at dimension %d",
++	  fprintf (dump_file, "Tracing loop bound failed at dimension %d\n",
+ 		   i);
+ 	}
+       mem_ref.calc_by = UNHANDLE_CALC;
+@@ -1565,42 +1523,246 @@ trace_ref_dimension_and_loop_bounds (data_ref &mem_ref)
+   static_calculate_data_size (mem_ref);
+ }
+ 
+-/* analyze nested kernels.
+-   1. multidimension loop analyze.
+-   2. extended outer loop analyze.
+-   Later we will extend outer loop analysis.
++/* Get the loop's niters tree.
++   Return NULL_TREE if not found.  */
++
++tree
++get_cur_loop_niters (map<class loop *, vector<data_ref> > &loop_refs,
++		     class loop* loop)
++{
++  if (loop_refs.count (loop) == 0)
++    return NULL_TREE;
++  vector<loop_bound> bounds = loop_refs[loop][0].loop_bounds;
++  return bounds.size () ? bounds[0].niters : NULL_TREE;
++}
++
++/* Trace the sources of the niters tree and return the
++   outermost depth of the loops containing them.
++   Return start_depth if not found.
++ ++ example: ++ niters:(long) (((int) i_end_417 - (int) i_start_452) + 1) ++ operand_num: 1, subtree:(long) (((int) i_end_417 - (int) i_start_452) + 1) ++ operand_num: 2, subtree:((int) i_end_417 - (int) i_start_452) + 1 ++ operand_num: 2, subtree:(int) i_end_417 - (int) i_start_452 ++ operand_num: 1, subtree:(int) i_end_417 ++ SSA_NAME of niters: i_end_417 ++ gimple of SSA: i_end_417 = PHI ++ return gimple depth; ++*/ ++ ++unsigned ++trace_outer_loop_depth (tree niters, unsigned start_depth) ++{ ++ /* If niter does not exist or the type is INTEGER_CST, ++ the loop bound is determined and return start_depth. */ ++ if (niters == NULL_TREE || TREE_CODE (niters) == INTEGER_CST) ++ return start_depth; ++ ++ gimple *def_stmt = NULL; ++ /* niters examples: i_start_452, fEnd_35, fEnd_100. */ ++ enum tree_code niter_code = TREE_CODE (niters); ++ if (niter_code == SSA_NAME) ++ { ++ /* Trace the SSA that define this niter. */ ++ def_stmt = SSA_NAME_DEF_STMT (niters); ++ enum gimple_code stmt_code = gimple_code (def_stmt); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ssa_name of niters: "); ++ print_generic_expr (dump_file, niters); ++ fprintf (dump_file, "\ngimple of ssa: \n"); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ /* Termination condition of dfs. Return the depth of the bb block. */ ++ if (stmt_code == GIMPLE_PHI || stmt_code == GIMPLE_NOP) ++ { ++ basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (niters)); ++ if (def_bb == NULL || def_bb->loop_father == NULL) ++ return start_depth; ++ unsigned ret_depth = loop_depth (def_bb->loop_father); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Stop tracing the outer loop depth, "); ++ fprintf (dump_file, "current depth: %d, current bb: %d\n", \ ++ ret_depth, def_bb->index); ++ } ++ return ret_depth; ++ } ++ /* 'ASSIGN': Use dfs to trace the rhs of the assignment statement. */ ++ else if (stmt_code == GIMPLE_ASSIGN) ++ { ++ tree rhs = gimple_assign_rhs1 (def_stmt); ++ if (TREE_CODE (rhs) == TARGET_MEM_REF) ++ /* fEnd_35 = MEM[base: _19, index: ivtmp.96, step: 4, ++ offset: 0B] */ ++ return trace_outer_loop_depth (TREE_OPERAND (rhs, 2), start_depth); ++ else ++ { ++ /* M.218_658 = MIN_EXPR <_631, _657> */ ++ unsigned min_depth = start_depth; ++ unsigned operand_num = gimple_num_ops (def_stmt); ++ /* 'ASSIGN': start from 1 because op[0] is the lhs. */ ++ for (unsigned i = 1; i < operand_num; i++) ++ { ++ tree subtree = GIMPLE_CHECK2 ++ (def_stmt)->op[i]; ++ if (subtree == NULL) ++ continue; ++ unsigned depth = trace_outer_loop_depth (subtree, \ ++ start_depth); ++ min_depth = MIN (min_depth, depth); ++ } ++ return min_depth; ++ } ++ } ++ else ++ { ++ /* Adding termination conditions: ++ 1. Niters is MEM variable; ++ 2. Niters is a runtime value (smooth_uPtr), and consider \ ++ finding footprint in other mem_ref; ++ 3. Niters is loop variable (i_start/i_end), and the boundary in \ ++ the outer loop depends on the variable j_start/j_end. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "The loop termination condition"); ++ fprintf (dump_file, "is to be extended.\n"); ++ } ++ return start_depth; ++ } ++ } ++ /* The operand nums can be obtained when the tree code is as follows. 
*/ ++ else if (niter_code == NOP_EXPR || niter_code == MEM_REF ++ || niter_code == ARRAY_REF || niter_code == COND_EXPR ++ || niter_code == PLUS_EXPR || niter_code == MINUS_EXPR ++ || niter_code == TARGET_MEM_REF || niter_code == POINTER_PLUS_EXPR) ++ { ++ /* operand_num is the operand in the niters statement. ++ example: In the following niter statement, operand_num = 3. ++ (unsigned int) fEnd_35 - (unsigned int) fEnd_100 + 4294967295. */ ++ unsigned operand_num = TREE_OPERAND_LENGTH (niters); ++ unsigned min_depth = start_depth; ++ for (unsigned i = 0; i < operand_num; i++) ++ { ++ tree subtree = TREE_OPERAND (niters, i); ++ if (subtree == NULL) ++ continue; ++ unsigned depth = trace_outer_loop_depth (subtree, start_depth); ++ min_depth = MIN (min_depth, depth); ++ } ++ return min_depth; ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "niters is another tree code: %s\n", \ ++ get_tree_code_name (niter_code)); ++ print_generic_expr (dump_file, niters, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return start_depth; ++ } ++} ++ ++/* Traces the ref dimension information in each loop. */ ++ ++void ++analyze_loop_refs_dimension (vector &refs) ++{ ++ for (unsigned i = 0; i < refs.size (); ++i) ++ { ++ if (refs[i].trace_status_p == false) ++ continue; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "trace_reference_dimension %d:\n", i); ++ print_generic_expr (dump_file, refs[i].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ trace_ref_dimension_and_loop_bounds (refs[i]); ++ } ++} ++/* analyze nested kernels ++ 1. multidimension loop analyze ++ 2. extended outer loop analyze + */ + + bool + analyze_nested_kernels (vector &kernels, +- map > &loop_refs) ++ map > &loop_refs, ++ set &traced_ref_stmt) + { + if (dump_file) + fprintf (dump_file, "\nPhase 3: analyze_nested_kernels\n\n"); + +- for (unsigned i = 0; i < kernels.size (); ++i) ++ /* `kernels` may be added in during outer loop extension phase, ++ thus using initial size to avoid repeatedly analyzing. */ ++ unsigned init_kernels_size = kernels.size (); ++ for (unsigned i = 0; i < init_kernels_size; ++i) + { + class loop* loop = kernels[i]; + if (loop_refs.count (loop) == 0) + continue; + + if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "\n\nloop header %d:\n", loop->header->index); +- for (unsigned j = 0; j < loop_refs[loop].size (); ++j) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ analyze_loop_refs_dimension (loop_refs[loop]); ++ ++ unsigned depth = loop_depth (loop); ++ unsigned outer_depth = trace_outer_loop_depth (get_cur_loop_niters \ ++ (loop_refs, loop), depth); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "cur_depth: %d, outer_depth: %d\n", \ ++ depth, outer_depth); ++ /* param_outer_loop_num: number of loops of the extended outer loop. ++ Outermost loop should not be extended when outer_depth = 0. ++ `outer_depth == depth` means the current loop is the loop which ++ boundary is known, so there is no need to extend the outer loop. */ ++ if (outer_depth == 0 || outer_depth == depth ++ || depth > outer_depth + param_outer_loop_num) ++ continue; ++ /* Extend outer loop. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nStart extending outer loop\n"); ++ /* Superloops of the loop, start from the loop closest to the \ ++ current loop in the outermost loop. 
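++     (In GCC, loop->superloops is ordered from the outermost enclosing
++     loop at index 0 down to the immediate parent at index depth - 1,
++     so indexing with a decreasing depth below walks from the nearest
++     enclosing loop outwards.)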
*/ ++ for (unsigned j = 0; j < param_outer_loop_num && --depth; ++j) + { +- if (loop_refs[loop][j].trace_status_p == false) ++ class loop* outer_loop = (*loop->superloops)[depth]; ++ /* The outer loop may be added when analyzing previous inner loops, ++ i.e. the outer loop contains two or more inner loops. */ ++ if (loop_refs.count (outer_loop)) + continue; +- +- if (dump_file && (dump_flags & TDF_DETAILS)) ++ /* phase1~phase3 analysis on the extended outer loop. */ ++ analyze_loop_dense_memory (kernels, loop_refs, outer_loop); ++ if (loop_refs.count (outer_loop) == 0) ++ continue; ++ for (unsigned k = 0; k < loop_refs[outer_loop].size (); ++k) + { +- fprintf (dump_file, "\ntrace_reference_dimension at mem_ref " +- "index %d in loop %d:\n", j, loop->num); +- print_generic_expr (dump_file, loop_refs[loop][j].ref, TDF_SLIM); +- fprintf (dump_file, "\n"); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "outer_analyze_nested_kernels %d: ", k); ++ print_generic_expr (dump_file, loop_refs[outer_loop][k].ref,\ ++ TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } + } +- trace_ref_dimension_and_loop_bounds (loop_refs[loop][j]); ++ trace_loop_refs_info (loop_refs[outer_loop], traced_ref_stmt); ++ analyze_loop_refs_dimension (loop_refs[outer_loop]); ++ outer_depth = trace_outer_loop_depth (get_cur_loop_niters \ ++ (loop_refs, outer_loop), depth); ++ /* `outer_depth == depth` means the current loop is the loop which ++ boundary is known, so there is no need to extend the outer loop. */ ++ if (outer_depth == depth) ++ break; ++ else ++ /* The outer loop cannot find the current loop boundary, ++ Remove the record of outer_loop from the loop_refs. */ ++ loop_refs.erase (outer_loop); + } +- + } + return true; + } +@@ -2694,9 +2856,10 @@ llc_allocate (void) + if (!get_dense_memory_kernels (kernels, kernels_refs)) + return; + +- trace_data_refs_info (kernels, kernels_refs); ++ set traced_ref_stmt; ++ trace_data_refs_info (kernels, kernels_refs, traced_ref_stmt); + +- if (!analyze_nested_kernels (kernels, kernels_refs)) ++ if (!analyze_nested_kernels (kernels, kernels_refs, traced_ref_stmt)) + return; + + vector sorted_kernels; +-- +2.33.0 + diff --git a/0154-Loop-CRC32-Judge-null-on-pointers-and-solving-coding.patch b/0154-Loop-CRC32-Judge-null-on-pointers-and-solving-coding.patch new file mode 100644 index 0000000..0cea2a1 --- /dev/null +++ b/0154-Loop-CRC32-Judge-null-on-pointers-and-solving-coding.patch @@ -0,0 +1,1772 @@ +From a47fee215f11993a6025c1d814b70b050e4eb1b0 Mon Sep 17 00:00:00 2001 +From: XingYuShuai <1150775134@qq.com> +Date: Thu, 7 Dec 2023 18:41:37 +0800 +Subject: [PATCH 1/7] [Loop CRC32] Judge null on pointers and solving coding + style issues + +--- + gcc/common.opt | 2 +- + gcc/config/aarch64/aarch64-builtins.c | 13 +- + gcc/config/aarch64/aarch64-protos.h | 2 +- + gcc/config/aarch64/aarch64.c | 6 +- + gcc/doc/invoke.texi | 12 +- + gcc/match.pd | 20 +- + gcc/passes.def | 2 +- + gcc/target.def | 2 +- + .../tree-ssa/loop-crc-loop-condition-fail.c | 2 +- + .../tree-ssa/loop-crc-loop-form-fail-2.c | 2 +- + .../gcc.dg/tree-ssa/loop-crc-loop-form-fail.c | 2 +- + .../gcc.dg/tree-ssa/loop-crc-sucess.c | 2 +- + .../tree-ssa/loop-crc-table-check-fail.c | 2 +- + gcc/timevar.def | 2 +- + gcc/tree-ssa-loop-crc.c | 960 +++++++++--------- + 15 files changed, 528 insertions(+), 503 deletions(-) + +diff --git a/gcc/common.opt b/gcc/common.opt +index 4db061b44..96ac252fc 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1067,7 +1067,7 @@ In some extreme situations this may 
result in unsafe behavior. + + floop-crc + Common Report Var(flag_loop_crc) Optimization +-do the loop crc conversion. ++Do the loop crc conversion. + + fauto-inc-dec + Common Report Var(flag_auto_inc_dec) Init(1) Optimization +diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c +index 1e8b046da..5e612460e 100644 +--- a/gcc/config/aarch64/aarch64-builtins.c ++++ b/gcc/config/aarch64/aarch64-builtins.c +@@ -441,7 +441,8 @@ typedef struct + #define VAR1(T, N, MAP, A) \ + AARCH64_SIMD_BUILTIN_##T##_##N##A, + +-enum aarch64_crc_builtins{ ++enum aarch64_crc_builtins ++{ + AARCH64_BUILTIN_CRC32B, + AARCH64_BUILTIN_CRC32H, + AARCH64_BUILTIN_CRC32W, +@@ -1327,15 +1328,17 @@ aarch64_general_builtin_decl (unsigned code, bool) + + return aarch64_builtin_decls[code]; + } +-/* Implement TARGET_GET_CRC_BUILTIN_CODE */ +-unsigned +-get_crc_builtin_code(unsigned code, bool) ++ ++/* Implement TARGET_GET_CRC_BUILTIN_CODE. */ ++unsigned ++get_crc_builtin_code (unsigned code, bool) + { + if (code > AARCH64_BUILTIN_CRC32W) + return AARCH64_BUILTIN_MIN; + + unsigned res = AARCH64_BUILTIN_MIN; +- switch (code) { ++ switch (code) ++ { + case AARCH64_BUILTIN_CRC32B: + res = AARCH64_BUILTIN_crc32b; + break; +diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h +index a0ca662bc..1a4fc2028 100644 +--- a/gcc/config/aarch64/aarch64-protos.h ++++ b/gcc/config/aarch64/aarch64-protos.h +@@ -723,7 +723,7 @@ tree aarch64_general_fold_builtin (unsigned int, tree, unsigned int, tree *); + gimple *aarch64_general_gimple_fold_builtin (unsigned int, gcall *); + rtx aarch64_general_expand_builtin (unsigned int, tree, rtx, int); + tree aarch64_general_builtin_decl (unsigned, bool); +-unsigned get_crc_builtin_code(unsigned , bool); ++unsigned get_crc_builtin_code (unsigned, bool); + tree aarch64_general_builtin_rsqrt (unsigned int); + tree aarch64_builtin_vectorized_function (unsigned int, tree, tree); + +diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c +index dbdc6dffb..faedcaca1 100644 +--- a/gcc/config/aarch64/aarch64.c ++++ b/gcc/config/aarch64/aarch64.c +@@ -13751,10 +13751,10 @@ aarch64_builtin_decl (unsigned int code, bool initialize_p) + } + + /* Implement TARGET_GET_CRC_BUILTIN_CODE. */ +-static unsigned +-aarch64_get_crc_builtin_code(unsigned code, bool initialize_p) ++static unsigned ++aarch64_get_crc_builtin_code (unsigned code, bool initialize_p) + { +- unsigned subcode = get_crc_builtin_code(code,initialize_p); ++ unsigned subcode = get_crc_builtin_code (code, initialize_p); + unsigned res = subcode << AARCH64_BUILTIN_SHIFT; + return res; + } +diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +index 52018617a..a0a84c20b 100644 +--- a/gcc/doc/invoke.texi ++++ b/gcc/doc/invoke.texi +@@ -460,7 +460,7 @@ Objective-C and Objective-C++ Dialects}. + -fno-allocation-dce -fallow-store-data-races @gol + -fassociative-math -fauto-profile -fauto-profile[=@var{path}] @gol + -farray-widen-compare -fauto-inc-dec -fbranch-probabilities @gol +--fcaller-saves -floop-crc @gol ++-fcaller-saves @gol + -fcombine-stack-adjustments -fconserve-stack @gol + -fcompare-elim -fcprop-registers -fcrossjumping @gol + -fcse-follow-jumps -fcse-skip-blocks -fcx-fortran-rules @gol +@@ -489,7 +489,7 @@ Objective-C and Objective-C++ Dialects}. 
+ -fisolate-erroneous-paths-dereference -fisolate-erroneous-paths-attribute @gol + -fivopts -fkeep-inline-functions -fkeep-static-functions @gol + -fkeep-static-consts -flimit-function-alignment -flive-range-shrinkage @gol +--floop-block -floop-interchange -floop-strip-mine @gol ++-floop-block -floop-crc -floop-interchange -floop-strip-mine @gol + -floop-unroll-and-jam -floop-nest-optimize @gol + -floop-parallelize-all -flra-remat -flto -flto-compression-level @gol + -flto-partition=@var{alg} -fmerge-all-constants @gol +@@ -9722,10 +9722,6 @@ extreme situations this may result in unsafe behavior. + This option may generate better or worse code; results are highly dependent + on the structure of loops within the source code. + +-@item -floop-crc +-@opindex floop-crc +-Do the loop crc conversion +- + @item -fdce + @opindex fdce + Perform dead code elimination (DCE) on RTL@. +@@ -10497,6 +10493,10 @@ for @option{-Os}, since it usually increases code size. + Perform loop optimizations on trees. This flag is enabled by default + at @option{-O} and higher. + ++@item -floop-crc ++@opindex floop-crc ++Do the loop crc conversion ++ + @item -ftree-loop-linear + @itemx -floop-strip-mine + @itemx -floop-block +diff --git a/gcc/match.pd b/gcc/match.pd +index e21d94e56..24ae157af 100644 +--- a/gcc/match.pd ++++ b/gcc/match.pd +@@ -3559,12 +3559,11 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + #endif + + #if GIMPLE +-/* Try to match */ +- /* +-_4 = (int) _3; //NOP_EXPR (SSA_NAME @2) +-_5 = _4 ^ c_10; //BIT_XOR_EXPR (SSA_NAME@1, SSA_NAME) +-_6 = _5 & 255; //BIT_AND_EXPR (SSA_NAME, INTEGER_CST@3) +- */ ++/* Try to match ++ _4 = (int) _3; NOP_EXPR (SSA_NAME @2) ++ _5 = _4 ^ c_10; BIT_XOR_EXPR (SSA_NAME@1, SSA_NAME) ++ _6 = _5 & 255; BIT_AND_EXPR (SSA_NAME, INTEGER_CST@3) ++*/ + (match (crc_match_index @1 @2 @3) + (bit_and (bit_xor (nop SSA_NAME@2) SSA_NAME@1) INTEGER_CST@3) + (if (INTEGRAL_TYPE_P (type) && tree_to_uhwi(@3) == 255)) +@@ -3572,11 +3571,10 @@ _6 = _5 & 255; //BIT_AND_EXPR (SSA_NAME, INTEGER_CST@3) + #endif + + #if GIMPLE +-/* Try to match */ +- /* +-_8 = c_12 >> 8; // RSHIFT_EXPR (SSA_NAME @1, INTEGER_CST @2) +-c_19 = _7 ^ _8; // BIT_XOR_EXPR (SSA_NAME@3, SSA_NAME) +- */ ++/* Try to match ++ _8 = c_12 >> 8; RSHIFT_EXPR (SSA_NAME @1, INTEGER_CST @2) ++ c_19 = _7 ^ _8; BIT_XOR_EXPR (SSA_NAME@3, SSA_NAME) ++*/ + (match (crc_match_res @1 @2 @3) + (bit_xor SSA_NAME@3 (rshift SSA_NAME@1 INTEGER_CST@2)) + (if (INTEGRAL_TYPE_P (type) && tree_to_uhwi(@2) == 8)) +diff --git a/gcc/passes.def b/gcc/passes.def +index df7d65733..d5d5376f2 100644 +--- a/gcc/passes.def ++++ b/gcc/passes.def +@@ -92,7 +92,7 @@ along with GCC; see the file COPYING3. If not see + NEXT_PASS (pass_cd_dce); + NEXT_PASS (pass_phiopt, true /* early_p */); + NEXT_PASS (pass_array_widen_compare); +- NEXT_PASS (pass_loop_crc); ++ NEXT_PASS (pass_loop_crc); + NEXT_PASS (pass_tail_recursion); + NEXT_PASS (pass_convert_switch); + NEXT_PASS (pass_cleanup_eh); +diff --git a/gcc/target.def b/gcc/target.def +index 34d3561bd..49976160f 100644 +--- a/gcc/target.def ++++ b/gcc/target.def +@@ -2422,7 +2422,7 @@ If @var{code} is out of range the function should return\n\ + tree, (unsigned code, bool initialize_p), NULL) + + /* Initialize (if INITIALIZE_P is true) and return the real code of +- target-specific built-in function . ++ target-specific built-in function. + Return NULL if that is not possible. Return error_mark_node if CODE + is outside of the range of valid crc32 codes. 
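+    For example, the AArch64 implementation (see get_crc_builtin_code
+    above) maps the generic AARCH64_BUILTIN_CRC32B/H/W codes onto the
+    builtins that expand to the crc32b/crc32h/crc32w instructions.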
*/ + DEFHOOK +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-condition-fail.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-condition-fail.c +index fefa949f9..14db89def 100644 +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-condition-fail.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-condition-fail.c +@@ -61,7 +61,7 @@ static const ulg crc_32_tab[] = { + 0x2d02ef8dL + }; + +-ulg updcrc(s, n) ++ulg updcrc (s, n) + uch *s; /* pointer to bytes to pump through */ + unsigned n; /* number of bytes in s[] */ + { +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail-2.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail-2.c +index b37446ec5..1fcabd3c6 100644 +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail-2.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail-2.c +@@ -62,7 +62,7 @@ static const ulg crc_32_tab[] = { + }; + int test[5] = {0}; + +-ulg updcrc(s, n) ++ulg updcrc (s, n) + uch *s; /* pointer to bytes to pump through */ + unsigned n; /* number of bytes in s[] */ + { +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail.c +index 3dc500a46..e09649e1c 100644 +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-loop-form-fail.c +@@ -62,7 +62,7 @@ static const ulg crc_32_tab[] = { + }; + + /* check when the loop have a innor loop, should fail. */ +-ulg updcrc(s, n) ++ulg updcrc (s, n) + uch *s; /* pointer to bytes to pump through */ + unsigned n; /* number of bytes in s[] */ + { +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-sucess.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-sucess.c +index 8b556efc8..cdb538622 100644 +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-sucess.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-sucess.c +@@ -61,7 +61,7 @@ static const ulg crc_32_tab[] = { + 0x2d02ef8dL + }; + +-ulg updcrc(s, n) ++ulg updcrc (s, n) + uch *s; /* pointer to bytes to pump through */ + unsigned n; /* number of bytes in s[] */ + { +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-table-check-fail.c b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-table-check-fail.c +index de21f4553..9131fec0b 100644 +--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-table-check-fail.c ++++ b/gcc/testsuite/gcc.dg/tree-ssa/loop-crc-table-check-fail.c +@@ -63,7 +63,7 @@ static const ulg crc_32_tab[] = { + int test[5] = {0}; + + /* check when the loop is doing more then 1 array read or writing an array, both should fail. */ +-ulg updcrc(s, n) ++ulg updcrc (s, n) + uch *s; /* pointer to bytes to pump through */ + unsigned n; /* number of bytes in s[] */ + { +diff --git a/gcc/timevar.def b/gcc/timevar.def +index ba86a1b7b..49a51b7dc 100644 +--- a/gcc/timevar.def ++++ b/gcc/timevar.def +@@ -215,7 +215,7 @@ DEFTIMEVAR (TV_TREE_COPY_RENAME , "tree rename SSA copies") + DEFTIMEVAR (TV_TREE_SSA_VERIFY , "tree SSA verifier") + DEFTIMEVAR (TV_TREE_STMT_VERIFY , "tree STMT verifier") + DEFTIMEVAR (TV_TREE_ARRAY_WIDEN_COMPARE, "tree array widen compare") +-DEFTIMEVAR (TV_TREE_LOOP_CRC, "tree loop crc") ++DEFTIMEVAR (TV_TREE_LOOP_CRC , "tree loop crc") + DEFTIMEVAR (TV_TREE_SWITCH_CONVERSION, "tree switch conversion") + DEFTIMEVAR (TV_TREE_SWITCH_LOWERING, "tree switch lowering") + DEFTIMEVAR (TV_TREE_RECIP , "gimple CSE reciprocals") +diff --git a/gcc/tree-ssa-loop-crc.c b/gcc/tree-ssa-loop-crc.c +index 8225c2fa5..9878363eb 100644 +--- a/gcc/tree-ssa-loop-crc.c ++++ b/gcc/tree-ssa-loop-crc.c +@@ -1,4 +1,5 @@ +-/* loop crc. 
++/* This pass converts special loops where do CRC algorithms to ++ simple CRC instructions in AArch64. + Copyright (C) 2023-2023 Free Software Foundation, Inc. + + This file is part of GCC. +@@ -45,79 +46,90 @@ along with GCC; see the file COPYING3. If not see + #include "diagnostic-core.h" + + /* This pass handles scenarios similar to the following: +-ulg updcrc(s, n) +- uch *s; +- unsigned n; ++ulg updcrc (s, n) ++ uch *s; ++ unsigned n; + { +- register ulg c; ++ register ulg c; + + static ulg crc = (ulg)0xffffffffL; + +- if (s == NULL) { ++ if (s == NULL) ++ { + c = 0xffffffffL; +- } else { ++ } ++ else ++ { + c = crc; +- if (n) do { ++ if (n) ++ do ++ { + c = crc_32_tab[((int)c ^ (*s++)) & 0xff] ^ (c >> 8); + } while (--n); + } + crc = c; +- return c ^ 0xffffffffL; ++ return c ^ 0xffffffffL; + } + +-If the hardware supports the crc instruction, then the pass completes the ++If the hardware supports the crc instruction, then the pass completes the + conversion of the above scenario into: + + #define SIZE_U32 sizeof(uint32_t) + unsigned long updcrc(s, n) +- unsigned char *s; +- unsigned n; ++ unsigned char *s; ++ unsigned n; + { +- register unsigned long c; ++ register unsigned long c; + +- static unsigned long crc = (unsigned long)0xffffffffL; ++ static unsigned long crc = (unsigned long)0xffffffffL; + +- if (s == NULL) { +- c = 0xffffffffL; +- } else { +- c = crc; +- if (n) +- { +- uint32_t nn = n/SIZE_U32; +- do{ +- c = __crc32w(c,*((uint32_t *)s)); +- s += SIZE_U32; +- }while(--nn); +- if (n & sizeof(uint16_t)) { +- c = __crc32h(c, *((uint16_t *)s)); +- s += sizeof(uint16_t); +- } +- if (n & sizeof(uint8_t)) +- c = __crc32b(c, *s); +- } ++ if (s == NULL) ++ { ++ c = 0xffffffffL; ++ } ++ else ++ { ++ c = crc; ++ if (n) ++ { ++ uint32_t nn = n/SIZE_U32; ++ do ++ { ++ c = __crc32w (c,*((uint32_t *)s)); ++ s += SIZE_U32; ++ } while(--nn); + } +- crc = c; +- return c ^ 0xffffffffL; ++ } ++ if (n & sizeof (uint16_t)) ++ { ++ c = __crc32h (c, *((uint16_t *)s)); ++ s += sizeof (uint16_t); ++ } ++ if (n & sizeof (uint8_t)) ++ c = __crc32b (c, *s); ++ crc = c; ++ return c ^ 0xffffffffL; + } + +-This pass is to complete the conversion of such scenarios from the internal +-perspective of the compiler: +-1)match_crc_loop:The function completes the screening of such scenarios; +-2)convert_to_new_loop:The function completes the conversion of +- origin_loop to new loops, and removes origin_loop; +-3)origin_loop_info: The structure is used to record important information +- of origin_loop: such as loop exit, initial value of induction +- variable, etc; +-4) create_new_loops: The function is used as the key content of the pass +- to complete the creation of new loops. */ ++This pass is to complete the conversion of such scenarios from ++the internal perspective of the compiler: ++1) match_crc_loop: The function completes the screening of such ++ scenarios; ++2) convert_to_new_loop: The function completes the conversion of ++ origin_loop to new loops, and removes origin_loop; ++3) origin_loop_info: The structure is used to record important ++ information of origin_loop: such as loop exit, initial value ++ of induction variable, etc; ++4) create_new_loops: The function is used as the key content ++ of the pass to complete the creation of new loops. 
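++
++For example, with n = 7 input bytes the converted code performs one
++__crc32w step (nn = 7 / sizeof (uint32_t) = 1, consuming 4 bytes), then
++one __crc32h step (7 & sizeof (uint16_t) = 2, consuming 2 bytes), then
++one __crc32b step (7 & sizeof (uint8_t) = 1, consuming the final byte):
++4 + 2 + 1 = 7 bytes in total.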
*/ + + extern bool gimple_crc_match_index (tree, tree *, tree (*)(tree)); + extern bool gimple_crc_match_res (tree, tree *, tree (*)(tree)); + + static gimple *crc_table_read_stmt = NULL; + +-static gphi* phi_s = NULL; +-static gphi* phi_c = NULL; ++static gphi *phi_s = NULL; ++static gphi *phi_c = NULL; + static tree nn_tree = NULL; + + enum aarch64_crc_builtins +@@ -130,11 +142,11 @@ enum aarch64_crc_builtins + /* The useful information of origin loop. */ + struct origin_loop_info + { +- tree limit; /* The limit index of the array in the old loop. */ +- tree base_n; /* The initial value of the old loop. */ +- tree base_s; /* The initial value of the old loop. */ +- tree base_c; /* The initial value of the old loop. */ +- edge entry_edge; /* The edge into the old loop. */ ++ tree limit; /* The limit index of the array in the old loop. */ ++ tree base_n; /* The initial value of the old loop. */ ++ tree base_s; /* The initial value of the old loop. */ ++ tree base_c; /* The initial value of the old loop. */ ++ edge entry_edge; /* The edge into the old loop. */ + edge exit_edge; /* The edge outto the old loop. */ + basic_block exit_bb; + }; +@@ -215,14 +227,14 @@ get_iv_upper_bound (gimple *stmt) + return false; + + /* TODO: Currently, the input restrictions on lhs and rhs are implemented +- through PARM_DECL. We may consider relax the restrictions later, and ++ through PARM_DECL. We may consider relax the restrictions later, and + we need to consider the overall adaptation scenario and adding test +- cases. */ ++ cases. */ + if (ssa_name_var_p (lhs) && TREE_CODE (SSA_NAME_VAR (lhs)) == PARM_DECL) +- { +- origin_loop.limit = rhs; +- origin_loop.base_n = lhs; +- } ++ { ++ origin_loop.limit = rhs; ++ origin_loop.base_n = lhs; ++ } + else + return false; + +@@ -233,26 +245,26 @@ get_iv_upper_bound (gimple *stmt) + } + + /* Get origin loop info. 
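+    Records the single exit edge and exit block, the preheader entry
+    edge, the initial values of s and c from their PHI arguments on the
+    entry edge, and the upper bound of n from the enclosing guard
+    condition.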
*/ +-static bool +-get_origin_loop_info(class loop *loop) ++static bool ++get_origin_loop_info (class loop *loop) + { + vec edges; +- edges = get_loop_exit_edges (loop); ++ edges = get_loop_exit_edges (loop); + origin_loop.exit_edge = edges[0]; + origin_loop.exit_bb = origin_loop.exit_edge->dest; +- origin_loop.entry_edge = get_loop_preheader_edge(loop); +- origin_loop.base_s = PHI_ARG_DEF_FROM_EDGE(phi_s,origin_loop.entry_edge); +- origin_loop.base_c = PHI_ARG_DEF_FROM_EDGE(phi_c,origin_loop.entry_edge); +- ++ origin_loop.entry_edge = get_loop_preheader_edge (loop); ++ origin_loop.base_s = PHI_ARG_DEF_FROM_EDGE (phi_s,origin_loop.entry_edge); ++ origin_loop.base_c = PHI_ARG_DEF_FROM_EDGE (phi_c,origin_loop.entry_edge); ++ + basic_block preheader_bb; + preheader_bb = origin_loop.entry_edge->src; +- +- if(preheader_bb->preds->length() != 1) ++ ++ if (preheader_bb->preds->length () != 1) + return false; + + edge entry_pre_bb_edge; + entry_pre_bb_edge = EDGE_PRED (preheader_bb, 0); +- ++ + basic_block pre_preheader_bb; + pre_preheader_bb = entry_pre_bb_edge->src; + +@@ -260,21 +272,23 @@ get_origin_loop_info(class loop *loop) + gimple *stmt; + bool get_upper_bound = false; + for (gsi = gsi_start_bb (pre_preheader_bb); !gsi_end_p (gsi); gsi_next (&gsi)) +- { +- stmt = gsi_stmt (gsi); +- if (stmt && gimple_code (stmt) == GIMPLE_COND +- && get_iv_upper_bound (stmt)) { +- get_upper_bound = true; +- break; ++ { ++ stmt = gsi_stmt (gsi); ++ if (stmt && gimple_code (stmt) == GIMPLE_COND ++ && get_iv_upper_bound (stmt)) ++ { ++ get_upper_bound = true; ++ break; ++ } + } +- } + + return get_upper_bound; + } + + /* The loop form check will check the entire loop control flow + It should be a loop that: +- 1. a do-while loop with header and latch only with no other control flow inside the loop ++ 1. a do-while loop with header and latch only with no other control flow ++ inside the loop + 2. have only one exiting edge + 3. 
have only one back edge and one entry edge + */ +@@ -283,14 +297,15 @@ crc_loop_form_check (class loop *loop) + { + if (loop->num_nodes > 2 || loop->inner) + return false; +- // should only have 1 exit edge ++ // Should only have 1 exit edge + vec edges; + edges = get_loop_exit_edges (loop); + if (edges.length() != 1) + return false; + +- // The header should have only 2 incoming edges +- // One of them is the preheader edge and the other is the backedge from the latch ++ // The header should have only 2 incoming edges ++ // One of them is the preheader edge and the other is the backedge from the ++ // latch + if (EDGE_COUNT (loop->header->preds) != 2) + return false; + edge e1 = EDGE_PRED (loop->header, 0); +@@ -317,23 +332,23 @@ only_one_array_read (class loop *loop, tree &crc_table) + if (stmt == NULL) + return false; + +- if (gimple_code (stmt) == GIMPLE_ASSIGN && +- TREE_CODE(gimple_assign_lhs (stmt)) == ARRAY_REF ) +- return false; ++ if (gimple_code (stmt) == GIMPLE_ASSIGN ++ && TREE_CODE (gimple_assign_lhs (stmt)) == ARRAY_REF) ++ return false; + +- if (gimple_code (stmt) == GIMPLE_ASSIGN && +- TREE_CODE(gimple_assign_rhs1 (stmt)) == ARRAY_REF) +- { +- if (crc_table == NULL && +- gimple_assign_rhs1 (stmt)->base.readonly_flag) +- { +- crc_table = gimple_assign_rhs1 (stmt); +- crc_table_read_stmt = stmt; +- res = true; +- } +- else +- return false; +- } ++ if (gimple_code (stmt) == GIMPLE_ASSIGN ++ && TREE_CODE (gimple_assign_rhs1 (stmt)) == ARRAY_REF) ++ { ++ if (crc_table == NULL ++ && gimple_assign_rhs1 (stmt)->base.readonly_flag) ++ { ++ crc_table = gimple_assign_rhs1 (stmt); ++ crc_table_read_stmt = stmt; ++ res = true; ++ } ++ else ++ return false; ++ } + } + return res; + } +@@ -400,19 +415,26 @@ match_crc_table (tree crc_table) + const unsigned LOW_BOUND = 0; + const unsigned UP_BOUND = 255; + const unsigned ELEMENT_SIZE = 8; +- unsigned HOST_WIDE_INT lb = tree_to_uhwi (array_ref_low_bound (crc_table)); +- unsigned HOST_WIDE_INT ub = tree_to_uhwi (array_ref_up_bound (crc_table)); +- unsigned HOST_WIDE_INT es = tree_to_uhwi (array_ref_element_size (crc_table)); ++ tree low_bound = array_ref_low_bound (crc_table); ++ tree up_bound = array_ref_up_bound (crc_table); ++ tree element_size = array_ref_element_size (crc_table); ++ if (low_bound == NULL || up_bound == NULL || element_size == NULL) ++ return false; ++ unsigned HOST_WIDE_INT lb = tree_to_uhwi (low_bound); ++ unsigned HOST_WIDE_INT ub = tree_to_uhwi (up_bound); ++ unsigned HOST_WIDE_INT es = tree_to_uhwi (element_size); + if (lb != LOW_BOUND || ub != UP_BOUND || es != ELEMENT_SIZE) + return false; + + tree decl = TREE_OPERAND (crc_table, 0); + tree ctor = ctor_for_folding(decl); +- for (int i = lb; i <= ub; i++) { +- unsigned HOST_WIDE_INT val = tree_to_uhwi (CONSTRUCTOR_ELT (ctor,i)->value); +- if (crc_32_tab[i] != val) +- return false; +- } ++ for (int i = lb; i <= ub; i++) ++ { ++ unsigned HOST_WIDE_INT val = tree_to_uhwi (CONSTRUCTOR_ELT (ctor, ++ i)->value); ++ if (crc_32_tab[i] != val) ++ return false; ++ } + return true; + } + +@@ -426,21 +448,23 @@ crc_table_check (class loop *loop) + if (!only_one_array_read (loop, crc_table)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "\nTable check fail. not only single array is read.\n"); ++ fprintf (dump_file, "\nTable check fail. not only single array " ++ "is read.\n"); + return false; + } + if (!match_crc_table (crc_table)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "\nTable check fail. 
Table not matching.\n"); ++ fprintf (dump_file, "\nTable check fail. Table not matching.\n"); + return false; +- } ++ } + return true; + } + +-/* check whether the evolution pattern of phi is phi = SSA_NAME + target*/ ++/* Check whether the evolution pattern of phi is phi = SSA_NAME + target*/ + static bool +-evolution_pattern_plus_with_p (class loop* loop, gphi *phi, unsigned HOST_WIDE_INT target) ++evolution_pattern_plus_with_p (class loop *loop, gphi *phi, ++ unsigned HOST_WIDE_INT target) + { + edge backedge = find_edge (loop->latch, loop->header); + if (backedge == NULL) +@@ -449,7 +473,8 @@ evolution_pattern_plus_with_p (class loop* loop, gphi *phi, unsigned HOST_WIDE_I + gimple *evolution_expr = SSA_NAME_DEF_STMT (evolution_node); + + if (evolution_expr && (gimple_assign_rhs_code (evolution_expr) == PLUS_EXPR || +- gimple_assign_rhs_code (evolution_expr) == POINTER_PLUS_EXPR)) ++ gimple_assign_rhs_code (evolution_expr) ++ == POINTER_PLUS_EXPR)) + { + tree rhs1 = gimple_assign_rhs1 (evolution_expr); + tree rhs2 = gimple_assign_rhs2 (evolution_expr); +@@ -471,13 +496,14 @@ check_num_of_phi (basic_block header, gphi *capture[]) + + for (gsi = gsi_start_phis (header); !gsi_end_p (gsi); gsi_next (&gsi)) + { +- phi = gsi.phi(); +- if (phi) num_of_phi++; +- if (num_of_phi > 3) +- return false; +- capture[num_of_phi - 1] = phi; ++ phi = gsi.phi (); ++ if (phi) ++ num_of_phi++; ++ if (num_of_phi > 3) ++ return false; ++ capture[num_of_phi - 1] = phi; + } +- /* phi node should be exactly 3. */ ++ /* Phi node should be exactly 3. */ + return num_of_phi == 3; + } + +@@ -486,7 +512,7 @@ check_num_of_phi (basic_block header, gphi *capture[]) + every time (n), and a 3rd one neither (c). Return 3 phi nodes in + the capture with the order of s,n,c.*/ + static bool +-check_evolution_pattern (class loop* loop, gphi *capture[]) ++check_evolution_pattern (class loop *loop, gphi *capture[]) + { + gphi *s=NULL; + gphi *n=NULL; +@@ -494,29 +520,29 @@ check_evolution_pattern (class loop* loop, gphi *capture[]) + + for (int i = 0; i < 3; i++) + { +- if (evolution_pattern_plus_with_p(loop, capture[i], 1)) ++ if (evolution_pattern_plus_with_p (loop, capture[i], 1)) + { + if (s != NULL) +- return false; ++ return false; + s = capture[i]; + phi_s = s; + } +- else if (evolution_pattern_plus_with_p(loop, capture[i], 4294967295)) ++ else if (evolution_pattern_plus_with_p (loop, capture[i], 4294967295)) + { + if (n != NULL) +- return false; ++ return false; + n = capture[i]; + } + else + { + if (c != NULL) +- return false; ++ return false; + c = capture[i]; + phi_c = c; + } + } + +- // some envolution pattern cannot find ++ // Some envolution pattern cannot find + if (!n || !s || !c) + return false; + +@@ -525,12 +551,13 @@ check_evolution_pattern (class loop* loop, gphi *capture[]) + capture[2] = c; + return true; + } +-/* check the calculation pattern before and after the crc_table array read stmt. ++/* Check the calculation pattern before and after the crc_table array read stmt. + _7 = crc_32_tab[_6]; +- The caculation of index _6 should be the result of a sequency of calculation by the s and c ++ The caculation of index _6 should be the result of a sequency of calculation ++ by the s and c + The result of the array read _7 should be used to calculate the new c. 
*/ + static bool +-check_calculation_pattern (class loop* loop, gphi *capture[]) ++check_calculation_pattern (class loop *loop, gphi *capture[]) + { + gphi *s=capture[0]; + gphi *c=capture[2]; +@@ -542,7 +569,7 @@ check_calculation_pattern (class loop* loop, gphi *capture[]) + _5 = _4 ^ c_10; //BIT_XOR_EXPR (SSA_NAME, PHI @1) + _6 = _5 & 255; //BIT_XOR_EXPR (SSA_NAME, INTEGER_CST@3) + */ +- if (!gimple_crc_match_index(index, res_ops, NULL)) ++ if (!gimple_crc_match_index (index, res_ops, NULL)) + return false; + gimple *s_res_stmt = SSA_NAME_DEF_STMT (res_ops[0]); + if (!s_res_stmt) +@@ -552,38 +579,37 @@ check_calculation_pattern (class loop* loop, gphi *capture[]) + return false; + tree s_res = TREE_OPERAND (gimple_assign_rhs1 (s_def_stmt), 0); + if (res_ops[1] != gimple_phi_result (c) || s_res != gimple_phi_result (s)) +- { + return false; +- } + + /* Try to match + _8 = c_12 >> 8; // RSHIFT_EXPR (SSA_NAME @1, INTEGER_CST @2) + c_19 = _7 ^ _8; // BIT_XOR_EXPR (SSA_NAME@3, SSA_NAME) + */ +- edge backedge = find_edge(loop->latch, loop->header); ++ edge backedge = find_edge (loop->latch, loop->header); + tree updated_c = PHI_ARG_DEF_FROM_EDGE (c, backedge); +- if (!gimple_crc_match_res(updated_c, res_ops, NULL)) ++ if (!gimple_crc_match_res (updated_c, res_ops, NULL)) + return false; + if (res_ops[0] != gimple_phi_result (c) +- || res_ops[2] != gimple_assign_lhs(crc_table_read_stmt)) +- { +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "\n gimple_crc_match_res pattern check failed.\n"); +- return false; +- } ++ || res_ops[2] != gimple_assign_lhs (crc_table_read_stmt)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\n gimple_crc_match_res pattern check failed.\n"); ++ return false; ++ } + + return true; + } + +-/* check the exit condition is n != 0. */ ++/* Check the exit condition is n != 0. */ + static bool +-check_exit_condition (class loop* loop, gphi *n) ++check_exit_condition (class loop *loop, gphi *n) + { +- edge backedge = find_edge(loop->latch, loop->header); ++ edge backedge = find_edge (loop->latch, loop->header); + gimple *cond_stmt = gsi_stmt (gsi_last_bb (loop->header)); +- if (!cond_stmt || gimple_code (cond_stmt) != GIMPLE_COND || gimple_cond_code (cond_stmt) != NE_EXPR ++ if (!cond_stmt || gimple_code (cond_stmt) != GIMPLE_COND ++ || gimple_cond_code (cond_stmt) != NE_EXPR + || gimple_cond_lhs (cond_stmt) != PHI_ARG_DEF_FROM_EDGE (n, backedge) +- || tree_to_uhwi(gimple_cond_rhs (cond_stmt)) != 0) ++ || tree_to_uhwi (gimple_cond_rhs (cond_stmt)) != 0) + return false; + + return true; +@@ -620,8 +646,8 @@ so the matching condition is + which is limited by the condition that the loop have exactly 3 phi nodes. + 2. The 3 loop variants should have evolution pattern as 1 of the 3 nodes is + increased by 1 every itoration, 1 of the 3 nodes is decreased by 1 every itor +- and the 3rd one is neither. These three tree node SSA value will be captured for +- the later arithmatic pattern matching ++ and the 3rd one is neither. These three tree node SSA value will be captured ++ for the later arithmatic pattern matching + 3. Pattern matching for the index of crc_table + 4. pattern matching for the result of c calcuation after read from crc_table + 5. The exit condition matching. 
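To make this screening concrete, here is a self-contained C sketch — illustrative only, not a file in this patch — of the byte-wise CRC-32 kernel described in the pass header, with the statements that gimple_crc_match_index and gimple_crc_match_res are expected to match marked inline. The three loop-carried values s, n and c are exactly the three PHI nodes that check_num_of_phi and check_evolution_pattern look for:

    /* 256-entry CRC table, the shape that match_crc_table verifies
       (bounds 0..255, 8-byte elements).  */
    extern const unsigned long crc_32_tab[256];

    unsigned long
    crc32_bytewise (unsigned long c, const unsigned char *s, unsigned n)
    {
      if (n)
        do
          {
            /* gimple_crc_match_index: _5 = (int) _3 ^ c; _6 = _5 & 255.  */
            int i = ((int) c ^ (*s++)) & 0xff;
            /* gimple_crc_match_res: _8 = c >> 8; c = crc_32_tab[_6] ^ _8.  */
            c = crc_32_tab[i] ^ (c >> 8);
          }
        while (--n);   /* n != 0 exit test, as check_exit_condition requires.  */
      return c;
    }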
+@@ -637,107 +663,101 @@ crc_loop_body_check (class loop *loop) + fprintf (dump_file, "\n num of phi noeds check failed.\n"); + return false; + } +- if (!check_evolution_pattern(loop, capture)) ++ if (!check_evolution_pattern (loop, capture)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\n evolution pattern check failed.\n"); +- return false; ++ return false; + } +- if (!check_calculation_pattern(loop, capture)) ++ if (!check_calculation_pattern (loop, capture)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\n calculation pattern check failed.\n"); +- return false; ++ return false; + } +- if (!check_exit_condition(loop, capture[1] /* n*/)) ++ if (!check_exit_condition (loop, capture[1])) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\n exit condition check failed.\n"); +- return false; ++ return false; + } + return true; + } + +-/* Check the prev_bb of prev_bb of loop header. The prev_bb we are trying to match is ++/* Check the prev_bb of prev_bb of loop header. The prev_bb we are trying to ++match is + + c_15 = crc; +-if (n_16(D) != 0) ++if (n_16 (D) != 0) + goto ; [INV] + else + goto ; [INV] + + In this case , we must be sure that the n is not zero. + so the match condition is +- 1、the n is not zero. ++ 1 the n is not zero. + + : +-if (s_13(D) == 0B) ++if (s_13 (D) == 0B) + goto ; [INV] + else + goto ; [INV] + + In this case, we must be sure the s is not NULL. + so the match condition is +- 1、the s is not NULL. ++ 1 the s is not NULL. + */ + static bool +-crc_prev_bb_of_loop_header_check(class loop *loop) ++crc_prev_bb_of_loop_header_check (class loop *loop) + { + basic_block header = loop->header; + basic_block prev_header_bb = header->prev_bb; +- if(NULL == prev_header_bb) +- { ++ if (NULL == prev_header_bb) + return false; +- } + + basic_block prev_prev_header_bb = prev_header_bb->prev_bb; +- if(NULL == prev_prev_header_bb) +- { ++ if (NULL == prev_prev_header_bb) + return false; +- } + + gimple_stmt_iterator gsi; + gimple *stmt; + bool res = false; +- for (gsi = gsi_start_bb (prev_prev_header_bb); !gsi_end_p (gsi); gsi_next (&gsi)) +- { +- stmt = gsi_stmt (gsi); +- if (stmt == NULL) +- return false; ++ for (gsi = gsi_start_bb (prev_prev_header_bb); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ stmt = gsi_stmt (gsi); ++ if (stmt == NULL) ++ return false; + +- if (gimple_code (stmt) == GIMPLE_COND && +- gimple_cond_code(stmt) == NE_EXPR && +- TREE_CODE(gimple_cond_rhs (stmt)) == INTEGER_CST && +- tree_int_cst_sgn(gimple_cond_rhs (stmt)) == 0 ) ++ if (gimple_code (stmt) == GIMPLE_COND ++ && gimple_cond_code (stmt) == NE_EXPR ++ && TREE_CODE (gimple_cond_rhs (stmt)) == INTEGER_CST ++ && tree_int_cst_sgn (gimple_cond_rhs (stmt)) == 0) + { + res = true; + break; + } +- } ++ } + +- if(!res) +- { ++ if (!res) + return false; +- } + + basic_block first_bb = prev_prev_header_bb->prev_bb; +- if(NULL == first_bb) ++ if (NULL == first_bb) + return false; + + for (gsi = gsi_start_bb (first_bb); !gsi_end_p (gsi); gsi_next (&gsi)) +- { +- stmt = gsi_stmt (gsi); +- if (stmt == NULL) +- return false; ++ { ++ stmt = gsi_stmt (gsi); ++ if (stmt == NULL) ++ return false; + +- if (gimple_code (stmt) == GIMPLE_COND && +- gimple_cond_code(stmt) == EQ_EXPR && +- TREE_CODE(gimple_cond_rhs (stmt)) == INTEGER_CST && +- tree_int_cst_sgn(gimple_cond_rhs (stmt)) == 0 ) +- { +- return true; +- } +- } ++ if (gimple_code (stmt) == GIMPLE_COND ++ && gimple_cond_code (stmt) == EQ_EXPR ++ && TREE_CODE (gimple_cond_rhs (stmt)) == 
INTEGER_CST ++ && tree_int_cst_sgn (gimple_cond_rhs (stmt)) == 0) ++ return true; ++ } + + return false; + } +@@ -745,86 +765,88 @@ crc_prev_bb_of_loop_header_check(class loop *loop) + static bool + match_crc_loop (class loop *loop) + { +- if (!crc_loop_form_check(loop)) ++ if (!crc_loop_form_check (loop)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "\nWrong loop form for crc matching.\n"); ++ fprintf (dump_file, "\nWrong loop form for crc matching.\n"); + return false; + } +- if (!crc_table_check(loop)) ++ if (!crc_table_check (loop)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "\nWrong crc table for crc matching.\n"); ++ fprintf (dump_file, "\nWrong crc table for crc matching.\n"); + return false; + } +- if (!crc_loop_body_check(loop)) ++ if (!crc_loop_body_check (loop)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "\nWrong loop body for crc matching.\n"); ++ fprintf (dump_file, "\nWrong loop body for crc matching.\n"); + return false; + } +- if(!crc_prev_bb_of_loop_header_check(loop)) +- { +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "\nWrong prev basic_blocks of loop header for crc matching.\n"); ++ if (!crc_prev_bb_of_loop_header_check (loop)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nWrong prev basic_blocks of loop header for" ++ " crc matching.\n"); + return false; +- } ++ } + +- init_origin_loop_structure(); +- if(!get_origin_loop_info(loop)) +- return false; ++ init_origin_loop_structure (); ++ if (!get_origin_loop_info (loop)) ++ return false; + + return true; + } + + static void + create_new_bb (basic_block &new_bb, basic_block after_bb, +- basic_block dominator_bb, class loop *outer) ++ basic_block dominator_bb, class loop *outer) + { + new_bb = create_empty_bb (after_bb); + add_bb_to_loop (new_bb, outer); + set_immediate_dominator (CDI_DOMINATORS, new_bb, dominator_bb); + } + +-static void +-change_preheader_bb(edge entry_edge) ++static void ++change_preheader_bb (edge entry_edge) + { + gimple_seq stmts = NULL; + gimple_stmt_iterator gsi; +- gimple* g; ++ gimple *g; + tree lhs1; + +- lhs1 = create_tmp_var(TREE_TYPE(origin_loop.base_n),"nn"); +- lhs1 = make_ssa_name(lhs1); ++ lhs1 = create_tmp_var (TREE_TYPE (origin_loop.base_n),"nn"); ++ lhs1 = make_ssa_name (lhs1); + gsi = gsi_last_bb (entry_edge->src); +- g = gimple_build_assign(lhs1,RSHIFT_EXPR,origin_loop.base_n, +- build_int_cst (TREE_TYPE (origin_loop.base_n), 2)); +- gimple_seq_add_stmt(&stmts,g); ++ g = gimple_build_assign (lhs1, RSHIFT_EXPR, origin_loop.base_n, ++ build_int_cst (TREE_TYPE (origin_loop.base_n), 2)); ++ gimple_seq_add_stmt (&stmts, g); + gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); + nn_tree = lhs1; +- set_current_def(nn_tree, lhs1); ++ set_current_def (nn_tree, lhs1); + nn_map.put (entry_edge->src, lhs1); + } + +-static gphi* +-create_phi_node_for_bb(tree old_name, basic_block bb) ++static gphi * ++create_phi_node_for_bb (tree old_name, basic_block bb) + { +- gphi *phi = create_phi_node(NULL_TREE, bb); +- create_new_def_for(old_name, phi, gimple_phi_result_ptr(phi)); ++ gphi *phi = create_phi_node (NULL_TREE, bb); ++ create_new_def_for (old_name, phi, gimple_phi_result_ptr (phi)); + return phi; + } + +-static gimple* +-call_builtin_fun(int code,tree& lhs, tree arg1, tree arg2) ++static gimple * ++call_builtin_fun (int code,tree &lhs, tree arg1, tree arg2) + { +- unsigned int builtin_code = targetm.get_crc_builtin_code(code, true);// 
根据code获取到正确的builtin_fun_code +- tree fn = targetm.builtin_decl(builtin_code,true); // get the decl of __builtin_aarch64_crc32w ++ unsigned int builtin_code = targetm.get_crc_builtin_code (code, true); ++ // Get the decl of __builtin_aarch64_crc32w ++ tree fn = targetm.builtin_decl (builtin_code, true); + if (!fn || fn == error_mark_node) + fatal_error (input_location, + "target specific builtin not available"); +- gimple* call_builtin = gimple_build_call(fn, 2, arg1, arg2); // _40 = __builtin_aarch64_crc32* (_1, _2); ++ gimple *call_builtin = gimple_build_call (fn, 2, arg1, arg2); + lhs = make_ssa_name (unsigned_type_node); +- gimple_call_set_lhs(call_builtin,lhs); ++ gimple_call_set_lhs (call_builtin, lhs); + + return call_builtin; + } +@@ -843,58 +865,60 @@ call_builtin_fun(int code,tree& lhs, tree arg1, tree arg2) + if (nn_31 != 0) + The IR of bb is as above. */ + static void +-create_loop_bb(basic_block& loop_bb, basic_block after_bb, +- basic_block dominator_bb, class loop *outer, edge entry_edge) ++create_loop_bb (basic_block &loop_bb, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer, edge entry_edge) + { + gimple_seq stmts = NULL; + gimple_stmt_iterator gsi; +- gimple* g; +- gphi* phi_s_loop; +- gphi* phi_c_loop; +- gphi* phi_nn_loop; +- +- create_new_bb(loop_bb, after_bb, dominator_bb, outer); +- redirect_edge_and_branch(entry_edge, loop_bb); +- gsi = gsi_last_bb(loop_bb); +- tree entry_nn = get_current_def(nn_tree); +- phi_s_loop = create_phi_node_for_bb(origin_loop.base_s, loop_bb); +- phi_c_loop = create_phi_node_for_bb(origin_loop.base_c, loop_bb); +- phi_nn_loop = create_phi_node_for_bb(entry_nn, loop_bb); +- +- tree res_s = gimple_phi_result(phi_s_loop); +- tree res_nn = gimple_phi_result(phi_nn_loop); +- tree lhs1 = gimple_build(&stmts, NOP_EXPR, unsigned_type_node, +- gimple_phi_result(phi_c_loop)); +- g = gimple_build_assign(make_ssa_name(unsigned_type_node), +- fold_build2(MEM_REF,unsigned_type_node,res_s, +- build_int_cst (build_pointer_type (unsigned_type_node), 0))); +- gimple_seq_add_stmt(&stmts, g); +- tree lhs2 = gimple_assign_lhs(g); // _2 = MEM[(uint32_t *)s_14]; +- unsigned int code = AARCH64_BUILTIN_CRC32W; ++ gimple *g; ++ gphi *phi_s_loop; ++ gphi *phi_c_loop; ++ gphi *phi_nn_loop; ++ ++ create_new_bb (loop_bb, after_bb, dominator_bb, outer); ++ redirect_edge_and_branch (entry_edge, loop_bb); ++ gsi = gsi_last_bb (loop_bb); ++ tree entry_nn = get_current_def (nn_tree); ++ phi_s_loop = create_phi_node_for_bb (origin_loop.base_s, loop_bb); ++ phi_c_loop = create_phi_node_for_bb (origin_loop.base_c, loop_bb); ++ phi_nn_loop = create_phi_node_for_bb (entry_nn, loop_bb); ++ ++ tree res_s = gimple_phi_result (phi_s_loop); ++ tree res_nn = gimple_phi_result (phi_nn_loop); ++ tree lhs1 = gimple_build (&stmts, NOP_EXPR, unsigned_type_node, ++ gimple_phi_result (phi_c_loop)); ++ g = gimple_build_assign (make_ssa_name (unsigned_type_node), ++ fold_build2 (MEM_REF, unsigned_type_node, res_s, ++ build_int_cst ( ++ build_pointer_type ( ++ unsigned_type_node),0))); ++ gimple_seq_add_stmt (&stmts, g); ++ tree lhs2 = gimple_assign_lhs (g); // _2 = MEM[(uint32_t *)s_14]; ++ unsigned int code = AARCH64_BUILTIN_CRC32W; + tree lhs3; +- gimple* build_crc32w = call_builtin_fun(code,lhs3, lhs1, lhs2); +- crc_map.put(loop_bb, lhs3); +- gimple_seq_add_stmt(&stmts,build_crc32w); +- +- tree lhs4 = copy_ssa_name(origin_loop.base_c); +- g = gimple_build_assign(lhs4, NOP_EXPR, lhs3); +- gimple_seq_add_stmt(&stmts, g); +- c_map.put(loop_bb, lhs4); +- +- tree lhs5 = 
copy_ssa_name(origin_loop.base_s); +- g = gimple_build_assign(lhs5, POINTER_PLUS_EXPR, res_s, +- build_int_cst (sizetype, 4)); +- gimple_seq_add_stmt(&stmts, g); +- s_map.put(loop_bb, lhs5); +- +- tree lhs6 = copy_ssa_name(nn_tree); +- g = gimple_build_assign(lhs6, PLUS_EXPR, res_nn, ++ gimple *build_crc32w = call_builtin_fun (code, lhs3, lhs1, lhs2); ++ crc_map.put (loop_bb, lhs3); ++ gimple_seq_add_stmt (&stmts, build_crc32w); ++ ++ tree lhs4 = copy_ssa_name (origin_loop.base_c); ++ g = gimple_build_assign (lhs4, NOP_EXPR, lhs3); ++ gimple_seq_add_stmt (&stmts, g); ++ c_map.put (loop_bb, lhs4); ++ ++ tree lhs5 = copy_ssa_name (origin_loop.base_s); ++ g = gimple_build_assign (lhs5, POINTER_PLUS_EXPR, res_s, ++ build_int_cst (sizetype, 4)); ++ gimple_seq_add_stmt (&stmts, g); ++ s_map.put (loop_bb, lhs5); ++ ++ tree lhs6 = copy_ssa_name (nn_tree); ++ g = gimple_build_assign (lhs6, PLUS_EXPR, res_nn, + build_int_cst (TREE_TYPE (res_nn), 4294967295)); +- gimple_seq_add_stmt(&stmts,g); +- nn_map.put(loop_bb, lhs6); ++ gimple_seq_add_stmt (&stmts,g); ++ nn_map.put (loop_bb, lhs6); + +- gcond* cond_stmt = gimple_build_cond (NE_EXPR, lhs6, origin_loop.limit, +- NULL_TREE, NULL_TREE); ++ gcond *cond_stmt = gimple_build_cond (NE_EXPR, lhs6, origin_loop.limit, ++ NULL_TREE, NULL_TREE); + gimple_seq_add_stmt (&stmts, cond_stmt); + gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); + } +@@ -904,30 +928,32 @@ create_loop_bb(basic_block& loop_bb, basic_block after_bb, + # s_46 = PHI + _44 = n_26(D) & 2; + if (_44 != 0) +- The IR of bb is as above. */ ++ The IR of bb is as above. */ + static void +-create_cond_bb(basic_block& cond_bb, basic_block after_bb, +- basic_block dominator_bb, class loop *outer){ ++create_cond_bb (basic_block &cond_bb, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer) ++{ + gimple_seq stmts = NULL; + gimple_stmt_iterator gsi; +- gphi* phi_s_loop; +- gphi* phi_c_loop; +- +- create_new_bb(cond_bb, after_bb, dominator_bb, outer); +- gsi = gsi_last_bb(cond_bb); +- tree entry_nn = get_current_def(nn_tree); +- phi_s_loop = create_phi_node_for_bb(origin_loop.base_s, cond_bb); +- phi_c_loop = create_phi_node_for_bb(origin_loop.base_c, cond_bb); +- tree res_s = gimple_phi_result(phi_s_loop); +- set_current_def(origin_loop.base_s, res_s); +- s_map.put(cond_bb, res_s); +- tree res_c = gimple_phi_result(phi_c_loop); +- set_current_def(origin_loop.base_c, res_c); +- c_map.put(cond_bb, res_c); +- +- tree lhs1 = gimple_build(&stmts, BIT_AND_EXPR, TREE_TYPE(origin_loop.base_n), +- origin_loop.base_n, build_int_cst (TREE_TYPE (origin_loop.base_n), 2)); +- gcond* cond_stmt = gimple_build_cond (NE_EXPR, lhs1, origin_loop.limit, ++ gphi *phi_s_loop; ++ gphi *phi_c_loop; ++ ++ create_new_bb (cond_bb, after_bb, dominator_bb, outer); ++ gsi = gsi_last_bb (cond_bb); ++ tree entry_nn = get_current_def (nn_tree); ++ phi_s_loop = create_phi_node_for_bb (origin_loop.base_s, cond_bb); ++ phi_c_loop = create_phi_node_for_bb (origin_loop.base_c, cond_bb); ++ tree res_s = gimple_phi_result (phi_s_loop); ++ set_current_def (origin_loop.base_s, res_s); ++ s_map.put (cond_bb, res_s); ++ tree res_c = gimple_phi_result (phi_c_loop); ++ set_current_def (origin_loop.base_c, res_c); ++ c_map.put (cond_bb, res_c); ++ ++ tree lhs1 = gimple_build (&stmts, BIT_AND_EXPR, ++ TREE_TYPE (origin_loop.base_n), origin_loop.base_n, ++ build_int_cst (TREE_TYPE (origin_loop.base_n), 2)); ++ gcond *cond_stmt = gimple_build_cond (NE_EXPR, lhs1, origin_loop.limit, + NULL_TREE, NULL_TREE); + gimple_seq_add_stmt 
(&stmts, cond_stmt); + gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); +@@ -938,41 +964,45 @@ create_cond_bb(basic_block& cond_bb, basic_block after_bb, + _41 = __builtin_aarch64_crc32h (_8, _7); + c_33 = (long unsigned int) _41; + s_34 = s_30 + 2; +- The IR of bb is as above.*/ ++ The IR of bb is as above. */ + static void +-create_cond_true_bb(basic_block& cond_true_bb, basic_block after_bb, +- basic_block dominator_bb, class loop *outer){ ++create_cond_true_bb (basic_block &cond_true_bb, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer) ++{ + gimple_seq stmts = NULL; +- gimple* g; ++ gimple *g; + gimple_stmt_iterator gsi; + +- create_new_bb(cond_true_bb, after_bb, dominator_bb, outer); +- gsi = gsi_last_bb(cond_true_bb); +- tree s_46 = *(s_map.get(after_bb)); +- g = gimple_build_assign(make_ssa_name(short_unsigned_type_node), +- fold_build2(MEM_REF,short_unsigned_type_node,s_46, +- build_int_cst (build_pointer_type (short_unsigned_type_node), 0))); +- gimple_seq_add_stmt(&stmts,g); +- tree lhs1 = gimple_assign_lhs(g); // _7 = MEM[(uint16_t *)s_46]; ++ create_new_bb (cond_true_bb, after_bb, dominator_bb, outer); ++ gsi = gsi_last_bb (cond_true_bb); ++ tree s_46 = *(s_map.get (after_bb)); ++ tree type = build_pointer_type (short_unsigned_type_node); ++ g = gimple_build_assign (make_ssa_name (short_unsigned_type_node), ++ fold_build2 (MEM_REF, short_unsigned_type_node, s_46, ++ build_int_cst (type, 0))); ++ gimple_seq_add_stmt (&stmts,g); ++ tree lhs1 = gimple_assign_lhs (g); // _7 = MEM[(uint16_t *)s_46]; + unsigned int code = AARCH64_BUILTIN_CRC32H; + tree lhs2; +- gimple* call_builtin = call_builtin_fun(code, lhs2,*(crc_map.get(cond_true_bb->prev_bb->prev_bb)),lhs1); +- crc_map.put(cond_true_bb,lhs2); +- gimple_seq_add_stmt(&stmts, call_builtin); +- +- tree lhs3 = copy_ssa_name(origin_loop.base_c); +- g = gimple_build_assign(lhs3, NOP_EXPR, lhs2); +- gimple_seq_add_stmt(&stmts, g); +- c_map.put(cond_true_bb, lhs3); ++ gimple *call_builtin = call_builtin_fun (code, lhs2, ++ *(crc_map.get ( ++ cond_true_bb->prev_bb->prev_bb)), lhs1); ++ crc_map.put (cond_true_bb,lhs2); ++ gimple_seq_add_stmt (&stmts, call_builtin); ++ ++ tree lhs3 = copy_ssa_name (origin_loop.base_c); ++ g = gimple_build_assign (lhs3, NOP_EXPR, lhs2); ++ gimple_seq_add_stmt (&stmts, g); ++ c_map.put (cond_true_bb, lhs3); + +- tree lhs5 = copy_ssa_name(s_46); +- g = gimple_build_assign(lhs5, POINTER_PLUS_EXPR, s_46, +- build_int_cst (sizetype, 2)); // s_30 + 2; +- gimple_seq_add_stmt(&stmts, g); +- s_map.put(cond_true_bb, lhs5); ++ tree lhs5 = copy_ssa_name (s_46); ++ g = gimple_build_assign (lhs5, POINTER_PLUS_EXPR, s_46, ++ build_int_cst (sizetype, 2)); // s_30 + 2; ++ gimple_seq_add_stmt (&stmts, g); ++ s_map.put (cond_true_bb, lhs5); + + gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); +- s_map.put(cond_true_bb, lhs5); ++ s_map.put (cond_true_bb, lhs5); + } + + /* : +@@ -980,34 +1010,35 @@ create_cond_true_bb(basic_block& cond_true_bb, basic_block after_bb, + # c_17 = PHI + _3 = n_26(D) & 1; + if (_3 != 0) +- The IR of bb is as above.*/ ++ The IR of bb is as above. 
*/ + static void +-create_cond_false_bb(basic_block& cond_false_bb, basic_block after_bb, +- basic_block dominator_bb, class loop *outer) ++create_cond_false_bb (basic_block &cond_false_bb, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer) + { + gimple_seq stmts = NULL; + gimple_stmt_iterator gsi; +- gphi* phi_s_cond_true_bb; +- gphi* phi_c_cond_true_bb; +- +- create_new_bb(cond_false_bb, after_bb, dominator_bb, outer); +- make_single_succ_edge(after_bb, cond_false_bb, EDGE_FALLTHRU); +- +- tree entry_s = get_current_def(origin_loop.base_s); +- phi_s_cond_true_bb = create_phi_node_for_bb(entry_s, cond_false_bb); +- tree entry_c = get_current_def(origin_loop.base_c); +- phi_c_cond_true_bb = create_phi_node_for_bb(entry_c, cond_false_bb); +- tree res_s = gimple_phi_result(phi_s_cond_true_bb); +- set_current_def(origin_loop.base_s, res_s); +- s_map.put(cond_false_bb, res_s); +- tree res_c = gimple_phi_result(phi_c_cond_true_bb); +- set_current_def(origin_loop.base_c, res_c); +- c_map.put(cond_false_bb, res_c); +- +- gsi = gsi_last_bb(cond_false_bb); +- tree lhs1 = gimple_build(&stmts, BIT_AND_EXPR, TREE_TYPE(origin_loop.base_n), +- origin_loop.base_n, build_int_cst (TREE_TYPE (origin_loop.base_n), 1)); +- gcond* cond_stmt = gimple_build_cond (NE_EXPR, lhs1, origin_loop.limit, ++ gphi *phi_s_cond_true_bb; ++ gphi *phi_c_cond_true_bb; ++ ++ create_new_bb (cond_false_bb, after_bb, dominator_bb, outer); ++ make_single_succ_edge (after_bb, cond_false_bb, EDGE_FALLTHRU); ++ ++ tree entry_s = get_current_def (origin_loop.base_s); ++ phi_s_cond_true_bb = create_phi_node_for_bb (entry_s, cond_false_bb); ++ tree entry_c = get_current_def (origin_loop.base_c); ++ phi_c_cond_true_bb = create_phi_node_for_bb (entry_c, cond_false_bb); ++ tree res_s = gimple_phi_result (phi_s_cond_true_bb); ++ set_current_def (origin_loop.base_s, res_s); ++ s_map.put (cond_false_bb, res_s); ++ tree res_c = gimple_phi_result (phi_c_cond_true_bb); ++ set_current_def (origin_loop.base_c, res_c); ++ c_map.put (cond_false_bb, res_c); ++ ++ gsi = gsi_last_bb (cond_false_bb); ++ tree lhs1 = gimple_build (&stmts, BIT_AND_EXPR, ++ TREE_TYPE (origin_loop.base_n), origin_loop.base_n, ++ build_int_cst (TREE_TYPE (origin_loop.base_n), 1)); ++ gcond *cond_stmt = gimple_build_cond (NE_EXPR, lhs1, origin_loop.limit, + NULL_TREE, NULL_TREE); + gimple_seq_add_stmt (&stmts, cond_stmt); + gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); +@@ -1020,53 +1051,54 @@ create_cond_false_bb(basic_block& cond_false_bb, basic_block after_bb, + c_36 = (long unsigned int) _42; + The IR of bb is as above. 
*/ + static void +-create_lastcond_true_bb(basic_block& new_bb, basic_block after_bb, +- basic_block dominator_bb, class loop *outer){ ++create_lastcond_true_bb (basic_block &new_bb, basic_block after_bb, ++ basic_block dominator_bb, class loop *outer) ++{ + gimple_seq stmts = NULL; + gimple_stmt_iterator gsi; +- gimple* g; ++ gimple *g; + +- create_new_bb(new_bb, after_bb, dominator_bb, outer); +- gsi = gsi_last_bb(new_bb); ++ create_new_bb (new_bb, after_bb, dominator_bb, outer); ++ gsi = gsi_last_bb (new_bb); + +- tree lhs1 = gimple_build(&stmts, NOP_EXPR, unsigned_type_node, +- get_current_def(origin_loop.base_c)); ++ tree lhs1 = gimple_build (&stmts, NOP_EXPR, unsigned_type_node, ++ get_current_def (origin_loop.base_c)); + tree lhs2; +- tree s_15 = get_current_def(origin_loop.base_s); ++ tree s_15 = get_current_def (origin_loop.base_s); + g = gimple_build_assign (make_ssa_name (unsigned_char_type_node), + fold_build2 (MEM_REF, unsigned_char_type_node, s_15, +- build_int_cst (TREE_TYPE(s_15), 0))); ++ build_int_cst (TREE_TYPE (s_15), 0))); + gimple_seq_add_stmt (&stmts, g); + lhs2 = gimple_assign_lhs (g); + + unsigned int code = AARCH64_BUILTIN_CRC32B; + tree lhs3; +- gimple* call_builtin = call_builtin_fun(code, lhs3, lhs1, lhs2); +- crc_map.put(new_bb,lhs3); +- gimple_seq_add_stmt(&stmts,call_builtin); ++ gimple *call_builtin = call_builtin_fun (code, lhs3, lhs1, lhs2); ++ crc_map.put (new_bb,lhs3); ++ gimple_seq_add_stmt (&stmts,call_builtin); + +- tree lhs4 = copy_ssa_name(origin_loop.base_c); +- g = gimple_build_assign(lhs4, NOP_EXPR, lhs3); +- gimple_seq_add_stmt(&stmts, g); +- c_map.put(new_bb, lhs4); ++ tree lhs4 = copy_ssa_name (origin_loop.base_c); ++ g = gimple_build_assign (lhs4, NOP_EXPR, lhs3); ++ gimple_seq_add_stmt (&stmts, g); ++ c_map.put (new_bb, lhs4); + + gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); + } + + static bool +-optional_add_phi_arg(gphi * phi, tree phi_res, tree phi_arg, edge e) ++optional_add_phi_arg (gphi * phi, tree phi_res, tree phi_arg, edge e) + { + location_t loc; + if (same_ssa_name_var_p (phi_arg, phi_res)) +- { +- if (virtual_operand_p (phi_arg)) +- loc = UNKNOWN_LOCATION; +- else +- loc = gimple_location (SSA_NAME_DEF_STMT (phi_arg)); +- add_phi_arg (phi, phi_arg, e, loc); ++ { ++ if (virtual_operand_p (phi_arg)) ++ loc = UNKNOWN_LOCATION; ++ else ++ loc = gimple_location (SSA_NAME_DEF_STMT (phi_arg)); ++ add_phi_arg (phi, phi_arg, e, loc); + +- return true; +- } ++ return true; ++ } + + return false; + } +@@ -1082,108 +1114,101 @@ update_phi_nodes (basic_block bb) + tree res; + + for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) +- { +- phi = gsi.phi (); +- res = gimple_phi_result (phi); +- +- FOR_EACH_EDGE (e, ei, bb->preds) + { +- if (PHI_ARG_DEF_FROM_EDGE (phi, e)) +- continue; +- tree var_c; +- tree* ptr_var_c = c_map.get (e->src); +- if(ptr_var_c == NULL) +- { +- var_c = origin_loop.base_c; +- } else { +- var_c = *ptr_var_c; +- } +- if(optional_add_phi_arg(phi, res, var_c, e)) +- continue; ++ phi = gsi.phi (); ++ res = gimple_phi_result (phi); + +- tree var_nn; +- tree* ptr_var_nn = nn_map.get (e->src); +- if(ptr_var_nn == NULL) ++ FOR_EACH_EDGE (e, ei, bb->preds) + { +- var_nn = nn_tree; +- } else { +- var_nn = *ptr_var_nn; ++ if (PHI_ARG_DEF_FROM_EDGE (phi, e)) ++ continue; ++ tree var_c; ++ tree *ptr_var_c = c_map.get (e->src); ++ if (ptr_var_c == NULL) ++ var_c = origin_loop.base_c; ++ else ++ var_c = *ptr_var_c; ++ if (optional_add_phi_arg (phi, res, var_c, e)) ++ continue; ++ ++ tree var_nn; ++ tree *ptr_var_nn 
= nn_map.get (e->src); ++ if (ptr_var_nn == NULL) ++ var_nn = nn_tree; ++ else ++ var_nn = *ptr_var_nn; ++ if (optional_add_phi_arg (phi, res, var_nn, e)) ++ continue; ++ ++ tree var_s; ++ tree *ptr_var_s = s_map.get (e->src); ++ if (ptr_var_s == NULL) ++ var_s = origin_loop.base_s; ++ else ++ var_s = *ptr_var_s; ++ if (optional_add_phi_arg (phi, res, var_s, e)) ++ continue; + } +- if(optional_add_phi_arg(phi, res, var_nn, e)) +- continue; +- +- tree var_s; +- tree* ptr_var_s = s_map.get (e->src); +- if(ptr_var_s == NULL) +- { +- var_s = origin_loop.base_s; +- } else { +- var_s = *ptr_var_s; +- } +- if(optional_add_phi_arg(phi, res, var_s, e)) +- continue; + } +- } + } + +-static void +-create_new_loops(edge entry_edge) ++static void ++create_new_loops (edge entry_edge) + { +- class loop* new_loop = NULL; ++ class loop *new_loop = NULL; + basic_block loop_bb, cond_bb, cond_true_bb, cond_false_bb, lastcond_true_bb; + class loop *outer = entry_edge->src->loop_father; +- change_preheader_bb(entry_edge); ++ change_preheader_bb (entry_edge); + +- create_loop_bb(loop_bb, entry_edge->src, entry_edge->src, outer, entry_edge); +- create_cond_bb(cond_bb, loop_bb, loop_bb, outer); +- make_edge(loop_bb, loop_bb, EDGE_TRUE_VALUE); +- make_edge(loop_bb, cond_bb, EDGE_FALSE_VALUE); +- update_phi_nodes(loop_bb); ++ create_loop_bb (loop_bb, entry_edge->src, entry_edge->src, outer, entry_edge); ++ create_cond_bb (cond_bb, loop_bb, loop_bb, outer); ++ make_edge (loop_bb, loop_bb, EDGE_TRUE_VALUE); ++ make_edge (loop_bb, cond_bb, EDGE_FALSE_VALUE); ++ update_phi_nodes (loop_bb); + + new_loop = alloc_loop (); + new_loop->header = loop_bb; + new_loop->latch = loop_bb; + add_loop (new_loop, outer); + if (dump_file && (dump_flags & TDF_DETAILS)) +- { +- fprintf (dump_file, "\nPrint byte new loop %d:\n", new_loop->num); +- flow_loop_dump (new_loop, dump_file, NULL, 1); +- fprintf (dump_file, "\n\n"); +- } ++ { ++ fprintf (dump_file, "\nPrint byte new loop %d:\n", new_loop->num); ++ flow_loop_dump (new_loop, dump_file, NULL, 1); ++ fprintf (dump_file, "\n\n"); ++ } + +- create_cond_true_bb(cond_true_bb, cond_bb, cond_bb, outer); +- make_edge(cond_bb, cond_true_bb, EDGE_TRUE_VALUE); +- create_cond_false_bb(cond_false_bb, cond_true_bb, cond_bb, outer); +- make_edge(cond_bb, cond_false_bb, EDGE_FALSE_VALUE); +- update_phi_nodes(cond_bb); +- update_phi_nodes(cond_false_bb); +- create_lastcond_true_bb(lastcond_true_bb, cond_false_bb, cond_false_bb, outer); +- make_edge(cond_false_bb, lastcond_true_bb, EDGE_TRUE_VALUE); +- make_edge(cond_false_bb, origin_loop.exit_bb, EDGE_FALSE_VALUE); +- make_single_succ_edge(lastcond_true_bb, origin_loop.exit_bb, EDGE_FALLTHRU); +- +- update_phi_nodes(origin_loop.exit_bb); +- remove_edge(origin_loop.exit_edge); ++ create_cond_true_bb (cond_true_bb, cond_bb, cond_bb, outer); ++ make_edge (cond_bb, cond_true_bb, EDGE_TRUE_VALUE); ++ create_cond_false_bb (cond_false_bb, cond_true_bb, cond_bb, outer); ++ make_edge (cond_bb, cond_false_bb, EDGE_FALSE_VALUE); ++ update_phi_nodes (cond_bb); ++ update_phi_nodes (cond_false_bb); ++ create_lastcond_true_bb (lastcond_true_bb, cond_false_bb, ++ cond_false_bb, outer); ++ make_edge (cond_false_bb, lastcond_true_bb, EDGE_TRUE_VALUE); ++ make_edge (cond_false_bb, origin_loop.exit_bb, EDGE_FALSE_VALUE); ++ make_single_succ_edge (lastcond_true_bb, origin_loop.exit_bb, EDGE_FALLTHRU); ++ ++ update_phi_nodes (origin_loop.exit_bb); ++ remove_edge (origin_loop.exit_edge); + } + + /* Clear information about the original loop. 
*/
+ static void
+-remove_origin_loop(class loop* loop)
++remove_origin_loop (class loop *loop)
+ {
+-  basic_block* body = get_loop_body_in_dom_order(loop);
++  basic_block *body = get_loop_body_in_dom_order (loop);
+   unsigned n = loop->num_nodes;
+-  for(int i = 0; i < n; ++i)
+-    {
+-      delete_basic_block(body[i]);
+-    }
+-  free(body);
+-  delete_loop(loop);
++  for (int i = 0; i < n; ++i)
++    delete_basic_block (body[i]);
++  free (body);
++  delete_loop (loop);
+ }
+ 
+ /* Make sure that the dominance relationship of the newly inserted cfg
+    is not missing. */
+ static void
+-update_loop_dominator(cdi_direction dir)
++update_loop_dominator (cdi_direction dir)
+ {
+   gcc_assert (dom_info_available_p (dir));
+ 
+@@ -1192,11 +1217,11 @@ update_loop_dominator(cdi_direction dir)
+     {
+       basic_block imm_bb = get_immediate_dominator (dir, bb);
+       if (!imm_bb || bb == origin_loop.exit_bb)
+-        {
+-          set_immediate_dominator (CDI_DOMINATORS, bb,
++        {
++          set_immediate_dominator (CDI_DOMINATORS, bb,
+                                    recompute_dominator (CDI_DOMINATORS, bb));
+-          continue;
+-        }
++          continue;
++        }
+     }
+ }
+ 
+@@ -1214,12 +1239,13 @@ convert_to_new_loop (class loop *loop)
+ static unsigned int
+ tree_ssa_loop_crc ()
+ {
+-  if(TARGET_CRC32 == false){
+-    warning (OPT____,"The loop-crc optimization is not working."\
+-             "You should make sure that the specified architecture supports"\
+-             " crc:-march=armv8.1-a");
+-    return 0;
+-  }
++  if (TARGET_CRC32 == false)
++    {
++      warning (OPT____, "The loop-crc optimization is not working. " \
++               "You should make sure that the specified architecture " \
++               "supports crc: -march=armv8.1-a");
++      return 0;
++    }
+   unsigned int todo = 0;
+   class loop *loop;
+ 
+@@ -1232,25 +1258,25 @@ tree_ssa_loop_crc ()
+   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
+     {
+       if (dump_file && (dump_flags & TDF_DETAILS))
+-        {
+-          fprintf (dump_file, "======================================\n");
+-          fprintf (dump_file, "Processing loop %d:\n", loop->num);
+-          fprintf (dump_file, "======================================\n");
+-          flow_loop_dump (loop, dump_file, NULL, 1);
+-          fprintf (dump_file, "\n\n");
+-        }
++        {
++          fprintf (dump_file, "======================================\n");
++          fprintf (dump_file, "Processing loop %d:\n", loop->num);
++          fprintf (dump_file, "======================================\n");
++          flow_loop_dump (loop, dump_file, NULL, 1);
++          fprintf (dump_file, "\n\n");
++        }
+ 
+       if (match_crc_loop (loop))
+-        {
+-          if (dump_file && (dump_flags & TDF_DETAILS))
+-            {
+-              fprintf (dump_file, "The %dth loop form is success matched,"
+-                       "and the loop can be optimized.\n",
+-                       loop->num);
+-            }
++        {
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            {
++              fprintf (dump_file, "The %dth loop form is successfully matched, "
++                       "and the loop can be optimized.\n",
++                       loop->num);
++            }
+ 
+-          convert_to_new_loop (loop);
+-        }
++          convert_to_new_loop (loop);
++        }
+     }
+ 
+   todo |= (TODO_update_ssa);
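For orientation between these hunks: the replacement control flow that convert_to_new_loop stitches out of the new basic blocks corresponds roughly to the C below, assuming the AArch64 CRC32 builtins named in the code. This is a sketch of intent only, not code from the patch; the pass builds the GIMPLE directly, derives the word counter nn from n >> 2 in change_preheader_bb, and emits MEM_REF loads rather than pointer casts.

  #include <stdint.h>

  /* create_loop_bb: a word-wide CRC loop driven by nn = n >> 2;
     create_cond_bb/create_cond_true_bb: one halfword step when n & 2;
     create_cond_false_bb/create_lastcond_true_bb: one byte step when n & 1.  */
  uint32_t
  crc32_by_word (uint32_t c, const uint8_t *s, uint32_t n)
  {
    for (uint32_t nn = n >> 2; nn != 0; nn--)
      {
        c = __builtin_aarch64_crc32w (c, *(const uint32_t *) s);
        s += 4;
      }
    if (n & 2)
      {
        c = __builtin_aarch64_crc32h (c, *(const uint16_t *) s);
        s += 2;
      }
    if (n & 1)
      c = __builtin_aarch64_crc32b (c, *s);
    return c;
  }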
+@@ -1260,7 +1286,6 @@ tree_ssa_loop_crc ()
+ /* Loop crc.  */
+ 
+ namespace {
+-
+ const pass_data pass_data_tree_loop_crc =
+ {
+   GIMPLE_PASS,
+@@ -1281,11 +1306,10 @@ public:
+   : gimple_opt_pass (pass_data_tree_loop_crc, ctxt)
+   {}
+ 
+-  /* opt_pass methods: */
++  /* Opt_pass methods: */
+   virtual bool gate (function *);
+   virtual unsigned int execute (function *);
+-
+-}; // class pass_loop_crc
++}; // Class pass_loop_crc
+ 
+ bool
+ pass_loop_crc::gate (function *)
+@@ -1304,15 +1328,15 @@ pass_loop_crc::execute (function *fun)
+       || POINTER_SIZE != 64 || TYPE_PRECISION (integer_type_node) != 32)
+     {
+       if (dump_file && (dump_flags & TDF_DETAILS))
+-        fprintf (dump_file, "The current data mode is not supported,"
+-                 "only the LP64 date mode is supported.\n");
++        fprintf (dump_file, "The current data mode is not supported, "
++                 "only the LP64 data mode is supported.\n");
+       return 0;
+     }
+ 
+   return tree_ssa_loop_crc ();
+ }
+ 
+-} // anon namespace
++} // Anon namespace
+ 
+ gimple_opt_pass *
+ make_pass_loop_crc (gcc::context *ctxt)
+-- 
+2.33.0
+
diff --git a/0155-Add-maxmin-and-uzp1-uzp2-combining.patch b/0155-Add-maxmin-and-uzp1-uzp2-combining.patch
new file mode 100644
index 0000000..3aca7e2
--- /dev/null
+++ b/0155-Add-maxmin-and-uzp1-uzp2-combining.patch
@@ -0,0 +1,477 @@
+From 1e886b98ff7ffdac023dcee8645717f2849d2eb7 Mon Sep 17 00:00:00 2001
+From: Diachkov Ilia WX1215920
+Date: Wed, 25 Oct 2023 18:12:28 +0300
+Subject: [PATCH 1/6] Add maxmin and uzp1/uzp2 combining
+
+---
+ gcc/config/aarch64/aarch64-simd.md    | 339 +++++++++++++++++++++++++-
+ gcc/config/aarch64/predicates.md      |  19 ++
+ gcc/testsuite/gcc.dg/combine-maxmin.c |  46 ++++
+ 3 files changed, 399 insertions(+), 5 deletions(-)
+ create mode 100755 gcc/testsuite/gcc.dg/combine-maxmin.c
+
+diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
+index 6049adc3f..7f707de57 100644
+--- a/gcc/config/aarch64/aarch64-simd.md
++++ b/gcc/config/aarch64/aarch64-simd.md
+@@ -1034,6 +1034,82 @@
+   [(set_attr "type" "neon_shift_imm")]
+ )
+ 
++;; Simplify the extension with following truncation for shift+neg operation.
++
++(define_insn_and_split "*aarch64_sshr_neg_v8hi"
++  [(set (match_operand:V8HI 0 "register_operand" "=w")
++	(vec_concat:V8HI
++	  (truncate:V4HI
++	    (ashiftrt:V4SI
++	      (neg:V4SI
++		(sign_extend:V4SI
++		  (vec_select:V4HI
++		    (match_operand:V8HI 1 "register_operand")
++		    (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
++	      (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
++	  (truncate:V4HI
++	    (ashiftrt:V4SI
++	      (neg:V4SI
++		(sign_extend:V4SI
++		  (vec_select:V4HI
++		    (match_dup 1)
++		    (match_operand:V8HI 4 "vect_par_cnst_hi_half"))))
++	      (match_dup 2)))))]
++  "TARGET_SIMD"
++  "#"
++  "&& true"
++  [(set (match_operand:V8HI 0 "register_operand" "=w")
++	(ashiftrt:V8HI
++	  (neg:V8HI
++	    (match_operand:V8HI 1 "register_operand" "w"))
++	  (match_operand:V8HI 2 "aarch64_simd_imm_minus_one")))]
++  {
++    /* Reduce the shift amount to smaller mode.  */
++    int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[2], 0))
++	      - (GET_MODE_UNIT_BITSIZE (GET_MODE (operands[2])) / 2);
++    operands[2] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
++  }
++  [(set_attr "type" "multiple")]
++)
++
++;; The helper definition that allows combiner to use the previous pattern.
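As context for this pair of patterns, the widen/negate/shift/narrow chain that *aarch64_sshr_neg_v8hi folds computes a per-lane sign mask. A scalar sketch of the intended per-16-bit-lane semantics (illustration only, not from the patch; the INT16_MIN corner behaves differently in the widened and narrow forms and is presumed excluded by the matched context):

  /* Widened form: sign-extend x to 32 bits, negate, arithmetic shift
     right by 31, truncate back to 16 bits -- an all-ones mask exactly
     when x > 0.  The split re-expresses this on the narrow lanes as
     sshr (neg (x), 15).  */
  static inline short
  positive_mask (short x)
  {
    return (short) ((-(int) x) >> 31);   /* -1 when x > 0, else 0.  */
  }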
++ ++(define_insn_and_split "*aarch64_sshr_neg_tmpv8hi" ++ [(set (match_operand:V8HI 0 "register_operand" "=w") ++ (vec_concat:V8HI ++ (truncate:V4HI ++ (ashiftrt:V4SI ++ (neg:V4SI ++ (match_operand:V4SI 1 "register_operand" "w")) ++ (match_operand:V4SI 2 "maxmin_arith_shift_operand"))) ++ (truncate:V4HI ++ (ashiftrt:V4SI ++ (neg:V4SI ++ (match_operand:V4SI 3 "register_operand" "w")) ++ (match_dup 2)))))] ++ "TARGET_SIMD" ++ "#" ++ "&& true" ++ [(set (match_operand:V4SI 1 "register_operand" "=w") ++ (ashiftrt:V4SI ++ (neg:V4SI ++ (match_dup 1)) ++ (match_operand:V4SI 2 "maxmin_arith_shift_operand"))) ++ (set (match_operand:V4SI 3 "register_operand" "=w") ++ (ashiftrt:V4SI ++ (neg:V4SI ++ (match_dup 3)) ++ (match_dup 2))) ++ (set (match_operand:V8HI 0 "register_operand" "=w") ++ (vec_concat:V8HI ++ (truncate:V4HI ++ (match_dup 1)) ++ (truncate:V4HI ++ (match_dup 3))))] ++ "" ++ [(set_attr "type" "multiple")] ++) ++ + (define_insn "*aarch64_simd_sra" + [(set (match_operand:VDQ_I 0 "register_operand" "=w") + (plus:VDQ_I +@@ -1459,6 +1535,78 @@ + [(set_attr "type" "neon_minmax")] + ) + ++;; Use sequential smax+smin to replace vector arithmetic operations like this: ++;; a = ((x & ~((1 << 8)-1)) ? (-x)>>31 & ((1 << 8)-1) : x); ++;; TODO: maybe extend to scalar operations. ++ ++(define_insn_and_split "*aarch64_maxmin_arith" ++ [(set (match_operand:VDQHSD 0 "register_operand" "=w") ++ (xor:VDQHSD ++ (and:VDQHSD ++ (xor:VDQHSD ++ (ashiftrt:VDQHSD ++ (neg:VDQHSD ++ (match_operand:VDQHSD 1 "register_operand")) ++ (match_operand:VDQHSD 2 "maxmin_arith_shift_operand")) ++ (match_dup 1)) ++ (neg:VDQHSD ++ (eq:VDQHSD ++ (and:VDQHSD ++ (match_dup 1) ++ (match_operand:VDQHSD 3 "aarch64_bic_imm_for_maxmin")) ++ (match_operand:VDQHSD 4 "aarch64_simd_or_scalar_imm_zero")))) ++ (ashiftrt:VDQHSD ++ (neg:VDQHSD ++ (match_dup 1)) ++ (match_dup 2))))] ++ "TARGET_SIMD && !reload_completed" ++ "#" ++ "&& true" ++ [(set (match_operand:VDQHSD 5 "register_operand" "w") (match_dup 3)) ++ (set (match_operand:VDQHSD 6 "register_operand" "w") (match_dup 4)) ++ (set (match_operand:VDQHSD 0 "register_operand" "=w") ++ (smax:VDQHSD (match_operand:VDQHSD 1 "register_operand" "w") ++ (match_operand:VDQHSD 6 "register_operand" "w"))) ++ (set (match_operand:VDQHSD 0 "register_operand" "=w") ++ (smin:VDQHSD (match_operand:VDQHSD 0 "register_operand" "w") ++ (match_operand:VDQHSD 5 "register_operand" "w")))] ++ { ++ if (can_create_pseudo_p ()) ++ { ++ int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[3], 0)); ++ operands[3] = aarch64_simd_gen_const_vector_dup (mode, ++ ~val); ++ operands[5] = gen_reg_rtx (mode); ++ operands[6] = gen_reg_rtx (mode); ++ } ++ else ++ FAIL; ++ } ++ [(set_attr "type" "neon_minmax")] ++) ++ ++;; The helper definition that allows combiner to use the previous pattern. ++ ++(define_insn_and_split "*aarch64_maxmin_tmp" ++ [(set (match_operand:VDQHSD 0 "register_operand" "=w") ++ (ashiftrt:VDQHSD ++ (neg:VDQHSD ++ (match_operand:VDQHSD 1 "register_operand" "w")) ++ (match_operand:VDQHSD 2 "maxmin_arith_shift_operand")))] ++ "TARGET_SIMD" ++ "#" ++ "&& reload_completed" ++ [(set (match_operand:VDQHSD 0 "register_operand") ++ (neg:VDQHSD ++ (match_operand:VDQHSD 1 "register_operand" "w"))) ++ (set (match_dup 0) ++ (ashiftrt:VDQHSD ++ (match_dup 0) ++ (match_operand:VDQHSD 2 "maxmin_arith_shift_operand")))] ++ "" ++ [(set_attr "type" "neon_minmax")] ++) ++ + ;; Pairwise FP Max/Min operations. 
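To see what *aarch64_maxmin_arith above buys, compare the two scalar forms of the byte-clip idiom it rewrites. For the lane values produced by ordinary pixel arithmetic the two agree; the usual caveat is the extreme INT_MIN input, where the negation wraps (an editorial sketch, not code from the patch):

  /* The branchy saturation idiom matched by the pattern...  */
  static inline int
  clip_branchy (int x)
  {
    return (x & ~255) ? ((-x) >> 31) & 255 : x;
  }

  /* ...and the straight-line form it becomes: one smax and one smin.  */
  static inline int
  clip_maxmin (int x)
  {
    int t = x > 0 ? x : 0;      /* smax (x, #0)    */
    return t < 255 ? t : 255;   /* smin (t, #255)  */
  }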
+ (define_insn "aarch64_p" + [(set (match_operand:VHSDF 0 "register_operand" "=w") +@@ -1599,7 +1747,8 @@ + DONE; + }) + +-;; For quads. ++;; For quads. Use UZP1 on the narrower type, which discards the high part of ++;; each wide element. + + (define_insn "vec_pack_trunc_" + [(set (match_operand: 0 "register_operand" "=&w") +@@ -1609,12 +1758,32 @@ + "TARGET_SIMD" + { + if (BYTES_BIG_ENDIAN) +- return "xtn\\t%0., %2.\;xtn2\\t%0., %1."; ++ return "uzp1\\t%0., %2., %1."; + else +- return "xtn\\t%0., %1.\;xtn2\\t%0., %2."; ++ return "uzp1\\t%0., %1., %2."; + } +- [(set_attr "type" "multiple") +- (set_attr "length" "8")] ++ [(set_attr "type" "neon_permute") ++ (set_attr "length" "4")] ++) ++ ++(define_insn "vec_pack_trunc_shifted_" ++ [(set (match_operand: 0 "register_operand" "=&w") ++ (vec_concat: ++ (truncate: ++ (ashiftrt:VQN (match_operand:VQN 1 "register_operand" "w") ++ (match_operand:VQN 2 "half_size_operand" "w"))) ++ (truncate: ++ (ashiftrt:VQN (match_operand:VQN 3 "register_operand" "w") ++ (match_operand:VQN 4 "half_size_operand" "w")))))] ++ "TARGET_SIMD" ++ { ++ if (BYTES_BIG_ENDIAN) ++ return "uzp2\\t%0., %3., %1."; ++ else ++ return "uzp2\\t%0., %1., %3."; ++ } ++ [(set_attr "type" "neon_permute") ++ (set_attr "length" "4")] + ) + + ;; Widening operations. +@@ -4852,6 +5021,166 @@ + [(set_attr "type" "neon_tst")] + ) + ++;; Simplify the extension with following truncation for cmtst-like operation. ++ ++(define_insn_and_split "*aarch64_cmtst_arith_v8hi" ++ [(set (match_operand:V8HI 0 "register_operand" "=w") ++ (vec_concat:V8HI ++ (plus:V4HI ++ (truncate:V4HI ++ (eq:V4SI ++ (sign_extend:V4SI ++ (vec_select:V4HI ++ (and:V8HI ++ (match_operand:V8HI 1 "register_operand") ++ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")) ++ (match_operand:V8HI 3 "vect_par_cnst_lo_half"))) ++ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))) ++ (match_operand:V4HI 5 "aarch64_simd_imm_minus_one")) ++ (plus:V4HI ++ (truncate:V4HI ++ (eq:V4SI ++ (sign_extend:V4SI ++ (vec_select:V4HI ++ (and:V8HI ++ (match_dup 1) ++ (match_dup 2)) ++ (match_operand:V8HI 6 "vect_par_cnst_hi_half"))) ++ (match_dup 4))) ++ (match_dup 5))))] ++ "TARGET_SIMD && !reload_completed" ++ "#" ++ "&& true" ++ [(set (match_operand:V8HI 6 "register_operand" "=w") ++ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")) ++ (set (match_operand:V8HI 0 "register_operand" "=w") ++ (plus:V8HI ++ (eq:V8HI ++ (and:V8HI ++ (match_operand:V8HI 1 "register_operand" "w") ++ (match_dup 6)) ++ (match_operand:V8HI 4 "aarch64_simd_imm_zero")) ++ (match_operand:V8HI 5 "aarch64_simd_imm_minus_one")))] ++ { ++ if (can_create_pseudo_p ()) ++ { ++ int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[4], 0)); ++ operands[4] = aarch64_simd_gen_const_vector_dup (V8HImode, val); ++ int val2 = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[5], 0)); ++ operands[5] = aarch64_simd_gen_const_vector_dup (V8HImode, val2); ++ ++ operands[6] = gen_reg_rtx (V8HImode); ++ } ++ else ++ FAIL; ++ } ++ [(set_attr "type" "neon_tst_q")] ++) ++ ++;; Three helper definitions that allow combiner to use the previous pattern. 
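Returning to the vec_pack_trunc change above: on little-endian, truncating each 32-bit lane of two V4SI inputs is the same as keeping the even-indexed 16-bit halves of both vectors, which is exactly one UZP1 (big-endian swaps the operands, as the output templates show); narrowing after a 16-bit right shift keeps the odd-indexed halves, i.e. UZP2. A scalar sketch of the lane semantics (illustration only):

  #include <stdint.h>

  /* vec_pack_trunc: the old xtn + xtn2 pair replaced by a single uzp1.  */
  void
  pack_trunc (const uint32_t a[4], const uint32_t b[4], uint16_t out[8])
  {
    for (int i = 0; i < 4; i++)
      {
        out[i] = (uint16_t) a[i];        /* even (low) halves of a */
        out[i + 4] = (uint16_t) b[i];    /* even (low) halves of b */
      }
  }

  /* vec_pack_trunc_shifted: narrowing after >> 16, i.e. uzp2.  */
  void
  pack_trunc_shifted (const uint32_t a[4], const uint32_t b[4], uint16_t out[8])
  {
    for (int i = 0; i < 4; i++)
      {
        out[i] = (uint16_t) (a[i] >> 16);      /* odd (high) halves */
        out[i + 4] = (uint16_t) (b[i] >> 16);
      }
  }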
++ ++(define_insn_and_split "*aarch64_cmtst_arith_tmp_lo_v8hi" ++ [(set (match_operand:V4SI 0 "register_operand" "=w") ++ (neg:V4SI ++ (eq:V4SI ++ (sign_extend:V4SI ++ (vec_select:V4HI ++ (and:V8HI ++ (match_operand:V8HI 1 "register_operand") ++ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")) ++ (match_operand:V8HI 3 "vect_par_cnst_lo_half"))) ++ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))] ++ "TARGET_SIMD && !reload_completed" ++ "#" ++ "&& true" ++ [(set (match_operand:V8HI 5 "register_operand" "=w") ++ (and:V8HI ++ (match_operand:V8HI 1 "register_operand") ++ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))) ++ (set (match_operand:V4SI 0 "register_operand" "=w") ++ (sign_extend:V4SI ++ (vec_select:V4HI ++ (match_dup 5) ++ (match_operand:V8HI 3 "vect_par_cnst_lo_half")))) ++ (set (match_dup 0) ++ (neg:V4SI ++ (eq:V4SI ++ (match_dup 0) ++ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))] ++ { ++ if (can_create_pseudo_p ()) ++ operands[5] = gen_reg_rtx (V8HImode); ++ else ++ FAIL; ++ } ++ [(set_attr "type" "multiple")] ++) ++ ++(define_insn_and_split "*aarch64_cmtst_arith_tmp_hi_v8hi" ++ [(set (match_operand:V4SI 0 "register_operand" "=w") ++ (neg:V4SI ++ (eq:V4SI ++ (sign_extend:V4SI ++ (vec_select:V4HI ++ (and:V8HI ++ (match_operand:V8HI 1 "register_operand") ++ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")) ++ (match_operand:V8HI 3 "vect_par_cnst_hi_half"))) ++ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))] ++ "TARGET_SIMD && !reload_completed" ++ "#" ++ "&& true" ++ [(set (match_operand:V8HI 5 "register_operand" "=w") ++ (and:V8HI ++ (match_operand:V8HI 1 "register_operand") ++ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))) ++ (set (match_operand:V4SI 0 "register_operand" "=w") ++ (sign_extend:V4SI ++ (vec_select:V4HI ++ (match_dup 5) ++ (match_operand:V8HI 3 "vect_par_cnst_hi_half")))) ++ (set (match_dup 0) ++ (neg:V4SI ++ (eq:V4SI ++ (match_dup 0) ++ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))] ++ { ++ if (can_create_pseudo_p ()) ++ operands[5] = gen_reg_rtx (V8HImode); ++ else ++ FAIL; ++ } ++ [(set_attr "type" "multiple")] ++) ++ ++(define_insn_and_split "*aarch64_cmtst_arith_tmpv8hi" ++ [(set (match_operand:V8HI 0 "register_operand" "=w") ++ (vec_concat:V8HI ++ (truncate:V4HI ++ (not:V4SI ++ (match_operand:V4SI 1 "register_operand" "w"))) ++ (truncate:V4HI ++ (not:V4SI ++ (match_operand:V4SI 2 "register_operand" "w")))))] ++ "TARGET_SIMD" ++ "#" ++ "&& true" ++ [(set (match_operand:V4SI 1 "register_operand" "=w") ++ (not:V4SI ++ (match_dup 1))) ++ (set (match_operand:V4SI 2 "register_operand" "=w") ++ (not:V4SI ++ (match_dup 2))) ++ (set (match_operand:V8HI 0 "register_operand" "=w") ++ (vec_concat:V8HI ++ (truncate:V4HI ++ (match_dup 1)) ++ (truncate:V4HI ++ (match_dup 2))))] ++ "" ++ [(set_attr "type" "multiple")] ++) ++ + (define_insn_and_split "aarch64_cmtstdi" + [(set (match_operand:DI 0 "register_operand" "=w,r") + (neg:DI +diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md +index 1754b1eff..3cd83334b 100644 +--- a/gcc/config/aarch64/predicates.md ++++ b/gcc/config/aarch64/predicates.md +@@ -91,6 +91,25 @@ + (match_test "aarch64_simd_valid_immediate (op, NULL, + AARCH64_CHECK_ORR)")))) + ++(define_predicate "aarch64_bic_imm_for_maxmin" ++ (match_code "const_vector") ++{ ++ if (!aarch64_simd_valid_immediate (op, NULL, AARCH64_CHECK_BIC)) ++ return false; ++ op = unwrap_const_vec_duplicate (op); ++ unsigned int size = GET_MODE_UNIT_BITSIZE (mode); ++ return CONST_INT_P 
(op) ++ && ((~UINTVAL (op)) < (((long unsigned int) 1 << size) - 1)); ++}) ++ ++(define_predicate "maxmin_arith_shift_operand" ++ (match_code "const_vector") ++{ ++ op = unwrap_const_vec_duplicate (op); ++ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) - 1; ++ return CONST_INT_P (op) && (UINTVAL (op) == size); ++}) ++ + (define_predicate "aarch64_reg_or_bic_imm" + (ior (match_operand 0 "register_operand") + (and (match_code "const_vector") +diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c +new file mode 100755 +index 000000000..06bce7029 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/combine-maxmin.c +@@ -0,0 +1,46 @@ ++/* { dg-do compile { target aarch64-*-* } } */ ++/* { dg-options "-O3 -fdump-rtl-combine-all" } */ ++ ++/* The test checks usage of smax/smin insns for clip evaluation and ++ * uzp1/uzp2 insns for vector element narrowing. It's inspired by ++ * sources of x264 codec. */ ++ ++typedef unsigned char uint8_t; ++typedef long int intptr_t; ++typedef signed short int int16_t; ++ ++static __attribute__((always_inline)) inline uint8_t clip (int x ) ++{ ++ return ( (x & ~((1 << 8)-1)) ? (-x)>>31 & ((1 << 8)-1) : x ); ++} ++ ++void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, ++ intptr_t stride, int width, int height, int16_t *buf) ++{ ++ const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0; ++ for( int y = 0; y < height; y++ ) { ++ for( int x = -2; x < width+3; x++ ) { ++ int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride] ++ + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride])); ++ dstv[x] = clip ( (v + 16) >> 5 ); ++ buf[x+2] = v + pad; ++ } ++ for( int x = 0; x < width; x++ ) ++ dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1] ++ + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1])) ++ - 32*pad + 512) >> 10); ++ for( int x = 0; x < width; x++ ) ++ dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1] ++ + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1])) ++ + 16) >> 5); ++ dsth += stride; ++ dstv += stride; ++ dstc += stride; ++ src += stride; ++ } ++} ++ ++/* { dg-final { scan-assembler-times {smax\t} 4 } } */ ++/* { dg-final { scan-assembler-times {smin\t} 4 } } */ ++/* { dg-final { scan-assembler-times {cmtst\t} 2 } } */ ++/* { dg-final { scan-assembler-times {uzp1\t} 6 } } */ +-- +2.33.0 + diff --git a/0156-add-icp-optimization.patch b/0156-add-icp-optimization.patch new file mode 100644 index 0000000..efdbcb0 --- /dev/null +++ b/0156-add-icp-optimization.patch @@ -0,0 +1,2387 @@ +From f1c357fc742e94b3d4a7123253db64d7ebac304f Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia WX1215920 +Date: Mon, 12 Jun 2023 01:43:24 +0300 +Subject: [PATCH 3/6] add icp optimization + +--- + gcc/common.opt | 8 + + gcc/dbgcnt.def | 1 + + gcc/ipa-devirt.c | 1855 +++++++++++++++++++++++++++++++++++ + gcc/passes.def | 1 + + gcc/testsuite/gcc.dg/icp1.c | 40 + + gcc/testsuite/gcc.dg/icp2.c | 38 + + gcc/testsuite/gcc.dg/icp3.c | 52 + + gcc/testsuite/gcc.dg/icp4.c | 55 ++ + gcc/testsuite/gcc.dg/icp5.c | 66 ++ + gcc/testsuite/gcc.dg/icp6.c | 66 ++ + gcc/testsuite/gcc.dg/icp7.c | 48 + + gcc/timevar.def | 1 + + gcc/tree-pass.h | 1 + + 13 files changed, 2232 insertions(+) + create mode 100644 gcc/testsuite/gcc.dg/icp1.c + create mode 100644 gcc/testsuite/gcc.dg/icp2.c + create mode 100644 gcc/testsuite/gcc.dg/icp3.c + create mode 100644 gcc/testsuite/gcc.dg/icp4.c + create mode 100644 gcc/testsuite/gcc.dg/icp5.c + create mode 100644 gcc/testsuite/gcc.dg/icp6.c + create mode 100644 gcc/testsuite/gcc.dg/icp7.c + +diff 
--git a/gcc/common.opt b/gcc/common.opt +index 6f0ed7cea..5a9e9c479 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1270,6 +1270,14 @@ fdevirtualize + Common Report Var(flag_devirtualize) Optimization + Try to convert virtual calls to direct ones. + ++ficp ++Common Report Var(flag_icp) Optimization Init(0) ++Try to promote indirect calls to direct ones. ++ ++ficp-speculatively ++Common Report Var(flag_icp_speculatively) Optimization ++Promote indirect calls speculatively. ++ + fdiagnostics-show-location= + Common Joined RejectNegative Enum(diagnostic_prefixing_rule) + -fdiagnostics-show-location=[once|every-line] How often to emit source location at the beginning of line-wrapped diagnostics. +diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def +index 232b19289..9d38a094b 100644 +--- a/gcc/dbgcnt.def ++++ b/gcc/dbgcnt.def +@@ -166,6 +166,7 @@ DEBUG_COUNTER (graphite_scop) + DEBUG_COUNTER (hoist) + DEBUG_COUNTER (hoist_insn) + DEBUG_COUNTER (ia64_sched2) ++DEBUG_COUNTER (icp) + DEBUG_COUNTER (if_after_combine) + DEBUG_COUNTER (if_after_reload) + DEBUG_COUNTER (if_conversion) +diff --git a/gcc/ipa-devirt.c b/gcc/ipa-devirt.c +index 3ab704973..79466d91d 100644 +--- a/gcc/ipa-devirt.c ++++ b/gcc/ipa-devirt.c +@@ -103,9 +103,14 @@ along with GCC; see the file COPYING3. If not see + indirect polymorphic edge all possible polymorphic call targets of the call. + + pass_ipa_devirt performs simple speculative devirtualization. ++ pass_ipa_icp performs simple indirect call promotion. + */ + + #include "config.h" ++#define INCLUDE_ALGORITHM ++#define INCLUDE_SET ++#define INCLUDE_MAP ++#define INCLUDE_LIST + #include "system.h" + #include "coretypes.h" + #include "backend.h" +@@ -127,6 +132,7 @@ along with GCC; see the file COPYING3. If not see + #include "ipa-fnsummary.h" + #include "demangle.h" + #include "dbgcnt.h" ++#include "gimple-iterator.h" + #include "gimple-pretty-print.h" + #include "intl.h" + #include "stringpool.h" +@@ -4369,5 +4375,1854 @@ make_pass_ipa_odr (gcc::context *ctxt) + return new pass_ipa_odr (ctxt); + } + ++/* Function signature map used to look up function decl which corresponds to ++ the given function type. */ ++typedef std::set type_set; ++typedef std::set decl_set; ++typedef std::map type_alias_map; ++typedef std::map type_decl_map; ++typedef std::map uid_to_type_map; ++typedef std::map type_map; ++ ++static bool has_address_taken_functions_with_varargs = false; ++static type_set *unsafe_types = NULL; ++static type_alias_map *fta_map = NULL; ++static type_alias_map *ta_map = NULL; ++static type_map *ctype_map = NULL; ++static type_alias_map *cbase_to_ptype = NULL; ++static type_decl_map *fs_map = NULL; ++static uid_to_type_map *type_uid_map = NULL; ++ ++static void ++print_type_set(unsigned ftype_uid, type_alias_map *map) ++{ ++ if (!map->count (ftype_uid)) ++ return; ++ type_set* s = (*map)[ftype_uid]; ++ for (type_set::const_iterator it = s->begin (); it != s->end (); it++) ++ fprintf (dump_file, it == s->begin () ? "%d" : ", %d", *it); ++} ++ ++static void ++dump_type_with_uid (const char *msg, tree type, dump_flags_t flags = TDF_NONE) ++{ ++ fprintf (dump_file, msg); ++ print_generic_expr (dump_file, type, flags); ++ fprintf (dump_file, " (%d)\n", TYPE_UID (type)); ++} ++ ++/* Walk aggregate type and collect types of scalar elements. */ ++ ++static void ++collect_scalar_types (tree tp, std::list &types) ++{ ++ /* TODO: take into account different field offsets. ++ Also support array casts. 
*/ ++ if (tp && dump_file && (dump_flags & TDF_DETAILS)) ++ dump_type_with_uid ("Walk var's type: ", tp, TDF_UID); ++ if (RECORD_OR_UNION_TYPE_P (tp)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Record's fields {\n"); ++ for (tree field = TYPE_FIELDS (tp); field; ++ field = DECL_CHAIN (field)) ++ { ++ if (TREE_CODE (field) != FIELD_DECL) ++ continue; ++ collect_scalar_types (TREE_TYPE (field), types); ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "}\n"); ++ return; ++ } ++ if (TREE_CODE (tp) == ARRAY_TYPE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Array's innermost type:\n"); ++ /* Take the innermost component type. */ ++ tree elt; ++ for (elt = TREE_TYPE (tp); TREE_CODE (elt) == ARRAY_TYPE; ++ elt = TREE_TYPE (elt)) ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ print_generic_expr (dump_file, elt); ++ collect_scalar_types (elt, types); ++ return; ++ } ++ types.push_back (tp); ++} ++ ++static void maybe_register_aliases (tree type1, tree type2); ++ ++/* Walk type lists and maybe register type aliases. */ ++ ++static void ++compare_type_lists (std::list tlist1, std::list tlist2) ++{ ++ for (std::list::iterator ti1 = tlist1.begin (), ti2 = tlist2.begin (); ++ ti1 != tlist1.end (); ++ti1, ++ti2) ++ { ++ /* TODO: correct the analysis results if lists have different length. */ ++ if (ti2 == tlist2.end ()) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Type lists with different length!\n"); ++ break; ++ } ++ maybe_register_aliases (*ti1, *ti2); ++ } ++} ++ ++/* For two given types collect scalar element types and ++ compare the result lists to find type aliases. */ ++ ++static void ++collect_scalar_types_and_find_aliases (tree t1, tree t2) ++{ ++ std::list tlist1; ++ std::list tlist2; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "First type list: "); ++ collect_scalar_types (t1, tlist1); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Second type list: "); ++ collect_scalar_types (t2, tlist2); ++ compare_type_lists (tlist1, tlist2); ++} ++ ++/* Dump type with the corresponding set from the map. */ ++ ++static void ++dump_type_uid_with_set (const char *msg, tree type, type_alias_map *map, ++ bool dump_type = true, bool with_newline = true) ++{ ++ fprintf (dump_file, msg, TYPE_UID (type)); ++ if (dump_type) ++ print_generic_expr (dump_file, type); ++ fprintf (dump_file, " ("); ++ print_type_set (TYPE_UID (type), map); ++ fprintf (dump_file, ")"); ++ fprintf (dump_file, with_newline ? "\n" : " "); ++} ++ ++static void ++dump_two_types_uids_with_set (const char *msg, unsigned t1_uid, ++ unsigned t2_uid, type_alias_map *map) ++{ ++ fprintf (dump_file, msg, t1_uid, t2_uid); ++ fprintf (dump_file, " ("); ++ print_type_set (t1_uid, map); ++ fprintf (dump_file, ")\n"); ++} ++ ++/* Register type aliases in the map. Return true if new alias ++ is registered. */ ++ ++static bool ++register_ailas_type (tree type, tree alias_type, type_alias_map *map, ++ bool only_merge = false) ++{ ++ /* TODO: maybe support the case with one missed type. 
*/ ++ if (!type || !alias_type) ++ return false; ++ unsigned type_uid = TYPE_UID (type); ++ unsigned alias_type_uid = TYPE_UID (alias_type); ++ if (type_uid_map->count (type_uid) == 0) ++ (*type_uid_map)[type_uid] = type; ++ if (type_uid_map->count (alias_type_uid) == 0) ++ (*type_uid_map)[alias_type_uid] = alias_type; ++ ++ if (map->count (type_uid) == 0 && map->count (alias_type_uid) == 0) ++ { ++ (*map)[type_uid] = new type_set (); ++ (*map)[alias_type_uid] = (*map)[type_uid]; ++ } ++ else if (map->count (type_uid) == 0) ++ (*map)[type_uid] = (*map)[alias_type_uid]; ++ else if (map->count (alias_type_uid) == 0) ++ (*map)[alias_type_uid] = (*map)[type_uid]; ++ else if (map->count (type_uid) && map->count (alias_type_uid)) ++ { ++ if ((*map)[type_uid] == (*map)[alias_type_uid]) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_two_types_uids_with_set ("Types (%d) and (%d) are already in", ++ type_uid, alias_type_uid, map); ++ return false; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ dump_type_uid_with_set ("T1 (%d) in set", type, map, false, true); ++ dump_type_uid_with_set ("T2 (%d) in set", alias_type, map, ++ false, true); ++ } ++ (*map)[type_uid]->insert ((*map)[alias_type_uid]->begin (), ++ (*map)[alias_type_uid]->end ()); ++ type_set *type_set = (*map)[alias_type_uid]; ++ for (type_set::const_iterator it1 = type_set->begin (); ++ it1 != type_set->end (); ++it1) ++ (*map)[*it1] = (*map)[type_uid]; ++ delete type_set; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "MERGE: "); ++ } ++ if (!only_merge) ++ { ++ (*map)[type_uid]->insert (alias_type_uid); ++ (*map)[type_uid]->insert (type_uid); ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_two_types_uids_with_set ("Insert types (%d) and (%d) into set", ++ type_uid, alias_type_uid, map); ++ return true; ++} ++ ++static void ++dump_two_types_with_uids (const char *msg, tree t1, tree t2) ++{ ++ fprintf (dump_file, msg); ++ print_generic_expr (dump_file, t1, TDF_UID); ++ fprintf (dump_file, " (%d), ", TYPE_UID (t1)); ++ print_generic_expr (dump_file, t2, TDF_UID); ++ fprintf (dump_file, " (%d)\n", TYPE_UID (t2)); ++} ++ ++static void ++analyze_pointees (tree type1, tree type2) ++{ ++ gcc_assert (POINTER_TYPE_P (type1) && POINTER_TYPE_P (type2)); ++ tree base1 = TREE_TYPE (type1); ++ tree base2 = TREE_TYPE (type2); ++ /* TODO: maybe analyze void pointers. */ ++ if (VOID_TYPE_P(base1) || VOID_TYPE_P(base2)) ++ return; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_two_types_with_uids ("Walk pointee types: ", base1, base2); ++ collect_scalar_types_and_find_aliases (base1, base2); ++} ++ ++static void ++map_canonical_base_to_pointer (tree type, tree to_insert) ++{ ++ type = TYPE_MAIN_VARIANT (type); ++ tree base_type = TREE_TYPE (type); ++ tree cbase_type = TYPE_CANONICAL (base_type); ++ if (!cbase_type) ++ return; ++ unsigned cbase_type_uid = TYPE_UID (cbase_type); ++ if (type_uid_map->count (cbase_type_uid) == 0) ++ (*type_uid_map)[cbase_type_uid] = cbase_type; ++ ++ if (cbase_to_ptype->count (cbase_type_uid) == 0) ++ { ++ (*cbase_to_ptype)[cbase_type_uid] = new type_set (); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "New map cb-to-p=(%d): ", cbase_type_uid); ++ } ++ else if (!(*cbase_to_ptype)[cbase_type_uid]->count (TYPE_UID (to_insert))) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Found map cb-to-p=(%d): ", cbase_type_uid); ++ } ++ else ++ return; ++ /* Add all variants of 'to_insert' type. 
*/ ++ for (tree t = to_insert; t; t = TYPE_NEXT_VARIANT (t)) ++ { ++ unsigned t_uid = TYPE_UID (t); ++ if (!(*cbase_to_ptype)[cbase_type_uid]->count (t_uid)) ++ { ++ (*cbase_to_ptype)[cbase_type_uid]->insert (t_uid); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "(%d) ", t_uid); ++ } ++ if (type_uid_map->count (t_uid) == 0) ++ (*type_uid_map)[t_uid] = t; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\n"); ++} ++ ++/* Analyse two types and maybe register them as aliases. Also collect ++ unsafe function types and map canonical base types to corresponding ++ pointer types. */ ++ ++static void ++maybe_register_aliases (tree type1, tree type2) ++{ ++ if (type1 && POINTER_TYPE_P (type1) && !FUNCTION_POINTER_TYPE_P (type1)) ++ map_canonical_base_to_pointer (type1, type1); ++ if (type2 && POINTER_TYPE_P (type2) && !FUNCTION_POINTER_TYPE_P (type2)) ++ map_canonical_base_to_pointer (type2, type2); ++ ++ if (type1 == type2 || !type1 || !type2) ++ return; ++ ++ if (POINTER_TYPE_P (type1) && POINTER_TYPE_P (type2)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_two_types_with_uids ("Pointer types: ", type1, type2); ++ if (register_ailas_type (type1, type2, ta_map)) ++ analyze_pointees (type1, type2); ++ } ++ /* If function and non-function type pointers alias, ++ the function type is unsafe. */ ++ if (FUNCTION_POINTER_TYPE_P (type1) && !FUNCTION_POINTER_TYPE_P (type2)) ++ unsafe_types->insert (TYPE_UID (type1)); ++ if (FUNCTION_POINTER_TYPE_P (type2) && !FUNCTION_POINTER_TYPE_P (type1)) ++ unsafe_types->insert (TYPE_UID (type2)); ++ ++ /* Try to figure out with pointers to incomplete types. */ ++ if (POINTER_TYPE_P (type1) && POINTER_TYPE_P (type2)) ++ { ++ type1 = TYPE_MAIN_VARIANT (type1); ++ type2 = TYPE_MAIN_VARIANT (type2); ++ tree base1 = TREE_TYPE (type1); ++ tree base2 = TREE_TYPE (type2); ++ if (RECORD_OR_UNION_TYPE_P (base1) && RECORD_OR_UNION_TYPE_P (base2)) ++ { ++ tree cb1 = TYPE_CANONICAL (base1); ++ tree cb2 = TYPE_CANONICAL (base2); ++ if (cb1 && !cb2) ++ map_canonical_base_to_pointer (type1, type2); ++ if (cb2 && !cb1) ++ map_canonical_base_to_pointer (type2, type1); ++ } ++ } ++} ++ ++/* Maybe register non-void/equal type aliases. */ ++ ++static void ++maybe_register_non_void_aliases (tree t1, tree t2) ++{ ++ gcc_assert (t1 && t2); ++ if (type_uid_map->count (TYPE_UID (t1)) == 0) ++ (*type_uid_map)[TYPE_UID (t1)] = t1; ++ if (type_uid_map->count (TYPE_UID (t2)) == 0) ++ (*type_uid_map)[TYPE_UID (t2)] = t2; ++ ++ /* Skip equal and void types. */ ++ if (t1 == t2 || VOID_TYPE_P (t1) || VOID_TYPE_P (t2)) ++ return; ++ maybe_register_aliases (t1, t2); ++} ++ ++/* Detect function type in call stmt. 
*/
++
++static tree
++get_call_fntype (gcall *stmt)
++{
++  tree fntype = NULL;
++  if (gimple_call_fndecl (stmt) && TREE_TYPE (gimple_call_fndecl (stmt)))
++    fntype = TREE_TYPE (gimple_call_fndecl (stmt));
++  else
++    {
++      tree call_fn = gimple_call_fn (stmt);
++      tree ptype = TREE_TYPE (call_fn);
++      gcc_assert (ptype && TREE_TYPE (ptype));
++      fntype = TREE_TYPE (ptype);
++    }
++  gcc_assert (fntype && fntype != void_type_node
++              && (TREE_CODE (fntype) == FUNCTION_TYPE
++                  || TREE_CODE (fntype) == METHOD_TYPE));
++  return fntype;
++}
++
++static void
++dump_global_var (tree decl)
++{
++  fprintf (dump_file, "Analyze global var: ");
++  print_generic_decl (dump_file, decl, TDF_NONE);
++  fprintf (dump_file, "\n");
++}
++
++static void
++collect_block_elt_types (tree tp, std::list<tree> &types, tree block)
++{
++  tree vt = TREE_TYPE (tp);
++  gcc_assert (vt);
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      const char *msg = TREE_CODE (block) == BLOCK ? "VAR's block: "
++                                                   : "VAR's ctor: ";
++      fprintf (dump_file, msg);
++      print_generic_expr (dump_file, tp);
++      dump_type_with_uid (" with type ", vt);
++    }
++  collect_scalar_types (vt, types);
++}
++
++/* Compare types of initialization block's or constructor's elements and
++   fields of the initializer type to find type aliases.  */
++
++static void
++compare_block_and_init_type (tree block, tree t1)
++{
++  std::list<tree> tlist1;
++  std::list<tree> tlist2;
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "Init's type list: ");
++  collect_scalar_types (t1, tlist1);
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "Block's type list: ");
++  if (TREE_CODE (block) == CONSTRUCTOR)
++    {
++      unsigned HOST_WIDE_INT idx;
++      tree value;
++      FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (block), idx, value)
++        {
++          gcc_assert (value);
++          collect_block_elt_types (value, tlist2, block);
++        }
++    }
++  else if (TREE_CODE (block) == BLOCK)
++    for (tree var = BLOCK_VARS (block); var; var = DECL_CHAIN (var))
++      {
++        if (TREE_CODE (var) != VAR_DECL)
++          continue;
++        collect_block_elt_types (var, tlist2, block);
++      }
++  else
++    gcc_unreachable ();
++  compare_type_lists (tlist1, tlist2);
++}
++
++/* Analyze global var to find type aliases comparing types of var and
++   initializer elements.
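++
++   The global initializers this guards against look like the one in the
++   icp4.c test added below (abridged):
++
++     T my_str = {0, (int*) &dummy, (ftype1) &boo, (ftype2) &foo,
++                 (ftype3) &bar};
++
++   where function addresses are stored under foreign pointer types; the
++   walk compares the var's scalar element types with the types of the
++   constructor's values so that such escapes are recorded rather than
++   missed.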
*/ ++ ++static void ++analyze_global_var (varpool_node *var) ++{ ++ var->get_constructor(); ++ tree decl = var->decl; ++ if (TREE_CODE (decl) == SSA_NAME || !DECL_INITIAL (decl) ++ || integer_zerop (DECL_INITIAL (decl))) ++ return; ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_global_var (decl); ++ tree var_type = TREE_TYPE (decl); ++ tree init_type = TREE_TYPE (DECL_INITIAL (decl)); ++ gcc_assert (var_type && init_type); ++ if (RECORD_OR_UNION_TYPE_P (init_type) ++ && !initializer_zerop (DECL_INITIAL (decl))) ++ compare_block_and_init_type (DECL_INITIAL (decl), init_type); ++ else if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Is not a record with nonzero init\n"); ++ ++ if (var_type == init_type) ++ return; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_two_types_with_uids ("Mismatch of var and init types: ", ++ var_type, init_type); ++ collect_scalar_types_and_find_aliases (var_type, init_type); ++} ++ ++static void ++dump_function_node_info (struct cgraph_node *n) ++{ ++ fprintf (dump_file, "\nAnalyse function node: "); ++ print_generic_expr (dump_file, n->decl); ++ fprintf (dump_file, "\n"); ++ tree fndecl_type = TREE_TYPE (n->decl); ++ dump_type_with_uid ("Function decl type: ", fndecl_type, TDF_UID); ++ if (TREE_TYPE (fndecl_type)) ++ dump_type_with_uid ("Return type: ", TREE_TYPE (fndecl_type)); ++ tree argt = TYPE_ARG_TYPES (fndecl_type); ++ for (unsigned i = 1; argt && argt != void_type_node ++ && !VOID_TYPE_P (TREE_VALUE (argt)); ++i, argt = TREE_CHAIN (argt)) ++ { ++ tree atype = TREE_VALUE (argt); ++ fprintf (dump_file, "%d-arg type: ", i); ++ dump_type_with_uid ("", atype); ++ } ++ fprintf (dump_file, "\n"); ++} ++ ++static void ++dump_call_stmt_info (gcall *stmt, tree fntype) ++{ ++ fprintf (dump_file, "\nAnalyse call stmt: "); ++ if (stmt) ++ print_gimple_stmt (dump_file, stmt, 3, TDF_DETAILS); ++ else ++ fprintf (dump_file, "(no stmt)\n"); ++ dump_type_with_uid ("fntype=", fntype, TDF_UID); ++ if (gimple_call_fntype (stmt)) ++ dump_type_with_uid ("fntype1=", gimple_call_fntype (stmt), TDF_UID); ++ if (gimple_call_fndecl (stmt) && TREE_TYPE (gimple_call_fndecl (stmt))) ++ dump_type_with_uid ("fntype2=", TREE_TYPE (gimple_call_fndecl (stmt)), ++ TDF_UID); ++} ++ ++/* Dump actual and formal arg types. */ ++ ++static void ++dump_arg_types_with_uids (int i, tree t1, tree t2) ++{ ++ if (i >= 0) ++ fprintf (dump_file, "Call's %d-arg types: ", i); ++ else ++ fprintf (dump_file, "Call's return types: "); ++ fprintf (dump_file, "(%d) and (%d) ", TYPE_UID (t1), TYPE_UID (t2)); ++ print_generic_expr (dump_file, t1, TDF_UID); ++ fprintf (dump_file, " "); ++ print_generic_expr (dump_file, t2, TDF_UID); ++ fprintf (dump_file, "\n"); ++} ++ ++/* Analyze call graph edge with connected call stmt to find type aliases in ++ arguments and return value casts. 
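++
++   Schematically, for a call statement of the form (GIMPLE-like sketch)
++
++     lhs = fn (arg0, arg1);
++
++   the walk pairs TREE_TYPE (lhs) with the return type of the detected
++   fntype, and each formal type from TYPE_ARG_TYPES (fntype) with the
++   type of the corresponding actual argument; every differing pointer
++   pair, and the pair of pointee types behind it, is fed to
++   maybe_register_non_void_aliases.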
*/ ++ ++static void ++analyze_cgraph_edge (cgraph_edge *e) ++{ ++ gcall *stmt = e->call_stmt; ++ gcc_assert (stmt != NULL); ++ tree fntype = get_call_fntype (stmt); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_call_stmt_info (stmt, fntype); ++ if (gimple_has_lhs (stmt)) ++ { ++ tree t1 = TREE_TYPE (gimple_call_lhs (stmt)); ++ tree t2 = TREE_TYPE (fntype); ++ const int is_return_arg = -1; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_arg_types_with_uids (is_return_arg, t1, t2); ++ maybe_register_non_void_aliases (t1, t2); ++ } ++ ++ tree argt = TYPE_ARG_TYPES (fntype); ++ if (!argt) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Finish call stmt analysis\n"); ++ return; ++ } ++ gcc_assert (argt); ++ unsigned num_args = gimple_call_num_args (stmt); ++ for (unsigned i = 0; i < num_args && argt; ++i, argt = TREE_CHAIN (argt)) ++ { ++ tree arg = gimple_call_arg (stmt, i); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_arg_types_with_uids (i, TREE_VALUE (argt), TREE_TYPE (arg)); ++ if (TREE_VALUE (argt) == TREE_TYPE (arg) ++ || !POINTER_TYPE_P (TREE_VALUE (argt)) ++ || !POINTER_TYPE_P (TREE_TYPE (arg))) ++ continue; ++ maybe_register_non_void_aliases (TREE_VALUE (argt), TREE_TYPE (arg)); ++ tree t1 = TREE_TYPE (TREE_VALUE (argt)); ++ tree t2 = TREE_TYPE (TREE_TYPE (arg)); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Call's %d-arg base types: (%d) and (%d)\n", ++ i, (t1 ? TYPE_UID (t1) : 0), (t2 ? TYPE_UID (t2) : 0)); ++ maybe_register_non_void_aliases (t1, t2); ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "End list of args\n"); ++ tree fndecl_type = NULL; ++ if (e->callee && e->callee->decl) ++ fndecl_type = TREE_TYPE (e->callee->decl); ++ if (fndecl_type && fndecl_type != fntype) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Function decl and edge types mismatch:\n"); ++ register_ailas_type (fntype, fndecl_type, fta_map); ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "End call stmt analysis\n"); ++} ++ ++static void ++dump_assign_info (gimple *stmt, tree rhs, tree lhs_type, tree rhs_type) ++{ ++ fprintf (dump_file, "\nAnalyse assign cast/copy stmt, rhs=%s: ", ++ get_tree_code_name (TREE_CODE (rhs))); ++ print_gimple_stmt (dump_file, stmt, 3, TDF_DETAILS); ++ fprintf (dump_file, "Types: "); ++ print_generic_expr (dump_file, lhs_type); ++ fprintf (dump_file, ", "); ++ print_generic_expr (dump_file, rhs_type); ++ fprintf (dump_file, "\n"); ++} ++ ++/* Analyze cast/copy assign stmt to find type aliases. 
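++
++   Two fragments (hypothetical C; the second mirrors the icp6.c test
++   added below) of what typically lands here after gimplification:
++
++     q = (int *) p;                     // cast copy, p is long *:
++                                        // int * and long * become aliases
++     my_str.myf2 = (ftype2) 0x864213;   // non-zero constant stored into a
++                                        // function pointer: the pointed-to
++                                        // function type is marked unsafe
++
++   An unsafe function type later excludes its call sites from promotion,
++   since such a pointer may lead to code the analysis cannot see.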
*/ ++ ++static void ++analyze_assign_stmt (gimple *stmt) ++{ ++ gcc_assert (is_gimple_assign (stmt)); ++ tree rhs_type = NULL_TREE; ++ tree lhs_type = TREE_TYPE (gimple_assign_lhs (stmt)); ++ tree rhs = gimple_assign_rhs1 (stmt); ++ if (TREE_CODE (rhs) == MEM_REF) ++ { ++ rhs = TREE_OPERAND (rhs, 0); ++ tree ptr_type = TREE_TYPE (rhs); ++ gcc_assert (POINTER_TYPE_P (ptr_type)); ++ rhs_type = TREE_TYPE (ptr_type); ++ } ++ else if (TREE_CODE (rhs) == ADDR_EXPR) ++ { ++ rhs = TREE_OPERAND (rhs, 0); ++ if (VAR_OR_FUNCTION_DECL_P (rhs) || TREE_CODE (rhs) == STRING_CST ++ || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL) ++ rhs_type = build_pointer_type (TREE_TYPE (rhs)); ++ else if (TREE_CODE (rhs) == COMPONENT_REF) ++ { ++ rhs = TREE_OPERAND (rhs, 1); ++ rhs_type = build_pointer_type (TREE_TYPE (rhs)); ++ } ++ else if (TREE_CODE (rhs) == MEM_REF) ++ { ++ rhs = TREE_OPERAND (rhs, 0); ++ rhs_type = TREE_TYPE (rhs); ++ gcc_assert (POINTER_TYPE_P (rhs_type)); ++ } ++ else ++ gcc_unreachable(); ++ } ++ else ++ rhs_type = TREE_TYPE (rhs); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_assign_info (stmt, rhs, lhs_type, rhs_type); ++ if (CONSTANT_CLASS_P (rhs) && !zerop (rhs) ++ && FUNCTION_POINTER_TYPE_P (TREE_TYPE (rhs))) ++ { ++ tree ftype = TREE_TYPE (rhs_type); ++ unsafe_types->insert (TYPE_UID (ftype)); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Function type (%d) is unsafe due to assign " ++ "non-zero cst to function pointer\n", TYPE_UID (ftype)); ++ } ++ maybe_register_non_void_aliases (lhs_type, rhs_type); ++} ++ ++/* Walk all fn's stmt to analyze assigns. */ ++ ++static void ++analyze_assigns (function* fn) ++{ ++ push_cfun (fn); ++ basic_block bb; ++ gimple_stmt_iterator si; ++ FOR_EACH_BB_FN (bb, fn) ++ for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) ++ { ++ gimple *stmt = gsi_stmt (si); ++ if (!gimple_assign_cast_p (stmt) && !gimple_assign_copy_p (stmt)) ++ continue; ++ analyze_assign_stmt (stmt); ++ } ++ pop_cfun (); ++} ++ ++/* Walk all functions to collect sets of type aliases. */ ++ ++static void ++collect_type_alias_sets () ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\n\nCollect type alias sets walking global vars.\n"); ++ ++ varpool_node *var; ++ FOR_EACH_VARIABLE (var) ++ if (var->real_symbol_p ()) ++ analyze_global_var (var); ++ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nCollect type alias sets walking functions.\n"); ++ ++ struct cgraph_node *n; ++ FOR_EACH_FUNCTION (n) ++ { ++ if (!n->has_gimple_body_p ()) ++ continue; ++ n->get_body (); ++ function *fn = DECL_STRUCT_FUNCTION (n->decl); ++ if (!fn) ++ continue; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_function_node_info (n); ++ /* Analyze direct/indirect function calls. */ ++ for (cgraph_edge *e = n->callees; e; e = e->next_callee) ++ analyze_cgraph_edge (e); ++ for (cgraph_edge *e = n->indirect_calls; e; e = e->next_callee) ++ analyze_cgraph_edge (e); ++ /* Analyze assign (with casts) statements. 
*/ ++ analyze_assigns (fn); ++ } ++} ++ ++static void ++process_cbase_to_ptype_map () ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nProcess types in cbase-to-ptypes map:\n"); ++ ++ for (type_alias_map::iterator it1 = cbase_to_ptype->begin (); ++ it1 != cbase_to_ptype->end (); ++it1) ++ { ++ type_set *set = it1->second; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_type_uid_with_set ("cb=(%d): ", (*type_uid_map)[it1->first], ++ cbase_to_ptype); ++ tree ctype = NULL; ++ for (type_set::const_iterator it2 = set->begin (); ++ it2 != set->end (); it2++) ++ { ++ tree t2 = (*type_uid_map)[*it2]; ++ if (t2 == TYPE_MAIN_VARIANT (t2)) ++ { ++ ctype = t2; ++ break; ++ } ++ } ++ if (!ctype) ++ continue; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_type_with_uid ("Select canonical type: ", ctype); ++ for (type_set::const_iterator it2 = set->begin (); ++ it2 != set->end (); it2++) ++ { ++ tree t = (*type_uid_map)[*it2]; ++ if (!ctype_map->count (t)) ++ { ++ (*ctype_map)[t] = ctype; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Set canonical type for (%d)->c(%d)\n", ++ *it2, TYPE_UID (ctype)); ++ } ++ else if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Canonical type is already set (%d)->c(%d)\n", ++ *it2, TYPE_UID ((*ctype_map)[t])); ++ } ++ } ++} ++ ++static void ++set_canonical_type_for_type_set (type_set *set) ++{ ++ tree one_canonical = NULL; ++ for (type_set::const_iterator it = set->begin (); it != set->end (); it++) ++ { ++ tree t = (*type_uid_map)[*it]; ++ gcc_assert (t); ++ if ((TYPE_CANONICAL (t) || ctype_map->count (t))) ++ { ++ one_canonical = TYPE_CANONICAL (t) ? TYPE_CANONICAL (t) ++ : (*ctype_map)[t]; ++ gcc_assert (COMPLETE_TYPE_P (t)); ++ break; ++ } ++ } ++ for (type_set::const_iterator it = set->begin (); it != set->end (); it++) ++ { ++ tree t = (*type_uid_map)[*it]; ++ if (!ctype_map->count (t)) ++ { ++ (*ctype_map)[t] = one_canonical; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ if (one_canonical) ++ fprintf (dump_file, "Set canonical type for (%d)->c(%d)\n", ++ TYPE_UID (t), TYPE_UID (one_canonical)); ++ else ++ fprintf (dump_file, "Set NULL canonical for (%d)\n", *it); ++ } ++ } ++ else if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ tree ct = (*ctype_map)[t]; ++ fprintf (dump_file, "Canonical type is already set (%d)->c(%d)\n", ++ TYPE_UID (t), ct ? TYPE_UID (ct) : -1); ++ } ++ } ++} ++ ++static void ++dump_is_type_set_incomplete (type_set * set) ++{ ++ bool has_complete_types = false; ++ for (type_set::const_iterator it = set->begin (); it != set->end (); it++) ++ if (COMPLETE_TYPE_P ((*type_uid_map)[*it])) ++ { ++ has_complete_types = true; ++ break; ++ } ++ if (!has_complete_types) ++ fprintf (dump_file, "Set of incomplete types\n"); ++} ++ ++static void ++process_alias_type_sets () ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nProcess alias sets of types:\n"); ++ /* Keep processed types to process each type set (in ta_map) only once. 
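++
++     Schematically, the two cases handled below (hypothetical alias sets):
++
++       { int (*)(int), float (*)(int) }   - only function pointers: the
++                                            base types int(int) and
++                                            float(int) become fta aliases
++       { int (*)(int), char * }           - mixed set: the base type
++                                            int(int) is marked unsafe
++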
*/ ++ type_set processed_types; ++ for (type_alias_map::iterator it1 = ta_map->begin (); ++ it1 != ta_map->end (); ++it1) ++ { ++ tree type = (*type_uid_map)[it1->first]; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_type_uid_with_set ("(%d) ", type, ta_map); ++ if (processed_types.count (TYPE_UID (type)) != 0 ++ || unsafe_types->count (TYPE_UID (type)) != 0) ++ continue; ++ type_set *set = it1->second; ++ for (type_set::const_iterator it2 = set->begin (); ++ it2 != set->end (); it2++) ++ processed_types.insert (*it2); ++ /* Check if this type set contains function pointers and ++ non-function pointers. */ ++ bool has_no_fp = false, has_fp = false; ++ for (type_set::const_iterator it2 = set->begin (); ++ it2 != set->end (); it2++) ++ { ++ tree t2 = (*type_uid_map)[*it2]; ++ if (FUNCTION_POINTER_TYPE_P (t2)) ++ has_fp = true; ++ else ++ has_no_fp = true; ++ if (has_fp && has_no_fp) ++ break; ++ } ++ if (has_fp) ++ { ++ for (type_set::const_iterator it2 = set->begin (); ++ it2 != set->end (); it2++) ++ { ++ tree t2 = (*type_uid_map)[*it2]; ++ /* If it's a type set with mixed function and not-function types, ++ mark all function pointer types in the set as unsafe. */ ++ if (has_no_fp && FUNCTION_POINTER_TYPE_P (t2)) ++ { ++ tree ftype = TREE_TYPE (t2); ++ unsafe_types->insert (TYPE_UID (ftype)); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Insert function type (%d) to unsafe " ++ "due to escape its pointer type (%d) to mixed " ++ "alias set (printed before)\n", ++ TYPE_UID (ftype), TYPE_UID (t2)); ++ } ++ /* If it's a type set with only function pointer types, ++ mark all base function types in the set as aliases. */ ++ if (!has_no_fp) ++ { ++ gcc_assert (FUNCTION_POINTER_TYPE_P (type) ++ && FUNCTION_POINTER_TYPE_P (t2)); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Insert function type aliases by " ++ "function pointer aliases:\n"); ++ register_ailas_type (TREE_TYPE (type), TREE_TYPE (t2), ++ fta_map); ++ } ++ } ++ } ++ set_canonical_type_for_type_set (set); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_is_type_set_incomplete (set); ++ } ++} ++ ++static void ++dump_unsafe_and_canonical_types () ++{ ++ fprintf (dump_file, "\nList of unsafe types:\n"); ++ for (type_set::iterator it = unsafe_types->begin (); ++ it != unsafe_types->end (); ++it) ++ { ++ print_generic_expr (dump_file, (*type_uid_map)[*it]); ++ fprintf (dump_file, " (%d)\n", *it); ++ } ++ fprintf (dump_file, "\nList of alias canonical types:\n"); ++ for (type_alias_map::iterator it = ta_map->begin (); ++ it != ta_map->end (); ++it) ++ { ++ tree type = (*type_uid_map)[it->first]; ++ if (ctype_map->count (type) == 0) ++ continue; ++ print_generic_expr (dump_file, type); ++ fprintf (dump_file, " -> "); ++ tree ctype = (*ctype_map)[type]; ++ if (ctype != NULL) ++ { ++ print_generic_expr (dump_file, ctype); ++ fprintf (dump_file, " (%d)->(%d)\n", ++ TYPE_UID (type), TYPE_UID (ctype)); ++ } ++ else ++ fprintf (dump_file, " null\n"); ++ } ++} ++ ++static void ++init_function_type_alias_for_edge (cgraph_edge *e) ++{ ++ gcall *stmt = e->call_stmt; ++ gcc_assert (stmt != NULL); ++ tree fntype = get_call_fntype (stmt); ++ if (fta_map->count (TYPE_UID (fntype)) == 0) ++ register_ailas_type (fntype, fntype, fta_map); ++} ++ ++/* This pass over all function types makes each function type to have ++ at least one alias (itself). 
*/
++
++static void
++init_function_type_aliases ()
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "\nInit aliases for all function types.\n");
++
++  struct cgraph_node *n;
++  FOR_EACH_FUNCTION (n)
++    {
++      tree fntype = TREE_TYPE (n->decl);
++      if (fta_map->count (TYPE_UID (fntype)) == 0)
++        register_ailas_type (fntype, fntype, fta_map);
++
++      if (!n->has_gimple_body_p ())
++        continue;
++      n->get_body ();
++      function *fn = DECL_STRUCT_FUNCTION (n->decl);
++      if (!fn)
++        continue;
++
++      /* Init for function types of direct/indirect callees.  */
++      for (cgraph_edge *e = n->callees; e; e = e->next_callee)
++        init_function_type_alias_for_edge (e);
++      for (cgraph_edge *e = n->indirect_calls; e; e = e->next_callee)
++        init_function_type_alias_for_edge (e);
++    }
++}
++
++/* In lto-common.c there is the global canonical type table and the
++   corresponding machinery which detects the same types from different
++   modules and joins them, assigning the one canonical type.  However,
++   LTO does not aim at a complete and precise matching, so sometimes
++   a few types have no TYPE_CANONICAL set.  Since ICP relies on
++   precise type matching, we create a similar table and register all
++   the required types in it.  */
++
++static std::map<const_tree, hashval_t> *canonical_type_hash_cache = NULL;
++static std::map<hashval_t, tree> *icp_canonical_types = NULL;
++
++static hashval_t hash_canonical_type (tree type);
++
++/* Register canonical type in icp_canonical_types and ctype_map evaluating
++   its hash (using hash_canonical_type) if it's needed.  */
++
++static hashval_t
++icp_register_canonical_type (tree t)
++{
++  hashval_t hash;
++  if (canonical_type_hash_cache->count ((const_tree) t) == 0)
++    {
++      tree t1 = TYPE_MAIN_VARIANT (t);
++      if (!COMPLETE_TYPE_P (t1) && TYPE_CANONICAL (t1)
++          && COMPLETE_TYPE_P (TYPE_CANONICAL (t1)))
++        {
++          t1 = TYPE_CANONICAL (t1);
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            fprintf (dump_file, "Use complete canonical (%d) for (%d)\n",
++                     TYPE_UID (t1), TYPE_UID (t));
++        }
++      hash = hash_canonical_type (t1);
++      /* Cache the just computed hash value.  */
++      (*canonical_type_hash_cache)[(const_tree) t] = hash;
++    }
++  else
++    hash = (*canonical_type_hash_cache)[(const_tree) t];
++
++  tree new_type = t;
++  if (icp_canonical_types->count (hash))
++    {
++      new_type = (*icp_canonical_types)[hash];
++      gcc_checking_assert (new_type != t);
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "Found canonical (%d) for (%d), h=%u\n",
++                 TYPE_UID (new_type), TYPE_UID (t), (unsigned int) hash);
++    }
++  else
++    {
++      (*icp_canonical_types)[hash] = t;
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "Register canonical %d, h=%u\n", TYPE_UID (t),
++                 (unsigned int) hash);
++    }
++  if (ctype_map->count (t) == 0)
++    (*ctype_map)[t] = new_type;
++  return hash;
++}
++
++/* Merge hstate with hash of the given type.  If the type is not registered,
++   register it in the maps of the canonical types.  */
++
++static void
++iterative_hash_canonical_type (tree type, inchash::hash &hstate)
++{
++  hashval_t v;
++  /* All type variants have same TYPE_CANONICAL.  */
++  type = TYPE_MAIN_VARIANT (type);
++  if (canonical_type_hash_cache->count ((const_tree) type))
++    v = (*canonical_type_hash_cache)[(const_tree) type];
++  else
++    v = icp_register_canonical_type (type);
++  hstate.merge_hash (v);
++}
++
++/* Compute and return hash for the given type.  It does not take into account
++   base types of pointer types.
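++
++   A stand-alone sketch of the same grouping idea on a toy descriptor
++   (illustration only; the real code hashes tree nodes via inchash::hash):
++
++     struct toy_type { int code; int precision; int nfields; };
++
++     unsigned
++     toy_hash (const toy_type &t)
++     {
++       unsigned h = (unsigned) t.code;        // group by tree code first
++       h = h * 31 + (unsigned) t.precision;   // then scalar features
++       h = h * 31 + (unsigned) t.nfields;     // then aggregate arity
++       return h;
++     }
++
++   Types whose feature hashes differ can never be merged, which keeps the
++   candidate sets small.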
*/ ++ ++static hashval_t ++hash_canonical_type (tree type) ++{ ++ inchash::hash hstate; ++ enum tree_code code; ++ /* Combine a few common features of types so that types are grouped into ++ smaller sets; when searching for existing matching types to merge, ++ only existing types having the same features as the new type will be ++ checked. */ ++ code = tree_code_for_canonical_type_merging (TREE_CODE (type)); ++ hstate.add_int (code); ++ if (!RECORD_OR_UNION_TYPE_P (type)) ++ hstate.add_int (TYPE_MODE (type)); ++ /* Incorporate common features of numerical types. */ ++ if (INTEGRAL_TYPE_P (type) ++ || SCALAR_FLOAT_TYPE_P (type) ++ || FIXED_POINT_TYPE_P (type) ++ || TREE_CODE (type) == OFFSET_TYPE ++ || POINTER_TYPE_P (type)) ++ { ++ hstate.add_int (TYPE_PRECISION (type)); ++ if (!type_with_interoperable_signedness (type)) ++ hstate.add_int (TYPE_UNSIGNED (type)); ++ } ++ if (VECTOR_TYPE_P (type)) ++ { ++ hstate.add_poly_int (TYPE_VECTOR_SUBPARTS (type)); ++ hstate.add_int (TYPE_UNSIGNED (type)); ++ } ++ if (TREE_CODE (type) == COMPLEX_TYPE) ++ hstate.add_int (TYPE_UNSIGNED (type)); ++ if (POINTER_TYPE_P (type)) ++ hstate.add_int (TYPE_ADDR_SPACE (TREE_TYPE (type))); ++ /* For array types hash the domain bounds and the string flag. */ ++ if (TREE_CODE (type) == ARRAY_TYPE && TYPE_DOMAIN (type)) ++ { ++ hstate.add_int (TYPE_STRING_FLAG (type)); ++ /* OMP lowering can introduce error_mark_node in place of ++ random local decls in types. */ ++ if (TYPE_MIN_VALUE (TYPE_DOMAIN (type)) != error_mark_node) ++ inchash::add_expr (TYPE_MIN_VALUE (TYPE_DOMAIN (type)), hstate); ++ if (TYPE_MAX_VALUE (TYPE_DOMAIN (type)) != error_mark_node) ++ inchash::add_expr (TYPE_MAX_VALUE (TYPE_DOMAIN (type)), hstate); ++ } ++ /* Recurse for aggregates with a single element type. */ ++ if (TREE_CODE (type) == ARRAY_TYPE ++ || TREE_CODE (type) == COMPLEX_TYPE ++ || TREE_CODE (type) == VECTOR_TYPE) ++ iterative_hash_canonical_type (TREE_TYPE (type), hstate); ++ /* Incorporate function return and argument types. */ ++ if (TREE_CODE (type) == FUNCTION_TYPE || TREE_CODE (type) == METHOD_TYPE) ++ { ++ unsigned nargs = 0; ++ iterative_hash_canonical_type (TREE_TYPE (type), hstate); ++ for (tree p = TYPE_ARG_TYPES (type); p; p = TREE_CHAIN (p)) ++ { ++ iterative_hash_canonical_type (TREE_VALUE (p), hstate); ++ nargs++; ++ } ++ hstate.add_int (nargs); ++ } ++ if (RECORD_OR_UNION_TYPE_P (type)) ++ { ++ unsigned nfields = 0; ++ for (tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f)) ++ if (TREE_CODE (f) == FIELD_DECL) ++ { ++ iterative_hash_canonical_type (TREE_TYPE (f), hstate); ++ nfields++; ++ } ++ hstate.add_int (nfields); ++ } ++ return hstate.end (); ++} ++ ++/* It finds canonical type in ctype_map and icp_canonical_types maps. */ ++ ++static tree ++find_canonical_type (tree type) ++{ ++ if (ctype_map->count (type)) ++ return (*ctype_map)[type]; ++ if (canonical_type_hash_cache->count ((const_tree) type) == 0) ++ return NULL; ++ hashval_t h = (*canonical_type_hash_cache)[(const_tree) type]; ++ if (icp_canonical_types->count (h)) ++ return (*icp_canonical_types)[h]; ++ return NULL; ++} ++ ++/* It updates hash for the given type taking into account pointees in pointer ++ types. If the type is incomplete function type, it returns true. It's used ++ only for function type hash calculation. */ ++ ++static bool ++initial_hash_canonical_type (tree type, inchash::hash &hstate) ++{ ++ /* All type variants have same TYPE_CANONICAL. 
*/ ++ type = TYPE_MAIN_VARIANT (type); ++ if (VOID_TYPE_P (type)) ++ { ++ hstate.add_int (POINTER_TYPE); ++ return false; ++ } ++ hstate.add_int (TREE_CODE (type)); ++ hstate.add_int (TYPE_MODE (type)); ++ if (POINTER_TYPE_P (type)) ++ { ++ tree base_type = TREE_TYPE (type); ++ hstate.add_int (TYPE_ADDR_SPACE (base_type)); ++ return initial_hash_canonical_type (base_type, hstate); ++ } ++ tree ctype = find_canonical_type (type); ++ if (!ctype) ++ { ++ if (TREE_CODE (type) == FUNCTION_TYPE || TREE_CODE (type) == METHOD_TYPE) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Due to ftype (%d)\n", TYPE_UID (type)); ++ return true; ++ } ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_type_with_uid ("Has NO canonical type: ", type, TDF_UID); ++ icp_register_canonical_type (type); ++ if (ctype_map->count(type)) ++ ctype = (*ctype_map)[type]; ++ if (ctype && dump_file && (dump_flags & TDF_DETAILS)) ++ dump_type_with_uid ("Found canonical type: ", ctype, TDF_UID); ++ } ++ else if (dump_file && (dump_flags & TDF_DETAILS)) ++ dump_type_with_uid ("Canonical type: ", ctype, TDF_UID); ++ hstate.add_int (TYPE_UID (ctype)); ++ return false; ++} ++ ++/* It returns hash value for the given function type. If the function type is ++ incomplete, insert it in the incomplete_hash_ftype set. */ ++ ++static hashval_t ++get_hash_for_ftype (tree type, type_set *incomplete_hash_ftype) ++{ ++ bool incomplete = false; ++ inchash::hash hstate; ++ /* Function type is expected. */ ++ gcc_assert (TREE_CODE (type) == FUNCTION_TYPE ++ || TREE_CODE (type) == METHOD_TYPE); ++ /* Hash return type. */ ++ tree rt = TREE_TYPE (type); ++ tree ct = rt ? find_canonical_type (rt) : void_type_node; ++ incomplete |= initial_hash_canonical_type (ct ? ct : rt, hstate); ++ /* Hash arg types. */ ++ tree argt = TYPE_ARG_TYPES (type); ++ if (!argt) ++ incomplete |= initial_hash_canonical_type (void_type_node, hstate); ++ else ++ for (unsigned i = 1; argt; ++i, argt = TREE_CHAIN (argt)) ++ { ++ tree ct = find_canonical_type (TREE_VALUE (argt)); ++ ct = ct ? ct : TREE_VALUE (argt); ++ incomplete |= initial_hash_canonical_type (ct, hstate); ++ } ++ if (incomplete && incomplete_hash_ftype->count (TYPE_UID (type)) == 0) ++ incomplete_hash_ftype->insert (TYPE_UID (type)); ++ else if (!incomplete && incomplete_hash_ftype->count (TYPE_UID (type)) != 0) ++ incomplete_hash_ftype->erase (TYPE_UID (type)); ++ return hstate.end(); ++} ++ ++/* Find type aliases evaluating type hashes and connecting types with ++ the same hash values. 
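++
++   The hash of a function type depends on the canonicals of other types,
++   so a single pass may be inconclusive and the driver below iterates to
++   a fixed point.  Its control structure, reduced to a self-contained
++   sketch over an abstract dependency graph (assumes <vector>):
++
++     static void
++     resolve_to_fixed_point (std::vector<std::vector<int> > &deps,
++                             std::vector<bool> &resolved)
++     {
++       bool changed;
++       do
++         {
++           changed = false;
++           for (unsigned i = 0; i < deps.size (); i++)
++             {
++               if (resolved[i])
++                 continue;            // already has a canonical
++               bool ready = true;
++               for (unsigned j = 0; j < deps[i].size (); j++)
++                 if (!resolved[deps[i][j]])
++                   ready = false;     // hash still incomplete this round
++               if (ready)
++                 resolved[i] = changed = true;
++             }
++         }
++       while (changed);
++     }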
*/
++
++static void
++find_type_aliases_by_compatibility ()
++{
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    fprintf (dump_file, "\nFind type aliases checking their compatibility.\n");
++
++  std::map<hashval_t, tree> hash_to_ftype;
++  type_set *incomplete_hash_ftype = new type_set;
++  canonical_type_hash_cache = new std::map<const_tree, hashval_t>;
++  icp_canonical_types = new std::map<hashval_t, tree>;
++
++  bool changed;
++  int i = 0;
++  do
++    {
++      changed = false;
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "Iteration %d\n", i);
++      for (type_alias_map::iterator it = fta_map->begin ();
++           it != fta_map->end (); ++it)
++        {
++          tree type = (*type_uid_map)[it->first];
++          if (TYPE_CANONICAL (type))
++            continue;
++          hashval_t hash = get_hash_for_ftype (type, incomplete_hash_ftype);
++          if (incomplete_hash_ftype->count (TYPE_UID (type)) != 0)
++            {
++              if (dump_file && (dump_flags & TDF_DETAILS))
++                fprintf (dump_file, "Incomplete (%d), h=%u\n", TYPE_UID (type),
++                         (unsigned int) hash);
++              continue;
++            }
++          if (hash_to_ftype.count (hash) == 0)
++            hash_to_ftype[hash] = type;
++          TYPE_CANONICAL (type) = hash_to_ftype[hash];
++          changed = true;
++          if (dump_file && (dump_flags & TDF_DETAILS))
++            fprintf (dump_file, "(%d)->(%d), h=%u\n", TYPE_UID (type),
++                     TYPE_UID (TYPE_CANONICAL (type)), (unsigned int) hash);
++        }
++      i++;
++    }
++  while (changed);
++
++  delete incomplete_hash_ftype;
++  delete icp_canonical_types;
++  delete canonical_type_hash_cache;
++}
++
++static void
++dump_function_type_aliases_list ()
++{
++  fprintf (dump_file, "\nList of function type aliases:\n");
++  for (type_alias_map::iterator it = fta_map->begin ();
++       it != fta_map->end (); ++it)
++    dump_type_uid_with_set ("(%d) ", (*type_uid_map)[it->first], fta_map);
++}
++
++/* Collect type aliases and find missed canonical types.  */
++
++static void
++collect_function_type_aliases ()
++{
++  collect_type_alias_sets ();
++  process_cbase_to_ptype_map ();
++  process_alias_type_sets ();
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    dump_unsafe_and_canonical_types ();
++
++  /* TODO: maybe remove this pass.  */
++  init_function_type_aliases ();
++  for (type_alias_map::iterator it = fta_map->begin ();
++       it != fta_map->end (); ++it)
++    set_canonical_type_for_type_set (it->second);
++  find_type_aliases_by_compatibility ();
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    dump_function_type_aliases_list ();
++}
++
++static void
++dump_function_signature_info (struct cgraph_node *n, tree ftype, bool varargs)
++{
++  fprintf (dump_file, "Function decl: ");
++  print_generic_expr (dump_file, n->decl);
++  dump_type_uid_with_set (" with type (%d) ", ftype, fta_map, true, false);
++  if (varargs)
++    fprintf (dump_file, "has varargs, ");
++  if (TREE_CODE (ftype) == METHOD_TYPE)
++    fprintf (dump_file, "is method, ");
++  if (!n->address_taken)
++    fprintf (dump_file, "is not address taken, ");
++  if (unsafe_types->count (TYPE_UID (ftype)))
++    fprintf (dump_file, "is unsafe, ");
++  fprintf (dump_file, "\n");
++}
++
++/* Check if the function has variadic arguments.
++   It is a corrected version of count_num_arguments ().  */
++
++static bool
++has_varargs (tree decl)
++{
++  tree t;
++  unsigned int num = 0;
++  for (t = TYPE_ARG_TYPES (TREE_TYPE (decl));
++       t && TREE_VALUE (t) != void_type_node; t = TREE_CHAIN (t))
++    num++;
++  if (!t && num)
++    return true;
++  return false;
++}
++
++/* Join fs_map's sets for function type aliases.  */
++
++static void
++merge_fs_map_for_ftype_aliases ()
++{
++  if (dump_file)
++    fprintf (dump_file, "\n\nMerge decl sets for function type aliases:\n");
++  type_set processed_types;
++  for (type_decl_map::iterator it1 = fs_map->begin ();
++       it1 != fs_map->end (); ++it1)
++    {
++      if (processed_types.count (it1->first) != 0)
++        continue;
++      decl_set *d_set = it1->second;
++      tree type = (*type_uid_map)[it1->first];
++      type_set *set = (*fta_map)[it1->first];
++      for (type_set::const_iterator it2 = set->begin ();
++           it2 != set->end (); it2++)
++        {
++          tree t2 = (*type_uid_map)[*it2];
++          processed_types.insert (*it2);
++          if (type == t2)
++            continue;
++          gcc_assert ((TREE_CODE (type) == FUNCTION_TYPE
++                       || TREE_CODE (type) == METHOD_TYPE)
++                      && (TREE_CODE (t2) == FUNCTION_TYPE
++                          || TREE_CODE (t2) == METHOD_TYPE));
++          if (fs_map->count (*it2) == 0 || (*fs_map)[*it2] == NULL)
++            (*fs_map)[*it2] = d_set;
++          else
++            {
++              decl_set *t2_decl_set = (*fs_map)[*it2];
++              (*fs_map)[*it2] = d_set;
++              gcc_assert (t2_decl_set && t2_decl_set->size () > 0);
++              d_set->insert (t2_decl_set->begin (), t2_decl_set->end ());
++              delete t2_decl_set;
++            }
++        }
++    }
++}
++
++/* Dump function types with the set of functions corresponding to each.  */
++
++static void
++dump_function_signature_sets ()
++{
++  fprintf (dump_file, "\n\nUnique sets of function signatures:\n");
++  std::set<decl_set *> processed_sets;
++  for (type_decl_map::iterator it1 = fs_map->begin ();
++       it1 != fs_map->end (); ++it1)
++    {
++      decl_set *set = it1->second;
++      if (processed_sets.count (set) != 0)
++        continue;
++      processed_sets.insert (set);
++      fprintf (dump_file, "{ ");
++      print_type_set (it1->first, fta_map);
++      fprintf (dump_file, " : ");
++      for (decl_set::const_iterator it2 = set->begin ();
++           it2 != set->end (); it2++)
++        {
++          fprintf (dump_file, it2 == set->begin () ? "" : ", ");
++          print_generic_expr (dump_file, *it2);
++          fprintf (dump_file, "(%d)", DECL_UID (*it2));
++        }
++      fprintf (dump_file, "}\n");
++    }
++}
++
++/* Fill the map of function types to sets of function decls.  */
++
++static void
++collect_function_signatures ()
++{
++  if (dump_file)
++    fprintf (dump_file, "\n\nCollect function signatures:\n");
++  struct cgraph_node *n;
++  FOR_EACH_FUNCTION (n)
++    {
++      gcc_assert (n->decl && TREE_TYPE (n->decl));
++      tree ftype = TREE_TYPE (n->decl);
++      bool varargs = has_varargs (n->decl);
++      if (varargs && n->address_taken)
++        has_address_taken_functions_with_varargs = true;
++      if (dump_file)
++        dump_function_signature_info (n, ftype, varargs);
++      if (!n->address_taken)
++        continue;
++      /* TODO: make a separate pass at the end to remove canonicals.  */
++      tree ctype = TYPE_CANONICAL (ftype);
++      unsigned alias_type_fs = ctype ? TYPE_UID (ctype) : 0;
++      if (dump_file)
++        fprintf (dump_file, "canonical type: %d %ld\n",
++                 alias_type_fs, fs_map->count (alias_type_fs));
++      if (alias_type_fs)
++        {
++          if (fs_map->count (TYPE_UID (ctype)) == 0)
++            (*fs_map)[TYPE_UID (ctype)] = new decl_set ();
++          if (dump_file)
++            fprintf (dump_file, "insert decl (%d) to set of map [%d]\n",
++                     DECL_UID (n->decl), TYPE_UID (ctype));
++          (*fs_map)[TYPE_UID (ctype)]->insert (n->decl);
++        }
++    }
++  merge_fs_map_for_ftype_aliases ();
++  if (dump_file)
++    dump_function_signature_sets ();
++}
++
++#define MAX_TARG_STAT 4
++struct icp_stats
++{
++  int npolymorphic;
++  int nspeculated;
++  int nsubst;
++  int ncold;
++  int nmultiple;
++  int noverwritable;
++  int nnotdefined;
++  int nexternal;
++  int nartificial;
++  int nremove;
++  int nicp;
++  int nspec;
++  int nf;
++  int ncalls;
++  int nindir;
++  int nind_only;
++  int ntargs[MAX_TARG_STAT + 1];
++};
++
++static void
++dump_processing_function (struct cgraph_node *n, struct icp_stats &stats)
++{
++  fprintf (dump_file, "\n\nProcessing function %s\n", n->dump_name ());
++  print_generic_expr (dump_file, n->decl);
++  fprintf (dump_file, "\n");
++  dump_type_with_uid ("Func's type: ", TREE_TYPE (n->decl));
++  if (dump_file && (dump_flags & TDF_STATS))
++    {
++      struct cgraph_edge *e;
++      stats.nf++;
++      for (e = n->indirect_calls; e; e = e->next_callee)
++        stats.nindir++;
++      for (e = n->callees; e; e = e->next_callee)
++        stats.ncalls++;
++      stats.ncalls += stats.nindir;
++      if (n->callers == NULL)
++        {
++          fprintf (dump_file, "Function has NO callers\n");
++          stats.nind_only++;
++        }
++    }
++}
++
++static void
++dump_indirect_call_site (tree call_fn, tree call_fn_ty)
++{
++  fprintf (dump_file, "Indirect call site: ");
++  print_generic_expr (dump_file, call_fn);
++  dump_type_with_uid ("\nFunction pointer type: ", call_fn_ty);
++}
++
++static void
++erase_from_unreachable (unsigned type_uid, type_set &unreachable)
++{
++  unreachable.erase (type_uid);
++  if (!fta_map->count (type_uid))
++    return;
++  type_set *set = (*fta_map)[type_uid];
++  for (type_set::const_iterator it = set->begin (); it != set->end (); it++)
++    unreachable.erase (*it);
++}
++
++static void
++dump_found_fdecls (decl_set *decls, unsigned ctype_uid)
++{
++  fprintf (dump_file, "Signature analysis FOUND decls (%d):", ctype_uid);
++  for (decl_set::const_iterator it = decls->begin (); it != decls->end (); it++)
++    {
++      print_generic_expr (dump_file, *it);
++      fprintf (dump_file, "(%d), ", DECL_UID (*it));
++    }
++  if (unsafe_types->count (ctype_uid))
++    fprintf (dump_file, "type is UNSAFE");
++  fprintf (dump_file, "\n");
++}
++
++static void
++count_found_targets (struct icp_stats &stats, unsigned size)
++{
++  gcc_assert (size > 0);
++  stats.ntargs[size > MAX_TARG_STAT ? MAX_TARG_STAT : size - 1]++;
++}
++
++/* Promote the indirect call.  */
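++
++/* At the source level the two promotion strategies look roughly like
++   this (illustrative C; fp is an indirect callee the analysis resolved
++   to foo):
++
++     r = fp (x);          // before
++
++     r = foo (x);         // -ficp: the call is rewritten to a direct one
++
++     if (fp == foo)       // with flag_icp_speculatively: a guarded
++       r = foo (x);       // direct call that stays correct even if
++     else                 // the analysis was too optimistic
++       r = fp (x);
++*/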
++
++static void
++promote_call (struct cgraph_edge *e, struct cgraph_node *n,
++              struct cgraph_node *likely_target, struct icp_stats *stats)
++{
++  if (dump_enabled_p ())
++    {
++      dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, e->call_stmt,
++                       "promoting indirect call in %s to %s\n",
++                       n->dump_name (), likely_target->dump_name ());
++    }
++  if (!likely_target->can_be_discarded_p ())
++    {
++      symtab_node *sn = likely_target->noninterposable_alias ();
++      cgraph_node *alias = dyn_cast<cgraph_node *> (sn);
++      if (alias)
++        likely_target = alias;
++    }
++  gimple *new_call;
++  if (flag_icp_speculatively)
++    {
++      e->make_speculative (likely_target, e->count.apply_scale (5, 10));
++      new_call = e->call_stmt;
++      stats->nspec++;
++    }
++  else
++    {
++      cgraph_edge *e2 = cgraph_edge::make_direct (e, likely_target);
++      new_call = cgraph_edge::redirect_call_stmt_to_callee (e2);
++      stats->nsubst++;
++    }
++  if (dump_file)
++    {
++      fprintf (dump_file, "The call is substituted by: ");
++      print_gimple_stmt (dump_file, new_call, 0);
++      fprintf (dump_file, "\n");
++    }
++}
++
++/* Find functions which are called only indirectly and, if they are not in
++   fs_map, can be removed.  For now it is used only to print stats.  */
++
++static int
++find_functions_can_be_removed (type_set &unreachable)
++{
++  int nremove = 0;
++  if (dump_file)
++    fprintf (dump_file, "\nRemove unused functions:\n");
++  struct cgraph_node *n;
++  FOR_EACH_FUNCTION (n)
++    {
++      gcc_assert (n->decl && TREE_TYPE (n->decl));
++      if (n->callers != NULL)
++        continue;
++      tree ftype = TREE_TYPE (n->decl);
++      tree ctype = TYPE_CANONICAL (ftype);
++      if (!ctype || !unreachable.count (TYPE_UID (ctype))
++          || unsafe_types->count (TYPE_UID (ftype))
++          || TREE_CODE (ftype) == METHOD_TYPE || n->callers != NULL
++          || !n->definition || n->alias || n->thunk.thunk_p || n->clones)
++        continue;
++      if (dump_file)
++        fprintf (dump_file, "%s is not used\n", n->dump_name ());
++      nremove++;
++    }
++  return nremove;
++}
++
++static void
++dump_stats (struct icp_stats &st)
++{
++  fprintf (dump_file, "\nSTATS: %i candidates for indirect call promotion,"
++           " %i substituted, %i speculatively promoted, %i cold\n"
++           "%i have multiple targets, %i already speculated, %i external,"
++           " %i not defined, %i artificial, %i polymorphic calls,"
++           " %i overwritable\n", st.nicp, st.nsubst, st.nspec, st.ncold,
++           st.nmultiple, st.nspeculated, st.nexternal, st.nnotdefined,
++           st.nartificial, st.npolymorphic, st.noverwritable);
++  if (!(dump_flags & TDF_STATS))
++    return;
++  fprintf (dump_file, "EXTRA STATS: %i functions, %i indirect calls,"
++           " %i total calls, %i called only indirectly, %i may be removed\n"
++           "Indirect call sites with found targets ", st.nf, st.nindir,
++           st.ncalls, st.nind_only, st.nremove);
++  for (unsigned i = 0; i < MAX_TARG_STAT; i++)
++    fprintf (dump_file, "%u:%i, ", i + 1, st.ntargs[i]);
++  fprintf (dump_file, "more:%i\n", st.ntargs[MAX_TARG_STAT]);
++}
++
++/* Optimize indirect calls.  When an indirect call has only one target,
++   promote it into a direct call.  */
++
++static bool
++optimize_indirect_calls ()
++{
++  /* TODO: maybe move to the top of ipa_icp.  */
++  if (has_address_taken_functions_with_varargs)
++    {
++      if (dump_file)
++        fprintf (dump_file, "\n\nAddress taken function with varargs is found."
++                 " Skip the optimization.\n");
++      return false;
++    }
++  struct icp_stats stats = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
++                            0, 0, 0, 0, 0, {0, 0, 0, 0, 0}};
++  /* At first assume all function types are unreachable.  */
++  type_set unreachable_ftypes;
++  if (dump_file && (dump_flags & TDF_STATS))
++    for (type_decl_map::iterator it = fs_map->begin ();
++         it != fs_map->end (); ++it)
++      unreachable_ftypes.insert (it->first);
++
++  struct cgraph_node *n;
++  FOR_EACH_DEFINED_FUNCTION (n)
++    {
++      if (dump_file)
++        dump_processing_function (n, stats);
++      struct cgraph_edge *e;
++      bool update = false;
++      if (!opt_for_fn (n->decl, flag_icp) || !n->has_gimple_body_p ()
++          || n->inlined_to || !n->indirect_calls)
++        {
++          if (dump_file)
++            fprintf (dump_file, "Skip the function\n");
++          continue;
++        }
++      /* If the function has indirect calls which are not polymorphic,
++         process its body, otherwise continue.  */
++      bool non_polymorphic_calls = false;
++      for (e = n->indirect_calls; e; e = e->next_callee)
++        if (!e->indirect_info->polymorphic)
++          {
++            non_polymorphic_calls = true;
++            break;
++          }
++      if (!non_polymorphic_calls)
++        {
++          if (dump_file)
++            fprintf (dump_file, "All indirect calls are polymorphic, "
++                     "skip...\n");
++          continue;
++        }
++      /* Get the function body to operate with call statements.  */
++      n->get_body ();
++      /* Walk indirect call sites and apply the optimization.  */
++      cgraph_edge *next;
++      for (e = n->indirect_calls; e; e = next)
++        {
++          next = e->next_callee;
++          if (e->indirect_info->polymorphic)
++            {
++              if (dump_file)
++                fprintf (dump_file, "Target is polymorphic, skip...\n\n");
++              stats.npolymorphic++;
++              continue;
++            }
++          stats.nicp++;
++          struct cgraph_node *likely_target = NULL;
++          gcall *stmt = e->call_stmt;
++          gcc_assert (stmt != NULL);
++          tree call_fn = gimple_call_fn (stmt);
++          tree call_fn_ty = TREE_TYPE (call_fn);
++          if (dump_file)
++            dump_indirect_call_site (call_fn, call_fn_ty);
++          tree decl = NULL_TREE;
++          if (POINTER_TYPE_P (call_fn_ty))
++            {
++              if (dump_file)
++                dump_type_with_uid ("Pointee type: ", TREE_TYPE (call_fn_ty));
++              if (dump_file && (dump_flags & TDF_STATS))
++                erase_from_unreachable (TYPE_UID (TREE_TYPE (call_fn_ty)),
++                                        unreachable_ftypes);
++              /* Try to use the signature analysis results.  */
++              tree ctype = TYPE_CANONICAL (TREE_TYPE (call_fn_ty));
++              unsigned ctype_uid = ctype ? TYPE_UID (ctype) : 0;
++              if (ctype_uid && fs_map->count (ctype_uid))
++                {
++                  if (dump_file && (dump_flags & TDF_STATS))
++                    erase_from_unreachable (ctype_uid, unreachable_ftypes);
++                  decl_set *decls = (*fs_map)[ctype_uid];
++                  if (dump_file)
++                    dump_found_fdecls (decls, ctype_uid);
++                  /* TODO: optimize for multiple targets.  */
++                  if (!unsafe_types->count (ctype_uid) && decls->size () == 1)
++                    {
++                      decl = *(decls->begin ());
++                      likely_target = cgraph_node::get (decl);
++                    }
++                  if (!unsafe_types->count (ctype_uid)
++                      && (dump_flags & TDF_STATS))
++                    count_found_targets (stats, decls->size ());
++                }
++            }
++          if (!decl || !likely_target)
++            {
++              if (dump_file)
++                fprintf (dump_file, "Callee is unknown\n\n");
++              continue;
++            }
++          if (TREE_CODE (TREE_TYPE (decl)) == METHOD_TYPE)
++            {
++              if (dump_file)
++                fprintf (dump_file, "Callee is method\n\n");
++              continue;
++            }
++          if (e->speculative)
++            {
++              if (dump_file)
++                fprintf (dump_file, "Call is already speculated\n\n");
++              stats.nspeculated++;
++              continue;
++            }
++          if (!likely_target->definition)
++            {
++              if (dump_file)
++                fprintf (dump_file, "Target is not a definition\n\n");
++              stats.nnotdefined++;
++              continue;
++            }
++          /* Do not introduce new references to external symbols.  While we
++             can handle these just well, it is common for programs to be
++             built incorrectly with headers defining methods they are linked
++             with.  */
++          if (DECL_EXTERNAL (likely_target->decl))
++            {
++              if (dump_file)
++                fprintf (dump_file, "Target is external\n\n");
++              stats.nexternal++;
++              continue;
++            }
++          /* Don't use an implicitly-declared destructor (c++/58678).  */
++          struct cgraph_node *non_thunk_target
++            = likely_target->function_symbol ();
++          if (DECL_ARTIFICIAL (non_thunk_target->decl))
++            {
++              if (dump_file)
++                fprintf (dump_file, "Target is artificial\n\n");
++              stats.nartificial++;
++              continue;
++            }
++          if (likely_target->get_availability () <= AVAIL_INTERPOSABLE
++              && likely_target->can_be_discarded_p ())
++            {
++              if (dump_file)
++                fprintf (dump_file, "Target is overwritable\n\n");
++              stats.noverwritable++;
++              continue;
++            }
++          else if (dbg_cnt (icp))
++            {
++              promote_call (e, n, likely_target, &stats);
++              update = true;
++            }
++        }
++      if (update)
++        ipa_update_overall_fn_summary (n);
++    }
++
++  if (dump_file && (dump_flags & TDF_STATS))
++    stats.nremove = find_functions_can_be_removed (unreachable_ftypes);
++
++  if (dump_file)
++    dump_stats (stats);
++  return stats.nsubst || stats.nspec;
++}
++
++/* Delete the given MAP with allocated sets.  One set may be associated with
++   more than one type/decl.  */
++
++template <typename MAP>
++static void
++remove_type_alias_map (MAP *map)
++{
++  std::set<typename MAP::mapped_type> processed_sets;
++  for (typename MAP::iterator it = map->begin (); it != map->end (); it++)
++    {
++      typename MAP::mapped_type set = it->second;
++      if (processed_sets.count (set) != 0)
++        continue;
++      processed_sets.insert (set);
++      delete set;
++    }
++  delete map;
++}
++
++/* The ipa indirect call promotion pass.  Run required analysis and optimize
++   indirect calls.  When an indirect call has only one target, promote it
++   into a direct call.  */
++
++static unsigned int
++ipa_icp (void)
++{
++  ta_map = new type_alias_map;
++  fta_map = new type_alias_map;
++  cbase_to_ptype = new type_alias_map;
++  fs_map = new type_decl_map;
++  ctype_map = new type_map;
++  unsafe_types = new type_set;
++  type_uid_map = new uid_to_type_map;
++
++  /* Find type aliases, fill the function signature map and
++     optimize indirect calls.  */
++  collect_function_type_aliases ();
++  collect_function_signatures ();
++  bool optimized = optimize_indirect_calls ();
++
++  remove_type_alias_map (ta_map);
++  remove_type_alias_map (fta_map);
++  remove_type_alias_map (cbase_to_ptype);
++  remove_type_alias_map (fs_map);
++  delete ctype_map;
++  delete unsafe_types;
++  delete type_uid_map;
++
++  return optimized ?
TODO_remove_functions : 0; ++} ++ ++namespace { ++ ++const pass_data pass_data_ipa_icp = ++{ ++ IPA_PASS, /* type */ ++ "icp", /* name */ ++ OPTGROUP_NONE, /* optinfo_flags */ ++ TV_IPA_ICP, /* tv_id */ ++ 0, /* properties_required */ ++ 0, /* properties_provided */ ++ 0, /* properties_destroyed */ ++ 0, /* todo_flags_start */ ++ 0, /* todo_flags_finish */ ++}; ++ ++class pass_ipa_icp : public ipa_opt_pass_d ++{ ++public: ++ pass_ipa_icp (gcc::context *ctxt) ++ : ipa_opt_pass_d (pass_data_ipa_icp, ctxt, ++ NULL, /* generate_summary */ ++ NULL, /* write_summary */ ++ NULL, /* read_summary */ ++ NULL, /* write_optimization_summary */ ++ NULL, /* read_optimization_summary */ ++ NULL, /* stmt_fixup */ ++ 0, /* function_transform_todo_flags_start */ ++ NULL, /* function_transform */ ++ NULL) /* variable_transform */ ++ {} ++ ++ /* opt_pass methods: */ ++ virtual bool gate (function *) ++ { ++ return (optimize && flag_icp && !seen_error () ++ && (in_lto_p || flag_whole_program)); ++ } ++ ++ virtual unsigned int execute (function *) { return ipa_icp (); } ++ ++}; // class pass_ipa_icp ++ ++} // anon namespace ++ ++ipa_opt_pass_d * ++make_pass_ipa_icp (gcc::context *ctxt) ++{ ++ return new pass_ipa_icp (ctxt); ++} + + #include "gt-ipa-devirt.h" +diff --git a/gcc/passes.def b/gcc/passes.def +index ea50db086..2685018cd 100644 +--- a/gcc/passes.def ++++ b/gcc/passes.def +@@ -152,6 +152,7 @@ along with GCC; see the file COPYING3. If not see + NEXT_PASS (pass_ipa_profile); + NEXT_PASS (pass_ipa_icf); + NEXT_PASS (pass_ipa_devirt); ++ NEXT_PASS (pass_ipa_icp); + NEXT_PASS (pass_ipa_cp); + NEXT_PASS (pass_ipa_sra); + NEXT_PASS (pass_ipa_cdtor_merge); +diff --git a/gcc/testsuite/gcc.dg/icp1.c b/gcc/testsuite/gcc.dg/icp1.c +new file mode 100644 +index 000000000..c2117f738 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/icp1.c +@@ -0,0 +1,40 @@ ++/* { dg-do run } */ ++/* { dg-options "-O2 -flto -ficp -fdump-ipa-icp=./icp1.c.077i.icp" } */ ++ ++int dummy = 0; ++ ++typedef int (*ftype1)(int a); ++typedef float (*ftype2)(int a); ++ ++ftype1 func1; ++ ++struct { ++ int a; ++ int* b; ++ ftype1 myf1; ++ ftype2 myf2; ++} my_str; ++ ++int foo(int a) { ++ my_str.myf1 = func1; ++ if (a % 2 == 0) ++ dummy += dummy % (dummy - a); ++ return a + 1; ++} ++ ++float bar(int a) { ++ my_str.myf2 = &bar; ++ func1 = &foo; ++ return foo(a); ++} ++ ++int main() { ++ bar(1); ++ my_str.myf2(3); ++ return (my_str.myf1(2) + func1(4)) != 8; ++} ++ ++/* { dg-final { scan-ipa-dump "The call is substituted by:.*= foo \\(4\\);" "icp" } } */ ++/* { dg-final { scan-ipa-dump "The call is substituted by:.*= foo \\(2\\);" "icp" } } */ ++/* { dg-final { scan-ipa-dump "The call is substituted by: bar \\(3\\);" "icp" } } */ ++/* { dg-final { scan-ipa-dump "STATS: 3 candidates for indirect call promotion, 3 substituted, 0 speculatively promoted, 0 cold" "icp" } } */ +diff --git a/gcc/testsuite/gcc.dg/icp2.c b/gcc/testsuite/gcc.dg/icp2.c +new file mode 100644 +index 000000000..03d31d407 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/icp2.c +@@ -0,0 +1,38 @@ ++/* { dg-do run } */ ++/* { dg-options "-O2 -flto -ficp -fdump-ipa-icp=./icp2.c.077i.icp" } */ ++ ++int dummy = 0; ++ ++typedef int (*ftype1)(int a); ++typedef float (*ftype2)(int a); ++ ++ftype1 func1; ++ ++struct { ++ int a; ++ int* b; ++ ftype1 myf1; ++ ftype2 myf2; ++} my_str; ++ ++int foo(int a) { ++ my_str.myf1 = func1; ++ if (a % 2 == 0) ++ dummy += dummy % (dummy - a); ++ return a + 1; ++} ++ ++float bar(int a) { ++ my_str.myf2 = dummy ? 
(ftype2) &foo : &bar;
++  func1 = (ftype1) &bar;
++  return foo(a);
++}
++
++int main() {
++  bar(1);
++  my_str.myf2(3);
++  return (my_str.myf1(2) + func1(4)) != 8;
++}
++
++/* { dg-final { scan-ipa-dump-not "The call is substituted by.*" "icp" } } */
++/* { dg-final { scan-ipa-dump "STATS: 3 candidates for indirect call promotion, 0 substituted, 0 speculatively promoted, 0 cold" "icp" } } */
+diff --git a/gcc/testsuite/gcc.dg/icp3.c b/gcc/testsuite/gcc.dg/icp3.c
+new file mode 100644
+index 000000000..2a7d1e6f5
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/icp3.c
+@@ -0,0 +1,52 @@
++/* { dg-do run } */
++/* { dg-options "-O2 -flto -ficp -fdump-ipa-icp=./icp3.c.077i.icp" } */
++
++#include <stdio.h>
++
++int dummy = 0;
++
++typedef int (*ftype1)(int a);
++typedef float (*ftype2)(int a);
++typedef ftype1 (*ftype3) (ftype2);
++
++ftype1 func1;
++
++struct {
++  int a;
++  int* b;
++  ftype1 myf1;
++  ftype2 myf2;
++  ftype3 myf3;
++} my_str;
++
++ftype1 boo(ftype2 a) {
++  printf ("Call boo\n");
++  return (ftype1) a;
++}
++
++int foo(int a) {
++  printf ("Call foo\n");
++  my_str.myf1 = func1;
++  if (a % 2 == 0)
++    dummy += dummy % (dummy - a);
++  return a + 1;
++}
++
++float bar(int a) {
++  printf("Call bar\n");
++  my_str.myf2 = (ftype2) my_str.myf3((ftype2) foo);
++  func1 = &foo;
++  return foo(a);
++}
++
++int main() {
++  my_str.myf3 = &boo;
++  bar(1);
++  my_str.myf2(3);
++  return (my_str.myf1(2) + func1(4)) != 8;
++}
++
++/* { dg-final { scan-ipa-dump "The call is substituted by:.*= foo \\(4\\);" "icp" } } */
++/* { dg-final { scan-ipa-dump "The call is substituted by:.*= foo \\(2\\);" "icp" } } */
++/* { dg-final { scan-ipa-dump "The call is substituted by: foo \\(3\\);" "icp" } } */
++/* { dg-final { scan-ipa-dump "STATS: 4 candidates for indirect call promotion, 3 substituted, 0 speculatively promoted, 0 cold" "icp" } } */
+diff --git a/gcc/testsuite/gcc.dg/icp4.c b/gcc/testsuite/gcc.dg/icp4.c
+new file mode 100644
+index 000000000..e3e1d5116
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/icp4.c
+@@ -0,0 +1,55 @@
++/* { dg-do run } */
++/* { dg-options "-O2 -flto -ficp -fdump-ipa-icp=./icp4.c.077i.icp" } */
++
++#include <stdio.h>
++
++int dummy = 0;
++
++typedef int (*ftype1)(int a);
++typedef float (*ftype2)(int a);
++typedef ftype1 (*ftype3) (ftype2);
++
++ftype1 func1;
++ftype1 boo(ftype2 a);
++int foo(int a);
++float bar(int a);
++
++typedef struct {
++  int a;
++  int* b;
++  ftype1 myf1;
++  ftype2 myf2;
++  ftype3 myf3;
++} T;
++
++T my_str = {0, (int*) &dummy, (ftype1) &boo, (ftype2) &foo, (ftype3) &bar};
++
++ftype1 boo(ftype2 a) {
++  printf ("Call boo\n");
++  return (ftype1) a;
++}
++
++int foo(int a) {
++  printf ("Call foo\n");
++  my_str.myf1 = func1;
++  if (a % 2 == 0)
++    dummy += dummy % (dummy - a);
++  return a + 1;
++}
++
++float bar(int a) {
++  printf("Call bar\n");
++  my_str.myf2 = (ftype2) my_str.myf3((ftype2) foo);
++  func1 = &foo;
++  return foo(a);
++}
++
++int main() {
++  my_str.myf3 = &boo;
++  bar(1);
++  my_str.myf2(3);
++  return (my_str.myf1(2) + func1(4)) != 8;
++}
++
++/* { dg-final { scan-ipa-dump-not "The call is substituted by.*" "icp" } } */
++/* { dg-final { scan-ipa-dump "STATS: 4 candidates for indirect call promotion, 0 substituted, 0 speculatively promoted, 0 cold" "icp" } } */
+diff --git a/gcc/testsuite/gcc.dg/icp5.c b/gcc/testsuite/gcc.dg/icp5.c
+new file mode 100644
+index 000000000..c7709243c
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/icp5.c
+@@ -0,0 +1,66 @@
++/* { dg-do run } */
++/* { dg-options "-O2 -flto -ficp -fdump-ipa-icp=./icp5.c.077i.icp" } */
++
++#include <stdio.h>
++
++int dummy = 0;
++
++typedef int (*ftype1)(int a);
++typedef float (*ftype2)(int a);
++typedef ftype1 (*ftype3) (ftype2);
++
++ftype1 func1;
++ftype1 boo(ftype2 a);
++int foo(int a);
++float bar(int a);
++
++typedef struct {
++  int a;
++  int* b;
++  ftype1 myf1;
++  ftype2 myf2;
++  ftype3 myf3;
++} T;
++
++T my_str;
++
++typedef struct {
++  int a;
++  int* b;
++  ftype3 myf1;
++  ftype2 myf2;
++  ftype1 myf3;
++} T1;
++
++T1 my1 = {0, &dummy, boo, &bar, &foo};
++
++ftype1 boo(ftype2 a) {
++  printf("Call boo\n");
++  return (ftype1) a;
++}
++
++int foo(int a) {
++  printf("Call foo\n");
++  my_str.myf1 = func1;
++  if (a % 2 == 0)
++    dummy += dummy % (dummy - a);
++  return a + 1;
++}
++
++float bar(int a) {
++  printf("Call bar\n");
++  my_str.myf2 = (ftype2) my_str.myf3((ftype2) foo);
++  func1 = &foo;
++  return foo(a);
++}
++
++int main() {
++  my_str = *(T*)&my1;
++  my_str.myf3 = &boo;
++  bar(1);
++  my_str.myf2(3);
++  return (my_str.myf1(2) + func1(4)) != 8;
++}
++
++/* { dg-final { scan-ipa-dump-not "The call is substituted by.*" "icp" } } */
++/* { dg-final { scan-ipa-dump "STATS: 4 candidates for indirect call promotion, 0 substituted, 0 speculatively promoted, 0 cold" "icp" } } */
+diff --git a/gcc/testsuite/gcc.dg/icp6.c b/gcc/testsuite/gcc.dg/icp6.c
+new file mode 100644
+index 000000000..5a9f15045
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/icp6.c
+@@ -0,0 +1,66 @@
++/* { dg-do run } */
++/* { dg-options "-O2 -flto -ficp -fdump-ipa-icp=./icp6.c.077i.icp -Wno-int-conversion -Wno-incompatible-pointer-types" } */
++int dummy = 0;
++
++typedef int (*ftype1)(int a);
++typedef float (*ftype2)(int a);
++typedef int (*ftype3)();
++typedef int (*ftype4)(int a, int b);
++
++ftype1 func1;
++ftype4 func2;
++
++struct {
++  int a;
++  int* b;
++  ftype1 myf1;
++  ftype2 myf2;
++  ftype3 myf3;
++} my_str;
++
++int foo3(float a) {
++  return dummy;
++}
++
++int foo4(int a, int b) {
++  return a*b;
++}
++
++int foo(int a) {
++  my_str.myf1 = func1;
++  if (a % 2 == 0)
++    dummy += dummy % (dummy - a);
++  return a + 1;
++}
++
++int foo2(float a) {
++  func1 = (ftype1) &foo;
++  func2 = &foo4;
++  return dummy + foo3 (a);
++}
++
++float bar2(int a) {
++  my_str.myf2 = (ftype2)(0x864213);
++  func2 = 0x65378;
++  return foo(a);
++}
++
++float bar(int a) {
++  my_str.myf3 = &foo2;
++  my_str.myf2 = &bar;
++  func1 = (ftype1) &dummy;
++  func2 = (ftype4) &bar2;
++  return foo(a);
++}
++
++int main() {
++  bar(1);
++  bar2(1);
++  bar(0);
++  my_str.myf2(3);
++  ((ftype1) my_str.myf3)(0.0);
++  int sum = func1(4);
++  return (sum + my_str.myf1(2) + func2(5, 6)) != 38;
++}
++/* { dg-final { scan-ipa-dump "The call is substituted by.*foo2 \\(0\\);" "icp" } } */
++/* { dg-final { scan-ipa-dump "STATS: 5 candidates for indirect call promotion, 1 substituted, 0 speculatively promoted, 0 cold" "icp" } } */
+diff --git a/gcc/testsuite/gcc.dg/icp7.c b/gcc/testsuite/gcc.dg/icp7.c
+new file mode 100644
+index 000000000..fa52197f4
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/icp7.c
+@@ -0,0 +1,48 @@
++/* { dg-do run } */
++/* { dg-options "-O2 -flto -ficp -fdump-ipa-icp=./icp7.c.077i.icp" } */
++
++#include <stdarg.h>
++
++int dummy = 0;
++
++typedef int (*ftype1)(int a);
++typedef float (*ftype2)(int a);
++
++ftype1 func1;
++
++struct {
++  int a;
++  int* b;
++  ftype1 myf1;
++  ftype2 myf2;
++} my_str;
++
++int boo(int a, ...)
{ ++ va_list ap; ++ va_start(ap, a); ++ if (a == 0) ++ dummy += va_arg(ap, int); ++ va_end(ap); ++ return dummy; ++} ++ ++int foo(int a) { ++ my_str.myf1 = func1; ++ if (a % 2 == 0) ++ dummy += dummy % (dummy - a); ++ return a + 1; ++} ++ ++float bar(int a) { ++ my_str.myf2 = &bar; ++ func1 = (ftype1) &boo; ++ return foo(a); ++} ++ ++int main() { ++ bar(1); ++ my_str.myf2(3); ++ return (my_str.myf1(2) + func1(4)); ++} ++ ++/* { dg-final { scan-ipa-dump "Address taken function with varargs is found. Skip the optimization." "icp" } } */ +diff --git a/gcc/timevar.def b/gcc/timevar.def +index 2814b14f2..e12b0e50d 100644 +--- a/gcc/timevar.def ++++ b/gcc/timevar.def +@@ -71,6 +71,7 @@ DEFTIMEVAR (TV_CGRAPHOPT , "callgraph optimization") + DEFTIMEVAR (TV_CGRAPH_FUNC_EXPANSION , "callgraph functions expansion") + DEFTIMEVAR (TV_CGRAPH_IPA_PASSES , "callgraph ipa passes") + DEFTIMEVAR (TV_IPA_ODR , "ipa ODR types") ++DEFTIMEVAR (TV_IPA_ICP , "ipa indirect call promotion") + DEFTIMEVAR (TV_IPA_FNSUMMARY , "ipa function summary") + DEFTIMEVAR (TV_IPA_UNREACHABLE , "ipa dead code removal") + DEFTIMEVAR (TV_IPA_INHERITANCE , "ipa inheritance graph") +diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h +index 3cdc12466..f41c2692d 100644 +--- a/gcc/tree-pass.h ++++ b/gcc/tree-pass.h +@@ -506,6 +506,7 @@ extern ipa_opt_pass_d *make_pass_ipa_cp (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_sra (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_icf (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_devirt (gcc::context *ctxt); ++extern ipa_opt_pass_d *make_pass_ipa_icp (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_odr (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_reference (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_hsa (gcc::context *ctxt); +-- +2.33.0 + diff --git a/0157-Add-split-complex-instructions-pass.patch b/0157-Add-split-complex-instructions-pass.patch new file mode 100644 index 0000000..cf934ae --- /dev/null +++ b/0157-Add-split-complex-instructions-pass.patch @@ -0,0 +1,1241 @@ +From 987c93481282335e81dac84db73b9f98ff5df6b0 Mon Sep 17 00:00:00 2001 +From: Agrachev Andrey WX1228450 +Date: Tue, 12 Dec 2023 09:56:49 +0800 +Subject: [PATCH 4/6] Add split-complex-instructions pass + +--- + gcc/common.opt | 5 + + gcc/config/aarch64/aarch64.c | 42 ++ + gcc/doc/tm.texi | 8 + + gcc/doc/tm.texi.in | 4 + + gcc/params.opt | 8 + + gcc/passes.def | 1 + + gcc/sched-rgn.c | 702 ++++++++++++++++++ + gcc/target.def | 10 + + .../gcc.dg/rtl/aarch64/test-ldp-dont-split.c | 74 ++ + .../rtl/aarch64/test-ldp-split-rearrange.c | 40 + + .../gcc.dg/rtl/aarch64/test-ldp-split.c | 174 +++++ + gcc/timevar.def | 1 + + gcc/tree-pass.h | 1 + + 13 files changed, 1070 insertions(+) + create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c + create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c + create mode 100644 gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c + +diff --git a/gcc/common.opt b/gcc/common.opt +index f2c53cc31..36b016253 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1705,6 +1705,11 @@ floop-nest-optimize + Common Report Var(flag_loop_nest_optimize) Optimization + Enable the loop nest optimizer. + ++fsplit-ldp-stp ++Common Report Var(flag_split_ldp_stp) Optimization ++Split load/store pair instructions into separate load/store operations ++for better performance. 
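++; As an illustration (a sketch; the exact registers and offsets depend on
++; the surrounding code), when a recent store aliases one half of the pair,
++;   str x0, [sp, 8]
++;   ldp x29, x30, [sp, 8]
++; may be rewritten as
++;   str x0, [sp, 8]
++;   ldr x29, [sp, 8]
++;   ldr x30, [sp, 16]
++; which can avoid a stall on cores where a load pair cannot forward from a
++; partially overlapping store.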
++ + fstrict-volatile-bitfields + Common Report Var(flag_strict_volatile_bitfields) Init(-1) Optimization + Force bitfield accesses to match their type width. +diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c +index afacb5a6a..ae9e0802b 100644 +--- a/gcc/config/aarch64/aarch64.c ++++ b/gcc/config/aarch64/aarch64.c +@@ -23858,6 +23858,48 @@ aarch64_run_selftests (void) + + #endif /* #if CHECKING_P */ + ++/* TODO: refuse to use ranges intead of full list of an instruction codes. */ ++ ++bool ++is_aarch64_ldp_insn (int icode) ++{ ++ if ((icode >= CODE_FOR_load_pair_sw_sisi ++ && icode <= CODE_FOR_load_pair_dw_tftf) ++ || (icode >= CODE_FOR_loadwb_pairsi_si ++ && icode <= CODE_FOR_loadwb_pairtf_di) ++ || (icode >= CODE_FOR_load_pairv8qiv8qi ++ && icode <= CODE_FOR_load_pairdfdf) ++ || (icode >= CODE_FOR_load_pairv16qiv16qi ++ && icode <= CODE_FOR_load_pairv8bfv2df) ++ || (icode >= CODE_FOR_load_pair_lanesv8qi ++ && icode <= CODE_FOR_load_pair_lanesdf)) ++ return true; ++ return false; ++} ++ ++bool ++is_aarch64_stp_insn (int icode) ++{ ++ if ((icode >= CODE_FOR_store_pair_sw_sisi ++ && icode <= CODE_FOR_store_pair_dw_tftf) ++ || (icode >= CODE_FOR_storewb_pairsi_si ++ && icode <= CODE_FOR_storewb_pairtf_di) ++ || (icode >= CODE_FOR_vec_store_pairv8qiv8qi ++ && icode <= CODE_FOR_vec_store_pairdfdf) ++ || (icode >= CODE_FOR_vec_store_pairv16qiv16qi ++ && icode <= CODE_FOR_vec_store_pairv8bfv2df) ++ || (icode >= CODE_FOR_store_pair_lanesv8qi ++ && icode <= CODE_FOR_store_pair_lanesdf)) ++ return true; ++ return false; ++} ++ ++#undef TARGET_IS_LDP_INSN ++#define TARGET_IS_LDP_INSN is_aarch64_ldp_insn ++ ++#undef TARGET_IS_STP_INSN ++#define TARGET_IS_STP_INSN is_aarch64_stp_insn ++ + #undef TARGET_STACK_PROTECT_GUARD + #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard + +diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi +index ef3566510..ac1d665c5 100644 +--- a/gcc/doc/tm.texi ++++ b/gcc/doc/tm.texi +@@ -11870,6 +11870,14 @@ object files that are not referenced from @code{main} and uses export + lists. + @end defmac + ++@deftypefn {Target Hook} bool TARGET_IS_LDP_INSN (int @var{icode}) ++Return true if icode is corresponding to any of the LDP instruction types. ++@end deftypefn ++ ++@deftypefn {Target Hook} bool TARGET_IS_STP_INSN (int @var{icode}) ++Return true if icode is corresponding to any of the STP instruction types. ++@end deftypefn ++ + @deftypefn {Target Hook} bool TARGET_CANNOT_MODIFY_JUMPS_P (void) + This target hook returns @code{true} past the point in which new jump + instructions could be created. On machines that require a register for +diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in +index 945d0f696..0cd70dda4 100644 +--- a/gcc/doc/tm.texi.in ++++ b/gcc/doc/tm.texi.in +@@ -8010,6 +8010,10 @@ object files that are not referenced from @code{main} and uses export + lists. + @end defmac + ++@hook TARGET_IS_LDP_INSN ++ ++@hook TARGET_IS_STP_INSN ++ + @hook TARGET_CANNOT_MODIFY_JUMPS_P + + @hook TARGET_HAVE_CONDITIONAL_EXECUTION +diff --git a/gcc/params.opt b/gcc/params.opt +index 227175eef..450d227c6 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -1005,6 +1005,14 @@ Target size of compressed pointer, which should be 8, 16 or 32. + Common Joined UInteger Var(semi_relayout_level) Init(13) IntegerRange(11, 15) Param Optimization + Set capacity of each bucket to semi-relayout to (1 << semi-relayout-level) / 8 . 
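++; The parameter added below bounds the backward walk over insns (including
++; into predecessor blocks) that the split-ldp-stp pass performs while
++; looking for a store the load pair depends on; raising it may find more
++; split candidates at the cost of extra compile time.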
+
++-param=param-ldp-dependency-search-range=
++Common Joined UInteger Var(param_ldp_dependency_search_range) Init(16) IntegerRange(1, 32) Param Optimization
++Range for dependent ldp search in the split-ldp-stp pass.
++
++-param=param-relayout-bucket-num=
++Common Joined UInteger Var(param_relayout_bucket_num) Init(15) IntegerRange(1, 20) Param Optimization
++Number of buckets for relayout.
++
+ -param=mem-access-ratio=
+ Common Joined UInteger Var(param_mem_access_ratio) Init(20) IntegerRange(0, 100) Param Optimization
+ Memory access ratio (in percent).
+diff --git a/gcc/passes.def b/gcc/passes.def
+index 4e6a58634..ba13d897c 100644
+--- a/gcc/passes.def
++++ b/gcc/passes.def
+@@ -487,6 +487,7 @@ along with GCC; see the file COPYING3.  If not see
+       NEXT_PASS (pass_reorder_blocks);
+       NEXT_PASS (pass_leaf_regs);
+       NEXT_PASS (pass_split_before_sched2);
++      NEXT_PASS (pass_split_complex_instructions);
+       NEXT_PASS (pass_sched2);
+       NEXT_PASS (pass_stack_regs);
+       PUSH_INSERT_PASSES_WITHIN (pass_stack_regs)
+diff --git a/gcc/sched-rgn.c b/gcc/sched-rgn.c
+index 7f5dfdb3d..32f4489d8 100644
+--- a/gcc/sched-rgn.c
++++ b/gcc/sched-rgn.c
+@@ -44,6 +44,8 @@ along with GCC; see the file COPYING3.  If not see
+    are actually scheduled.  */
+ 
+ #include "config.h"
++#define INCLUDE_SET
++#define INCLUDE_VECTOR
+ #include "system.h"
+ #include "coretypes.h"
+ #include "backend.h"
+@@ -65,6 +67,7 @@ along with GCC; see the file COPYING3.  If not see
+ #include "dbgcnt.h"
+ #include "pretty-print.h"
+ #include "print-rtl.h"
++#include "cfgrtl.h"
+ 
+ /* Disable warnings about quoting issues in the pp_xxx calls below
+    that (intentionally) don't follow GCC diagnostic conventions.  */
+@@ -3955,6 +3958,705 @@ make_pass_sched_fusion (gcc::context *ctxt)
+   return new pass_sched_fusion (ctxt);
+ }
+ 
++namespace {
++
++/* Helper functions for def-use analysis.  */
++
++static struct df_link *
++get_defs (rtx_insn *insn, rtx reg)
++{
++  df_ref use;
++  struct df_link *ref_chain, *ref_link;
++
++  FOR_EACH_INSN_USE (use, insn)
++    {
++      if (GET_CODE (DF_REF_REG (use)) == SUBREG)
++	return NULL;
++      if (REGNO (DF_REF_REG (use)) == REGNO (reg))
++	break;
++    }
++
++  gcc_assert (use != NULL);
++
++  ref_chain = DF_REF_CHAIN (use);
++
++  for (ref_link = ref_chain; ref_link; ref_link = ref_link->next)
++    {
++      /* Problem getting some definition for this instruction.  */
++      if (ref_link->ref == NULL)
++	return NULL;
++      if (DF_REF_INSN_INFO (ref_link->ref) == NULL)
++	return NULL;
++      /* As global regs are assumed to be defined at each function call
++	 dataflow can report a call_insn as being a definition of REG.
++	 But we can't do anything with that in this pass so proceed only
++	 if the instruction really sets REG in a way that can be deduced
++	 from the RTL structure.  */
++      if (global_regs[REGNO (reg)]
++	  && !set_of (reg, DF_REF_INSN (ref_link->ref)))
++	return NULL;
++    }
++
++  return ref_chain;
++}
++
++static struct df_link *
++get_uses (rtx_insn *insn, rtx reg)
++{
++  df_ref def;
++  struct df_link *ref_chain, *ref_link;
++
++  FOR_EACH_INSN_DEF (def, insn)
++    if (REGNO (DF_REF_REG (def)) == REGNO (reg))
++      break;
++
++  gcc_assert (def != NULL && "Broken def-use analysis chain.");
++
++  ref_chain = DF_REF_CHAIN (def);
++
++  for (ref_link = ref_chain; ref_link; ref_link = ref_link->next)
++    {
++      /* Problem getting some use for this instruction.  */
++      if (ref_link->ref == NULL)
++	return NULL;
++    }
++
++  return ref_chain;
++}
++
++const pass_data pass_data_split_complex_instructions = {
++  RTL_PASS,				/* Type.
*/ ++ "split_complex_instructions", /* Name. */ ++ OPTGROUP_NONE, /* Optinfo_flags. */ ++ TV_SPLIT_CMP_INS, /* Tv_id. */ ++ 0, /* Properties_required. */ ++ 0, /* Properties_provided. */ ++ 0, /* Properties_destroyed. */ ++ 0, /* Todo_flags_start. */ ++ (TODO_df_verify | TODO_df_finish), /* Todo_flags_finish. */ ++}; ++ ++class pass_split_complex_instructions : public rtl_opt_pass ++{ ++private: ++ enum complex_instructions_t ++ { ++ UNDEFINED, ++ LDP, ++ LDP_TI, ++ STP, ++ STR ++ }; ++ ++ void split_complex_insn (rtx_insn *insn); ++ void split_ldp_ti (rtx_insn *insn); ++ void split_ldp_with_offset (rtx_insn *ldp_insn); ++ void split_simple_ldp (rtx_insn *ldp_insn); ++ void split_ldp_stp (rtx_insn *insn); ++ complex_instructions_t get_insn_type (rtx_insn *insn); ++ ++ basic_block bb; ++ rtx_insn *insn; ++ std::set dependent_stores_candidates; ++ std::set ldp_to_split_list; ++ ++ complex_instructions_t complex_insn_type = UNDEFINED; ++ bool is_store_insn (rtx_insn *insn); ++ bool is_ldp_dependent_on_store (rtx_insn *ldp_insn, basic_block bb); ++ bool bfs_for_reg_dependent_store (rtx_insn *ldp_insn, basic_block search_bb, ++ rtx_insn *search_insn, ++ int search_range ++ = param_ldp_dependency_search_range); ++ bool is_store_reg_dependent (rtx_insn *ldp_insn, rtx_insn *str_insn); ++ void init_df (); ++ void find_dependent_stores_candidates (rtx_insn *ldp_insn); ++ int get_insn_offset (rtx_insn *insn, complex_instructions_t insn_type, ++ int *arith_operation_ptr = NULL); ++ ++public: ++ pass_split_complex_instructions (gcc::context *ctxt) ++ : rtl_opt_pass (pass_data_split_complex_instructions, ctxt) ++ { ++ } ++ /* opt_pass methods: */ ++ virtual bool gate (function *); ++ ++ virtual unsigned int ++ execute (function *) ++ { ++ enum rtx_code ldp_memref_code; ++ init_df (); ++ ldp_to_split_list.clear (); ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ FOR_BB_INSNS (bb, insn) ++ { ++ complex_instructions_t insn_type = get_insn_type (insn); ++ /* TODO: Add splitting of STP instructions. */ ++ if (insn_type != LDP && insn_type != LDP_TI) ++ continue; ++ /* TODO: Currently support only ldp_ti and ldp with REG or ++ PLUS/MINUS offset expression. 
*/ ++ if (insn_type == LDP_TI) ++ { ++ ldp_memref_code = GET_CODE (XEXP (XEXP (PATTERN (insn), 1), ++ 0)); ++ if (ldp_memref_code != REG && ldp_memref_code != PLUS ++ && ldp_memref_code != MINUS) ++ continue; ++ } ++ if (is_ldp_dependent_on_store (insn, bb)) ++ { ++ ldp_to_split_list.insert (insn); ++ } ++ } ++ } ++ ++ for (std::set::iterator i = ldp_to_split_list.begin (); ++ i != ldp_to_split_list.end (); ++i) ++ split_complex_insn (*i); ++ ++ return 0; ++ } ++}; // class pass_split_complex_instructions ++ ++bool ++pass_split_complex_instructions::is_ldp_dependent_on_store (rtx_insn *ldp_insn, ++ basic_block bb) ++{ ++ find_dependent_stores_candidates (ldp_insn); ++ return bfs_for_reg_dependent_store (ldp_insn, bb, ldp_insn); ++} ++ ++bool ++pass_split_complex_instructions::bfs_for_reg_dependent_store ( ++ rtx_insn *ldp_insn, basic_block search_bb, rtx_insn *search_insn, ++ int search_range) ++{ ++ rtx_insn *current_search_insn = search_insn; ++ ++ for (int i = search_range; i > 0; --i) ++ { ++ if (!current_search_insn) ++ return false; ++ bool checking_result ++ = is_store_reg_dependent (ldp_insn, current_search_insn); ++ if (checking_result) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, "LDP to split:\n"); ++ print_rtl_single (dump_file, ldp_insn); ++ fprintf (dump_file, "Found STR:\n"); ++ print_rtl_single (dump_file, current_search_insn); ++ } ++ return true; ++ } ++ if (current_search_insn == BB_HEAD (search_bb)) ++ { ++ /* Search in all parent BBs for the reg_dependent store. */ ++ edge_iterator ei; ++ edge e; ++ ++ FOR_EACH_EDGE (e, ei, search_bb->preds) ++ if (e->src->index != 0 ++ && bfs_for_reg_dependent_store (ldp_insn, e->src, ++ BB_END (e->src), i - 1)) ++ return true; ++ return false; ++ } ++ else ++ { ++ if (!active_insn_p (current_search_insn)) ++ i++; ++ current_search_insn = PREV_INSN (current_search_insn); ++ } ++ } ++ return false; ++} ++ ++void ++pass_split_complex_instructions::init_df () ++{ ++ df_set_flags (DF_RD_PRUNE_DEAD_DEFS); ++ df_chain_add_problem (DF_UD_CHAIN + DF_DU_CHAIN); ++ df_mir_add_problem (); ++ df_live_add_problem (); ++ df_live_set_all_dirty (); ++ df_analyze (); ++ df_set_flags (DF_DEFER_INSN_RESCAN); ++} ++ ++void ++pass_split_complex_instructions::find_dependent_stores_candidates ( ++ rtx_insn *ldp_insn) ++{ ++ dependent_stores_candidates.clear (); ++ df_ref use; ++ ++ FOR_EACH_INSN_USE (use, ldp_insn) ++ { ++ df_link *defs = get_defs (ldp_insn, DF_REF_REG (use)); ++ if (!defs) ++ return; ++ ++ for (df_link *def = defs; def; def = def->next) ++ { ++ df_link *uses ++ = get_uses (DF_REF_INSN (def->ref), DF_REF_REG (def->ref)); ++ if (!uses) ++ continue; ++ ++ for (df_link *use = uses; use; use = use->next) ++ { ++ if (DF_REF_CLASS (use->ref) == DF_REF_REGULAR ++ && is_store_insn (DF_REF_INSN (use->ref))) ++ dependent_stores_candidates.insert (DF_REF_INSN (use->ref)); ++ } ++ } ++ } ++} ++ ++bool ++pass_split_complex_instructions::is_store_reg_dependent (rtx_insn *ldp_insn, ++ rtx_insn *str_insn) ++{ ++ if (!is_store_insn (str_insn) ++ || dependent_stores_candidates.find (str_insn) ++ == dependent_stores_candidates.end ()) ++ return false; ++ ++ int ldp_offset_sign = UNDEFINED; ++ int ldp_offset ++ = get_insn_offset (ldp_insn, get_insn_type (ldp_insn), &ldp_offset_sign); ++ if (ldp_offset_sign == MINUS) ++ ldp_offset = -ldp_offset; ++ ++ int str_offset_sign = UNDEFINED; ++ int str_offset = get_insn_offset (str_insn, STR, &str_offset_sign); ++ if (str_offset_sign == MINUS) ++ str_offset = -str_offset; ++ ++ if (str_offset == ldp_offset || 
str_offset == ldp_offset + 8) ++ return true; ++ ++ return false; ++} ++ ++bool ++pass_split_complex_instructions::is_store_insn (rtx_insn *insn) ++{ ++ if (!insn) ++ return false; ++ rtx sset_b = single_set (insn); ++ /* TODO: The condition below allow to take only store instructions in which ++ the memory location's operand is either a register (base) or an plus/minus ++ operation (base + #imm). So it might make sense to add support for other ++ cases (e.g. multiply and shift). */ ++ if (sset_b && MEM_P (SET_DEST (sset_b)) ++ && GET_MODE (XEXP (sset_b, 0)) != BLKmode ++ && (GET_CODE (XEXP (XEXP (sset_b, 0), 0)) == REG ++ || (GET_CODE (XEXP (XEXP (sset_b, 0), 0)) == PLUS ++ || GET_CODE (XEXP (XEXP (sset_b, 0), 0)) == MINUS) ++ && (GET_CODE (XEXP (XEXP (XEXP (sset_b, 0), 0), 1)) == CONST_INT))) ++ return true; ++ ++ return false; ++} ++ ++int ++pass_split_complex_instructions::get_insn_offset ( ++ rtx_insn *insn, complex_instructions_t insn_type, int *arith_operation_ptr) ++{ ++ rtx insn_pat = PATTERN (insn); ++ int returned_offset = 0; ++ ++ rtx offset_expr = NULL; ++ rtx offset_value_expr = NULL; ++ ++ switch (insn_type) ++ { ++ case LDP: ++ { ++ int number_of_sub_insns = XVECLEN (insn_pat, 0); ++ ++ /* Calculate it's own ofsset of first load insn. */ ++ rtx_insn *first_load_insn = NULL; ++ if (number_of_sub_insns == 2) ++ { ++ first_load_insn ++ = make_insn_raw (copy_rtx (XVECEXP (insn_pat, 0, 0))); ++ arith_operation_ptr = NULL; ++ ++ offset_expr = XEXP (XEXP (PATTERN (first_load_insn), 1), 0); ++ if (GET_CODE (offset_expr) == PLUS ++ || GET_CODE (offset_expr) == MINUS) ++ offset_value_expr ++ = XEXP (XEXP (XEXP (PATTERN (first_load_insn), 1), 0), 1); ++ else ++ offset_expr = NULL; ++ } ++ else if (number_of_sub_insns == 3) ++ { ++ rtx_insn *offset_sub_insn ++ = make_insn_raw (copy_rtx (XVECEXP (insn_pat, 0, 0))); ++ ++ offset_expr = XEXP (PATTERN (offset_sub_insn), 1); ++ offset_value_expr = XEXP (XEXP (PATTERN (offset_sub_insn), 1), 1); ++ } ++ else ++ { ++ gcc_assert (false ++ && "Wrong number of elements in the ldp_insn vector"); ++ } ++ break; ++ } ++ case LDP_TI: ++ { ++ offset_expr = XEXP (XEXP (insn_pat, 1), 0); ++ if (GET_CODE (offset_expr) != PLUS && GET_CODE (offset_expr) != MINUS) ++ return 0; ++ offset_value_expr = XEXP (XEXP (XEXP (insn_pat, 1), 0), 1); ++ break; ++ } ++ case STR: ++ { ++ offset_expr = XEXP (XEXP (insn_pat, 0), 0); ++ /* If memory location is specified by single base register then the ++ offset is zero. 
*/ ++ if (GET_CODE (offset_expr) == REG) ++ return 0; ++ offset_value_expr = XEXP (XEXP (XEXP (insn_pat, 0), 0), 1); ++ break; ++ } ++ default: ++ { ++ if (dumps_are_enabled && dump_file) ++ { ++ fprintf (dump_file, "Instruction that was tried to split:\n"); ++ print_rtl_single (dump_file, insn); ++ } ++ gcc_assert (false && "Unsupported instruction type"); ++ break; ++ } ++ } ++ ++ if (offset_expr != NULL && offset_value_expr ++ && GET_CODE (offset_value_expr) == CONST_INT) ++ returned_offset = XINT (offset_value_expr, 0); ++ ++ if (arith_operation_ptr != NULL) ++ { ++ *arith_operation_ptr = GET_CODE (offset_expr); ++ gcc_assert ((*arith_operation_ptr == MINUS ++ || *arith_operation_ptr == PLUS) ++ && "Unexpected arithmetic operation in the offset expr"); ++ } ++ ++ return returned_offset; ++} ++ ++void ++pass_split_complex_instructions::split_simple_ldp (rtx_insn *ldp_insn) ++{ ++ rtx pat = PATTERN (ldp_insn); ++ ++ rtx_insn *mem_insn_1 = make_insn_raw (copy_rtx (XVECEXP (pat, 0, 0))); ++ rtx_insn *mem_insn_2 = make_insn_raw (copy_rtx (XVECEXP (pat, 0, 1))); ++ ++ int dest_regno = REGNO (SET_DEST (PATTERN (mem_insn_1))); ++ int src_regno; ++ ++ rtx srs_reg_insn = XEXP (SET_SRC (PATTERN (mem_insn_1)), 0); ++ ++ if (GET_CODE (srs_reg_insn) == REG) ++ src_regno = REGNO (srs_reg_insn); ++ else ++ src_regno = REGNO (XEXP (srs_reg_insn, 0)); ++ ++ rtx_insn *emited_insn_1, *emited_insn_2; ++ ++ /* in cases like ldp r1,r2,[r1] we emit ldr r2,[r1] first. */ ++ if (src_regno == dest_regno) ++ std::swap (mem_insn_1, mem_insn_2); ++ ++ emited_insn_1 = emit_insn (PATTERN (mem_insn_1)); ++ emited_insn_2 = emit_insn (PATTERN (mem_insn_2)); ++ ++ int sub_insn_1_code = recog (PATTERN (mem_insn_1), mem_insn_1, 0); ++ int sub_insn_2_code = recog (PATTERN (mem_insn_2), mem_insn_2, 0); ++ ++ INSN_CODE (emited_insn_1) = sub_insn_1_code; ++ INSN_CODE (emited_insn_2) = sub_insn_2_code; ++} ++ ++void ++pass_split_complex_instructions::split_ldp_with_offset (rtx_insn *ldp_insn) ++{ ++ rtx pat = PATTERN (ldp_insn); ++ bool post_index = true; ++ ++ rtx_insn offset_insn; ++ rtx_insn mem_insn_1; ++ rtx_insn mem_insn_2; ++ ++ int offset_insn_code; ++ int mem_insn_1_code = -1; ++ int mem_insn_2_code = -1; ++ ++ int offset = 0; ++ int arith_operation = UNDEFINED; ++ ++ for (int i = 0; i < 3; i++) ++ { ++ rtx sub_insn = XVECEXP (pat, 0, i); ++ rtx_insn *copy_of_sub_insn = make_insn_raw (copy_rtx (sub_insn)); ++ int sub_insn_code ++ = recog (PATTERN (copy_of_sub_insn), copy_of_sub_insn, 0); ++ ++ /* If sub_insn is offset related. */ ++ if (GET_RTX_CLASS (sub_insn_code) == RTX_UNARY) ++ { ++ offset_insn = *copy_of_sub_insn; ++ offset_insn_code = sub_insn_code; ++ gcc_assert (i == 0 ++ && "Offset related insn must be the first " ++ "element of a parallel insn vector"); ++ ++ offset = get_insn_offset (ldp_insn, LDP, &arith_operation); ++ } ++ else ++ { ++ if (GET_CODE (XEXP (PATTERN (copy_of_sub_insn), 0)) != REG) ++ { ++ rtx &offset_expr ++ = XEXP (XEXP (XEXP (PATTERN (copy_of_sub_insn), 0), 0), 1); ++ if (GET_CODE (offset_expr) == CONST_INT) ++ { ++ int local_offset = XINT (offset_expr, 0); ++ offset = (arith_operation == PLUS ? 
offset : -offset); ++ ++ offset_expr = GEN_INT (local_offset + offset); ++ ++ gcc_assert ( ++ (arith_operation == MINUS || arith_operation == PLUS) ++ && "Unexpected arithmetic operation in offset related " ++ "sub_insn"); ++ ++ if (i == 1) ++ post_index = false; ++ } ++ else ++ { ++ post_index = true; ++ } ++ } ++ } ++ if (i == 1) ++ { ++ mem_insn_1 = *copy_of_sub_insn; ++ mem_insn_1_code = sub_insn_code; ++ } ++ if (i == 2) ++ { ++ mem_insn_2 = *copy_of_sub_insn; ++ mem_insn_2_code = sub_insn_code; ++ } ++ } ++ gcc_assert (mem_insn_1_code != -1 && mem_insn_2_code != -1 ++ && "Uninitialized memory insns"); ++ ++ int dest_regno = REGNO (SET_DEST (PATTERN (&mem_insn_1))); ++ int src_regno; ++ ++ rtx srs_reg_insn = XEXP (SET_SRC (PATTERN (&mem_insn_1)), 0); ++ ++ if (GET_CODE (srs_reg_insn) == REG) ++ src_regno = REGNO (srs_reg_insn); ++ else ++ src_regno = REGNO (XEXP (srs_reg_insn, 0)); ++ ++ /* Don't split such weird LDP. */ ++ if (src_regno == dest_regno) ++ return; ++ ++ rtx_insn *emited_offset_insn; ++ if (!post_index) ++ { ++ emited_offset_insn = emit_insn (PATTERN (&offset_insn)); ++ INSN_CODE (emited_offset_insn) = offset_insn_code; ++ } ++ ++ rtx_insn *emited_insn_1 = emit_insn (PATTERN (&mem_insn_1)); ++ rtx_insn *emited_insn_2 = emit_insn (PATTERN (&mem_insn_2)); ++ ++ ++ INSN_CODE (emited_insn_1) = mem_insn_1_code; ++ INSN_CODE (emited_insn_2) = mem_insn_2_code; ++ ++ if (post_index) ++ { ++ emited_offset_insn = emit_insn (PATTERN (&offset_insn)); ++ INSN_CODE (emited_offset_insn) = offset_insn_code; ++ } ++} ++ ++void ++pass_split_complex_instructions::split_ldp_stp (rtx_insn *insn) ++{ ++ rtx_insn *prev_insn = PREV_INSN (insn); ++ int number_of_sub_insns = XVECLEN (PATTERN (insn), 0); ++ ++ start_sequence (); ++ ++ if (number_of_sub_insns == 2) ++ split_simple_ldp (insn); ++ else if (number_of_sub_insns == 3) ++ split_ldp_with_offset (insn); ++ else ++ gcc_assert (false && "Broken complex insn vector"); ++ ++ rtx_insn *seq = get_insns (); ++ unshare_all_rtl_in_chain (seq); ++ end_sequence (); ++ ++ emit_insn_after_setloc (seq, prev_insn, INSN_LOCATION (insn)); ++ delete_insn_and_edges (insn); ++} ++ ++void ++pass_split_complex_instructions::split_ldp_ti (rtx_insn *insn) ++{ ++ rtx_insn *prev_insn = PREV_INSN (insn); ++ rtx_insn *load_insn_1 = make_insn_raw (copy_rtx (PATTERN (insn))); ++ rtx_insn *load_insn_2 = make_insn_raw (copy_rtx (PATTERN (insn))); ++ ++ rtx reg_insn_1 = XEXP (PATTERN (load_insn_1), 0); ++ rtx mem_insn_1 = XEXP (PATTERN (load_insn_1), 1); ++ rtx mem_insn_2 = XEXP (PATTERN (load_insn_2), 1); ++ ++ PUT_MODE (mem_insn_1, DImode); ++ PUT_MODE (mem_insn_2, DImode); ++ ++ int reg_no_1 = REGNO (reg_insn_1); ++ ++ XEXP (PATTERN (load_insn_1), 0) = gen_rtx_REG (DImode, reg_no_1); ++ XEXP (PATTERN (load_insn_2), 0) = gen_rtx_REG (DImode, reg_no_1 + 1); ++ ++ rtx load_insn_2_plus_expr = XEXP (XEXP (PATTERN (load_insn_2), 1), 0); ++ if (GET_CODE (load_insn_2_plus_expr) == REG) ++ { ++ XEXP (XEXP (PATTERN (load_insn_2), 1), 0) ++ = gen_rtx_PLUS (DImode, ++ gen_rtx_REG (DImode, REGNO (load_insn_2_plus_expr)), ++ GEN_INT (GET_MODE_SIZE (DImode))); ++ } ++ else ++ { ++ rtx load_insn_2_offset_expr ++ = XEXP (XEXP (XEXP (PATTERN (load_insn_2), 1), 0), 1); ++ ++ if (load_insn_2_offset_expr == NULL) ++ return; ++ ++ if (GET_CODE (load_insn_2_offset_expr) == CONST_INT) ++ { ++ int load_insn_2_offset = XINT (load_insn_2_offset_expr, 0); ++ XEXP (XEXP (XEXP (PATTERN (load_insn_2), 1), 0), 1) ++ = GEN_INT (load_insn_2_offset + GET_MODE_SIZE (DImode)); ++ } ++ } ++ ++ 
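++  /* At this point the TImode load has been rewritten as two DImode loads,
++     the second one addressing base + GET_MODE_SIZE (DImode).  A sketch
++     (register numbers arbitrary):
++       (set (reg:TI x0) (mem:TI (plus (reg:DI sp) (const_int 136))))
++     becomes
++       (set (reg:DI x0) (mem:DI (plus (reg:DI sp) (const_int 136))))
++       (set (reg:DI x1) (mem:DI (plus (reg:DI sp) (const_int 144))))
++     The replacements are emitted into a temporary sequence so that the
++     original insn can be replaced and deleted atomically.  */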
start_sequence (); ++ ++ int src_regno; ++ rtx srs_reg_insn = XEXP (XEXP (PATTERN (load_insn_1), 1), 0); ++ ++ if (GET_CODE (srs_reg_insn) == REG) ++ src_regno = REGNO (srs_reg_insn); ++ else ++ src_regno = REGNO (XEXP (srs_reg_insn, 0)); ++ ++ /* in cases like ldp r1,r2,[r1] we emit ldr r2,[r1] first. */ ++ if (src_regno == reg_no_1) ++ std::swap (load_insn_1, load_insn_2); ++ ++ rtx_insn *emited_load_insn_1 = emit_insn (PATTERN (load_insn_1)); ++ rtx_insn *emited_load_insn_2 = emit_insn (PATTERN (load_insn_2)); ++ ++ INSN_CODE (emited_load_insn_1) ++ = recog (PATTERN (emited_load_insn_1), emited_load_insn_1, 0); ++ INSN_CODE (emited_load_insn_2) ++ = recog (PATTERN (emited_load_insn_2), emited_load_insn_2, 0); ++ ++ rtx_insn *seq = get_insns (); ++ unshare_all_rtl_in_chain (seq); ++ end_sequence (); ++ ++ emit_insn_after_setloc (seq, prev_insn, INSN_LOCATION (insn)); ++ delete_insn_and_edges (insn); ++} ++ ++void ++pass_split_complex_instructions::split_complex_insn (rtx_insn *insn) ++{ ++ complex_instructions_t insn_type = get_insn_type (insn); ++ /* TODO: Add splitting of STP instructions. */ ++ if (insn_type == LDP || insn_type == STP) ++ split_ldp_stp (insn); ++ else if (insn_type == LDP_TI) ++ split_ldp_ti (insn); ++ else ++ gcc_assert (false && "Unsupported type of insn to split"); ++} ++ ++pass_split_complex_instructions::complex_instructions_t ++pass_split_complex_instructions::get_insn_type (rtx_insn *insn) ++{ ++ if (!INSN_P (insn)) ++ return UNDEFINED; ++ ++ rtx pat = PATTERN (insn); ++ int icode = recog (PATTERN (insn), insn, NULL); ++ ++ if (GET_CODE (pat) == PARALLEL) ++ { ++ if (targetm.is_ldp_insn (icode)) ++ { ++ return LDP; ++ } ++ if (targetm.is_stp_insn (icode)) ++ { ++ return STP; ++ } ++ else ++ { ++ return UNDEFINED; ++ } ++ } ++ rtx set_insn = single_set (insn); ++ if (set_insn && GET_CODE (XEXP (set_insn, 1)) == MEM ++ && GET_MODE (XEXP (set_insn, 1)) == E_TImode) ++ return LDP_TI; ++ ++ return UNDEFINED; ++} ++ ++bool ++pass_split_complex_instructions::gate (function *) ++{ ++ return targetm.is_ldp_insn && targetm.is_stp_insn && optimize > 0 ++ && flag_split_ldp_stp > 0; ++} ++ ++} // anon namespace ++ ++rtl_opt_pass * ++make_pass_split_complex_instructions (gcc::context *ctxt) ++{ ++ return new pass_split_complex_instructions (ctxt); ++} ++ + #if __GNUC__ >= 10 + # pragma GCC diagnostic pop + #endif +diff --git a/gcc/target.def b/gcc/target.def +index 649373449..48c8a8234 100644 +--- a/gcc/target.def ++++ b/gcc/target.def +@@ -2727,6 +2727,16 @@ modes and they have different conditional execution capability, such as ARM.", + bool, (void), + default_have_conditional_execution) + ++DEFHOOK ++(is_ldp_insn, ++ "Return true if icode is corresponding to any of the LDP instruction types.", ++ bool, (int icode), NULL) ++ ++DEFHOOK ++(is_stp_insn, ++ "Return true if icode is corresponding to any of the STP instruction types.", ++ bool, (int icode), NULL) ++ + DEFHOOK + (gen_ccmp_first, + "This function prepares to emit a comparison insn for the first compare in a\n\ +diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c +new file mode 100644 +index 000000000..3918d43f6 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c +@@ -0,0 +1,74 @@ ++/* { dg-do compile { target aarch64-*-* } } */ ++/* { dg-additional-options "-fsplit-ldp-stp" } */ ++/* ++ * Tests are: ++ * Patterns where LDP insns should NOT be split ++ * */ ++ ++int __RTL (startwith ("split_complex_instructions")) 
++simple_ldp_after_store () ++{ ++(function "simple_ldp_after_store" ++ (insn-chain ++ (block 2 ++ (edge-from entry (flags "FALLTHRU")) ++ (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK) ++ (cinsn 228 (set (reg/i:DI sp) ++ (reg/i:DI x0))) ++ (cinsn 101 (set (mem/c:DI ++ (plus:DI (reg/f:DI sp) ++ (const_int 32))[1 S4 A32])(reg:DI x0))) ++ (cinsn 10 (parallel [ ++ (set (reg:DI x29) ++ (mem:DI (plus:DI (reg/f:DI sp) (const_int 8)) [1 S4 A32])) ++ (set (reg:DI x30) ++ (mem:DI (plus:DI (reg/f:DI sp) ++ (const_int 16)) [1 S4 A32]))])) ++ (cinsn 11 (use (reg/i:DI sp))) ++ (cinsn 12 (use (reg/i:DI cc))) ++ (cinsn 13 (use (reg/i:DI x29))) ++ (cinsn 14 (use (reg/i:DI x30))) ++ (cinsn 15 (use (reg/i:DI x0))) ++ (edge-to exit (flags "FALLTHRU")) ++ ) ;; block 2 ++ ) ;; insn-chain ++) ;; function "simple_ldp_after_store" ++} ++ ++int __RTL (startwith ("split_complex_instructions")) ++ldp_after_store_in_different_bb () ++{ ++(function "ldp_after_store_in_different_bb" ++ (insn-chain ++ (block 2 ++ (edge-from entry (flags "FALLTHRU")) ++ (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK) ++ (cinsn 228 (set (reg/i:DI sp) ++ (reg/i:DI x0))) ++ (cinsn 101 (set (mem/c:DI ++ (plus:DI (reg/f:DI sp) ++ (const_int 32))[1 S4 A32])(reg:DI x0))) ++ (edge-to 3 (flags "FALLTHRU")) ++ ) ;; block 2 ++ (block 3 ++ (edge-from 2 (flags "FALLTHRU")) ++ (cnote 4 [bb 3] NOTE_INSN_BASIC_BLOCK) ++ (cinsn 10 (parallel [ ++ (set (reg:DI x29) ++ (mem:DI (plus:DI (reg/f:DI sp) (const_int 8)) [1 S4 A32])) ++ (set (reg:DI x30) ++ (mem:DI (plus:DI (reg/f:DI sp) ++ (const_int 16)) [1 S4 A32]))])) ++ (cinsn 11 (use (reg/i:DI sp))) ++ (cinsn 12 (use (reg/i:DI cc))) ++ (cinsn 13 (use (reg/i:DI x29))) ++ (cinsn 14 (use (reg/i:DI x30))) ++ (cinsn 15 (use (reg/i:DI x0))) ++ (edge-to exit (flags "FALLTHRU")) ++ ) ;; block 3 ++ ) ;; insn-chain ++) ;; function "ldp_after_store_in_different_bb" ++} ++ ++/* Verify that the output code contains exactly 2 ldp. */ ++/* { dg-final { scan-assembler-times {ldp\t} 2 } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c +new file mode 100644 +index 000000000..8c035c3e1 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c +@@ -0,0 +1,40 @@ ++/* { dg-do compile { target aarch64-*-* } } */ ++/* { dg-additional-options "-fsplit-ldp-stp" } */ ++/* ++ * Test is: ++ * Pattern where LDP insns should be split with rearrangement in order ++ * to deal with data dependecy betwen subinstruction. ++ * */ ++ ++int __RTL (startwith ("split_complex_instructions")) ++simple_ldp_after_store () ++{ ++(function "ldp_equal_registers" ++ (insn-chain ++ (block 2 ++ (edge-from entry (flags "FALLTHRU")) ++ (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK) ++ (cinsn 228 (set (reg/i:DI x1) ++ (reg/i:DI x0))) ++ (cinsn 101 (set (mem/c:DI ++ (plus:DI (reg/f:DI x1) ++ (const_int 8))[1 S4 A32])(reg:DI x0))) ++ (cinsn 10 (parallel [ ++ (set (reg:DI x1) ++ (mem:DI (plus:DI (reg/f:DI x1) (const_int 8)) [1 S4 A32])) ++ (set (reg:DI x2) ++ (mem:DI (plus:DI (reg/f:DI x1) ++ (const_int 16)) [1 S4 A32]))])) ++ (cinsn 11 (use (reg/i:DI sp))) ++ (cinsn 12 (use (reg/i:DI cc))) ++ (cinsn 13 (use (reg/i:DI x0))) ++ (cinsn 14 (use (reg/i:DI x1))) ++ (cinsn 15 (use (reg/i:DI x2))) ++ (edge-to exit (flags "FALLTHRU")) ++ ) ;; block 2 ++ ) ;; insn-chain ++) ;; function "ldp_equal_registers" ++} ++ ++/* Verify that the output code doesn't contain ldp. 
*/ ++/* { dg-final { scan-assembler-times ".*ldr.*x2.*x1,.*16.*ldr.*x1.*x1.*8" 1 } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c +new file mode 100644 +index 000000000..2615e4fa1 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c +@@ -0,0 +1,174 @@ ++/* { dg-do compile { target aarch64-*-* } } */ ++/* { dg-additional-options "-O1 -fsplit-ldp-stp" } */ ++/* ++ * Tests are: ++ * Patterns where LDP insns should be split ++ * */ ++ ++int __RTL (startwith ("split_complex_instructions")) ++simple_ldp_after_store () ++{ ++(function "simple_ldp_after_store" ++ (insn-chain ++ (block 2 ++ (edge-from entry (flags "FALLTHRU")) ++ (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK) ++ (cinsn 228 (set (reg/i:DI sp) ++ (reg/i:DI x0))) ++ (cinsn 238 (set (reg/i:DI x1) ++ (reg/i:DI x0))) ++ ++ (cinsn 101 (set (mem/c:DI ++ (plus:DI (reg/f:DI sp) ++ (const_int 8))[1 S4 A32])(reg:DI x0))) ++ (cinsn 10 (parallel [ ++ (set (reg:DI x29) ++ (mem:DI (plus:DI (reg/f:DI sp) (const_int 8)) [1 S4 A32])) ++ (set (reg:DI x30) ++ (mem:DI (plus:DI (reg/f:DI sp) ++ (const_int 16)) [1 S4 A32]))])) ++ ++ (cinsn 102 (set (mem/c:DI (plus:DI (reg/f:DI x1) ++ (const_int -16)) [1 S4 A32]) ++ (reg:DI x0))) ++ (cinsn 11 (parallel [ ++ (set (reg:DI x3) ++ (mem:DI (plus:DI (reg/f:DI x1) (const_int -16)) [1 S4 A32])) ++ (set (reg:DI x4) ++ (mem:DI (plus:DI (reg/f:DI x1) (const_int -8)) [1 S4 A32])) ++ ])) ++ ++ (cinsn 103 (set (mem/c:DI (reg/f:DI x1) [1 S4 A32]) ++ (reg:DI x0))) ++ (cinsn 12 (parallel [ ++ (set (reg:DI x5) (mem:DI (reg/f:DI x1) [1 S4 A32])) ++ (set (reg:DI x6) (mem:DI (plus:DI (reg/f:DI x1) ++ (const_int 8)) [1 S4 A32])) ++ ])) ++ ++ (cinsn 13 (use (reg/i:DI sp))) ++ (cinsn 14 (use (reg/i:DI cc))) ++ (cinsn 15 (use (reg/i:DI x29))) ++ (cinsn 16 (use (reg/i:DI x30))) ++ (cinsn 17 (use (reg/i:DI x0))) ++ (cinsn 18 (use (reg/i:DI x3))) ++ (cinsn 19 (use (reg/i:DI x4))) ++ (cinsn 20 (use (reg/i:DI x5))) ++ (cinsn 21 (use (reg/i:DI x6))) ++ (edge-to exit (flags "FALLTHRU")) ++ ) ;; block 2 ++ ) ;; insn-chain ++) ;; function "simple_ldp_after_store" ++} ++ ++int __RTL (startwith ("split_complex_instructions")) ++ldp_ti_after_store () ++{ ++ (function "ldp_ti_after_store" ++ (insn-chain ++ (block 2 ++ (edge-from entry (flags "FALLTHRU")) ++ (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK) ++ (cinsn 228 (set (reg/i:DI sp) ++ (reg/i:DI x0))) ++ (cinsn 238 (set (reg/i:DI x2) ++ (reg/i:DI x0))) ++ ++ (cinsn 101 (set (mem/c:DI ++ (plus:DI (reg/f:DI sp) ++ (const_int 136))[1 S4 A32])(reg:DI x0))) ++ (insn 81 (set (reg:TI x0 [1 S4 A32]) ++ (mem/c:TI (plus:DI (reg/f:DI sp) ++ (const_int 136 )) [1 S4 A32])) ++ (expr_list:REG_EQUIV (mem/c:TI (plus:DI (reg/f:DI sfp) ++ (const_int -24 )) [1 S4 A32]) ++ (nil))) ++ ++ (cinsn 102 (set (mem/c:DI (plus:DI (reg/f:DI x2) ++ (const_int -16)) [1 S4 A32]) ++ (reg:DI x0))) ++ (insn 82 (set (reg:TI x3 [1 S4 A32]) ++ (mem/c:TI (plus:DI (reg/f:DI x2) ++ (const_int -16)) [1 S4 A32]))) ++ ++ (cinsn 103 (set (mem/c:DI (reg/f:DI x2) [1 S4 A32]) ++ (reg:DI x0))) ++ (insn 83 (set (reg:TI x5 [1 S4 A32]) ++ (mem/c:TI (reg/f:DI x2) [1 S4 A32]))) ++ ++ (cinsn 11 (use (reg/i:DI sp))) ++ (cinsn 12 (use (reg/i:DI cc))) ++ (cinsn 13 (use (reg/i:DI x29))) ++ (cinsn 14 (use (reg/i:DI x30))) ++ (cinsn 15 (use (reg/i:DI x0))) ++ (cinsn 16 (use (reg/i:DI x3))) ++ (cinsn 17 (use (reg/i:DI x5))) ++ (cinsn 18 (use (reg/i:DI x1))) ++ (cinsn 19 (use (reg/i:DI x4))) ++ (cinsn 20 (use (reg/i:DI x6))) ++ (edge-to exit (flags 
"FALLTHRU")) ++ ) ;; block 2 ++ ) ;; insn-chain ++) ;; function "ldp_ti_after_store" ++} ++ ++int __RTL (startwith ("split_complex_instructions")) ++ldp_after_store_in_different_bb () ++{ ++(function "ldp_after_store_in_different_bb" ++ (insn-chain ++ (block 2 ++ (edge-from entry (flags "FALLTHRU")) ++ (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK) ++ (cinsn 228 (set (reg/i:DI sp) ++ (reg/i:DI x0))) ++ (cinsn 238 (set (reg/i:DI x1) ++ (reg/i:DI x0))) ++ ++ (cinsn 101 (set (mem/c:DI ++ (plus:DI (reg/f:DI sp) ++ (const_int 8))[1 S4 A32])(reg:DI x0))) ++ (cinsn 102 (set (mem/c:DI (plus:DI (reg/f:DI x1) ++ (const_int -16)) [1 S4 A32]) ++ (reg:DI x0))) ++ (cinsn 103 (set (mem/c:DI (reg/f:DI x1) [1 S4 A32]) ++ (reg:DI x0))) ++ (edge-to 3 (flags "FALLTHRU")) ++ ) ;; block 2 ++ (block 3 ++ (edge-from 2 (flags "FALLTHRU")) ++ (cnote 4 [bb 3] NOTE_INSN_BASIC_BLOCK) ++ (cinsn 10 (parallel [ ++ (set (reg:DI x29) ++ (mem:DI (plus:DI (reg/f:DI sp) (const_int 8)) [1 S4 A32])) ++ (set (reg:DI x30) ++ (mem:DI (plus:DI (reg/f:DI sp) ++ (const_int 16)) [1 S4 A32]))])) ++ (cinsn 11 (parallel [ ++ (set (reg:DI x3) ++ (mem:DI (plus:DI (reg/f:DI x1) (const_int -16)) [1 S4 A32])) ++ (set (reg:DI x4) ++ (mem:DI (plus:DI (reg/f:DI x1) (const_int -8)) [1 S4 A32])) ++ ])) ++ (cinsn 12 (parallel [ ++ (set (reg:DI x5) (mem:DI (reg/f:DI x1) [1 S4 A32])) ++ (set (reg:DI x6) (mem:DI (plus:DI (reg/f:DI x1) ++ (const_int 8)) [1 S4 A32])) ++ ])) ++ (cinsn 13 (use (reg/i:DI sp))) ++ (cinsn 14 (use (reg/i:DI cc))) ++ (cinsn 15 (use (reg/i:DI x29))) ++ (cinsn 16 (use (reg/i:DI x30))) ++ (cinsn 17 (use (reg/i:DI x0))) ++ (cinsn 18 (use (reg/i:DI x3))) ++ (cinsn 19 (use (reg/i:DI x4))) ++ (cinsn 20 (use (reg/i:DI x5))) ++ (cinsn 21 (use (reg/i:DI x6))) ++ (edge-to exit (flags "FALLTHRU")) ++ ) ;; block 3 ++ ) ;; insn-chain ++) ;; function "ldp_after_store_in_different_bb" ++} ++ ++/* Verify that the output code doesn't contain ldp. 
 */
++/* { dg-final { scan-assembler-not {ldp\t} } } */
+diff --git a/gcc/timevar.def b/gcc/timevar.def
+index 4059c57d5..24caf1b5d 100644
+--- a/gcc/timevar.def
++++ b/gcc/timevar.def
+@@ -277,6 +277,7 @@ DEFTIMEVAR (TV_RELOAD_CSE_REGS	     , "reload CSE regs")
+ DEFTIMEVAR (TV_GCSE_AFTER_RELOAD     , "load CSE after reload")
+ DEFTIMEVAR (TV_REE		     , "ree")
+ DEFTIMEVAR (TV_THREAD_PROLOGUE_AND_EPILOGUE, "thread pro- & epilogue")
++DEFTIMEVAR (TV_SPLIT_CMP_INS	     , "split complex instructions")
+ DEFTIMEVAR (TV_IFCVT2		     , "if-conversion 2")
+ DEFTIMEVAR (TV_SPLIT_PATHS	     , "split paths")
+ DEFTIMEVAR (TV_COMBINE_STACK_ADJUST  , "combine stack adjustments")
+diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
+index 55d9dd668..232a3fdf6 100644
+--- a/gcc/tree-pass.h
++++ b/gcc/tree-pass.h
+@@ -598,6 +598,7 @@ extern rtl_opt_pass *make_pass_gcse2 (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_split_after_reload (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_thread_prologue_and_epilogue (gcc::context
+							      *ctxt);
++extern rtl_opt_pass *make_pass_split_complex_instructions (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_stack_adjustments (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_sched_fusion (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_peephole2 (gcc::context *ctxt);
+--
+2.33.0
+
diff --git a/0158-Implement-IPA-prefetch-optimization.patch b/0158-Implement-IPA-prefetch-optimization.patch
new file mode 100644
index 0000000..7661117
--- /dev/null
+++ b/0158-Implement-IPA-prefetch-optimization.patch
@@ -0,0 +1,2072 @@
+From 0a83ceb119be0f1a1e061b3864752e783ec2c680 Mon Sep 17 00:00:00 2001
+From: Diachkov Ilia WX1215920
+Date: Tue, 12 Dec 2023 10:29:17 +0800
+Subject: [PATCH 5/6] Implement IPA prefetch optimization
+
+---
+ gcc/Makefile.in    |    1 +
+ gcc/cgraph.c       |    1 +
+ gcc/cgraph.h       |    2 +
+ gcc/common.opt     |    8 +
+ gcc/ipa-devirt.c   |   54 +-
+ gcc/ipa-prefetch.c | 1820 ++++++++++++++++++++++++++++++++++++++++++++
+ gcc/ipa-sra.c      |    8 +
+ gcc/params.opt     |    8 +
+ gcc/passes.def     |    1 +
+ gcc/timevar.def    |    1 +
+ gcc/tree-pass.h    |    1 +
+ 11 files changed, 1903 insertions(+), 2 deletions(-)
+ create mode 100644 gcc/ipa-prefetch.c
+
+diff --git a/gcc/Makefile.in b/gcc/Makefile.in
+index 31bf2cde2..bcb9305a0 100644
+--- a/gcc/Makefile.in
++++ b/gcc/Makefile.in
+@@ -1411,6 +1411,7 @@ OBJS = \
+ 	ipa-inline-analysis.o \
+ 	ipa-inline-transform.o \
+ 	ipa-predicate.o \
++	ipa-prefetch.o \
+ 	ipa-profile.o \
+ 	ipa-prop.o \
+ 	ipa-param-manipulation.o \
+diff --git a/gcc/cgraph.c b/gcc/cgraph.c
+index 4a44b4a48..57146bf8c 100644
+--- a/gcc/cgraph.c
++++ b/gcc/cgraph.c
+@@ -988,6 +988,7 @@ cgraph_node::create_indirect_edge (gcall *call_stmt, int ecf_flags,
+   edge->indirect_info = cgraph_allocate_init_indirect_info ();
+   edge->indirect_info->ecf_flags = ecf_flags;
+   edge->indirect_info->vptr_changed = true;
++  edge->indirect_info->targets = NULL;
+ 
+   /* Record polymorphic call info.  */
+   if (!cloning_p
+diff --git a/gcc/cgraph.h b/gcc/cgraph.h
+index 8e71af072..0b2bb1e82 100644
+--- a/gcc/cgraph.h
++++ b/gcc/cgraph.h
+@@ -1710,6 +1710,8 @@ public:
+   int param_index;
+   /* ECF flags determined from the caller.  */
+   int ecf_flags;
++  /* Vector of potential call targets determined by analysis.  */
++  vec<cgraph_node *> *targets;
+ 
+   /* Number of speculative call targets, it's less than GCOV_TOPN_VALUES.
*/ + unsigned num_speculative_call_targets : 16; +diff --git a/gcc/common.opt b/gcc/common.opt +index f2c53cc31..cb5854463 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1274,6 +1274,10 @@ fdevirtualize + Common Report Var(flag_devirtualize) Optimization + Try to convert virtual calls to direct ones. + ++fipa-ic ++Common Report Var(flag_ipa_ic) Optimization Init(0) ++Perform interprocedural analysis of indirect calls. ++ + ficp + Common Report Var(flag_icp) Optimization Init(0) + Try to promote indirect calls to direct ones. +@@ -2245,6 +2249,10 @@ fllc-allocate + Common Report Var(flag_llc_allocate) Init(-1) Optimization + Generate LLC hint instructions. + ++fipa-prefetch ++Common Report Var(flag_ipa_prefetch) Init(0) Optimization ++Generate prefetch instructions, if available, using IPA info. ++ + fprofile + Common Report Var(profile_flag) + Enable basic program profiling code. +diff --git a/gcc/ipa-devirt.c b/gcc/ipa-devirt.c +index 79466d91d..fbde7eb94 100644 +--- a/gcc/ipa-devirt.c ++++ b/gcc/ipa-devirt.c +@@ -5703,6 +5703,54 @@ merge_fs_map_for_ftype_aliases () + } + } + ++/* Save results of indirect call analysis for the next passes. */ ++ ++static void ++save_analysis_results () ++{ ++ if (dump_file) ++ fprintf (dump_file, "\n\nSave results of indirect call analysis.\n"); ++ ++ struct cgraph_node *n; ++ FOR_EACH_FUNCTION (n) ++ { ++ cgraph_edge *e, *next; ++ for (e = n->indirect_calls; e; e = next) ++ { ++ next = e->next_callee; ++ if (e->indirect_info->polymorphic) ++ continue; ++ gcall *stmt = e->call_stmt; ++ gcc_assert (stmt != NULL); ++ tree call_fn = gimple_call_fn (stmt); ++ tree call_fn_ty = TREE_TYPE (call_fn); ++ if (!POINTER_TYPE_P (call_fn_ty)) ++ continue; ++ ++ tree ctype = TYPE_CANONICAL (TREE_TYPE (call_fn_ty)); ++ unsigned ctype_uid = ctype ? TYPE_UID (ctype) : 0; ++ if (!ctype_uid || unsafe_types->count (ctype_uid) ++ || !fs_map->count (ctype_uid)) ++ continue; ++ /* TODO: cleanup noninterposable aliases. */ ++ decl_set *decls = (*fs_map)[ctype_uid]; ++ if (dump_file) ++ { ++ fprintf (dump_file, "For call "); ++ print_gimple_stmt (dump_file, stmt, 0); ++ } ++ vec_alloc (e->indirect_info->targets, decls->size ()); ++ for (decl_set::const_iterator it = decls->begin (); ++ it != decls->end (); it++) ++ { ++ struct cgraph_node *target = cgraph_node::get (*it); ++ /* TODO: maybe discard some targets. */ ++ e->indirect_info->targets->quick_push (target); ++ } ++ } ++ } ++} ++ + /* Dump function types with set of functions corresponding to it. */ + + static void +@@ -5767,6 +5815,8 @@ collect_function_signatures () + } + } + merge_fs_map_for_ftype_aliases (); ++ if (flag_ipa_ic) ++ save_analysis_results (); + if (dump_file) + dump_function_signature_sets (); + } +@@ -6162,7 +6212,7 @@ ipa_icp (void) + optimize indirect calls. */ + collect_function_type_aliases (); + collect_function_signatures (); +- bool optimized = optimize_indirect_calls (); ++ bool optimized = flag_icp ? optimize_indirect_calls () : false; + + remove_type_alias_map (ta_map); + remove_type_alias_map (fta_map); +@@ -6209,7 +6259,7 @@ public: + /* opt_pass methods: */ + virtual bool gate (function *) + { +- return (optimize && flag_icp && !seen_error () ++ return (optimize && (flag_icp || flag_ipa_ic) && !seen_error () + && (in_lto_p || flag_whole_program)); + } + +diff --git a/gcc/ipa-prefetch.c b/gcc/ipa-prefetch.c +new file mode 100644 +index 000000000..93483a6e8 +--- /dev/null ++++ b/gcc/ipa-prefetch.c +@@ -0,0 +1,1820 @@ ++/* IPA prefetch optimizations. 
++   Copyright (C) 2023 Free Software Foundation, Inc.
++   Contributed by Ilia Diachkov.
++
++This file is part of GCC.
++
++GCC is free software; you can redistribute it and/or modify it under
++the terms of the GNU General Public License as published by the Free
++Software Foundation; either version 3, or (at your option) any later
++version.
++
++GCC is distributed in the hope that it will be useful, but WITHOUT ANY
++WARRANTY; without even the implied warranty of MERCHANTABILITY or
++FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
++for more details.
++
++You should have received a copy of the GNU General Public License
++along with GCC; see the file COPYING3.  If not see
++<http://www.gnu.org/licenses/>.  */
++
++/* IPA prefetch is an interprocedural pass that detects cases of indirect
++   memory access potentially in loops and inserts prefetch instructions
++   to optimize cache usage during these indirect memory accesses.  */
++
++#include "config.h"
++#define INCLUDE_SET
++#define INCLUDE_MAP
++#include "system.h"
++#include "coretypes.h"
++#include "target.h"
++#include "tm.h"
++#include "tree.h"
++#include "tree-pass.h"
++#include "cgraph.h"
++#include "diagnostic-core.h"
++#include "function.h"
++#include "basic-block.h"
++#include "gimple.h"
++#include "vec.h"
++#include "tree-pretty-print.h"
++#include "gimple-pretty-print.h"
++#include "gimple-iterator.h"
++#include "gimple-walk.h"
++#include "cfg.h"
++#include "cfghooks.h"
++#include "ssa.h"
++#include "tree-dfa.h"
++#include "fold-const.h"
++#include "tree-inline.h"
++#include "stor-layout.h"
++#include "tree-into-ssa.h"
++#include "tree-cfg.h"
++#include "alloc-pool.h"
++#include "symbol-summary.h"
++#include "ipa-prop.h"
++#include "tree-eh.h"
++#include "bitmap.h"
++#include "cfgloop.h"
++#include "langhooks.h"
++#include "ipa-param-manipulation.h"
++#include "ipa-fnsummary.h"
++#include "tree-ssa-loop.h"
++#include "tree-ssa-loop-ivopts.h"
++#include "gimple-fold.h"
++#include "gimplify.h"
++
++namespace {
++
++/* Call graph analysis.  */
++
++typedef std::set<cgraph_edge *> edge_set;
++typedef std::set<cgraph_node *> node_set;
++typedef std::map<cgraph_node *, edge_set *> node_to_iedge_map;
++typedef std::map<cgraph_node *, node_set *> node_to_node_map;
++typedef std::map<cgraph_edge *, double> edge_in_loop;
++typedef std::map<cgraph_node *, double> node_in_loop;
++
++static edge_in_loop *el_map = NULL;
++static node_in_loop *nl_map = NULL;
++static node_to_iedge_map *icn_map = NULL;
++/* Contains nodes which are reachable from a given node.  */
++static node_to_node_map *nn_map = NULL;
++
++static bool
++can_be_optimized (cgraph_node *n)
++{
++  /* TODO: maybe check also inlined_to.  */
++  return opt_for_fn (n->decl, flag_ipa_prefetch) && n->has_gimple_body_p ();
++}
++
++static void
++analyze_cgraph_edge (cgraph_edge *e)
++{
++  gcall *stmt = e->call_stmt;
++  gcc_checking_assert (e && stmt);
++  basic_block bb = gimple_bb (stmt);
++  gcc_checking_assert (bb);
++  /* TODO: add the same check for indirect calls.  */
++  if (e->callee && !can_be_optimized (e->callee))
++    return;
++
++  if (dump_file)
++    {
++      if (e->callee)
++	fprintf (dump_file, "\t%*s%s %s%*s ", 1, "",
++		 e->callee->dump_name (), !e->inline_failed ?
"inlined" : ++ cgraph_inline_failed_string (e->inline_failed), 1, ""); ++ else ++ fprintf (dump_file, "\t%*s%s %s%*s ", 1, "", "(indirect)", ++ "n/a", 1, ""); ++ fprintf (dump_file, "freq:%4.2f", e->sreal_frequency ().to_double ()); ++ ++ if (e->callee && cross_module_call_p (e)) ++ fprintf (dump_file, " cross module"); ++ ++ class ipa_call_summary *es = ipa_call_summaries->get (e); ++ if (es) ++ fprintf (dump_file, " loop depth:%2i size:%2i time: %2i", ++ es->loop_depth, es->call_stmt_size, es->call_stmt_time); ++ ++ fprintf (dump_file, "\n"); ++ } ++ if (e->indirect_info && dump_file) ++ { ++ fprintf (dump_file, "II: %p\n", (void *) e->indirect_info->targets); ++ unsigned i = 0; ++ cgraph_node *n; ++ if (e->indirect_info->targets) ++ for (i = 0; e->indirect_info->targets->iterate (i, &n); ++i) ++ fprintf (dump_file, "\t%s\n", n->dump_name ()); ++ } ++ ++ if (bb_loop_depth (bb) == 0) ++ return; ++ ++ if (dump_file) ++ { ++ if (e->callee) ++ fprintf (dump_file, "\tCall in loop (%d): ", bb_loop_depth (bb)); ++ else ++ fprintf (dump_file, "\tICall in loop (%d): ", bb_loop_depth (bb)); ++ print_gimple_stmt (dump_file, stmt, 0); ++ } ++ (*el_map)[e] = e->sreal_frequency ().to_double (); ++} ++ ++/* Walk optimizible cgraph nodes and collect info for edges. */ ++ ++static void ++analyse_cgraph () ++{ ++ cgraph_node *n; ++ cgraph_edge *e; ++ FOR_EACH_DEFINED_FUNCTION (n) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, "\n\nProcesing function %s\n", n->dump_name ()); ++ print_generic_expr (dump_file, n->decl); ++ fprintf (dump_file, "\n"); ++ } ++ if (!can_be_optimized (n)) ++ { ++ if (dump_file) ++ fprintf (dump_file, "Skip the function\n"); ++ continue; ++ } ++ ++ /* TODO: maybe remove loop info here. */ ++ push_cfun (DECL_STRUCT_FUNCTION (n->decl)); ++ calculate_dominance_info (CDI_DOMINATORS); ++ loop_optimizer_init (LOOPS_NORMAL); ++ ++ for (e = n->callees; e; e = e->next_callee) ++ analyze_cgraph_edge (e); ++ for (e = n->indirect_calls; e; e = e->next_callee) ++ analyze_cgraph_edge (e); ++ ++ free_dominance_info (CDI_DOMINATORS); ++ loop_optimizer_finalize (); ++ ++ pop_cfun (); ++ } ++} ++ ++/* Save indirect call info to node:icall_target map. 
*/ ++ ++static void ++prepare_indirect_call_info () ++{ ++ cgraph_node *n, *n2; ++ cgraph_edge *e; ++ FOR_EACH_DEFINED_FUNCTION (n) ++ for (e = n->indirect_calls; e; e = e->next_callee) ++ { ++ if (!e->indirect_info->targets) ++ continue; ++ for (unsigned i = 0; e->indirect_info->targets->iterate (i, &n2); ++i) ++ { ++ if (icn_map->count (n2) == 0) ++ (*icn_map)[n2] = new edge_set; ++ (*icn_map)[n2]->insert (e); ++ } ++ } ++} ++ ++static void ++collect_nn_info (struct cgraph_edge *e, struct cgraph_node *n) ++{ ++ struct cgraph_node *n2 = e->caller; ++ if (nn_map->count (n2) == 0) ++ (*nn_map)[n2] = new node_set; ++ (*nn_map)[n2]->insert (n); ++ if (nn_map->count (n) != 0) ++ { ++ node_set *set = (*nn_map)[n]; ++ for (node_set::const_iterator it = set->begin (); ++ it != set->end (); it++) ++ (*nn_map)[n2]->insert (*it); ++ } ++} ++ ++static bool ++check_loop_info_for_cgraph_edge (struct cgraph_edge *e, struct cgraph_node *n, ++ bool &all_in_loop, double &rate) ++{ ++ collect_nn_info (e, n); ++ if (el_map->count (e) == 0) ++ { ++ if (dump_file) ++ fprintf (dump_file, "not all: %s->%s\n", ++ e->caller->dump_name (), n->dump_name ()); ++ all_in_loop = false; ++ return false; ++ } ++ rate += (*el_map)[e]; ++ return true; ++} ++ ++static bool ++update_loop_info_for_cgraph_node (struct cgraph_node *n) ++{ ++ bool changed = false, all_in_loop = true; ++ double rate = 0.0; ++ struct cgraph_edge *e; ++ ++ /* Iterate all direct callers. */ ++ if (n->callers) ++ for (e = n->callers; e; e = e->next_caller) ++ if (!check_loop_info_for_cgraph_edge (e, n, all_in_loop, rate)) ++ break; ++ ++ /* Iterate all possible indirect callers. */ ++ edge_set *set = (*icn_map)[n]; ++ if (set) ++ for (edge_set::const_iterator it = set->begin (); it != set->end (); it++) ++ if (!check_loop_info_for_cgraph_edge (*it, n, all_in_loop, rate)) ++ break; ++ ++ /* The node had 0 loop count but the rate is > 0, ++ so something is changed. */ ++ if (dump_file) ++ fprintf (dump_file, "%s: all=%d, nl->c=%lu, r=%4.2f\n", n->dump_name (), ++ all_in_loop, nl_map->count (n), rate); ++ ++ if (all_in_loop && nl_map->count (n) == 0 && rate > 0.0) ++ { ++ if (dump_file) ++ fprintf (dump_file, "%s: new rate %4.2f\n", n->dump_name (), rate); ++ changed = true; ++ } ++ if (all_in_loop) ++ { ++ (*nl_map)[n] = nl_map->count (n) ? (*nl_map)[n] + rate : rate; ++ for (e = n->callees; e; e = e->next_callee) ++ (*el_map)[e] = el_map->count (e) ? (*el_map)[e] + rate : rate; ++ for (e = n->indirect_calls; e; e = e->next_callee) ++ { ++ (*el_map)[e] = el_map->count (e) ? (*el_map)[e] + rate : rate; ++ if (dump_file) ++ fprintf (dump_file, "%s: reset indirect e=%p to %4.2f\n", ++ n->dump_name (), (void *) e, (*el_map)[e]); ++ } ++ } ++ return changed; ++} ++ ++/* Propagate in_loop info over the call graph. 
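++   This iterates to a fixed point: a node is treated as executing in a
++   loop once every direct and possible indirect caller edge is itself
++   known to execute in a loop, its rate being the accumulated frequency
++   of those edges; the rate is then pushed down to the node's own call
++   edges.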
*/ ++ ++static void ++propagate_loop_info_in_cgraph () ++{ ++ struct cgraph_node *n; ++ bool changed; ++ unsigned iteration = 0; ++ do ++ { ++ changed = false; ++ if (dump_file) ++ fprintf (dump_file, "\nIteration %u\n", iteration++); ++ FOR_EACH_DEFINED_FUNCTION (n) ++ { ++ if (!n->callers && !(*icn_map)[n]) ++ continue; ++ if (update_loop_info_for_cgraph_node (n)) ++ changed = true; ++ } ++ } while (changed); ++ ++ if (dump_file) ++ { ++ fprintf (dump_file, "\nList of nodes in loops:\n"); ++ FOR_EACH_DEFINED_FUNCTION (n) ++ if (nl_map->count (n) != 0) ++ fprintf (dump_file, "%s: %4.2f\n", n->dump_name (), (*nl_map)[n]); ++ fprintf (dump_file, "\nList of callable nodes:\n"); ++ FOR_EACH_DEFINED_FUNCTION (n) ++ if (nn_map->count (n) != 0) ++ { ++ node_set *set = (*nn_map)[n]; ++ fprintf (dump_file, "%s: ", n->dump_name ()); ++ for (node_set::const_iterator it = set->begin (); ++ it != set->end (); it++) ++ fprintf (dump_file, "%s ", (*it)->dump_name ()); ++ fprintf (dump_file, "\n"); ++ } ++ } ++} ++ ++/* Analysis of memory references. */ ++ ++typedef enum ++{ ++ MR_NONE, ++ MR_SIMPLE, ++ MR_POLYNOMIAL, ++ MR_INDIRECT, ++ MR_UNSUPPORTED ++} mr_type; ++const char *mr_type_str[] = ++ {"none", "simple", "poly", "indirect", "unsuppoted"}; ++ ++struct memref_type; ++typedef std::set memref_set; ++ ++static unsigned max_mr_id = 0; ++typedef struct memref_type ++{ ++ unsigned mr_id = 0; ++ mr_type type = MR_NONE; ++ tree mem = NULL_TREE; ++ tree base = NULL_TREE; ++ tree offset = NULL_TREE; ++ vec stmts = vNULL; ++ memref_set used_mrs; ++ bool is_store = false; ++ bool is_incr = false; ++ tree step = NULL_TREE; ++} memref_t; ++ ++typedef std::map tree_memref_map; ++typedef std::map > function_mrs_map; ++typedef std::map funct_mrs_map; ++typedef std::map memref_map; ++typedef std::map memref_tree_map; ++ ++typedef std::set stmt_set; ++typedef std::map tree_map; ++ ++tree_memref_map *tm_map; ++funct_mrs_map *fmrs_map; ++funct_mrs_map *optimize_mrs_map; ++memref_map *mr_candidate_map; ++tree_map *decl_map; ++ ++static void analyse_mem_ref (gimple *stmt, tree mem, memref_t* mr); ++ ++static memref_t* ++get_memref (gimple *stmt, tree mem, bool is_store) ++{ ++ if (tm_map->count (mem)) ++ { ++ if (dump_file) ++ fprintf (dump_file, "Found mr %d for %p.\n", ++ (*tm_map)[mem]->mr_id, (void *) mem); ++ return (*tm_map)[mem]; ++ } ++ ++ memref_t *mr = new memref_t; ++ mr->mr_id = ++max_mr_id; ++ mr->is_store = is_store; ++ mr->mem = mem; ++ (*tm_map)[mem] = mr; ++ if (dump_file) ++ fprintf (dump_file, "Create mr %d for %p.\n", ++ mr->mr_id, (void *) mem); ++ analyse_mem_ref (stmt, mem, mr); ++ return mr; ++} ++ ++static void ++print_mrs_ids (memref_set &mrs, const char *start) ++{ ++ if (start) ++ fprintf (dump_file, "%s", start); ++ for (memref_set::const_iterator it = mrs.begin (); it != mrs.end (); it++) ++ fprintf (dump_file, "%d ", (*it)->mr_id); ++ fprintf (dump_file, "\n"); ++} ++ ++static void ++print_memref (memref_t *mr) ++{ ++ fprintf (dump_file, "MR (%d) type: %s (%s) mem: ", mr->mr_id, ++ mr_type_str[mr->type], mr->is_store ? 
"st" : "ld"); ++ print_generic_expr (dump_file, mr->mem); ++ fprintf (dump_file, "\nbase: "); ++ if (mr->base) ++ print_generic_expr (dump_file, mr->base); ++ else ++ fprintf (dump_file, "null"); ++ fprintf (dump_file, "\noffset: "); ++ if (mr->offset) ++ print_generic_expr (dump_file, mr->offset); ++ else ++ fprintf (dump_file, "null"); ++ fprintf (dump_file, "\nstmts:\n"); ++ for (unsigned int i = 0; i < mr->stmts.length (); i++) ++ print_gimple_stmt (dump_file, mr->stmts[i], 0); ++ print_mrs_ids (mr->used_mrs, "\tused memrefs: "); ++ if (mr->is_incr) ++ { ++ fprintf (dump_file, "\tis incremental with step: "); ++ print_generic_expr (dump_file, mr->step); ++ } ++ fprintf (dump_file, "\n"); ++} ++ ++/* If there is a simple load or store to a memory reference in STMT, returns ++ the location of the memory reference, and sets IS_STORE according to whether ++ it is a store or load. Otherwise, returns NULL. ++ TODO: from gcc/tree-ssa-loop-im.c, maybe make it global. */ ++ ++static tree * ++simple_mem_ref_in_stmt (gimple *stmt, bool *is_store) ++{ ++ tree *lhs, *rhs; ++ ++ /* Recognize SSA_NAME = MEM and MEM = (SSA_NAME | invariant) patterns. */ ++ if (!gimple_assign_single_p (stmt)) ++ return NULL; ++ ++ lhs = gimple_assign_lhs_ptr (stmt); ++ rhs = gimple_assign_rhs1_ptr (stmt); ++ ++ if (TREE_CODE (*lhs) == SSA_NAME && gimple_vuse (stmt)) ++ { ++ *is_store = false; ++ return rhs; ++ } ++ else if (gimple_vdef (stmt) ++ && (TREE_CODE (*rhs) == SSA_NAME || is_gimple_min_invariant (*rhs))) ++ { ++ *is_store = true; ++ return lhs; ++ } ++ else ++ return NULL; ++} ++ ++static void ++analyse_incremental (gimple *stmt, memref_t* mr) ++{ ++ if (!gimple_assign_single_p (stmt)) ++ return; ++ tree rhs1, rhs2; ++ /* TODO: maybe support other types of stmts. */ ++ while (stmt && is_gimple_assign (stmt)) ++ { ++ enum tree_code def_code = gimple_assign_rhs_code (stmt); ++ gimple_rhs_class rhs_class = gimple_assign_rhs_class (stmt); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Incr: in assign (%s)\n", ++ get_tree_code_name (def_code)); ++ print_gimple_stmt (dump_file, stmt, 3, TDF_DETAILS); ++ } ++ gcc_assert (def_code != ERROR_MARK); ++ switch (rhs_class) ++ { ++ case GIMPLE_TERNARY_RHS: ++ if (dump_file) ++ fprintf (dump_file, "Incr: unsupported trinary rhs\n"); ++ stmt = NULL; ++ break; ++ case GIMPLE_UNARY_RHS: ++ case GIMPLE_SINGLE_RHS: ++ rhs1 = gimple_assign_rhs1 (stmt); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Incr: (%s)", ++ get_tree_code_name (TREE_CODE (rhs1))); ++ print_generic_expr (dump_file, rhs1); ++ fprintf (dump_file, "\n"); ++ } ++ if (def_code == SSA_NAME) ++ stmt = SSA_NAME_DEF_STMT (rhs1); ++ else if (def_code == MEM_REF || def_code == COMPONENT_REF ++ || def_code == ARRAY_REF) ++ { ++ /* If we have dereference in address evaluation, ++ it's indirect memory access. 
*/ ++ if (dump_file) ++ { ++ if (operand_equal_p (mr->mem, rhs1)) ++ fprintf (dump_file, "Incr: the same MEM\n"); ++ else ++ fprintf (dump_file, "Incr: diff MEM\n"); ++ print_generic_expr (dump_file, rhs1); ++ fprintf (dump_file, " "); ++ print_generic_expr (dump_file, mr->mem); ++ fprintf (dump_file, "\n"); ++ } ++ if (operand_equal_p (mr->mem, rhs1) && mr->step) ++ mr->is_incr = true; ++ stmt = NULL; ++ } ++ else ++ { ++ if (dump_file) ++ fprintf (dump_file, "Incr: unsupported unary/single\n"); ++ stmt = NULL; ++ } ++ break; ++ case GIMPLE_BINARY_RHS: ++ rhs1 = gimple_assign_rhs1 (stmt); ++ rhs2 = gimple_assign_rhs2 (stmt); ++ if (dump_file) ++ { ++ fprintf (dump_file, "(%s) (%s)", ++ get_tree_code_name (TREE_CODE (rhs1)), ++ get_tree_code_name (TREE_CODE (rhs2))); ++ print_generic_expr (dump_file, rhs1); ++ fprintf (dump_file, " "); ++ print_generic_expr (dump_file, rhs2); ++ fprintf (dump_file, "\n"); ++ } ++ /* TODO: extend for other types of incrementation. */ ++ if (TREE_CODE (rhs1) == SSA_NAME && TREE_CODE (rhs2) == INTEGER_CST) ++ { ++ stmt = SSA_NAME_DEF_STMT (rhs1); ++ mr->step = rhs2; ++ if (dump_file) ++ { ++ fprintf (dump_file, "Incr: const increment stmt: "); ++ print_gimple_stmt (dump_file, stmt, 3, TDF_DETAILS); ++ } ++ } ++ else ++ stmt = NULL; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ } ++ if ((mr->step && !mr->is_incr) || (!mr->step && mr->is_incr)) ++ { ++ mr->step = NULL_TREE; ++ mr->is_incr = false; ++ } ++} ++ ++static mr_type ++get_memref_type (memref_t *base, memref_t *used, enum tree_code code) ++{ ++ /* TODO: improve memref type detection. */ ++ enum tree_code base_code = TREE_CODE (base->mem); ++ if (dump_file) ++ fprintf (dump_file, "get_memref_type: base=%d,%d used=%d,%d code=%s " ++ "base_code=%s\n", base->mr_id, base->type, ++ used ? used->mr_id : -1, used ? used->type : -1, ++ get_tree_code_name (code), get_tree_code_name (base_code)); ++ if (used) ++ { ++ if (base->type > used->type) ++ return base->type; ++ if (used->type == MR_SIMPLE) ++ return MR_POLYNOMIAL; ++ if (used->type == MR_POLYNOMIAL) ++ return base_code == ARRAY_REF ? MR_POLYNOMIAL : MR_INDIRECT; ++ if (used->type == MR_INDIRECT) ++ return MR_INDIRECT; ++ return MR_UNSUPPORTED; ++ } ++ if (code == MEM_REF || code == ARRAY_REF || code == COMPONENT_REF) ++ return base->type; ++ if (code == POINTER_PLUS_EXPR || code == PLUS_EXPR ++ || code == MINUS_EXPR || code == MULT_EXPR) ++ return base->type <= MR_POLYNOMIAL ? MR_POLYNOMIAL : base->type; ++ return base->type >= MR_INDIRECT ? base->type : MR_INDIRECT; ++} ++ ++/* Recursively walk defs of src expression and record used stmts and other mrs. ++ Return a base address candidate if it's found. 
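++   E.g. for an address computed as t = s->data; t2 = t + i, the load of
++   s->data is recorded as a used memref and its base is returned as the
++   candidate (an illustrative sketch).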
*/ ++ ++static tree ++analyse_addr_eval (tree src, memref_t* mr) ++{ ++ if (TREE_CODE (src) != SSA_NAME) ++ return NULL_TREE; ++ gimple *stmt = SSA_NAME_DEF_STMT (src); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Src_stmt: "); ++ print_gimple_stmt (dump_file, stmt, 0); ++ } ++ if (!is_gimple_assign (stmt)) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, "Is not assign, stop analysis: "); ++ print_gimple_stmt (dump_file, stmt, 3, TDF_DETAILS); ++ } ++ mr->type = MR_UNSUPPORTED; ++ mr->stmts.safe_push (stmt); ++ return NULL_TREE; ++ } ++ enum tree_code def_code = gimple_assign_rhs_code (stmt); ++ if (def_code != MEM_REF && def_code != COMPONENT_REF ++ && def_code != ARRAY_REF) ++ mr->stmts.safe_push (stmt); ++ gimple_rhs_class rhs_class = gimple_assign_rhs_class (stmt); ++ tree rhs1, rhs2, base; ++ if (dump_file) ++ fprintf (dump_file, "In assign (%s): ", get_tree_code_name (def_code)); ++ ++ switch (rhs_class) ++ { ++ case GIMPLE_TERNARY_RHS: ++ if (dump_file) ++ fprintf (dump_file, "Unsupported trinary rhs\n"); ++ mr->type = MR_UNSUPPORTED; ++ return NULL_TREE; ++ case GIMPLE_UNARY_RHS: ++ case GIMPLE_SINGLE_RHS: ++ rhs1 = gimple_assign_rhs1 (stmt); ++ if (dump_file) ++ { ++ fprintf (dump_file, "(%s)", ++ get_tree_code_name (TREE_CODE (rhs1))); ++ print_generic_expr (dump_file, rhs1); ++ fprintf (dump_file, "\n"); ++ } ++ if (def_code == NOP_EXPR) ++ return analyse_addr_eval (rhs1, mr); ++ else if (def_code == MEM_REF || def_code == COMPONENT_REF ++ || def_code == ARRAY_REF) ++ { ++ memref_t *mr2 = get_memref (stmt, rhs1, false); ++ mr->type = get_memref_type (mr, mr2, def_code); ++ for (memref_set::const_iterator it = mr2->used_mrs.begin (); ++ it != mr2->used_mrs.end (); it++) ++ mr->used_mrs.insert (*it); ++ mr->used_mrs.insert (mr2); ++ return mr2->base; ++ } ++ else ++ { ++ if (dump_file) ++ fprintf (dump_file, "Unsupported unary/single\n"); ++ mr->type = MR_UNSUPPORTED; ++ } ++ return NULL_TREE; ++ case GIMPLE_BINARY_RHS: ++ rhs1 = gimple_assign_rhs1 (stmt); ++ rhs2 = gimple_assign_rhs2 (stmt); ++ if (dump_file) ++ { ++ fprintf (dump_file, "(%s) (%s)", ++ get_tree_code_name (TREE_CODE (rhs1)), ++ get_tree_code_name (TREE_CODE (rhs2))); ++ print_generic_expr (dump_file, rhs1); ++ fprintf (dump_file, " "); ++ print_generic_expr (dump_file, rhs2); ++ fprintf (dump_file, "\n"); ++ } ++ base = analyse_addr_eval (rhs1, mr); ++ analyse_addr_eval (rhs2, mr); ++ mr->type = get_memref_type (mr, NULL, def_code); ++ return base; ++ default: ++ gcc_unreachable (); ++ } ++ return NULL_TREE; ++} ++ ++static tree ++get_mem_ref_address_ssa_name (tree mem, tree base) ++{ ++ gcc_assert (TREE_CODE (mem) == MEM_REF); ++ if (base == NULL_TREE) ++ base = get_base_address (mem); ++ tree base_addr = NULL_TREE; ++ if (TREE_CODE (base) == MEM_REF) ++ base_addr = TREE_OPERAND (base, 0); ++ if (base_addr != NULL_TREE && TREE_CODE (base_addr) == SSA_NAME) ++ return base_addr; ++ return NULL_TREE; ++} ++ ++static void ++analyse_mem_ref (gimple *stmt, tree mem, memref_t* mr) ++{ ++ tree base = get_base_address (mem); ++ if (dump_file) ++ fprintf (dump_file, "Codes: base = %s, mem = %s\n", ++ base ? get_tree_code_name (TREE_CODE (base)) : "null", ++ mem ? 
get_tree_code_name (TREE_CODE (mem)) : "null"); ++ ++ mr->stmts.safe_push (stmt); ++ mr->base = base; ++ switch (TREE_CODE (mem)) ++ { ++ case COMPONENT_REF: ++ if (mr->is_store) ++ analyse_incremental (stmt, mr); ++ mr->type = MR_SIMPLE; ++ mr->offset = TREE_OPERAND (mem, 1); ++ return; ++ case ARRAY_REF: ++ analyse_addr_eval (TREE_OPERAND (mem, 1), mr); ++ return; ++ case MEM_REF: ++ { ++ tree base_addr = get_mem_ref_address_ssa_name (mem, base); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Base addr (%s): ", ++ base_addr ? get_tree_code_name (TREE_CODE (base_addr)) ++ : "null"); ++ if (base_addr) ++ print_generic_expr (dump_file, base_addr); ++ fprintf (dump_file, "\n"); ++ } ++ if (base_addr) ++ { ++ mr->base = analyse_addr_eval (base_addr, mr); ++ return; ++ } ++ break; ++ } ++ default: ++ break; ++ } ++ mr->type = MR_UNSUPPORTED; ++ mr->base = NULL_TREE; ++} ++ ++static void ++analyse_stmt (gimple *stmt) ++{ ++ bool is_store; ++ tree *mem = simple_mem_ref_in_stmt (stmt, &is_store); ++ if (!mem) ++ return; ++ if (dump_file) ++ { ++ fprintf (dump_file, "\n%s: mr is found in stmt (%s): ", ++ function_name (cfun), is_store ? "store" : "load"); ++ print_gimple_stmt (dump_file, stmt, 3, TDF_DETAILS); ++ } ++ memref_t *mr = get_memref (stmt, *mem, is_store); ++ (*fmrs_map)[cfun]->insert (mr); ++ if (dump_file) ++ print_memref (mr); ++} ++ ++/* Scan stmts for indirect stores/loads with bases passed as function args. */ ++ ++static void ++collect_memrefs_for_cgraph_node (struct cgraph_node *n) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nCollect indirect ptr info in %s\n", n->dump_name ()); ++ n->get_body (); ++ function *fn = DECL_STRUCT_FUNCTION (n->decl); ++ gcc_assert (fn && n->has_gimple_body_p ()); ++ ++ push_cfun (fn); ++ basic_block bb; ++ gimple_stmt_iterator si; ++ (*fmrs_map)[fn] = new memref_set; ++ FOR_EACH_BB_FN (bb, fn) ++ for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) ++ { ++ gimple *stmt = gsi_stmt (si); ++ analyse_stmt (stmt); ++ } ++ pop_cfun (); ++} ++ ++/* Walk cgraph nodes and collect memory references info. */ ++ ++static void ++collect_memory_references () ++{ ++ struct cgraph_node *n; ++ /* TODO: collect info only for loops and functions in loops. */ ++ FOR_EACH_DEFINED_FUNCTION (n) ++ if (nl_map->count (n) != 0 && n->has_gimple_body_p ()) ++ collect_memrefs_for_cgraph_node (n); ++ ++ if (dump_file) ++ { ++ fprintf (dump_file, "\n\nDump mem references:\n"); ++ FOR_EACH_DEFINED_FUNCTION (n) ++ if (nl_map->count (n) != 0 && n->has_gimple_body_p ()) ++ { ++ function *fn = DECL_STRUCT_FUNCTION (n->decl); ++ fprintf (dump_file, "\nIn function %s (%s):\n", function_name (fn), ++ nl_map->count (n) != 0 ? "in loop" : ""); ++ for (memref_set::const_iterator it = (*fmrs_map)[fn]->begin (); ++ it != (*fmrs_map)[fn]->end (); it++) ++ print_memref (*it); ++ } ++ } ++} ++ ++/* Analysis of loops. */ ++ ++memref_set *current_incr_mrs; ++memref_set *current_indirect_mrs; ++ ++static void ++collect_memref (memref_t *mr, class loop *loop, bool check_loop) ++{ ++ gimple *stmt = mr->stmts[0]; ++ gcc_assert (stmt); ++ if (check_loop && !flow_bb_inside_loop_p (loop, gimple_bb (stmt))) ++ return; ++ ++ /* TODO: Improve base invariant analysis for memrefs which are not local ++ (located in called functions). */ ++ bool is_base_inv = false; ++ if (mr->base) ++ is_base_inv = expr_invariant_in_loop_p (loop, mr->base); ++ ++ if (dump_file && (mr->type == MR_INDIRECT || mr->is_incr)) ++ { ++ fprintf (dump_file, "%s MR (%d): ", mr->is_incr ? 
"INCR" : "INDIRECT", ++ mr->mr_id); ++ print_generic_expr (dump_file, mr->mem); ++ fprintf (dump_file, "\twith base: "); ++ if (mr->base) ++ print_generic_expr (dump_file, mr->base); ++ else ++ fprintf (dump_file, "null"); ++ fprintf (dump_file, " (is_inv=%d)\n", is_base_inv); ++ } ++ ++ if (!is_base_inv) ++ return; ++ if (mr->type == MR_INDIRECT) ++ current_indirect_mrs->insert (mr); ++ if (mr->is_incr) ++ current_incr_mrs->insert (mr); ++} ++ ++static void ++analyse_callable_function (struct cgraph_node *n, class loop *loop) ++{ ++ if (dump_file) ++ fprintf (dump_file, "Callable (%s):\n", n->dump_name ()); ++ ++ function *fn = DECL_STRUCT_FUNCTION (n->decl); ++ if (fmrs_map->count (fn)) ++ for (memref_set::const_iterator it = (*fmrs_map)[fn]->begin (); ++ it != (*fmrs_map)[fn]->end (); it++) ++ collect_memref (*it, loop, false); ++} ++ ++static void ++insert_node_with_callable_nodes (node_set &s, struct cgraph_node *n) ++{ ++ s.insert (n); ++ if (nn_map->count (n) == 0) ++ return; ++ node_set *set = (*nn_map)[n]; ++ for (node_set::const_iterator it = set->begin (); it != set->end (); it++) ++ s.insert ((*it)); ++} ++ ++static bool ++compatible_memrefs_p (memref_t *mr1, memref_t *mr2, bool &compatible_offset) ++{ ++ if (!mr1->base || !mr2->base || !mr2->offset) ++ return false; ++ tree base_type1 = TYPE_MAIN_VARIANT (TREE_TYPE (mr1->base)); ++ tree base_type2 = TYPE_MAIN_VARIANT (TREE_TYPE (mr2->base)); ++ if (base_type1 != base_type2) ++ return false; ++ if (mr1->offset && mr1->offset == mr2->offset) ++ compatible_offset = true; ++ else ++ compatible_offset = false; ++ return true; ++} ++ ++static void ++compare_memrefs (memref_t* mr, memref_t* mr2) ++{ ++ /* TODO: improve analysis of memrefs from different functions: take into ++ account data flow and context. */ ++ bool compatible_offset = false; ++ if (!compatible_memrefs_p (mr, mr2, compatible_offset)) ++ return; ++ if (!compatible_offset) ++ { ++ for (memref_set::const_iterator it = mr->used_mrs.begin (); ++ it != mr->used_mrs.end (); it++) ++ if ((*it)->offset && (*it)->offset == mr2->offset) ++ { ++ compatible_offset = true; ++ if (dump_file) ++ fprintf (dump_file, "Used MR (%d) and INC MR have " ++ "the same offset\n", (*it)->mr_id); ++ break; ++ } ++ } ++ if (!compatible_offset) ++ return; ++ if (dump_file) ++ { ++ fprintf (dump_file, "MR (%d) is optimization candidate with offset: ", ++ mr->mr_id); ++ print_generic_expr (dump_file, mr2->offset); ++ fprintf (dump_file, "\n"); ++ } ++ ++ if (!mr_candidate_map->count (mr)) ++ { ++ (*mr_candidate_map)[mr] = mr2; ++ return; ++ } ++ /* TODO: support analysis with incrementation of different fields. */ ++ if ((*mr_candidate_map)[mr]->offset != mr2->offset) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, "It conflicts with previously found MR (%d) " ++ "with offset ", (*mr_candidate_map)[mr]->mr_id); ++ if ((*mr_candidate_map)[mr] != NULL) ++ print_generic_expr (dump_file, (*mr_candidate_map)[mr]->offset); ++ fprintf (dump_file, ", disable the optimization\n"); ++ } ++ (*mr_candidate_map)[mr] = NULL; ++ } ++} ++ ++/* In the given loop and all functions called from the loop, collect ++ indirect/incremental memrefs with invariant base address and inductive ++ offset. 
*/ ++ ++static void ++collect_memrefs_for_loop (class loop *loop, struct cgraph_node *n, ++ function *fn) ++{ ++ current_incr_mrs = new memref_set; ++ current_indirect_mrs = new memref_set; ++ ++ if (dump_file) ++ fprintf (dump_file, "Loop %d\n", loop->num); ++ if (fmrs_map->count (fn)) ++ for (memref_set::const_iterator it = (*fmrs_map)[fn]->begin (); ++ it != (*fmrs_map)[fn]->end (); it++) ++ collect_memref (*it, loop, true); ++ ++ /* Collect vector of functions called in the loop. */ ++ node_set set; ++ struct cgraph_edge *e; ++ struct cgraph_node *n2; ++ for (e = n->callees; e; e = e->next_callee) ++ { ++ gcall *stmt = e->call_stmt; ++ if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt))) ++ continue; ++ insert_node_with_callable_nodes (set, e->callee); ++ } ++ for (e = n->indirect_calls; e; e = e->next_callee) ++ { ++ gcall *stmt = e->call_stmt; ++ if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt)) ++ || !e->indirect_info->targets) ++ continue; ++ for (unsigned i = 0; e->indirect_info->targets->iterate (i, &n2); ++i) ++ insert_node_with_callable_nodes (set, n2); ++ } ++ if (set.empty ()) ++ return; ++ if (dump_file) ++ fprintf (dump_file, "Go inside all callables of %s\n", n->dump_name ()); ++ ++ for (node_set::const_iterator it = set.begin (); it != set.end (); it++) ++ analyse_callable_function (*it, loop); ++ ++ if (!current_incr_mrs->empty () && !current_indirect_mrs->empty ()) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, "Loop has both incr and indirect memrefs\n" ++ "Incr: "); ++ for (memref_set::const_iterator it = current_incr_mrs->begin (); ++ it != current_incr_mrs->end (); it++) ++ fprintf (dump_file, "%d ", (*it)->mr_id); ++ fprintf (dump_file, "\nIndirect: "); ++ for (memref_set::const_iterator it = current_indirect_mrs->begin (); ++ it != current_indirect_mrs->end (); it++) ++ fprintf (dump_file, "%d ", (*it)->mr_id); ++ fprintf (dump_file, "\n"); ++ } ++ /* Check if indirect memref has a base address similar to one of ++ incremental memref. */ ++ for (memref_set::const_iterator it = current_indirect_mrs->begin (); ++ it != current_indirect_mrs->end (); it++) ++ for (memref_set::const_iterator it2 = current_incr_mrs->begin (); ++ it2 != current_incr_mrs->end (); it2++) ++ compare_memrefs (*it, *it2); ++ } ++ ++ delete current_incr_mrs; ++ delete current_indirect_mrs; ++} ++ ++static void ++analyse_loops_in_cgraph_node (struct cgraph_node *n) ++{ ++ if (dump_file) ++ fprintf (dump_file, "\nAnalyse loops in %s\n", n->dump_name ()); ++ ++ n->get_body (); ++ function *fn = DECL_STRUCT_FUNCTION (n->decl); ++ gcc_assert (fn && n->has_gimple_body_p ()); ++ ++ push_cfun (fn); ++ calculate_dominance_info (CDI_DOMINATORS); ++ loop_optimizer_init (LOOPS_NORMAL); ++ ++ class loop *loop; ++ FOR_EACH_LOOP (loop, 0) ++ { ++ class loop *outer = loop_outer (loop); ++ /* Walk only outermost loops. 
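++	 Memrefs of nested loops are still collected, since collect_memref
++	 accepts any stmt whose block lies inside this outermost loop,
++	 including the blocks of inner loops.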
*/
++	if (outer->num != 0)
++	  continue;
++	collect_memrefs_for_loop (loop, n, fn);
++    }
++
++  free_dominance_info (CDI_DOMINATORS);
++  loop_optimizer_finalize ();
++  pop_cfun ();
++}
++
++static void
++analyse_loops ()
++{
++  if (dump_file)
++    fprintf (dump_file, "\n\nLoops: processing functions\n");
++  cgraph_node *n;
++  FOR_EACH_DEFINED_FUNCTION (n)
++    {
++      if (!can_be_optimized (n))
++	{
++	  if (dump_file)
++	    fprintf (dump_file, "Skip the function\n");
++	  continue;
++	}
++      analyse_loops_in_cgraph_node (n);
++    }
++
++  if (dump_file)
++    fprintf (dump_file, "\n\nList of optimization candidates:\n");
++
++  FOR_EACH_DEFINED_FUNCTION (n)
++    {
++      function *fn = DECL_STRUCT_FUNCTION (n->decl);
++      if (!can_be_optimized (n) || !fmrs_map->count (fn))
++	continue;
++      for (memref_map::iterator it = mr_candidate_map->begin ();
++	   it != mr_candidate_map->end (); ++it)
++	{
++	  memref_t *mr = it->first, *mr2 = it->second;
++	  if (mr2 == NULL || !(*fmrs_map)[fn]->count (mr))
++	    continue;
++	  if (!optimize_mrs_map->count (fn))
++	    (*optimize_mrs_map)[fn] = new memref_set;
++	  (*optimize_mrs_map)[fn]->insert (mr);
++	}
++      if (dump_file && optimize_mrs_map->count (fn))
++	{
++	  fprintf (dump_file, "Function %s\n", n->dump_name ());
++	  for (memref_set::const_iterator it
++		 = (*optimize_mrs_map)[fn]->begin ();
++	       it != (*optimize_mrs_map)[fn]->end (); it++)
++	    {
++	      memref_t *mr = *it, *mr2 = (*mr_candidate_map)[mr];
++	      fprintf (dump_file, "MRs %d,%d with incremental offset ",
++		       mr->mr_id, mr2->mr_id);
++	      print_generic_expr (dump_file, mr2->offset);
++	      fprintf (dump_file, "\n");
++	    }
++	}
++    }
++}
++
++/* Reduce the set by filtering out memrefs with the same memory references;
++   return the result vector of memrefs.  */
++
++static void
++reduce_memref_set (memref_set *set, vec<memref_t *> &vec)
++{
++  for (memref_set::const_iterator it = set->begin ();
++       it != set->end (); it++)
++    {
++      memref_t *mr1 = *it;
++      if (!vec.length ())
++	vec.safe_push (mr1);
++      else
++	{
++	  bool inserted = false;
++	  for (unsigned int i = 0; i < vec.length (); i++)
++	    {
++	      /* mr2 is less than current mr1.  */
++	      memref_t *mr2 = vec[i];
++	      if (operand_equal_p (mr1->mem, mr2->mem))
++		{
++		  if (dump_file)
++		    fprintf (dump_file, "The same mems in MRs %d and %d\n",
++			     mr1->mr_id, mr2->mr_id);
++		  /* TODO: maybe build a new memref which includes stmts of
++		     both mr1 and mr2.  */
++		  if ((mr1->is_store && !mr2->is_store)
++		      || mr1->stmts.length () > mr2->stmts.length ())
++		    {
++		      inserted = true;
++		      vec[i] = mr1;
++		    }
++		}
++	    }
++	  if (!inserted)
++	    vec.safe_push (mr1);
++	}
++    }
++  if (dump_file)
++    {
++      fprintf (dump_file, "MRs (%d) after filtering: ", vec.length ());
++      for (unsigned int i = 0; i < vec.length (); i++)
++	fprintf (dump_file, "%d ", vec[i]->mr_id);
++      fprintf (dump_file, "\n");
++    }
++}
++
++static void
++find_nearest_common_dominator (memref_t *mr, basic_block &dom)
++{
++  for (unsigned int i = 0; i < mr->stmts.length (); i++)
++    {
++      basic_block bb = gimple_bb (mr->stmts[i]);
++      gcc_assert (bb);
++      if (dom == bb)
++	continue;
++      if (dom)
++	dom = nearest_common_dominator (CDI_DOMINATORS, dom, bb);
++      else
++	dom = bb;
++    }
++}
++
++/* Return true if DECL is a parameter or a SSA_NAME for a parameter.
++   TODO: from gcc/tree-inline.c, maybe make it global.
*/ ++ ++static bool ++is_parm (tree decl) ++{ ++ if (TREE_CODE (decl) == SSA_NAME) ++ { ++ decl = SSA_NAME_VAR (decl); ++ if (!decl) ++ return false; ++ } ++ ++ return (TREE_CODE (decl) == PARM_DECL); ++} ++ ++/* TODO: the following functions are inspired by remap in gcc/tree-inline.c, ++ maybe we can share some functionality. */ ++ ++static tree ++remap_name (tree name, gimple *stmt, bool is_lhs) ++{ ++ tree new_tree = NULL_TREE; ++ if (decl_map->count (name)) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, "Find map: "); ++ print_generic_expr (dump_file, name); ++ fprintf (dump_file, " "); ++ print_generic_expr (dump_file, (*decl_map)[name]); ++ fprintf (dump_file, "\n"); ++ } ++ return unshare_expr ((*decl_map)[name]); ++ } ++ if (!is_lhs) ++ return name; ++ if (TREE_CODE (name) == SSA_NAME) ++ { ++ /* Remap anonymous SSA names or SSA names of anonymous decls. */ ++ tree var = SSA_NAME_VAR (name); ++ if (!var ++ || (!SSA_NAME_IS_DEFAULT_DEF (name) ++ && VAR_P (var) && !VAR_DECL_IS_VIRTUAL_OPERAND (var) ++ && DECL_ARTIFICIAL (var) && DECL_IGNORED_P (var) ++ && !DECL_NAME (var))) ++ { ++ new_tree = make_ssa_name (TREE_TYPE (name), stmt); ++ if (!var && SSA_NAME_IDENTIFIER (name)) ++ SET_SSA_NAME_VAR_OR_IDENTIFIER (new_tree, ++ SSA_NAME_IDENTIFIER (name)); ++ SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) ++ = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name); ++ /* So can range-info. */ ++ if (!POINTER_TYPE_P (TREE_TYPE (name)) ++ && SSA_NAME_RANGE_INFO (name)) ++ duplicate_ssa_name_range_info (new_tree, ++ SSA_NAME_RANGE_TYPE (name), ++ SSA_NAME_RANGE_INFO (name)); ++ /* TODO: maybe correct the insertion. */ ++ (*decl_map)[name] = new_tree; ++ if (dump_file) ++ { ++ fprintf (dump_file, "New map (no var): "); ++ print_generic_expr (dump_file, name); ++ fprintf (dump_file, " "); ++ print_generic_expr (dump_file, new_tree); ++ fprintf (dump_file, "\n"); ++ } ++ return new_tree; ++ } ++ /* TODO: maybe remap_name or do the same as before for SSA_NAME_VAR. */ ++ new_tree = make_ssa_name (TREE_TYPE (name), stmt); ++ (*decl_map)[name] = new_tree; ++ if (dump_file) ++ { ++ fprintf (dump_file, "New map: "); ++ print_generic_expr (dump_file, name); ++ fprintf (dump_file, " "); ++ print_generic_expr (dump_file, new_tree); ++ fprintf (dump_file, "\n"); ++ } ++ } ++ else if (VAR_P (name) || TREE_CODE (name) == PARM_DECL) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, "VAR/PARM: "); ++ print_generic_expr (dump_file, name); ++ fprintf (dump_file, "\n"); ++ } ++ return name; ++ } ++ else ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, "Unsupported: "); ++ print_generic_expr (dump_file, name); ++ fprintf (dump_file, "\n"); ++ } ++ //gcc_unreachable (); ++ return name; ++ } ++ return new_tree; ++} ++ ++/* Passed to walk_tree. Copies the node pointed to, if appropriate. */ ++ ++static tree ++ipa_copy_tree_r (tree *tp, int *walk_subtrees, void *data ATTRIBUTE_UNUSED) ++{ ++ enum tree_code code = TREE_CODE (*tp); ++ enum tree_code_class cl = TREE_CODE_CLASS (code); ++ ++ /* We make copies of most nodes. */ ++ if (IS_EXPR_CODE_CLASS (cl) ++ || code == TREE_LIST ++ || code == TREE_VEC ++ || code == TYPE_DECL ++ || code == OMP_CLAUSE) ++ { ++ /* Because the chain gets clobbered when we make a copy, we save it ++ here. */ ++ tree chain = NULL_TREE, new_tree; ++ ++ if (CODE_CONTAINS_STRUCT (code, TS_COMMON)) ++ chain = TREE_CHAIN (*tp); ++ ++ /* Copy the node. */ ++ new_tree = copy_node (*tp); ++ ++ *tp = new_tree; ++ ++ /* Now, restore the chain, if appropriate. 
That will cause ++ walk_tree to walk into the chain as well. */ ++ if (code == PARM_DECL ++ || code == TREE_LIST ++ || code == OMP_CLAUSE) ++ TREE_CHAIN (*tp) = chain; ++ ++ /* For now, we don't update BLOCKs when we make copies. So, we ++ have to nullify all BIND_EXPRs. */ ++ if (TREE_CODE (*tp) == BIND_EXPR) ++ BIND_EXPR_BLOCK (*tp) = NULL_TREE; ++ } ++ else if (code == CONSTRUCTOR || code == STATEMENT_LIST) ++ gcc_unreachable (); ++ else if (TREE_CODE_CLASS (code) == tcc_type ++ || TREE_CODE_CLASS (code) == tcc_declaration ++ || TREE_CODE_CLASS (code) == tcc_constant) ++ *walk_subtrees = 0; ++ return NULL_TREE; ++} ++ ++/* Remap the GIMPLE operand pointed to by *TP. DATA is really a ++ 'struct walk_stmt_info *'. DATA->INFO is a 'gimple *'. ++ WALK_SUBTREES is used to indicate walk_gimple_op whether to keep ++ recursing into the children nodes of *TP. */ ++ ++static tree ++remap_gimple_op_r (tree *tp, int *walk_subtrees, void *data) ++{ ++ struct walk_stmt_info *wi_p = (struct walk_stmt_info *) data; ++ gimple *stmt = (gimple *) wi_p->info; ++ ++ /* For recursive invocations this is no longer the LHS itself. */ ++ bool is_lhs = wi_p->is_lhs; ++ wi_p->is_lhs = false; ++ ++ if (TREE_CODE (*tp) == SSA_NAME) ++ { ++ *tp = remap_name (*tp, stmt, is_lhs); ++ *walk_subtrees = 0; ++ if (is_lhs) ++ SSA_NAME_DEF_STMT (*tp) = wi_p->stmt; ++ return NULL; ++ } ++ else if (auto_var_in_fn_p (*tp, cfun->decl)) ++ { ++ /* Local variables and labels need to be replaced by equivalent ++ variables. We don't want to copy static variables; there's ++ only one of those, no matter how many times we inline the ++ containing function. Similarly for globals from an outer ++ function. */ ++ tree new_decl; ++ ++ /* Remap the declaration. */ ++ new_decl = remap_name (*tp, stmt, is_lhs); ++ gcc_assert (new_decl); ++ /* Replace this variable with the copy. */ ++ STRIP_TYPE_NOPS (new_decl); ++ /* ??? The C++ frontend uses void * pointer zero to initialize ++ any other type. This confuses the middle-end type verification. ++ As cloned bodies do not go through gimplification again the fixup ++ there doesn't trigger. */ ++ if (TREE_CODE (new_decl) == INTEGER_CST ++ && !useless_type_conversion_p (TREE_TYPE (*tp), TREE_TYPE (new_decl))) ++ new_decl = fold_convert (TREE_TYPE (*tp), new_decl); ++ *tp = new_decl; ++ *walk_subtrees = 0; ++ } ++ else if (TREE_CODE (*tp) == STATEMENT_LIST || TREE_CODE (*tp) == SAVE_EXPR) ++ { ++ if (dump_file) ++ { ++ fprintf (dump_file, "Unexpected tree: "); ++ print_generic_expr (dump_file, *tp); ++ fprintf (dump_file, "\n"); ++ } ++ gcc_unreachable (); ++ } ++ else ++ { ++ /* Otherwise, just copy the node. Note that copy_tree_r already ++ knows not to copy VAR_DECLs, etc., so this is safe. */ ++ ++ if (TREE_CODE (*tp) == MEM_REF) ++ { ++ /* We need to re-canonicalize MEM_REFs from inline substitutions ++ that can happen when a pointer argument is an ADDR_EXPR. ++ Recurse here manually to allow that. */ ++ tree ptr = TREE_OPERAND (*tp, 0); ++ tree type = TREE_TYPE (*tp); ++ tree old = *tp; ++ walk_tree (&ptr, remap_gimple_op_r, data, NULL); ++ *tp = fold_build2 (MEM_REF, type, ptr, TREE_OPERAND (*tp, 1)); ++ TREE_THIS_VOLATILE (*tp) = TREE_THIS_VOLATILE (old); ++ TREE_SIDE_EFFECTS (*tp) = TREE_SIDE_EFFECTS (old); ++ TREE_NO_WARNING (*tp) = TREE_NO_WARNING (old); ++ /* TODO: maybe support this case. 
*/
++	  gcc_assert (MR_DEPENDENCE_CLIQUE (old) == 0);
++	  /* We cannot propagate the TREE_THIS_NOTRAP flag if we have
++	     remapped a parameter as the property might be valid only
++	     for the parameter itself.  */
++	  if (TREE_THIS_NOTRAP (old) && (!is_parm (TREE_OPERAND (old, 0))))
++	    TREE_THIS_NOTRAP (*tp) = 1;
++	  REF_REVERSE_STORAGE_ORDER (*tp) = REF_REVERSE_STORAGE_ORDER (old);
++	  *walk_subtrees = 0;
++	  return NULL;
++	}
++
++      /* Here is the "usual case".  Copy this tree node, and then
++	 tweak some special cases.  */
++      ipa_copy_tree_r (tp, walk_subtrees, NULL);
++      gcc_assert (!(TREE_CODE (*tp) == TARGET_EXPR && TREE_OPERAND (*tp, 3)));
++      if (TREE_CODE (*tp) == ADDR_EXPR)
++	{
++	  /* TODO: If this used to be invariant, but is not any longer,
++	     then regimplification is probably needed.  */
++	  walk_tree (&TREE_OPERAND (*tp, 0), remap_gimple_op_r, data, NULL);
++	  recompute_tree_invariant_for_addr_expr (*tp);
++	  *walk_subtrees = 0;
++	}
++    }
++  /* TODO: maybe we need to update TREE_BLOCK (*tp).  */
++
++  /* Keep iterating.  */
++  return NULL_TREE;
++}
++
++static void
++create_cgraph_edge (cgraph_node *n, gimple *stmt)
++{
++  gcall *call_stmt = dyn_cast <gcall *> (stmt);
++  basic_block bb = gimple_bb (stmt);
++  tree decl = gimple_call_fndecl (call_stmt);
++  if (!decl)
++    return;
++  struct cgraph_edge *e = n->create_edge (cgraph_node::get_create (decl),
++					  call_stmt, bb->count);
++  /* TODO: maybe we need to store ipa_call_summary result.  */
++  ipa_call_summaries->get_create (e);
++}
++
++/* Insert prefetch intrinsics in this function, return nonzero on success.  */
++
++static int
++optimize_function (cgraph_node *n, function *fn)
++{
++  /* In a given function, optimize only indirect memrefs with
++     the same incremental memref.
++     TODO: implement the optimization for other cases.  */
++  bool different_incrementals = false;
++  memref_t *first_mr = NULL;
++  memref_set used_mrs;
++  for (memref_set::const_iterator it = (*optimize_mrs_map)[fn]->begin ();
++       it != (*optimize_mrs_map)[fn]->end (); it++)
++    {
++      memref_t *mr = *it;
++      if (!first_mr)
++	first_mr = mr;
++      else if ((*mr_candidate_map)[first_mr] != (*mr_candidate_map)[mr])
++	{
++	  different_incrementals = true;
++	  break;
++	}
++      for (memref_set::const_iterator it2 = mr->used_mrs.begin ();
++	   it2 != mr->used_mrs.end (); it2++)
++	used_mrs.insert (*it2);
++    }
++  if (different_incrementals)
++    {
++      if (dump_file)
++	fprintf (dump_file, "It contains memrefs with different "
++		 "incrementals. Skip the case.\n");
++      return 0;
++    }
++  memref_t *inc_mr = (*mr_candidate_map)[first_mr];
++  if (!inc_mr->stmts[0] || !gimple_assign_single_p (inc_mr->stmts[0]))
++    {
++      if (dump_file)
++	fprintf (dump_file, "Incremental MR with unexpected stmt. "
++		 "Skip the case.\n");
++      return 0;
++    }
++  if (dump_file && !used_mrs.empty ())
++    print_mrs_ids (used_mrs, "Common list of used mrs:\n");
++
++  /* Find a memref in used mrs which corresponds to the found incremental
++     memref.
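++     For the s->pos example above this is the load of s->pos that feeds
++     the address of the indirect access (an illustrative sketch).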
*/
++  memref_t *comp_mr = NULL;
++  for (memref_set::const_iterator it = used_mrs.begin ();
++       it != used_mrs.end (); it++)
++    {
++      bool c_offset;
++      if ((*it)->type != MR_SIMPLE || inc_mr->type != MR_SIMPLE
++	  || !compatible_memrefs_p (*it, inc_mr, c_offset))
++	continue;
++      if (c_offset)
++	{
++	  if (dump_file)
++	    fprintf (dump_file, "Found compatible used MR (%d) and "
++		     "incr MR (%d)\n", (*it)->mr_id, inc_mr->mr_id);
++	  comp_mr = (*it);
++	}
++    }
++  if (!comp_mr || !comp_mr->stmts[0]
++      || !gimple_assign_single_p (comp_mr->stmts[0]))
++    {
++      if (dump_file)
++	fprintf (dump_file, "Compatible MR in this function is not found "
++		 " or it has unexpected stmt. Skip the case.\n");
++      return 0;
++    }
++
++  /* Filter out memrefs with the same memory references.
++     TODO: maybe do the same with used mrs.  */
++  vec<memref_t *> vmrs = vNULL;
++  reduce_memref_set ((*optimize_mrs_map)[fn], vmrs);
++
++  /* Find insertion place.  Create new BB.  */
++  /* TODO: maybe it is useful to process also used_mrs.  */
++  basic_block dom_bb = NULL;
++  for (unsigned int i = 0; i < vmrs.length (); i++)
++    find_nearest_common_dominator (vmrs[i], dom_bb);
++
++  if (!dom_bb)
++    {
++      if (dump_file)
++	fprintf (dump_file, "Dominator bb for MRs is not found. "
++		 "Skip the case.\n");
++      return 0;
++    }
++  else if (dump_file)
++    fprintf (dump_file, "Dominator bb %d for MRs\n", dom_bb->index);
++
++  split_block (dom_bb, (gimple *) NULL);
++  gimple_stmt_iterator gsi = gsi_last_bb (dom_bb);
++
++  /* Create new inc var.  Insert new_var = old_var + step * factor.  */
++  decl_map = new tree_map;
++  gcc_assert (comp_mr->stmts[0] && gimple_assign_single_p (comp_mr->stmts[0]));
++  tree inc_var = gimple_assign_lhs (comp_mr->stmts[0]);
++  gimple_seq stmts = NULL;
++  tree var_type = TREE_TYPE (inc_var);
++  enum tree_code inc_code;
++  if (TREE_CODE (var_type) == POINTER_TYPE)
++    inc_code = POINTER_PLUS_EXPR;
++  else
++    inc_code = PLUS_EXPR;
++  tree step = inc_mr->step;
++  unsigned dist_val = tree_to_uhwi (step) * param_ipa_prefetch_distance_factor;
++  tree dist = build_int_cst (TREE_TYPE (step), dist_val);
++  tree new_inc_var = gimple_build (&stmts, inc_code, var_type, inc_var, dist);
++  (*decl_map)[inc_var] = new_inc_var;
++
++  /* Create other new vars.  Insert new stmts.  */
++  struct walk_stmt_info wi;
++  stmt_set processed_stmts;
++  memref_tree_map mr_new_trees;
++  for (memref_set::const_iterator it = used_mrs.begin ();
++       it != used_mrs.end (); it++)
++    {
++      memref_t *mr = *it;
++      gimple *last_stmt = NULL;
++      if (mr == comp_mr)
++	continue;
++      for (int i = mr->stmts.length () - 1; i >= 0; i--)
++	{
++	  if (processed_stmts.count (mr->stmts[i]))
++	    continue;
++	  processed_stmts.insert (mr->stmts[i]);
++	  if (dump_file)
++	    {
++	      fprintf (dump_file, "Copy stmt %d from used MR (%d):\n",
++		       i, mr->mr_id);
++	      print_gimple_stmt (dump_file, mr->stmts[i], 0);
++	    }
++	  /* Create a new copy of STMT and duplicate STMT's virtual
++	     operands.  */
++	  gimple *copy = gimple_copy (mr->stmts[i]);
++	  gcc_checking_assert (!is_gimple_debug (copy));
++
++	  /* Remap all the operands in COPY.
*/
++	  memset (&wi, 0, sizeof (wi));
++	  last_stmt = copy;
++	  wi.info = copy;
++	  walk_gimple_op (copy, remap_gimple_op_r, &wi);
++	  if (dump_file)
++	    {
++	      fprintf (dump_file, "Stmt %d after remap:\n", i);
++	      print_gimple_stmt (dump_file, copy, 0);
++	    }
++	  gimple_seq_add_stmt (&stmts, copy);
++	}
++      gcc_assert (last_stmt);
++      mr_new_trees[mr] = gimple_assign_lhs (last_stmt);
++      if (dump_file)
++	{
++	  fprintf (dump_file, "MR (%d) new mem: ", mr->mr_id);
++	  print_generic_expr (dump_file, gimple_assign_lhs (last_stmt));
++	  fprintf (dump_file, "\n");
++	}
++    }
++  /* On new load check page fault.  */
++  /* Insert prefetch instructions.  */
++  if (dump_file)
++    fprintf (dump_file, "Evaluate addresses and insert prefetch insn.\n");
++
++  vec<gimple *> pcalls = vNULL;
++  tree local;
++  switch (param_ipa_prefetch_locality)
++    {
++    case 0:
++      local = integer_zero_node;
++      break;
++    case 1:
++      local = integer_one_node;
++      break;
++    case 2:
++      local = build_int_cst (integer_type_node, 2);
++      break;
++    default:
++    case 3:
++      local = integer_three_node;
++      break;
++    }
++  for (unsigned int j = 0; j < vmrs.length (); j++)
++    {
++      memref_t *mr = vmrs[j];
++      /* Don't need to copy the last stmt, since we insert prefetch insn
++	 instead of it.  */
++      for (int i = mr->stmts.length () - 1; i >= 1; i--)
++	{
++	  if (processed_stmts.count (mr->stmts[i]))
++	    continue;
++	  processed_stmts.insert (mr->stmts[i]);
++
++	  gimple *copy = gimple_copy (mr->stmts[i]);
++	  gcc_checking_assert (!is_gimple_debug (copy));
++
++	  /* Remap all the operands in COPY.  */
++	  memset (&wi, 0, sizeof (wi));
++	  wi.info = copy;
++	  walk_gimple_op (copy, remap_gimple_op_r, &wi);
++	  if (dump_file)
++	    {
++	      fprintf (dump_file, "Stmt %d after remap:\n", i);
++	      print_gimple_stmt (dump_file, copy, 0);
++	    }
++	  gimple_seq_add_stmt (&stmts, copy);
++	}
++      gimple *last_stmt = mr->stmts[0];
++      gcc_assert (last_stmt);
++      mr_new_trees[mr] = gimple_assign_lhs (last_stmt);
++      tree write_p = mr->is_store ? integer_one_node : integer_zero_node;
++      tree addr = get_mem_ref_address_ssa_name (mr->mem, NULL_TREE);
++      if (decl_map->count (addr))
++	addr = (*decl_map)[addr];
++      last_stmt = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
++				     3, addr, write_p, local);
++      pcalls.safe_push (last_stmt);
++      gimple_seq_add_stmt (&stmts, last_stmt);
++    }
++
++  gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
++  delete decl_map;
++
++  /* Modify cgraph inserting calls to prefetch intrinsics.
*/
++  for (unsigned i = 0; i < pcalls.length (); i++)
++    create_cgraph_edge (n, pcalls[i]);
++  ipa_update_overall_fn_summary (n);
++
++  return 1;
++}
++
++static int
++insert_prefetch ()
++{
++  int res = 0;
++  cgraph_node *n;
++  FOR_EACH_DEFINED_FUNCTION (n)
++    {
++      function *fn = DECL_STRUCT_FUNCTION (n->decl);
++      if (!optimize_mrs_map->count (fn))
++	continue;
++      if (dump_file)
++	fprintf (dump_file, "Optimize function %s\n", n->dump_name ());
++      push_cfun (DECL_STRUCT_FUNCTION (n->decl));
++      calculate_dominance_info (CDI_DOMINATORS);
++      res |= optimize_function (n, fn);
++      free_dominance_info (CDI_DOMINATORS);
++      pop_cfun ();
++    }
++  return res;
++}
++
++static unsigned int
++ipa_prefetch (void)
++{
++  if (!targetm.have_prefetch ())
++    {
++      if (dump_file)
++	fprintf (dump_file, "Prefetch is not supported by the target.\n");
++      return 0;
++    }
++
++  unsigned int ret = 0;
++  el_map = new edge_in_loop;
++  nl_map = new node_in_loop;
++  icn_map = new node_to_iedge_map;
++  nn_map = new node_to_node_map;
++  tm_map = new tree_memref_map;
++  fmrs_map = new funct_mrs_map;
++  mr_candidate_map = new memref_map;
++  optimize_mrs_map = new funct_mrs_map;
++
++  max_mr_id = 0;
++  /* TODO: check if we really need this init.  */
++  if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
++    {
++      tree type = build_function_type_list (void_type_node,
++					    const_ptr_type_node, NULL_TREE);
++      tree decl = add_builtin_function ("__builtin_prefetch", type,
++					BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
++					NULL, NULL_TREE);
++      DECL_IS_NOVOPS (decl) = true;
++      set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
++    }
++
++  analyse_cgraph ();
++  prepare_indirect_call_info ();
++  propagate_loop_info_in_cgraph ();
++  collect_memory_references ();
++  analyse_loops ();
++
++  /* TODO: implement some specific heuristics.  */
++  if (!optimize_mrs_map->empty ())
++    ret = insert_prefetch ();
++
++  delete el_map;
++  delete nl_map;
++  for (node_to_iedge_map::iterator it = icn_map->begin ();
++       it != icn_map->end (); ++it)
++    delete it->second;
++  delete icn_map;
++  for (node_to_node_map::iterator it = nn_map->begin ();
++       it != nn_map->end (); ++it)
++    delete it->second;
++  delete nn_map;
++  for (tree_memref_map::iterator it = tm_map->begin ();
++       it != tm_map->end (); ++it)
++    delete it->second;
++  delete tm_map;
++  for (funct_mrs_map::iterator it = fmrs_map->begin ();
++       it != fmrs_map->end (); ++it)
++    delete it->second;
++  delete fmrs_map;
++  delete mr_candidate_map;
++  delete optimize_mrs_map;
++
++  /* TODO: maybe add other todos.  */
++  return ret | TODO_verify_all;
++}
++
++const pass_data pass_data_ipa_prefetch =
++{
++  SIMPLE_IPA_PASS, // type
++  "ipa_prefetch", // name
++  OPTGROUP_NONE, // optinfo_flags
++  TV_IPA_PREFETCH, // tv_id
++  0, // properties_required
++  0, // properties_provided
++  0, // properties_destroyed
++  0, // todo_flags_start
++  0, // todo_flags_finish
++};
++
++class pass_ipa_prefetch : public simple_ipa_opt_pass
++{
++public:
++  pass_ipa_prefetch (gcc::context *ctxt)
++    : simple_ipa_opt_pass (pass_data_ipa_prefetch, ctxt)
++  {}
++
++  /* opt_pass methods: */
++  virtual bool gate (function *);
++  virtual unsigned int execute (function *)
++  {
++    return ipa_prefetch ();
++  }
++}; // class pass_ipa_prefetch
++
++bool
++pass_ipa_prefetch::gate (function *)
++{
++  return (optimize >= 3
++	  && flag_ipa_prefetch
++	  /* Don't bother doing anything if the program has errors.  */
++	  && !seen_error ()
++	  && flag_lto_partition == LTO_PARTITION_ONE
++	  /* Only enable this optimization in lto or whole_program.
*/ ++ && (in_lto_p || flag_whole_program)); ++} ++ ++} // anon namespace ++ ++simple_ipa_opt_pass * ++make_pass_ipa_prefetch (gcc::context *ctxt) ++{ ++ return new pass_ipa_prefetch (ctxt); ++} +diff --git a/gcc/ipa-sra.c b/gcc/ipa-sra.c +index 1cb30afc3..d7019ec42 100644 +--- a/gcc/ipa-sra.c ++++ b/gcc/ipa-sra.c +@@ -3090,6 +3090,14 @@ process_edge_to_unknown_caller (cgraph_edge *cs) + gcc_checking_assert (from_ifs); + isra_call_summary *csum = call_sums->get (cs); + ++ /* TODO: implement better support for call edges inserted after summary ++ collection but before sra wpa invocation. */ ++ if (!csum) ++ { ++ csum = call_sums->get_create (cs); ++ csum->m_return_ignored = true; ++ } ++ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Processing an edge to an unknown caller from %s:\n", + cs->caller->dump_name ()); +diff --git a/gcc/params.opt b/gcc/params.opt +index 227175eef..1fe9513eb 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -246,6 +246,14 @@ Maximum pieces that IPA-SRA tracks per formal parameter, as a consequence, also + Common Joined UInteger Var(param_ipa_sra_ptr_growth_factor) Init(2) Param Optimization + Maximum allowed growth of number and total size of new parameters that ipa-sra replaces a pointer to an aggregate with. + ++-param=ipa-prefetch-distance-factor= ++Common Joined UInteger Var(param_ipa_prefetch_distance_factor) Init(4) Param Optimization ++The factor represents the number of inductive variable incrementations to evaluate an indirect memory address for IPA prefetch. ++ ++-param=ipa-prefetch-locality= ++Common Joined UInteger Var(param_ipa_prefetch_locality) Init(3) Param Optimization ++The flag represents temporal locality values in the following way: 0:pstl1strm, 1:pstl3keep, 2:pstl2keep, 3:pstl1keep. ++ + -param=ira-loop-reserved-regs= + Common Joined UInteger Var(param_ira_loop_reserved_regs) Init(2) Param Optimization + The number of registers in each class kept unused by loop invariant motion. +diff --git a/gcc/passes.def b/gcc/passes.def +index 4e6a58634..b8739ab3c 100644 +--- a/gcc/passes.def ++++ b/gcc/passes.def +@@ -154,6 +154,7 @@ along with GCC; see the file COPYING3. 
If not see + NEXT_PASS (pass_ipa_icf); + NEXT_PASS (pass_ipa_devirt); + NEXT_PASS (pass_ipa_icp); ++ NEXT_PASS (pass_ipa_prefetch); + NEXT_PASS (pass_ipa_cp); + NEXT_PASS (pass_ipa_sra); + NEXT_PASS (pass_ipa_cdtor_merge); +diff --git a/gcc/timevar.def b/gcc/timevar.def +index 4059c57d5..98cf3f59e 100644 +--- a/gcc/timevar.def ++++ b/gcc/timevar.def +@@ -81,6 +81,7 @@ DEFTIMEVAR (TV_IPA_CONSTANT_PROP , "ipa cp") + DEFTIMEVAR (TV_IPA_INLINING , "ipa inlining heuristics") + DEFTIMEVAR (TV_IPA_FNSPLIT , "ipa function splitting") + DEFTIMEVAR (TV_IPA_COMDATS , "ipa comdats") ++DEFTIMEVAR (TV_IPA_PREFETCH , "ipa prefetch") + DEFTIMEVAR (TV_IPA_STRUCT_REORG , "ipa struct reorg optimization") + DEFTIMEVAR (TV_IPA_EXTEND_AUTO_PROFILE, "ipa extend auto profile") + DEFTIMEVAR (TV_IPA_OPT , "ipa various optimizations") +diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h +index 55d9dd668..a84924756 100644 +--- a/gcc/tree-pass.h ++++ b/gcc/tree-pass.h +@@ -513,6 +513,7 @@ extern ipa_opt_pass_d *make_pass_ipa_odr (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_reference (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_hsa (gcc::context *ctxt); + extern ipa_opt_pass_d *make_pass_ipa_pure_const (gcc::context *ctxt); ++extern simple_ipa_opt_pass *make_pass_ipa_prefetch (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_struct_reorg (gcc::context *ctxt); + extern simple_ipa_opt_pass *make_pass_ipa_extend_auto_profile (gcc::context + *ctxt); +-- +2.33.0 + diff --git a/0159-Implement-AES-pattern-matching.patch b/0159-Implement-AES-pattern-matching.patch new file mode 100644 index 0000000..0926ad2 --- /dev/null +++ b/0159-Implement-AES-pattern-matching.patch @@ -0,0 +1,233 @@ +From 3a48cd1be0915a0fabbfb3a30bd9b67ccd5c65d3 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia WX1215920 +Date: Tue, 12 Dec 2023 10:41:12 +0800 +Subject: [PATCH 6/6] Implement AES pattern matching + +--- + gcc/Makefile.in | 1 + + gcc/common.opt | 4 ++++ + gcc/config/aarch64/aarch64.c | 24 +++++++++++++++++++++ + gcc/doc/tm.texi | 29 +++++++++++++++++++++++++ + gcc/doc/tm.texi.in | 12 +++++++++++ + gcc/passes.def | 1 + + gcc/target.def | 41 ++++++++++++++++++++++++++++++++++++ + gcc/timevar.def | 1 + + gcc/tree-pass.h | 1 + + 9 files changed, 114 insertions(+) + +diff --git a/gcc/Makefile.in b/gcc/Makefile.in +index 31bf2cde2..75b28722e 100644 +--- a/gcc/Makefile.in ++++ b/gcc/Makefile.in +@@ -1288,6 +1288,7 @@ OBJS = \ + cgraphunit.o \ + cgraphclones.o \ + combine.o \ ++ crypto-accel.o \ + combine-stack-adj.o \ + compare-elim.o \ + context.o \ +diff --git a/gcc/common.opt b/gcc/common.opt +index 36b016253..eb995f701 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1069,6 +1069,10 @@ floop-crc + Common Report Var(flag_loop_crc) Optimization + Do the loop crc conversion. + ++fcrypto-accel-aes ++Common Report Var(flag_crypto_accel_aes) Init(0) Optimization ++Perform crypto acceleration AES pattern matching. ++ + fauto-inc-dec + Common Report Var(flag_auto_inc_dec) Init(1) Optimization + Generate auto-inc/dec instructions. 
+diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
+index ae9e0802b..75efbcb97 100644
+--- a/gcc/config/aarch64/aarch64.c
++++ b/gcc/config/aarch64/aarch64.c
+@@ -23894,6 +23894,30 @@ is_aarch64_stp_insn (int icode)
+   return false;
+ }
+ 
++machine_mode
++aarch64_get_v16qi_mode ()
++{
++  return V16QImode;
++}
++
++#undef TARGET_GET_V16QI_MODE
++#define TARGET_GET_V16QI_MODE aarch64_get_v16qi_mode
++
++#undef TARGET_GEN_REV32V16QI
++#define TARGET_GEN_REV32V16QI gen_aarch64_rev32v16qi
++
++#undef TARGET_GEN_AESEV16QI
++#define TARGET_GEN_AESEV16QI gen_aarch64_crypto_aesev16qi
++
++#undef TARGET_GEN_AESDV16QI
++#define TARGET_GEN_AESDV16QI gen_aarch64_crypto_aesdv16qi
++
++#undef TARGET_GEN_AESMCV16QI
++#define TARGET_GEN_AESMCV16QI gen_aarch64_crypto_aesmcv16qi
++
++#undef TARGET_GEN_AESIMCV16QI
++#define TARGET_GEN_AESIMCV16QI gen_aarch64_crypto_aesimcv16qi
++
+ #undef TARGET_IS_LDP_INSN
+ #define TARGET_IS_LDP_INSN is_aarch64_ldp_insn
+ 
+diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
+index ac1d665c5..4a998aa76 100644
+--- a/gcc/doc/tm.texi
++++ b/gcc/doc/tm.texi
+@@ -11870,6 +11870,35 @@ object files that are not referenced from @code{main} and uses export
+ lists.
+ @end defmac
+ 
++@deftypefn {Target Hook} machine_mode TARGET_GET_V16QI_MODE ()
++This function returns the 16-byte-element vector mode if the target supports it.
++@end deftypefn
++
++@deftypefn {Target Hook} rtx TARGET_GEN_REV32V16QI (rtx @var{dest}, rtx @var{src})
++This function generates the byte reverse instruction
++ of a 16-byte-element vector if the target supports it.
++@end deftypefn
++
++@deftypefn {Target Hook} rtx TARGET_GEN_AESEV16QI (rtx @var{dest}, rtx @var{src1}, rtx @var{src2})
++This function generates the AES encryption instruction
++ of a 16-byte-element vector if the target supports it.
++@end deftypefn
++
++@deftypefn {Target Hook} rtx TARGET_GEN_AESDV16QI (rtx @var{dest}, rtx @var{src1}, rtx @var{src2})
++This function generates the AES decryption instruction
++ of a 16-byte-element vector if the target supports it.
++@end deftypefn
++
++@deftypefn {Target Hook} rtx TARGET_GEN_AESMCV16QI (rtx @var{dest}, rtx @var{src})
++This function generates the AES mix columns instruction
++ of a 16-byte-element vector if the target supports it.
++@end deftypefn
++
++@deftypefn {Target Hook} rtx TARGET_GEN_AESIMCV16QI (rtx @var{dest}, rtx @var{src})
++This function generates the AES inverse mix columns instruction
++ of a 16-byte-element vector if the target supports it.
++@end deftypefn
++
+ @deftypefn {Target Hook} bool TARGET_IS_LDP_INSN (int @var{icode})
+ Return true if icode is corresponding to any of the LDP instruction types.
+ @end deftypefn
+diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
+index 0cd70dda4..f7094d8c2 100644
+--- a/gcc/doc/tm.texi.in
++++ b/gcc/doc/tm.texi.in
+@@ -8010,6 +8010,18 @@ object files that are not referenced from @code{main} and uses export
+ lists.
+ @end defmac
+ 
++@hook TARGET_GET_V16QI_MODE
++
++@hook TARGET_GEN_REV32V16QI
++
++@hook TARGET_GEN_AESEV16QI
++
++@hook TARGET_GEN_AESDV16QI
++
++@hook TARGET_GEN_AESMCV16QI
++
++@hook TARGET_GEN_AESIMCV16QI
++
+ @hook TARGET_IS_LDP_INSN
+ 
+ @hook TARGET_IS_STP_INSN
+diff --git a/gcc/passes.def b/gcc/passes.def
+index ba13d897c..da5d71646 100644
+--- a/gcc/passes.def
++++ b/gcc/passes.def
+@@ -448,6 +448,7 @@ along with GCC; see the file COPYING3.
If not see
+       NEXT_PASS (pass_rtl_fwprop_addr);
+       NEXT_PASS (pass_inc_dec);
+       NEXT_PASS (pass_initialize_regs);
++      NEXT_PASS (pass_crypto_accel);
+       NEXT_PASS (pass_ud_rtl_dce);
+       NEXT_PASS (pass_combine);
+       NEXT_PASS (pass_if_after_combine);
+diff --git a/gcc/target.def b/gcc/target.def
+index 48c8a8234..b4dff78ea 100644
+--- a/gcc/target.def
++++ b/gcc/target.def
+@@ -2727,6 +2727,47 @@ modes and they have different conditional execution capability, such as ARM.",
+ bool, (void),
+ default_have_conditional_execution)
+ 
++DEFHOOK
++(get_v16qi_mode,
++ "This function returns the 16-byte-element vector mode if the target supports it.",
++ machine_mode, (),
++ NULL)
++
++DEFHOOK
++(gen_rev32v16qi,
++ "This function generates the byte reverse instruction\n\
++ of a 16-byte-element vector if the target supports it.",
++ rtx, (rtx dest, rtx src),
++ NULL)
++
++DEFHOOK
++(gen_aesev16qi,
++ "This function generates the AES encryption instruction\n\
++ of a 16-byte-element vector if the target supports it.",
++ rtx, (rtx dest, rtx src1, rtx src2),
++ NULL)
++
++DEFHOOK
++(gen_aesdv16qi,
++ "This function generates the AES decryption instruction\n\
++ of a 16-byte-element vector if the target supports it.",
++ rtx, (rtx dest, rtx src1, rtx src2),
++ NULL)
++
++DEFHOOK
++(gen_aesmcv16qi,
++ "This function generates the AES mix columns instruction\n\
++ of a 16-byte-element vector if the target supports it.",
++ rtx, (rtx dest, rtx src),
++ NULL)
++
++DEFHOOK
++(gen_aesimcv16qi,
++ "This function generates the AES inverse mix columns instruction\n\
++ of a 16-byte-element vector if the target supports it.",
++ rtx, (rtx dest, rtx src),
++ NULL)
++
+ DEFHOOK
+ (is_ldp_insn,
+ "Return true if icode is corresponding to any of the LDP instruction types.",
+diff --git a/gcc/timevar.def b/gcc/timevar.def
+index 24caf1b5d..9ca74dffe 100644
+--- a/gcc/timevar.def
++++ b/gcc/timevar.def
+@@ -258,6 +258,7 @@ DEFTIMEVAR (TV_AUTO_INC_DEC          , "auto inc dec")
+ DEFTIMEVAR (TV_CSE2                  , "CSE 2")
+ DEFTIMEVAR (TV_BRANCH_PROB           , "branch prediction")
+ DEFTIMEVAR (TV_COMBINE               , "combiner")
++DEFTIMEVAR (TV_CRYPTO_ACCEL          , "crypto accel")
+ DEFTIMEVAR (TV_IFCVT                 , "if-conversion")
+ DEFTIMEVAR (TV_MODE_SWITCH           , "mode switching")
+ DEFTIMEVAR (TV_SMS                   , "sms modulo scheduling")
+diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
+index 232a3fdf6..29dc7e34b 100644
+--- a/gcc/tree-pass.h
++++ b/gcc/tree-pass.h
+@@ -570,6 +570,7 @@ extern rtl_opt_pass *make_pass_cse2 (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_df_initialize_opt (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_df_initialize_no_opt (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_reginfo_init (gcc::context *ctxt);
++extern rtl_opt_pass *make_pass_crypto_accel (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_inc_dec (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_stack_ptr_mod (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_initialize_regs (gcc::context *ctxt);
+-- 
+2.33.0
+
diff --git a/0160-AES-Add-lost-files.patch b/0160-AES-Add-lost-files.patch
new file mode 100644
index 0000000..ec231cf
--- /dev/null
+++ b/0160-AES-Add-lost-files.patch
@@ -0,0 +1,3746 @@
+From be61a16a0aa4ad207513726a1ee056f384570c00 Mon Sep 17 00:00:00 2001
+From: Xiong Zhou
+Date: Tue, 12 Dec 2023 11:59:34 +0800
+Subject: [PATCH] [AES] Add lost files.
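+
+The new files implement the matcher and its RTL helpers.  The tables in
+crypto-accel.c are the standard byte-sliced AES lookup tables, and the
+pass searches for the classical table-driven round built from them, as
+in this illustrative sketch (s0..s3 and rk stand for the state words
+and the round keys):
+
+  t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff]
+       ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[4];
+
+Recognized rounds can then be rewritten through the target hooks added
+in the previous patch (TARGET_GEN_AESEV16QI etc.); the transformation
+is controlled by the new -fcrypto-accel-aes flag.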
+
+---
+ gcc/crypto-accel.c                            | 2415 +++++++++++++++++
+ gcc/rtl-matcher.h                             |  367 +++
+ .../gcc.target/aarch64/aes-decrypt.c          |  478 ++++
+ .../gcc.target/aarch64/aes-encrypt.c          |  443 +++
+ 4 files changed, 3703 insertions(+)
+ create mode 100644 gcc/crypto-accel.c
+ create mode 100644 gcc/rtl-matcher.h
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/aes-decrypt.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/aes-encrypt.c
+
+diff --git a/gcc/crypto-accel.c b/gcc/crypto-accel.c
+new file mode 100644
+index 000000000..f4e810a6b
+--- /dev/null
++++ b/gcc/crypto-accel.c
+@@ -0,0 +1,2415 @@
++/* Crypto-pattern optimizer.
++   Copyright (C) 2003-2023 Free Software Foundation, Inc.
++
++This file is part of GCC.
++
++GCC is free software; you can redistribute it and/or modify it under
++the terms of the GNU General Public License as published by the Free
++Software Foundation; either version 3, or (at your option) any later
++version.
++
++GCC is distributed in the hope that it will be useful, but WITHOUT ANY
++WARRANTY; without even the implied warranty of MERCHANTABILITY or
++FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
++for more details.
++
++You should have received a copy of the GNU General Public License
++along with GCC; see the file COPYING3.  If not see
++<http://www.gnu.org/licenses/>.  */
++
++#include "config.h"
++#define INCLUDE_VECTOR
++#define INCLUDE_MAP
++#define INCLUDE_SET
++#define INCLUDE_ALGORITHM
++#include "system.h"
++#include "coretypes.h"
++#include "backend.h"
++#include "target.h"
++#include "rtl.h"
++#include "tree.h"
++#include "df.h"
++#include "memmodel.h"
++#include "optabs.h"
++#include "regs.h"
++#include "emit-rtl.h"
++#include "recog.h"
++#include "cfgrtl.h"
++#include "cfgcleanup.h"
++#include "expr.h"
++#include "tree-pass.h"
++#include "rtl-matcher.h"
++
++/* Basic AES table description.  */
++struct aes_table
++{
++  /* Number of elements per table.  */
++  static const unsigned int table_nelts = 256;
++  /* Number of tables.  */
++  static const unsigned int basic_tables_num = 4;
++  /* Number of rounds.  */
++  static const unsigned int rounds_num = 4;
++  /* Common ID for wrong table.  */
++  static const unsigned int BAD_TABLE = -1;
++
++  typedef const unsigned int table_type[table_nelts];
++  typedef table_type *table_map[basic_tables_num];
++
++  template <typename T>
++  static bool is_basic_table (tree ctor, const T ethalon[table_nelts])
++  {
++    if (TREE_CODE (ctor) != CONSTRUCTOR
++	|| CONSTRUCTOR_NELTS (ctor) != table_nelts)
++      return false;
++
++    unsigned ix;
++    tree val;
++    FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (ctor), ix, val)
++      if (TREE_INT_CST_LOW (val) != ethalon[ix])
++	return false;
++    return true;
++  }
++
++  static unsigned check_table (tree ctor,
++			       table_map tables)
++  {
++    for (unsigned i = 0; i < 4; ++i)
++      if (is_basic_table (ctor, *tables[i]))
++	return i;
++    return BAD_TABLE;
++  }
++};
++
++/* AES encryption info.  */
++struct aes_encrypt_table : aes_table
++{
++  typedef enum
++  {
++    TE0,
++    TE1,
++    TE2,
++    TE3,
++    BAD_TABLE = aes_table::BAD_TABLE
++  } table_entry;
++
++  static table_type Te0;
++  static table_type Te1;
++  static table_type Te2;
++  static table_type Te3;
++
++  static table_map tables;
++  static table_entry rounds[rounds_num];
++  static table_entry final_rounds[rounds_num];
++
++  static table_entry get_table_id (tree ctor)
++  {
++    return static_cast<table_entry> (check_table (ctor, tables));
++  }
++};
++
++/* AES decryption info.
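++   Unlike encryption, the final round uses the single byte table Td4,
++   which may be shrunk to a STRING_CST by earlier optimizations
++   (see is_td4 below).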
*/
++struct aes_decrypt_table : aes_table
++{
++  typedef enum
++  {
++    TD0,
++    TD1,
++    TD2,
++    TD3,
++    TD4,
++    BAD_TABLE = aes_table::BAD_TABLE
++  } table_entry;
++
++  static table_type Td0;
++  static table_type Td1;
++  static table_type Td2;
++  static table_type Td3;
++
++  static table_map tables;
++  static table_entry rounds[rounds_num];
++  static table_entry final_rounds[rounds_num];
++
++  static const unsigned char Td4[table_nelts];
++
++  /* TD4 requires special handler due to type shrinking optimizations.  */
++  static bool is_td4 (tree ctor)
++  {
++    if (is_basic_table (ctor, Td4))
++      return true;
++
++    if (TREE_CODE (ctor) != STRING_CST
++	|| TREE_STRING_LENGTH (ctor) != table_nelts)
++      return false;
++
++    const unsigned char *p
++      = (const unsigned char *) TREE_STRING_POINTER (ctor);
++    for (int i = 0; i < TREE_STRING_LENGTH (ctor); ++i)
++      if (p[i] != Td4[i])
++	return false;
++
++    return true;
++  }
++
++  static table_entry get_table_id (tree ctor)
++  {
++    unsigned int res = check_table (ctor, tables);
++    if (res == aes_table::BAD_TABLE
++	&& is_td4 (ctor))
++      return TD4;
++    return static_cast<table_entry> (res);
++  }
++};
++
++/* Basic tables info.  */
++aes_encrypt_table::table_map aes_encrypt_table::tables
++  = { &Te0, &Te1, &Te2, &Te3 };
++aes_decrypt_table::table_map aes_decrypt_table::tables
++  = { &Td0, &Td1, &Td2, &Td3 };
++
++/* Round tables permutations info.  */
++aes_encrypt_table::table_entry aes_encrypt_table::rounds[]
++  = {TE0, TE1, TE2, TE3};
++aes_decrypt_table::table_entry aes_decrypt_table::rounds[]
++  = {TD0, TD1, TD2, TD3};
++aes_encrypt_table::table_entry aes_encrypt_table::final_rounds[]
++  = {TE2, TE3, TE0, TE1};
++aes_decrypt_table::table_entry aes_decrypt_table::final_rounds[]
++  = {TD4, TD4, TD4, TD4};
++
++aes_encrypt_table::table_type aes_encrypt_table::Te0 = {
++  0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
++  0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
++  0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
++  0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
++  0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
++  0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
++  0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
++  0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
++  0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
++  0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
++  0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
++  0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
++  0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
++  0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
++  0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
++  0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
++  0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
++  0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
++  0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
++  0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
++  0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
++  0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
++  0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
++  0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
++  0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
++  0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
++  0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
++  0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
++  0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
++  0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
++  0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
++  0x20101030U, 0xe5ffff1aU,
0xfdf3f30eU, 0xbfd2d26dU, ++ 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU, ++ 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U, ++ 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U, ++ 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U, ++ 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU, ++ 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U, ++ 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU, ++ 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U, ++ 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU, ++ 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U, ++ 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U, ++ 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU, ++ 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U, ++ 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U, ++ 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U, ++ 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U, ++ 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U, ++ 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U, ++ 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U, ++ 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U, ++ 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU, ++ 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U, ++ 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U, ++ 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U, ++ 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U, ++ 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U, ++ 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U, ++ 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU, ++ 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U, ++ 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U, ++ 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U, ++ 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU, ++}; ++ ++aes_encrypt_table::table_type aes_encrypt_table::Te1 = { ++ 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU, ++ 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U, ++ 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU, ++ 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U, ++ 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU, ++ 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U, ++ 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU, ++ 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U, ++ 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U, ++ 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU, ++ 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U, ++ 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U, ++ 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U, ++ 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU, ++ 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U, ++ 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U, ++ 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU, ++ 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U, ++ 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U, ++ 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U, ++ 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU, ++ 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU, ++ 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U, ++ 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU, ++ 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU, ++ 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U, ++ 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU, ++ 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U, ++ 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU, ++ 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U, ++ 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 
0x63422121U, ++ 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U, ++ 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU, ++ 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U, ++ 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU, ++ 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U, ++ 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU, ++ 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U, ++ 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U, ++ 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU, ++ 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU, ++ 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU, ++ 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U, ++ 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U, ++ 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU, ++ 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U, ++ 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU, ++ 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U, ++ 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU, ++ 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U, ++ 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU, ++ 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU, ++ 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U, ++ 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU, ++ 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U, ++ 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU, ++ 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U, ++ 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U, ++ 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U, ++ 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU, ++ 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU, ++ 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U, ++ 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU, ++ 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U, ++}; ++ ++aes_encrypt_table::table_type aes_encrypt_table::Te2 = { ++ 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU, ++ 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U, ++ 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU, ++ 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U, ++ 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU, ++ 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U, ++ 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU, ++ 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U, ++ 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U, ++ 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU, ++ 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U, ++ 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U, ++ 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U, ++ 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU, ++ 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U, ++ 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U, ++ 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU, ++ 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U, ++ 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U, ++ 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U, ++ 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU, ++ 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU, ++ 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U, ++ 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU, ++ 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU, ++ 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U, ++ 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU, ++ 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U, ++ 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU, ++ 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U, ++ 
0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U, ++ 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U, ++ 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU, ++ 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U, ++ 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU, ++ 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U, ++ 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU, ++ 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U, ++ 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U, ++ 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU, ++ 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU, ++ 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU, ++ 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U, ++ 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U, ++ 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU, ++ 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U, ++ 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU, ++ 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U, ++ 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU, ++ 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U, ++ 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU, ++ 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU, ++ 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U, ++ 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU, ++ 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U, ++ 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU, ++ 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U, ++ 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U, ++ 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U, ++ 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU, ++ 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU, ++ 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U, ++ 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU, ++ 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U, ++}; ++ ++aes_encrypt_table::table_type aes_encrypt_table::Te3 = { ++ 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U, ++ 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U, ++ 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U, ++ 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU, ++ 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU, ++ 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU, ++ 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U, ++ 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU, ++ 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU, ++ 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U, ++ 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U, ++ 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU, ++ 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU, ++ 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU, ++ 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU, ++ 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU, ++ 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U, ++ 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU, ++ 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU, ++ 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U, ++ 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U, ++ 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U, ++ 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U, ++ 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U, ++ 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU, ++ 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U, ++ 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU, ++ 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU, ++ 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U, ++ 0x9292ad3fU, 
0x9d9dbc21U, 0x38384870U, 0xf5f504f1U, ++ 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U, ++ 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU, ++ 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U, ++ 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU, ++ 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU, ++ 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U, ++ 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U, ++ 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU, ++ 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U, ++ 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU, ++ 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U, ++ 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U, ++ 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U, ++ 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U, ++ 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU, ++ 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U, ++ 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU, ++ 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U, ++ 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU, ++ 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U, ++ 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU, ++ 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU, ++ 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU, ++ 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU, ++ 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U, ++ 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U, ++ 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U, ++ 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U, ++ 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U, ++ 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U, ++ 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU, ++ 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U, ++ 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU, ++ 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU, ++}; ++ ++aes_decrypt_table::table_type aes_decrypt_table::Td0 = { ++ 0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U, ++ 0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U, ++ 0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U, ++ 0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU, ++ 0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U, ++ 0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U, ++ 0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU, ++ 0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U, ++ 0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU, ++ 0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U, ++ 0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U, ++ 0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U, ++ 0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U, ++ 0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU, ++ 0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U, ++ 0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU, ++ 0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U, ++ 0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU, ++ 0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U, ++ 0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U, ++ 0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U, ++ 0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU, ++ 0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U, ++ 0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU, ++ 0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U, ++ 0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU, ++ 0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U, ++ 0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU, ++ 0x0c0a67b1U, 0x9357e70fU, 
0xb4ee96d2U, 0x1b9b919eU, ++ 0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U, ++ 0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU, ++ 0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U, ++ 0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU, ++ 0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U, ++ 0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U, ++ 0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U, ++ 0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU, ++ 0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U, ++ 0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U, ++ 0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU, ++ 0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U, ++ 0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U, ++ 0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U, ++ 0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U, ++ 0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U, ++ 0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU, ++ 0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U, ++ 0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U, ++ 0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U, ++ 0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U, ++ 0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U, ++ 0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU, ++ 0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU, ++ 0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU, ++ 0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU, ++ 0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U, ++ 0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U, ++ 0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU, ++ 0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU, ++ 0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U, ++ 0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU, ++ 0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U, ++ 0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U, ++ 0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U, ++}; ++ ++aes_decrypt_table::table_type aes_decrypt_table::Td1 = { ++ 0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU, ++ 0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U, ++ 0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU, ++ 0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U, ++ 0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U, ++ 0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U, ++ 0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U, ++ 0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U, ++ 0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U, ++ 0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU, ++ 0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU, ++ 0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU, ++ 0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U, ++ 0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU, ++ 0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U, ++ 0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U, ++ 0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U, ++ 0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU, ++ 0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU, ++ 0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U, ++ 0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU, ++ 0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U, ++ 0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU, ++ 0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU, ++ 0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U, ++ 0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U, ++ 0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U, ++ 0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 
0x3a24362eU, ++ 0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U, ++ 0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU, ++ 0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U, ++ 0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U, ++ 0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U, ++ 0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU, ++ 0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U, ++ 0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U, ++ 0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U, ++ 0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U, ++ 0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U, ++ 0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U, ++ 0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU, ++ 0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU, ++ 0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U, ++ 0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU, ++ 0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U, ++ 0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU, ++ 0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU, ++ 0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U, ++ 0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU, ++ 0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U, ++ 0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U, ++ 0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U, ++ 0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U, ++ 0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U, ++ 0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U, ++ 0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U, ++ 0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU, ++ 0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U, ++ 0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U, ++ 0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU, ++ 0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U, ++ 0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U, ++ 0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U, ++ 0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U, ++}; ++ ++aes_decrypt_table::table_type aes_decrypt_table::Td2 = { ++ 0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U, ++ 0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U, ++ 0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U, ++ 0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U, ++ 0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU, ++ 0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U, ++ 0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U, ++ 0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U, ++ 0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U, ++ 0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU, ++ 0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U, ++ 0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U, ++ 0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU, ++ 0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U, ++ 0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U, ++ 0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U, ++ 0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U, ++ 0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U, ++ 0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U, ++ 0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU, ++ 0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U, ++ 0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U, ++ 0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U, ++ 0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U, ++ 0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U, ++ 0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU, ++ 0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU, ++ 
0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U, ++ 0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU, ++ 0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U, ++ 0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU, ++ 0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU, ++ 0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU, ++ 0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU, ++ 0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U, ++ 0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U, ++ 0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U, ++ 0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U, ++ 0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U, ++ 0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U, ++ 0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U, ++ 0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU, ++ 0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU, ++ 0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U, ++ 0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U, ++ 0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU, ++ 0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU, ++ 0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U, ++ 0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U, ++ 0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U, ++ 0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U, ++ 0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U, ++ 0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U, ++ 0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U, ++ 0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU, ++ 0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U, ++ 0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U, ++ 0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U, ++ 0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U, ++ 0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U, ++ 0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U, ++ 0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU, ++ 0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U, ++ 0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U, ++}; ++ ++aes_decrypt_table::table_type aes_decrypt_table::Td3 = { ++ 0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU, ++ 0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU, ++ 0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U, ++ 0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U, ++ 0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU, ++ 0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU, ++ 0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U, ++ 0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU, ++ 0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U, ++ 0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU, ++ 0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U, ++ 0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U, ++ 0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U, ++ 0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U, ++ 0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U, ++ 0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU, ++ 0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU, ++ 0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U, ++ 0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U, ++ 0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU, ++ 0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU, ++ 0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U, ++ 0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U, ++ 0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U, ++ 0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U, ++ 0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU, ++ 0x0efffbfdU, 
0x8538560fU, 0xaed51e3dU, 0x2d392736U, ++ 0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U, ++ 0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU, ++ 0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU, ++ 0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U, ++ 0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U, ++ 0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U, ++ 0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU, ++ 0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U, ++ 0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U, ++ 0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U, ++ 0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U, ++ 0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U, ++ 0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U, ++ 0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U, ++ 0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU, ++ 0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U, ++ 0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U, ++ 0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU, ++ 0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU, ++ 0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U, ++ 0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU, ++ 0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U, ++ 0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U, ++ 0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U, ++ 0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U, ++ 0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U, ++ 0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U, ++ 0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU, ++ 0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU, ++ 0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU, ++ 0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU, ++ 0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U, ++ 0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U, ++ 0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U, ++ 0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU, ++ 0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U, ++ 0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U, ++}; ++ ++const unsigned char aes_decrypt_table::Td4[table_nelts] = { ++ 0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U, ++ 0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU, ++ 0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U, ++ 0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU, ++ 0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU, ++ 0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU, ++ 0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U, ++ 0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U, ++ 0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U, ++ 0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U, ++ 0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU, ++ 0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U, ++ 0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU, ++ 0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U, ++ 0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U, ++ 0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU, ++ 0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU, ++ 0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U, ++ 0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U, ++ 0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU, ++ 0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U, ++ 0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU, ++ 0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U, ++ 0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 
0xf4U,
++  0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
++  0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
++  0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
++  0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
++  0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
++  0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
++  0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
++  0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU,
++};
++
++/* In-round shifts info. */
++static const unsigned HOST_WIDE_INT shift_csts[4] = {24, 16, 8, 0};
++
++/* Check if the pattern is plus-const.  Helper for memref analysis. */
++static bool
++plus_const_int_p (rtx op)
++{
++  return GET_CODE (op) == PLUS && CONST_INT_P (XEXP (op, 1));
++}
++
++/* Obtain info about memory access. */
++static bool
++decompose_mem (rtx mem, rtx &base, unsigned HOST_WIDE_INT &offset)
++{
++  address_info info;
++  decompose_mem_address (&info, mem);
++  if (!info.base)
++    return false;
++
++  base = *info.base;
++
++  rtx op = XEXP (mem, 0);
++  if (plus_const_int_p (op))
++    offset = UINTVAL (XEXP (op, 1));
++  /* TODO: WRONG IN GENERAL CASE: we cannot guarantee that the offsets
++     were not changed. */
++  else if ((GET_CODE (op) == PRE_MODIFY && plus_const_int_p (XEXP (op, 1)))
++	   || REG_P (op))
++    offset = 0;
++  else
++    return false;
++
++  return true;
++}
++
++/* Check if the regs in the stmt are the same as the provided ones. */
++static bool
++cmp_regs_in_stmt (rtx stmt, rtx lhs, rtx rhs)
++{
++  return (XEXP (stmt, 0) == lhs) && (XEXP (stmt, 1) == rhs);
++}
++
++/* AES key info.  Inherited from mem_term_info to be used inside
++   matchers without any unnecessary casts. */
++struct aes_key : mem_term_info
++{
++  aes_key ()
++  {}
++  aes_key (void *)
++    : mem_term_info (NULL, NULL_RTX)
++  {}
++  aes_key (const mem_term_info &m)
++    : mem_term_info (m)
++  {}
++
++  /* Check if the key has the same base pointer origin as another one.
++     This check is required due to some possible CSE optimizations
++     applied to pointers before this pass. */
++  bool has_same_origin (const aes_key &other, rtx_insn *use_point) const
++  {
++    /* Simple case: the pointer is the same. */
++    if (src == other.src)
++      return true;
++
++    if (!use_point)
++      return false;
++
++    basic_block curr_bb = BLOCK_FOR_INSN (use_point);
++    if (!single_pred_p (curr_bb)
++	|| modified_between_p (src, BB_HEAD (curr_bb), use_point)
++	|| modified_between_p (other.src, BB_HEAD (curr_bb), use_point))
++      return false;
++
++    edge e = single_pred_edge (curr_bb);
++    rtx_insn *jump = BB_END (e->src);
++    if (!any_condjump_p (jump))
++      return false;
++
++    basic_block from_bb = BLOCK_FOR_INSN (jump);
++    if (EDGE_COUNT (from_bb->succs) != 2)
++      return false;
++
++    /* Need proof that the sources are equal: try to get it from the
++       terminating condition. */
++    rtx cond = XEXP (SET_SRC (pc_set (jump)), 0);
++    rtx_code code = GET_CODE (cond);
++    if (!((code == EQ && EDGE_SUCC (from_bb, 0) == e)
++	  || (code == NE && EDGE_SUCC (from_bb, 1) == e)))
++      return false;
++
++    rtx arg1 = XEXP (cond, 0);
++    if (XEXP (cond, 1) != CONST0_RTX (GET_MODE (arg1))
++	|| COMPARISON_P (arg1))
++      return false;
++
++    rtx_insn *cmp_insn = get_single_def_insn (jump, arg1);
++    rtx cmp;
++    if (!cmp_insn || !(cmp = get_single_set_op (cmp_insn)))
++      return false;
++
++    if (!(cmp_regs_in_stmt (cmp, src, other.src)
++	  || cmp_regs_in_stmt (cmp, other.src, src)))
++      return false;
++
++    return true;
++  }
++};
++
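++/* A middle AES round in the table-driven source that this pass looks
++   for computes each new state word from four table lookups and one
++   round-key word, roughly (an illustrative, OpenSSL-style sketch; the
++   variable names here are not part of the pass):
++
++     t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff]
++	  ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[4];
++
++   The helpers below describe the operands of such expressions.  */
++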
++/* AES basic state input info.  Inherited from mem_term_info
++   to use it in matchers without any unnecessary casts. */
++struct state_input_info : mem_term_info
++{
++  state_input_info ()
++  {}
++  state_input_info (const aes_key &k)
++    : mem_term_info (k), is_key (true)
++  {}
++  state_input_info (const mem_term_info &m)
++    : mem_term_info (m), is_key (false)
++  {}
++
++  bool is_key;
++
++  bool verify (const state_input_info *prev) const
++  {
++    if (!prev)
++      return true;
++
++    return BLOCK_FOR_INSN (loc) == BLOCK_FOR_INSN (prev->loc);
++  }
++};
++
++/* Memory matcher to filter only suitable memory instructions. */
++struct mem_matcher : matcher_term
++{
++  static bool match (rtx_insn *insn, holder_type &m)
++  {
++    rtx src = get_single_set_op (insn);
++    return src && match (src, insn, m);
++  }
++
++  static bool match (rtx src, rtx_insn *insn, holder_type &m)
++  {
++    if (!MEM_P (src))
++      return false;
++
++    mem_term_info info (NULL, NULL_RTX);
++    if (!decompose_mem (src, info.src, info.offset))
++      return false;
++
++    info.loc = insn;
++    m[0] = info;
++    return true;
++  }
++};
++
++/* AES entry input info.  Extends the state input info, which it
++   closely resembles. */
++struct input_info : state_input_info
++{
++  input_info ()
++  {}
++  input_info (const mem_term_info &m, unsigned HOST_WIDE_INT shift_cst)
++    : state_input_info (m), shift_cst (shift_cst)
++  {}
++  input_info (const aes_key &k)
++    : state_input_info (k)
++  {}
++
++  unsigned HOST_WIDE_INT shift_cst;
++
++  /* Input info is sorted by reference offset. */
++  bool operator < (const input_info &rhs) const
++  {
++    return offset < rhs.offset;
++  }
++
++  std::pair<rtx, unsigned HOST_WIDE_INT> input () const
++  {
++    return std::make_pair (src, offset);
++  }
++
++  bool verify (const input_info *prev, unsigned i) const
++  {
++    if (!state_input_info::verify (prev))
++      return false;
++
++    /* The previous state should reference the previous element
++       of the same buffer. */
++    if (prev && (src != prev->src || offset != prev->offset + 1))
++      return false;
++
++    /* The state should use the corresponding shift constant. */
++    return shift_csts[i] == shift_cst;
++  }
++
++  static bool finalize (rtx_insn *insn, input_info *m)
++  {
++    typedef unop_matcher zext_matcher;
++
++    zext_matcher::holder_type zext;
++    if (zext_matcher::match (insn, zext))
++      {
++	*m = input_info (zext[0], 0);
++	return true;
++      }
++
++    typedef binop_matcher >
++      shifted_variant;
++    shifted_variant::holder_type lsh;
++    if (!shifted_variant::match (insn, lsh))
++      return false;
++
++    gcc_assert (CONST_INT_P (lsh[1].src));
++    *m = input_info (lsh[0], UINTVAL (lsh[1].src));
++    return true;
++  }
++};
++
++/* Check if the given constant combination may be used for
++   AES table access. */
++static bool
++verify_table_access (unsigned HOST_WIDE_INT shift_cst,
++		     unsigned HOST_WIDE_INT and_cst = 0xFF,
++		     bool and_present = true)
++{
++  if (and_cst != 0xFF)
++    return false;
++
++  switch (shift_cst)
++    {
++    case 0:
++    case 8:
++    case 16:
++      return and_present;
++    case 24:
++      return true;
++    default:
++      return false;
++    }
++}
++
++/* AES table reference description. */
++template <typename TABLE_T>
++struct aes_table_ref
++{
++  rtx_insn *insn;
++  rtx_insn *output_insn;
++  unsigned HOST_WIDE_INT lsr_cst;
++  rtx reg;
++  rtx output;
++  typename TABLE_T::table_entry itable;
++  bool is_final;
++
++  bool verify (unsigned i) const
++  {
++    typename TABLE_T::table_entry (&ethalon)[TABLE_T::rounds_num]
++      = is_final ?
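/* Final rounds read the tables in a different order, so compare against the matching reference permutation.  */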
TABLE_T::final_rounds : TABLE_T::rounds; ++ return lsr_cst == shift_csts[i] && itable == ethalon[i]; ++ } ++}; ++ ++/* Check the minimal requirements of the pattern to be a table reference ++ and wrap the table id getter function. */ ++template ++static typename T::table_entry ++check_table (rtx mem) ++{ ++ tree expr = MEM_EXPR (mem); ++ if (!expr || TREE_CODE (expr) != ARRAY_REF) ++ return T::BAD_TABLE; ++ ++ tree decl = TREE_OPERAND (expr, 0); ++ if (!decl || !DECL_P (decl) || !TREE_READONLY (decl)) ++ return T::BAD_TABLE; ++ ++ tree ctor = DECL_INITIAL (decl); ++ if (!ctor) ++ return T::BAD_TABLE; ++ ++ return T::get_table_id (ctor); ++} ++ ++/* Simplified memory info. Used for simplier table ref analysis. */ ++struct simplified_mem_info ++{ ++ rtx base_reg; ++ rtx index; ++}; ++ ++/* Try to obtain table reference info. */ ++static bool ++decompose_tref_mem_address (simplified_mem_info &info, rtx mem) ++{ ++ address_info addr_info; ++ decompose_mem_address (&addr_info, mem); ++ if (!addr_info.base || !addr_info.index) ++ return false; ++ ++ info.base_reg = *addr_info.base; ++ info.index = *addr_info.index; ++ ++ if (!REG_P (info.base_reg)) ++ return false; ++ ++ if (addr_info.mode == SImode) ++ { ++ if (GET_CODE (info.index) != MULT) ++ return false; ++ ++ rtx cst = XEXP (info.index, 1); ++ if (!CONST_INT_P (cst) || UINTVAL (cst) != 4) ++ return false; ++ ++ info.index = XEXP (info.index, 0); ++ return true; ++ } ++ ++ return (addr_info.mode == QImode); ++} ++ ++/* Find the possible final output instruction. */ ++template ++static rtx_insn * ++get_possible_final_output (rtx_insn *insn, rtx reg, ++ unsigned HOST_WIDE_INT shift_cst, ++ typename TABLE_T::table_entry itable); ++ ++/* Specialize the function for AES encryption. The output is AND instruction ++ with propper constant. */ ++template<> ++rtx_insn * ++get_possible_final_output (rtx_insn *insn, rtx reg, ++ unsigned HOST_WIDE_INT shift_cst, ++ aes_encrypt_table::table_entry) ++{ ++ rtx_insn *out = get_single_use_insn (insn, reg); ++ if (!out) ++ return NULL; ++ ++ rtx cst_val = get_op_const_cst (out); ++ if (!cst_val) ++ return NULL; ++ ++ unsigned HOST_WIDE_INT ethalon; ++ switch (shift_cst) ++ { ++ case 24: ++ ethalon = 0xffffffffff000000; ++ break; ++ case 16: ++ ethalon = 0xff0000; ++ break; ++ case 8: ++ ethalon = 0xff00; ++ break; ++ case 0: ++ ethalon = 0xff; ++ break; ++ default: ++ gcc_unreachable (); ++ } ++ ++ return UINTVAL (cst_val) == ethalon ? out : NULL; ++} ++ ++/* Specialize the function for AES decryption. The output is ASHIFT instruction ++ with propper constant or direct reference to TD4 table. ++ ++ TODO: TD4 check might be done here for all the cases. However, now it is not ++ done here to make decryption and encryption matching ++ more general in common. */ ++template<> ++rtx_insn * ++get_possible_final_output (rtx_insn *insn, rtx reg, ++ unsigned HOST_WIDE_INT shift_cst, ++ aes_decrypt_table::table_entry it) ++{ ++ rtx_insn *out = get_single_use_insn (insn, reg); ++ if (!out) ++ return NULL; ++ ++ rtx cst_val = get_op_const_cst (out); ++ if (!cst_val) ++ // no shift case ++ return it == aes_decrypt_table::TD4 ? insn : NULL; ++ ++ return UINTVAL (cst_val) == shift_cst ? out : NULL; ++} ++ ++typedef arg_op_matcher reg_matcher; ++ ++/* Helper that matches suitable AES table references. */ ++template ++class tref_matcher ++{ ++ /* (reg >> cst) matcher. Helper. */ ++ typedef binop_matcher > table_access; ++ /* zext (reg >> cst) matcher. Used for TABLE[(val >> 24)] variant. 
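In the AES source this shape is the top-byte selector,
++     e.g. the (illustrative) Te0[s0 >> 24].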
*/ ++ typedef unop_matcher direct; ++ /* zext ((reg >> cst1) & cst2) matcher. Used for ++ TABLE[(val >> (16|8)) & 0xff] variant. */ ++ typedef unop_matcher > > shifted; ++ /* zext (reg & cst) matcher. Used for TABLE[val & 0xff] variant. */ ++ typedef unop_matcher > > noshift; ++ ++ std::map table_alias; ++ ++ bool finalize (aes_table_ref &tref, ++ minimal_term_info &input_info, ++ minimal_term_info *shift_info = NULL, ++ minimal_term_info *mask_info = NULL) ++ { ++ gcc_assert (REG_P (input_info.src)); ++ gcc_assert (!shift_info || CONST_INT_P (shift_info->src)); ++ gcc_assert (!mask_info || CONST_INT_P (mask_info->src)); ++ ++ unsigned HOST_WIDE_INT shift ++ = shift_info ? UINTVAL (shift_info->src) : 0; ++ unsigned HOST_WIDE_INT mask ++ = mask_info ? UINTVAL (mask_info->src) : 0xFF; ++ if (!verify_table_access (shift, mask, mask_info)) ++ return false; ++ ++ tref.insn = input_info.loc; ++ tref.reg = input_info.src; ++ tref.lsr_cst = shift; ++ return true; ++ } ++ ++ bool match (rtx_insn *insn, rtx index, aes_table_ref &tref) ++ { ++ direct::holder_type direct_res; ++ if (direct::match (index, insn, direct_res)) ++ return finalize (tref, direct_res[0], &direct_res[1]); ++ ++ shifted::holder_type shifted_res; ++ if (shifted::match (index, insn, shifted_res)) ++ return finalize (tref, shifted_res[0], ++ &shifted_res[1], &shifted_res[2]); ++ ++ noshift::holder_type noshift_res; ++ return noshift::match (index, insn, noshift_res) ++ && finalize (tref, noshift_res[0], NULL, &noshift_res[1]); ++ } ++ ++public: ++ bool match (rtx_insn *insn, aes_table_ref &tref) ++ { ++ rtx mem = get_single_set_op (insn); ++ if (!mem && (mem = get_single_set_op (insn))) ++ mem = XEXP (mem, 0); ++ ++ rtx dst = get_single_set_dst (insn); ++ if (!mem || !MEM_P (mem) || !dst || GET_MODE (dst) != SImode) ++ return false; ++ ++ simplified_mem_info info; ++ if (!decompose_tref_mem_address (info, mem) ++ || !match (insn, info.index, tref)) ++ return false; ++ ++ typename TABLE_T::table_entry itable; ++ if (!table_alias.count (info.base_reg)) ++ { ++ itable = check_table (mem); ++ if (itable == TABLE_T::BAD_TABLE) ++ return false; ++ table_alias[info.base_reg] = itable; ++ } ++ else ++ itable = table_alias.at (info.base_reg); ++ ++ if (rtx_insn *out = get_possible_final_output (insn, dst, ++ tref.lsr_cst, ++ itable)) ++ { ++ tref.is_final = true; ++ tref.output_insn = out; ++ tref.output = NULL_RTX; ++ } ++ else ++ { ++ tref.is_final = false; ++ tref.output_insn = insn; ++ tref.output = dst; ++ } ++ ++ tref.itable = itable; ++ return true; ++ } ++}; ++ ++/* AES stage description. Required for some specializations ++ for curtain rounds. */ ++typedef enum { INPUT, MIDDLE, FINAL } aes_stage; ++ ++/* AES entity description. It can be both round or state inside round. ++ It provides interface for unified analysis between blocks of 4 parts: ++ round -> 4 states -> 4 * 4 arguments. */ ++template ++struct aes_entity ++{ ++ aes_key key; ++ std::set entries; ++ rtx_insn *loc; ++ ++ aes_entity () ++ : key (NULL), loc (NULL) ++ {} ++ ++ /* Push new entry to the entity. */ ++ bool push_entry (const ENTRY_T &v) ++ { ++ if (entries.size () == 4) ++ return false; ++ ++ entries.insert (v); ++ return true; ++ } ++ ++ /* The entities are sorted by key offset. */ ++ bool operator < (const aes_entity &rhs) const ++ { ++ return key.offset < rhs.key.offset; ++ } ++ ++ /* Verify that all of the entries are correct within their positions inside ++ the entity. 
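Entries are visited in sorted order; each one is checked against
++     its predecessor and its position index.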
*/ ++ bool finalize () ++ { ++ if (entries.size () != 4) ++ return false; ++ ++ unsigned i = 0; ++ const ENTRY_T *prev = NULL; ++ for (typename std::set::iterator it = entries.begin (); ++ it != entries.end (); prev = &*it++, ++i) ++ if (!it->verify (prev, i)) ++ return false; ++ ++ loc = entries.begin ()->loc; ++ return true; ++ } ++}; ++ ++/* Check the correctness of input regs permutations. */ ++template ++static bool ++check_input_regs (const std::vector &curr, ++ const std::vector &prev); ++ ++/* Specialize the function for AES encryption. */ ++template<> ++bool ++check_input_regs (const std::vector &curr, ++ const std::vector &prev) ++{ ++ gcc_assert (curr.size () == 4 && prev.size () == 4); ++ unsigned idx[4] = { 1, 2, 3, 0 }; ++ for (int i = 0; i < 4; ++i) ++ if (curr[i] != prev[idx[i]]) ++ return false; ++ return true; ++} ++ ++/* Specialize the function for AES decryption. */ ++template<> ++bool ++check_input_regs (const std::vector &curr, ++ const std::vector &prev) ++{ ++ gcc_assert (curr.size () == 4 && prev.size () == 4); ++ unsigned idx[4] = { 3, 0, 1, 2 }; ++ for (int i = 0; i < 4; ++i) ++ if (curr[i] != prev[idx[i]]) ++ return false; ++ return true; ++} ++ ++/* Basic descryption of state input. */ ++template ++struct state_input ++{ ++ typedef std::vector type; ++ ++ static void finalize (type &in, rtx v) ++ { ++ in.push_back (v); ++ } ++ ++ template ++ static bool verify (const type &lhs, const type &rhs) ++ { ++ return check_input_regs (lhs, rhs); ++ } ++}; ++ ++/* Input round state uses special input. */ ++template<> ++struct state_input ++{ ++ typedef std::pair type; ++ ++ static void finalize (type &in, const type &v) ++ { ++ in = v; ++ // Order is inverted ++ in.second -= 3; ++ } ++ ++ template ++ static bool verify (const type &lhs, const type &rhs) ++ { ++ return lhs.first == rhs.first ++ && lhs.second == rhs.second + 4; ++ } ++}; ++ ++/* Basic descryption of state output. */ ++template ++struct state_output ++{ ++ typedef rtx type; ++ ++ static bool verify (const type &, const type &) ++ { ++ return true; ++ } ++}; ++ ++/* Final round state generates special output. */ ++template<> ++struct state_output ++{ ++ typedef std::pair type; ++ ++ static bool verify (const type &lhs, const type &rhs) ++ { ++ return lhs.first == rhs.first ++ && lhs.second == rhs.second + 4; ++ } ++}; ++ ++/* Basic descryption of round input. */ ++template ++struct round_input ++{ ++ typedef std::vector type; ++}; ++ ++/* Input round uses special input just as its state. */ ++template<> ++struct round_input ++{ ++ typedef std::pair type; ++}; ++ ++/* Basic descryption of round output. */ ++template ++struct round_output ++{ ++ typedef std::vector type; ++ ++ template ++ static void finalize (type &out, const T &v) ++ { ++ gcc_assert (v.size () == 4); ++ for (typename T::const_iterator it = v.begin (); it != v.end (); ++it) ++ out.push_back (it->output); ++ } ++ ++ template ++ static void reorder (type &) ++ {} ++}; ++ ++/* Reorder output for AES decryption: the order is changed compared to ++ AES encryption. */ ++template<> ++template<> ++void round_output::reorder (type &out) ++{ ++ gcc_assert (out.size () == 4); ++ std::swap (out[1], out[3]); ++} ++ ++template<> ++template<> ++void round_output::reorder (type &out) ++{ ++ round_output::reorder (out); ++} ++ ++/* Final round generates special output. 
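Instead of a register feeding the next round, it is the
++   (base, offset) pair of the store into the result buffer.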
*/ ++template<> ++struct round_output : state_output ++{ ++ template ++ static void finalize (type &out, const T &v) ++ { ++ gcc_assert (v.size () == 4); ++ out = v.begin ()->output; ++ } ++ ++ template ++ static void reorder (type &) ++ {} ++}; ++ ++/* AES state descryption. */ ++template ++struct aes_state : aes_entity ++{ ++ typedef aes_entity base_entity; ++ ++ typename state_input::type input; ++ typename state_output::type output; ++ ++ aes_state () ++ : base_entity () ++ {} ++ ++ void set_output (const typename state_output::type &o) ++ { ++ output = o; ++ } ++ ++ bool push_entry (const ENTRY_T &v) ++ { ++ if (!v.is_key) ++ return base_entity::push_entry (v); ++ ++ if (this->key.src) ++ return false; ++ ++ this->key = v; ++ return true; ++ } ++ ++ /* Verify if the state is correct within its position in round. */ ++ bool verify (const aes_state *prev, unsigned) const ++ { ++ if (!prev) ++ return true; ++ ++ if (!this->key.has_same_origin (prev->key, this->loc) ++ || this->key.offset != prev->key.offset + 4 ++ || BLOCK_FOR_INSN (this->loc) != BLOCK_FOR_INSN (prev->loc)) ++ return false; ++ ++ return state_input::template verify (input, prev->input) ++ && state_output::verify (output, prev->output); ++ } ++ ++ /* Check if the entries of the state are correct and finalize stored info. */ ++ bool finalize () ++ { ++ if (!base_entity::finalize ()) ++ return false; ++ ++ for (typename std::set::iterator it = this->entries.begin (); ++ it != this->entries.end (); ++it) ++ state_input::finalize (input, it->input ()); ++ ++ return true; ++ } ++}; ++ ++/* AES round descryption. */ ++template ++struct aes_round : aes_entity, STAGE, K> ++{ ++ typedef aes_entity, STAGE, K> base_entity; ++ ++ typename round_input::type input; ++ typename round_output::type output; ++ ++ /* Check if the states are correct and finalize stored info. */ ++ bool finalize () ++ { ++ if (!base_entity::finalize ()) ++ return false; ++ ++ input = this->entries.begin ()->input; ++ this->key = this->entries.begin ()->key; ++ ++ round_output::finalize (output, this->entries); ++ round_output::template reorder (output); ++ ++ return true; ++ } ++}; ++ ++template ++class aes_optimizer; ++ ++/* AES round input info. Used to find and store info about ++ table references. ++ ++ Must be inited and finalized before and after usage. */ ++template ++struct round_input_info : state_input_info ++{ ++ typedef typename aes_optimizer::table_ref_map tref_map; ++ ++ round_input_info () ++ {} ++ round_input_info (rtx_insn *insn, const aes_table_ref *tref) ++ : state_input_info (mem_term_info (insn, NULL_RTX)), tref (tref) ++ {} ++ round_input_info (const aes_key &k) ++ : state_input_info (k) ++ {} ++ ++ rtx input () const ++ { ++ return tref->reg; ++ } ++ ++ rtx output () const ++ { ++ return tref->output; ++ } ++ ++ /* Table references are sorted by shift constants. ++ TODO: probably sort by key offset? 
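Sorting by descending shift constant keeps the references in
++     the table order that verify expects.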
*/ ++ bool operator < (const round_input_info &rhs) const ++ { ++ return tref->lsr_cst > rhs.tref->lsr_cst; ++ } ++ ++ bool verify (const round_input_info *prev, unsigned i) const ++ { ++ return state_input_info::verify (prev) && tref->verify (i); ++ } ++ ++ static bool finalize (rtx_insn *insn, round_input_info *m) ++ { ++ if (checked_p->count (insn)) ++ return false; ++ ++ typename tref_map::const_iterator it = table_refs_p->find (insn); ++ if (it == table_refs_p->end ()) ++ return false; ++ ++ m[0] = round_input_info (insn, &it->second); ++ return true; ++ } ++ ++ const aes_table_ref *tref; ++ ++ static const tref_map *table_refs_p; ++ static const std::set *checked_p; ++ ++ /* Store lookup table references. */ ++ static void init (const tref_map &t, const std::set &c) ++ { ++ gcc_assert (!table_refs_p && !checked_p); ++ table_refs_p = &t; ++ checked_p = &c; ++ } ++ ++ /* Remove lookup table references. */ ++ static void fin () ++ { ++ gcc_assert (table_refs_p && checked_p); ++ table_refs_p = NULL; ++ checked_p = NULL; ++ } ++}; ++ ++template ++const typename aes_optimizer::table_ref_map * ++round_input_info::table_refs_p = NULL; ++ ++template ++const std::set * ++round_input_info::checked_p = NULL; ++ ++/* AES encryption/decryption optimizer. */ ++template ++class aes_optimizer ++{ ++public: ++ typedef std::map > table_ref_map; ++ ++ /* AES states typedefs. */ ++ typedef aes_state aes_input_state; ++ typedef aes_state, MIDDLE, T> aes_body_state; ++ typedef aes_state, FINAL, T> aes_final_state; ++ ++ /* AES rounds typedefs. */ ++ typedef aes_round aes_input_round; ++ typedef aes_round, MIDDLE, T> aes_body_round; ++ typedef aes_round, FINAL, T> aes_final_round; ++ ++ bool run (); ++ ++private: ++ bool collect_aes_lookup_tables (); ++ bool form_rounds (); ++ bool find_aes_init_round (); ++ bool collect_state (rtx_insn * insn, aes_body_state &state, ++ std::set &checked); ++ bool find_aes_rounds (); ++ bool collect_final_round (rtx_insn *insn, aes_final_state &state, ++ std::set &checked); ++ bool find_aes_final_round (); ++ bool check_aes_pattern (); ++ void erase_unused_rounds (std::set *> &used); ++ ++ bool gen_aes_code (); ++ bool gen_init_round (); ++ bool gen_round (const aes_body_round &round); ++ bool gen_final_round (); ++ ++ rtx gen_or_get_vreg (const std::vector &vec); ++ rtx get_vreg (const std::vector &vec); ++ rtx gen_vreg (const std::vector &vec); ++ ++ table_ref_map table_refs; ++ table_ref_map final_table_refs; ++ ++ aes_input_round input_round; ++ std::map, aes_body_round> rounds; ++ aes_final_round final_round; ++ ++ std::map, rtx> vec_regs; ++ std::vector to_delete; ++}; ++ ++/* Find all the AES table references in function. */ ++template ++bool ++aes_optimizer::collect_aes_lookup_tables () ++{ ++ basic_block bb; ++ rtx_insn *insn; ++ ++ tref_matcher m; ++ FOR_EACH_BB_FN (bb, cfun) ++ FOR_BB_INSNS (bb, insn) ++ { ++ aes_table_ref tref; ++ if (!m.match (insn, tref)) ++ continue; ++ ++ if (!tref.is_final) ++ table_refs[insn] = tref; ++ else ++ final_table_refs[tref.output_insn] = tref; ++ } ++ ++ return !table_refs.empty () && !final_table_refs.empty (); ++} ++ ++/* Helper function to match all the permutations of five arg ++ calculations. */ ++template ++struct five_args_calc_matcher ++{ ++ /* Helper for matching (op1 * op2). */ ++ typedef binop_matcher two_args_block; ++ /* Helper for matching (op1 * (op2 * op3)). */ ++ typedef binop_matcher three_args_block; ++ /* Helper for matching ((op1 * op2) * (op3 * op4)). 
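For XOR operands this is the reassociated shape
++     ((a ^ b) ^ (c ^ d)).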
*/ ++ typedef binop_matcher opt_four_args_block; ++ /* Helper for matching (op1 * (op2 * (op3 * op4))). */ ++ typedef binop_matcher linear_four_args_block; ++ ++ /* Match the (op1 * ((op2 * op3) * (op4 * op5))) variant. */ ++ typedef binop_matcher opt_op_term; ++ /* Match the ((op1 * op2) * (op3 * (op4 * op5))) variant. */ ++ typedef binop_matcher three_op_two; ++ /* Match the (op1 * (op2 * (op3 * (op4 * op5)))) variant. */ ++ typedef binop_matcher fully_linear; ++ ++ static const int holder_size = fully_linear::holder_size; ++ static const int op_num = fully_linear::op_num; ++ typedef typename fully_linear::term_type term_type; ++ typedef typename fully_linear::holder_type holder_type; ++ ++ static rtx_insn* match (rtx_insn *insn, holder_type &m, unsigned depth = 1) ++ { ++ for (rtx dst = get_single_set_dst (insn); depth && insn && dst; ++ insn = get_single_use_insn (insn, dst), ++ dst = insn ? get_single_set_dst (insn) : NULL_RTX, ++ --depth) ++ if (opt_op_term::match (insn, m) || three_op_two::match (insn, m) ++ || fully_linear::match (insn, m)) ++ return insn; ++ return NULL; ++ } ++}; ++ ++/* Match the AES key. */ ++struct key_matcher : matcher_term ++{ ++ static bool match (rtx_insn *insn, holder_type &m) ++ { ++ mem_matcher::holder_type info; ++ if (!mem_matcher::match (insn, info)) ++ return false; ++ ++ m[0] = info[0]; ++ return true; ++ } ++}; ++ ++/* Matcher term for state input. */ ++template ++struct state_input_term : matcher_term ++{ ++ typedef typename matcher_term::holder_type holder_type; ++ ++ static bool match (rtx, rtx_insn *, holder_type &) ++ { ++ return false; ++ } ++ ++ static bool match (rtx_insn *insn, holder_type &m) ++ { ++ key_matcher::holder_type k; ++ if (key_matcher::match (insn, k)) ++ { ++ m[0] = k[0]; ++ return true; ++ } ++ ++ return matcher_term::term_type::finalize (insn, m); ++ } ++}; ++ ++/* Fill state from args. */ ++template ++static bool ++finalize_input (const T (&args)[5], STATE &state) ++{ ++ for (unsigned i = 0; i < 5; ++i) ++ if (!state.push_entry (args[i])) ++ return false; ++ ++ return state.finalize (); ++} ++ ++/* Construct input state. */ ++template ++static bool ++form_input (rtx_insn *insn, T &state) ++{ ++ typedef five_args_calc_matcher > ++ matcher; ++ ++ matcher::holder_type m; ++ if (!matcher::match (insn, m) || !finalize_input (m, state)) ++ return false; ++ ++ /* TODO: probably should not be set here. */ ++ state.set_output (SET_DEST (single_set (insn))); ++ return true; ++} ++ ++/* Get definitions chain for the reg being used in the insn. */ ++static df_link * ++get_defs (rtx_insn *insn, rtx reg) ++{ ++ df_link *ref_chain = get_def_chain (insn, reg); ++ gcc_assert (ref_chain); ++ ++ for (df_link *ref_link = ref_chain; ref_link; ref_link = ref_link->next) ++ if (!check_def_chain_ref (ref_link->ref, reg)) ++ return NULL; ++ ++ return ref_chain; ++} ++ ++/* Find AES init round. To do this, find the table references that depends on ++ two definitions. One of them is our input. 
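That definition is the whitened plaintext word, roughly the
++   (illustrative) s0 = GETU32 (in) ^ rk[0] of table-driven sources:
++   four byte loads shifted into place and XORed with a key word.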
*/ ++template ++bool ++aes_optimizer::find_aes_init_round () ++{ ++ std::set checked; ++ ++ for (typename table_ref_map::iterator it = table_refs.begin (), ++ end = table_refs.end (); it != end; ++it) ++ for (df_link *def = get_defs (it->second.insn, it->second.reg); ++ def; def = def->next) ++ { ++ rtx_insn *def_insn = DF_REF_INSN (def->ref); ++ if (checked.count (def_insn)) ++ continue; ++ ++ aes_input_state input_state; ++ if (form_input (def_insn, input_state) ++ && !input_round.push_entry (input_state)) ++ return false; ++ ++ checked.insert (def_insn); ++ } ++ ++ return input_round.finalize (); ++} ++ ++/* Collect AES inner state. */ ++template ++bool ++aes_optimizer::collect_state (rtx_insn *insn, aes_body_state &state, ++ std::set &checked) ++{ ++ typedef round_input_info term_info; ++ typedef five_args_calc_matcher > matcher; ++ ++ typename matcher::holder_type m; ++ term_info::init (table_refs, checked); ++ rtx_insn *match_entry = matcher::match (insn, m, 3); ++ term_info::fin (); ++ ++ if (!match_entry || !finalize_input (m, state)) ++ return false; ++ ++ /* TODO: probably should not be set here. */ ++ state.set_output (SET_DEST (single_set (match_entry))); ++ for (unsigned i = 0; i < 5; ++i) ++ if (!m[i].is_key) ++ checked.insert (m[i].tref->output_insn); ++ ++ return true; ++} ++ ++/* Simple sorter to link rounds by their registers. */ ++struct reg_comp ++{ ++ bool operator () (rtx lhs, rtx rhs) const ++ { ++ return REGNO (lhs) < REGNO (rhs); ++ } ++}; ++ ++/* Find AES inner rounds. */ ++template ++bool ++aes_optimizer::find_aes_rounds () ++{ ++ typedef std::set input_key; ++ ++ std::set checked; ++ std::map candidate_rounds; ++ for (typename table_ref_map::iterator it = table_refs.begin (), ++ end = table_refs.end (); it != end; ++it) ++ { ++ rtx_insn *insn = it->first; ++ if (checked.count (insn)) ++ continue; ++ ++ rtx_insn *use = get_single_use_insn (insn, SET_DEST (single_set (insn))); ++ if (!use) ++ continue; ++ ++ aes_body_state state; ++ if (!collect_state (use, state, checked)) ++ continue; ++ ++ /* Sort the input so we can found the corresponding state. */ ++ input_key input (state.input.begin (), state.input.end ()); ++ candidate_rounds[input].push_entry (state); ++ } ++ ++ for (typename std::map::iterator ++ it = candidate_rounds.begin (); ++ it != candidate_rounds.end (); ++it) ++ if (it->second.finalize ()) ++ rounds[it->second.input] = it->second; ++ ++ return !rounds.empty (); ++} ++ ++template ++struct final_state_matcher; ++ ++/* AES encrypt matcher requires additional check on key calculations ++ due to possible optimizations. */ ++template<> ++struct final_state_matcher ++{ ++ typedef round_input_info term_info; ++ typedef five_args_calc_matcher, IOR, true> ++ matcher; ++ typedef typename matcher::term_type ++ holder_type[matcher::holder_size - matcher::op_num]; ++ ++ static rtx_insn *match (rtx_insn *insn, holder_type &m, unsigned depth) ++ { ++ matcher::holder_type inner_m; ++ rtx_insn *res = matcher::match (insn, inner_m, depth); ++ if (!res) ++ return NULL; ++ ++ /* Run pre-order traversal of the operands to check the correctness ++ of key usage. 
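The masked table bytes of a final encryption round, roughly
++     (an illustrative sketch)
++
++       (Te2[t0 >> 24] & 0xff000000) ^ (Te3[(t1 >> 16) & 0xff] & 0xff0000)
++	 ^ (Te0[(t2 >> 8) & 0xff] & 0xff00) ^ (Te1[t3 & 0xff] & 0xff),
++
++     may have been re-combined with IOR, since the masks do not
++     overlap; the round key, however, must only be added by an outer
++     XOR, so a key operand found below an IOR rejects the match.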
*/ ++ gcc_assert (inner_m[0].is_op); ++ unsigned pos = 0; ++ if (!check_key_calculations (inner_m, pos)) ++ return NULL; ++ gcc_assert (pos == (matcher::holder_size - 1)); ++ ++ unsigned idx = 0; ++ for (unsigned i = 0; i < matcher::holder_size; ++i) ++ if (!inner_m[i].is_op) ++ m[idx++] = inner_m[i]; ++ ++ gcc_assert (idx == 5); ++ return res; ++ } ++ ++ static bool check_key_calculations (const matcher::holder_type &m, ++ unsigned &idx, ++ bool failure_on_key = false) ++ { ++ gcc_assert (idx < matcher::holder_size); ++ if (!m[idx].is_op) ++ return !(failure_on_key && m[idx].is_key); ++ ++ failure_on_key |= (GET_CODE (m[idx].src) == IOR); ++ return check_key_calculations (m, ++idx, failure_on_key) ++ && check_key_calculations (m, ++idx, failure_on_key); ++ } ++}; ++ ++ ++/* The final state is simple wrapper since no additional checks are required ++ here. */ ++template<> ++struct final_state_matcher ++{ ++ typedef round_input_info term_info; ++ typedef five_args_calc_matcher > matcher; ++ typedef typename matcher::holder_type holder_type; ++ ++ static rtx_insn *match (rtx_insn *insn, holder_type &m, unsigned depth) ++ { ++ return matcher::match (insn, m, depth); ++ } ++}; ++ ++/* Match the AES final state. */ ++template ++bool ++aes_optimizer::collect_final_round (rtx_insn *insn, aes_final_state &state, ++ std::set &checked) ++{ ++ typedef final_state_matcher matcher_wrapper; ++ ++ typename matcher_wrapper::holder_type m; ++ matcher_wrapper::term_info::init (final_table_refs, checked); ++ rtx_insn *match_entry = matcher_wrapper::match (insn, m, 3); ++ matcher_wrapper::term_info::fin (); ++ ++ rtx dst; ++ if (!match_entry || !(dst = get_single_set_dst (match_entry)) ++ || !finalize_input (m, state)) ++ return false; ++ ++ rtx src; ++ if (!(match_entry = get_single_use_insn (match_entry, dst)) ++ || !(check_simple_op (match_entry, src, dst)) ++ || !dst) ++ return false; ++ ++ std::pair output; ++ if (!(match_entry = get_single_use_insn (match_entry, dst)) ++ || !(dst = get_single_set_dst (match_entry)) ++ || !decompose_mem (dst, output.first, output.second)) ++ return false; ++ ++ to_delete.push_back (match_entry); ++ state.set_output (output); ++ for (unsigned i = 0; i < 5; ++i) ++ if (!m[i].is_key) ++ checked.insert (m[i].tref->output_insn); ++ ++ return true; ++} ++ ++/* Find the final round. */ ++template ++bool ++aes_optimizer::find_aes_final_round () ++{ ++ std::set checked; ++ for (typename table_ref_map::iterator it = final_table_refs.begin (), ++ end = final_table_refs.end (); it != end; ++it) ++ { ++ rtx_insn *insn = it->first; ++ ++ if (checked.count (insn)) ++ continue; ++ ++ rtx_insn *use = get_single_use_insn (insn, SET_DEST (single_set (insn))); ++ if (!use) ++ continue; ++ ++ aes_final_state state; ++ if (collect_final_round (use, state, checked)) ++ final_round.push_entry (state); ++ } ++ ++ return final_round.finalize (); ++} ++ ++template ++bool ++aes_optimizer::form_rounds () ++{ ++ return find_aes_final_round () ++ && find_aes_init_round () ++ && find_aes_rounds (); ++} ++ ++template ++void ++aes_optimizer::erase_unused_rounds (std::set *> &used) ++{ ++ if (used.size () == rounds.size ()) ++ return; ++ ++ for (typename std::map, aes_body_round>::iterator ++ it = rounds.begin (), next = it, ++ end = rounds.end (); it != end; it = next) ++ { ++ ++next; ++ if (!used.count (&it->first)) ++ rounds.erase (it); ++ } ++} ++ ++/* Find round starts and link them together. 
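Starting from the output of the initial round, follow each round's
++   output to the round that consumes it, until the chain reaches the
++   input of the final round; rounds not on that chain are erased.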
*/ ++template ++bool ++aes_optimizer::check_aes_pattern () ++{ ++ std::set *> checked; ++ ++ typename std::map, aes_body_round>::iterator fit ++ = rounds.find (input_round.output); ++ ++ bool to_final = false; ++ while (fit != rounds.end () && !checked.count (&fit->first)) ++ { ++ checked.insert (&fit->first); ++ ++ if (fit->second.output == final_round.input) ++ to_final = true; ++ ++ fit = rounds.find (fit->second.output); ++ } ++ ++ if (!to_final) ++ return false; ++ ++ erase_unused_rounds (checked); ++ ++ return true; ++} ++ ++static bool ++gen_insns (const rtx patterns[4], rtx_insn *loc) ++{ ++ start_sequence (); ++ for (unsigned i = 0; i < 4; ++i) ++ { ++ rtx_insn *insn = emit_insn (patterns[i]); ++ if (recog_memoized (insn) < 0) ++ { ++ end_sequence (); ++ return false; ++ } ++ } ++ ++ rtx_insn *seq = get_insns (); ++ end_sequence (); ++ emit_insn_after (seq, loc); ++ ++ return true; ++} ++ ++static rtx ++gen_offset_access (rtx base, unsigned HOST_WIDE_INT offset) ++{ ++ if (!offset) ++ return base; ++ ++ machine_mode mode = GET_MODE (base); ++ return gen_rtx_PLUS (mode, base, gen_rtx_CONST_INT (mode, offset)); ++} ++ ++template ++rtx ++aes_optimizer::get_vreg (const std::vector &vec) ++{ ++ std::map, rtx>::iterator fit = vec_regs.find (vec); ++ if (fit != vec_regs.end ()) ++ return fit->second; ++ ++ return 0; ++} ++ ++template ++rtx ++aes_optimizer::gen_vreg (const std::vector &vec) ++{ ++ machine_mode vmode = targetm.get_v16qi_mode (); ++ rtx vreg = gen_reg_rtx (vmode); ++ vec_regs.insert (std::make_pair (vec, vreg)); ++ ++ return vreg; ++} ++ ++template ++rtx ++aes_optimizer::gen_or_get_vreg (const std::vector &vec) ++{ ++ rtx vreg = get_vreg (vec); ++ if (!vreg) ++ vreg = gen_vreg (vec); ++ ++ return vreg; ++} ++ ++template ++static rtx ++gen_aes_single_round (rtx vout, rtx vreg, rtx vkey); ++template ++static rtx ++gen_aes_mix_columns (rtx vreg, rtx vin); ++ ++template<> ++rtx ++gen_aes_single_round (rtx vout, rtx vreg, rtx vkey) ++{ ++ return targetm.gen_aesev16qi (vout, vreg, vkey); ++} ++ ++template<> ++rtx ++gen_aes_mix_columns (rtx vreg, rtx vin) ++{ ++ return targetm.gen_aesmcv16qi (vreg, vin); ++} ++ ++template<> ++rtx ++gen_aes_single_round (rtx vout, rtx vreg, rtx vkey) ++{ ++ return targetm.gen_aesdv16qi (vout, vreg, vkey); ++} ++ ++template<> ++rtx ++gen_aes_mix_columns (rtx vreg, rtx vin) ++{ ++ return targetm.gen_aesimcv16qi (vreg, vin); ++} ++ ++template ++bool ++aes_optimizer::gen_init_round () ++{ ++ rtx_insn *loc = input_round.loc; ++ ++ machine_mode vmode = targetm.get_v16qi_mode (); ++ ++ rtx vreg = gen_reg_rtx (vmode); ++ rtx vkey = gen_reg_rtx (vmode); ++ rtx vout = gen_vreg (input_round.output); ++ ++ rtx buf = input_round.input.first; ++ rtx key = gen_offset_access (input_round.key.src, input_round.key.offset); ++ ++ rtx vload_pat = gen_rtx_SET (vreg, ++ gen_rtx_MEM (vmode, buf)); ++ rtx vkey_load_pat = gen_rtx_SET (vkey, ++ gen_rtx_MEM (vmode, key)); ++ rtx vrev_pat = targetm.gen_rev32v16qi (vkey, vkey); ++ rtx vaes_pat = gen_aes_single_round (vout, vreg, vkey); ++ ++ const rtx patterns[4] = {vload_pat, vkey_load_pat, vrev_pat, vaes_pat}; ++ ++ return gen_insns (patterns, loc); ++} ++ ++template ++bool ++aes_optimizer::gen_round (const aes_body_round &round) ++{ ++ rtx_insn *loc = round.loc; ++ ++ machine_mode vmode = targetm.get_v16qi_mode (); ++ ++ rtx vreg = gen_reg_rtx (vmode); ++ rtx vkey = gen_reg_rtx (vmode); ++ rtx vin = gen_or_get_vreg (round.input); ++ rtx vout = gen_or_get_vreg (round.output); ++ ++ rtx key = gen_offset_access (round.key.src, 
round.key.offset); ++ ++ rtx vkey_load_pat = gen_rtx_SET (vkey, ++ gen_rtx_MEM (vmode, key)); ++ rtx vrev_pat = targetm.gen_rev32v16qi (vkey, vkey); ++ rtx vmix_pat = gen_aes_mix_columns (vreg, vin); ++ rtx vaes_pat = gen_aes_single_round (vout, vreg, vkey); ++ ++ const rtx patterns[4] = {vkey_load_pat, vrev_pat, vmix_pat, vaes_pat}; ++ ++ return gen_insns (patterns, loc); ++} ++ ++template ++bool ++aes_optimizer::gen_final_round () ++{ ++ rtx_insn *loc = final_round.loc; ++ ++ machine_mode vmode = targetm.get_v16qi_mode (); ++ ++ rtx vreg = gen_reg_rtx (vmode); ++ rtx vkey = gen_reg_rtx (vmode); ++ rtx vin = get_vreg (final_round.input); ++ ++ gcc_assert (vin); ++ ++ rtx buf = final_round.output.first; ++ rtx key = gen_offset_access (final_round.key.src, final_round.key.offset); ++ ++ rtx vkey_load_pat = gen_rtx_SET (vkey, ++ gen_rtx_MEM (vmode, key)); ++ rtx vrev_pat = targetm.gen_rev32v16qi (vkey, vkey); ++ rtx vxor_pat = gen_rtx_SET (vreg, gen_rtx_XOR (vmode, vin, vkey)); ++ rtx vstore_pat = gen_rtx_SET (gen_rtx_MEM (vmode, buf), vreg); ++ ++ const rtx patterns[4] = {vkey_load_pat, vrev_pat, vxor_pat, vstore_pat}; ++ ++ return gen_insns (patterns, loc); ++} ++ ++template ++bool ++aes_optimizer::gen_aes_code () ++{ ++ if (!gen_init_round ()) ++ return false; ++ ++ for (typename std::map, aes_body_round>::iterator ++ it = rounds.begin (), end = rounds.end (); it != end; ++it) ++ { ++ if (!gen_round (it->second)) ++ return false; ++ } ++ ++ if (!gen_final_round ()) ++ return false; ++ ++ for (std::vector::iterator it = to_delete.begin (), ++ end = to_delete.end (); it != end; ++it) ++ SET_INSN_DELETED (*it); ++ ++ return true; ++} ++ ++template ++bool ++aes_optimizer::run () ++{ ++ return collect_aes_lookup_tables () ++ && form_rounds () ++ && check_aes_pattern () ++ && gen_aes_code (); ++} ++ ++static unsigned int ++crypto_acceleration () ++{ ++ aes_optimizer enc; ++ aes_optimizer dec; ++ enc.run (); ++ dec.run (); ++ ++ return 0; ++} ++ ++static void ++init_df () ++{ ++ df_set_flags (DF_RD_PRUNE_DEAD_DEFS); ++ df_chain_add_problem (DF_UD_CHAIN + DF_DU_CHAIN); ++ df_mir_add_problem (); ++ df_live_add_problem (); ++ df_live_set_all_dirty (); ++ df_analyze (); ++ df_set_flags (DF_DEFER_INSN_RESCAN); ++} ++ ++namespace { ++ ++const pass_data pass_data_crypto_accel = ++{ ++ RTL_PASS, // type ++ "crypto_accel", // name ++ OPTGROUP_NONE, // optinfo_flags ++ TV_CRYPTO_ACCEL, // tv_id ++ PROP_cfglayout, // properties_required ++ 0, // properties_provided ++ 0, // properties_destroyed ++ 0, // todo_flags_start ++ TODO_df_finish, // todo_flags_finish ++}; ++ ++class pass_crypto_accel : public rtl_opt_pass ++{ ++public: ++ pass_crypto_accel (gcc::context *ctxt) ++ : rtl_opt_pass (pass_data_crypto_accel, ctxt) ++ {} ++ ++ /* opt_pass methods: */ ++ virtual bool gate (function *) ++ { ++ if (flag_crypto_accel_aes <= 0) ++ return false; ++ return targetm.get_v16qi_mode ++ && targetm.gen_rev32v16qi ++ && targetm.gen_aesev16qi ++ && targetm.gen_aesmcv16qi; ++ } ++ ++ virtual unsigned int execute (function *) ++ { ++ init_df (); ++ return crypto_acceleration (); ++ } ++}; // class pass_crypto_accel ++ ++} // anon namespace ++ ++rtl_opt_pass * ++make_pass_crypto_accel (gcc::context *ctxt) ++{ ++ return new pass_crypto_accel (ctxt); ++} +diff --git a/gcc/rtl-matcher.h b/gcc/rtl-matcher.h +new file mode 100644 +index 000000000..6aed8d98d +--- /dev/null ++++ b/gcc/rtl-matcher.h +@@ -0,0 +1,367 @@ ++/* Helpers for RTL pattern matchers. ++ Copyright (C) 2003-2023 Free Software Foundation, Inc. 
++ ++This file is part of GCC. ++ ++GCC is free software; you can redistribute it and/or modify it under ++the terms of the GNU General Public License as published by the Free ++Software Foundation; either version 3, or (at your option) any later ++version. ++ ++GCC is distributed in the hope that it will be useful, but WITHOUT ANY ++WARRANTY; without even the implied warranty of MERCHANTABILITY or ++FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received a copy of the GNU General Public License ++along with GCC; see the file COPYING3. If not see ++. */ ++ ++#ifndef GCC_RTL_MATCHER_H ++#define GCC_RTL_MATCHER_H ++ ++#include "config.h" ++#include "system.h" ++#include "rtl.h" ++#include "df.h" ++ ++/* Get definitions chain for the reg being used in insn. */ ++static df_link * ++get_def_chain (rtx_insn *insn, rtx reg) ++{ ++ df_ref use; ++ FOR_EACH_INSN_USE (use, insn) ++ { ++ rtx use_reg = DF_REF_REG (use); ++ if (GET_CODE (use_reg) == SUBREG) ++ { ++ if (REGNO (SUBREG_REG (use_reg)) == REGNO (reg)) ++ return NULL; ++ } ++ else ++ { ++ gcc_assert (REG_P (use_reg)); ++ if (REGNO (use_reg) == REGNO (reg)) ++ return DF_REF_CHAIN (use); ++ } ++ } ++ ++ return NULL; ++} ++ ++/* Check if the reg is not global and actually modified in the ref. */ ++static bool ++check_def_chain_ref (df_ref ref, rtx reg) ++{ ++ if (!ref || !DF_REF_INSN_INFO (ref)) ++ return false; ++ ++ return !global_regs[REGNO (reg)] ++ || set_of (reg, DF_REF_INSN (ref)); ++} ++ ++/* Get the single def instruction of the reg being used in the insn. */ ++static rtx_insn * ++get_single_def_insn (rtx_insn *insn, rtx reg) ++{ ++ if (!REG_P (reg)) ++ return NULL; ++ ++ df_link *ref_chain = get_def_chain (insn, reg); ++ gcc_assert (ref_chain); ++ ++ if (!ref_chain || ref_chain->next ++ || !check_def_chain_ref (ref_chain->ref, reg)) ++ return NULL; ++ ++ return DF_REF_INSN (ref_chain->ref); ++} ++ ++/* Get the single user instruction of the reg being set in the insn. */ ++static rtx_insn * ++get_single_use_insn (rtx_insn *insn, rtx reg) ++{ ++ df_ref def; ++ struct df_link *ref_chain; ++ ++ if (!REG_P (reg)) ++ return NULL; ++ ++ FOR_EACH_INSN_DEF (def, insn) ++ if (REGNO (DF_REF_REG (def)) == REGNO (reg)) ++ break; ++ ++ gcc_assert (def && "Broken def-use analysis chain."); ++ ++ ref_chain = DF_REF_CHAIN (def); ++ ++ if (!ref_chain || ref_chain->next || !ref_chain->ref) ++ return NULL; ++ ++ return DF_REF_INSN (ref_chain->ref); ++} ++ ++/* Get the rtx pattern of suitable opcode from single set instruction. */ ++template ++static rtx ++get_single_set_op (rtx_insn *insn) ++{ ++ rtx pat = single_set (insn); ++ if (!pat) ++ return NULL_RTX; ++ ++ rtx src = SET_SRC (pat); ++ if (GET_CODE (src) != OP1 && GET_CODE (src) != OP2) ++ return NULL_RTX; ++ ++ return src; ++} ++ ++/* Get the rtx pattern of suitable opcode from single set instruction. */ ++template ++static rtx ++get_single_set_op (rtx_insn *insn) ++{ ++ return get_single_set_op (insn); ++} ++ ++/* Get the rtx constant from single set instruction of suitable opcode. */ ++template ++static rtx ++get_op_const_cst (rtx_insn *insn) ++{ ++ rtx src = get_single_set_op (insn); ++ if (!src) ++ return NULL_RTX; ++ ++ rtx cst = XEXP (src, 1); ++ return CONST_INT_P (cst) ? cst : NULL_RTX; ++} ++ ++/* Get the rtx destination from single set instruction of suitable opcode. 
*/ ++template ++static rtx ++get_single_set_dst (rtx_insn *insn) ++{ ++ rtx pat = single_set (insn); ++ if (!pat) ++ return NULL_RTX; ++ ++ rtx dst = SET_DEST (pat); ++ if (GET_CODE (dst) != OP) ++ return NULL_RTX; ++ ++ return dst; ++} ++ ++/* Get the rtx destination from single set instruction. */ ++static rtx ++get_single_set_dst (rtx_insn *insn) ++{ ++ rtx pat = single_set (insn); ++ if (!pat) ++ return NULL_RTX; ++ ++ return SET_DEST (pat); ++} ++ ++/* Check if the instruction is single set of suitable opcode. ++ Also gather its source and destination patterns. */ ++template ++static bool ++check_simple_op (rtx_insn *insn, rtx &src, rtx &dst) ++{ ++ rtx pat = single_set (insn); ++ if (!pat) ++ return false; ++ ++ src = SET_SRC (pat); ++ dst = SET_DEST (pat); ++ ++ if (GET_CODE (src) != OP) ++ return false; ++ ++ return true; ++} ++ ++/* Minimal term info of the RTL matcher. All of the custom matchers should ++ inherit from it. ++ ++ It stores information about matched pattern, instruction ++ of its location and predicate if the matched term represents operator ++ inside the matched tree. */ ++struct minimal_term_info ++{ ++ minimal_term_info () ++ {} ++ minimal_term_info (rtx_insn *loc, rtx src, bool is_op = false) ++ : loc (loc), src (src), is_op (is_op) ++ {} ++ ++ rtx_insn *loc; ++ rtx src; ++ bool is_op; ++}; ++ ++/* Term info for memory matcher. */ ++struct mem_term_info : minimal_term_info ++{ ++ mem_term_info () ++ {} ++ mem_term_info (rtx_insn *loc, rtx src, unsigned HOST_WIDE_INT offset = 0) ++ : minimal_term_info (loc, src), offset (offset) ++ {} ++ ++ unsigned HOST_WIDE_INT offset; ++}; ++ ++/* A wrapper being used to turn a term into a matcher-like entity. */ ++template ++struct matcher_term ++{ ++ /* Required storage size information of the matcher. */ ++ static const int holder_size = 1; ++ static const int op_num = 0; ++ typedef T term_type; ++ typedef term_type holder_type[holder_size]; ++}; ++ ++/* Simple matcher of patterns of suitable opcode. */ ++template ++struct arg_op_matcher : matcher_term ++{ ++ typedef typename matcher_term::holder_type holder_type; ++ ++ static bool match (rtx_insn *, holder_type &) ++ { ++ return false; ++ } ++ ++ static bool match (rtx src, rtx_insn *insn, holder_type &m) ++ { ++ if (GET_CODE (src) != ARGOP) ++ return false; ++ ++ static_cast (m[0]) = minimal_term_info (insn, src); ++ return true; ++ } ++}; ++ ++/* Simple matcher of integer constants. */ ++template ++struct int_cst_matcher : arg_op_matcher ++{}; ++ ++/* Unary operator matcher. */ ++template ++struct unop_matcher ++{ ++ /* Required storage size information of the matcher. */ ++ static const int holder_size = ARG::holder_size + store_op; ++ static const int op_num = ARG::op_num + store_op; ++ typedef typename ARG::term_type term_type; ++ typedef term_type holder_type[holder_size]; ++ ++ static bool match (rtx_insn *insn, holder_type &m) ++ { ++ rtx src = get_single_set_op (insn); ++ return src && match (src, insn, m); ++ } ++ ++ static bool match (rtx src, rtx_insn *insn, holder_type &m) ++ { ++ if (REG_P (src)) ++ { ++ insn = get_single_def_insn (insn, src); ++ if (insn && (src = single_set (insn))) ++ src = SET_SRC (src); ++ } ++ ++ if (!src || !insn || (GET_CODE (src) != OP1 && GET_CODE (src) != OP2)) ++ return false; ++ ++ /* Store current operation if needed. 
*/ ++ if (store_op) ++ static_cast (m[0]) = minimal_term_info (insn, src, ++ true); ++ ++ rtx op = XEXP (src, 0); ++ rtx_insn *def = get_single_def_insn (insn, op); ++ typename ARG::holder_type &m_arg ++ = (typename ARG::holder_type &) *(m + store_op); ++ return (def && ARG::match (def, m_arg)) || ARG::match (op, insn, m_arg); ++ } ++}; ++ ++/* Binary operator matcher. */ ++template ++struct binop_matcher ++{ ++ /* Required storage size information of the matcher. */ ++ static const int holder_size = LHS::holder_size + RHS::holder_size + store_op; ++ static const int op_num = LHS::op_num + RHS::op_num + store_op; ++ typedef typename LHS::term_type term_type; ++ typedef term_type holder_type[holder_size]; ++ ++ static bool match (rtx_insn *insn, holder_type &m) ++ { ++ rtx src = get_single_set_op (insn); ++ return src && match (src, insn, m); ++ } ++ ++ static bool match (rtx src, rtx_insn *insn, holder_type &m) ++ { ++ if (GET_CODE (src) != OP1 && GET_CODE (src) != OP2) ++ return false; ++ ++ /* Store current operation if needed. */ ++ if (store_op) ++ static_cast (m[0]) = minimal_term_info (insn, src, ++ true); ++ ++ rtx lhs_op = XEXP (src, 0); ++ rtx rhs_op = XEXP (src, 1); ++ rtx_insn *lhs_def = get_single_def_insn (insn, lhs_op); ++ rtx_insn *rhs_def = get_single_def_insn (insn, rhs_op); ++ ++ return match (lhs_def, rhs_def, lhs_op, rhs_op, insn, m) ++ || (COMMUTATIVE && match (rhs_def, lhs_def, rhs_op, lhs_op, insn, m)); ++ } ++ ++private: ++ static bool match (rtx_insn *lhs_def, rtx_insn *rhs_def, ++ rtx lhs_op, rtx rhs_op, rtx_insn *insn, ++ holder_type &m) ++ { ++ /* Force template instantiation error on non-matching types. */ ++ gcc_assert ((typename LHS::term_type *) NULL ++ == (typename RHS::term_type *) NULL); ++ ++ /* Obtain locations in the storage. */ ++ typename LHS::holder_type &m_lhs ++ = (typename LHS::holder_type &) *(m + store_op); ++ typename RHS::holder_type &m_rhs ++ = (typename RHS::holder_type &) *(m + store_op ++ + LHS::holder_size); ++ ++ /* Try match both instructions. */ ++ if (lhs_def && rhs_def && LHS::match (lhs_def, m_lhs) ++ && RHS::match (rhs_def, m_rhs)) ++ return true; ++ /* Try match instruction and pattern. */ ++ else if (lhs_def && LHS::match (lhs_def, m_lhs) ++ && RHS::match (rhs_op, insn, m_rhs)) ++ return true; ++ /* Try match pattern and instruction. */ ++ else if (rhs_def && LHS::match (lhs_op, insn, m_lhs) ++ && RHS::match (rhs_def, m_rhs)) ++ return true; ++ /* Try match both patterns. 
*/ ++ else ++ return LHS::match (lhs_op, insn, m_lhs) ++ && RHS::match (rhs_op, insn, m_rhs); ++ } ++}; ++ ++#endif // GCC_RTL_MATCHER_H +diff --git a/gcc/testsuite/gcc.target/aarch64/aes-decrypt.c b/gcc/testsuite/gcc.target/aarch64/aes-decrypt.c +new file mode 100644 +index 000000000..966ec5532 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/aes-decrypt.c +@@ -0,0 +1,478 @@ ++/* { dg-do run } */ ++/* { dg-options "-O3 -fno-inline --save-temps -fcrypto-accel-aes -march=armv8.2-a+lse+crypto" } */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++typedef uint8_t u8; ++typedef uint32_t u32; ++ ++static const u32 Td0[256] = { ++ 0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U, ++ 0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U, ++ 0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U, ++ 0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU, ++ 0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U, ++ 0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U, ++ 0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU, ++ 0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U, ++ 0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU, ++ 0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U, ++ 0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U, ++ 0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U, ++ 0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U, ++ 0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU, ++ 0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U, ++ 0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU, ++ 0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U, ++ 0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU, ++ 0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U, ++ 0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U, ++ 0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U, ++ 0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU, ++ 0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U, ++ 0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU, ++ 0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U, ++ 0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU, ++ 0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U, ++ 0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU, ++ 0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU, ++ 0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U, ++ 0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU, ++ 0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U, ++ 0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU, ++ 0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U, ++ 0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U, ++ 0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U, ++ 0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU, ++ 0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U, ++ 0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U, ++ 0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU, ++ 0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U, ++ 0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U, ++ 0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U, ++ 0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U, ++ 0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U, ++ 0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU, ++ 0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U, ++ 0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U, ++ 0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U, ++ 0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U, ++ 0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U, ++ 0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU, ++ 0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 
0xe49604dfU, ++ 0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU, ++ 0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU, ++ 0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U, ++ 0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U, ++ 0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU, ++ 0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU, ++ 0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U, ++ 0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU, ++ 0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U, ++ 0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U, ++ 0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U, ++}; ++ ++static const u32 Td1[256] = { ++ 0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU, ++ 0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U, ++ 0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU, ++ 0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U, ++ 0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U, ++ 0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U, ++ 0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U, ++ 0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U, ++ 0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U, ++ 0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU, ++ 0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU, ++ 0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU, ++ 0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U, ++ 0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU, ++ 0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U, ++ 0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U, ++ 0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U, ++ 0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU, ++ 0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU, ++ 0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U, ++ 0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU, ++ 0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U, ++ 0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU, ++ 0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU, ++ 0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U, ++ 0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U, ++ 0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U, ++ 0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU, ++ 0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U, ++ 0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU, ++ 0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U, ++ 0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U, ++ 0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U, ++ 0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU, ++ 0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U, ++ 0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U, ++ 0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U, ++ 0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U, ++ 0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U, ++ 0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U, ++ 0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU, ++ 0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU, ++ 0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U, ++ 0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU, ++ 0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U, ++ 0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU, ++ 0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU, ++ 0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U, ++ 0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU, ++ 0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U, ++ 0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U, ++ 0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U, ++ 0x8d764dd6U, 0x4d43efb0U, 
0x54ccaa4dU, 0xdfe49604U, ++ 0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U, ++ 0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U, ++ 0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U, ++ 0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU, ++ 0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U, ++ 0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U, ++ 0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU, ++ 0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U, ++ 0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U, ++ 0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U, ++ 0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U, ++}; ++ ++static const u32 Td2[256] = { ++ 0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U, ++ 0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U, ++ 0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U, ++ 0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U, ++ 0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU, ++ 0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U, ++ 0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U, ++ 0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U, ++ 0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U, ++ 0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU, ++ 0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U, ++ 0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U, ++ 0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU, ++ 0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U, ++ 0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U, ++ 0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U, ++ 0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U, ++ 0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U, ++ 0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U, ++ 0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU, ++ 0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U, ++ 0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U, ++ 0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U, ++ 0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U, ++ 0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U, ++ 0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU, ++ 0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU, ++ 0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U, ++ 0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU, ++ 0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U, ++ 0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU, ++ 0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU, ++ 0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU, ++ 0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU, ++ 0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U, ++ 0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U, ++ 0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U, ++ 0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U, ++ 0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U, ++ 0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U, ++ 0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U, ++ 0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU, ++ 0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU, ++ 0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U, ++ 0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U, ++ 0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU, ++ 0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU, ++ 0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U, ++ 0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U, ++ 0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U, ++ 0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U, ++ 0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U, ++ 0xd68d764dU, 
0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U, ++ 0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U, ++ 0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU, ++ 0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U, ++ 0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U, ++ 0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U, ++ 0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U, ++ 0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U, ++ 0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U, ++ 0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU, ++ 0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U, ++ 0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U, ++}; ++ ++static const u32 Td3[256] = { ++ 0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU, ++ 0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU, ++ 0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U, ++ 0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U, ++ 0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU, ++ 0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU, ++ 0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U, ++ 0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU, ++ 0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U, ++ 0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU, ++ 0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U, ++ 0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U, ++ 0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U, ++ 0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U, ++ 0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U, ++ 0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU, ++ 0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU, ++ 0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U, ++ 0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U, ++ 0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU, ++ 0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU, ++ 0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U, ++ 0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U, ++ 0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U, ++ 0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U, ++ 0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU, ++ 0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U, ++ 0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U, ++ 0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU, ++ 0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU, ++ 0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U, ++ 0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U, ++ 0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U, ++ 0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU, ++ 0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U, ++ 0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U, ++ 0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U, ++ 0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U, ++ 0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U, ++ 0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U, ++ 0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U, ++ 0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU, ++ 0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U, ++ 0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U, ++ 0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU, ++ 0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU, ++ 0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U, ++ 0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU, ++ 0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U, ++ 0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U, ++ 0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U, ++ 0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U, ++ 
0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U, ++ 0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U, ++ 0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU, ++ 0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU, ++ 0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU, ++ 0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU, ++ 0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U, ++ 0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U, ++ 0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U, ++ 0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU, ++ 0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U, ++ 0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U, ++}; ++ ++static const u8 Td4[256] = { ++ 0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U, ++ 0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU, ++ 0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U, ++ 0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU, ++ 0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU, ++ 0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU, ++ 0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U, ++ 0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U, ++ 0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U, ++ 0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U, ++ 0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU, ++ 0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U, ++ 0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU, ++ 0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U, ++ 0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U, ++ 0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU, ++ 0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU, ++ 0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U, ++ 0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U, ++ 0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU, ++ 0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U, ++ 0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU, ++ 0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U, ++ 0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U, ++ 0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U, ++ 0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU, ++ 0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU, ++ 0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU, ++ 0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U, ++ 0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U, ++ 0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U, ++ 0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU, ++}; ++ ++#define GETU32(pt) \ ++ ( \ ++ ((u32)(pt)[0] << 24) \ ++ ^ ((u32)(pt)[1] << 16) \ ++ ^ ((u32)(pt)[2] << 8) \ ++ ^ ((u32)(pt)[3]) \ ++ ) ++ ++#define PUTU32(ct, st) \ ++ { \ ++ (ct)[0] = (u8)((st) >> 24); \ ++ (ct)[1] = (u8)((st) >> 16); \ ++ (ct)[2] = (u8)((st) >> 8); \ ++ (ct)[3] = (u8)(st); \ ++ } ++ ++void ++aes_decrypt (const unsigned char *in, unsigned char *out, ++ const u32 *rk, int nr) ++{ ++ u32 s0, s1, s2, s3, t0, t1, t2, t3; ++ ++ int r = nr >> 1; ++ ++ s0 = GETU32 (in ) ^ rk[0]; ++ s1 = GETU32 (in + 4) ^ rk[1]; ++ s2 = GETU32 (in + 8) ^ rk[2]; ++ s3 = GETU32 (in + 12) ^ rk[3]; ++ ++ for (;;) { ++ t0 = ++ Td0[(s0 >> 24) ] ^ ++ Td1[(s3 >> 16) & 0xff] ^ ++ Td2[(s2 >> 8) & 0xff] ^ ++ Td3[(s1 ) & 0xff] ^ ++ rk[4]; ++ t1 = ++ Td0[(s1 >> 24) ] ^ ++ Td1[(s0 >> 16) & 0xff] ^ ++ Td2[(s3 >> 8) & 0xff] ^ ++ Td3[(s2 ) & 0xff] ^ ++ rk[5]; ++ t2 = ++ Td0[(s2 >> 24) ] ^ ++ Td1[(s1 >> 16) & 0xff] ^ ++ Td2[(s0 >> 8) & 0xff] ^ ++ Td3[(s3 ) & 0xff] ^ ++ 
rk[6]; ++ t3 = ++ Td0[(s3 >> 24) ] ^ ++ Td1[(s2 >> 16) & 0xff] ^ ++ Td2[(s1 >> 8) & 0xff] ^ ++ Td3[(s0 ) & 0xff] ^ ++ rk[7]; ++ ++ rk += 8; ++ if (--r == 0) { ++ break; ++ } ++ ++ s0 = ++ Td0[(t0 >> 24) ] ^ ++ Td1[(t3 >> 16) & 0xff] ^ ++ Td2[(t2 >> 8) & 0xff] ^ ++ Td3[(t1 ) & 0xff] ^ ++ rk[0]; ++ s1 = ++ Td0[(t1 >> 24) ] ^ ++ Td1[(t0 >> 16) & 0xff] ^ ++ Td2[(t3 >> 8) & 0xff] ^ ++ Td3[(t2 ) & 0xff] ^ ++ rk[1]; ++ s2 = ++ Td0[(t2 >> 24) ] ^ ++ Td1[(t1 >> 16) & 0xff] ^ ++ Td2[(t0 >> 8) & 0xff] ^ ++ Td3[(t3 ) & 0xff] ^ ++ rk[2]; ++ s3 = ++ Td0[(t3 >> 24) ] ^ ++ Td1[(t2 >> 16) & 0xff] ^ ++ Td2[(t1 >> 8) & 0xff] ^ ++ Td3[(t0 ) & 0xff] ^ ++ rk[3]; ++ } ++ ++ s0 = ++ ((u32)Td4[(t0 >> 24) ] << 24) ^ ++ ((u32)Td4[(t3 >> 16) & 0xff] << 16) ^ ++ ((u32)Td4[(t2 >> 8) & 0xff] << 8) ^ ++ ((u32)Td4[(t1 ) & 0xff]) ^ ++ rk[0]; ++ PUTU32 (out , s0); ++ ++ s1 = ++ ((u32)Td4[(t1 >> 24) ] << 24) ^ ++ ((u32)Td4[(t0 >> 16) & 0xff] << 16) ^ ++ ((u32)Td4[(t3 >> 8) & 0xff] << 8) ^ ++ ((u32)Td4[(t2 ) & 0xff]) ^ ++ rk[1]; ++ PUTU32 (out + 4, s1); ++ ++ s2 = ++ ((u32)Td4[(t2 >> 24) ] << 24) ^ ++ ((u32)Td4[(t1 >> 16) & 0xff] << 16) ^ ++ ((u32)Td4[(t0 >> 8) & 0xff] << 8) ^ ++ ((u32)Td4[(t3 ) & 0xff]) ^ ++ rk[2]; ++ PUTU32 (out + 8, s2); ++ ++ s3 = ++ ((u32)Td4[(t3 >> 24) ] << 24) ^ ++ ((u32)Td4[(t2 >> 16) & 0xff] << 16) ^ ++ ((u32)Td4[(t1 >> 8) & 0xff] << 8) ^ ++ ((u32)Td4[(t0 ) & 0xff]) ^ ++ rk[3]; ++ PUTU32 (out + 12, s3); ++} ++ ++int main () ++{ ++ const u8 input[16] = { 0x39, 0x25, 0x84, 0x1d, 0x02, 0xdc, 0x09, 0xfb, ++ 0xdc, 0x11, 0x85, 0x97, 0x19, 0x6a, 0x0b, 0x32 }; ++ ++ const u8 expected[16] = { 0x32, 0x43, 0xf6, 0xa8, 0x88, 0x5a, 0x30, 0x8d, ++ 0x31, 0x31, 0x98, 0xa2, 0xe0, 0x37, 0x07, 0x34 }; ++ ++ const u8 key[] = { 0xa8, 0xf9, 0x14, 0xd0, 0x89, 0x25, 0xee, 0xc9, ++ 0xc8, 0x0c, 0x3f, 0xe1, 0xa6, 0x0c, 0x63, 0xb6, ++ 0x63, 0x5a, 0x7b, 0x0c, 0xfe, 0xea, 0x19, 0x13, ++ 0x90, 0x88, 0x39, 0xb0, 0xb4, 0xfb, 0x4c, 0x66, ++ 0x5a, 0x92, 0x7d, 0xdf, 0x9d, 0xb0, 0x62, 0x1f, ++ 0x6e, 0x62, 0x20, 0xa3, 0x24, 0x73, 0x75, 0xd6, ++ 0x47, 0x76, 0xc0, 0x12, 0xc7, 0x22, 0x1f, 0xc0, ++ 0xf3, 0xd2, 0x42, 0xbc, 0x4a, 0x11, 0x55, 0x75, ++ 0x76, 0xd8, 0xfc, 0x6e, 0x80, 0x54, 0xdf, 0xd2, ++ 0x34, 0xf0, 0x5d, 0x7c, 0xb9, 0xc3, 0x17, 0xc9, ++ 0xfc, 0x0a, 0xa3, 0x6e, 0xf6, 0x8c, 0x23, 0xbc, ++ 0xb4, 0xa4, 0x82, 0xae, 0x8d, 0x33, 0x4a, 0xb5, ++ 0x13, 0x44, 0x88, 0x90, 0x0a, 0x86, 0x80, 0xd2, ++ 0x42, 0x28, 0xa1, 0x12, 0x39, 0x97, 0xc8, 0x1b, ++ 0xf7, 0x13, 0x1f, 0x7c, 0x19, 0xc2, 0x08, 0x42, ++ 0x48, 0xae, 0x21, 0xc0, 0x7b, 0xbf, 0x69, 0x09, ++ 0xeb, 0x05, 0x75, 0xcc, 0xee, 0xd1, 0x17, 0x3e, ++ 0x51, 0x6c, 0x29, 0x82, 0x33, 0x11, 0x48, 0xc9, ++ 0xa7, 0x08, 0x37, 0x2b, 0x05, 0xd4, 0x62, 0xf2, ++ 0xbf, 0xbd, 0x3e, 0xbc, 0x62, 0x7d, 0x61, 0x4b, ++ 0x16, 0x15, 0x7e, 0x2b, 0xa6, 0xd2, 0xae, 0x28, ++ 0x88, 0x15, 0xf7, 0xab, 0x3c, 0x4f, 0xcf, 0x09 }; ++ ++ u8 output[16] = { 0 }; ++ ++ aes_decrypt (input, output, (u32*) key, 10); ++ ++ if (memcmp (output, expected, 16) != 0) ++ abort (); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-assembler "rev32" } } */ ++/* { dg-final { scan-assembler "aesimc" } } */ ++/* { dg-final { scan-assembler "aesd" } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/aes-encrypt.c b/gcc/testsuite/gcc.target/aarch64/aes-encrypt.c +new file mode 100644 +index 000000000..e3f3c446f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/aes-encrypt.c +@@ -0,0 +1,443 @@ ++/* { dg-do run } */ ++/* { dg-options "-O3 -fno-inline --save-temps -fcrypto-accel-aes -march=armv8.2-a+lse+crypto" } */ ++ ++#include ++#include 
++#include ++#include ++#include ++ ++typedef uint8_t u8; ++typedef uint32_t u32; ++ ++static const u32 Te0[256] = { ++ 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU, ++ 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U, ++ 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU, ++ 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU, ++ 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U, ++ 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU, ++ 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU, ++ 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU, ++ 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU, ++ 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU, ++ 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U, ++ 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU, ++ 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU, ++ 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U, ++ 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU, ++ 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU, ++ 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU, ++ 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU, ++ 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU, ++ 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U, ++ 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU, ++ 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU, ++ 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU, ++ 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU, ++ 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U, ++ 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U, ++ 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U, ++ 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U, ++ 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU, ++ 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U, ++ 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U, ++ 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU, ++ 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU, ++ 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U, ++ 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U, ++ 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U, ++ 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU, ++ 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U, ++ 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU, ++ 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U, ++ 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU, ++ 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U, ++ 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U, ++ 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU, ++ 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U, ++ 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U, ++ 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U, ++ 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U, ++ 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U, ++ 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U, ++ 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U, ++ 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U, ++ 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU, ++ 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U, ++ 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U, ++ 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U, ++ 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U, ++ 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U, ++ 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U, ++ 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU, ++ 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U, ++ 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U, ++ 0x824141c3U, 
0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U, ++ 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU, ++}; ++ ++static const u32 Te1[256] = { ++ 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU, ++ 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U, ++ 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU, ++ 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U, ++ 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU, ++ 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U, ++ 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU, ++ 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U, ++ 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U, ++ 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU, ++ 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U, ++ 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U, ++ 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U, ++ 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU, ++ 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U, ++ 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U, ++ 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU, ++ 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U, ++ 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U, ++ 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U, ++ 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU, ++ 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU, ++ 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U, ++ 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU, ++ 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU, ++ 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U, ++ 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU, ++ 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U, ++ 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU, ++ 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U, ++ 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U, ++ 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U, ++ 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU, ++ 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U, ++ 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU, ++ 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U, ++ 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU, ++ 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U, ++ 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U, ++ 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU, ++ 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU, ++ 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU, ++ 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U, ++ 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U, ++ 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU, ++ 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U, ++ 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU, ++ 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U, ++ 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU, ++ 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U, ++ 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU, ++ 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU, ++ 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U, ++ 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU, ++ 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U, ++ 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU, ++ 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U, ++ 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U, ++ 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U, ++ 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU, ++ 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU, ++ 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U, ++ 
0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU, ++ 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U, ++}; ++ ++static const u32 Te2[256] = { ++ 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU, ++ 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U, ++ 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU, ++ 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U, ++ 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU, ++ 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U, ++ 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU, ++ 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U, ++ 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U, ++ 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU, ++ 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U, ++ 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U, ++ 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U, ++ 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU, ++ 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U, ++ 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U, ++ 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU, ++ 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U, ++ 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U, ++ 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U, ++ 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU, ++ 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU, ++ 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U, ++ 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU, ++ 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU, ++ 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U, ++ 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU, ++ 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U, ++ 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU, ++ 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U, ++ 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U, ++ 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U, ++ 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU, ++ 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U, ++ 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU, ++ 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U, ++ 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU, ++ 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U, ++ 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U, ++ 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU, ++ 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU, ++ 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU, ++ 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U, ++ 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U, ++ 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU, ++ 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U, ++ 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU, ++ 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U, ++ 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU, ++ 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U, ++ 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU, ++ 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU, ++ 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U, ++ 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU, ++ 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U, ++ 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU, ++ 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U, ++ 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U, ++ 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U, ++ 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU, ++ 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU, ++ 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 
0x68b8d068U, ++ 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU, ++ 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U, ++}; ++ ++static const u32 Te3[256] = { ++ 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U, ++ 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U, ++ 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U, ++ 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU, ++ 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU, ++ 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU, ++ 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U, ++ 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU, ++ 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU, ++ 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U, ++ 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U, ++ 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU, ++ 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU, ++ 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU, ++ 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU, ++ 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU, ++ 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U, ++ 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU, ++ 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU, ++ 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U, ++ 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U, ++ 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U, ++ 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U, ++ 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U, ++ 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU, ++ 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U, ++ 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU, ++ 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU, ++ 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U, ++ 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U, ++ 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U, ++ 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU, ++ 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U, ++ 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU, ++ 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU, ++ 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U, ++ 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U, ++ 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU, ++ 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U, ++ 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU, ++ 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U, ++ 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U, ++ 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U, ++ 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U, ++ 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU, ++ 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U, ++ 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU, ++ 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U, ++ 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU, ++ 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U, ++ 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU, ++ 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU, ++ 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU, ++ 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU, ++ 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U, ++ 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U, ++ 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U, ++ 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U, ++ 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U, ++ 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U, ++ 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU, ++ 0xbfbfda65U, 0xe6e631d7U, 
0x4242c684U, 0x6868b8d0U, ++ 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU, ++ 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU, ++}; ++ ++#define GETU32(pt) \ ++ ( \ ++ ((u32)(pt)[0] << 24) \ ++ ^ ((u32)(pt)[1] << 16) \ ++ ^ ((u32)(pt)[2] << 8) \ ++ ^ ((u32)(pt)[3]) \ ++ ) ++ ++#define PUTU32(ct, st) \ ++ { \ ++ (ct)[0] = (u8)((st) >> 24); \ ++ (ct)[1] = (u8)((st) >> 16); \ ++ (ct)[2] = (u8)((st) >> 8); \ ++ (ct)[3] = (u8)(st); \ ++ } ++ ++void ++aes_encrypt (const unsigned char *in, unsigned char *out, ++ const u32 *rk, int nr) ++{ ++ u32 s0, s1, s2, s3, t0, t1, t2, t3; ++ ++ int r = nr >> 1; ++ ++ s0 = GETU32 (in ) ^ rk[0]; ++ s1 = GETU32 (in + 4) ^ rk[1]; ++ s2 = GETU32 (in + 8) ^ rk[2]; ++ s3 = GETU32 (in + 12) ^ rk[3]; ++ ++ for (;;) { ++ t0 = ++ Te0[(s0 >> 24) ] ^ ++ Te1[(s1 >> 16) & 0xff] ^ ++ Te2[(s2 >> 8) & 0xff] ^ ++ Te3[(s3 ) & 0xff] ^ ++ rk[4]; ++ t1 = ++ Te0[(s1 >> 24) ] ^ ++ Te1[(s2 >> 16) & 0xff] ^ ++ Te2[(s3 >> 8) & 0xff] ^ ++ Te3[(s0 ) & 0xff] ^ ++ rk[5]; ++ t2 = ++ Te0[(s2 >> 24) ] ^ ++ Te1[(s3 >> 16) & 0xff] ^ ++ Te2[(s0 >> 8) & 0xff] ^ ++ Te3[(s1 ) & 0xff] ^ ++ rk[6]; ++ t3 = ++ Te0[(s3 >> 24) ] ^ ++ Te1[(s0 >> 16) & 0xff] ^ ++ Te2[(s1 >> 8) & 0xff] ^ ++ Te3[(s2 ) & 0xff] ^ ++ rk[7]; ++ ++ rk += 8; ++ if (--r == 0) ++ break; ++ ++ s0 = ++ Te0[(t0 >> 24) ] ^ ++ Te1[(t1 >> 16) & 0xff] ^ ++ Te2[(t2 >> 8) & 0xff] ^ ++ Te3[(t3 ) & 0xff] ^ ++ rk[0]; ++ s1 = ++ Te0[(t1 >> 24) ] ^ ++ Te1[(t2 >> 16) & 0xff] ^ ++ Te2[(t3 >> 8) & 0xff] ^ ++ Te3[(t0 ) & 0xff] ^ ++ rk[1]; ++ s2 = ++ Te0[(t2 >> 24) ] ^ ++ Te1[(t3 >> 16) & 0xff] ^ ++ Te2[(t0 >> 8) & 0xff] ^ ++ Te3[(t1 ) & 0xff] ^ ++ rk[2]; ++ s3 = ++ Te0[(t3 >> 24) ] ^ ++ Te1[(t0 >> 16) & 0xff] ^ ++ Te2[(t1 >> 8) & 0xff] ^ ++ Te3[(t2 ) & 0xff] ^ ++ rk[3]; ++ } ++ ++ s0 = ++ (Te2[(t0 >> 24) ] & 0xff000000) ^ ++ (Te3[(t1 >> 16) & 0xff] & 0x00ff0000) ^ ++ (Te0[(t2 >> 8) & 0xff] & 0x0000ff00) ^ ++ (Te1[(t3 ) & 0xff] & 0x000000ff) ^ ++ rk[0]; ++ PUTU32 (out , s0); ++ ++ s1 = ++ (Te2[(t1 >> 24) ] & 0xff000000) ^ ++ (Te3[(t2 >> 16) & 0xff] & 0x00ff0000) ^ ++ (Te0[(t3 >> 8) & 0xff] & 0x0000ff00) ^ ++ (Te1[(t0 ) & 0xff] & 0x000000ff) ^ ++ rk[1]; ++ PUTU32 (out + 4, s1); ++ ++ s2 = ++ (Te2[(t2 >> 24) ] & 0xff000000) ^ ++ (Te3[(t3 >> 16) & 0xff] & 0x00ff0000) ^ ++ (Te0[(t0 >> 8) & 0xff] & 0x0000ff00) ^ ++ (Te1[(t1 ) & 0xff] & 0x000000ff) ^ ++ rk[2]; ++ PUTU32 (out + 8, s2); ++ ++ s3 = ++ (Te2[(t3 >> 24) ] & 0xff000000) ^ ++ (Te3[(t0 >> 16) & 0xff] & 0x00ff0000) ^ ++ (Te0[(t1 >> 8) & 0xff] & 0x0000ff00) ^ ++ (Te1[(t2 ) & 0xff] & 0x000000ff) ^ ++ rk[3]; ++ PUTU32 (out + 12, s3); ++} ++ ++ ++int main () ++{ ++ const u8 input[16] = { 0x32, 0x43, 0xf6, 0xa8, 0x88, 0x5a, 0x30, 0x8d, ++ 0x31, 0x31, 0x98, 0xa2, 0xe0, 0x37, 0x07, 0x34 }; ++ ++ const u8 expected[16] = { 0x39, 0x25, 0x84, 0x1d, 0x02, 0xdc, 0x09, 0xfb, ++ 0xdc, 0x11, 0x85, 0x97, 0x19, 0x6a, 0x0b, 0x32 }; ++ ++ const u8 key[] = { 0x16, 0x15, 0x7e, 0x2b, 0xa6, 0xd2, 0xae, 0x28, ++ 0x88, 0x15, 0xf7, 0xab, 0x3c, 0x4f, 0xcf, 0x09, ++ 0x17, 0xfe, 0xfa, 0xa0, 0xb1, 0x2c, 0x54, 0x88, ++ 0x39, 0x39, 0xa3, 0x23, 0x05, 0x76, 0x6c, 0x2a, ++ 0xf2, 0x95, 0xc2, 0xf2, 0x43, 0xb9, 0x96, 0x7a, ++ 0x7a, 0x80, 0x35, 0x59, 0x7f, 0xf6, 0x59, 0x73, ++ 0x7d, 0x47, 0x80, 0x3d, 0x3e, 0xfe, 0x16, 0x47, ++ 0x44, 0x7e, 0x23, 0x1e, 0x3b, 0x88, 0x7a, 0x6d, ++ 0x41, 0xa5, 0x44, 0xef, 0x7f, 0x5b, 0x52, 0xa8, ++ 0x3b, 0x25, 0x71, 0xb6, 0x00, 0xad, 0x0b, 0xdb, ++ 0xf8, 0xc6, 0xd1, 0xd4, 0x87, 0x9d, 0x83, 0x7c, ++ 0xbc, 0xb8, 0xf2, 0xca, 0xbc, 0x15, 0xf9, 0x11, ++ 0x7a, 0xa3, 0x88, 0x6d, 0xfd, 0x3e, 0x0b, 0x11, ++ 
0x41, 0x86, 0xf9, 0xdb, 0xfd, 0x93, 0x00, 0xca, ++ 0x0e, 0xf7, 0x54, 0x4e, 0xf3, 0xc9, 0x5f, 0x5f, ++ 0xb2, 0x4f, 0xa6, 0x84, 0x4f, 0xdc, 0xa6, 0x4e, ++ 0x21, 0x73, 0xd2, 0xea, 0xd2, 0xba, 0x8d, 0xb5, ++ 0x60, 0xf5, 0x2b, 0x31, 0x2f, 0x29, 0x8d, 0x7f, ++ 0xf3, 0x66, 0x77, 0xac, 0x21, 0xdc, 0xfa, 0x19, ++ 0x41, 0x29, 0xd1, 0x28, 0x6e, 0x00, 0x5c, 0x57, ++ 0xa8, 0xf9, 0x14, 0xd0, 0x89, 0x25, 0xee, 0xc9, ++ 0xc8, 0x0c, 0x3f, 0xe1, 0xa6, 0x0c, 0x63, 0xb6 }; ++ ++ u8 output[16] = { 0 }; ++ ++ aes_encrypt (input, output, (u32*) key, 10); ++ ++ if (memcmp (output, expected, 16) != 0) ++ abort (); ++ ++ return 0; ++} ++ ++/* { dg-final { scan-assembler "rev32" } } */ ++/* { dg-final { scan-assembler "aesmc" } } */ ++/* { dg-final { scan-assembler "aese" } } */ +-- +2.33.0 + diff --git a/0161-Fix-lost-ftree-fold-phiopt-option-in-tests.patch b/0161-Fix-lost-ftree-fold-phiopt-option-in-tests.patch new file mode 100644 index 0000000..e87ac76 --- /dev/null +++ b/0161-Fix-lost-ftree-fold-phiopt-option-in-tests.patch @@ -0,0 +1,51 @@ +From 885c6fbfa6412a81740a8c806fa82273b7114b24 Mon Sep 17 00:00:00 2001 +From: Pronin Alexander 00812787 +Date: Wed, 13 Dec 2023 18:38:33 +0800 +Subject: [PATCH 1/2] Fix lost ftree-fold-phiopt option in tests + +--- + gcc/testsuite/gcc.dg/double_sized_mul-1.c | 2 +- + gcc/testsuite/gcc.dg/double_sized_mul-2.c | 2 +- + gcc/testsuite/gcc.dg/ifcvt-gimple.c | 2 +- + 3 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-1.c b/gcc/testsuite/gcc.dg/double_sized_mul-1.c +index 4d475cc8a..bdb503bc4 100644 +--- a/gcc/testsuite/gcc.dg/double_sized_mul-1.c ++++ b/gcc/testsuite/gcc.dg/double_sized_mul-1.c +@@ -1,7 +1,7 @@ + /* { dg-do compile } */ + /* fif-conversion-gimple and fuaddsub-overflow-match-all are required for + proper overflow detection in some cases. */ +-/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */ ++/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -ftree-fold-phiopt -fdump-tree-widening_mul-stats" } */ + #include + + typedef unsigned __int128 uint128_t; +diff --git a/gcc/testsuite/gcc.dg/double_sized_mul-2.c b/gcc/testsuite/gcc.dg/double_sized_mul-2.c +index cc6e5af25..f9d58a2f6 100644 +--- a/gcc/testsuite/gcc.dg/double_sized_mul-2.c ++++ b/gcc/testsuite/gcc.dg/double_sized_mul-2.c +@@ -1,7 +1,7 @@ + /* { dg-do compile } */ + /* fif-conversion-gimple is required for proper overflow detection + in some cases. 
*/ +-/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -fdump-tree-widening_mul-stats" } */ ++/* { dg-options "-O2 -fif-conversion-gimple -fuaddsub-overflow-match-all -ftree-fold-phiopt -fdump-tree-widening_mul-stats" } */ + #include + + typedef unsigned __int128 uint128_t; +diff --git a/gcc/testsuite/gcc.dg/ifcvt-gimple.c b/gcc/testsuite/gcc.dg/ifcvt-gimple.c +index 0f7c87e5c..4dc0f9206 100644 +--- a/gcc/testsuite/gcc.dg/ifcvt-gimple.c ++++ b/gcc/testsuite/gcc.dg/ifcvt-gimple.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -fif-conversion-gimple -fdump-tree-optimized" } */ ++/* { dg-options "-O2 -fif-conversion-gimple -fdump-tree-optimized -ftree-fold-phiopt" } */ + + int test_int (int optimizable_int) { + if (optimizable_int > 5) +-- +2.33.0 + diff --git a/0162-rtl-ifcvt-free-dominance-info-before-cleanup_cfg.patch b/0162-rtl-ifcvt-free-dominance-info-before-cleanup_cfg.patch new file mode 100644 index 0000000..4faec3c --- /dev/null +++ b/0162-rtl-ifcvt-free-dominance-info-before-cleanup_cfg.patch @@ -0,0 +1,25 @@ +From 126bd5722f96733e7fbe433062861d5c3534911a Mon Sep 17 00:00:00 2001 +From: vchernon +Date: Wed, 13 Dec 2023 21:52:03 +0800 +Subject: [PATCH 2/2] [rtl-ifcvt] free dominance info before cleanup_cfg: stale + dominance info can cause an infinite loop in cleanup_cfg + +--- + gcc/ifcvt.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c +index 209987ebc..04086c560 100644 +--- a/gcc/ifcvt.c ++++ b/gcc/ifcvt.c +@@ -5624,6 +5624,7 @@ if_convert (bool after_combine) + df_live_add_problem (); + df_live_set_all_dirty (); + } ++ free_dominance_info (CDI_DOMINATORS); + cleanup_cfg (CLEANUP_EXPENSIVE); + + /* Record whether we are after combine pass. */ +-- +2.33.0 + diff --git a/0163-Loop-CRC-Solving-the-problem-of-insufficient-CRC-tab.patch b/0163-Loop-CRC-Solving-the-problem-of-insufficient-CRC-tab.patch new file mode 100644 index 0000000..48d4a57 --- /dev/null +++ b/0163-Loop-CRC-Solving-the-problem-of-insufficient-CRC-tab.patch @@ -0,0 +1,42 @@ +From 3281cef37191a800d4fcc916c0e9d5c7a43802a4 Mon Sep 17 00:00:00 2001 +From: XingYuShuai <1150775134@qq.com> +Date: Thu, 14 Dec 2023 20:11:35 +0800 +Subject: [PATCH 1/2] [Loop CRC] Solving the problem of insufficient CRC table + validation + +--- + gcc/tree-ssa-loop-crc.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/gcc/tree-ssa-loop-crc.c b/gcc/tree-ssa-loop-crc.c +index 9878363eb..2dd9e1e3b 100644 +--- a/gcc/tree-ssa-loop-crc.c ++++ b/gcc/tree-ssa-loop-crc.c +@@ -336,11 +336,14 @@ only_one_array_read (class loop *loop, tree &crc_table) + && TREE_CODE (gimple_assign_lhs (stmt)) == ARRAY_REF) + return false; + ++ /* Only one-dimensional integer arrays meet the condition. */ + if (gimple_code (stmt) == GIMPLE_ASSIGN +- && TREE_CODE (gimple_assign_rhs1 (stmt)) == ARRAY_REF) ++ && TREE_CODE (gimple_assign_rhs1 (stmt)) == ARRAY_REF ++ && TREE_CODE (TREE_OPERAND (gimple_assign_rhs1 (stmt), 0)) == VAR_DECL ++ && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) == INTEGER_TYPE) + { + if (crc_table == NULL +- && gimple_assign_rhs1 (stmt)->base.readonly_flag) ++ && TREE_READONLY (gimple_assign_rhs1 (stmt))) + { + crc_table = gimple_assign_rhs1 (stmt); + crc_table_read_stmt = stmt; +@@ -438,7 +441,6 @@ match_crc_table (tree crc_table) + return true; + } + +- + /* Check the crc table. The loop should have only one data reference. + And match the data reference with the predefined array.
*/ + static bool +-- +2.33.0 + diff --git a/0164-LLC-Allocation-Fix-some-bugs-and-remove-variable-pre.patch b/0164-LLC-Allocation-Fix-some-bugs-and-remove-variable-pre.patch new file mode 100644 index 0000000..f1507fb --- /dev/null +++ b/0164-LLC-Allocation-Fix-some-bugs-and-remove-variable-pre.patch @@ -0,0 +1,924 @@ +From 1722afc51311a6bb0b892df50602f660c706162f Mon Sep 17 00:00:00 2001 +From: liuf9 +Date: Fri, 15 Dec 2023 11:25:48 +0800 +Subject: [PATCH 2/2] [LLC Allocation] Fix some bugs and remove variable + prefetch tool. After outer loop analysis, kernel candidates can be nested + loops, which conflicts with the early-exit criterion of the kernel filtering + process, so we restrict that criterion to innermost loops only. We also fix + some pass configuration bugs in common.opt and params.opt. We remove the + variable prefetch tool because it may accept unsafe inputs from users. + +--- + gcc/common.opt | 2 +- + gcc/params.opt | 24 +- + gcc/testsuite/gcc.dg/llc-allocate/llc-2.c | 2 +- + .../llc-allocate/llc-issue-builtin-prefetch.c | 48 ---- + .../llc-allocate/llc-tool-insertion-1.c | 48 ---- + .../llc-allocate/llc-tool-insertion-2.c | 48 ---- + .../llc-allocate/llc-tool-insertion-3.c | 48 ---- + .../llc-allocate/llc-tool-insertion-4.c | 47 --- + .../llc-allocate/llc-tool-insertion-5.c | 48 ---- + .../llc-allocate/llc-tool-insertion-6.c | 47 --- + .../llc-tool-insertion-7-null-var-name.c | 52 ---- + .../llc-tool-insertion-8-tmp-var-name.c | 54 ---- + gcc/tree-ssa-llc-allocate.c | 267 +----------------- + 13 files changed, 11 insertions(+), 724 deletions(-) + delete mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c + delete mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-1.c + delete mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-2.c + delete mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-3.c + delete mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-4.c + delete mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-5.c + delete mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-6.c + delete mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-7-null-var-name.c + delete mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c + +diff --git a/gcc/common.opt b/gcc/common.opt +index 56ad9a378..a8a2264ee 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -2255,7 +2255,7 @@ Generate prefetch instructions, if available, for arrays in loops. The prefetch + level can control the optimize level to array prefetch. + + fllc-allocate +-Common Report Var(flag_llc_allocate) Init(-1) Optimization ++Common Report Var(flag_llc_allocate) Optimization + Generate LLC hint instructions. + + fipa-prefetch +diff --git a/gcc/params.opt b/gcc/params.opt +index 792ca5c35..ef7bea311 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -1030,13 +1030,11 @@ Common Joined UInteger Var(param_mem_access_num) Init(3) Param Optimization + Memory access num. + + -param=prefetch-offset= +-Common Joined UInteger Var(param_prefetch_offset) Init(1024) +-IntegerRange(1, 999999) Param Optimization ++Common Joined UInteger Var(param_prefetch_offset) Init(1024) IntegerRange(1, 999999) Param Optimization + Prefetch Offset, which is usually a power of two due to cache line size.
+ + -param=branch-prob-threshold= +-Common Joined UInteger Var(param_branch_prob_threshold) Init(80) IntegerRange(50, 100) +-Param Optimization ++Common Joined UInteger Var(param_branch_prob_threshold) Init(80) IntegerRange(50, 100) Param Optimization + High Execution Rate Branch Threshold. + + -param=issue-topn= +@@ -1051,24 +1049,6 @@ Force issue the topn LLC mem_ref hint, without generating dynamic multi-branches + Common Joined UInteger Var(param_llc_capacity_per_core) Init(114) IntegerRange(0, 999999) Param + LLC capacity per core. + +--param=target-variables= +-Common Joined Var(param_target_variables) Init("") Param Optimization +---param=target-variables=[,,...] Target variables for prefetching, separated by comma, +-without space. The representation of a variable can be complex and containing space, please surround +-it by quotation marks and escape special characters in Linux. The input length should be no more +-than 512 characters. +- +--param=use-ref-group-index= +-Common Joined UInteger Var(param_use_ref_group_index) Init(0) IntegerRange(0, 1) Param Optimization +-Prefetch the target variables by their indices in sorted ref_groups, use together with parameter +-target-variables. +- +--param=mem-ref-index= +-Common Joined Var(param_mem_ref_index) Init("") Param Optimization +---param=mem-ref-index=[,,...] Prefetch the target variable at the memory reference +-location with the index of customized order, separated by comma, without space. The input length +-should be no more than 512 characters. +- + -param=filter-kernels= + Common Joined UInteger Var(param_filter_kernels) Init(1) IntegerRange(0, 1) Param + Allow LLC allocate pass to greedily filter kernels by traversing the corresponding basic blocks +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c +index 9f8a5c307..f8b1cc5c1 100644 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c +@@ -45,7 +45,7 @@ main (int argc, char *argv[]) + /* { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 2 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times ", size: 0\.000000" 8 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times ", size: 0\.000000" 6 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "\\d x_data \\(0.000000, 1, 0\\) : 3" 2 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "\\d A_j \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "\\d A_data \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c +deleted file mode 100644 +index 2a58c501f..000000000 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-issue-builtin-prefetch.c ++++ /dev/null +@@ -1,48 +0,0 @@ +-/* { dg-do compile { target { aarch64*-*-linux* } } } */ +-/* { dg-options "-O3 -march=armv8.2-a+sve -funroll-loops -ffast-math -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param target-variables=uPtr" } */ +- +-#include +- +-#define N 131590 +-#define F 384477 +- +-double diagPtr[N]; +-double psiPtr[N]; +-double ApsiPtr[N]; +-int lPtr[F]; +-int uPtr[F]; +-double lowerPtr[F]; +-double upperPtr[F]; +- +-void +-AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, +- 
int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +-{ +- for (int cell=0; cell +- +-#define N 131590 +-#define F 384477 +- +-double diagPtr[N]; +-double psiPtr[N]; +-double ApsiPtr[N]; +-int lPtr[F]; +-int uPtr[F]; +-double lowerPtr[F]; +-double upperPtr[F]; +- +-void +-AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, +- int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +-{ +- for (int cell=0; cell +- +-#define N 131590 +-#define F 384477 +- +-double diagPtr[N]; +-double psiPtr[N]; +-double ApsiPtr[N]; +-int lPtr[F]; +-int uPtr[F]; +-double lowerPtr[F]; +-double upperPtr[F]; +- +-void +-AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, +- int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +-{ +- for (int cell=0; cell +- +-#define N 131590 +-#define F 384477 +- +-double diagPtr[N]; +-double psiPtr[N]; +-double ApsiPtr[N]; +-int lPtr[F]; +-int uPtr[F]; +-double lowerPtr[F]; +-double upperPtr[F]; +- +-void +-AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, +- int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +-{ +- for (int cell=0; cell +- +-#define N 131590 +-#define F 384477 +- +-double diagPtr[N]; +-double psiPtr[N]; +-double ApsiPtr[N]; +-int lPtr[F]; +-int uPtr[F]; +-double lowerPtr[F]; +-double upperPtr[F]; +- +-void +-AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, +- int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +-{ +- for (int cell=0; cell +- +-#define N 131590 +-#define F 384477 +- +-double diagPtr[N]; +-double psiPtr[N]; +-double ApsiPtr[N]; +-int lPtr[F]; +-int uPtr[F]; +-double lowerPtr[F]; +-double upperPtr[F]; +- +-void +-AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, +- int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +-{ +- for (int cell=0; cell +- +-#define N 131590 +-#define F 384477 +- +-double diagPtr[N]; +-double psiPtr[N]; +-double ApsiPtr[N]; +-int lPtr[F]; +-int uPtr[F]; +-double lowerPtr[F]; +-double upperPtr[F]; +- +-void +-AMUL (double *diagPtr, double *psiPtr, double *ApsiPtr, int *lPtr, +- int *uPtr, double *lowerPtr, double *upperPtr, int nCells, int nFaces) +-{ +- for (int cell=0; cellaux\"" } */ +- +-#include +- +-typedef struct stack_def +-{ +- int top; /* index to top stack element */ +- unsigned long reg_set; /* set of live registers */ +- unsigned char reg[128]; /* register - stack mapping */ +-} *stack; +- +-typedef struct block_info_def +-{ +- struct stack_def stack_in; /* Input stack configuration. */ +- struct stack_def stack_out; /* Output stack configuration. */ +- unsigned long out_reg_set; /* Stack regs live on output. */ +- int done; /* True if block already converted. */ +- int predecessors; /* Number of predecessors that need +- to be visited. 
*/ +-} *block_info; +- +-typedef struct basic_block_def +-{ +- void *aux; +-} *basic_block; +- +-unsigned char +-convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) +-{ +- stack output_stack; +- +- output_stack = &(((block_info) bb->aux)->stack_in); +- if (value_reg_low == -1) +- output_stack->top = -1; +- else +- { +- int reg; +- output_stack->top = value_reg_high - value_reg_low; +- for (reg = value_reg_low; reg <= value_reg_high; ++reg) +- { +- (output_stack->reg + 16)[value_reg_high - reg] = reg; +- output_stack->reg_set |= (unsigned long) 1 << reg; +- } +- } +- return output_stack->reg[0]; +-} +- +-/* { dg-final { scan-tree-dump-not "Unrecognizable variable name" +- "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-not "static issue" "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c +deleted file mode 100644 +index 09a525ce1..000000000 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-tool-insertion-8-tmp-var-name.c ++++ /dev/null +@@ -1,54 +0,0 @@ +-/* { dg-do compile { target { aarch64*-*-linux* } } } */ +-/* { dg-options "-O3 -c -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=0 --param target-variables=tmp_var_0" } */ +- +-#include +- +-typedef struct stack_def +-{ +- int top; /* index to top stack element */ +- unsigned long reg_set; /* set of live registers */ +- unsigned char reg[128]; /* register - stack mapping */ +-} *stack; +- +-typedef struct block_info_def +-{ +- struct stack_def stack_in; /* Input stack configuration. */ +- struct stack_def stack_out; /* Output stack configuration. */ +- unsigned long out_reg_set; /* Stack regs live on output. */ +- int done; /* True if block already converted. */ +- int predecessors; /* Number of predecessors that need +- to be visited. */ +-} *block_info; +- +-typedef struct basic_block_def +-{ +- void *aux; +-} *basic_block; +- +-unsigned char +-convert_regs_exit (basic_block bb, int value_reg_low, int value_reg_high) +-{ +- stack output_stack; +- +- output_stack = &(((block_info) bb->aux)->stack_in); +- if (value_reg_low == -1) +- output_stack->top = -1; +- else +- { +- int reg; +- output_stack->top = value_reg_high - value_reg_low; +- for (reg = value_reg_low; reg <= value_reg_high; ++reg) +- { +- (output_stack->reg + 16)[value_reg_high - reg] = reg; +- output_stack->reg_set |= (unsigned long) 1 << reg; +- } +- } +- return output_stack->reg[0]; +-} +- +-/* { dg-final { scan-tree-dump-not "Unrecognizable variable name" +- "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times "NOTICE: Prefetching target variable \"" +- " bb_16(D)->aux \"" 1 "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-not "runtime issue" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ +diff --git a/gcc/tree-ssa-llc-allocate.c b/gcc/tree-ssa-llc-allocate.c +index 890f66e54..fa8979401 100644 +--- a/gcc/tree-ssa-llc-allocate.c ++++ b/gcc/tree-ssa-llc-allocate.c +@@ -23,7 +23,6 @@ along with GCC; see the file COPYING3. If not see + #define INCLUDE_VECTOR + #define INCLUDE_LIST + #define INCLUDE_ALGORITHM +-#define INCLUDE_STRING + #include "system.h" + #include "coretypes.h" + #include "backend.h" +@@ -1866,7 +1865,10 @@ filter_and_sort_kernels (vector &sorted_kernels, + list walked_header_bb; /* Used to record nested loops. 
*/ + + for (unsigned i = 0; i < kernels.size (); ++i) +- end_bb.insert (kernels[i]->header); ++ { ++ if (kernels[i]->inner == NULL) ++ end_bb.insert (kernels[i]->header); ++ } + + dump_loop_headers ("kernels", kernels); + +@@ -2380,30 +2382,6 @@ issue_builtin_prefetch (data_ref &mem_ref) + update_ssa (TODO_update_ssa_only_virtuals); + } + +-/* Retrieve memory reference at the specific index. */ +- +-data_ref +-get_data_ref_at_idx (ref_group &var_ref_group) +-{ +- unsigned int mem_ref_size = static_cast( +- var_ref_group.ref_scores.size ()); +- if (strlen (param_mem_ref_index) == 0) +- return var_ref_group.first_use; +- else +- { +- /* Insert prefetch hint at highly-likely-used location with the given +- index. */ +- if (var_ref_group.mem_ref_index >= mem_ref_size) +- { +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "WARNING: The target data_ref index is out " +- "of range. Use top index instead!\n"); +- return var_ref_group.ref_scores[0].d_ref; +- } +- return var_ref_group.ref_scores[var_ref_group.mem_ref_index].d_ref; +- } +-} +- + /* Static form insertion and issue instruction. We may check the + determination of the ARM SVE architecture before SVE hint insertion. */ + +@@ -2415,7 +2393,7 @@ static_issue (vector &ref_groups, int num_issue_var) + + for (int i = 0; i < num_issue_var; ++i) + { +- data_ref mem_ref = get_data_ref_at_idx (ref_groups[i]); ++ data_ref mem_ref = ref_groups[i].first_use; + if (mem_ref.vectorize_p) + { + enum internal_fn ifn_code = gimple_call_internal_fn +@@ -2591,10 +2569,7 @@ issue_llc_hint (vector &ref_groups) + } + if (param_force_issue) + { +- if (strlen (param_target_variables) > 0) +- static_issue (ref_groups, static_cast(ref_groups.size ())); +- else +- static_issue (ref_groups, num_issue_var); ++ static_issue (ref_groups, num_issue_var); + return; + } + calc_type topn_calc_type = STATIC_CALC; +@@ -2626,224 +2601,6 @@ issue_llc_hint (vector &ref_groups) + } + + /* ==================== phase entry ==================== */ +-/* Check whether a string can be converted to an unsigned integer. */ +- +-bool is_unsigned_int (const string &s) +-{ +- if (s.empty () || s.size () > PREFETCH_TOOL_NUM_MAX_LEN) +- return false; +- +- for (unsigned int i = 0; i < s.size (); ++i) +- { +- if (s[i] < '0' || s[i] > '9') +- return false; +- } +- return true; +-} +- +-/* Parse a substring separated by comma. If the substring is valid and +- non-empty, store it as a parsed element. */ +- +-bool +-parse_string_helper (const string &substr, vector& str_elts, +- bool check_unsigned, size_t start, size_t end) +-{ +- if (substr == "" && dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "WARNING: The input string from %lu to %lu is " +- "empty.\n", start, end); +- else if (check_unsigned && !is_unsigned_int (substr)) +- { +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "ERROR: not an unsigned integer: %s\n", +- substr.c_str ()); +- str_elts.clear (); +- return false; +- } +- else +- str_elts.push_back (substr); +- return true; +-} +- +-/* Parse a user input string, separated by comma. 
*/ +- +-void +-parse_string (const string &s, vector& str_elts, +- bool check_unsigned = false) +-{ +- string delim = ","; +- size_t start = 0; +- size_t end = s.find (delim); +- string substr = s.substr (start, end - start); +- while (end != string::npos) +- { +- if (!parse_string_helper (substr, str_elts, check_unsigned, start, end)) +- return; +- start = end + delim.size (); +- end = s.find (delim, start); +- substr = s.substr (start, end - start); +- } +- parse_string_helper (substr, str_elts, check_unsigned, start, end); +-} +- +-/* Parse user input of target variables and memory indices and create a map +- that assigns a target variable to a memory index. */ +- +-void +-parse_param_inputs (map &var2mem_idx) +-{ +- /* The user input length should have an input length limit. */ +- if ((strlen (param_target_variables) >= PREFETCH_TOOL_INPUT_MAX_LEN +- || strlen (param_mem_ref_index) >= PREFETCH_TOOL_INPUT_MAX_LEN) +- && dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "INVALID INPUT: The user inputs for target variables " +- "and/or memory reference indices are too long for parsing.\n"); +- +- vector var_names; +- string target_variables = param_target_variables; +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "Start parsing target variables:\n"); +- if (param_use_ref_group_index) +- parse_string (target_variables, var_names, true); +- else +- parse_string (target_variables, var_names, false); +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "Finish parsing target variables.\n\n"); +- +- vector var_mem_indices; +- string mem_indices = param_mem_ref_index; +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "Start parsing memory reference indices:\n"); +- parse_string (mem_indices, var_mem_indices, true); +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "Finish parsing memory reference indices.\n\n"); +- +- /* Construct a map of var_name: var_mem_index. */ +- if (var_names.size () > 0) +- { +- if (var_mem_indices.size () < var_names.size ()) +- { +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "WARNING: The number of provided memory " +- "reference indices is less than that of target " +- "variables.\nUse the top index for all variables " +- "instead.\n"); +- for (string& var_name : var_names) +- var2mem_idx[var_name] = 0; +- } +- else +- { +- if (var_mem_indices.size () > var_names.size () +- && dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "WARNING: The number of target variables is " +- "less than that of memory reference indices.\n"); +- for (unsigned int i = 0; i < var_names.size (); ++i) +- { +- var2mem_idx[var_names[i]] = static_cast( +- atoi (var_mem_indices[i].c_str ())); +- } +- } +- } +-} +- +-/* Filter reference groups by only selecting target variables from the user +- input. There are two options for prefetching target variables: +- 1. Specify variable name parsed by the pass, which you can double-check at +- "sorted ref_groups" section in the dump file. +- 2. Specify variable rank exhibited at "sorted ref_groups" section in the +- dump file. +-*/ +- +-void +-prefetch_variables (const vector& ref_groups, +- vector& reduced_ref_groups) +-{ +- map ref_group2mem_idx; +- +- map var2mem_idx; /* externally defined. */ +- parse_param_inputs (var2mem_idx); +- +- if (param_use_ref_group_index) +- { +- /* Use ref_group index at "sorted ref_groups" section to specify +- variable. 
*/ +- /* Collect the variables in "reduced_ref_group" only if their indices +- show up at "sorted ref_groups" section. */ +- for (const pair &var_mem_idx : var2mem_idx) +- { +- unsigned int var_idx = static_cast(atoi ( +- var_mem_idx.first.c_str ())); +- if (var_idx < ref_groups.size ()) +- ref_group2mem_idx[var_idx] = var_mem_idx.second; +- else if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "WARNING: The index \"%u\" does not show " +- "up in the ref_groups.\n", var_idx); +- } +- } +- else +- { +- /* Use variable name shown up at "sorted ref_groups" section to specify +- variable: +- var2ref_group_idx + var2mem_idx -> ref_group2mem_idx. */ +- /* Create a map that assigns the variable name to its corresponding +- ref_group index. */ +- map var2ref_group_idx; /* internally detected. */ +- for (unsigned int i = 0; i < ref_groups.size (); ++i) +- { +- const ref_group &curr_ref_group = ref_groups[i]; +- const int UINT_MAX_DIGIT = 10; +- /* Unrecognizable variable name related to ref_group. */ +- if (!get_name (curr_ref_group.var)) +- { +- /* If the variable name does not have a string representation, +- we can rename it by "tmp_var_" + . */ +- char group_idx[UINT_MAX_DIGIT]; +- sprintf (group_idx, "%u", i); +- string tmp_var_name = "tmp_var_" + std::string (group_idx); +- fprintf (dump_file, "Unrecognizable variable name at ref_group " +- "index %u.\nThe tree expression for variable is: ", i); +- print_generic_expr (dump_file, curr_ref_group.var, TDF_SLIM); +- fprintf (dump_file, "\n"); +- var2ref_group_idx[tmp_var_name] = i; +- } +- else +- var2ref_group_idx[std::string (get_name (curr_ref_group.var))] = i; +- } +- /* Collect the variables in "reduced_ref_group" only if they show up in +- the ref_groups. */ +- for (const pair &var_mem_idx : var2mem_idx) +- { +- if (var2ref_group_idx.count (var_mem_idx.first)) +- { +- unsigned int ref_group_idx = var2ref_group_idx[var_mem_idx.first]; +- ref_group2mem_idx[ref_group_idx] = var_mem_idx.second; +- } +- else if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "WARNING: Target variable \" %s \" does " +- "not show up in the ref_groups. Check whether it needs " +- "temporary variable name.\n", +- var_mem_idx.first.c_str ()); +- } +- } +- +- for (const pair &ref_group_mem_idx : +- ref_group2mem_idx) +- { +- ref_group curr_ref_group = ref_groups[ref_group_mem_idx.first]; +- curr_ref_group.mem_ref_index = ref_group_mem_idx.second; +- reduced_ref_groups.push_back (curr_ref_group); +- if (dump_file && (dump_flags & TDF_DETAILS)) +- { +- fprintf (dump_file, "\nNOTICE: Prefetching target variable \" "); +- print_generic_expr (dump_file, curr_ref_group.var, TDF_SLIM); +- fprintf (dump_file, " \" at ref_group index %u and memory location " +- "index %u.\n", ref_group_mem_idx.first, +- ref_group_mem_idx.second); +- } +- } +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "\n\n"); +-} +- + + /* The LLC intelligent allocation consists of 6 steps. */ + +@@ -2869,17 +2626,7 @@ llc_allocate (void) + if (!record_and_sort_ref_groups (ref_groups, sorted_kernels, kernels_refs)) + return; + +- if (strlen (param_target_variables) > 0) +- { +- /* If "param_target_variables" is not empty, we will issue parsed target +- variables compulsorily. 
*/ +- param_force_issue = true; +- vector reduced_ref_groups; +- prefetch_variables (ref_groups, reduced_ref_groups); +- issue_llc_hint (reduced_ref_groups); +- } +- else +- issue_llc_hint (ref_groups); ++ issue_llc_hint (ref_groups); + } + + /* Check whether the function is an operator reloading function. */ +-- +2.33.0 + diff --git a/0165-rtl-ifcvt-BugFix-change-def-selection-logic-in-noce_.patch b/0165-rtl-ifcvt-BugFix-change-def-selection-logic-in-noce_.patch new file mode 100644 index 0000000..909b9d5 --- /dev/null +++ b/0165-rtl-ifcvt-BugFix-change-def-selection-logic-in-noce_.patch @@ -0,0 +1,30 @@ +From 506eb1ff5ca27cd8d741ddf1894a32645919f773 Mon Sep 17 00:00:00 2001 +From: vchernon +Date: Sun, 17 Dec 2023 01:47:02 +0800 +Subject: [PATCH] [rtl-ifcvt][BugFix] change def selection logic in noce_arith + +--- + gcc/ifcvt.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c +index 04086c560..a55ac16f3 100644 +--- a/gcc/ifcvt.c ++++ b/gcc/ifcvt.c +@@ -1985,11 +1985,10 @@ bbs_ok_for_cmove_arith (basic_block bb_a, + + if (!sset_a) + goto end_cmove_arith_check_and_fail; +- if (a_insn == last_a) +- continue; + /* Record all registers that BB_A sets. */ + FOR_EACH_INSN_DEF (def, a_insn) +- bitmap_set_bit (bba_sets, DF_REF_REGNO (def)); ++ if (!(to_rename && DF_REF_REG (def) == to_rename && a_insn == last_a)) ++ bitmap_set_bit (bba_sets, DF_REF_REGNO (def)); + } + + bitmap_and (intersections, df_get_live_in (bb_b), bba_sets); +-- +2.33.0 + diff --git a/0166-perm-propagation-Bugfix-Check-that-the-arithmetic-op.patch b/0166-perm-propagation-Bugfix-Check-that-the-arithmetic-op.patch new file mode 100644 index 0000000..9d4b534 --- /dev/null +++ b/0166-perm-propagation-Bugfix-Check-that-the-arithmetic-op.patch @@ -0,0 +1,57 @@ +From 43b6906c94ce6a683d325b8789267b7ee2d9bf15 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia WX1215920 +Date: Sat, 16 Dec 2023 11:56:30 +0300 +Subject: [PATCH] [perm propagation][Bugfix] Check that the arithmetic + operations follow each other + +--- + gcc/tree-ssa-forwprop.c | 20 +++++++++++++++++--- + 1 file changed, 17 insertions(+), 3 deletions(-) + +diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c +index 92ef5d036..d5e9ca9bb 100644 +--- a/gcc/tree-ssa-forwprop.c ++++ b/gcc/tree-ssa-forwprop.c +@@ -2662,14 +2662,28 @@ check_def_use_order (vec &first_stmts, vec &second_stmts) + /* Check similarity of stmts in the block of arithmetic operations. */ + + static bool +-check_arithmetic_block (vec &initial_perm_stmts, unsigned nstmts) ++check_arithmetic_block (auto_vec &all_arith_stmts, ++ vec &initial_perm_stmts, unsigned nstmts) + { + auto_vec next_stmts (nstmts); + auto_vec prev_stmts (nstmts); ++ hash_set arith_stmt_set; + + enum tree_code code; + unsigned i; +- gimple *stmt_it; ++ gimple *stmt_it, *last_stmt = all_arith_stmts[all_arith_stmts.length () - 1]; ++ ++ /* Check that the arithmetic operations follow each other. */ ++ all_arith_stmts.qsort (gimple_uid_cmp); ++ FOR_EACH_VEC_ELT (all_arith_stmts, i, stmt_it) ++ arith_stmt_set.add (stmt_it); ++ ++ gimple_stmt_iterator gsi; ++ for (gsi = gsi_for_stmt (all_arith_stmts[0]); gsi_stmt (gsi) != last_stmt; ++ gsi_next (&gsi)) ++ if (!arith_stmt_set.contains (gsi_stmt (gsi))) ++ return false; ++ + FOR_EACH_VEC_ELT (initial_perm_stmts, i, stmt_it) + prev_stmts.quick_push (stmt_it); + +@@ -2778,7 +2792,7 @@ analyze_perm_fwprop (tree type, unsigned HOST_WIDE_INT nelts, + } + + /* Check that all results has the same arithmetic patterns. 
*/ +- if (!check_arithmetic_block (final_arith_stmts, nelts)) ++ if (!check_arithmetic_block (all_arith_stmts, final_arith_stmts, nelts)) + return false; + + if (final_arith_stmts.length () < nelts) +-- +2.33.0 + diff --git a/0167-perm-propagation-Bugfix-Fix-shll-shll2-patterns-for-.patch b/0167-perm-propagation-Bugfix-Fix-shll-shll2-patterns-for-.patch new file mode 100644 index 0000000..ec6b4c6 --- /dev/null +++ b/0167-perm-propagation-Bugfix-Fix-shll-shll2-patterns-for-.patch @@ -0,0 +1,62 @@ +From bed123b58aaf435653e01692830def8d564cf51f Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia WX1215920 +Date: Mon, 18 Dec 2023 22:49:54 +0300 +Subject: [PATCH] [perm propagation][Bugfix] Fix shll/shll2 patterns for perm + prop + +--- + gcc/config/aarch64/aarch64-simd.md | 8 ++++---- + gcc/config/aarch64/predicates.md | 7 +++++++ + 2 files changed, 11 insertions(+), 4 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md +index 66fcf0074..c7503561f 100644 +--- a/gcc/config/aarch64/aarch64-simd.md ++++ b/gcc/config/aarch64/aarch64-simd.md +@@ -4791,10 +4791,10 @@ + (vec_select:V4HI + (match_operand:V8HI 1 "register_operand" "w") + (match_operand:V8HI 2 "vect_par_cnst_lo_half" ""))) +- (match_operand:V4SI 3 "aarch64_simd_rshift_imm" "Dr")))] ++ (match_operand:V4SI 3 "aarch64_simd_shift_imm_bitsize_v4si" "i")))] + "TARGET_SIMD" + "shll\t%0.4s, %1.4h, #%3" +- [(set_attr "type" "neon_compare_zero")] ++ [(set_attr "type" "neon_shift_imm_long")] + ) + + ;; vshll_high_n +@@ -4821,10 +4821,10 @@ + (vec_select:V4HI + (match_operand:V8HI 1 "register_operand" "w") + (match_operand:V8HI 2 "vect_par_cnst_hi_half" ""))) +- (match_operand:V4SI 3 "aarch64_simd_rshift_imm" "Dr")))] ++ (match_operand:V4SI 3 "aarch64_simd_shift_imm_bitsize_v4si" "i")))] + "TARGET_SIMD" + "shll2\t%0.4s, %1.8h, #%3" +- [(set_attr "type" "neon_compare_zero")] ++ [(set_attr "type" "neon_shift_imm_long")] + ) + + ;; vrshr_n +diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md +index b1b3cf82c..90db0efba 100644 +--- a/gcc/config/aarch64/predicates.md ++++ b/gcc/config/aarch64/predicates.md +@@ -618,6 +618,13 @@ + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 0, 64)"))) + ++(define_predicate "aarch64_simd_shift_imm_bitsize_v4si" ++ (match_code "const_vector") ++{ ++ HOST_WIDE_INT val = INTVAL (unwrap_const_vec_duplicate (op)); ++ return val == 8 || val == 16 || val == 32; ++}) ++ + (define_predicate "aarch64_constant_pool_symref" + (and (match_code "symbol_ref") + (match_test "CONSTANT_POOL_ADDRESS_P (op)"))) +-- +2.33.0 + diff --git a/0168-LLC-Allocation-Bugfix-Terminate-kernel-filtering-for.patch b/0168-LLC-Allocation-Bugfix-Terminate-kernel-filtering-for.patch new file mode 100644 index 0000000..e97d345 --- /dev/null +++ b/0168-LLC-Allocation-Bugfix-Terminate-kernel-filtering-for.patch @@ -0,0 +1,175 @@ +From 4369e823f0883c079c0681bef68cead870d02063 Mon Sep 17 00:00:00 2001 +From: Feiyang Liu +Date: Wed, 20 Dec 2023 09:48:02 +0800 +Subject: [PATCH] [LLC Allocation][Bugfix] Terminate kernel filtering for + same-loop cycle. 
+ +--- + .../gcc.dg/llc-allocate/llc-same-loop-cycle.c | 125 ++++++++++++++++++ + gcc/tree-ssa-llc-allocate.c | 11 +- + 2 files changed, 135 insertions(+), 1 deletion(-) + create mode 100644 gcc/testsuite/gcc.dg/llc-allocate/llc-same-loop-cycle.c + +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-same-loop-cycle.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-same-loop-cycle.c +new file mode 100644 +index 000000000..ba5b5b0c8 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-same-loop-cycle.c +@@ -0,0 +1,125 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -fwhole-program -flto-partition=one -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param filter-kernels=1 --param=branch-prob-threshold=50 -c -w" } */ ++ ++typedef unsigned long size_t; ++typedef long scalar_t__; ++ ++typedef struct TYPE_13__ TYPE_3__ ; ++typedef struct TYPE_12__ TYPE_2__ ; ++typedef struct TYPE_11__ TYPE_1__ ; ++ ++struct dom_info {int nodes; int* dfs_parent; int* dfs_order; int* key; int* next_bucket; int* bucket; int* dom; int fake_exit_edge; TYPE_3__** dfs_to_bb; } ; ++typedef enum cdi_direction { ____Placeholder_cdi_direction } cdi_direction ; ++struct TYPE_11__ {scalar_t__ index; } ; ++typedef TYPE_1__ edge_iterator ; ++typedef TYPE_2__* edge ; ++typedef TYPE_3__* basic_block ; ++struct TYPE_13__ {size_t index; int preds; int succs; } ; ++struct TYPE_12__ {TYPE_3__* src; TYPE_3__* dest; } ; ++typedef int TBB ; ++ ++basic_block ENTRY_BLOCK_PTR ; ++basic_block EXIT_BLOCK_PTR ; ++scalar_t__ bitmap_bit_p (int,size_t) ; ++edge ei_edge (edge_iterator) ; ++int ei_end_p (edge_iterator) ; ++int ei_next (edge_iterator*) ; ++edge_iterator ei_start (int) ; ++size_t eval (struct dom_info*,int) ; ++size_t last_basic_block ; ++int link_roots (struct dom_info*,int,int) ; ++ ++__attribute__((used)) static void ++calc_idoms (struct dom_info *di, enum cdi_direction reverse) ++{ ++ TBB v, w, k, par; ++ basic_block en_block; ++ edge_iterator ei, einext; ++ ++ if (reverse) ++ en_block = EXIT_BLOCK_PTR; ++ else ++ en_block = ENTRY_BLOCK_PTR; ++ ++ /* Go backwards in DFS order, to first look at the leafs. */ ++ v = di->nodes; ++ while (v > 1) ++ { ++ basic_block bb = di->dfs_to_bb[v]; ++ edge e; ++ ++ par = di->dfs_parent[v]; ++ k = v; ++ ++ ei = (reverse) ? ei_start (bb->succs) : ei_start (bb->preds); ++ ++ if (reverse) ++ { ++ /* If this block has a fake edge to exit, process that first. */ ++ if (bitmap_bit_p (di->fake_exit_edge, bb->index)) ++ { ++ einext = ei; ++ einext.index = 0; ++ goto do_fake_exit_edge; ++ } ++ } ++ ++ /* Search all direct predecessors for the smallest node with a path ++ to them. That way we have the smallest node with also a path to ++ us only over nodes behind us. In effect we search for our ++ semidominator. */ ++ while (!ei_end_p (ei)) ++ { ++ basic_block b; ++ TBB k1; ++ ++ e = ei_edge (ei); ++ b = (reverse) ? e->dest : e->src; ++ einext = ei; ++ ei_next (&einext); ++ ++ if (b == en_block) ++ { ++ do_fake_exit_edge: ++ k1 = di->dfs_order[last_basic_block]; ++ } ++ else ++ k1 = di->dfs_order[b->index]; ++ ++ /* Call eval() only if really needed. If k1 is above V in DFS tree, ++ then we know, that eval(k1) == k1 and key[k1] == k1. */ ++ if (k1 > v) ++ k1 = di->key[eval (di, k1)]; ++ if (k1 < k) ++ k = k1; ++ ++ ei = einext; ++ } ++ ++ di->key[v] = k; ++ link_roots (di, par, v); ++ di->next_bucket[v] = di->bucket[k]; ++ di->bucket[k] = v; ++ ++ /* Transform semidominators into dominators. 
*/ ++ for (w = di->bucket[par]; w; w = di->next_bucket[w]) ++ { ++ k = eval (di, w); ++ if (di->key[k] < di->key[w]) ++ di->dom[w] = k; ++ else ++ di->dom[w] = par; ++ } ++ /* We don't need to cleanup next_bucket[]. */ ++ di->bucket[par] = 0; ++ v--; ++ } ++ ++ /* Explicitly define the dominators. */ ++ di->dom[1] = 0; ++ for (v = 2; v <= di->nodes; v++) ++ if (di->dom[v] != di->key[v]) ++ di->dom[v] = di->dom[di->dom[v]]; ++} ++ ++/* { dg-final { scan-tree-dump "Find same-loop cycle." "llc_allocate" } } */ +diff --git a/gcc/tree-ssa-llc-allocate.c b/gcc/tree-ssa-llc-allocate.c +index fa8979401..62b5f18ad 100644 +--- a/gcc/tree-ssa-llc-allocate.c ++++ b/gcc/tree-ssa-llc-allocate.c +@@ -1863,6 +1863,7 @@ filter_and_sort_kernels (vector &sorted_kernels, + + set end_bb; + list walked_header_bb; /* Used to record nested loops. */ ++ set walked_non_header_bb_idx; + + for (unsigned i = 0; i < kernels.size (); ++i) + { +@@ -1895,7 +1896,15 @@ filter_and_sort_kernels (vector &sorted_kernels, + /* bb is not the head of the loop, go to the next. */ + if (bb != bb->loop_father->header) + { +- bb = next_high_probability_bb (bb); ++ if (walked_non_header_bb_idx.count (bb->index)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Find same-loop cycle. " ++ "Abort filtering process.\n"); ++ return false; ++ } ++ walked_non_header_bb_idx.insert (bb->index); ++ bb = next_high_probability_bb (bb); + continue; + } + +-- +2.33.0 + diff --git a/0169-Struct-Reorg-Fix-several-bugs.patch b/0169-Struct-Reorg-Fix-several-bugs.patch new file mode 100644 index 0000000..72412e0 --- /dev/null +++ b/0169-Struct-Reorg-Fix-several-bugs.patch @@ -0,0 +1,183 @@ +From 708ffe6f132ee39441b66b6ab6b98847d35916b7 Mon Sep 17 00:00:00 2001 +From: eastb233 +Date: Tue, 19 Dec 2023 17:03:12 +0800 +Subject: [PATCH 1/2] [Struct Reorg] Fix several bugs + +--- + gcc/ipa-struct-reorg/ipa-struct-reorg.c | 50 ++++++------------- + gcc/testsuite/gcc.dg/struct/struct_reorg-10.c | 29 +++++++++++ + gcc/testsuite/gcc.dg/struct/struct_reorg-11.c | 16 ++++++ + gcc/testsuite/gcc.dg/struct/struct_reorg-12.c | 26 ++++++++++ + 4 files changed, 85 insertions(+), 36 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/struct/struct_reorg-10.c + create mode 100644 gcc/testsuite/gcc.dg/struct/struct_reorg-11.c + create mode 100644 gcc/testsuite/gcc.dg/struct/struct_reorg-12.c + +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.c b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +index 7aba74ff1..0064811ac 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.c ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +@@ -4105,6 +4105,12 @@ ipa_struct_reorg::maybe_record_assign (cgraph_node *node, gassign *stmt) + maybe_mark_or_record_other_side (rhs, lhs, stmt); + if (TREE_CODE (lhs) == SSA_NAME) + maybe_mark_or_record_other_side (lhs, rhs, stmt); ++ ++ /* Handle missing ARRAY_REF cases. */ ++ if (TREE_CODE (lhs) == ARRAY_REF) ++ mark_type_as_escape (TREE_TYPE (lhs), escape_array, stmt); ++ if (TREE_CODE (rhs) == ARRAY_REF) ++ mark_type_as_escape (TREE_TYPE (rhs), escape_array, stmt); + } + } + +@@ -6169,6 +6175,7 @@ ipa_struct_reorg::rewrite_expr (tree expr, tree newexpr[max_split], bool ignore_ + bool escape_from_base = false; + + tree newbase[max_split]; ++ memset (newbase, 0, sizeof (tree[max_split])); + memset (newexpr, 0, sizeof(tree[max_split])); + + if (TREE_CODE (expr) == CONSTRUCTOR) +@@ -8162,43 +8169,14 @@ ipa_struct_reorg::rewrite_cond (gcond *stmt, + should be removed. 
*/ + + bool +-ipa_struct_reorg::rewrite_debug (gimple *stmt, gimple_stmt_iterator *) ++ipa_struct_reorg::rewrite_debug (gimple *, gimple_stmt_iterator *) + { +- if (current_layout_opt_level >= STRUCT_REORDER_FIELDS) +- { +- /* Delete debug gimple now. */ +- return true; +- } +- bool remove = false; +- if (gimple_debug_bind_p (stmt)) +- { +- tree var = gimple_debug_bind_get_var (stmt); +- tree newvar[max_split]; +- if (rewrite_expr (var, newvar, true)) +- remove = true; +- if (gimple_debug_bind_has_value_p (stmt)) +- { +- var = gimple_debug_bind_get_value (stmt); +- if (TREE_CODE (var) == POINTER_PLUS_EXPR) +- var = TREE_OPERAND (var, 0); +- if (rewrite_expr (var, newvar, true)) +- remove = true; +- } +- } +- else if (gimple_debug_source_bind_p (stmt)) +- { +- tree var = gimple_debug_source_bind_get_var (stmt); +- tree newvar[max_split]; +- if (rewrite_expr (var, newvar, true)) +- remove = true; +- var = gimple_debug_source_bind_get_value (stmt); +- if (TREE_CODE (var) == POINTER_PLUS_EXPR) +- var = TREE_OPERAND (var, 0); +- if (rewrite_expr (var, newvar, true)) +- remove = true; +- } +- +- return remove; ++ /* Debug statements may refer to values that were optimized out in ++ gimple but left in debug gimple. Such statements can require ++ escape analysis, which should not happen at the rewrite stage. ++ Handling these cases would take a lot of care for little benefit, ++ so now we just delete debug gimple. */ ++ return true; + } + + /* Rewrite PHI nodes, return true if the PHI was replaced. */ +diff --git a/gcc/testsuite/gcc.dg/struct/struct_reorg-10.c b/gcc/testsuite/gcc.dg/struct/struct_reorg-10.c +new file mode 100644 +index 000000000..ec422f76f +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/struct_reorg-10.c +@@ -0,0 +1,29 @@ ++/* { dg-do compile } */ ++/* { dg-options "-w -g -O3 -flto-partition=one -fipa-struct-reorg -fwhole-program -S" } */ ++ ++struct a { ++ int b; ++ char c; ++}; ++struct { ++ double d; ++ _Bool e; ++} * f; ++struct g { ++ struct a h; ++} i; ++long j; ++void k(); ++void l() { k(i); } ++void k(struct a m) { ++ f->e = 0; ++ for (;;) ++ l(); ++} ++int main() { ++ for (; j; f = 0) { ++ struct g *n = 0; ++ char o = n->h.c; ++ } ++ l(); ++} +diff --git a/gcc/testsuite/gcc.dg/struct/struct_reorg-11.c b/gcc/testsuite/gcc.dg/struct/struct_reorg-11.c +new file mode 100644 +index 000000000..3e42aa84a +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/struct_reorg-11.c +@@ -0,0 +1,16 @@ ++/* { dg-do compile } */ ++/* { dg-options "-w -g -O3 -flto-partition=one -fipa-struct-reorg -fwhole-program -S" } */ ++ ++struct a { ++ int b; ++ double c; ++}; ++struct d { ++ struct a e; ++}; ++int f; ++int main() { ++ _Bool g; ++ struct d **h = 0; ++ g = *h += f; ++} +diff --git a/gcc/testsuite/gcc.dg/struct/struct_reorg-12.c b/gcc/testsuite/gcc.dg/struct/struct_reorg-12.c +new file mode 100644 +index 000000000..d434f9fe0 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/struct_reorg-12.c +@@ -0,0 +1,26 @@ ++/* { dg-do compile } */ ++/* { dg-options "-w -g -O3 -flto-partition=one -fipa-struct-reorg -fwhole-program -S" } */ ++ ++struct foo { ++ long element1; ++ long element2; ++}; ++ ++struct goo { ++ struct foo element_foo; ++}; ++ ++struct goo g1; ++ ++void func () { ++ struct foo (*local)[] = 0; ++ long idx; ++ (g1).element_foo = (*local)[idx]; ++} ++ ++struct foo g2; ++int main () { ++ func (); ++ g2 = g1.element_foo; ++ return 0; ++} +-- +2.33.0 + diff --git a/0170-DFE-Add-escape-check.patch b/0170-DFE-Add-escape-check.patch new file mode 100644 index
0000000..0605fc3 --- /dev/null +++ b/0170-DFE-Add-escape-check.patch @@ -0,0 +1,104 @@ +From e875e4e7f3716aa268ffbbf55ee199ec82b6aeba Mon Sep 17 00:00:00 2001 +From: Mingchuan Wu +Date: Thu, 21 Dec 2023 15:50:34 +0800 +Subject: [PATCH 2/2] [DFE] Add escape check. Fields with escape risks should + not be processed. + +--- + gcc/ipa-struct-reorg/ipa-struct-reorg.c | 15 +++++-- + gcc/testsuite/gcc.dg/struct/dfe_escape.c | 50 ++++++++++++++++++++++++ + 2 files changed, 62 insertions(+), 3 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/struct/dfe_escape.c + +diff --git a/gcc/ipa-struct-reorg/ipa-struct-reorg.c b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +index 0064811ac..dcfa7cd95 100644 +--- a/gcc/ipa-struct-reorg/ipa-struct-reorg.c ++++ b/gcc/ipa-struct-reorg/ipa-struct-reorg.c +@@ -444,8 +444,13 @@ srtype::has_dead_field (void) + if (!(this_field->field_access & READ_FIELD) + && !FUNCTION_POINTER_TYPE_P (this_field->fieldtype)) + { +- may_dfe = true; +- break; ++ /* Fields with escape risks should not be processed. */ ++ if (this_field->type == NULL ++ || (this_field->type->escapes == does_not_escape)) ++ { ++ may_dfe = true; ++ break; ++ } + } + } + return may_dfe; +@@ -1030,7 +1035,11 @@ srtype::create_new_type (void) + if (current_layout_opt_level & DEAD_FIELD_ELIMINATION + && !(f->field_access & READ_FIELD) + && !FUNCTION_POINTER_TYPE_P (f->fieldtype)) +- continue; ++ { ++ /* Fields with escape risks should not be processed. */ ++ if (f->type == NULL || (f->type->escapes == does_not_escape)) ++ continue; ++ } + f->create_new_fields (newtype, newfields, newlast); + } + +diff --git a/gcc/testsuite/gcc.dg/struct/dfe_escape.c b/gcc/testsuite/gcc.dg/struct/dfe_escape.c +new file mode 100644 +index 000000000..1b143cd26 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/struct/dfe_escape.c +@@ -0,0 +1,50 @@ ++/* { dg-do compile } */ ++ ++#include ++#include ++ ++typedef struct arc arc_t; ++typedef struct arc *arc_p; ++ ++typedef struct network ++{ ++ int x; ++} network_t; ++ ++struct arc ++{ ++ int flow; ++ network_t* net_add; ++}; ++ ++const int MAX = 100; ++ ++/* let it escape_array, "Type is used in an array [not handled yet]". */ ++network_t* net[2]; ++arc_p stop_arcs = NULL; ++ ++int ++main () ++{ ++ net[0] = (network_t*) calloc (1, sizeof(network_t)); ++ stop_arcs = (arc_p) calloc (MAX, sizeof (arc_t)); ++ ++ net[0]->x = 100; ++ ++ for (unsigned i = 0; i < 3; i++) ++ { ++ net[0]->x = net[0]->x + 2; ++ stop_arcs->flow = net[0]->x / 2; ++ stop_arcs->flow = stop_arcs->flow + 20; ++ stop_arcs->net_add = net[0]; ++ stop_arcs++; ++ } ++ ++ if( net[1] != 0 && stop_arcs != 0) ++ { ++ return -1; ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-ipa-dump-times "Dead field elimination" 0 "struct_reorg" } } */ +-- +2.33.0 + diff --git a/0171-phiopt-testsuite-Add-ftree-fold-phiopt-option-to-5-t.patch b/0171-phiopt-testsuite-Add-ftree-fold-phiopt-option-to-5-t.patch new file mode 100644 index 0000000..614ab14 --- /dev/null +++ b/0171-phiopt-testsuite-Add-ftree-fold-phiopt-option-to-5-t.patch @@ -0,0 +1,80 @@ +From 1f4d422fd8008f0af015df53f496c6dce3534b26 Mon Sep 17 00:00:00 2001 +From: Mingchuan Wu +Date: Fri, 22 Dec 2023 11:38:15 +0800 +Subject: [PATCH] [phiopt][testsuite] Add -ftree-fold-phiopt option to 5 test + cases. 
+ +Modified test cases include: +1.gcc.dg/pr45416.c +2.gcc.target/i386/pr65871-3.c +3.g++.dg/opt/pr99305.C +4.gcc.dg/pr107190.c +5.g++.dg/tree-ssa/mull64.C +--- + gcc/testsuite/g++.dg/opt/pr99305.C | 2 +- + gcc/testsuite/g++.dg/tree-ssa/mull64.C | 2 +- + gcc/testsuite/gcc.dg/pr107190.c | 2 +- + gcc/testsuite/gcc.dg/pr45416.c | 2 +- + gcc/testsuite/gcc.target/i386/pr65871-3.c | 2 +- + 5 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/gcc/testsuite/g++.dg/opt/pr99305.C b/gcc/testsuite/g++.dg/opt/pr99305.C +index 6fcdef391..06295116f 100644 +--- a/gcc/testsuite/g++.dg/opt/pr99305.C ++++ b/gcc/testsuite/g++.dg/opt/pr99305.C +@@ -1,6 +1,6 @@ + // PR tree-optimization/99305 + // { dg-do compile } +-// { dg-options "-O3 -fno-ipa-icf -fdump-tree-optimized" } ++// { dg-options "-O3 -ftree-fold-phiopt -fno-ipa-icf -fdump-tree-optimized" } + // { dg-final { scan-tree-dump-times " = \\\(unsigned char\\\) c_\[0-9]*\\\(D\\\);" 3 "optimized" { target { ! unsigned_char } } } } + // { dg-final { scan-tree-dump-times " = \[^\n\r]* \\+ \[0-9]*;" 3 "optimized" } } + // { dg-final { scan-tree-dump-times " = \[^\n\r]* <= 9;" 3 "optimized" } } +diff --git a/gcc/testsuite/g++.dg/tree-ssa/mull64.C b/gcc/testsuite/g++.dg/tree-ssa/mull64.C +index cad891e62..ec359f2ba 100644 +--- a/gcc/testsuite/g++.dg/tree-ssa/mull64.C ++++ b/gcc/testsuite/g++.dg/tree-ssa/mull64.C +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -fmerge-mull -Wno-psabi -fdump-tree-forwprop1-details -fdump-tree-forwprop4-details" } */ ++/* { dg-options "-O2 -ftree-fold-phiopt -fmerge-mull -Wno-psabi -fdump-tree-forwprop1-details -fdump-tree-forwprop4-details" } */ + + # define BN_BITS4 32 + # define BN_MASK2 (0xffffffffffffffffL) +diff --git a/gcc/testsuite/gcc.dg/pr107190.c b/gcc/testsuite/gcc.dg/pr107190.c +index d1e72e5df..d4e5fa0d0 100644 +--- a/gcc/testsuite/gcc.dg/pr107190.c ++++ b/gcc/testsuite/gcc.dg/pr107190.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -fmerge-mull -fexpensive-optimizations -fdump-tree-phiopt2-details" } */ ++/* { dg-options "-O2 -ftree-fold-phiopt -fmerge-mull -fexpensive-optimizations -fdump-tree-phiopt2-details" } */ + + # define BN_BITS4 32 + # define BN_MASK2 (0xffffffffffffffffL) +diff --git a/gcc/testsuite/gcc.dg/pr45416.c b/gcc/testsuite/gcc.dg/pr45416.c +index a3f6a759d..dd37ec534 100644 +--- a/gcc/testsuite/gcc.dg/pr45416.c ++++ b/gcc/testsuite/gcc.dg/pr45416.c +@@ -1,6 +1,6 @@ + /* { dg-do compile } */ + /* { dg-skip-if "Skip for Thumb1." { { arm*-*-* } && { arm_thumb1_ok } } } */ +-/* { dg-options "-O2" } */ ++/* { dg-options "-O2 -ftree-fold-phiopt" } */ + + int foo(long long a) + { +diff --git a/gcc/testsuite/gcc.target/i386/pr65871-3.c b/gcc/testsuite/gcc.target/i386/pr65871-3.c +index c7d9bdd96..4fd3b48f8 100644 +--- a/gcc/testsuite/gcc.target/i386/pr65871-3.c ++++ b/gcc/testsuite/gcc.target/i386/pr65871-3.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mbmi" } */ ++/* { dg-options "-O2 -ftree-fold-phiopt -mbmi" } */ + + int foo (int x, int y) + { +-- +2.33.0 + diff --git a/0172-minmax-Move-minmax-pattern-to-gimple.patch b/0172-minmax-Move-minmax-pattern-to-gimple.patch new file mode 100644 index 0000000..99e6682 --- /dev/null +++ b/0172-minmax-Move-minmax-pattern-to-gimple.patch @@ -0,0 +1,323 @@ +From df88d29c355c59e262397fdf3b22ee9099ce40c2 Mon Sep 17 00:00:00 2001 +From: Pronin Alexander 00812787 +Date: Tue, 19 Dec 2023 12:19:14 +0300 +Subject: [PATCH 1/5] [minmax] Move minmax pattern to gimple. 
+ +--- + gcc/common.opt | 4 + + gcc/config/aarch64/aarch64-simd.md | 72 ---------------- + gcc/match.pd | 104 ++++++++++++++++++++++++ + gcc/testsuite/gcc.dg/combine-maxmin-1.c | 15 ++++ + gcc/testsuite/gcc.dg/combine-maxmin-2.c | 14 ++++ + gcc/testsuite/gcc.dg/combine-maxmin.c | 19 +++-- + 6 files changed, 151 insertions(+), 77 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-1.c + create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-2.c + +diff --git a/gcc/common.opt b/gcc/common.opt +index a8a2264ee..73234dcc3 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1750,6 +1750,10 @@ fif-conversion-gimple + Common Report Var(flag_if_conversion_gimple) Optimization + Perform conversion of conditional jumps to branchless equivalents during gimple transformations. + ++fconvert-minmax ++Common Report Var(flag_convert_minmax) Optimization ++Convert saturating clipping to min max. ++ + fstack-reuse= + Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization + -fstack-reuse=[all|named_vars|none] Set stack reuse level for local variables. +diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md +index c7503561f..754343abc 100644 +--- a/gcc/config/aarch64/aarch64-simd.md ++++ b/gcc/config/aarch64/aarch64-simd.md +@@ -1535,78 +1535,6 @@ + [(set_attr "type" "neon_minmax")] + ) + +-;; Use sequential smax+smin to replace vector arithmetic operations like this: +-;; a = ((x & ~((1 << 8)-1)) ? (-x)>>31 & ((1 << 8)-1) : x); +-;; TODO: maybe extend to scalar operations. +- +-(define_insn_and_split "*aarch64_maxmin_arith" +- [(set (match_operand:VDQHSD 0 "register_operand" "=w") +- (xor:VDQHSD +- (and:VDQHSD +- (xor:VDQHSD +- (ashiftrt:VDQHSD +- (neg:VDQHSD +- (match_operand:VDQHSD 1 "register_operand")) +- (match_operand:VDQHSD 2 "maxmin_arith_shift_operand")) +- (match_dup 1)) +- (neg:VDQHSD +- (eq:VDQHSD +- (and:VDQHSD +- (match_dup 1) +- (match_operand:VDQHSD 3 "aarch64_bic_imm_for_maxmin")) +- (match_operand:VDQHSD 4 "aarch64_simd_or_scalar_imm_zero")))) +- (ashiftrt:VDQHSD +- (neg:VDQHSD +- (match_dup 1)) +- (match_dup 2))))] +- "TARGET_SIMD && !reload_completed" +- "#" +- "&& true" +- [(set (match_operand:VDQHSD 5 "register_operand" "w") (match_dup 3)) +- (set (match_operand:VDQHSD 6 "register_operand" "w") (match_dup 4)) +- (set (match_operand:VDQHSD 0 "register_operand" "=w") +- (smax:VDQHSD (match_operand:VDQHSD 1 "register_operand" "w") +- (match_operand:VDQHSD 6 "register_operand" "w"))) +- (set (match_operand:VDQHSD 0 "register_operand" "=w") +- (smin:VDQHSD (match_operand:VDQHSD 0 "register_operand" "w") +- (match_operand:VDQHSD 5 "register_operand" "w")))] +- { +- if (can_create_pseudo_p ()) +- { +- int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[3], 0)); +- operands[3] = aarch64_simd_gen_const_vector_dup (mode, +- ~val); +- operands[5] = gen_reg_rtx (mode); +- operands[6] = gen_reg_rtx (mode); +- } +- else +- FAIL; +- } +- [(set_attr "type" "neon_minmax")] +-) +- +-;; The helper definition that allows combiner to use the previous pattern. 
+- +-(define_insn_and_split "*aarch64_maxmin_tmp" +- [(set (match_operand:VDQHSD 0 "register_operand" "=w") +- (ashiftrt:VDQHSD +- (neg:VDQHSD +- (match_operand:VDQHSD 1 "register_operand" "w")) +- (match_operand:VDQHSD 2 "maxmin_arith_shift_operand")))] +- "TARGET_SIMD" +- "#" +- "&& reload_completed" +- [(set (match_operand:VDQHSD 0 "register_operand") +- (neg:VDQHSD +- (match_operand:VDQHSD 1 "register_operand" "w"))) +- (set (match_dup 0) +- (ashiftrt:VDQHSD +- (match_dup 0) +- (match_operand:VDQHSD 2 "maxmin_arith_shift_operand")))] +- "" +- [(set_attr "type" "neon_minmax")] +-) +- + ;; Pairwise FP Max/Min operations. + (define_insn "aarch64_p" + [(set (match_operand:VHSDF 0 "register_operand" "=w") +diff --git a/gcc/match.pd b/gcc/match.pd +index 24ae157af..1097cd926 100644 +--- a/gcc/match.pd ++++ b/gcc/match.pd +@@ -6595,3 +6595,107 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) + (plus:c@4 (op2:c @0 @1) + (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1)))) + (if (single_use (@4) && single_use (@5))))) ++ ++/* MinMax pattern matching helpers. More info on the transformation below. */ ++ ++/* Match (a & 0b11..100..0) pattern. */ ++(match (minmax_cmp_arg @0 @1) ++ (bit_and @0 INTEGER_CST@1) ++ (if (wi::popcount (~wi::to_widest (@1) + 1) == 1))) ++ ++/* Match (inversed_sign_bit >> sign_bit_pos) pattern. ++ This statement is blocking for the transformation of unsigned integers. ++ Do type check here to avoid unnecessary duplications. */ ++(match (minmax_sat_arg @0) ++ (rshift (negate @0) INTEGER_CST@1) ++ (if (!TYPE_UNSIGNED (TREE_TYPE (@0)) ++ && wi::eq_p (wi::to_widest (@1), TYPE_PRECISION (TREE_TYPE (@0)) - 1)))) ++ ++/* Transform ((x & ~mask) ? (-x)>>31 & mask : x) to (min (max (x, 0), mask)). ++ The matched pattern can be described as saturated clipping. ++ ++ The pattern supports truncation via both casts and bit_and. ++ Also there are patterns for possible inverted conditions. */ ++(if (flag_convert_minmax) ++/* Truncation via casts. Unfortunately convert? cannot be applied here ++ because convert and cond take different number of arguments. */ ++ (simplify ++ (convert ++ (cond ++ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? (minmax_sat_arg @0)) ++ (convert? @0))) ++ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type))) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ (simplify ++ (cond ++ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? (minmax_sat_arg @0)) ++ (convert? @0)) ++ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type))) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ ++ (simplify ++ (convert ++ (cond ++ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? @0) ++ (convert? (minmax_sat_arg @0)))) ++ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type))) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ (simplify ++ (cond ++ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? @0) ++ (convert? 
(minmax_sat_arg @0))) ++ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type))) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ ++ /* Truncation via bit_and with mask. Same concerns on convert? here. */ ++ (simplify ++ (convert ++ (cond ++ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)) ++ (convert? @0))) ++ (if (wi::to_widest (@2) == ~wi::to_widest (@1)) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ (simplify ++ (cond ++ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)) ++ (convert? @0)) ++ (if (wi::to_widest (@2) == ~wi::to_widest (@1)) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ ++ (simplify ++ (convert ++ (cond ++ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? @0) ++ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)))) ++ (if (wi::to_widest (@2) == ~wi::to_widest (@1)) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ (simplify ++ (cond ++ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? @0) ++ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))) ++ (if (wi::to_widest (@2) == ~wi::to_widest (@1)) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; })))))) +diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-1.c b/gcc/testsuite/gcc.dg/combine-maxmin-1.c +new file mode 100644 +index 000000000..859ff7df8 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/combine-maxmin-1.c +@@ -0,0 +1,15 @@ ++/* { dg-do compile { target aarch64-*-* } } */ ++/* { dg-options "-O3 -fconvert-minmax" } */ ++ ++#include ++ ++__attribute__((noinline)) ++void test (int32_t *restrict a, int32_t *restrict x) ++{ ++ for (int i = 0; i < 4; i++) ++ a[i] = ((((-x[i]) >> 31) ^ x[i]) ++ & (-((int32_t)((x[i] & (~((1 << 8)-1))) == 0)))) ^ ((-x[i]) >> 31); ++} ++ ++/* { dg-final { scan-assembler-not {smax\t} } } */ ++/* { dg-final { scan-assembler-not {smin\t} } } */ +diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-2.c b/gcc/testsuite/gcc.dg/combine-maxmin-2.c +new file mode 100644 +index 000000000..63d4d85b3 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/combine-maxmin-2.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile { target aarch64-*-* } } */ ++/* { dg-options "-O3 -fconvert-minmax" } */ ++ ++#include ++ ++__attribute__((noinline)) ++void test (int8_t *restrict a, int32_t *restrict x) ++{ ++ for (int i = 0; i < 8; i++) ++ a[i] = ((x[i] & ~((1 << 9)-1)) ? 
(-x[i])>>31 & ((1 << 9)-1) : x[i]); ++} ++ ++/* { dg-final { scan-assembler-times {smax\t} 4 } } */ ++/* { dg-final { scan-assembler-times {smin\t} 4 } } */ +diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c +index 06bce7029..a984fa560 100755 +--- a/gcc/testsuite/gcc.dg/combine-maxmin.c ++++ b/gcc/testsuite/gcc.dg/combine-maxmin.c +@@ -1,5 +1,5 @@ + /* { dg-do compile { target aarch64-*-* } } */ +-/* { dg-options "-O3 -fdump-rtl-combine-all" } */ ++/* { dg-options "-O3 -fconvert-minmax" } */ + + /* The test checks usage of smax/smin insns for clip evaluation and + * uzp1/uzp2 insns for vector element narrowing. It's inspired by +@@ -19,20 +19,26 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, + { + const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0; + for( int y = 0; y < height; y++ ) { ++ /* This loop is currently not vectorized. */ + for( int x = -2; x < width+3; x++ ) { + int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride] + + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride])); + dstv[x] = clip ( (v + 16) >> 5 ); + buf[x+2] = v + pad; + } ++ ++ /* Produces two versions of the code: 3xUZP1/2xMAX/2xMIN + 1xUZP1/1xMAX/1xMIN. */ + for( int x = 0; x < width; x++ ) + dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1] + + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1])) + - 32*pad + 512) >> 10); ++ ++ /* Produces two versions of the code: 1xUZP1/2xMAX/2xMIN + 0xUZP1/1xMAX/1xMIN. */ + for( int x = 0; x < width; x++ ) + dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1] + + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1])) + + 16) >> 5); ++ + dsth += stride; + dstv += stride; + dstc += stride; +@@ -40,7 +46,10 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, + } + } + +-/* { dg-final { scan-assembler-times {smax\t} 4 } } */ +-/* { dg-final { scan-assembler-times {smin\t} 4 } } */ +-/* { dg-final { scan-assembler-times {cmtst\t} 2 } } */ +-/* { dg-final { scan-assembler-times {uzp1\t} 6 } } */ ++/* Max is performed against 0 on signed values, so match smax exactly. */ ++/* { dg-final { scan-assembler-times {smax\t} 6 } } */ ++/* Min is performed on a signed value > 0 and a mask, so the sign of the min doesn't matter. */ ++/* { dg-final { scan-assembler-times {[us]min\t} 6 } } */ ++/* All of the vectorized patterns are expected to be matched.
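++   As a worked instance of the clip identity (an illustration, not part of
++   the checked output): with mask = 255, x = 300 has bits above the mask,
++   so the result is ((-300) >> 31) & 255 = 255; x = -5 also does, and gives
++   ((5) >> 31) & 255 = 0; both agree with min (max (x, 0), 255).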
*/ ++/* { dg-final { scan-assembler-not {cmtst\t} } } */ ++/* { dg-final { scan-assembler-times {uzp1\t} 5 } } */ +-- +2.33.0 + diff --git a/0173-IPA-Fix-test-completion-1.c.patch b/0173-IPA-Fix-test-completion-1.c.patch new file mode 100644 index 0000000..8444b44 --- /dev/null +++ b/0173-IPA-Fix-test-completion-1.c.patch @@ -0,0 +1,24 @@ +From d6ef1c0c182267d3ab68e3ae1d7f1a576a7bbb2a Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia +Date: Wed, 20 Dec 2023 18:44:29 +0800 +Subject: [PATCH 2/5] [IPA] Fix test completion-1.c + +--- + gcc/testsuite/gcc.dg/completion-1.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/gcc/testsuite/gcc.dg/completion-1.c b/gcc/testsuite/gcc.dg/completion-1.c +index 64da64f1c..df2319c76 100644 +--- a/gcc/testsuite/gcc.dg/completion-1.c ++++ b/gcc/testsuite/gcc.dg/completion-1.c +@@ -2,6 +2,7 @@ + /* { dg-options "--completion=-fipa-ic" } */ + + /* { dg-begin-multiline-output "" } ++-fipa-ic + -fipa-icf + -fipa-icf-functions + -fipa-icf-variables +-- +2.33.0 + diff --git a/0174-IPA-Fix-fails-on-checked-build-and-comments-from-rev.patch b/0174-IPA-Fix-fails-on-checked-build-and-comments-from-rev.patch new file mode 100644 index 0000000..54f9d56 --- /dev/null +++ b/0174-IPA-Fix-fails-on-checked-build-and-comments-from-rev.patch @@ -0,0 +1,71 @@ +From ed548cec9d8efe8ef742225c39f5d84aba4be81b Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia WX1215920 +Date: Wed, 20 Dec 2023 13:53:47 +0300 +Subject: [PATCH 3/5] [IPA] Fix fails on checked build and comments from review + +--- + gcc/ipa-prefetch.c | 24 ++++++++++++++++++++++-- + gcc/params.opt | 4 ++-- + 2 files changed, 24 insertions(+), 4 deletions(-) + +diff --git a/gcc/ipa-prefetch.c b/gcc/ipa-prefetch.c +index 93483a6e8..d8bb9a251 100644 +--- a/gcc/ipa-prefetch.c ++++ b/gcc/ipa-prefetch.c +@@ -167,6 +167,7 @@ analyse_cgraph () + } + + /* TODO: maybe remove loop info here. */ ++ n->get_body (); + push_cfun (DECL_STRUCT_FUNCTION (n->decl)); + calculate_dominance_info (CDI_DOMINATORS); + loop_optimizer_init (LOOPS_NORMAL); +@@ -1540,9 +1541,28 @@ optimize_function (cgraph_node *n, function *fn) + return 0; + } + else if (dump_file) +- fprintf (dump_file, "Dominator bb %d for MRs\n", dom_bb->index); ++ { ++ fprintf (dump_file, "Dominator bb %d for MRs:\n", dom_bb->index); ++ gimple_dump_bb (dump_file, dom_bb, 0, dump_flags); ++ fprintf (dump_file, "\n"); ++ } ++ ++ /* Try to find comp_mr's stmt in the dominator bb. */ ++ gimple *last_used = NULL; ++ for (gimple_stmt_iterator si = gsi_last_bb (dom_bb); !gsi_end_p (si); ++ gsi_prev (&si)) ++ if (comp_mr->stmts[0] == gsi_stmt (si)) ++ { ++ last_used = gsi_stmt (si); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Last used stmt in dominator bb:\n"); ++ print_gimple_stmt (dump_file, last_used, 0); ++ } ++ break; ++ } + +- split_block (dom_bb, (gimple *) NULL); ++ split_block (dom_bb, last_used); + gimple_stmt_iterator gsi = gsi_last_bb (dom_bb); + + /* Create new inc var. Insert new_var = old_var + step * factor. */ +diff --git a/gcc/params.opt b/gcc/params.opt +index ef7bea311..76ae925fd 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -251,8 +251,8 @@ Common Joined UInteger Var(param_ipa_prefetch_distance_factor) Init(4) Param Opt + The factor represents the number of inductive variable incrementations to evaluate an indirect memory address for IPA prefetch. 
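+ For example, with the default factor of 4 and an 8-byte induction step the
+ prefetch address is evaluated 32 bytes ahead of the current access; the
+ distance is computed as step * factor.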
+ + -param=ipa-prefetch-locality= +-Common Joined UInteger Var(param_ipa_prefetch_locality) Init(3) Param Optimization +-The flag represents temporal locality values in the following way: 0:pstl1strm, 1:pstl3keep, 2:pstl2keep, 3:pstl1keep. ++Common Joined UInteger Var(param_ipa_prefetch_locality) Init(3) IntegerRange(0, 3) Param Optimization ++The flag represents temporal locality value between 0 and 3, the higher value means the higher temporal locality in the data. + + -param=ira-loop-reserved-regs= + Common Joined UInteger Var(param_ira_loop_reserved_regs) Init(2) Param Optimization +-- +2.33.0 + diff --git a/0175-split-ldp-stp-Extending-and-refactoring-of-pass_spli.patch b/0175-split-ldp-stp-Extending-and-refactoring-of-pass_spli.patch new file mode 100644 index 0000000..b5a8054 --- /dev/null +++ b/0175-split-ldp-stp-Extending-and-refactoring-of-pass_spli.patch @@ -0,0 +1,1426 @@ +From abea8f1249a6efc5b7770f2243f31106b3ba9d58 Mon Sep 17 00:00:00 2001 +From: Gadzhiev Emin WX1195297 +Date: Wed, 20 Dec 2023 21:36:07 +0300 +Subject: [PATCH 4/5] [split ldp/stp] Extending and refactoring of + pass_split_complex_instructions + +- Add flag parameter in is_ldp_insn and is_stp_insn to know + if instruction has writeback operation +- Add support of PRE_*, POST_* operands as a memory address + expression +- Split only LDPs that intersect with a dependent store + instruction +- Make the selection of dependent store instructions stricter + so it will be enough to check by BFS that dependent store + instruction appears in search range. +- Add helper methods to retrieve fields of rtx +- Remove redundant iterations in find_dependent_stores_candidates +- Refactor generation of instructions +- Add more test cases +--- + gcc/config/aarch64/aarch64.c | 62 +- + gcc/doc/tm.texi | 12 +- + gcc/sched-rgn.c | 771 +++++++++--------- + gcc/target.def | 14 +- + .../gcc.dg/rtl/aarch64/test-ldp-dont-split.c | 35 +- + .../rtl/aarch64/test-ldp-split-rearrange.c | 2 +- + .../gcc.dg/rtl/aarch64/test-ldp-split.c | 181 +++- + 7 files changed, 603 insertions(+), 474 deletions(-) + +diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c +index 75efbcb97..da4983236 100644 +--- a/gcc/config/aarch64/aarch64.c ++++ b/gcc/config/aarch64/aarch64.c +@@ -23858,39 +23858,59 @@ aarch64_run_selftests (void) + + #endif /* #if CHECKING_P */ + +-/* TODO: refuse to use ranges intead of full list of an instruction codes. */ ++/* TODO: refuse to use ranges instead of full list of an instruction codes. 
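++   The CODE_FOR_* sub-ranges below assume that the pair patterns stay
++   contiguous in aarch64.md and aarch64-simd.md.  A typical query from the
++   scheduler side looks like (sketch only):
++     bool wb = false;
++     if (targetm.is_ldp_insn (INSN_CODE (insn), &wb))
++       ...  (wb is set for the post-index/pre-index forms)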
*/ + + bool +-is_aarch64_ldp_insn (int icode) ++is_aarch64_ldp_insn (int icode, bool *has_wb) + { + if ((icode >= CODE_FOR_load_pair_sw_sisi +- && icode <= CODE_FOR_load_pair_dw_tftf) ++ && icode <= CODE_FOR_load_pair_sw_sfsf) ++ || (icode >= CODE_FOR_load_pair_dw_didi ++ && icode <= CODE_FOR_load_pair_dw_dfdf) ++ || (icode == CODE_FOR_load_pair_dw_tftf) + || (icode >= CODE_FOR_loadwb_pairsi_si +- && icode <= CODE_FOR_loadwb_pairtf_di) +- || (icode >= CODE_FOR_load_pairv8qiv8qi +- && icode <= CODE_FOR_load_pairdfdf) +- || (icode >= CODE_FOR_load_pairv16qiv16qi +- && icode <= CODE_FOR_load_pairv8bfv2df) +- || (icode >= CODE_FOR_load_pair_lanesv8qi +- && icode <= CODE_FOR_load_pair_lanesdf)) +- return true; ++ && icode <= CODE_FOR_loadwb_pairdi_di) ++ || (icode >= CODE_FOR_loadwb_pairsf_si ++ && icode <= CODE_FOR_loadwb_pairdf_di) ++ || (icode >= CODE_FOR_loadwb_pairti_si ++ && icode <= CODE_FOR_loadwb_pairtf_di)) ++ { ++ if (has_wb) ++ *has_wb = ((icode >= CODE_FOR_loadwb_pairsi_si ++ && icode <= CODE_FOR_loadwb_pairdi_di) ++ || (icode >= CODE_FOR_loadwb_pairsf_si ++ && icode <= CODE_FOR_loadwb_pairdf_di) ++ || (icode >= CODE_FOR_loadwb_pairti_si ++ && icode <= CODE_FOR_loadwb_pairtf_di)); ++ return true; ++ } + return false; + } + + bool +-is_aarch64_stp_insn (int icode) ++is_aarch64_stp_insn (int icode, bool *has_wb) + { + if ((icode >= CODE_FOR_store_pair_sw_sisi +- && icode <= CODE_FOR_store_pair_dw_tftf) ++ && icode <= CODE_FOR_store_pair_sw_sfsf) ++ || (icode >= CODE_FOR_store_pair_dw_didi ++ && icode <= CODE_FOR_store_pair_dw_dfdf) ++ || (icode == CODE_FOR_store_pair_dw_tftf) + || (icode >= CODE_FOR_storewb_pairsi_si +- && icode <= CODE_FOR_storewb_pairtf_di) +- || (icode >= CODE_FOR_vec_store_pairv8qiv8qi +- && icode <= CODE_FOR_vec_store_pairdfdf) +- || (icode >= CODE_FOR_vec_store_pairv16qiv16qi +- && icode <= CODE_FOR_vec_store_pairv8bfv2df) +- || (icode >= CODE_FOR_store_pair_lanesv8qi +- && icode <= CODE_FOR_store_pair_lanesdf)) +- return true; ++ && icode <= CODE_FOR_storewb_pairdi_di) ++ || (icode >= CODE_FOR_storewb_pairsf_si ++ && icode <= CODE_FOR_storewb_pairdf_di) ++ || (icode >= CODE_FOR_storewb_pairti_si ++ && icode <= CODE_FOR_storewb_pairtf_di)) ++ { ++ if (has_wb) ++ *has_wb = ((icode >= CODE_FOR_storewb_pairsi_si ++ && icode <= CODE_FOR_storewb_pairdi_di) ++ || (icode >= CODE_FOR_storewb_pairsf_si ++ && icode <= CODE_FOR_storewb_pairdf_di) ++ || (icode >= CODE_FOR_storewb_pairti_si ++ && icode <= CODE_FOR_storewb_pairtf_di)); ++ return true; ++ } + return false; + } + +diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi +index 4a998aa76..3dfd242ff 100644 +--- a/gcc/doc/tm.texi ++++ b/gcc/doc/tm.texi +@@ -11899,12 +11899,16 @@ This function generate the AES inversed mix columns instruction + of 16 byte elements vector if target supports this. + @end deftypefn + +-@deftypefn {Target Hook} bool TARGET_IS_LDP_INSN (int @var{icode}) +-Return true if icode is corresponding to any of the LDP instruction types. ++@deftypefn {Target Hook} bool TARGET_IS_LDP_INSN (int @var{icode}, bool *@var{has_wb}) ++Return true if @var{icode} is corresponding to any of the LDP instruction ++types. If @var{has_wb} is not NULL then its value is set to true if LDP ++contains post-index or pre-index operation. + @end deftypefn + +-@deftypefn {Target Hook} bool TARGET_IS_STP_INSN (int @var{icode}) +-Return true if icode is corresponding to any of the STP instruction types. 
++@deftypefn {Target Hook} bool TARGET_IS_STP_INSN (int @var{icode}, bool *@var{has_wb}) ++Return true if @var{icode} is corresponding to any of the STP instruction ++types. If @var{has_wb} is not NULL then its value is set to true if STP ++contains post-index or pre-index operation. + @end deftypefn + + @deftypefn {Target Hook} bool TARGET_CANNOT_MODIFY_JUMPS_P (void) +diff --git a/gcc/sched-rgn.c b/gcc/sched-rgn.c +index 32f4489d8..43efe3926 100644 +--- a/gcc/sched-rgn.c ++++ b/gcc/sched-rgn.c +@@ -3960,7 +3960,7 @@ make_pass_sched_fusion (gcc::context *ctxt) + + namespace { + +-/* Def-use analisys special functions implementation. */ ++/* Def-use analysis special functions implementation. */ + + static struct df_link * + get_defs (rtx_insn *insn, rtx reg) +@@ -4036,42 +4036,66 @@ const pass_data pass_data_split_complex_instructions = { + (TODO_df_verify | TODO_df_finish), /* Todo_flags_finish. */ + }; + ++/* Pass split_complex_instructions finds LOAD PAIR instructions (LDP) that can ++ be split into two LDR instructions. It splits only those LDP for which one ++ half of the requested memory is contained in the preceding STORE (STR/STP) ++ instruction whose base register has the same definition. This allows ++ to use hardware store-to-load forwarding mechanism and to get one half of ++ requested memory from the store queue of CPU. ++ ++ TODO: Add split of STP. ++ TODO: Add split of vector STP and LDP. */ + class pass_split_complex_instructions : public rtl_opt_pass + { + private: +- enum complex_instructions_t ++ enum mem_access_insn_t + { + UNDEFINED, + LDP, ++ /* LDP with post-index (see loadwb_pair in config/aarch64.md). */ ++ LDP_WB, ++ /* LDP that contains one destination register in RTL IR ++ (see movti_aarch64 in config/aarch64.md). */ + LDP_TI, + STP, ++ /* STP with pre-index (see storewb_pair in config/aarch64.md). */ ++ STP_WB, ++ /* STP that contains one source register in RTL IR ++ (see movti_aarch64 in config/aarch64.md). */ ++ STP_TI, + STR + }; + +- void split_complex_insn (rtx_insn *insn); +- void split_ldp_ti (rtx_insn *insn); +- void split_ldp_with_offset (rtx_insn *ldp_insn); +- void split_simple_ldp (rtx_insn *ldp_insn); +- void split_ldp_stp (rtx_insn *insn); +- complex_instructions_t get_insn_type (rtx_insn *insn); +- +- basic_block bb; +- rtx_insn *insn; + std::set dependent_stores_candidates; + std::set ldp_to_split_list; + +- complex_instructions_t complex_insn_type = UNDEFINED; +- bool is_store_insn (rtx_insn *insn); +- bool is_ldp_dependent_on_store (rtx_insn *ldp_insn, basic_block bb); ++ void split_ldp_ti (rtx_insn *insn); ++ void split_ldp (rtx_insn *ldp_insn); ++ /* Emit a NEW_INSNS chain, recognize instruction code of each new instruction ++ and replace OLD_INSN with the emitted sequence. 
*/ ++ void replace_insn (rtx_insn *old_insn, rtx_insn *new_insns); ++ ++ mem_access_insn_t get_insn_type (rtx_insn *insn); ++ bool is_typeof_ldp (mem_access_insn_t insn_type); ++ bool is_typeof_stp (mem_access_insn_t insn_type); ++ + bool bfs_for_reg_dependent_store (rtx_insn *ldp_insn, basic_block search_bb, + rtx_insn *search_insn, + int search_range + = param_ldp_dependency_search_range); + bool is_store_reg_dependent (rtx_insn *ldp_insn, rtx_insn *str_insn); + void init_df (); +- void find_dependent_stores_candidates (rtx_insn *ldp_insn); +- int get_insn_offset (rtx_insn *insn, complex_instructions_t insn_type, +- int *arith_operation_ptr = NULL); ++ void find_dependent_stores_candidates (rtx_insn *ldp_insn, ++ mem_access_insn_t insn_type); ++ ++ rtx get_memref (rtx_insn *insn, mem_access_insn_t insn_type); ++ rtx get_base_reg (rtx memref); ++ /* Set OFFSET to the offset value. Returns TRUE if MEMREF's address ++ expression is supported, FALSE otherwise. */ ++ bool get_offset (rtx memref, int &offset); ++ /* Return size of memory referenced by MEMREF. Returns -1 if INSN_TYPE ++ wasn't recognized. */ ++ int get_unit_size (rtx memref, mem_access_insn_t insn_type); + + public: + pass_split_complex_instructions (gcc::context *ctxt) +@@ -4084,28 +4108,22 @@ public: + virtual unsigned int + execute (function *) + { +- enum rtx_code ldp_memref_code; ++ basic_block bb; ++ rtx_insn *insn; ++ + init_df (); + ldp_to_split_list.clear (); + FOR_EACH_BB_FN (bb, cfun) + { + FOR_BB_INSNS (bb, insn) + { +- complex_instructions_t insn_type = get_insn_type (insn); +- /* TODO: Add splitting of STP instructions. */ +- if (insn_type != LDP && insn_type != LDP_TI) ++ mem_access_insn_t insn_type = get_insn_type (insn); ++ if (!is_typeof_ldp (insn_type)) + continue; +- /* TODO: Currently support only ldp_ti and ldp with REG or +- PLUS/MINUS offset expression. 
*/ +- if (insn_type == LDP_TI) +- { +- ldp_memref_code = GET_CODE (XEXP (XEXP (PATTERN (insn), 1), +- 0)); +- if (ldp_memref_code != REG && ldp_memref_code != PLUS +- && ldp_memref_code != MINUS) +- continue; +- } +- if (is_ldp_dependent_on_store (insn, bb)) ++ ++ find_dependent_stores_candidates (insn, insn_type); ++ if (!dependent_stores_candidates.empty () ++ && bfs_for_reg_dependent_store (insn, bb, insn)) + { + ldp_to_split_list.insert (insn); + } +@@ -4114,18 +4132,107 @@ public: + + for (std::set::iterator i = ldp_to_split_list.begin (); + i != ldp_to_split_list.end (); ++i) +- split_complex_insn (*i); ++ split_ldp (*i); + + return 0; + } + }; // class pass_split_complex_instructions + + bool +-pass_split_complex_instructions::is_ldp_dependent_on_store (rtx_insn *ldp_insn, +- basic_block bb) ++pass_split_complex_instructions::is_typeof_ldp ( ++ mem_access_insn_t insn_type) + { +- find_dependent_stores_candidates (ldp_insn); +- return bfs_for_reg_dependent_store (ldp_insn, bb, ldp_insn); ++ return (insn_type == LDP || insn_type == LDP_WB || insn_type == LDP_TI); ++} ++ ++bool ++pass_split_complex_instructions::is_typeof_stp ( ++ mem_access_insn_t insn_type) ++{ ++ return (insn_type == STP || insn_type == STP_WB || insn_type == STP_TI); ++} ++ ++rtx ++pass_split_complex_instructions::get_memref ( ++ rtx_insn *insn, mem_access_insn_t insn_type) ++{ ++ rtx insn_pat = PATTERN (insn); ++ rtx memref = NULL; ++ ++ switch (insn_type) ++ { ++ case LDP: ++ memref = SET_SRC (XVECEXP (insn_pat, 0, 0)); ++ break; ++ case LDP_WB: ++ memref = SET_SRC (XVECEXP (insn_pat, 0, 1)); ++ break; ++ case LDP_TI: ++ memref = SET_SRC (insn_pat); ++ break; ++ case STP: ++ memref = SET_DEST (XVECEXP (insn_pat, 0, 0)); ++ break; ++ case STP_WB: ++ memref = SET_DEST (XVECEXP (insn_pat, 0, 1)); ++ break; ++ case STP_TI: ++ case STR: ++ memref = SET_DEST (insn_pat); ++ break; ++ default: ++ break; ++ } ++ ++ if (memref && !MEM_P (memref)) ++ return NULL; ++ return memref; ++} ++ ++rtx ++pass_split_complex_instructions::get_base_reg (rtx memref) ++{ ++ if (!memref || !MEM_P (memref)) ++ return NULL; ++ rtx addr_exp = XEXP (memref, 0); ++ ++ switch (GET_CODE (addr_exp)) ++ { ++ case REG: ++ return addr_exp; ++ case PLUS: ++ case PRE_DEC: ++ case PRE_INC: ++ case POST_DEC: ++ case POST_INC: ++ if (REG_P (XEXP (addr_exp, 0))) ++ return XEXP (addr_exp, 0); ++ default: ++ return NULL; ++ } ++} ++ ++int ++pass_split_complex_instructions::get_unit_size ( ++ rtx memref, mem_access_insn_t insn_type) ++{ ++ if (!memref) ++ return -1; ++ ++ switch (insn_type) ++ { ++ case LDP: ++ case STP: ++ case LDP_WB: ++ case STP_WB: ++ case STR: ++ return GET_MODE_SIZE (GET_MODE (memref)).to_constant (); ++ case LDP_TI: ++ case STP_TI: ++ return GET_MODE_SIZE (E_DImode).to_constant (); ++ default: ++ return -1; ++ } + } + + bool +@@ -4139,9 +4246,9 @@ pass_split_complex_instructions::bfs_for_reg_dependent_store ( + { + if (!current_search_insn) + return false; +- bool checking_result +- = is_store_reg_dependent (ldp_insn, current_search_insn); +- if (checking_result) ++ ++ if (dependent_stores_candidates.find (current_search_insn) ++ != dependent_stores_candidates.end ()) + { + if (dump_file) + { +@@ -4189,30 +4296,29 @@ pass_split_complex_instructions::init_df () + + void + pass_split_complex_instructions::find_dependent_stores_candidates ( +- rtx_insn *ldp_insn) ++ rtx_insn *ldp_insn, mem_access_insn_t insn_type) + { + dependent_stores_candidates.clear (); +- df_ref use; + +- FOR_EACH_INSN_USE (use, ldp_insn) +- { +- df_link *defs = 
get_defs (ldp_insn, DF_REF_REG (use)); +- if (!defs) +- return; ++ rtx base_reg = get_base_reg (get_memref (ldp_insn, insn_type)); ++ if (!base_reg) ++ return; + +- for (df_link *def = defs; def; def = def->next) +- { +- df_link *uses +- = get_uses (DF_REF_INSN (def->ref), DF_REF_REG (def->ref)); +- if (!uses) +- continue; ++ df_link *defs = get_defs (ldp_insn, base_reg); ++ if (!defs) ++ return; + +- for (df_link *use = uses; use; use = use->next) +- { +- if (DF_REF_CLASS (use->ref) == DF_REF_REGULAR +- && is_store_insn (DF_REF_INSN (use->ref))) +- dependent_stores_candidates.insert (DF_REF_INSN (use->ref)); +- } ++ for (df_link *def = defs; def; def = def->next) ++ { ++ df_link *uses = get_uses (DF_REF_INSN (def->ref), DF_REF_REG (def->ref)); ++ if (!uses) ++ continue; ++ for (df_link *use = uses; use; use = use->next) ++ { ++ if (DF_REF_CLASS (use->ref) == DF_REF_REGULAR ++ && DF_REF_INSN (use->ref) != ldp_insn ++ && is_store_reg_dependent (ldp_insn, DF_REF_INSN (use->ref))) ++ dependent_stores_candidates.insert (DF_REF_INSN (use->ref)); + } + } + } +@@ -4221,423 +4327,274 @@ bool + pass_split_complex_instructions::is_store_reg_dependent (rtx_insn *ldp_insn, + rtx_insn *str_insn) + { +- if (!is_store_insn (str_insn) +- || dependent_stores_candidates.find (str_insn) +- == dependent_stores_candidates.end ()) ++ if (!str_insn) + return false; + +- int ldp_offset_sign = UNDEFINED; +- int ldp_offset +- = get_insn_offset (ldp_insn, get_insn_type (ldp_insn), &ldp_offset_sign); +- if (ldp_offset_sign == MINUS) +- ldp_offset = -ldp_offset; ++ mem_access_insn_t st_type = get_insn_type (str_insn); ++ if (!is_typeof_stp (st_type) && st_type != STR) ++ return false; + +- int str_offset_sign = UNDEFINED; +- int str_offset = get_insn_offset (str_insn, STR, &str_offset_sign); +- if (str_offset_sign == MINUS) +- str_offset = -str_offset; ++ mem_access_insn_t ld_type = get_insn_type (ldp_insn); ++ rtx ld_memref = get_memref (ldp_insn, ld_type); ++ rtx st_memref = get_memref (str_insn, st_type); ++ rtx ld_base_reg = get_base_reg (ld_memref); ++ rtx st_base_reg = get_base_reg (st_memref); + +- if (str_offset == ldp_offset || str_offset == ldp_offset + 8) +- return true; ++ if (!ld_base_reg || !st_base_reg ++ || REGNO (ld_base_reg) != REGNO (st_base_reg)) ++ return false; + +- return false; +-} ++ int ld_offset = 0; ++ int st_offset = 0; ++ if (get_offset (ld_memref, ld_offset) ++ && get_offset (st_memref, st_offset)) ++ { ++ int ld_unit_size = get_unit_size (ld_memref, ld_type); ++ int st_size = get_unit_size (st_memref, st_type); ++ if (st_type != STR) ++ st_size *= 2; + +-bool +-pass_split_complex_instructions::is_store_insn (rtx_insn *insn) +-{ +- if (!insn) +- return false; +- rtx sset_b = single_set (insn); +- /* TODO: The condition below allow to take only store instructions in which +- the memory location's operand is either a register (base) or an plus/minus +- operation (base + #imm). So it might make sense to add support for other +- cases (e.g. multiply and shift). 
*/ +- if (sset_b && MEM_P (SET_DEST (sset_b)) +- && GET_MODE (XEXP (sset_b, 0)) != BLKmode +- && (GET_CODE (XEXP (XEXP (sset_b, 0), 0)) == REG +- || (GET_CODE (XEXP (XEXP (sset_b, 0), 0)) == PLUS +- || GET_CODE (XEXP (XEXP (sset_b, 0), 0)) == MINUS) +- && (GET_CODE (XEXP (XEXP (XEXP (sset_b, 0), 0), 1)) == CONST_INT))) +- return true; ++ if (ld_unit_size < 0 || st_size < 0) ++ return false; ++ ++ bool st_has_low_ld_part = (ld_offset >= st_offset ++ && (ld_offset + ld_unit_size <= st_offset + st_size)); ++ bool st_has_high_ld_part = ((ld_offset + ld_unit_size >= st_offset) ++ && (ld_offset + 2 * ld_unit_size <= st_offset + st_size)); ++ bool st_has_not_full_ld = (ld_offset < st_offset ++ || (ld_offset + 2 * ld_unit_size > st_offset + st_size)); ++ ++ if ((st_has_low_ld_part || st_has_high_ld_part) && st_has_not_full_ld) ++ return true; ++ } + + return false; + } + +-int +-pass_split_complex_instructions::get_insn_offset ( +- rtx_insn *insn, complex_instructions_t insn_type, int *arith_operation_ptr) ++bool ++pass_split_complex_instructions::get_offset (rtx memref, int &offset) + { +- rtx insn_pat = PATTERN (insn); +- int returned_offset = 0; ++ rtx addr_exp = XEXP (memref, 0); + +- rtx offset_expr = NULL; +- rtx offset_value_expr = NULL; +- +- switch (insn_type) ++ switch (GET_CODE (addr_exp)) + { +- case LDP: +- { +- int number_of_sub_insns = XVECLEN (insn_pat, 0); +- +- /* Calculate it's own ofsset of first load insn. */ +- rtx_insn *first_load_insn = NULL; +- if (number_of_sub_insns == 2) ++ case REG: ++ case POST_DEC: ++ case POST_INC: ++ offset = 0; ++ return true; ++ case PRE_DEC: ++ offset = -(GET_MODE_SIZE (GET_MODE (memref)).to_constant ()); ++ return true; ++ case PRE_INC: ++ offset = GET_MODE_SIZE (GET_MODE (memref)).to_constant (); ++ return true; ++ case PLUS: ++ if (CONST_INT_P (XEXP (addr_exp, 1))) + { +- first_load_insn +- = make_insn_raw (copy_rtx (XVECEXP (insn_pat, 0, 0))); +- arith_operation_ptr = NULL; +- +- offset_expr = XEXP (XEXP (PATTERN (first_load_insn), 1), 0); +- if (GET_CODE (offset_expr) == PLUS +- || GET_CODE (offset_expr) == MINUS) +- offset_value_expr +- = XEXP (XEXP (XEXP (PATTERN (first_load_insn), 1), 0), 1); +- else +- offset_expr = NULL; ++ offset = INTVAL (XEXP (addr_exp, 1)); ++ return true; + } +- else if (number_of_sub_insns == 3) +- { +- rtx_insn *offset_sub_insn +- = make_insn_raw (copy_rtx (XVECEXP (insn_pat, 0, 0))); +- +- offset_expr = XEXP (PATTERN (offset_sub_insn), 1); +- offset_value_expr = XEXP (XEXP (PATTERN (offset_sub_insn), 1), 1); +- } +- else +- { +- gcc_assert (false +- && "Wrong number of elements in the ldp_insn vector"); +- } +- break; +- } +- case LDP_TI: +- { +- offset_expr = XEXP (XEXP (insn_pat, 1), 0); +- if (GET_CODE (offset_expr) != PLUS && GET_CODE (offset_expr) != MINUS) +- return 0; +- offset_value_expr = XEXP (XEXP (XEXP (insn_pat, 1), 0), 1); +- break; +- } +- case STR: +- { +- offset_expr = XEXP (XEXP (insn_pat, 0), 0); +- /* If memory location is specified by single base register then the +- offset is zero. 
*/ +- if (GET_CODE (offset_expr) == REG) +- return 0; +- offset_value_expr = XEXP (XEXP (XEXP (insn_pat, 0), 0), 1); +- break; +- } +- default: +- { +- if (dumps_are_enabled && dump_file) +- { +- fprintf (dump_file, "Instruction that was tried to split:\n"); +- print_rtl_single (dump_file, insn); +- } +- gcc_assert (false && "Unsupported instruction type"); +- break; +- } +- } +- +- if (offset_expr != NULL && offset_value_expr +- && GET_CODE (offset_value_expr) == CONST_INT) +- returned_offset = XINT (offset_value_expr, 0); +- +- if (arith_operation_ptr != NULL) +- { +- *arith_operation_ptr = GET_CODE (offset_expr); +- gcc_assert ((*arith_operation_ptr == MINUS +- || *arith_operation_ptr == PLUS) +- && "Unexpected arithmetic operation in the offset expr"); ++ default: ++ return false; + } +- +- return returned_offset; + } + + void +-pass_split_complex_instructions::split_simple_ldp (rtx_insn *ldp_insn) ++pass_split_complex_instructions::replace_insn (rtx_insn *old_insn, ++ rtx_insn *new_insns) + { +- rtx pat = PATTERN (ldp_insn); +- +- rtx_insn *mem_insn_1 = make_insn_raw (copy_rtx (XVECEXP (pat, 0, 0))); +- rtx_insn *mem_insn_2 = make_insn_raw (copy_rtx (XVECEXP (pat, 0, 1))); +- +- int dest_regno = REGNO (SET_DEST (PATTERN (mem_insn_1))); +- int src_regno; +- +- rtx srs_reg_insn = XEXP (SET_SRC (PATTERN (mem_insn_1)), 0); +- +- if (GET_CODE (srs_reg_insn) == REG) +- src_regno = REGNO (srs_reg_insn); +- else +- src_regno = REGNO (XEXP (srs_reg_insn, 0)); +- +- rtx_insn *emited_insn_1, *emited_insn_2; ++ rtx_insn *prev_insn = PREV_INSN (old_insn); ++ start_sequence (); + +- /* in cases like ldp r1,r2,[r1] we emit ldr r2,[r1] first. */ +- if (src_regno == dest_regno) +- std::swap (mem_insn_1, mem_insn_2); ++ emit_insn (new_insns); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Split LDP:\n"); ++ print_rtl_single (dump_file, old_insn); ++ fprintf (dump_file, "Split into:\n"); ++ } + +- emited_insn_1 = emit_insn (PATTERN (mem_insn_1)); +- emited_insn_2 = emit_insn (PATTERN (mem_insn_2)); ++ for (rtx_insn *insn = new_insns; insn; insn = NEXT_INSN (insn)) ++ { ++ INSN_CODE (insn) = recog (PATTERN (insn), insn, NULL); ++ if (dump_file) ++ { ++ print_rtl_single (dump_file, insn); ++ } ++ } + +- int sub_insn_1_code = recog (PATTERN (mem_insn_1), mem_insn_1, 0); +- int sub_insn_2_code = recog (PATTERN (mem_insn_2), mem_insn_2, 0); ++ rtx_insn *seq = get_insns (); ++ unshare_all_rtl_in_chain (seq); ++ end_sequence (); + +- INSN_CODE (emited_insn_1) = sub_insn_1_code; +- INSN_CODE (emited_insn_2) = sub_insn_2_code; ++ emit_insn_after_setloc (seq, prev_insn, INSN_LOCATION (old_insn)); ++ delete_insn_and_edges (old_insn); + } + + void +-pass_split_complex_instructions::split_ldp_with_offset (rtx_insn *ldp_insn) ++pass_split_complex_instructions::split_ldp (rtx_insn *ldp_insn) + { + rtx pat = PATTERN (ldp_insn); +- bool post_index = true; +- +- rtx_insn offset_insn; +- rtx_insn mem_insn_1; +- rtx_insn mem_insn_2; ++ mem_access_insn_t insn_type = get_insn_type (ldp_insn); ++ gcc_assert (is_typeof_ldp (insn_type)); + +- int offset_insn_code; +- int mem_insn_1_code = -1; +- int mem_insn_2_code = -1; ++ rtx load_rtx_1 = NULL; ++ rtx load_rtx_2 = NULL; ++ rtx post_index_rtx = NULL; + +- int offset = 0; +- int arith_operation = UNDEFINED; +- +- for (int i = 0; i < 3; i++) ++ switch (insn_type) + { +- rtx sub_insn = XVECEXP (pat, 0, i); +- rtx_insn *copy_of_sub_insn = make_insn_raw (copy_rtx (sub_insn)); +- int sub_insn_code +- = recog (PATTERN (copy_of_sub_insn), copy_of_sub_insn, 0); +- +- /* If sub_insn is 
offset related. */ +- if (GET_RTX_CLASS (sub_insn_code) == RTX_UNARY) +- { +- offset_insn = *copy_of_sub_insn; +- offset_insn_code = sub_insn_code; +- gcc_assert (i == 0 +- && "Offset related insn must be the first " +- "element of a parallel insn vector"); +- +- offset = get_insn_offset (ldp_insn, LDP, &arith_operation); +- } +- else +- { +- if (GET_CODE (XEXP (PATTERN (copy_of_sub_insn), 0)) != REG) +- { +- rtx &offset_expr +- = XEXP (XEXP (XEXP (PATTERN (copy_of_sub_insn), 0), 0), 1); +- if (GET_CODE (offset_expr) == CONST_INT) +- { +- int local_offset = XINT (offset_expr, 0); +- offset = (arith_operation == PLUS ? offset : -offset); +- +- offset_expr = GEN_INT (local_offset + offset); +- +- gcc_assert ( +- (arith_operation == MINUS || arith_operation == PLUS) +- && "Unexpected arithmetic operation in offset related " +- "sub_insn"); +- +- if (i == 1) +- post_index = false; +- } +- else +- { +- post_index = true; +- } +- } +- } +- if (i == 1) +- { +- mem_insn_1 = *copy_of_sub_insn; +- mem_insn_1_code = sub_insn_code; +- } +- if (i == 2) +- { +- mem_insn_2 = *copy_of_sub_insn; +- mem_insn_2_code = sub_insn_code; +- } ++ case LDP: ++ load_rtx_1 = copy_rtx (XVECEXP (pat, 0, 0)); ++ load_rtx_2 = copy_rtx (XVECEXP (pat, 0, 1)); ++ break; ++ case LDP_WB: ++ post_index_rtx = copy_rtx (XVECEXP (pat, 0, 0)); ++ load_rtx_1 = copy_rtx (XVECEXP (pat, 0, 1)); ++ load_rtx_2 = copy_rtx (XVECEXP (pat, 0, 2)); ++ break; ++ case LDP_TI: ++ split_ldp_ti (ldp_insn); ++ return; ++ default: ++ return; + } +- gcc_assert (mem_insn_1_code != -1 && mem_insn_2_code != -1 +- && "Uninitialized memory insns"); + +- int dest_regno = REGNO (SET_DEST (PATTERN (&mem_insn_1))); +- int src_regno; +- +- rtx srs_reg_insn = XEXP (SET_SRC (PATTERN (&mem_insn_1)), 0); +- +- if (GET_CODE (srs_reg_insn) == REG) +- src_regno = REGNO (srs_reg_insn); +- else +- src_regno = REGNO (XEXP (srs_reg_insn, 0)); ++ int dest_regno = REGNO (SET_DEST (load_rtx_1)); ++ int base_regno = REGNO (get_base_reg (get_memref (ldp_insn, insn_type))); + +- /* Don't split such weird LDP. */ +- if (src_regno == dest_regno) +- return; +- +- rtx_insn *emited_offset_insn; +- if (!post_index) ++ /* In cases like ldp r1,r2,[r1[, #imm]] emit ldr r2,[r1[, #imm]] first. ++ For LDP with post-index don't split such instruction. */ ++ if (base_regno == dest_regno) + { +- emited_offset_insn = emit_insn (PATTERN (&offset_insn)); +- INSN_CODE (emited_offset_insn) = offset_insn_code; ++ if (insn_type == LDP) ++ std::swap (load_rtx_1, load_rtx_2); ++ else ++ return; + } + +- rtx_insn *emited_insn_1 = emit_insn (PATTERN (&mem_insn_1)); +- rtx_insn *emited_insn_2 = emit_insn (PATTERN (&mem_insn_2)); +- +- +- INSN_CODE (emited_insn_1) = mem_insn_1_code; +- INSN_CODE (emited_insn_2) = mem_insn_2_code; +- +- if (post_index) ++ /* Construct the instruction chain for subsequent emitting. 
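++     E.g. (as a sketch) ldp x29, x30, [sp], 96 becomes ldr x29, [sp];
++     ldr x30, [sp, 8]; add sp, sp, 96, i.e. the writeback update is
++     emitted after both loads.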
*/ ++ rtx_insn *insn_seq = make_insn_raw (load_rtx_1); ++ rtx_insn *load_insn_2 = make_insn_raw (load_rtx_2); ++ SET_NEXT_INSN (insn_seq) = load_insn_2; ++ SET_NEXT_INSN (load_insn_2) = NULL; ++ if (post_index_rtx) + { +- emited_offset_insn = emit_insn (PATTERN (&offset_insn)); +- INSN_CODE (emited_offset_insn) = offset_insn_code; ++ rtx_insn *post_index_insn = make_insn_raw (post_index_rtx); ++ SET_NEXT_INSN (load_insn_2) = post_index_insn; ++ SET_NEXT_INSN (post_index_insn) = NULL; + } +-} +- +-void +-pass_split_complex_instructions::split_ldp_stp (rtx_insn *insn) +-{ +- rtx_insn *prev_insn = PREV_INSN (insn); +- int number_of_sub_insns = XVECLEN (PATTERN (insn), 0); +- +- start_sequence (); + +- if (number_of_sub_insns == 2) +- split_simple_ldp (insn); +- else if (number_of_sub_insns == 3) +- split_ldp_with_offset (insn); +- else +- gcc_assert (false && "Broken complex insn vector"); +- +- rtx_insn *seq = get_insns (); +- unshare_all_rtl_in_chain (seq); +- end_sequence (); +- +- emit_insn_after_setloc (seq, prev_insn, INSN_LOCATION (insn)); +- delete_insn_and_edges (insn); ++ replace_insn (ldp_insn, insn_seq); + } + + void + pass_split_complex_instructions::split_ldp_ti (rtx_insn *insn) + { +- rtx_insn *prev_insn = PREV_INSN (insn); +- rtx_insn *load_insn_1 = make_insn_raw (copy_rtx (PATTERN (insn))); +- rtx_insn *load_insn_2 = make_insn_raw (copy_rtx (PATTERN (insn))); +- +- rtx reg_insn_1 = XEXP (PATTERN (load_insn_1), 0); +- rtx mem_insn_1 = XEXP (PATTERN (load_insn_1), 1); +- rtx mem_insn_2 = XEXP (PATTERN (load_insn_2), 1); +- +- PUT_MODE (mem_insn_1, DImode); +- PUT_MODE (mem_insn_2, DImode); +- +- int reg_no_1 = REGNO (reg_insn_1); ++ rtx pat = PATTERN (insn); ++ rtx memref = get_memref (insn, LDP_TI); ++ int unit_size = get_unit_size (memref, LDP_TI); ++ rtx base_reg = get_base_reg (memref); ++ rtx dest_reg = SET_DEST (pat); ++ ++ rtx reg_index_rtx = NULL; ++ rtx load_rtx_1 = NULL; ++ rtx load_rtx_2 = NULL; ++ bool post_index = false; ++ int offset = 0; + +- XEXP (PATTERN (load_insn_1), 0) = gen_rtx_REG (DImode, reg_no_1); +- XEXP (PATTERN (load_insn_2), 0) = gen_rtx_REG (DImode, reg_no_1 + 1); ++ rtx load_1_memref = gen_rtx_MEM (DImode, base_reg); + +- rtx load_insn_2_plus_expr = XEXP (XEXP (PATTERN (load_insn_2), 1), 0); +- if (GET_CODE (load_insn_2_plus_expr) == REG) ++ rtx addr_expr = XEXP (memref, 0); ++ if (GET_CODE (addr_expr) == PLUS) + { +- XEXP (XEXP (PATTERN (load_insn_2), 1), 0) +- = gen_rtx_PLUS (DImode, +- gen_rtx_REG (DImode, REGNO (load_insn_2_plus_expr)), +- GEN_INT (GET_MODE_SIZE (DImode))); ++ offset = INTVAL (XEXP (addr_expr, 1)); ++ XEXP (load_1_memref, 0) = gen_rtx_PLUS (DImode, base_reg, ++ GEN_INT (offset)); + } +- else +- { +- rtx load_insn_2_offset_expr +- = XEXP (XEXP (XEXP (PATTERN (load_insn_2), 1), 0), 1); + +- if (load_insn_2_offset_expr == NULL) +- return; +- +- if (GET_CODE (load_insn_2_offset_expr) == CONST_INT) +- { +- int load_insn_2_offset = XINT (load_insn_2_offset_expr, 0); +- XEXP (XEXP (XEXP (PATTERN (load_insn_2), 1), 0), 1) +- = GEN_INT (load_insn_2_offset + GET_MODE_SIZE (DImode)); +- } +- } +- +- start_sequence (); ++ rtx load_2_memref = gen_rtx_MEM (DImode, ++ gen_rtx_PLUS (DImode, base_reg, GEN_INT (offset + unit_size))); + +- int src_regno; +- rtx srs_reg_insn = XEXP (XEXP (PATTERN (load_insn_1), 1), 0); ++ load_rtx_1 = gen_rtx_SET (gen_rtx_REG (DImode, REGNO (dest_reg)), ++ load_1_memref); ++ load_rtx_2 = gen_rtx_SET (gen_rtx_REG (DImode, REGNO (dest_reg) + 1), ++ load_2_memref); + +- if (GET_CODE (srs_reg_insn) == REG) +- 
src_regno = REGNO (srs_reg_insn); +- else +- src_regno = REGNO (XEXP (srs_reg_insn, 0)); ++ if (GET_CODE (addr_expr) == PRE_INC || GET_CODE (addr_expr) == PRE_DEC ++ || GET_CODE (addr_expr) == POST_INC || GET_CODE (addr_expr) == POST_DEC) ++ { ++ /* The amount of increment or decrement is equal to size of ++ machine-mode of the containing MEMREF (see rtl.def). */ ++ int index_offset = GET_MODE_SIZE (GET_MODE (memref)).to_constant (); + +- /* in cases like ldp r1,r2,[r1] we emit ldr r2,[r1] first. */ +- if (src_regno == reg_no_1) +- std::swap (load_insn_1, load_insn_2); ++ if (GET_CODE (addr_expr) == PRE_DEC || GET_CODE (addr_expr) == POST_DEC) ++ index_offset = -index_offset; + +- rtx_insn *emited_load_insn_1 = emit_insn (PATTERN (load_insn_1)); +- rtx_insn *emited_load_insn_2 = emit_insn (PATTERN (load_insn_2)); ++ if (GET_CODE (addr_expr) == POST_INC || GET_CODE (addr_expr) == POST_DEC) ++ post_index = true; + +- INSN_CODE (emited_load_insn_1) +- = recog (PATTERN (emited_load_insn_1), emited_load_insn_1, 0); +- INSN_CODE (emited_load_insn_2) +- = recog (PATTERN (emited_load_insn_2), emited_load_insn_2, 0); ++ reg_index_rtx = gen_rtx_SET (base_reg, ++ gen_rtx_PLUS (DImode, base_reg, ++ GEN_INT (index_offset))); ++ } + +- rtx_insn *seq = get_insns (); +- unshare_all_rtl_in_chain (seq); +- end_sequence (); ++ /* In cases like ldp r1,r2,[r1] we emit ldr r2,[r1] first. */ ++ if (REGNO (base_reg) == REGNO (dest_reg)) ++ std::swap (load_rtx_1, load_rtx_2); + +- emit_insn_after_setloc (seq, prev_insn, INSN_LOCATION (insn)); +- delete_insn_and_edges (insn); +-} ++ /* Construct the instruction chain for subsequent emitting. */ ++ rtx_insn *insn_seq = make_insn_raw (load_rtx_1); ++ rtx_insn *load_insn_2 = make_insn_raw (load_rtx_2); ++ SET_NEXT_INSN (insn_seq) = load_insn_2; ++ SET_NEXT_INSN (load_insn_2) = NULL; ++ if (post_index && reg_index_rtx) ++ { ++ rtx_insn *post_index_insn = make_insn_raw (reg_index_rtx); ++ SET_NEXT_INSN (load_insn_2) = post_index_insn; ++ SET_NEXT_INSN (post_index_insn) = NULL; ++ } ++ else if (!post_index && reg_index_rtx) ++ { ++ rtx_insn *pre_index = make_insn_raw (reg_index_rtx); ++ SET_NEXT_INSN (pre_index) = insn_seq; ++ insn_seq = pre_index; ++ } + +-void +-pass_split_complex_instructions::split_complex_insn (rtx_insn *insn) +-{ +- complex_instructions_t insn_type = get_insn_type (insn); +- /* TODO: Add splitting of STP instructions. */ +- if (insn_type == LDP || insn_type == STP) +- split_ldp_stp (insn); +- else if (insn_type == LDP_TI) +- split_ldp_ti (insn); +- else +- gcc_assert (false && "Unsupported type of insn to split"); ++ replace_insn (insn, insn_seq); + } + +-pass_split_complex_instructions::complex_instructions_t ++pass_split_complex_instructions::mem_access_insn_t + pass_split_complex_instructions::get_insn_type (rtx_insn *insn) + { + if (!INSN_P (insn)) + return UNDEFINED; + +- rtx pat = PATTERN (insn); +- int icode = recog (PATTERN (insn), insn, NULL); ++ int icode = INSN_CODE (insn); ++ if (icode == -1) ++ icode = recog (PATTERN (insn), insn, 0); ++ bool has_wb = false; ++ ++ if (targetm.is_ldp_insn (icode, &has_wb)) ++ return (has_wb ? LDP_WB : LDP); + +- if (GET_CODE (pat) == PARALLEL) ++ if (targetm.is_stp_insn (icode, &has_wb)) ++ return (has_wb ? 
STP_WB : STP); ++ ++ rtx set_insn = single_set (insn); ++ if (set_insn && (GET_MODE (SET_SRC (set_insn)) == E_TImode ++ || GET_MODE (SET_DEST (set_insn)) == E_TImode)) + { +- if (targetm.is_ldp_insn (icode)) +- { +- return LDP; +- } +- if (targetm.is_stp_insn (icode)) +- { +- return STP; +- } +- else +- { +- return UNDEFINED; +- } ++ if (MEM_P (SET_SRC (set_insn)) && REG_P (SET_DEST (set_insn))) ++ return LDP_TI; ++ if (MEM_P (SET_DEST (set_insn)) && REG_P (SET_SRC (set_insn))) ++ return STP_TI; + } +- rtx set_insn = single_set (insn); +- if (set_insn && GET_CODE (XEXP (set_insn, 1)) == MEM +- && GET_MODE (XEXP (set_insn, 1)) == E_TImode) +- return LDP_TI; ++ ++ if (set_insn && MEM_P (SET_DEST (set_insn)) && REG_P (SET_SRC (set_insn)) ++ && GET_MODE (SET_DEST (set_insn)) != BLKmode) ++ return STR; + + return UNDEFINED; + } +diff --git a/gcc/target.def b/gcc/target.def +index b4dff78ea..2e3eae9f3 100644 +--- a/gcc/target.def ++++ b/gcc/target.def +@@ -2770,13 +2770,19 @@ DEFHOOK + + DEFHOOK + (is_ldp_insn, +- "Return true if icode is corresponding to any of the LDP instruction types.", +- bool, (int icode), NULL) ++ "Return true if @var{icode} is corresponding to any of the LDP instruction\n\ ++types. If @var{has_wb} is not NULL then its value is set to true if LDP\n\ ++contains post-index or pre-index operation.", ++ bool, (int icode, bool *has_wb), ++ NULL) + + DEFHOOK + (is_stp_insn, +- "Return true if icode is corresponding to any of the STP instruction types.", +- bool, (int icode), NULL) ++ "Return true if @var{icode} is corresponding to any of the STP instruction\n\ ++types. If @var{has_wb} is not NULL then its value is set to true if STP\n\ ++contains post-index or pre-index operation.", ++ bool, (int icode, bool *has_wb), ++ NULL) + + DEFHOOK + (gen_ccmp_first, +diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c +index 3918d43f6..2d42231dc 100644 +--- a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c ++++ b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-dont-split.c +@@ -1,5 +1,5 @@ + /* { dg-do compile { target aarch64-*-* } } */ +-/* { dg-additional-options "-fsplit-ldp-stp" } */ ++/* { dg-additional-options "-O1 -fsplit-ldp-stp" } */ + /* + * Tests are: + * Patterns where LDP insns should NOT be split +@@ -15,6 +15,9 @@ simple_ldp_after_store () + (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK) + (cinsn 228 (set (reg/i:DI sp) + (reg/i:DI x0))) ++ (cinsn 238 (set (reg/i:DI x1) ++ (reg/i:DI x0))) ++ + (cinsn 101 (set (mem/c:DI + (plus:DI (reg/f:DI sp) + (const_int 32))[1 S4 A32])(reg:DI x0))) +@@ -24,11 +27,27 @@ simple_ldp_after_store () + (set (reg:DI x30) + (mem:DI (plus:DI (reg/f:DI sp) + (const_int 16)) [1 S4 A32]))])) +- (cinsn 11 (use (reg/i:DI sp))) +- (cinsn 12 (use (reg/i:DI cc))) +- (cinsn 13 (use (reg/i:DI x29))) +- (cinsn 14 (use (reg/i:DI x30))) +- (cinsn 15 (use (reg/i:DI x0))) ++ (cinsn 11 (use (reg/i:DI x29))) ++ (cinsn 12 (use (reg/i:DI x30))) ++ ++ /* stp x0, x2, [x1]. */ ++ (cinsn 102 (parallel [ ++ (set (mem:DI (reg/f:DI x1) [1 S4 A32]) ++ (reg:DI x0)) ++ (set (mem:DI (plus:DI (reg/f:DI x1) (const_int 8)) [1 S4 A32]) ++ (reg:DI x2))])) ++ /* ldp x5, x6, [x1]. 
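++      The stp above writes both halves of the loaded range, so this ldp
++      must stay intact: the whole pair can be forwarded from the store.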
*/ ++ (cinsn 13 (parallel [ ++ (set (reg:DI x5) (mem:DI (reg/f:DI x1) [1 S4 A32])) ++ (set (reg:DI x6) (mem:DI (plus:DI (reg/f:DI x1) ++ (const_int 8)) [1 S4 A32])) ++ ])) ++ (cinsn 14 (use (reg/i:DI x5))) ++ (cinsn 15 (use (reg/i:DI x6))) ++ ++ (cinsn 100 (use (reg/i:DI sp))) ++ (cinsn 200 (use (reg/i:DI cc))) ++ (cinsn 300 (use (reg/i:DI x0))) + (edge-to exit (flags "FALLTHRU")) + ) ;; block 2 + ) ;; insn-chain +@@ -70,5 +89,5 @@ ldp_after_store_in_different_bb () + ) ;; function "ldp_after_store_in_different_bb" + } + +-/* Verify that the output code contains exactly 2 ldp. */ +-/* { dg-final { scan-assembler-times {ldp\t} 2 } } */ +\ No newline at end of file ++/* Verify that the output code contains exactly 3 ldp. */ ++/* { dg-final { scan-assembler-times {ldp\t} 3 } } */ +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c +index 8c035c3e1..b9d745185 100644 +--- a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c ++++ b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split-rearrange.c +@@ -1,5 +1,5 @@ + /* { dg-do compile { target aarch64-*-* } } */ +-/* { dg-additional-options "-fsplit-ldp-stp" } */ ++/* { dg-additional-options "-O1 -fsplit-ldp-stp" } */ + /* + * Test is: + * Pattern where LDP insns should be split with rearrangement in order +diff --git a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c +index 2615e4fa1..0b280022f 100644 +--- a/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c ++++ b/gcc/testsuite/gcc.dg/rtl/aarch64/test-ldp-split.c +@@ -13,48 +13,131 @@ simple_ldp_after_store () + (block 2 + (edge-from entry (flags "FALLTHRU")) + (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK) ++ /* mov sp, x0. */ + (cinsn 228 (set (reg/i:DI sp) +- (reg/i:DI x0))) ++ (reg/i:DI x0))) ++ /* mov x1, x0. */ + (cinsn 238 (set (reg/i:DI x1) +- (reg/i:DI x0))) ++ (reg/i:DI x0))) + ++ /* str x0, [sp, 8]. */ + (cinsn 101 (set (mem/c:DI + (plus:DI (reg/f:DI sp) + (const_int 8))[1 S4 A32])(reg:DI x0))) ++ /* ldp x29, x30, [sp, 8]. */ + (cinsn 10 (parallel [ + (set (reg:DI x29) + (mem:DI (plus:DI (reg/f:DI sp) (const_int 8)) [1 S4 A32])) + (set (reg:DI x30) + (mem:DI (plus:DI (reg/f:DI sp) + (const_int 16)) [1 S4 A32]))])) ++ (cinsn 11 (use (reg/i:DI x29))) ++ (cinsn 12 (use (reg/i:DI x30))) + ++ /* str x0, [x1, -16]. */ + (cinsn 102 (set (mem/c:DI (plus:DI (reg/f:DI x1) + (const_int -16)) [1 S4 A32]) + (reg:DI x0))) +- (cinsn 11 (parallel [ ++ /* ldp x3, x4, [x1, -16]. */ ++ (cinsn 13 (parallel [ + (set (reg:DI x3) + (mem:DI (plus:DI (reg/f:DI x1) (const_int -16)) [1 S4 A32])) + (set (reg:DI x4) + (mem:DI (plus:DI (reg/f:DI x1) (const_int -8)) [1 S4 A32])) + ])) ++ (cinsn 14 (use (reg/i:DI x3))) ++ (cinsn 15 (use (reg/i:DI x4))) + ++ /* str x0, [x1]. */ + (cinsn 103 (set (mem/c:DI (reg/f:DI x1) [1 S4 A32]) + (reg:DI x0))) +- (cinsn 12 (parallel [ ++ /* ldp x5, x6, [x1]. */ ++ (cinsn 16 (parallel [ + (set (reg:DI x5) (mem:DI (reg/f:DI x1) [1 S4 A32])) + (set (reg:DI x6) (mem:DI (plus:DI (reg/f:DI x1) + (const_int 8)) [1 S4 A32])) + ])) ++ (cinsn 17 (use (reg/i:DI x5))) ++ (cinsn 18 (use (reg/i:DI x6))) + +- (cinsn 13 (use (reg/i:DI sp))) +- (cinsn 14 (use (reg/i:DI cc))) +- (cinsn 15 (use (reg/i:DI x29))) +- (cinsn 16 (use (reg/i:DI x30))) +- (cinsn 17 (use (reg/i:DI x0))) +- (cinsn 18 (use (reg/i:DI x3))) +- (cinsn 19 (use (reg/i:DI x4))) +- (cinsn 20 (use (reg/i:DI x5))) +- (cinsn 21 (use (reg/i:DI x6))) ++ /* ldp x29, x30, [sp], 96. 
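++      (post-index form: both loads use the pre-increment sp, after which
++      sp is advanced by 96)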
*/ ++ (cinsn 19 (parallel [ ++ (set (reg/f:DI sp) ++ (plus:DI (reg/f:DI sp) (const_int 96))) ++ (set (reg:DI x29) ++ (mem:DI (reg/f:DI sp) [1 S4 A32])) ++ (set (reg:DI x30) ++ (mem:DI (plus:DI (reg/f:DI sp) ++ (const_int 8)) [1 S4 A32]))])) ++ (cinsn 20 (use (reg/i:DI x29))) ++ (cinsn 21 (use (reg/i:DI x30))) ++ ++ /* stp x0, x2, [x1, 128]. */ ++ (cinsn 104 (parallel [ ++ (set (mem:DI (plus:DI (reg/f:DI x1) (const_int 128)) [1 S4 A32]) ++ (reg:DI x0)) ++ (set (mem:DI (plus:DI (reg/f:DI x1) (const_int 136)) [1 S4 A32]) ++ (reg:DI x2))])) ++ /* ldp x29, x30, [x1, 120]. */ ++ (cinsn 22 (parallel [ ++ (set (reg:DI x29) ++ (mem:DI (plus:DI (reg/f:DI x1) (const_int 120)) [1 S4 A32])) ++ (set (reg:DI x30) ++ (mem:DI (plus:DI (reg/f:DI x1) (const_int 128)) [1 S4 A32]))])) ++ (cinsn 23 (use (reg/i:DI x29))) ++ (cinsn 24 (use (reg/i:DI x30))) ++ ++ /* stp x0, x2, [x1, 128]. */ ++ (cinsn 105 (parallel [ ++ (set (mem:DI (plus:DI (reg/f:DI x1) (const_int 128)) [1 S4 A32]) ++ (reg:DI x0)) ++ (set (mem:DI (plus:DI (reg/f:DI x1) (const_int 136)) [1 S4 A32]) ++ (reg:DI x2))])) ++ /* ldp x3, x4, [x1, 136]. */ ++ (cinsn 25 (parallel [ ++ (set (reg:DI x3) ++ (mem:DI (plus:DI (reg/f:DI x1) (const_int 136)) [1 S4 A32])) ++ (set (reg:DI x4) ++ (mem:DI (plus:DI (reg/f:DI x1) (const_int 144)) [1 S4 A32])) ++ ])) ++ (cinsn 26 (use (reg/i:DI x3))) ++ (cinsn 27 (use (reg/i:DI x4))) ++ ++ /* stp w0, w2, [x1, 32]. */ ++ (cinsn 106 (parallel [ ++ (set (mem:SI (plus:DI (reg/f:DI x1) (const_int 32)) [1 S4 A32]) ++ (reg:SI x0)) ++ (set (mem:SI (plus:DI (reg/f:DI x1) (const_int 36)) [1 S4 A32]) ++ (reg:SI x2))])) ++ /* ldp x5, x6, [x1, 32]. */ ++ (cinsn 28 (parallel [ ++ (set (reg:DI x5) (mem:DI (plus:DI (reg/f:DI x1) ++ (const_int 32)) [1 S4 A32])) ++ (set (reg:DI x6) (mem:DI (plus:DI (reg/f:DI x1) ++ (const_int 40)) [1 S4 A32])) ++ ])) ++ (cinsn 29 (use (reg/i:DI x5))) ++ (cinsn 30 (use (reg/i:DI x6))) ++ ++ /* stp w0, w2, [x1, 40]. */ ++ (cinsn 107 (parallel [ ++ (set (mem:SI (plus:DI (reg/f:DI x1) (const_int 40)) [1 S4 A32]) ++ (reg:SI x0)) ++ (set (mem:SI (plus:DI (reg/f:DI x1) (const_int 44)) [1 S4 A32]) ++ (reg:SI x2))])) ++ /* ldp x5, x6, [x1, 32]. */ ++ (cinsn 31 (parallel [ ++ (set (reg:DI x5) (mem:DI (plus:DI (reg/f:DI x1) ++ (const_int 32)) [1 S4 A32])) ++ (set (reg:DI x6) (mem:DI (plus:DI (reg/f:DI x1) ++ (const_int 40)) [1 S4 A32])) ++ ])) ++ (cinsn 32 (use (reg/i:DI x5))) ++ (cinsn 33 (use (reg/i:DI x6))) ++ ++ (cinsn 100 (use (reg/i:DI sp))) ++ (cinsn 200 (use (reg/i:DI cc))) ++ (cinsn 400 (use (reg/i:DI x0))) + (edge-to exit (flags "FALLTHRU")) + ) ;; block 2 + ) ;; insn-chain +@@ -69,43 +152,83 @@ ldp_ti_after_store () + (block 2 + (edge-from entry (flags "FALLTHRU")) + (cnote 3 [bb 2] NOTE_INSN_BASIC_BLOCK) ++ /* mov sp, x0. */ + (cinsn 228 (set (reg/i:DI sp) +- (reg/i:DI x0))) ++ (reg/i:DI x0))) ++ /* mov x2, x0. */ + (cinsn 238 (set (reg/i:DI x2) +- (reg/i:DI x0))) +- ++ (reg/i:DI x0))) ++ /* str x0, [sp, 136]. */ + (cinsn 101 (set (mem/c:DI + (plus:DI (reg/f:DI sp) + (const_int 136))[1 S4 A32])(reg:DI x0))) +- (insn 81 (set (reg:TI x0 [1 S4 A32]) ++ /* ldp x0, x1, [sp, 136]. */ ++ (cinsn 81 (set (reg:TI x0 [1 S4 A32]) + (mem/c:TI (plus:DI (reg/f:DI sp) +- (const_int 136 )) [1 S4 A32])) +- (expr_list:REG_EQUIV (mem/c:TI (plus:DI (reg/f:DI sfp) +- (const_int -24 )) [1 S4 A32]) +- (nil))) +- ++ (const_int 136)) [1 S4 A32]))) ++ /* str x0, [x2, -16]. 
*/ + (cinsn 102 (set (mem/c:DI (plus:DI (reg/f:DI x2) +- (const_int -16)) [1 S4 A32]) ++ (const_int -16)) [1 S4 A32]) + (reg:DI x0))) +- (insn 82 (set (reg:TI x3 [1 S4 A32]) ++ /* ldp x3, x4, [x2, -16]. */ ++ (cinsn 82 (set (reg:TI x3 [1 S4 A32]) + (mem/c:TI (plus:DI (reg/f:DI x2) +- (const_int -16)) [1 S4 A32]))) +- ++ (const_int -16)) [1 S4 A32]))) ++ /* str x0, [x2]. */ + (cinsn 103 (set (mem/c:DI (reg/f:DI x2) [1 S4 A32]) + (reg:DI x0))) +- (insn 83 (set (reg:TI x5 [1 S4 A32]) ++ /* ldp x5, x6, [x2]. */ ++ (cinsn 83 (set (reg:TI x5 [1 S4 A32]) + (mem/c:TI (reg/f:DI x2) [1 S4 A32]))) + ++ /* stp x0, x1, [sp, -8]. */ ++ (cinsn 104 (set (mem:TI (plus:DI (reg/v/f:DI sp) ++ (const_int -8)) [1 S4 A32]) ++ (reg:TI x0))) ++ /* ldp x5, x6, [sp], -16. */ ++ (cinsn 84 (set (reg/v:TI x5 [1 S4 A32]) ++ (mem:TI (post_dec:DI (reg/v/f:DI sp)) [1 S4 A32]))) ++ (cinsn 85 (use (reg/i:DI x5))) ++ (cinsn 86 (use (reg/i:DI x6))) ++ ++ /* stp x0, x1, [sp, 8]. */ ++ (cinsn 105 (set (mem:TI (plus:DI (reg/v/f:DI sp) ++ (const_int 8)) [1 S4 A32]) ++ (reg:TI x0))) ++ /* ldp x5, x6, [sp], -16. */ ++ (cinsn 87 (set (reg/v:TI x5 [1 S4 A32]) ++ (mem:TI (post_dec:DI (reg/v/f:DI sp)) [1 S4 A32]))) ++ (cinsn 88 (use (reg/i:DI x5))) ++ (cinsn 89 (use (reg/i:DI x6))) ++ ++ /* Intersects with insn 102. */ ++ /* ldp x2, x3, [x2, -16]!. */ ++ (cinsn 90 (set (reg/v:TI x2 [1 S4 A32]) ++ (mem:TI (pre_dec:DI (reg/v/f:DI x2)) [1 S4 A32]))) ++ (cinsn 91 (use (reg/i:DI x2))) ++ (cinsn 92 (use (reg/i:DI x3))) ++ ++ /* mov x2, x0. */ ++ (cinsn 248 (set (reg/i:DI x2) ++ (reg/i:DI x0))) ++ /* str x0, [x2, 16]. */ ++ (cinsn 106 (set (mem:DI (plus:DI (reg/v/f:DI x2) ++ (const_int 16)) [1 S4 A32]) ++ (reg:DI x0))) ++ /* ldp x3, x4, [x2, 16]!. */ ++ (cinsn 93 (set (reg/v:TI x3 [1 S4 A32]) ++ (mem:TI (pre_inc:DI (reg/v/f:DI x2)) [1 S4 A32]))) ++ (cinsn 94 (use (reg/i:DI x3))) ++ (cinsn 95 (use (reg/i:DI x4))) ++ + (cinsn 11 (use (reg/i:DI sp))) + (cinsn 12 (use (reg/i:DI cc))) + (cinsn 13 (use (reg/i:DI x29))) + (cinsn 14 (use (reg/i:DI x30))) + (cinsn 15 (use (reg/i:DI x0))) + (cinsn 16 (use (reg/i:DI x3))) +- (cinsn 17 (use (reg/i:DI x5))) + (cinsn 18 (use (reg/i:DI x1))) + (cinsn 19 (use (reg/i:DI x4))) +- (cinsn 20 (use (reg/i:DI x6))) + (edge-to exit (flags "FALLTHRU")) + ) ;; block 2 + ) ;; insn-chain +-- +2.33.0 + diff --git a/0176-Fix-bugs-in-ICP-src-openEuler-gcc-I8PYBF-I8PYLL.patch b/0176-Fix-bugs-in-ICP-src-openEuler-gcc-I8PYBF-I8PYLL.patch new file mode 100644 index 0000000..0b69fe7 --- /dev/null +++ b/0176-Fix-bugs-in-ICP-src-openEuler-gcc-I8PYBF-I8PYLL.patch @@ -0,0 +1,61 @@ +From d2742041454dbd4c4c3c3e0a27b5fb26d1e05832 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia WX1215920 +Date: Thu, 21 Dec 2023 11:14:06 +0300 +Subject: [PATCH 5/5] Fix bugs in ICP (src-openEuler/gcc: I8PYBF, I8PYLL) + +--- + gcc/ipa-devirt.c | 17 +++++++++++++++-- + 1 file changed, 15 insertions(+), 2 deletions(-) + +diff --git a/gcc/ipa-devirt.c b/gcc/ipa-devirt.c +index fbde7eb94..a18cbe36a 100644 +--- a/gcc/ipa-devirt.c ++++ b/gcc/ipa-devirt.c +@@ -4399,6 +4399,11 @@ print_type_set(unsigned ftype_uid, type_alias_map *map) + if (!map->count (ftype_uid)) + return; + type_set* s = (*map)[ftype_uid]; ++ if (!s) ++ { ++ fprintf (dump_file, "%d (no set)", ftype_uid); ++ return; ++ } + for (type_set::const_iterator it = s->begin (); it != s->end (); it++) + fprintf (dump_file, it == s->begin () ? 
"%d" : ", %d", *it); + } +@@ -4966,7 +4971,8 @@ analyze_assign_stmt (gimple *stmt) + { + rhs = TREE_OPERAND (rhs, 0); + if (VAR_OR_FUNCTION_DECL_P (rhs) || TREE_CODE (rhs) == STRING_CST +- || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL) ++ || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL ++ || TREE_CODE (rhs) == LABEL_DECL) + rhs_type = build_pointer_type (TREE_TYPE (rhs)); + else if (TREE_CODE (rhs) == COMPONENT_REF) + { +@@ -4980,7 +4986,12 @@ analyze_assign_stmt (gimple *stmt) + gcc_assert (POINTER_TYPE_P (rhs_type)); + } + else +- gcc_unreachable(); ++ { ++ fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ", ++ get_tree_code_name (TREE_CODE (rhs))); ++ print_gimple_stmt (dump_file, stmt, 0); ++ gcc_unreachable (); ++ } + } + else + rhs_type = TREE_TYPE (rhs); +@@ -5678,6 +5689,8 @@ merge_fs_map_for_ftype_aliases () + decl_set *d_set = it1->second; + tree type = (*type_uid_map)[it1->first]; + type_set *set = (*fta_map)[it1->first]; ++ if (!set) ++ continue; + for (type_set::const_iterator it2 = set->begin (); + it2 != set->end (); it2++) + { +-- +2.33.0 + diff --git a/0177-Fix-sqlite-build.patch b/0177-Fix-sqlite-build.patch new file mode 100644 index 0000000..e04e604 --- /dev/null +++ b/0177-Fix-sqlite-build.patch @@ -0,0 +1,168 @@ +From 71a992aca88f63ec1afb1608619b82a857d8e297 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia +Date: Fri, 22 Dec 2023 10:11:24 +0800 +Subject: [PATCH 1/4] Fix sqlite build + +--- + gcc/ipa-prefetch.c | 71 ++++++++++++++++++++++++++-------------------- + gcc/ipa-sra.c | 7 +++++ + 2 files changed, 47 insertions(+), 31 deletions(-) + +diff --git a/gcc/ipa-prefetch.c b/gcc/ipa-prefetch.c +index d8bb9a251..371702ad8 100644 +--- a/gcc/ipa-prefetch.c ++++ b/gcc/ipa-prefetch.c +@@ -1092,6 +1092,15 @@ analyse_loops () + memref_t *mr = it->first, *mr2 = it->second; + if (mr2 == NULL || !(*fmrs_map)[fn]->count (mr)) + continue; ++ /* For now optimize only MRs that mem is MEM_REF. ++ TODO: support other MR types. */ ++ if (TREE_CODE (mr->mem) != MEM_REF) ++ { ++ if (dump_file) ++ fprintf (dump_file, "Skip MR %d: unsupported tree code = %s\n", ++ mr->mr_id, get_tree_code_name (TREE_CODE (mr->mem))); ++ continue; ++ } + if (!optimize_mrs_map->count (fn)) + (*optimize_mrs_map)[fn] = new memref_set; + (*optimize_mrs_map)[fn]->insert (mr); +@@ -1104,7 +1113,7 @@ analyse_loops () + it != (*optimize_mrs_map)[fn]->end (); it++) + { + memref_t *mr = *it, *mr2 = (*mr_candidate_map)[mr]; +- fprintf (dump_file, "MRs %d,%d with incremental offset ", ++ fprintf (dump_file, "MRs %d, %d with incremental offset ", + mr->mr_id, mr2->mr_id); + print_generic_expr (dump_file, mr2->offset); + fprintf (dump_file, "\n"); +@@ -1437,6 +1446,27 @@ remap_gimple_op_r (tree *tp, int *walk_subtrees, void *data) + return NULL_TREE; + } + ++/* Copy stmt and remap its operands. */ ++ ++static gimple * ++gimple_copy_and_remap (gimple *stmt) ++{ ++ gimple *copy = gimple_copy (stmt); ++ gcc_checking_assert (!is_gimple_debug (copy)); ++ ++ /* Remap all the operands in COPY. */ ++ struct walk_stmt_info wi; ++ memset (&wi, 0, sizeof (wi)); ++ wi.info = copy; ++ walk_gimple_op (copy, remap_gimple_op_r, &wi); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Stmt copy after remap:\n"); ++ print_gimple_stmt (dump_file, copy, 0); ++ } ++ return copy; ++} ++ + static void + create_cgraph_edge (cgraph_node *n, gimple *stmt) + { +@@ -1585,7 +1615,6 @@ optimize_function (cgraph_node *n, function *fn) + /* Create other new vars. Insert new stmts. 
*/ + struct walk_stmt_info wi; + stmt_set processed_stmts; +- memref_tree_map mr_new_trees; + for (memref_set::const_iterator it = used_mrs.begin (); + it != used_mrs.end (); it++) + { +@@ -1606,23 +1635,10 @@ optimize_function (cgraph_node *n, function *fn) + } + /* Create a new copy of STMT and duplicate STMT's virtual + operands. */ +- gimple *copy = gimple_copy (mr->stmts[i]); +- gcc_checking_assert (!is_gimple_debug (copy)); +- +- /* Remap all the operands in COPY. */ +- memset (&wi, 0, sizeof (wi)); +- last_stmt = copy; +- wi.info = copy; +- walk_gimple_op (copy, remap_gimple_op_r, &wi); +- if (dump_file) +- { +- fprintf (dump_file, "Stmt %d after remap:\n",i); +- print_gimple_stmt (dump_file, copy, 0); +- } +- gimple_seq_add_stmt (&stmts, copy); ++ last_stmt = gimple_copy_and_remap (mr->stmts[i]); ++ gimple_seq_add_stmt (&stmts, last_stmt); + } + gcc_assert (last_stmt); +- mr_new_trees[mr] = gimple_assign_lhs (last_stmt); + if (dump_file) + { + fprintf (dump_file, "MR (%d) new mem: ", mr->mr_id); +@@ -1664,23 +1680,11 @@ optimize_function (cgraph_node *n, function *fn) + continue; + processed_stmts.insert (mr->stmts[i]); + +- gimple *copy = gimple_copy (mr->stmts[i]); +- gcc_checking_assert (!is_gimple_debug (copy)); +- +- /* Remap all the operands in COPY. */ +- memset (&wi, 0, sizeof (wi)); +- wi.info = copy; +- walk_gimple_op (copy, remap_gimple_op_r, &wi); +- if (dump_file) +- { +- fprintf (dump_file, "Stmt %d after remap:\n",i); +- print_gimple_stmt (dump_file, copy, 0); +- } ++ gimple *copy = gimple_copy_and_remap (mr->stmts[i]); + gimple_seq_add_stmt (&stmts, copy); + } + gimple *last_stmt = mr->stmts[0]; + gcc_assert (last_stmt); +- mr_new_trees[mr] = gimple_assign_lhs (last_stmt); + tree write_p = mr->is_store ? integer_one_node : integer_zero_node; + tree addr = get_mem_ref_address_ssa_name (mr->mem, NULL_TREE); + if (decl_map->count (addr)) +@@ -1689,6 +1693,11 @@ optimize_function (cgraph_node *n, function *fn) + 3, addr, write_p, local); + pcalls.safe_push (last_stmt); + gimple_seq_add_stmt (&stmts, last_stmt); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Insert %d prefetch stmt:\n", j); ++ print_gimple_stmt (dump_file, last_stmt, 0); ++ } + } + + gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT); +@@ -1827,7 +1836,7 @@ pass_ipa_prefetch::gate (function *) + /* Don't bother doing anything if the program has errors. */ + && !seen_error () + && flag_lto_partition == LTO_PARTITION_ONE +- /* Only enable struct optimizations in lto or whole_program. */ ++ /* Only enable prefetch optimizations in lto or whole_program. */ + && (in_lto_p || flag_whole_program)); + } + +diff --git a/gcc/ipa-sra.c b/gcc/ipa-sra.c +index d7019ec42..ee927bf6a 100644 +--- a/gcc/ipa-sra.c ++++ b/gcc/ipa-sra.c +@@ -3448,6 +3448,13 @@ param_splitting_across_edge (cgraph_edge *cs) + gcc_checking_assert (from_ifs && from_ifs->m_parameters); + + isra_call_summary *csum = call_sums->get (cs); ++ /* TODO: implement better support for call edges inserted after summary ++ collection but before sra wpa invocation. 
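++ Until then, create the summary on demand and mark its return ignored.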
*/ ++ if (!csum) ++ { ++ csum = call_sums->get_create (cs); ++ csum->m_return_ignored = true; ++ } + gcc_checking_assert (csum); + unsigned args_count = csum->m_arg_flow.length (); + isra_func_summary *to_ifs = func_sums->get (callee); +-- +2.33.0 + diff --git a/0178-Fix-freetype-build.patch b/0178-Fix-freetype-build.patch new file mode 100644 index 0000000..bb1dd41 --- /dev/null +++ b/0178-Fix-freetype-build.patch @@ -0,0 +1,52 @@ +From b187b3043c5a7aa96e6d1106e4b0f37d14c914a6 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia +Date: Fri, 22 Dec 2023 11:39:09 +0800 +Subject: [PATCH 2/4] Fix freetype build + +--- + gcc/ipa-prefetch.c | 17 +++++++++++++++-- + 1 file changed, 15 insertions(+), 2 deletions(-) + +diff --git a/gcc/ipa-prefetch.c b/gcc/ipa-prefetch.c +index 371702ad8..f91ac3edc 100644 +--- a/gcc/ipa-prefetch.c ++++ b/gcc/ipa-prefetch.c +@@ -1522,6 +1522,13 @@ optimize_function (cgraph_node *n, function *fn) + "Skip the case.\n"); + return 0; + } ++ if (!tree_fits_shwi_p (inc_mr->step)) ++ { ++ if (dump_file) ++ fprintf (dump_file, "Cannot represent incremental MR's step as " ++ "integer. Skip the case.\n"); ++ return 0; ++ } + if (dump_file && !used_mrs.empty ()) + print_mrs_ids (used_mrs, "Common list of used mrs:\n"); + +@@ -1607,13 +1614,19 @@ optimize_function (cgraph_node *n, function *fn) + else + inc_code = PLUS_EXPR; + tree step = inc_mr->step; +- unsigned dist_val = tree_to_uhwi (step) * param_ipa_prefetch_distance_factor; ++ HOST_WIDE_INT dist_val = tree_to_shwi (step) ++ * param_ipa_prefetch_distance_factor; + tree dist = build_int_cst (TREE_TYPE (step), dist_val); + tree new_inc_var = gimple_build (&stmts, inc_code, var_type, inc_var, dist); + (*decl_map)[inc_var] = new_inc_var; ++ if (dump_file) ++ { ++ fprintf (dump_file, "New distance value: %ld, new inc var: ", dist_val); ++ print_generic_expr (dump_file, new_inc_var); ++ fprintf (dump_file, "\n"); ++ } + + /* Create other new vars. Insert new stmts. */ +- struct walk_stmt_info wi; + stmt_set processed_stmts; + for (memref_set::const_iterator it = used_mrs.begin (); + it != used_mrs.end (); it++) +-- +2.33.0 + diff --git a/0179-rtl-ifcvt-refuse-to-rename-def-in-the-last-instructi.patch b/0179-rtl-ifcvt-refuse-to-rename-def-in-the-last-instructi.patch new file mode 100644 index 0000000..2edbf11 --- /dev/null +++ b/0179-rtl-ifcvt-refuse-to-rename-def-in-the-last-instructi.patch @@ -0,0 +1,29 @@ +From 3d1b0da292e383ce2a139c1612ec7e07336bbcd8 Mon Sep 17 00:00:00 2001 +From: vchernon +Date: Fri, 22 Dec 2023 22:05:27 +0800 +Subject: [PATCH 3/4] [rtl-ifcvt] refuse to rename def in the last instruction + in BB + +--- + gcc/ifcvt.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c +index 3df0f6fdd..025eb6cd1 100644 +--- a/gcc/ifcvt.c ++++ b/gcc/ifcvt.c +@@ -2176,7 +2176,10 @@ noce_rename_regs_in_bb (basic_block test_bb, bitmap rename_regs) + rtx x = SET_DEST (sset); + if (!REG_P (x) || !bitmap_bit_p (rename_regs, REGNO (x))) + continue; +- ++ /* Do not need to rename dest in the last instruction ++ it will be renamed anyway. 
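++ Renaming it here would only introduce a redundant temporary.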
*/ ++ if (insn == last_insn) ++ continue; + machine_mode mode = GET_MODE (x); + rtx tmp = gen_reg_rtx (mode); + if (!validate_replace_rtx_part (x, tmp, &SET_DEST (sset), insn)) +-- +2.33.0 + diff --git a/0180-add-optimization-level-requirement-to-the-gate.patch b/0180-add-optimization-level-requirement-to-the-gate.patch new file mode 100644 index 0000000..068d36f --- /dev/null +++ b/0180-add-optimization-level-requirement-to-the-gate.patch @@ -0,0 +1,25 @@ +From aa66bcf2b684655d0fbcc6b4543ffef1b2e37288 Mon Sep 17 00:00:00 2001 +From: vchernon +Date: Thu, 28 Dec 2023 10:44:35 +0800 +Subject: [PATCH] add optimization level requirement to the gate. + +--- + gcc/crypto-accel.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/gcc/crypto-accel.c b/gcc/crypto-accel.c +index f4e810a6b..e7766a585 100644 +--- a/gcc/crypto-accel.c ++++ b/gcc/crypto-accel.c +@@ -2391,7 +2391,7 @@ public: + /* opt_pass methods: */ + virtual bool gate (function *) + { +- if (flag_crypto_accel_aes <= 0) ++ if (flag_crypto_accel_aes <= 0 || optimize < 1) + return false; + return targetm.get_v16qi_mode + && targetm.gen_rev32v16qi +-- +2.33.0 + diff --git a/0181-Fix-issue-I8QD9H.patch b/0181-Fix-issue-I8QD9H.patch new file mode 100644 index 0000000..fddd01f --- /dev/null +++ b/0181-Fix-issue-I8QD9H.patch @@ -0,0 +1,115 @@ +From 25f1ebeb88a4eae247f58488cac9da878f188d9f Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia +Date: Sat, 23 Dec 2023 10:05:10 +0800 +Subject: [PATCH 4/4] Fix issue I8QD9H + +--- + gcc/ipa-prefetch.c | 64 +++++++++++++++++++++++++++------------------- + 1 file changed, 37 insertions(+), 27 deletions(-) + +diff --git a/gcc/ipa-prefetch.c b/gcc/ipa-prefetch.c +index f91ac3edc..a471b118e 100644 +--- a/gcc/ipa-prefetch.c ++++ b/gcc/ipa-prefetch.c +@@ -1467,6 +1467,31 @@ gimple_copy_and_remap (gimple *stmt) + return copy; + } + ++/* Copy and remap stmts listed in MR in reverse order to last_idx, skipping ++ processed ones. Insert new stmts to the sequence. */ ++ ++static gimple * ++gimple_copy_and_remap_memref_stmts (memref_t *mr, gimple_seq &stmts, ++ int last_idx, stmt_set &processed) ++{ ++ gimple *last_stmt = NULL; ++ for (int i = mr->stmts.length () - 1; i >= last_idx ; i--) ++ { ++ if (processed.count (mr->stmts[i])) ++ continue; ++ processed.insert (mr->stmts[i]); ++ if (dump_file) ++ { ++ fprintf (dump_file, "Copy stmt %d from used MR (%d):\n", ++ i, mr->mr_id); ++ print_gimple_stmt (dump_file, mr->stmts[i], 0); ++ } ++ last_stmt = gimple_copy_and_remap (mr->stmts[i]); ++ gimple_seq_add_stmt (&stmts, last_stmt); ++ } ++ return last_stmt; ++} ++ + static void + create_cgraph_edge (cgraph_node *n, gimple *stmt) + { +@@ -1606,7 +1631,16 @@ optimize_function (cgraph_node *n, function *fn) + decl_map = new tree_map; + gcc_assert (comp_mr->stmts[0] && gimple_assign_single_p (comp_mr->stmts[0])); + tree inc_var = gimple_assign_lhs (comp_mr->stmts[0]); ++ /* If old_var definition dominates the current use, just use it, otherwise ++ evaluate it just before new inc var evaluation. */ + gimple_seq stmts = NULL; ++ stmt_set processed_stmts; ++ if (!dominated_by_p (CDI_DOMINATORS, dom_bb, gimple_bb (comp_mr->stmts[0]))) ++ { ++ gimple *tmp = gimple_copy_and_remap_memref_stmts (comp_mr, stmts, 0, ++ processed_stmts); ++ inc_var = gimple_assign_lhs (tmp); ++ } + tree var_type = TREE_TYPE (inc_var); + enum tree_code inc_code; + if (TREE_CODE (var_type) == POINTER_TYPE) +@@ -1627,30 +1661,14 @@ optimize_function (cgraph_node *n, function *fn) + } + + /* Create other new vars. Insert new stmts. 
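+ Both sides must be pointer types for this analysis.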
*/ +- stmt_set processed_stmts; + for (memref_set::const_iterator it = used_mrs.begin (); + it != used_mrs.end (); it++) + { + memref_t *mr = *it; +- gimple *last_stmt = NULL; + if (mr == comp_mr) + continue; +- for (int i = mr->stmts.length () - 1; i >= 0 ; i--) +- { +- if (processed_stmts.count (mr->stmts[i])) +- continue; +- processed_stmts.insert (mr->stmts[i]); +- if (dump_file) +- { +- fprintf (dump_file, "Copy stmt %d from used MR (%d):\n", +- i, mr->mr_id); +- print_gimple_stmt (dump_file, mr->stmts[i], 0); +- } +- /* Create a new copy of STMT and duplicate STMT's virtual +- operands. */ +- last_stmt = gimple_copy_and_remap (mr->stmts[i]); +- gimple_seq_add_stmt (&stmts, last_stmt); +- } ++ gimple *last_stmt = gimple_copy_and_remap_memref_stmts (mr, stmts, 0, ++ processed_stmts); + gcc_assert (last_stmt); + if (dump_file) + { +@@ -1687,15 +1705,7 @@ optimize_function (cgraph_node *n, function *fn) + memref_t *mr = vmrs[j]; + /* Don't need to copy the last stmt, since we insert prefetch insn + instead of it. */ +- for (int i = mr->stmts.length () - 1; i >= 1 ; i--) +- { +- if (processed_stmts.count (mr->stmts[i])) +- continue; +- processed_stmts.insert (mr->stmts[i]); +- +- gimple *copy = gimple_copy_and_remap (mr->stmts[i]); +- gimple_seq_add_stmt (&stmts, copy); +- } ++ gimple_copy_and_remap_memref_stmts (mr, stmts, 1, processed_stmts); + gimple *last_stmt = mr->stmts[0]; + gcc_assert (last_stmt); + tree write_p = mr->is_store ? integer_one_node : integer_zero_node; +-- +2.33.0 + diff --git a/0182-Fix-bugs-in-ICP-src-openEuler-gcc-I8RKFJ.patch b/0182-Fix-bugs-in-ICP-src-openEuler-gcc-I8RKFJ.patch new file mode 100644 index 0000000..b7fce8a --- /dev/null +++ b/0182-Fix-bugs-in-ICP-src-openEuler-gcc-I8RKFJ.patch @@ -0,0 +1,47 @@ +From 1724319692f3c5443802e0cef44a81667cfcc0ae Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia +Date: Wed, 27 Dec 2023 07:29:26 +0800 +Subject: [PATCH 1/4] Fix bugs in ICP (src-openEuler/gcc: I8RKFJ) + +--- + gcc/ipa-devirt.c | 13 ++++++++++--- + 1 file changed, 10 insertions(+), 3 deletions(-) + +diff --git a/gcc/ipa-devirt.c b/gcc/ipa-devirt.c +index a18cbe36a..987f15a15 100644 +--- a/gcc/ipa-devirt.c ++++ b/gcc/ipa-devirt.c +@@ -4669,12 +4669,19 @@ maybe_register_aliases (tree type1, tree type2) + if (register_ailas_type (type1, type2, ta_map)) + analyze_pointees (type1, type2); + } ++ unsigned type1_uid = TYPE_UID (type1); ++ unsigned type2_uid = TYPE_UID (type2); ++ if (type_uid_map->count (type1_uid) == 0) ++ (*type_uid_map)[type1_uid] = type1; ++ if (type_uid_map->count (type2_uid) == 0) ++ (*type_uid_map)[type2_uid] = type2; ++ + /* If function and non-function type pointers alias, + the function type is unsafe. */ + if (FUNCTION_POINTER_TYPE_P (type1) && !FUNCTION_POINTER_TYPE_P (type2)) +- unsafe_types->insert (TYPE_UID (type1)); ++ unsafe_types->insert (type1_uid); + if (FUNCTION_POINTER_TYPE_P (type2) && !FUNCTION_POINTER_TYPE_P (type1)) +- unsafe_types->insert (TYPE_UID (type2)); ++ unsafe_types->insert (type2_uid); + + /* Try to figure out with pointers to incomplete types. 
*/ + if (POINTER_TYPE_P (type1) && POINTER_TYPE_P (type2)) +@@ -4972,7 +4979,7 @@ analyze_assign_stmt (gimple *stmt) + rhs = TREE_OPERAND (rhs, 0); + if (VAR_OR_FUNCTION_DECL_P (rhs) || TREE_CODE (rhs) == STRING_CST + || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL +- || TREE_CODE (rhs) == LABEL_DECL) ++ || TREE_CODE (rhs) == LABEL_DECL || TREE_CODE (rhs) == CONST_DECL) + rhs_type = build_pointer_type (TREE_TYPE (rhs)); + else if (TREE_CODE (rhs) == COMPONENT_REF) + { +-- +2.33.0 + diff --git a/0183-Fix-fail-in-ICP-src-openEuler-gcc-I8RP4H.patch b/0183-Fix-fail-in-ICP-src-openEuler-gcc-I8RP4H.patch new file mode 100644 index 0000000..8825443 --- /dev/null +++ b/0183-Fix-fail-in-ICP-src-openEuler-gcc-I8RP4H.patch @@ -0,0 +1,26 @@ +From 351d049f09b1e96e48c3038ab3a6a9c1d6a13f8d Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia +Date: Thu, 28 Dec 2023 09:51:34 +0800 +Subject: [PATCH 2/4] Fix fail in ICP (src-openEuler/gcc: I8RP4H) + +--- + gcc/ipa-devirt.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/gcc/ipa-devirt.c b/gcc/ipa-devirt.c +index 987f15a15..ed19a57a4 100644 +--- a/gcc/ipa-devirt.c ++++ b/gcc/ipa-devirt.c +@@ -4979,7 +4979,8 @@ analyze_assign_stmt (gimple *stmt) + rhs = TREE_OPERAND (rhs, 0); + if (VAR_OR_FUNCTION_DECL_P (rhs) || TREE_CODE (rhs) == STRING_CST + || TREE_CODE (rhs) == ARRAY_REF || TREE_CODE (rhs) == PARM_DECL +- || TREE_CODE (rhs) == LABEL_DECL || TREE_CODE (rhs) == CONST_DECL) ++ || TREE_CODE (rhs) == LABEL_DECL || TREE_CODE (rhs) == CONST_DECL ++ || TREE_CODE (rhs) == RESULT_DECL) + rhs_type = build_pointer_type (TREE_TYPE (rhs)); + else if (TREE_CODE (rhs) == COMPONENT_REF) + { +-- +2.33.0 + diff --git a/0184-Fix-fail-in-IPA-prefetch-src-openEuler-gcc-I8RURA.patch b/0184-Fix-fail-in-IPA-prefetch-src-openEuler-gcc-I8RURA.patch new file mode 100644 index 0000000..fb053ed --- /dev/null +++ b/0184-Fix-fail-in-IPA-prefetch-src-openEuler-gcc-I8RURA.patch @@ -0,0 +1,45 @@ +From 2a5c250262ec0497a5efbbd1d0d67e7147696074 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia +Date: Thu, 28 Dec 2023 20:20:16 +0800 +Subject: [PATCH 1/2] Fix fail in IPA prefetch (src-openEuler/gcc: I8RURA) + +--- + gcc/ipa-devirt.c | 9 ++++++--- + gcc/ipa-prefetch.c | 1 + + 2 files changed, 7 insertions(+), 3 deletions(-) + +diff --git a/gcc/ipa-devirt.c b/gcc/ipa-devirt.c +index ed19a57a4..9863084e4 100644 +--- a/gcc/ipa-devirt.c ++++ b/gcc/ipa-devirt.c +@@ -4995,9 +4995,12 @@ analyze_assign_stmt (gimple *stmt) + } + else + { +- fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ", +- get_tree_code_name (TREE_CODE (rhs))); +- print_gimple_stmt (dump_file, stmt, 0); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\nUnsupported rhs type %s in assign stmt: ", ++ get_tree_code_name (TREE_CODE (rhs))); ++ print_gimple_stmt (dump_file, stmt, 0); ++ } + gcc_unreachable (); + } + } +diff --git a/gcc/ipa-prefetch.c b/gcc/ipa-prefetch.c +index a471b118e..24cb4424a 100644 +--- a/gcc/ipa-prefetch.c ++++ b/gcc/ipa-prefetch.c +@@ -1730,6 +1730,7 @@ optimize_function (cgraph_node *n, function *fn) + for (unsigned i = 0; i < pcalls.length (); i++) + create_cgraph_edge (n, pcalls[i]); + ipa_update_overall_fn_summary (n); ++ renumber_gimple_stmt_uids (DECL_STRUCT_FUNCTION (n->decl)); + + return 1; + } +-- +2.33.0 + diff --git a/0185-Fix-fail-in-IPA-prefetch-src-openEuler-gcc-I8RV7T.patch b/0185-Fix-fail-in-IPA-prefetch-src-openEuler-gcc-I8RV7T.patch new file mode 100644 index 0000000..417b050 --- /dev/null +++ 
b/0185-Fix-fail-in-IPA-prefetch-src-openEuler-gcc-I8RV7T.patch @@ -0,0 +1,26 @@ +From 4014d651825c3e03e0ad2eabeddcfb94f5f00e68 Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia +Date: Fri, 29 Dec 2023 05:59:00 +0800 +Subject: [PATCH 2/2] Fix fail in IPA prefetch (src-openEuler/gcc: I8RV7T) + +--- + gcc/ipa-prefetch.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/gcc/ipa-prefetch.c b/gcc/ipa-prefetch.c +index 24cb4424a..d9456519c 100644 +--- a/gcc/ipa-prefetch.c ++++ b/gcc/ipa-prefetch.c +@@ -943,6 +943,9 @@ compare_memrefs (memref_t* mr, memref_t* mr2) + (*mr_candidate_map)[mr] = mr2; + return; + } ++ /* Probably we shouldn't leave nulls in the map. */ ++ if ((*mr_candidate_map)[mr] == NULL) ++ return; + /* TODO: support analysis with incrementation of different fields. */ + if ((*mr_candidate_map)[mr]->offset != mr2->offset) + { +-- +2.33.0 + diff --git a/0186-Loop-CRC-Solving-the-problem-of-insufficient-CRC-tab.patch b/0186-Loop-CRC-Solving-the-problem-of-insufficient-CRC-tab.patch new file mode 100644 index 0000000..388c6ab --- /dev/null +++ b/0186-Loop-CRC-Solving-the-problem-of-insufficient-CRC-tab.patch @@ -0,0 +1,26 @@ +From 92dc99425b2566e8cc9cba7cec8774911db0c654 Mon Sep 17 00:00:00 2001 +From: XingYuShuai <1150775134@qq.com> +Date: Fri, 2 Feb 2024 15:55:07 +0800 +Subject: [PATCH 1/3] [Loop CRC] Solving the problem of insufficient CRC table. + +--- + gcc/tree-ssa-loop-crc.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/gcc/tree-ssa-loop-crc.c b/gcc/tree-ssa-loop-crc.c +index 2dd9e1e3b..26f8e64d1 100644 +--- a/gcc/tree-ssa-loop-crc.c ++++ b/gcc/tree-ssa-loop-crc.c +@@ -421,7 +421,8 @@ match_crc_table (tree crc_table) + tree low_bound = array_ref_low_bound (crc_table); + tree up_bound = array_ref_up_bound (crc_table); + tree element_size = array_ref_element_size (crc_table); +- if (low_bound == NULL || up_bound == NULL || element_size == NULL) ++ if (!tree_fits_uhwi_p(low_bound) || !tree_fits_uhwi_p(up_bound) || ++ !tree_fits_uhwi_p(element_size)) + return false; + unsigned HOST_WIDE_INT lb = tree_to_uhwi (low_bound); + unsigned HOST_WIDE_INT ub = tree_to_uhwi (up_bound); +-- +2.33.0 + diff --git a/0187-Add-IPA-prefetch-test.patch b/0187-Add-IPA-prefetch-test.patch new file mode 100644 index 0000000..f804a27 --- /dev/null +++ b/0187-Add-IPA-prefetch-test.patch @@ -0,0 +1,1862 @@ +From 748ca709302cf9dd9616b034b35c90420f3ea54d Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia +Date: Sat, 30 Dec 2023 18:08:54 +0800 +Subject: [PATCH 2/3] Add IPA prefetch test + +--- + gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c | 1843 ++++++++++++++++++++ + 1 file changed, 1843 insertions(+) + create mode 100644 gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c + +diff --git a/gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c b/gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c +new file mode 100644 +index 000000000..bd4fb2bdc +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/ipa/ipa-prefetch-xz.c +@@ -0,0 +1,1843 @@ ++/* { dg-do link } */ ++/* { dg-options "-O3 -fipa-ic -fipa-prefetch -flto -flto-partition=one -fdump-ipa-ipa_prefetch -fdump-ipa-icp" } */ ++/* { dg-require-effective-target lto } */ ++ ++/* Based on opensource xz code. 
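++ Tables and the normal-mode encoder are stubbed out for this test.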
*/ ++ ++#include ++#include ++ ++typedef long int ptrdiff_t; ++typedef long unsigned int size_t; ++typedef unsigned int wchar_t; ++ ++typedef unsigned char __u_char; ++typedef unsigned short int __u_short; ++typedef unsigned int __u_int; ++typedef unsigned long int __u_long; ++ ++typedef signed char __int8_t; ++typedef unsigned char __uint8_t; ++typedef signed short int __int16_t; ++typedef unsigned short int __uint16_t; ++typedef signed int __int32_t; ++typedef unsigned int __uint32_t; ++ ++typedef signed long int __int64_t; ++typedef unsigned long int __uint64_t; ++ ++typedef __int8_t __int_least8_t; ++typedef __uint8_t __uint_least8_t; ++typedef __int16_t __int_least16_t; ++typedef __uint16_t __uint_least16_t; ++typedef __int32_t __int_least32_t; ++typedef __uint32_t __uint_least32_t; ++typedef __int64_t __int_least64_t; ++typedef __uint64_t __uint_least64_t; ++ ++typedef __int8_t int8_t; ++typedef __int16_t int16_t; ++typedef __int32_t int32_t; ++typedef __int64_t int64_t; ++ ++typedef __uint8_t uint8_t; ++typedef __uint16_t uint16_t; ++typedef __uint32_t uint32_t; ++typedef __uint64_t uint64_t; ++ ++typedef long int intptr_t; ++typedef unsigned long int uintptr_t; ++ ++static inline uint16_t ++read16ne(const uint8_t *buf) ++{ ++ uint16_t num; ++ memcpy(&num, buf, sizeof(num)); ++ return num; ++} ++ ++static inline uint32_t ++read32ne(const uint8_t *buf) ++{ ++ uint32_t num; ++ memcpy(&num, buf, sizeof(num)); ++ return num; ++} ++ ++static inline uint16_t ++aligned_read16ne(const uint8_t *buf) ++{ ++ uint16_t num; ++ memcpy(&num, __builtin_assume_aligned(buf, sizeof(num)), sizeof(num)); ++ return num; ++} ++ ++ ++static inline uint32_t ++aligned_read32ne(const uint8_t *buf) ++{ ++ uint32_t num; ++ memcpy(&num, __builtin_assume_aligned(buf, sizeof(num)), sizeof(num)); ++ return num; ++} ++ ++static inline uint64_t ++aligned_read64ne(const uint8_t *buf) ++{ ++ uint64_t num; ++ memcpy(&num, __builtin_assume_aligned(buf, sizeof(num)), sizeof(num)); ++ return num; ++} ++ ++typedef unsigned char lzma_bool; ++ ++typedef enum { ++ LZMA_RESERVED_ENUM = 0 ++} lzma_reserved_enum; ++ ++typedef enum { ++ LZMA_OK = 0, ++ LZMA_STREAM_END = 1, ++ LZMA_NO_CHECK = 2, ++ LZMA_UNSUPPORTED_CHECK = 3, ++ LZMA_GET_CHECK = 4, ++ LZMA_MEM_ERROR = 5, ++ LZMA_MEMLIMIT_ERROR = 6, ++ LZMA_FORMAT_ERROR = 7, ++ LZMA_OPTIONS_ERROR = 8, ++ LZMA_DATA_ERROR = 9, ++ LZMA_BUF_ERROR = 10, ++ LZMA_PROG_ERROR = 11, ++} lzma_ret; ++ ++typedef enum { ++ LZMA_RUN = 0, ++ LZMA_SYNC_FLUSH = 1, ++ LZMA_FULL_FLUSH = 2, ++ LZMA_FULL_BARRIER = 4, ++ LZMA_FINISH = 3 ++} lzma_action; ++ ++typedef struct { ++ void *( *alloc)(void *opaque, size_t nmemb, size_t size); ++ ++ void ( *free)(void *opaque, void *ptr); ++ ++ void *opaque; ++} lzma_allocator; ++ ++typedef uint64_t lzma_vli; ++ ++typedef enum { ++ LZMA_CHECK_NONE = 0, ++ LZMA_CHECK_CRC32 = 1, ++ LZMA_CHECK_CRC64 = 4, ++ LZMA_CHECK_SHA256 = 10 ++} lzma_check; ++ ++typedef struct { ++ lzma_vli id; ++ void *options; ++} lzma_filter; ++ ++typedef enum { ++ LZMA_MF_HC3 = 0x03, ++ LZMA_MF_HC4 = 0x04, ++ LZMA_MF_BT2 = 0x12, ++ LZMA_MF_BT3 = 0x13, ++ LZMA_MF_BT4 = 0x14 ++} lzma_match_finder; ++ ++typedef struct lzma_next_coder_s lzma_next_coder; ++ ++typedef struct lzma_filter_info_s lzma_filter_info; ++ ++typedef lzma_ret (*lzma_init_function)( ++ lzma_next_coder *next, const lzma_allocator *allocator, ++ const lzma_filter_info *filters); ++ ++typedef lzma_ret (*lzma_code_function)( ++ void *coder, const lzma_allocator *allocator, ++ const uint8_t *restrict in, size_t *restrict in_pos, 
++ size_t in_size, uint8_t *restrict out, ++ size_t *restrict out_pos, size_t out_size, ++ lzma_action action); ++ ++typedef void (*lzma_end_function)( ++ void *coder, const lzma_allocator *allocator); ++ ++struct lzma_filter_info_s { ++ lzma_vli id; ++ lzma_init_function init; ++ void *options; ++}; ++ ++struct lzma_next_coder_s { ++ void *coder; ++ lzma_vli id; ++ uintptr_t init; ++ ++ lzma_code_function code; ++ lzma_end_function end; ++ void (*get_progress)(void *coder, ++ uint64_t *progress_in, uint64_t *progress_out); ++ ++ lzma_check (*get_check)(const void *coder); ++ lzma_ret (*memconfig)(void *coder, uint64_t *memusage, ++ uint64_t *old_memlimit, uint64_t new_memlimit); ++ lzma_ret (*update)(void *coder, const lzma_allocator *allocator, ++ const lzma_filter *filters, const lzma_filter *reversed_filters); ++}; ++ ++typedef struct { ++ uint32_t len; ++ uint32_t dist; ++} lzma_match; ++ ++typedef struct lzma_mf_s lzma_mf; ++struct lzma_mf_s { ++ uint8_t *buffer; ++ uint32_t size; ++ uint32_t keep_size_before; ++ uint32_t keep_size_after; ++ uint32_t offset; ++ uint32_t read_pos; ++ uint32_t read_ahead; ++ uint32_t read_limit; ++ uint32_t write_pos; ++ uint32_t pending; ++ uint32_t (*find)(lzma_mf *mf, lzma_match *matches); ++ void (*skip)(lzma_mf *mf, uint32_t num); ++ uint32_t *hash; ++ uint32_t *son; ++ uint32_t cyclic_pos; ++ uint32_t cyclic_size; ++ uint32_t hash_mask; ++ uint32_t depth; ++ uint32_t nice_len; ++ uint32_t match_len_max; ++ lzma_action action; ++ uint32_t hash_count; ++ uint32_t sons_count; ++}; ++ ++typedef struct { ++ size_t before_size; ++ size_t dict_size; ++ size_t after_size; ++ size_t match_len_max; ++ size_t nice_len; ++ lzma_match_finder match_finder; ++ uint32_t depth; ++ const uint8_t *preset_dict; ++ uint32_t preset_dict_size; ++} lzma_lz_options; ++ ++typedef struct { ++ void *coder; ++ lzma_ret (*code)(void *coder, ++ lzma_mf *restrict mf, uint8_t *restrict out, ++ size_t *restrict out_pos, size_t out_size); ++ void (*end)(void *coder, const lzma_allocator *allocator); ++ lzma_ret (*options_update)(void *coder, const lzma_filter *filter); ++} lzma_lz_encoder; ++ ++static inline const uint8_t * ++mf_ptr(const lzma_mf *mf) ++{ ++ return mf->buffer + mf->read_pos; ++} ++ ++static inline uint32_t ++mf_avail(const lzma_mf *mf) ++{ ++ return mf->write_pos - mf->read_pos; ++} ++ ++typedef struct { ++ uint32_t state[8]; ++ uint64_t size; ++} lzma_sha256_state; ++ ++typedef struct { ++ union { ++ uint8_t u8[64]; ++ uint32_t u32[16]; ++ uint64_t u64[8]; ++ } buffer; ++ union { ++ uint32_t crc32; ++ uint64_t crc64; ++ lzma_sha256_state sha256; ++ } state; ++} lzma_check_state; ++ ++// The table is constantly initialized in the original code. ++// Skip it in the test. ++const uint32_t lzma_crc32_table[8][256]; ++ ++static inline uint32_t __attribute__((__always_inline__)) ++lzma_memcmplen(const uint8_t *buf1, const uint8_t *buf2, ++ uint32_t len, uint32_t limit) ++{ ++ while (len < limit) { ++ uint32_t x = read32ne(buf1 + len) - read32ne(buf2 + len); ++ if (x != 0) { ++ if ((x & 0xFFFF) == 0) { ++ len += 2; ++ x >>= 16; ++ } ++ ++ if ((x & 0xFF) == 0) ++ ++len; ++ ++ return ((len) < (limit) ? 
(len) : (limit)); ++ } ++ ++ len += 4; ++ } ++ ++ return limit; ++} ++ ++extern uint32_t ++lzma_mf_find(lzma_mf *mf, uint32_t *count_ptr, lzma_match *matches) ++{ ++ const uint32_t count = mf->find(mf, matches); ++ uint32_t len_best = 0; ++ ++ if (count > 0) { ++ len_best = matches[count - 1].len; ++ if (len_best == mf->nice_len) { ++ uint32_t limit = mf_avail(mf) + 1; ++ if (limit > mf->match_len_max) ++ limit = mf->match_len_max; ++ const uint8_t *p1 = mf_ptr(mf) - 1; ++ const uint8_t *p2 = p1 - matches[count - 1].dist - 1; ++ len_best = lzma_memcmplen(p1, p2, len_best, limit); ++ } ++ } ++ ++ *count_ptr = count; ++ ++mf->read_ahead; ++ ++ return len_best; ++} ++ ++static void ++normalize(lzma_mf *mf) ++{ ++ const uint32_t subvalue = ((4294967295U) - mf->cyclic_size); ++ ++ for (uint32_t i = 0; i < mf->hash_count; ++i) { ++ if (mf->hash[i] <= subvalue) ++ mf->hash[i] = 0; ++ else ++ mf->hash[i] -= subvalue; ++ } ++ ++ for (uint32_t i = 0; i < mf->sons_count; ++i) { ++ if (mf->son[i] <= subvalue) ++ mf->son[i] = 0; ++ else ++ mf->son[i] -= subvalue; ++ } ++ ++ mf->offset -= subvalue; ++ return; ++} ++ ++static void ++move_pos(lzma_mf *mf) ++{ ++ if (++mf->cyclic_pos == mf->cyclic_size) ++ mf->cyclic_pos = 0; ++ ++mf->read_pos; ++ if (__builtin_expect(mf->read_pos + mf->offset == (4294967295U), 0 )) ++ normalize(mf); ++} ++ ++static void ++move_pending(lzma_mf *mf) ++{ ++ ++mf->read_pos; ++ ++mf->pending; ++} ++ ++static lzma_match * ++hc_find_func( ++ const uint32_t len_limit, ++ const uint32_t pos, ++ const uint8_t *const cur, ++ uint32_t cur_match, ++ uint32_t depth, ++ uint32_t *const son, ++ const uint32_t cyclic_pos, ++ const uint32_t cyclic_size, ++ lzma_match *matches, ++ uint32_t len_best) ++{ ++ son[cyclic_pos] = cur_match; ++ ++ while (1) { ++ const uint32_t delta = pos - cur_match; ++ if (depth-- == 0 || delta >= cyclic_size) ++ return matches; ++ ++ const uint8_t *const pb = cur - delta; ++ cur_match = son[cyclic_pos - delta ++ + (delta > cyclic_pos ? 
cyclic_size : 0)]; ++ ++ if (pb[len_best] == cur[len_best] && pb[0] == cur[0]) { ++ uint32_t len = lzma_memcmplen(pb, cur, 1, len_limit); ++ ++ if (len_best < len) { ++ len_best = len; ++ matches->len = len; ++ matches->dist = delta - 1; ++ ++matches; ++ ++ if (len == len_limit) ++ return matches; ++ } ++ } ++ } ++} ++ ++extern uint32_t ++lzma_mf_hc3_find(lzma_mf *mf, lzma_match *matches) ++{ ++ uint32_t len_limit = mf_avail(mf); ++ if (mf->nice_len <= len_limit) { ++ len_limit = mf->nice_len; ++ } else if (len_limit < (3)) { ++ move_pending(mf); ++ return 0; ++ } ++ const uint8_t *cur = mf_ptr(mf); ++ const uint32_t pos = mf->read_pos + mf->offset; ++ uint32_t matches_count = 0; ++ ++ const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; ++ const uint32_t hash_2_value = temp & ((1U << 10) - 1); ++ const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & mf->hash_mask; ++ ++ const uint32_t delta2 = pos - mf->hash[hash_2_value]; ++ const uint32_t cur_match = mf->hash[((1U << 10)) + hash_value]; ++ ++ mf->hash[hash_2_value] = pos; ++ mf->hash[((1U << 10)) + hash_value] = pos; ++ ++ uint32_t len_best = 2; ++ ++ if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) { ++ len_best = lzma_memcmplen(cur - delta2, cur, len_best, len_limit); ++ ++ matches[0].len = len_best; ++ matches[0].dist = delta2 - 1; ++ matches_count = 1; ++ ++ if (len_best == len_limit) { ++ mf->son[mf->cyclic_pos] = cur_match; ++ move_pos(mf); ++ return 1; ++ } ++ } ++ ++ matches_count = hc_find_func(len_limit, pos, cur, cur_match, mf->depth, ++ mf->son, mf->cyclic_pos, mf->cyclic_size, ++ matches + matches_count, len_best) - matches; ++ move_pos(mf); ++ return matches_count; ++} ++ ++extern void ++lzma_mf_hc3_skip(lzma_mf *mf, uint32_t amount) ++{ ++ do { ++ if (mf_avail(mf) < 3) { ++ move_pending(mf); ++ continue; ++ } ++ ++ const uint8_t *cur = mf_ptr(mf); ++ const uint32_t pos = mf->read_pos + mf->offset; ++ ++ const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; ++ const uint32_t hash_2_value = temp & ((1U << 10) - 1); ++ const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & mf->hash_mask; ++ ++ const uint32_t cur_match ++ = mf->hash[((1U << 10)) + hash_value]; ++ ++ mf->hash[hash_2_value] = pos; ++ mf->hash[((1U << 10)) + hash_value] = pos; ++ ++ do { mf->son[mf->cyclic_pos] = cur_match; move_pos(mf); } while (0); ++ ++ } while (--amount != 0); ++} ++ ++extern uint32_t ++lzma_mf_hc4_find(lzma_mf *mf, lzma_match *matches) ++{ ++ uint32_t len_limit = mf_avail(mf); ++ if (mf->nice_len <= len_limit) { ++ len_limit = mf->nice_len; ++ } else if (len_limit < (4)) { ++ move_pending(mf); ++ return 0; ++ } ++ const uint8_t *cur = mf_ptr(mf); ++ const uint32_t pos = mf->read_pos + mf->offset; ++ uint32_t matches_count = 0; ++ ++ const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; ++ const uint32_t hash_2_value = temp & ((1U << 10) - 1); ++ const uint32_t hash_3_value = (temp ^ ((uint32_t)(cur[2]) << 8)) ++ & ((1U << 16) - 1); ++ const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8) ++ ^ (lzma_crc32_table[0][cur[3]] << 5)) ++ & mf->hash_mask; ++ uint32_t delta2 = pos - mf->hash[hash_2_value]; ++ const uint32_t delta3 ++ = pos - mf->hash[((1U << 10)) + hash_3_value]; ++ const uint32_t cur_match = mf->hash[((1U << 10) + (1U << 16)) + hash_value]; ++ ++ mf->hash[hash_2_value ] = pos; ++ mf->hash[((1U << 10)) + hash_3_value] = pos; ++ mf->hash[((1U << 10) + (1U << 16)) + hash_value] = pos; ++ ++ uint32_t len_best = 1; ++ ++ if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) { ++ 
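++ /* Record an initial 2-byte match at the hash-2 distance. */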
len_best = 2; ++ matches[0].len = 2; ++ matches[0].dist = delta2 - 1; ++ matches_count = 1; ++ } ++ ++ if (delta2 != delta3 && delta3 < mf->cyclic_size ++ && *(cur - delta3) == *cur) { ++ len_best = 3; ++ matches[matches_count++].dist = delta3 - 1; ++ delta2 = delta3; ++ } ++ ++ if (matches_count != 0) { ++ len_best = lzma_memcmplen(cur - delta2, cur, ++ len_best, len_limit); ++ ++ matches[matches_count - 1].len = len_best; ++ ++ if (len_best == len_limit) { ++ mf->son[mf->cyclic_pos] = cur_match; move_pos(mf); ++ return matches_count; ++ } ++ } ++ ++ if (len_best < 3) ++ len_best = 3; ++ ++ matches_count = hc_find_func(len_limit, pos, cur, cur_match, mf->depth, ++ mf->son, mf->cyclic_pos, mf->cyclic_size, ++ matches + matches_count, len_best) - matches; ++ move_pos(mf); ++ return matches_count; ++} ++ ++extern void ++lzma_mf_hc4_skip(lzma_mf *mf, uint32_t amount) ++{ ++ do { ++ if (mf_avail(mf) < 4) { ++ move_pending(mf); ++ continue; ++ } ++ ++ const uint8_t *cur = mf_ptr(mf); ++ const uint32_t pos = mf->read_pos + mf->offset; ++ ++ const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; ++ const uint32_t hash_2_value = temp & ((1U << 10) - 1); ++ const uint32_t hash_3_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & ((1U << 16) - 1); ++ const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8) ++ ^ (lzma_crc32_table[0][cur[3]] << 5)) ++ & mf->hash_mask; ++ ++ const uint32_t cur_match ++ = mf->hash[((1U << 10) + (1U << 16)) + hash_value]; ++ ++ mf->hash[hash_2_value] = pos; ++ mf->hash[((1U << 10)) + hash_3_value] = pos; ++ mf->hash[((1U << 10) + (1U << 16)) + hash_value] = pos; ++ ++ mf->son[mf->cyclic_pos] = cur_match; ++ move_pos(mf); ++ } while (--amount != 0); ++} ++ ++static lzma_match * ++bt_find_func( ++ const uint32_t len_limit, ++ const uint32_t pos, ++ const uint8_t *const cur, ++ uint32_t cur_match, ++ uint32_t depth, ++ uint32_t *const son, ++ const uint32_t cyclic_pos, ++ const uint32_t cyclic_size, ++ lzma_match *matches, ++ uint32_t len_best) ++{ ++ uint32_t *ptr0 = son + (cyclic_pos << 1) + 1; ++ uint32_t *ptr1 = son + (cyclic_pos << 1); ++ ++ uint32_t len0 = 0; ++ uint32_t len1 = 0; ++ ++ while (1) { ++ const uint32_t delta = pos - cur_match; ++ if (depth-- == 0 || delta >= cyclic_size) { ++ *ptr0 = 0; ++ *ptr1 = 0; ++ return matches; ++ } ++ ++ uint32_t *const pair = son + ((cyclic_pos - delta ++ + (delta > cyclic_pos ? cyclic_size : 0)) ++ << 1); ++ ++ const uint8_t *const pb = cur - delta; ++ uint32_t len = ((len0) < (len1) ? 
(len0) : (len1)); ++ ++ if (pb[len] == cur[len]) { ++ len = lzma_memcmplen(pb, cur, len + 1, len_limit); ++ ++ if (len_best < len) { ++ len_best = len; ++ matches->len = len; ++ matches->dist = delta - 1; ++ ++matches; ++ ++ if (len == len_limit) { ++ *ptr1 = pair[0]; ++ *ptr0 = pair[1]; ++ return matches; ++ } ++ } ++ } ++ ++ if (pb[len] < cur[len]) { ++ *ptr1 = cur_match; ++ ptr1 = pair + 1; ++ cur_match = *ptr1; ++ len1 = len; ++ } else { ++ *ptr0 = cur_match; ++ ptr0 = pair; ++ cur_match = *ptr0; ++ len0 = len; ++ } ++ } ++} ++ ++ ++static void ++bt_skip_func( ++ const uint32_t len_limit, ++ const uint32_t pos, ++ const uint8_t *const cur, ++ uint32_t cur_match, ++ uint32_t depth, ++ uint32_t *const son, ++ const uint32_t cyclic_pos, ++ const uint32_t cyclic_size) ++{ ++ uint32_t *ptr0 = son + (cyclic_pos << 1) + 1; ++ uint32_t *ptr1 = son + (cyclic_pos << 1); ++ ++ uint32_t len0 = 0; ++ uint32_t len1 = 0; ++ ++ while (1) { ++ const uint32_t delta = pos - cur_match; ++ if (depth-- == 0 || delta >= cyclic_size) { ++ *ptr0 = 0; ++ *ptr1 = 0; ++ return; ++ } ++ ++ uint32_t *pair = son + ((cyclic_pos - delta ++ + (delta > cyclic_pos ? cyclic_size : 0)) ++ << 1); ++ const uint8_t *pb = cur - delta; ++ uint32_t len = ((len0) < (len1) ? (len0) : (len1)); ++ ++ if (pb[len] == cur[len]) { ++ len = lzma_memcmplen(pb, cur, len + 1, len_limit); ++ ++ if (len == len_limit) { ++ *ptr1 = pair[0]; ++ *ptr0 = pair[1]; ++ return; ++ } ++ } ++ ++ if (pb[len] < cur[len]) { ++ *ptr1 = cur_match; ++ ptr1 = pair + 1; ++ cur_match = *ptr1; ++ len1 = len; ++ } else { ++ *ptr0 = cur_match; ++ ptr0 = pair; ++ cur_match = *ptr0; ++ len0 = len; ++ } ++ } ++} ++ ++extern uint32_t ++lzma_mf_bt2_find(lzma_mf *mf, lzma_match *matches) ++{ ++ uint32_t len_limit = mf_avail(mf); ++ if (mf->nice_len <= len_limit) { ++ len_limit = mf->nice_len; ++ } else if (len_limit < (2) || (mf->action == LZMA_SYNC_FLUSH)) { ++ move_pending(mf); ++ return 0; ++ } ++ const uint8_t *cur = mf_ptr(mf); ++ const uint32_t pos = mf->read_pos + mf->offset; ++ uint32_t matches_count = 0; ++ const uint32_t hash_value = read16ne(cur); ++ const uint32_t cur_match = mf->hash[hash_value]; ++ mf->hash[hash_value] = pos; ++ ++ matches_count = bt_find_func(len_limit, pos, cur, cur_match, mf->depth, ++ mf->son, mf->cyclic_pos, mf->cyclic_size, ++ matches + matches_count, 1) - matches; ++ move_pos(mf); ++ return matches_count; ++} ++ ++extern void ++lzma_mf_bt2_skip(lzma_mf *mf, uint32_t amount) ++{ ++ do { ++ uint32_t len_limit = mf_avail(mf); ++ if (mf->nice_len <= len_limit) { ++ len_limit = mf->nice_len; ++ } else if (len_limit < (2) || (mf->action == LZMA_SYNC_FLUSH)) { ++ move_pending(mf); ++ continue; ++ } ++ const uint8_t *cur = mf_ptr(mf); ++ const uint32_t pos = mf->read_pos + mf->offset; ++ ++ const uint32_t hash_value = read16ne(cur); ++ const uint32_t cur_match = mf->hash[hash_value]; ++ mf->hash[hash_value] = pos; ++ ++ bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, mf->son, ++ mf->cyclic_pos, mf->cyclic_size); ++ move_pos(mf); ++ } while (--amount != 0); ++} ++ ++extern uint32_t ++lzma_mf_bt3_find(lzma_mf *mf, lzma_match *matches) ++{ ++ uint32_t len_limit = mf_avail(mf); ++ if (mf->nice_len <= len_limit) { ++ len_limit = mf->nice_len; ++ } else if (len_limit < (3) || (1 && mf->action == LZMA_SYNC_FLUSH)) { ++ move_pending(mf); ++ return 0; ++ } ++ const uint8_t *cur = mf_ptr(mf); ++ const uint32_t pos = mf->read_pos + mf->offset; ++ uint32_t matches_count = 0; ++ ++ const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; ++ 
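++ /* Derive the 2-byte and full-width hash values from the CRC-mixed temp. */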
const uint32_t hash_2_value = temp & ((1U << 10) - 1); ++ const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & mf->hash_mask; ++ ++ const uint32_t delta2 = pos - mf->hash[hash_2_value]; ++ const uint32_t cur_match = mf->hash[((1U << 10)) + hash_value]; ++ ++ mf->hash[hash_2_value] = pos; ++ mf->hash[((1U << 10)) + hash_value] = pos; ++ ++ uint32_t len_best = 2; ++ ++ if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) { ++ len_best = lzma_memcmplen( ++ cur, cur - delta2, len_best, len_limit); ++ ++ matches[0].len = len_best; ++ matches[0].dist = delta2 - 1; ++ matches_count = 1; ++ ++ if (len_best == len_limit) { ++ bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, mf->son, ++ mf->cyclic_pos, mf->cyclic_size); ++ move_pos(mf); ++ return 1; ++ } ++ } ++ ++ matches_count = bt_find_func(len_limit, pos, cur, cur_match, mf->depth, ++ mf->son, mf->cyclic_pos, mf->cyclic_size, ++ matches + matches_count, len_best) - matches; ++ move_pos(mf); ++ return matches_count; ++} ++ ++ ++extern void ++lzma_mf_bt3_skip(lzma_mf *mf, uint32_t amount) ++{ ++ do { ++ uint32_t len_limit = mf_avail(mf); ++ if (mf->nice_len <= len_limit) { ++ len_limit = mf->nice_len; } ++ else if (len_limit < (3) || (1 && mf->action == LZMA_SYNC_FLUSH)) { ++ move_pending(mf); ++ continue; ++ } ++ const uint8_t *cur = mf_ptr(mf); ++ const uint32_t pos = mf->read_pos + mf->offset; ++ ++ const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; ++ const uint32_t hash_2_value = temp & ((1U << 10) - 1); ++ const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & mf->hash_mask; ++ ++ const uint32_t cur_match = mf->hash[((1U << 10)) + hash_value]; ++ ++ mf->hash[hash_2_value] = pos; ++ mf->hash[((1U << 10)) + hash_value] = pos; ++ ++ bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, mf->son, ++ mf->cyclic_pos, mf->cyclic_size); ++ move_pos(mf); ++ } while (--amount != 0); ++} ++ ++extern uint32_t ++lzma_mf_bt4_find(lzma_mf *mf, lzma_match *matches) ++{ ++ uint32_t len_limit = mf->write_pos - mf->read_pos; ++ if (mf->nice_len <= len_limit) { ++ len_limit = mf->nice_len; ++ } else if (len_limit < (4) || (mf->action == LZMA_SYNC_FLUSH)) { ++ ++mf->read_pos; ++ ++mf->pending; ++ return 0; ++ } ++ ++ const uint8_t *cur = mf->buffer + mf->read_pos; ++ const uint32_t pos = mf->read_pos + mf->offset; ++ uint32_t matches_count = 0; ++ ++ const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; ++ const uint32_t hash_2_value = temp & ((1U << 10) - 1); ++ const uint32_t hash_3_value = (temp ^ ((uint32_t)(cur[2]) << 8)) & ((1U << 16) - 1); ++ const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8) ++ ^ (lzma_crc32_table[0][cur[3]] << 5)) ++ & mf->hash_mask; ++ ++ uint32_t delta2 = pos - mf->hash[hash_2_value]; ++ const uint32_t delta3 = pos - mf->hash[((1U << 10)) + hash_3_value]; ++ const uint32_t cur_match = mf->hash[((1U << 10) + (1U << 16)) + hash_value]; ++ ++ mf->hash[hash_2_value] = pos; ++ mf->hash[((1U << 10)) + hash_3_value] = pos; ++ mf->hash[((1U << 10) + (1U << 16)) + hash_value] = pos; ++ ++ uint32_t len_best = 1; ++ ++ if (delta2 < mf->cyclic_size && *(cur - delta2) == *cur) { ++ len_best = 2; ++ matches[0].len = 2; ++ matches[0].dist = delta2 - 1; ++ matches_count = 1; ++ } ++ ++ if (delta2 != delta3 && delta3 < mf->cyclic_size && *(cur - delta3) == *cur) { ++ len_best = 3; ++ matches[matches_count++].dist = delta3 - 1; ++ delta2 = delta3; ++ } ++ ++ if (matches_count != 0) { ++ len_best = lzma_memcmplen(cur, cur - delta2, len_best, len_limit); ++ ++ matches[matches_count - 1].len = 
len_best; ++ ++ if (len_best == len_limit) { ++ bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, mf->son, ++ mf->cyclic_pos, mf->cyclic_size); ++ move_pos(mf); ++ return matches_count; ++ } ++ } ++ ++ if (len_best < 3) ++ len_best = 3; ++ ++ matches_count = bt_find_func(len_limit, pos, cur, cur_match, mf->depth, mf->son, ++ mf->cyclic_pos, mf->cyclic_size, ++ matches + matches_count, len_best) - matches; ++ move_pos(mf); ++ return matches_count; ++} ++ ++extern void ++lzma_mf_bt4_skip(lzma_mf *mf, uint32_t amount) ++{ ++ do { ++ uint32_t len_limit = mf_avail(mf); ++ if (mf->nice_len <= len_limit) { ++ len_limit = mf->nice_len; ++ } else if (len_limit < (4) || (mf->action == LZMA_SYNC_FLUSH)) { ++ move_pending(mf); ++ continue; ++ } ++ ++ const uint8_t *cur = mf->buffer + mf->read_pos; ++ const uint32_t pos = mf->read_pos + mf->offset; ++ ++ const uint32_t temp = lzma_crc32_table[0][cur[0]] ^ cur[1]; ++ const uint32_t hash_2_value = temp & ((1U << 10) - 1); ++ const uint32_t hash_3_value = (temp ^ ((uint32_t)(cur[2]) << 8)) ++ & ((1U << 16) - 1); ++ const uint32_t hash_value = (temp ^ ((uint32_t)(cur[2]) << 8) ++ ^ (lzma_crc32_table[0][cur[3]] << 5)) ++ & mf->hash_mask; ++ ++ const uint32_t cur_match = mf->hash[((1U << 10) + (1U << 16)) + hash_value]; ++ ++ mf->hash[hash_2_value] = pos; ++ mf->hash[((1U << 10)) + hash_3_value] = pos; ++ mf->hash[((1U << 10) + (1U << 16)) + hash_value] = pos; ++ ++ bt_skip_func(len_limit, pos, cur, cur_match, mf->depth, mf->son, ++ mf->cyclic_pos, mf->cyclic_size); ++ move_pos(mf); ++ } while (--amount != 0); ++} ++ ++static inline void ++mf_skip(lzma_mf *mf, uint32_t amount) ++{ ++ if (amount != 0) { ++ mf->skip(mf, amount); ++ mf->read_ahead += amount; ++ } ++} ++ ++typedef struct lzma_lzma1_encoder_s lzma_lzma1_encoder; ++typedef uint16_t probability; ++ ++typedef struct { ++ probability choice; ++ probability choice2; ++ probability low[(1 << 4)][(1 << 3)]; ++ probability mid[(1 << 4)][(1 << 3)]; ++ probability high[(1 << 8)]; ++ uint32_t prices[(1 << 4)][((1 << 3) + (1 << 3) + (1 << 8))]; ++ uint32_t table_size; ++ uint32_t counters[(1 << 4)]; ++} lzma_length_encoder; ++ ++typedef struct { ++ uint64_t low; ++ uint64_t cache_size; ++ uint32_t range; ++ uint8_t cache; ++ size_t count; ++ size_t pos; ++ ++ enum { ++ RC_BIT_0, ++ RC_BIT_1, ++ RC_DIRECT_0, ++ RC_DIRECT_1, ++ RC_FLUSH, ++ } symbols[58]; ++ ++ probability *probs[58]; ++} lzma_range_encoder; ++ ++ ++typedef enum { ++ STATE_LIT_LIT, ++ STATE_MATCH_LIT_LIT, ++ STATE_REP_LIT_LIT, ++ STATE_SHORTREP_LIT_LIT, ++ STATE_MATCH_LIT, ++ STATE_REP_LIT, ++ STATE_SHORTREP_LIT, ++ STATE_LIT_MATCH, ++ STATE_LIT_LONGREP, ++ STATE_LIT_SHORTREP, ++ STATE_NONLIT_MATCH, ++ STATE_NONLIT_REP, ++} lzma_lzma_state; ++ ++typedef struct { ++ lzma_lzma_state state; ++ _Bool prev_1_is_literal; ++ _Bool prev_2; ++ ++ uint32_t pos_prev_2; ++ uint32_t back_prev_2; ++ ++ uint32_t price; ++ uint32_t pos_prev; ++ uint32_t back_prev; ++ ++ uint32_t backs[4]; ++} lzma_optimal; ++ ++struct lzma_lzma1_encoder_s { ++ lzma_range_encoder rc; ++ lzma_lzma_state state; ++ uint32_t reps[4]; ++ lzma_match matches[(2 + ((1 << 3) + (1 << 3) + (1 << 8)) - 1) + 1]; ++ uint32_t matches_count; ++ uint32_t longest_match_length; ++ _Bool fast_mode; ++ _Bool is_initialized; ++ _Bool is_flushed; ++ uint32_t pos_mask; ++ uint32_t literal_context_bits; ++ uint32_t literal_pos_mask; ++ ++ probability literal[(1 << 4)][0x300]; ++ probability is_match[12][(1 << 4)]; ++ probability is_rep[12]; ++ probability is_rep0[12]; ++ probability is_rep1[12]; 
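++ /* is_rep0..is_rep2 select among the four repeated distances. */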
++ probability is_rep2[12]; ++ probability is_rep0_long[12][(1 << 4)]; ++ probability dist_slot[4][(1 << 6)]; ++ probability dist_special[(1 << (14 / 2)) - 14]; ++ probability dist_align[(1 << 4)]; ++ ++ lzma_length_encoder match_len_encoder; ++ lzma_length_encoder rep_len_encoder; ++ ++ uint32_t dist_slot_prices[4][(1 << 6)]; ++ uint32_t dist_prices[4][(1 << (14 / 2))]; ++ uint32_t dist_table_size; ++ uint32_t match_price_count; ++ ++ uint32_t align_prices[(1 << 4)]; ++ uint32_t align_price_count; ++ uint32_t opts_end_index; ++ uint32_t opts_current_index; ++ lzma_optimal opts[(1 << 12)]; ++}; ++ ++extern void ++lzma_lzma_optimum_fast(lzma_lzma1_encoder *restrict coder, ++ lzma_mf *restrict mf, ++ uint32_t *restrict back_res, uint32_t *restrict len_res) ++{ ++ const uint32_t nice_len = mf->nice_len; ++ ++ uint32_t len_main; ++ uint32_t matches_count; ++ if (mf->read_ahead == 0) { ++ len_main = lzma_mf_find(mf, &matches_count, coder->matches); ++ } else { ++ len_main = coder->longest_match_length; ++ matches_count = coder->matches_count; ++ } ++ ++ const uint8_t *buf = mf_ptr(mf) - 1; ++ const uint32_t buf_avail ++ = ((mf_avail(mf) + 1) < ((2 + ((1 << 3) + (1 << 3) + (1 << 8)) - 1)) ++ ? (mf_avail(mf) + 1) : ((2 + ((1 << 3) + (1 << 3) + (1 << 8)) - 1))); ++ ++ if (buf_avail < 2) { ++ *back_res = (4294967295U); ++ *len_res = 1; ++ return; ++ } ++ ++ uint32_t rep_len = 0; ++ uint32_t rep_index = 0; ++ ++ for (uint32_t i = 0; i < 4; ++i) { ++ const uint8_t *const buf_back = buf - coder->reps[i] - 1; ++ if ((read16ne(buf) != read16ne(buf_back))) ++ continue; ++ const uint32_t len = lzma_memcmplen(buf, buf_back, 2, buf_avail); ++ if (len >= nice_len) { ++ *back_res = i; ++ *len_res = len; ++ mf_skip(mf, len - 1); ++ return; ++ } ++ if (len > rep_len) { ++ rep_index = i; ++ rep_len = len; ++ } ++ } ++ if (len_main >= nice_len) { ++ *back_res = coder->matches[matches_count - 1].dist + 4; ++ *len_res = len_main; ++ mf_skip(mf, len_main - 1); ++ return; ++ } ++ ++ uint32_t back_main = 0; ++ if (len_main >= 2) { ++ back_main = coder->matches[matches_count - 1].dist; ++ while (matches_count > 1 && len_main == ++ coder->matches[matches_count - 2].len + 1) { ++ if (!(((back_main) >> 7) > (coder->matches[ matches_count - 2].dist))) ++ break; ++ --matches_count; ++ len_main = coder->matches[matches_count - 1].len; ++ back_main = coder->matches[matches_count - 1].dist; ++ } ++ if (len_main == 2 && back_main >= 0x80) ++ len_main = 1; ++ } ++ ++ if (rep_len >= 2) { ++ if (rep_len + 1 >= len_main ++ || (rep_len + 2 >= len_main ++ && back_main > (1U << 9)) ++ || (rep_len + 3 >= len_main ++ && back_main > (1U << 15))) { ++ *back_res = rep_index; ++ *len_res = rep_len; ++ mf_skip(mf, rep_len - 1); ++ return; ++ } ++ } ++ ++ if (len_main < 2 || buf_avail <= 2) { ++ *back_res = (4294967295U); ++ *len_res = 1; ++ return; ++ } ++ ++ coder->longest_match_length = lzma_mf_find(mf, ++ &coder->matches_count, coder->matches); ++ ++ if (coder->longest_match_length >= 2) { ++ const uint32_t new_dist = coder->matches[ ++ coder->matches_count - 1].dist; ++ ++ if ((coder->longest_match_length >= len_main ++ && new_dist < back_main) ++ || (coder->longest_match_length == len_main + 1 ++ && !(((new_dist) >> 7) > (back_main))) ++ || (coder->longest_match_length > len_main + 1) ++ || (coder->longest_match_length + 1 >= len_main ++ && len_main >= 3 ++ && (((back_main) >> 7) > (new_dist)))) { ++ *back_res = (4294967295U); ++ *len_res = 1; ++ return; ++ } ++ } ++ ++buf; ++ const uint32_t limit = ((2) > (len_main - 1) ? 
(2) : (len_main - 1)); ++ for (uint32_t i = 0; i < 4; ++i) { ++ if (memcmp(buf, buf - coder->reps[i] - 1, limit) == 0) { ++ *back_res = (4294967295U); ++ *len_res = 1; ++ return; ++ } ++ } ++ ++ *back_res = back_main + 4; ++ *len_res = len_main; ++ mf_skip(mf, len_main - 2); ++ return; ++} ++ ++static inline void ++rc_bit(lzma_range_encoder *rc, probability *prob, uint32_t bit) ++{ ++ rc->symbols[rc->count] = bit; ++ rc->probs[rc->count] = prob; ++ ++rc->count; ++} ++ ++static inline void ++rc_bittree(lzma_range_encoder *rc, probability *probs, ++ uint32_t bit_count, uint32_t symbol) ++{ ++ uint32_t model_index = 1; ++ ++ do { ++ const uint32_t bit = (symbol >> --bit_count) & 1; ++ rc_bit(rc, &probs[model_index], bit); ++ model_index = (model_index << 1) + bit; ++ } while (bit_count != 0); ++} ++ ++static _Bool ++encode_init(lzma_lzma1_encoder *coder, lzma_mf *mf) ++{ ++ if (mf->read_pos == mf->read_limit) { ++ if (mf->action == LZMA_RUN) ++ return 0; ++ } else { ++ mf_skip(mf, 1); ++ mf->read_ahead = 0; ++ rc_bit(&coder->rc, &coder->is_match[0][0], 0); ++ rc_bittree(&coder->rc, coder->literal[0], 8, mf->buffer[0]); ++ } ++ ++ coder->is_initialized = 1; ++ ++ return 1; ++} ++ ++static inline uint32_t ++mf_position(const lzma_mf *mf) ++{ ++ return mf->read_pos - mf->read_ahead; ++} ++ ++static inline _Bool ++rc_shift_low(lzma_range_encoder *rc, ++ uint8_t *out, size_t *out_pos, size_t out_size) ++{ ++ if ((uint32_t)(rc->low) < (uint32_t)(0xFF000000) ++ || (uint32_t)(rc->low >> 32) != 0) { ++ do { ++ if (*out_pos == out_size) ++ return 1; ++ ++ out[*out_pos] = rc->cache + (uint8_t)(rc->low >> 32); ++ ++*out_pos; ++ rc->cache = 0xFF; ++ } while (--rc->cache_size != 0); ++ rc->cache = (rc->low >> 24) & 0xFF; ++ } ++ ++ ++rc->cache_size; ++ rc->low = (rc->low & 0x00FFFFFF) << 8; ++ return 0; ++} ++ ++static inline void ++rc_reset(lzma_range_encoder *rc) ++{ ++ rc->low = 0; ++ rc->cache_size = 1; ++ rc->range = (4294967295U); ++ rc->cache = 0; ++ rc->count = 0; ++ rc->pos = 0; ++} ++ ++static inline _Bool ++rc_encode(lzma_range_encoder *rc, ++ uint8_t *out, size_t *out_pos, size_t out_size) ++{ ++ while (rc->pos < rc->count) { ++ if (rc->range < (1U << 24)) { ++ if (rc_shift_low(rc, out, out_pos, out_size)) ++ return 1; ++ rc->range <<= 8; ++ } ++ ++ switch (rc->symbols[rc->pos]) { ++ case RC_BIT_0: { ++ probability prob = *rc->probs[rc->pos]; ++ rc->range = (rc->range >> 11) ++ * prob; ++ prob += ((1U << 11) - prob) >> 5; ++ *rc->probs[rc->pos] = prob; ++ break; ++ } ++ ++ case RC_BIT_1: { ++ probability prob = *rc->probs[rc->pos]; ++ const uint32_t bound = prob * (rc->range ++ >> 11); ++ rc->low += bound; ++ rc->range -= bound; ++ prob -= prob >> 5; ++ *rc->probs[rc->pos] = prob; ++ break; ++ } ++ ++ case RC_DIRECT_0: ++ rc->range >>= 1; ++ break; ++ ++ case RC_DIRECT_1: ++ rc->range >>= 1; ++ rc->low += rc->range; ++ break; ++ ++ case RC_FLUSH: ++ rc->range = (4294967295U); ++ do { ++ if (rc_shift_low(rc, out, out_pos, out_size)) ++ return 1; ++ } while (++rc->pos < rc->count); ++ ++ rc_reset(rc); ++ return 0; ++ ++ default: ++ break; ++ } ++ ++rc->pos; ++ } ++ ++ rc->count = 0; ++ rc->pos = 0; ++ return 0; ++} ++ ++static inline uint64_t ++rc_pending(const lzma_range_encoder *rc) ++{ ++ return rc->cache_size + 5 - 1; ++} ++ ++static inline void ++literal_matched(lzma_range_encoder *rc, probability *subcoder, ++ uint32_t match_byte, uint32_t symbol) ++{ ++ uint32_t offset = 0x100; ++ symbol += 1U << 8; ++ ++ do { ++ match_byte <<= 1; ++ const uint32_t match_bit = match_byte & offset; ++ const 
uint32_t subcoder_index ++ = offset + match_bit + (symbol >> 8); ++ const uint32_t bit = (symbol >> 7) & 1; ++ rc_bit(rc, &subcoder[subcoder_index], bit); ++ ++ symbol <<= 1; ++ offset &= ~(match_byte ^ symbol); ++ ++ } while (symbol < (1U << 16)); ++} ++ ++static inline void ++literal(lzma_lzma1_encoder *coder, lzma_mf *mf, uint32_t position) ++{ ++ const uint8_t cur_byte = mf->buffer[mf->read_pos - mf->read_ahead]; ++ probability *subcoder = ((coder->literal)[ ++ (((position) & (coder->literal_pos_mask)) ++ << (coder->literal_context_bits)) ++ + ((uint32_t)(mf->buffer[mf->read_pos - mf->read_ahead - 1]) ++ >> (8U - (coder->literal_context_bits)))]); ++ ++ if (((coder->state) < 7)) { ++ rc_bittree(&coder->rc, subcoder, 8, cur_byte); ++ } else { ++ const uint8_t match_byte ++ = mf->buffer[mf->read_pos - coder->reps[0] - 1 - mf->read_ahead]; ++ literal_matched(&coder->rc, subcoder, match_byte, cur_byte); ++ } ++ coder->state ++ = ((coder->state) <= STATE_SHORTREP_LIT_LIT ++ ? STATE_LIT_LIT : ((coder->state) <= STATE_LIT_SHORTREP ++ ? (coder->state) - 3 : (coder->state) - 6)); ++} ++ ++const uint8_t lzma_rc_prices[] = { ++ 128, 103, 91, 84, 78, 73, 69, 66, ++ 63, 61, 58, 56, 54, 52, 51, 49, ++ 48, 46, 45, 44, 43, 42, 41, 40, ++ 39, 38, 37, 36, 35, 34, 34, 33, ++ 32, 31, 31, 30, 29, 29, 28, 28, ++ 27, 26, 26, 25, 25, 24, 24, 23, ++ 23, 22, 22, 22, 21, 21, 20, 20, ++ 19, 19, 19, 18, 18, 17, 17, 17, ++ 16, 16, 16, 15, 15, 15, 14, 14, ++ 14, 13, 13, 13, 12, 12, 12, 11, ++ 11, 11, 11, 10, 10, 10, 10, 9, ++ 9, 9, 9, 8, 8, 8, 8, 7, ++ 7, 7, 7, 6, 6, 6, 6, 5, ++ 5, 5, 5, 5, 4, 4, 4, 4, ++ 3, 3, 3, 3, 3, 2, 2, 2, ++ 2, 2, 2, 1, 1, 1, 1, 1 ++}; ++ ++static inline uint32_t ++rc_bit_price(const probability prob, const uint32_t bit) ++{ ++ return lzma_rc_prices[(prob ^ ((0U - bit) ++ & ((1U << 11) - 1))) >> 4]; ++} ++ ++static inline uint32_t ++rc_bit_0_price(const probability prob) ++{ ++ return lzma_rc_prices[prob >> 4]; ++} ++ ++static inline uint32_t ++rc_bit_1_price(const probability prob) ++{ ++ return lzma_rc_prices[(prob ^ ((1U << 11) - 1)) ++ >> 4]; ++} ++ ++static inline uint32_t ++rc_bittree_price(const probability *const probs, ++ const uint32_t bit_levels, uint32_t symbol) ++{ ++ uint32_t price = 0; ++ symbol += 1U << bit_levels; ++ ++ do { ++ const uint32_t bit = symbol & 1; ++ symbol >>= 1; ++ price += rc_bit_price(probs[symbol], bit); ++ } while (symbol != 1); ++ ++ return price; ++} ++ ++static void ++length_update_prices(lzma_length_encoder *lc, const uint32_t pos_state) ++{ ++ const uint32_t table_size = lc->table_size; ++ lc->counters[pos_state] = table_size; ++ ++ const uint32_t a0 = rc_bit_0_price(lc->choice); ++ const uint32_t a1 = rc_bit_1_price(lc->choice); ++ const uint32_t b0 = a1 + rc_bit_0_price(lc->choice2); ++ const uint32_t b1 = a1 + rc_bit_1_price(lc->choice2); ++ uint32_t *const prices = lc->prices[pos_state]; ++ ++ uint32_t i; ++ for (i = 0; i < table_size && i < (1 << 3); ++i) ++ prices[i] = a0 + rc_bittree_price(lc->low[pos_state], ++ 3, i); ++ ++ for (; i < table_size && i < (1 << 3) + (1 << 3); ++i) ++ prices[i] = b0 + rc_bittree_price(lc->mid[pos_state], ++ 3, i - (1 << 3)); ++ ++ for (; i < table_size; ++i) ++ prices[i] = b1 + rc_bittree_price(lc->high, 8, ++ i - (1 << 3) - (1 << 3)); ++ ++ return; ++} ++ ++static inline void ++length(lzma_range_encoder *rc, lzma_length_encoder *lc, ++ const uint32_t pos_state, uint32_t len, const _Bool fast_mode) ++{ ++ len -= 2; ++ ++ if (len < (1 << 3)) { ++ rc_bit(rc, &lc->choice, 0); ++ rc_bittree(rc, lc->low[pos_state], 3, 
len); ++ } else { ++ rc_bit(rc, &lc->choice, 1); ++ len -= (1 << 3); ++ ++ if (len < (1 << 3)) { ++ rc_bit(rc, &lc->choice2, 0); ++ rc_bittree(rc, lc->mid[pos_state], 3, len); ++ } else { ++ rc_bit(rc, &lc->choice2, 1); ++ len -= (1 << 3); ++ rc_bittree(rc, lc->high, 8, len); ++ } ++ } ++ ++ if (!fast_mode) ++ if (--lc->counters[pos_state] == 0) ++ length_update_prices(lc, pos_state); ++} ++ ++static inline void ++rep_match(lzma_lzma1_encoder *coder, const uint32_t pos_state, ++ const uint32_t rep, const uint32_t len) ++{ ++ if (rep == 0) { ++ rc_bit(&coder->rc, &coder->is_rep0[coder->state], 0); ++ rc_bit(&coder->rc, ++ &coder->is_rep0_long[coder->state][pos_state], ++ len != 1); ++ } else { ++ const uint32_t distance = coder->reps[rep]; ++ rc_bit(&coder->rc, &coder->is_rep0[coder->state], 1); ++ ++ if (rep == 1) { ++ rc_bit(&coder->rc, &coder->is_rep1[coder->state], 0); ++ } else { ++ rc_bit(&coder->rc, &coder->is_rep1[coder->state], 1); ++ rc_bit(&coder->rc, &coder->is_rep2[coder->state], ++ rep - 2); ++ ++ if (rep == 3) ++ coder->reps[3] = coder->reps[2]; ++ ++ coder->reps[2] = coder->reps[1]; ++ } ++ ++ coder->reps[1] = coder->reps[0]; ++ coder->reps[0] = distance; ++ } ++ ++ if (len == 1) { ++ coder->state = ((coder->state) < 7 ? STATE_LIT_SHORTREP : STATE_NONLIT_REP); ++ } else { ++ length(&coder->rc, &coder->rep_len_encoder, pos_state, len, ++ coder->fast_mode); ++ coder->state = ((coder->state) < 7 ? STATE_LIT_LONGREP : STATE_NONLIT_REP); ++ } ++} ++ ++// This array is constantly initialized in the original code. It's quite big ++// so we skip it. ++const uint8_t lzma_fastpos[1 << 13]; ++ ++static inline uint32_t ++get_dist_slot(uint32_t dist) ++{ ++ if (dist < (1U << (13 + ((0) + (0) * (13 - 1))))) ++ return lzma_fastpos[dist]; ++ ++ if (dist < (1U << (13 + ((0) + (1) * (13 - 1))))) ++ return (uint32_t)(lzma_fastpos[(dist) >> ((0) + (1) * (13 - 1))]) + 2 * ((0) + (1) * (13 - 1)); ++ ++ return (uint32_t)(lzma_fastpos[(dist) >> ((0) + (2) * (13 - 1))]) + 2 * ((0) + (2) * (13 - 1)); ++} ++ ++static inline void ++rc_bittree_reverse(lzma_range_encoder *rc, probability *probs, ++ uint32_t bit_count, uint32_t symbol) ++{ ++ uint32_t model_index = 1; ++ do { ++ const uint32_t bit = symbol & 1; ++ symbol >>= 1; ++ rc_bit(rc, &probs[model_index], bit); ++ model_index = (model_index << 1) + bit; ++ } while (--bit_count != 0); ++} ++ ++static inline void ++rc_direct(lzma_range_encoder *rc, uint32_t value, uint32_t bit_count) ++{ ++ do { ++ rc->symbols[rc->count++] ++ = RC_DIRECT_0 + ((value >> --bit_count) & 1); ++ } while (bit_count != 0); ++} ++ ++static inline void ++match(lzma_lzma1_encoder *coder, const uint32_t pos_state, ++ const uint32_t distance, const uint32_t len) ++{ ++ coder->state = ((coder->state) < 7 ? STATE_LIT_MATCH : STATE_NONLIT_MATCH); ++ ++ length(&coder->rc, &coder->match_len_encoder, pos_state, len, ++ coder->fast_mode); ++ ++ const uint32_t dist_slot = get_dist_slot(distance); ++ const uint32_t dist_state = ((len) < 4 + 2 ? 
(len) - 2 : 4 - 1); ++ rc_bittree(&coder->rc, coder->dist_slot[dist_state], 6, dist_slot); ++ ++ if (dist_slot >= 4) { ++ const uint32_t footer_bits = (dist_slot >> 1) - 1; ++ const uint32_t base = (2 | (dist_slot & 1)) << footer_bits; ++ const uint32_t dist_reduced = distance - base; ++ ++ if (dist_slot < 14) { ++ rc_bittree_reverse(&coder->rc, coder->dist_special + base - dist_slot - 1, ++ footer_bits, dist_reduced); ++ } else { ++ rc_direct(&coder->rc, dist_reduced >> 4, ++ footer_bits - 4); ++ rc_bittree_reverse( ++ &coder->rc, coder->dist_align, ++ 4, dist_reduced & ((1 << 4) - 1)); ++ ++coder->align_price_count; ++ } ++ } ++ ++ coder->reps[3] = coder->reps[2]; ++ coder->reps[2] = coder->reps[1]; ++ coder->reps[1] = coder->reps[0]; ++ coder->reps[0] = distance; ++ ++coder->match_price_count; ++} ++ ++static void ++encode_symbol(lzma_lzma1_encoder *coder, lzma_mf *mf, ++ uint32_t back, uint32_t len, uint32_t position) ++{ ++ const uint32_t pos_state = position & coder->pos_mask; ++ ++ if (back == (4294967295U)) { ++ rc_bit(&coder->rc, ++ &coder->is_match[coder->state][pos_state], 0); ++ literal(coder, mf, position); ++ } else { ++ rc_bit(&coder->rc, ++ &coder->is_match[coder->state][pos_state], 1); ++ ++ if (back < 4) { ++ rc_bit(&coder->rc, &coder->is_rep[coder->state], 1); ++ rep_match(coder, pos_state, back, len); ++ } else { ++ rc_bit(&coder->rc, &coder->is_rep[coder->state], 0); ++ match(coder, pos_state, back - 4, len); ++ } ++ } ++ mf->read_ahead -= len; ++} ++ ++static void ++encode_eopm(lzma_lzma1_encoder *coder, uint32_t position) ++{ ++ const uint32_t pos_state = position & coder->pos_mask; ++ rc_bit(&coder->rc, &coder->is_match[coder->state][pos_state], 1); ++ rc_bit(&coder->rc, &coder->is_rep[coder->state], 0); ++ match(coder, pos_state, (4294967295U), 2); ++} ++ ++static inline void ++rc_flush(lzma_range_encoder *rc) ++{ ++ for (size_t i = 0; i < 5; ++i) ++ rc->symbols[rc->count++] = RC_FLUSH; ++} ++ ++extern void exit (int __status) ++ __attribute__ ((__nothrow__ , __leaf__ , __noreturn__)); ++ ++extern lzma_ret ++lzma_lzma_encode(lzma_lzma1_encoder *restrict coder, lzma_mf *restrict mf, ++ uint8_t *restrict out, size_t *restrict out_pos, ++ size_t out_size, uint32_t limit) ++{ ++ ++ if (!coder->is_initialized && !encode_init(coder, mf)) ++ return LZMA_OK; ++ ++ uint32_t position = mf_position(mf); ++ ++ while (1) { ++ if (rc_encode(&coder->rc, out, out_pos, out_size)) { ++ return LZMA_OK; ++ } ++ ++ if (limit != (4294967295U) ++ && (mf->read_pos - mf->read_ahead >= limit ++ || *out_pos + rc_pending(&coder->rc) ++ >= (1U << 16) - ((1 << 12) + 1))) ++ break; ++ ++ if (mf->read_pos >= mf->read_limit) { ++ if (mf->action == LZMA_RUN) ++ return LZMA_OK; ++ ++ ++ if (mf->read_ahead == 0) ++ break; ++ } ++ uint32_t len; ++ uint32_t back; ++ ++ if (coder->fast_mode) ++ lzma_lzma_optimum_fast(coder, mf, &back, &len); ++ else ++ // The original code contains the call to ++ // lzma_lzma_optimum_normal(coder, mf, &back, &len, position); ++ exit (-1); ++ ++ encode_symbol(coder, mf, back, len, position); ++ ++ position += len; ++ } ++ ++ if (!coder->is_flushed) { ++ coder->is_flushed = 1; ++ if (limit == (4294967295U)) ++ encode_eopm(coder, position); ++ ++ rc_flush(&coder->rc); ++ ++ if (rc_encode(&coder->rc, out, out_pos, out_size)) { ++ return LZMA_OK; ++ } ++ } ++ ++ coder->is_flushed = 0; ++ return LZMA_STREAM_END; ++} ++ ++extern void ++lzma_free(void *ptr, const lzma_allocator *allocator) ++{ ++ if (allocator != ((void *)0) && allocator->free != ((void *)0)) ++ 
allocator->free(allocator->opaque, ptr); ++ else ++ free(ptr); ++ return; ++} ++ ++static _Bool ++lz_encoder_prepare(lzma_mf *mf, const lzma_allocator *allocator, ++ const lzma_lz_options *lz_options) ++{ ++ if (lz_options->dict_size < 4096U ++ || lz_options->dict_size ++ > (1U << 30) + (1U << 29) ++ || lz_options->nice_len > lz_options->match_len_max) ++ return 1; ++ ++ mf->keep_size_before = lz_options->before_size + lz_options->dict_size; ++ mf->keep_size_after = lz_options->after_size ++ + lz_options->match_len_max; ++ uint32_t reserve = lz_options->dict_size / 2; ++ if (reserve > (1U << 30)) ++ reserve /= 2; ++ ++ reserve += (lz_options->before_size + lz_options->match_len_max ++ + lz_options->after_size) / 2 + (1U << 19); ++ ++ const uint32_t old_size = mf->size; ++ mf->size = mf->keep_size_before + reserve + mf->keep_size_after; ++ ++ if ((mf->buffer != ((void *)0)) && old_size != mf->size) { ++ lzma_free(mf->buffer, allocator); ++ mf->buffer = ((void *)0); ++ } ++ ++ mf->match_len_max = lz_options->match_len_max; ++ mf->nice_len = lz_options->nice_len; ++ mf->cyclic_size = lz_options->dict_size + 1; ++ ++ switch (lz_options->match_finder) { ++ case LZMA_MF_HC3: ++ mf->find = &lzma_mf_hc3_find; ++ mf->skip = &lzma_mf_hc3_skip; ++ break; ++ ++ case LZMA_MF_HC4: ++ mf->find = &lzma_mf_hc4_find; ++ mf->skip = &lzma_mf_hc4_skip; ++ break; ++ ++ case LZMA_MF_BT2: ++ mf->find = &lzma_mf_bt2_find; ++ mf->skip = &lzma_mf_bt2_skip; ++ break; ++ ++ case LZMA_MF_BT3: ++ mf->find = &lzma_mf_bt3_find; ++ mf->skip = &lzma_mf_bt3_skip; ++ break; ++ ++ case LZMA_MF_BT4: ++ mf->find = &lzma_mf_bt4_find; ++ mf->skip = &lzma_mf_bt4_skip; ++ break; ++ ++ default: ++ return 1; ++ } ++ ++ const uint32_t hash_bytes = lz_options->match_finder & 0x0F; ++ if (hash_bytes > mf->nice_len) ++ return 1; ++ ++ const _Bool is_bt = (lz_options->match_finder & 0x10) != 0; ++ uint32_t hs; ++ ++ if (hash_bytes == 2) { ++ hs = 0xFFFF; ++ } else { ++ hs = lz_options->dict_size - 1; ++ hs |= hs >> 1; ++ hs |= hs >> 2; ++ hs |= hs >> 4; ++ hs |= hs >> 8; ++ hs >>= 1; ++ hs |= 0xFFFF; ++ ++ if (hs > (1U << 24)) { ++ if (hash_bytes == 3) ++ hs = (1U << 24) - 1; ++ else ++ hs >>= 1; ++ } ++ } ++ ++ mf->hash_mask = hs; ++ ++ ++hs; ++ if (hash_bytes > 2) ++ hs += (1U << 10); ++ if (hash_bytes > 3) ++ hs += (1U << 16); ++ ++ const uint32_t old_hash_count = mf->hash_count; ++ const uint32_t old_sons_count = mf->sons_count; ++ mf->hash_count = hs; ++ mf->sons_count = mf->cyclic_size; ++ if (is_bt) ++ mf->sons_count *= 2; ++ ++ if (old_hash_count != mf->hash_count ++ || old_sons_count != mf->sons_count) { ++ lzma_free(mf->hash, allocator); ++ mf->hash = ((void *)0); ++ ++ lzma_free(mf->son, allocator); ++ mf->son = ((void *)0); ++ } ++ ++ mf->depth = lz_options->depth; ++ if (mf->depth == 0) { ++ if (is_bt) ++ mf->depth = 16 + mf->nice_len / 2; ++ else ++ mf->depth = 4 + mf->nice_len / 4; ++ } ++ ++ return 0; ++} ++ ++int ++main () ++{ ++ lzma_mf mf; ++ lzma_allocator allocator; ++ lzma_lz_options lz_options; ++ ++ void *coder; ++ uint8_t *restrict out; ++ size_t *restrict out_pos; ++ size_t out_size; ++ ++ lz_encoder_prepare(&mf, &allocator, &lz_options); ++ return (int) lzma_lzma_encode(coder, &mf, out, out_pos, out_size, (4294967295U)); ++} ++ ++ ++/* { dg-final { scan-wpa-ipa-dump "Save results of indirect call analysis." 
"icp"} } */ ++/* { dg-final { scan-wpa-ipa-dump-times "For call" 2 "icp"} } */ ++/* { dg-final { scan-wpa-ipa-dump-times "Insert 0 prefetch stmt:" 5 "ipa_prefetch"} } */ ++/* { dg-final { scan-wpa-ipa-dump-times "Insert 1 prefetch stmt:" 4 "ipa_prefetch"} } */ ++/* { dg-final { scan-wpa-ipa-dump-times "Insert 2 prefetch stmt:" 2 "ipa_prefetch"} } */ +-- +2.33.0 + diff --git a/0188-Fix-fails-in-ICP-for-src-openEuler-gcc-I90P7M-I91CZ8.patch b/0188-Fix-fails-in-ICP-for-src-openEuler-gcc-I90P7M-I91CZ8.patch new file mode 100644 index 0000000..a43353c --- /dev/null +++ b/0188-Fix-fails-in-ICP-for-src-openEuler-gcc-I90P7M-I91CZ8.patch @@ -0,0 +1,32 @@ +From e7f50fc07c76b60b272cb97151b228d96b67938a Mon Sep 17 00:00:00 2001 +From: Diachkov Ilia WX1215920 +Date: Mon, 19 Feb 2024 11:06:37 +0300 +Subject: [PATCH 3/3] Fix fails in ICP (for src-openEuler/gcc: I90P7M, I91CZ8) + +--- + gcc/ipa-devirt.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/gcc/ipa-devirt.c b/gcc/ipa-devirt.c +index 9863084e4..194ad3f21 100644 +--- a/gcc/ipa-devirt.c ++++ b/gcc/ipa-devirt.c +@@ -4805,10 +4805,12 @@ compare_block_and_init_type (tree block, tree t1) + static void + analyze_global_var (varpool_node *var) + { +- var->get_constructor(); + tree decl = var->decl; +- if (TREE_CODE (decl) == SSA_NAME || !DECL_INITIAL (decl) +- || integer_zerop (DECL_INITIAL (decl))) ++ if (decl || !DECL_INITIAL (decl)) ++ return; ++ var->get_constructor (); ++ if (TREE_CODE (decl) == SSA_NAME || integer_zerop (DECL_INITIAL (decl)) ++ || TREE_CODE (DECL_INITIAL (decl)) == ERROR_MARK) + return; + + if (dump_file && (dump_flags & TDF_DETAILS)) +-- +2.33.0 + diff --git a/0189-Add-hip11-CPU-pipeline-scheduling.patch b/0189-Add-hip11-CPU-pipeline-scheduling.patch new file mode 100644 index 0000000..b453266 --- /dev/null +++ b/0189-Add-hip11-CPU-pipeline-scheduling.patch @@ -0,0 +1,739 @@ +From 431f80e6d3a323e3382f73a80bf7fc7ee7a73f02 Mon Sep 17 00:00:00 2001 +From: XingYuShuai <1150775134@qq.com> +Date: Mon, 26 Feb 2024 20:34:06 +0800 +Subject: [PATCH] Add hip11 CPU pipeline scheduling + +This patch adds an mcpu: hip11. It has been tested on aarch64 +and no regressions from this patch. +--- + gcc/config/aarch64/aarch64-cores.def | 3 +- + gcc/config/aarch64/aarch64-cost-tables.h | 104 ++++++ + gcc/config/aarch64/aarch64-tune.md | 2 +- + gcc/config/aarch64/aarch64.c | 83 +++++ + gcc/config/aarch64/aarch64.md | 1 + + gcc/config/aarch64/hip11.md | 418 +++++++++++++++++++++++ + gcc/doc/invoke.texi | 2 +- + 7 files changed, 610 insertions(+), 3 deletions(-) + create mode 100644 gcc/config/aarch64/hip11.md + +diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def +index 9c2902924..53125f6bd 100644 +--- a/gcc/config/aarch64/aarch64-cores.def ++++ b/gcc/config/aarch64/aarch64-cores.def +@@ -123,7 +123,7 @@ AARCH64_CORE("octeontx2f95mm", octeontx2f95mm, cortexa57, 8_2A, AARCH64_FL_FOR_ + AARCH64_CORE("a64fx", a64fx, a64fx, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_F16 | AARCH64_FL_SVE, a64fx, 0x46, 0x001, -1) + + /* HiSilicon ('H') cores. */ +-AARCH64_CORE("tsv110", tsv110, tsv110, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, tsv110, 0x48, 0xd01, -1) ++AARCH64_CORE("tsv110", tsv110, tsv110, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, tsv110, 0x48, 0xd01, -1) + + /* ARMv8.3-A Architecture Processors. 
*/ + +@@ -141,6 +141,7 @@ AARCH64_CORE("saphira", saphira, saphira, 8_4A, AARCH64_FL_FOR_ARCH8_ + + /* Armv8.5-A Architecture Processors. */ + AARCH64_CORE("neoverse-n2", neoversen2, cortexa57, 8_5A, AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_I8MM | AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_SVE2_BITPERM | AARCH64_FL_RNG | AARCH64_FL_MEMTAG, neoversen2, 0x41, 0xd49, -1) ++AARCH64_CORE("hip11", hip11, hip11, 8_5A, AARCH64_FL_FOR_ARCH8_5| AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_F16, hip11, 0x48, 0xd22, -1) + + /* ARMv8-A big.LITTLE implementations. */ + +diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h +index c6805717f..377650be0 100644 +--- a/gcc/config/aarch64/aarch64-cost-tables.h ++++ b/gcc/config/aarch64/aarch64-cost-tables.h +@@ -541,6 +541,110 @@ const struct cpu_cost_table tsv110_extra_costs = + } + }; + ++const struct cpu_cost_table hip11_extra_costs = ++{ ++ /* ALU */ ++ { ++ 0, /* arith. */ ++ 0, /* logical. */ ++ 0, /* shift. */ ++ 0, /* shift_reg. */ ++ COSTS_N_INSNS (1), /* arith_shift. */ ++ COSTS_N_INSNS (1), /* arith_shift_reg. */ ++ COSTS_N_INSNS (1), /* log_shift. */ ++ COSTS_N_INSNS (1), /* log_shift_reg. */ ++ 0, /* extend. */ ++ COSTS_N_INSNS (1), /* extend_arith. */ ++ 0, /* bfi. */ ++ 0, /* bfx. */ ++ 0, /* clz. */ ++ 0, /* rev. */ ++ 0, /* non_exec. */ ++ true /* non_exec_costs_exec. */ ++ }, ++ ++ { ++ /* MULT SImode */ ++ { ++ COSTS_N_INSNS (2), /* simple. */ ++ COSTS_N_INSNS (2), /* flag_setting. */ ++ COSTS_N_INSNS (2), /* extend. */ ++ COSTS_N_INSNS (2), /* add. */ ++ COSTS_N_INSNS (2), /* extend_add. */ ++ COSTS_N_INSNS (11) /* idiv. */ ++ }, ++ /* MULT DImode */ ++ { ++ COSTS_N_INSNS (3), /* simple. */ ++ 0, /* flag_setting (N/A). */ ++ COSTS_N_INSNS (3), /* extend. */ ++ COSTS_N_INSNS (3), /* add. */ ++ COSTS_N_INSNS (3), /* extend_add. */ ++ COSTS_N_INSNS (19) /* idiv. */ ++ } ++ }, ++ /* LD/ST */ ++ { ++ COSTS_N_INSNS (3), /* load. */ ++ COSTS_N_INSNS (4), /* load_sign_extend. */ ++ COSTS_N_INSNS (3), /* ldrd. */ ++ COSTS_N_INSNS (3), /* ldm_1st. */ ++ 1, /* ldm_regs_per_insn_1st. */ ++ 2, /* ldm_regs_per_insn_subsequent. */ ++ COSTS_N_INSNS (4), /* loadf. */ ++ COSTS_N_INSNS (4), /* loadd. */ ++ COSTS_N_INSNS (4), /* load_unaligned. */ ++ 0, /* store. */ ++ 0, /* strd. */ ++ 0, /* stm_1st. */ ++ 1, /* stm_regs_per_insn_1st. */ ++ 2, /* stm_regs_per_insn_subsequent. */ ++ 0, /* storef. */ ++ 0, /* stored. */ ++ COSTS_N_INSNS (1), /* store_unaligned. */ ++ COSTS_N_INSNS (4), /* loadv. */ ++ COSTS_N_INSNS (4) /* storev. */ ++ }, ++ { ++ /* FP SFmode */ ++ { ++ COSTS_N_INSNS (10), /* div. */ ++ COSTS_N_INSNS (4), /* mult. */ ++ COSTS_N_INSNS (4), /* mult_addsub. */ ++ COSTS_N_INSNS (4), /* fma. */ ++ COSTS_N_INSNS (4), /* addsub. */ ++ COSTS_N_INSNS (1), /* fpconst. */ ++ COSTS_N_INSNS (1), /* neg. */ ++ COSTS_N_INSNS (1), /* compare. */ ++ COSTS_N_INSNS (2), /* widen. */ ++ COSTS_N_INSNS (2), /* narrow. */ ++ COSTS_N_INSNS (2), /* toint. */ ++ COSTS_N_INSNS (1), /* fromint. */ ++ COSTS_N_INSNS (2) /* roundint. */ ++ }, ++ /* FP DFmode */ ++ { ++ COSTS_N_INSNS (17), /* div. */ ++ COSTS_N_INSNS (4), /* mult. */ ++ COSTS_N_INSNS (6), /* mult_addsub. */ ++ COSTS_N_INSNS (6), /* fma. */ ++ COSTS_N_INSNS (3), /* addsub. */ ++ COSTS_N_INSNS (1), /* fpconst. */ ++ COSTS_N_INSNS (1), /* neg. */ ++ COSTS_N_INSNS (1), /* compare. */ ++ COSTS_N_INSNS (2), /* widen. */ ++ COSTS_N_INSNS (2), /* narrow. */ ++ COSTS_N_INSNS (2), /* toint. */ ++ COSTS_N_INSNS (1), /* fromint. 
*/ ++ COSTS_N_INSNS (2) /* roundint. */ ++ } ++ }, ++ /* Vector */ ++ { ++ COSTS_N_INSNS (1) /* alu. */ ++ } ++}; ++ + const struct cpu_cost_table a64fx_extra_costs = + { + /* ALU */ +diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md +index 7fda2294b..f33a3330d 100644 +--- a/gcc/config/aarch64/aarch64-tune.md ++++ b/gcc/config/aarch64/aarch64-tune.md +@@ -1,5 +1,5 @@ + ;; -*- buffer-read-only: t -*- + ;; Generated automatically by gentune.sh from aarch64-cores.def + (define_attr "tune" +- "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa65,cortexa65ae,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55" ++ "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa65,cortexa65ae,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,saphira,neoversen2,hip11,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55" + (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) +diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c +index da4983236..938948f29 100644 +--- a/gcc/config/aarch64/aarch64.c ++++ b/gcc/config/aarch64/aarch64.c +@@ -448,6 +448,22 @@ static const struct cpu_addrcost_table tsv110_addrcost_table = + 0, /* imm_offset */ + }; + ++static const struct cpu_addrcost_table hip11_addrcost_table = ++{ ++ { ++ 1, /* hi */ ++ 0, /* si */ ++ 0, /* di */ ++ 1, /* ti */ ++ }, ++ 0, /* pre_modify */ ++ 0, /* post_modify */ ++ 0, /* register_offset */ ++ 1, /* register_sextend */ ++ 1, /* register_zextend */ ++ 0, /* imm_offset */ ++}; ++ + static const struct cpu_addrcost_table qdf24xx_addrcost_table = + { + { +@@ -575,6 +591,16 @@ static const struct cpu_regmove_cost tsv110_regmove_cost = + 2 /* FP2FP */ + }; + ++static const struct cpu_regmove_cost hip11_regmove_cost = ++{ ++ 1, /* GP2GP */ ++ /* Avoid the use of slow int<->fp moves for spilling by setting ++ their cost higher than memmov_cost. 
*/ ++ 2, /* GP2FP */ ++ 3, /* FP2GP */ ++ 2 /* FP2FP */ ++}; ++ + static const struct cpu_regmove_cost a64fx_regmove_cost = + { + 1, /* GP2GP */ +@@ -664,6 +690,25 @@ static const struct cpu_vector_cost tsv110_vector_cost = + 1 /* cond_not_taken_branch_cost */ + }; + ++static const struct cpu_vector_cost hip11_vector_cost = ++{ ++ 1, /* scalar_int_stmt_cost */ ++ 1, /* scalar_fp_stmt_cost */ ++ 5, /* scalar_load_cost */ ++ 1, /* scalar_store_cost */ ++ 2, /* vec_int_stmt_cost */ ++ 2, /* vec_fp_stmt_cost */ ++ 2, /* vec_permute_cost */ ++ 5, /* vec_to_scalar_cost */ ++ 5, /* scalar_to_vec_cost */ ++ 5, /* vec_align_load_cost */ ++ 5, /* vec_unalign_load_cost */ ++ 1, /* vec_unalign_store_cost */ ++ 1, /* vec_store_cost */ ++ 1, /* cond_taken_branch_cost */ ++ 1 /* cond_not_taken_branch_cost */ ++}; ++ + /* Generic costs for vector insn classes. */ + static const struct cpu_vector_cost cortexa57_vector_cost = + { +@@ -902,6 +947,17 @@ static const cpu_prefetch_tune tsv110_prefetch_tune = + -1 /* default_opt_level */ + }; + ++static const cpu_prefetch_tune hip11_prefetch_tune = ++{ ++ 0, /* num_slots */ ++ 64, /* l1_cache_size */ ++ 64, /* l1_cache_line_size */ ++ 512, /* l2_cache_size */ ++ true, /* prefetch_dynamic_strides */ ++ -1, /* minimum_stride */ ++ -1 /* default_opt_level */ ++}; ++ + static const cpu_prefetch_tune xgene1_prefetch_tune = + { + 8, /* num_slots */ +@@ -1196,6 +1252,33 @@ static const struct tune_params tsv110_tunings = + &tsv110_prefetch_tune + }; + ++static const struct tune_params hip11_tunings = ++{ ++ &hip11_extra_costs, ++ &hip11_addrcost_table, ++ &hip11_regmove_cost, ++ &hip11_vector_cost, ++ &generic_branch_cost, ++ &generic_approx_modes, ++ SVE_512, /* sve_width */ ++ 4, /* memmov_cost */ ++ 4, /* issue_rate */ ++ (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH ++ | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */ ++ "16", /* function_align. */ ++ "4", /* jump_align. */ ++ "8", /* loop_align. */ ++ 2, /* int_reassoc_width. */ ++ 4, /* fp_reassoc_width. */ ++ 1, /* vec_reassoc_width. */ ++ 2, /* min_div_recip_mul_sf. */ ++ 2, /* min_div_recip_mul_df. */ ++ 0, /* max_case_values. */ ++ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ ++ (AARCH64_EXTRA_TUNE_PREFER_ADVSIMD_AUTOVEC), /* tune_flags. */ ++ &hip11_prefetch_tune ++}; ++ + static const struct tune_params xgene1_tunings = + { + &xgene1_extra_costs, +diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md +index 7c2562f49..38af8d000 100644 +--- a/gcc/config/aarch64/aarch64.md ++++ b/gcc/config/aarch64/aarch64.md +@@ -452,6 +452,7 @@ + (include "../arm/cortex-a57.md") + (include "../arm/exynos-m1.md") + (include "falkor.md") ++(include "hip11.md") + (include "saphira.md") + (include "thunderx.md") + (include "../arm/xgene1.md") +diff --git a/gcc/config/aarch64/hip11.md b/gcc/config/aarch64/hip11.md +new file mode 100644 +index 000000000..57944fbc2 +--- /dev/null ++++ b/gcc/config/aarch64/hip11.md +@@ -0,0 +1,418 @@ ++;; hip11 pipeline description ++;; Copyright (C) 2018-2024 Free Software Foundation, Inc. ++;; ++;; This file is part of GCC. ++;; ++;; GCC is free software; you can redistribute it and/or modify it ++;; under the terms of the GNU General Public License as published by ++;; the Free Software Foundation; either version 3, or (at your option) ++;; any later version. ++;; ++;; GCC is distributed in the hope that it will be useful, but ++;; WITHOUT ANY WARRANTY; without even the implied warranty of ++;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
++;; General Public License for more details.
++;;
++;; You should have received a copy of the GNU General Public License
++;; along with GCC; see the file COPYING3.  If not see
++;; <http://www.gnu.org/licenses/>.
++
++(define_automaton "hip11")
++
++;; The hip11 core is modelled as a multiple-issue pipeline that has
++;; the following functional units.
++;; 1. Three pipelines for integer operations: ALU1, ALU2, ALU3
++
++(define_cpu_unit "hip11_alu1_issue" "hip11")
++(define_reservation "hip11_alu1" "hip11_alu1_issue")
++
++(define_cpu_unit "hip11_alu2_issue" "hip11")
++(define_reservation "hip11_alu2" "hip11_alu2_issue")
++
++(define_cpu_unit "hip11_alu3_issue" "hip11")
++(define_reservation "hip11_alu3" "hip11_alu3_issue")
++
++(define_reservation "hip11alu" "hip11_alu1|hip11_alu2|hip11_alu3")
++
++;; 2. One pipeline for complex integer operations: MDU
++
++(define_cpu_unit "hip11_mdu_issue" "hip11")
++(define_reservation "hip11_mdu" "hip11_mdu_issue")
++
++;; 3. Two asymmetric pipelines for Asimd and FP operations: FSU1, FSU2
++(define_automaton "hip11_fsu")
++
++(define_cpu_unit "hip11_fsu1_issue"
++                 "hip11_fsu")
++(define_cpu_unit "hip11_fsu2_issue"
++                 "hip11_fsu")
++
++(define_reservation "hip11_fsu1" "hip11_fsu1_issue")
++(define_reservation "hip11_fsu2" "hip11_fsu2_issue")
++(define_reservation "hip11_fsu_pipe" "hip11_fsu1|hip11_fsu2")
++
++;; 4. Two pipelines for branch operations, shared with ALU2 and ALU3: BRU1, BRU2
++
++;; 5. Two pipelines for load and store operations: LS1, LS2.
++
++(define_cpu_unit "hip11_ls1_issue" "hip11")
++(define_cpu_unit "hip11_ls2_issue" "hip11")
++(define_reservation "hip11_ls1" "hip11_ls1_issue")
++(define_reservation "hip11_ls2" "hip11_ls2_issue")
++
++;; Block all issue queues.
++
++(define_reservation "hip11_block" "hip11_fsu1_issue + hip11_fsu2_issue
++  + hip11_mdu_issue + hip11_alu1_issue
++  + hip11_alu2_issue + hip11_alu3_issue + hip11_ls1_issue + hip11_ls2_issue")
++
++;; Branch Execution Unit
++;;
++(define_insn_reservation "hip11_branch" 1
++  (and (eq_attr "tune" "hip11")
++       (eq_attr "type" "branch"))
++  "hip11_alu2|hip11_alu3")
++
++(define_insn_reservation "hip11_return_from_subroutine" 6
++  (and (eq_attr "tune" "hip11")
++       (eq_attr "type" "branch")
++       (eq_attr "sls_length" "retbr"))
++  "hip11_mdu,(hip11_alu2|hip11_alu3)")
++
++;; Simple Execution Unit:
++;;
++;; Simple ALU without shift
++(define_insn_reservation "hip11_alu" 1
++  (and (eq_attr "tune" "hip11")
++       (eq_attr "type" "alu_imm,logic_imm,\
++                        alu_sreg,logic_reg,\
++                        adc_imm,adc_reg,\
++                        adr,bfm,clz,rbit,rev,\
++                        shift_imm,shift_reg,\
++                        mov_imm,mov_reg,\
++                        mvn_imm,mvn_reg,\
++                        mrs,multiple,csel,\
++                        rotate_imm"))
++  "hip11_alu1|hip11_alu2|hip11_alu3")
++
++(define_insn_reservation "hip11_alus" 1
++  (and (eq_attr "tune" "hip11")
++       (eq_attr "type" "alus_imm,logics_imm,\
++                        alus_sreg,logics_reg,\
++                        adcs_imm,adcs_reg"))
++  "hip11_alu2|hip11_alu3")
++
++;; ALU ops with shift
++(define_insn_reservation "hip11_alu_shift" 2
++  (and (eq_attr "tune" "hip11")
++       (eq_attr "type" "extend,\
++                        alu_shift_imm,alu_shift_reg,\
++                        crc,logic_shift_imm,logic_shift_reg,\
++                        mov_shift,mvn_shift,\
++                        mov_shift_reg,mvn_shift_reg"))
++  "hip11_mdu")
++
++(define_insn_reservation "hip11_alus_shift" 2
++  (and (eq_attr "tune" "hip11")
++       (eq_attr "type" "alus_shift_imm,alus_shift_reg,\
++                        logics_shift_imm,logics_shift_reg"))
++  "hip11_alu2|hip11_alu3")
++
++;; Multiply instructions
++(define_insn_reservation "hip11_mult" 3
++  (and (eq_attr "tune" "hip11")
++       (ior (eq_attr "mul32" "yes")
++            (eq_attr "widen_mul64" "yes")))
"hip11_mdu") ++ ++;; Integer divide ++(define_insn_reservation "hip11_div" 10 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "udiv,sdiv")) ++ "hip11_mdu") ++ ++(define_insn_reservation "hip11_mla" 4 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "mla,smlal,umlal,smull,umull")) ++ "hip11_mdu") ++ ++;; Block all issue pipes for a cycle ++(define_insn_reservation "hip11_block" 1 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "block")) ++ "hip11_block") ++ ++;; Load-store execution Unit ++;; ++(define_insn_reservation "hip11_load1" 4 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "load_4,load_8,load_16")) ++ "hip11_ls1|hip11_ls2") ++ ++(define_insn_reservation "hip11_fp_load" 5 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "f_loads,f_loadd")) ++ "hip11_ls1|hip11_ls2") ++ ++(define_insn_reservation "hip11_neon_ld1_single" 7 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load1_one_lane,neon_load1_one_lane_q,\ ++ neon_load1_all_lanes,neon_load1_all_lanes_q")) ++ "(hip11_ls1|hip11_ls2)+hip11_fsu1") ++ ++(define_insn_reservation "hip11_neon_ld1_1reg" 5 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load1_1reg,neon_load1_1reg_q")) ++ "hip11_ls1|hip11_ls2") ++ ++(define_insn_reservation "hip11_neon_ld1_2reg" 6 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load1_2reg,neon_load1_2reg_q")) ++ "hip11_ls1|hip11_ls2") ++ ++(define_insn_reservation "hip11_neon_ld1_3reg" 7 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load1_3reg,neon_load1_3reg_q")) ++ "hip11_ls1|hip11_ls2") ++ ++(define_insn_reservation "hip11_neon_ld1_4reg" 8 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load1_4reg,neon_load1_4reg_q")) ++ "hip11_ls1|hip11_ls2") ++ ++(define_insn_reservation "hip11_neon_ld2" 8 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load2_one_lane,neon_load2_one_lane_q,\ ++ neon_load2_all_lanes,neon_load2_all_lanes_q,\ ++ neon_load2_2reg,neon_load2_2reg_q,\ ++ neon_load2_4reg,neon_load2_4reg_q")) ++ "(hip11_ls1|hip11_ls2)+hip11_fsu1") ++ ++(define_insn_reservation "hip11_neon_ld3_single" 9 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load3_one_lane,neon_load3_one_lane_q,\ ++ neon_load3_all_lanes,neon_load3_all_lanes_q")) ++ "(hip11_ls1|hip11_ls2)+hip11_fsu1") ++ ++(define_insn_reservation "hip11_neon_ld3_multiple" 13 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load3_3reg,neon_load3_3reg_q")) ++ "(hip11_ls1|hip11_ls2)+hip11_fsu1") ++ ++(define_insn_reservation "hip11_neon_ld4_single" 10 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load4_one_lane,neon_load4_one_lane_q,\ ++ neon_load4_all_lanes,neon_load4_all_lanes_q")) ++ "(hip11_ls1|hip11_ls2)+hip11_fsu1") ++ ++(define_insn_reservation "hip11_neon_ld4_multiple" 11 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_load4_4reg,neon_load4_4reg_q")) ++ "(hip11_ls1|hip11_ls2)+hip11_fsu1") ++ ++;; Stores of up to two words. ++(define_insn_reservation "hip11_store1" 1 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "store_4,store_8,store_16,\ ++ f_stored,f_stores")) ++ "hip11_ls1|hip11_ls2") ++ ++;; Floating-Point Operations. 
++(define_insn_reservation "hip11_fp_arith" 2 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "ffariths,ffarithd,f_minmaxs,\ ++ f_minmaxd,fadds,faddd,neon_fcadd")) ++ "hip11_fsu_pipe") ++ ++(define_insn_reservation "hip11_fp_mul" 3 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_fp_mul_d,neon_fp_mul_d_q,\ ++ neon_fp_mul_s_scalar,neon_fp_mul_s_scalar_q,\ ++ neon_fp_mul_d_scalar_q,fmuld,fmuls")) ++ "hip11_fsu_pipe") ++ ++(define_insn_reservation "hip11_fp_cmp" 2 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fccmpd,fccmps")) ++ "hip11alu,hip11_fsu_pipe") ++ ++(define_insn_reservation "hip11_fp_csel" 2 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fcsel")) ++ "hip11alu,hip11_fsu1") ++ ++(define_insn_reservation "hip11_fp_fcmp" 1 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fcmpd,fcmps")) ++ "hip11_fsu_pipe") ++ ++(define_insn_reservation "hip11_fp_divs" 7 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fdivs")) ++ "hip11_fsu1") ++ ++(define_insn_reservation "hip11_fp_divd" 10 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fdivd")) ++ "hip11_fsu1") ++ ++(define_insn_reservation "hip11_fp_sqrts" 9 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fsqrts")) ++ "hip11_fsu1") ++ ++(define_insn_reservation "hip11_fp_sqrtd" 15 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fsqrtd")) ++ "hip11_fsu1") ++ ++(define_insn_reservation "hip11_fp_mac" 4 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fmacs,ffmas,fmacd,ffmad")) ++ "hip11_fsu_pipe") ++ ++(define_insn_reservation "hip11_fp_mov" 1 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "fmov,neon_dup,neon_dup_q,\ ++ neon_from_gp,neon_from_gp_q,\ ++ neon_ins,neon_ins_q,\ ++ neon_to_gp,neon_to_gp_q,\ ++ neon_move,neon_move_q,\ ++ neon_rev,neon_rev_q,\ ++ neon_permute,neon_permute_q,\ ++ neon_shift_imm_narrow_q,\ ++ neon_ext,neon_ext_q,\ ++ neon_rbit,\ ++ crypto_sha3,neon_tbl1,neon_tbl1_q,\ ++ neon_tbl2_q,f_mcr,neon_tst,neon_tst_q,\ ++ neon_move_narrow_q")) ++ "hip11_fsu1") ++ ++;; ASIMD instructions ++(define_insn_reservation "hip11_asimd_simple_arithmetic" 2 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_abs,neon_abs_q,neon_neg,neon_neg_q,\ ++ neon_abd,neon_abd_q,\ ++ neon_add_long,neon_sub_long,neon_sub_widen,neon_add_widen,\ ++ neon_add_halve_narrow_q,neon_sub_halve_narrow_q,\ ++ neon_arith_acc,neon_arith_acc_q,\ ++ neon_compare,neon_compare_q,\ ++ neon_compare_zero,neon_compare_zero_q,\ ++ neon_minmax,neon_minmax_q,\ ++ neon_logic,neon_logic_q,\ ++ neon_reduc_add,neon_reduc_add_q,\ ++ neon_reduc_minmax,neon_reduc_minmax_q,\ ++ neon_fp_to_int_s,neon_fp_to_int_s_q,\ ++ neon_fp_to_int_d,neon_fp_to_int_d_q,\ ++ neon_fp_cvt_widen_s,\ ++ neon_fp_cvt_narrow_d_q,\ ++ neon_cls,neon_cls_q,\ ++ neon_cnt,neon_cnt_q,\ ++ f_rints,f_rintd,f_cvtf2i,f_cvt,\ ++ neon_tbl3,neon_fp_round_s,neon_fp_round_s_q,\ ++ neon_fp_round_d,neon_fp_round_d_q,\ ++ neon_int_to_fp_s,neon_fp_recpe_s,neon_fp_recpe_s_q,\ ++ neon_fp_recpe_d,neon_fp_recpe_d_q,\ ++ neon_fp_cvt_narrow_s_q,\ ++ crypto_aese,crypto_aesmc,\ ++ crypto_sha1_fast,crypto_sha1_xor,\ ++ crypto_sha1_slow,\ ++ crypto_sha256_fast,\ ++ crypto_sha512,crypto_sm3,\ ++ neon_qabs,neon_qabs_q,\ ++ neon_qneg,neon_qneg_q,\ ++ neon_qadd,neon_qadd_q,\ ++ neon_qsub,neon_qsub_q,\ ++ neon_add_halve,neon_add_halve_q,\ ++ neon_sub_halve,neon_sub_halve_q,\ ++ neon_fp_reduc_minmax_s,neon_fp_reduc_minmax_s_q,\ ++ neon_fp_reduc_minmax_d,neon_fp_reduc_minmax_d_q,\ ++ neon_fp_rsqrte_s,neon_fp_rsqrte_s_q,\ ++ neon_fp_rsqrte_d,neon_fp_rsqrte_d_q")) ++ "hip11_fsu1") ++ 
++(define_insn_reservation "hip11_asimd_complex_arithmetic" 4 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_mul_b,neon_mul_b_q,\ ++ neon_mul_h,neon_mul_h_q,\ ++ neon_mul_s,neon_mul_s_q,\ ++ neon_mla_b,neon_mla_b_q,\ ++ neon_mla_h,neon_mla_h_q,\ ++ neon_mla_s,\ ++ neon_mla_h_scalar,neon_mla_h_scalar_q,\ ++ neon_mla_s_scalar,neon_mla_s_scalar_q,\ ++ neon_sat_mul_h_scalar,neon_sat_mul_h_scalar_q,\ ++ neon_sat_mul_s_scalar,neon_sat_mul_s_scalar_q,\ ++ neon_sat_mul_b,neon_sat_mul_b_q,\ ++ neon_sat_mul_h,neon_sat_mul_h_q,\ ++ neon_sat_mul_s,neon_sat_mul_s_q,\ ++ neon_mla_b_long,neon_mla_h_long,neon_mla_s_long,\ ++ neon_mul_b_long,neon_mul_h_long,neon_mul_s_long,\ ++ neon_sat_mla_b_long,neon_sat_mla_h_long,neon_sat_mla_s_long,\ ++ neon_sat_mla_h_scalar_long,neon_sat_mla_s_scalar_long,\ ++ neon_sat_mul_b_long,neon_sat_mul_h_long,neon_sat_mul_s_long,\ ++ neon_sat_mul_h_scalar_long,neon_sat_mul_s_scalar_long,\ ++ crypto_pmull,\ ++ neon_sat_shift_reg,neon_sat_shift_reg_q,\ ++ neon_shift_reg,neon_shift_reg_q,\ ++ neon_shift_imm,neon_shift_imm_q,\ ++ neon_shift_imm_long,\ ++ neon_sat_shift_imm,neon_sat_shift_imm_q,\ ++ neon_sat_shift_imm_narrow_q,\ ++ neon_shift_acc,neon_shift_acc_q,\ ++ crypto_sha256_slow")) ++ "hip11_fsu1") ++ ++(define_insn_reservation "hip11_asimd_fp_compare" 2 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_fp_abs_s,neon_fp_abs_s_q,\ ++ neon_fp_abs_d,neon_fp_abs_d_q,\ ++ neon_fp_neg_s,neon_fp_neg_s_q,\ ++ neon_fp_neg_d,neon_fp_neg_d_q,\ ++ neon_fp_compare_s,neon_fp_compare_s_q,\ ++ neon_fp_compare_d,neon_fp_compare_d_q,\ ++ neon_fp_minmax_s,neon_fp_minmax_s_q,\ ++ neon_fp_minmax_d,neon_fp_minmax_d_q,\ ++ neon_fp_addsub_s,neon_fp_addsub_s_q,\ ++ neon_fp_addsub_d,neon_fp_addsub_d_q,\ ++ neon_fp_reduc_add_s,neon_fp_reduc_add_s_q,\ ++ neon_fp_reduc_add_d,neon_fp_reduc_add_d_q,\ ++ neon_fp_abd_s,neon_fp_abd_s_q,\ ++ neon_fp_abd_d,neon_fp_abd_d_q")) ++ "hip11_fsu_pipe") ++ ++(define_insn_reservation "hip11_asimd_fdiv" 10 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_fp_div_s,neon_fp_div_s_q,\ ++ neon_fp_div_d,neon_fp_div_d_q")) ++ "hip11_fsu1") ++ ++(define_insn_reservation "hip11_asimd_fsqrt" 15 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_fp_sqrt_s,neon_fp_sqrt_s_q,\ ++ neon_fp_sqrt_d,neon_fp_sqrt_d_q")) ++ "hip11_fsu1") ++ ++(define_insn_reservation "hip11_asimd_fp_multiply_add" 4 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_fp_mla_s,neon_fp_mla_s_q,\ ++ neon_fp_mla_d,neon_fp_mla_d_q,\ ++ neon_fp_mla_s_scalar,neon_fp_mla_s_scalar_q,\ ++ neon_fp_mul_s,neon_fp_mul_s_q,neon_fcmla,\ ++ neon_fp_recps_s,neon_fp_recps_s_q,\ ++ neon_fp_recps_d,neon_fp_recps_d_q,\ ++ neon_fp_rsqrts_s,neon_fp_rsqrts_s_q,\ ++ neon_fp_rsqrts_d,neon_fp_rsqrts_d_q")) ++ "hip11_fsu_pipe") ++ ++(define_insn_reservation "hip11_asimd_frecpx" 3 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_fp_recpx_s,neon_fp_recpx_s_q,\ ++ neon_fp_recpx_d,neon_fp_recpx_d_q,neon_tbl4,\ ++ neon_dot,neon_dot_q")) ++ "hip11_fsu1") ++ ++(define_insn_reservation "hip11_asimd_mmla" 6 ++ (and (eq_attr "tune" "hip11") ++ (eq_attr "type" "neon_mla_s_q")) ++ "hip11_fsu1") +diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +index a0a84c20b..ce0dd2ff6 100644 +--- a/gcc/doc/invoke.texi ++++ b/gcc/doc/invoke.texi +@@ -17049,7 +17049,7 @@ performance of the code. 
Permissible values for this option are:
+ @samp{octeontx2}, @samp{octeontx2t98}, @samp{octeontx2t96}
+ @samp{octeontx2t93}, @samp{octeontx2f95}, @samp{octeontx2f95n},
+ @samp{octeontx2f95mm},
+-@samp{a64fx},
++@samp{a64fx}, @samp{hip11},
+ @samp{thunderx}, @samp{thunderxt88},
+ @samp{thunderxt88p1}, @samp{thunderxt81}, @samp{tsv110},
+ @samp{thunderxt83}, @samp{thunderx2t99}, @samp{thunderx3t110}, @samp{zeus},
+-- 
+2.33.0
+
diff --git a/gcc.spec b/gcc.spec
index 49207d9..8522528 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -61,7 +61,7 @@ Summary: Various compilers (C, C++, Objective-C, ...)
 Name: gcc
 Version: %{gcc_version}
-Release: 41
+Release: 42
 License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD
 URL: https://gcc.gnu.org
@@ -249,6 +249,54 @@ Patch138: 0138-Fix-ICE-bugs-in-transpose-test-cases-with-vector-ind.patch
 Patch139: 0139-Fix-errors-on-testsuite-c-c-tests-and-505.mcf_r.patch
 Patch140: 0140-Fix-an-error-in-memory-allocation-deallocation.patch
 Patch141: 0141-Fix-warnings-and-errors-with-debug-prints.patch
+Patch142: 0142-crc-loop-optimization-initial.patch
+Patch143: 0143-Perform-early-if-conversion-of-simple-arithmetic.patch
+Patch144: 0144-Add-option-to-allow-matching-uaddsub-overflow-for-wi.patch
+Patch145: 0145-Match-double-sized-mul-pattern.patch
+Patch146: 0146-LOOP-CRC32-Add-Crc32-Optimization-in-Gzip-For-crc32-.patch
+Patch147: 0147-add-insn-defs-and-correct-costs-for-cmlt-generation.patch
+Patch148: 0148-Introduce-RTL-ifcvt-enhancements.patch
+Patch149: 0149-Add-more-flexible-check-for-pointer-aliasing-during-.patch
+Patch150: 0150-Implement-propagation-of-permutations-in-fwprop.patch
+Patch151: 0151-Fix-bugs-and-add-tests-for-RTL-ifcvt.patch
+Patch152: 0152-Add-LLC-Allocation-Pass.patch
+Patch153: 0153-LLC-add-extending-outer-loop.patch
+Patch154: 0154-Loop-CRC32-Judge-null-on-pointers-and-solving-coding.patch
+Patch155: 0155-Add-maxmin-and-uzp1-uzp2-combining.patch
+Patch156: 0156-add-icp-optimization.patch
+Patch157: 0157-Add-split-complex-instructions-pass.patch
+Patch158: 0158-Implement-IPA-prefetch-optimization.patch
+Patch159: 0159-Implement-AES-pattern-matching.patch
+Patch160: 0160-AES-Add-lost-files.patch
+Patch161: 0161-Fix-lost-ftree-fold-phiopt-option-in-tests.patch
+Patch162: 0162-rtl-ifcvt-free-dominance-info-before-cleanup_cfg.patch
+Patch163: 0163-Loop-CRC-Solving-the-problem-of-insufficient-CRC-tab.patch
+Patch164: 0164-LLC-Allocation-Fix-some-bugs-and-remove-variable-pre.patch
+Patch165: 0165-rtl-ifcvt-BugFix-change-def-selection-logic-in-noce_.patch
+Patch166: 0166-perm-propagation-Bugfix-Check-that-the-arithmetic-op.patch
+Patch167: 0167-perm-propagation-Bugfix-Fix-shll-shll2-patterns-for-.patch
+Patch168: 0168-LLC-Allocation-Bugfix-Terminate-kernel-filtering-for.patch
+Patch169: 0169-Struct-Reorg-Fix-several-bugs.patch
+Patch170: 0170-DFE-Add-escape-check.patch
+Patch171: 0171-phiopt-testsuite-Add-ftree-fold-phiopt-option-to-5-t.patch
+Patch172: 0172-minmax-Move-minmax-pattern-to-gimple.patch
+Patch173: 0173-IPA-Fix-test-completion-1.c.patch
+Patch174: 0174-IPA-Fix-fails-on-checked-build-and-comments-from-rev.patch
+Patch175: 0175-split-ldp-stp-Extending-and-refactoring-of-pass_spli.patch
+Patch176: 0176-Fix-bugs-in-ICP-src-openEuler-gcc-I8PYBF-I8PYLL.patch
+Patch177: 0177-Fix-sqlite-build.patch
+Patch178: 0178-Fix-freetype-build.patch
+Patch179: 0179-rtl-ifcvt-refuse-to-rename-def-in-the-last-instructi.patch
+Patch180: 0180-add-optimization-level-requirement-to-the-gate.patch
+Patch181: 0181-Fix-issue-I8QD9H.patch
+Patch182: 0182-Fix-bugs-in-ICP-src-openEuler-gcc-I8RKFJ.patch +Patch183: 0183-Fix-fail-in-ICP-src-openEuler-gcc-I8RP4H.patch +Patch184: 0184-Fix-fail-in-IPA-prefetch-src-openEuler-gcc-I8RURA.patch +Patch185: 0185-Fix-fail-in-IPA-prefetch-src-openEuler-gcc-I8RV7T.patch +Patch186: 0186-Loop-CRC-Solving-the-problem-of-insufficient-CRC-tab.patch +Patch187: 0187-Add-IPA-prefetch-test.patch +Patch188: 0188-Fix-fails-in-ICP-for-src-openEuler-gcc-I90P7M-I91CZ8.patch +Patch189: 0189-Add-hip11-CPU-pipeline-scheduling.patch %global gcc_target_platform %{_arch}-linux-gnu @@ -843,6 +891,54 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch139 -p1 %patch140 -p1 %patch141 -p1 +%patch142 -p1 +%patch143 -p1 +%patch144 -p1 +%patch145 -p1 +%patch146 -p1 +%patch147 -p1 +%patch148 -p1 +%patch149 -p1 +%patch150 -p1 +%patch151 -p1 +%patch152 -p1 +%patch153 -p1 +%patch154 -p1 +%patch155 -p1 +%patch156 -p1 +%patch157 -p1 +%patch158 -p1 +%patch159 -p1 +%patch160 -p1 +%patch161 -p1 +%patch162 -p1 +%patch163 -p1 +%patch164 -p1 +%patch165 -p1 +%patch166 -p1 +%patch167 -p1 +%patch168 -p1 +%patch169 -p1 +%patch170 -p1 +%patch171 -p1 +%patch172 -p1 +%patch173 -p1 +%patch174 -p1 +%patch175 -p1 +%patch176 -p1 +%patch177 -p1 +%patch178 -p1 +%patch179 -p1 +%patch180 -p1 +%patch181 -p1 +%patch182 -p1 +%patch183 -p1 +%patch184 -p1 +%patch185 -p1 +%patch186 -p1 +%patch187 -p1 +%patch188 -p1 +%patch189 -p1 %build @@ -2877,6 +2973,12 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Tue May 28 2024 zhengchenhui - 10.3.1-42 +- Type:Spec +- ID:NA +- SUG:NA +- DESC: Sync patch from openeuler/gcc + * Mon Apr 15 2024 huyubiao - 10.3.1-41 - Type:SPEC - ID:NA -- Gitee
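
Usage note (an editorial sketch, not part of the patch series): once
0189-Add-hip11-CPU-pipeline-scheduling.patch is applied, the new core is
selected through the existing -mcpu/-mtune options documented in the
invoke.texi hunk above, for example:

    gcc -O2 -mcpu=hip11 -c test.c

Here -mcpu=hip11 enables both the hip11 feature flags registered in
aarch64-cores.def (SVE, SVE2, F16) and the hip11.md scheduling model,
while -mtune=hip11 would apply only the tuning and scheduling parts.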