From 29d4e6bc5abc7a2e740b2e9e2d82cd89f8fa4639 Mon Sep 17 00:00:00 2001
From: zhaowenyu <804544223@qq.com>
Date: Mon, 20 Jun 2022 22:57:09 +0800
Subject: [PATCH 01/14] [Backport] Extend special_memory_constraint.

Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=4de7b010038933dd6ca96bf186ca49f243d0def6

For an operand with a special memory constraint, the memory operand may
be wrapped in another rtx.  Extract the MEM from the operand for
conditional judgements such as MEM_P, and also for record_address_regs.
---
 gcc/ira-costs.c       | 12 +++++++-----
 gcc/ira.c             |  2 +-
 gcc/lra-constraints.c | 28 +++++++++++++++++++++++-----
 gcc/recog.c           |  7 +++++--
 gcc/rtl.h             |  1 +
 5 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/gcc/ira-costs.c b/gcc/ira-costs.c
index 6891156b5aa..aeda6588bcd 100644
--- a/gcc/ira-costs.c
+++ b/gcc/ira-costs.c
@@ -781,7 +781,8 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
 
 	    case CT_SPECIAL_MEMORY:
 	      insn_allows_mem[i] = allows_mem[i] = 1;
-	      if (MEM_P (op) && constraint_satisfied_p (op, cn))
+	      if (MEM_P (extract_mem_from_operand (op))
+		  && constraint_satisfied_p (op, cn))
 		win = 1;
 	      break;
 
@@ -1397,15 +1398,16 @@ record_operand_costs (rtx_insn *insn, enum reg_class *pref)
      commutative.  */
   for (i = 0; i < recog_data.n_operands; i++)
     {
+      rtx op_mem = extract_mem_from_operand (recog_data.operand[i]);
       memcpy (op_costs[i], init_cost, struct_costs_size);
 
       if (GET_CODE (recog_data.operand[i]) == SUBREG)
 	recog_data.operand[i] = SUBREG_REG (recog_data.operand[i]);
 
-      if (MEM_P (recog_data.operand[i]))
-	record_address_regs (GET_MODE (recog_data.operand[i]),
-			     MEM_ADDR_SPACE (recog_data.operand[i]),
-			     XEXP (recog_data.operand[i], 0),
+      if (MEM_P (op_mem))
+	record_address_regs (GET_MODE (op_mem),
+			     MEM_ADDR_SPACE (op_mem),
+			     XEXP (op_mem, 0),
 			     0, MEM, SCRATCH, frequency * 2);
       else if (constraints[i][0] == 'p'
 	       || (insn_extra_address_constraint
diff --git a/gcc/ira.c b/gcc/ira.c
index 681ec2f46f9..c136502298c 100644
--- a/gcc/ira.c
+++ b/gcc/ira.c
@@ -1868,7 +1868,7 @@ ira_setup_alts (rtx_insn *insn)
 
 	      case CT_MEMORY:
 	      case CT_SPECIAL_MEMORY:
-		if (MEM_P (op))
+		if (MEM_P (extract_mem_from_operand (op)))
 		  goto op_success;
 		win_p = true;
 		break;
diff --git a/gcc/lra-constraints.c b/gcc/lra-constraints.c
index 7cc479b3042..df75c7b947b 100644
--- a/gcc/lra-constraints.c
+++ b/gcc/lra-constraints.c
@@ -409,14 +409,34 @@ valid_address_p (rtx op, struct address_info *ad,
   return valid_address_p (ad->mode, *ad->outer, ad->as);
 }
 
+/* For special_memory_operand, MEM_P (op) can be false, e.g. for
+   bcst_mem_operand in the i386 backend.  Extract and return the real
+   memory operand, or OP itself if no MEM is found.  */
+rtx
+extract_mem_from_operand (rtx op)
+{
+  for (rtx x = op;; x = XEXP (x, 0))
+    {
+      if (MEM_P (x))
+	return x;
+      if (GET_RTX_LENGTH (GET_CODE (x)) != 1
+	  || GET_RTX_FORMAT (GET_CODE (x))[0] != 'e')
+	break;
+    }
+  return op;
+}
+
 /* Return true if the eliminated form of memory reference OP satisfies
    extra (special) memory constraint CONSTRAINT.  */
 static bool
 satisfies_memory_constraint_p (rtx op, enum constraint_num constraint)
 {
   struct address_info ad;
+  rtx mem = extract_mem_from_operand (op);
+  if (!MEM_P (mem))
+    return false;
 
-  decompose_mem_address (&ad, op);
+  decompose_mem_address (&ad, mem);
   address_eliminator eliminator (&ad);
   return constraint_satisfied_p (op, constraint);
 }
@@ -2344,8 +2364,7 @@ process_alt_operands (int only_alternative)
 	      break;
 
 	    case CT_MEMORY:
-	      if (MEM_P (op)
-		  && satisfies_memory_constraint_p (op, cn))
+	      if (satisfies_memory_constraint_p (op, cn))
 		win = true;
 	      else if (spilled_pseudo_p (op))
 		win = true;
@@ -2386,8 +2405,7 @@ process_alt_operands (int only_alternative)
 	      break;
 
 	    case CT_SPECIAL_MEMORY:
-	      if (MEM_P (op)
-		  && satisfies_memory_constraint_p (op, cn))
+	      if (satisfies_memory_constraint_p (op, cn))
 		win = true;
 	      else if (spilled_pseudo_p (op))
 		win = true;
diff --git a/gcc/recog.c b/gcc/recog.c
index 2720aaaac85..8674054b95f 100644
--- a/gcc/recog.c
+++ b/gcc/recog.c
@@ -1798,7 +1798,8 @@ asm_operand_ok (rtx op, const char *constraint, const char **constraints)
       case CT_MEMORY:
       case CT_SPECIAL_MEMORY:
 	/* Every memory operand can be reloaded to fit.  */
-	result = result || memory_operand (op, VOIDmode);
+	result = result || memory_operand (extract_mem_from_operand (op),
+					   VOIDmode);
 	break;
 
       case CT_ADDRESS:
@@ -2584,7 +2585,9 @@ constrain_operands (int strict, alternative_mask alternatives)
 
 	/* A unary operator may be accepted by the predicate, but it
 	   is irrelevant for matching constraints.  */
-	if (UNARY_P (op))
+	/* For special_memory_operand, there could be a memory operand inside,
+	   and it would cause a mismatch for constraint_satisfied_p.  */
+	if (UNARY_P (op) && op == extract_mem_from_operand (op))
 	  op = XEXP (op, 0);
 
 	if (GET_CODE (op) == SUBREG)
diff --git a/gcc/rtl.h b/gcc/rtl.h
index b29afca8d6b..35fb6ba73a0 100644
--- a/gcc/rtl.h
+++ b/gcc/rtl.h
@@ -4323,6 +4323,7 @@ extern rtx gen_hard_reg_clobber (machine_mode, unsigned int);
 extern rtx get_reg_known_value (unsigned int);
 extern bool get_reg_known_equiv_p (unsigned int);
 extern rtx get_reg_base_value (unsigned int);
+extern rtx extract_mem_from_operand (rtx);
 
 #ifdef STACK_REGS
 extern int stack_regs_mentioned (const_rtx insn);
-- 
Gitee

From 0ab8898b378d47cd907a869fccc9c686c884fa34 Mon Sep 17 00:00:00 2001
From: zhaowenyu <804544223@qq.com>
Date: Mon, 20 Jun 2022 22:57:55 +0800
Subject: [PATCH 02/14] [Backport] ira: Fix unnecessary register spill

Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=edf95e51e53697f3050f076675c26a4cece17741

The variables first_moveable_pseudo and last_moveable_pseudo aren't
reset after compiling a function, which means they leak into the first
scheduler pass of the following function.  In some cases, this can
cause an extra spill during register allocation of the second function.
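As an illustration only (this sketch is not part of the patch, and the
simplified declarations are hypothetical): move_unallocated_pseudos
walks the range delimited by first_moveable_pseudo and
last_moveable_pseudo, so without a reset the range computed for one
function is still live while the next function is allocated.

  /* Sketch of the failure mode, assuming simplified globals.  */
  static int first_moveable_pseudo, last_moveable_pseudo;

  static void
  move_unallocated_pseudos_sketch (void)
  {
    /* Without the reset below, this loop can still cover pseudos
       that belong to the previously compiled function.  */
    for (int i = first_moveable_pseudo; i < last_moveable_pseudo; i++)
      ; /* ... move or spill pseudo I ... */

    /* The fix: clear the range once the current function is done.  */
    first_moveable_pseudo = last_moveable_pseudo = 0;
  }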
---
 gcc/ira.c                                  |  2 ++
 gcc/testsuite/gcc.target/aarch64/nospill.c | 35 ++++++++++++++++++++++
 2 files changed, 37 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/nospill.c

diff --git a/gcc/ira.c b/gcc/ira.c
index c136502298c..7e4668c29f2 100644
--- a/gcc/ira.c
+++ b/gcc/ira.c
@@ -5130,6 +5130,8 @@ move_unallocated_pseudos (void)
 	      INSN_UID (newinsn), i);
 	  SET_REG_N_REFS (i, 0);
 	}
+
+  first_moveable_pseudo = last_moveable_pseudo = 0;
 }
 
 /* If the backend knows where to allocate pseudos for hard
diff --git a/gcc/testsuite/gcc.target/aarch64/nospill.c b/gcc/testsuite/gcc.target/aarch64/nospill.c
new file mode 100644
index 00000000000..968a4267e0d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/nospill.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+/* The pseudo for P is marked as moveable in the IRA pass.  */
+float
+func_0 (float a, float b, float c)
+{
+  float p = c / a;
+
+  if (b > 1)
+    {
+      b /= p;
+      if (c > 2)
+	a /= 3;
+    }
+
+  return b / c * a;
+}
+
+/* If first_moveable_pseudo and last_moveable_pseudo are not reset correctly,
+   they will carry over and spill the pseudo for Q.  */
+float
+func_1 (float a, float b, float c)
+{
+  float q = a + b;
+
+  c *= a / (b + b);
+  if (a > 0)
+    c *= q;
+
+  return a * b * c;
+}
+
+/* We have plenty of spare registers, so check nothing has been spilled.  */
+/* { dg-final { scan-assembler-not "\tstr\t" } } */
-- 
Gitee

From cce165db19a36c1dbdb3464bdf1acab1255b6f43 Mon Sep 17 00:00:00 2001
From: zhaowenyu <804544223@qq.com>
Date: Mon, 20 Jun 2022 22:58:28 +0800
Subject: [PATCH 03/14] [Backport] Take insn scratch RA requirements into account in IRA.

Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=44fbc9c6e02ca5b8f98f25b514ed7588e7ba733d

The patch changes insn scratches which require registers for all insn
alternatives (in other words, without an X constraint in the scratch
constraint string) into pseudos.  This is done before IRA starts its
work.  LRA still continues to change the remaining scratches (those
with an X constraint, and those in insns created during IRA) into
pseudos.  As before the patch, at the end of LRA's work, spilled
scratch pseudos (for which the X constraint was chosen) are changed
back into scratches.
---
 gcc/ira.c             | 217 +++++++++++++++++++++++++++++++++++++++---
 gcc/ira.h             |   7 ++
 gcc/lra-constraints.c |   8 +-
 gcc/lra-int.h         |   3 -
 gcc/lra-remat.c       |   4 +-
 gcc/lra-spills.c      |  10 +-
 gcc/lra.c             | 165 +++----------------------------
 7 files changed, 236 insertions(+), 178 deletions(-)

diff --git a/gcc/ira.c b/gcc/ira.c
index 7e4668c29f2..573e5246dd5 100644
--- a/gcc/ira.c
+++ b/gcc/ira.c
@@ -5133,7 +5133,191 @@ move_unallocated_pseudos (void)
 
   first_moveable_pseudo = last_moveable_pseudo = 0;
 }
+
+
+
+/* Code dealing with scratches (changing them onto
+   pseudos and restoring them from the pseudos).
+
+   We change scratches into pseudos at the beginning of IRA to
+   simplify dealing with them (conflicts, hard register assignments).
+
+   If the pseudo denoting scratch was spilled it means that we do not
+   need a hard register for it.  Such pseudos are transformed back to
+   scratches at the end of LRA.  */
+
+/* Description of location of a former scratch operand.  */
+struct sloc
+{
+  rtx_insn *insn; /* Insn where the scratch was.  */
+  int nop;  /* Number of the operand which was a scratch.  */
+  unsigned regno; /* regno generated instead of scratch */
+  int icode;  /* Original icode from which scratch was removed.  */
+};
+
+typedef struct sloc *sloc_t;
+
+/* Locations of the former scratches.  */
+static vec<sloc_t> scratches;
+
+/* Bitmap of scratch regnos. 
*/ +static bitmap_head scratch_bitmap; + +/* Bitmap of scratch operands. */ +static bitmap_head scratch_operand_bitmap; + +/* Return true if pseudo REGNO is made of SCRATCH. */ +bool +ira_former_scratch_p (int regno) +{ + return bitmap_bit_p (&scratch_bitmap, regno); +} + +/* Return true if the operand NOP of INSN is a former scratch. */ +bool +ira_former_scratch_operand_p (rtx_insn *insn, int nop) +{ + return bitmap_bit_p (&scratch_operand_bitmap, + INSN_UID (insn) * MAX_RECOG_OPERANDS + nop) != 0; +} + +/* Register operand NOP in INSN as a former scratch. It will be + changed to scratch back, if it is necessary, at the LRA end. */ +void +ira_register_new_scratch_op (rtx_insn *insn, int nop, int icode) +{ + rtx op = *recog_data.operand_loc[nop]; + sloc_t loc = XNEW (struct sloc); + ira_assert (REG_P (op)); + loc->insn = insn; + loc->nop = nop; + loc->regno = REGNO (op); + loc->icode = icode; + scratches.safe_push (loc); + bitmap_set_bit (&scratch_bitmap, REGNO (op)); + bitmap_set_bit (&scratch_operand_bitmap, + INSN_UID (insn) * MAX_RECOG_OPERANDS + nop); + add_reg_note (insn, REG_UNUSED, op); +} + +/* Return true if string STR contains constraint 'X'. */ +static bool +contains_X_constraint_p (const char *str) +{ + int c; + + while ((c = *str)) + { + str += CONSTRAINT_LEN (c, str); + if (c == 'X') return true; + } + return false; +} + +/* Change INSN's scratches into pseudos and save their location. */ +bool +ira_remove_insn_scratches (rtx_insn *insn, bool all_p, FILE *dump_file, + rtx (*get_reg) (rtx original)) +{ + int i; + bool insn_changed_p; + rtx reg, *loc; + + extract_insn (insn); + insn_changed_p = false; + for (i = 0; i < recog_data.n_operands; i++) + { + loc = recog_data.operand_loc[i]; + if (GET_CODE (*loc) == SCRATCH && GET_MODE (*loc) != VOIDmode) + { + if (! all_p && contains_X_constraint_p (recog_data.constraints[i])) + continue; + insn_changed_p = true; + *loc = reg = get_reg (*loc); + ira_register_new_scratch_op (insn, i, INSN_CODE (insn)); + if (ira_dump_file != NULL) + fprintf (dump_file, + "Removing SCRATCH to p%u in insn #%u (nop %d)\n", + REGNO (reg), INSN_UID (insn), i); + } + } + return insn_changed_p; +} + +/* Return new register of the same mode as ORIGINAL. Used in + ira_remove_scratches. */ +static rtx +get_scratch_reg (rtx original) +{ + return gen_reg_rtx (GET_MODE (original)); +} + +/* Change scratches into pseudos and save their location. */ +void +ira_remove_scratches (void) +{ + basic_block bb; + rtx_insn *insn; + + scratches.create (get_max_uid ()); + bitmap_initialize (&scratch_bitmap, ®_obstack); + bitmap_initialize (&scratch_operand_bitmap, ®_obstack); + FOR_EACH_BB_FN (bb, cfun) + FOR_BB_INSNS (bb, insn) + if (INSN_P (insn) + && ira_remove_insn_scratches (insn, false, ira_dump_file, get_scratch_reg)) + /* Because we might use DF, we need to keep DF info up to date. */ + df_insn_rescan (insn); +} + +/* Changes pseudos created by function remove_scratches onto scratches. */ +void +ira_restore_scratches (FILE *dump_file) +{ + int regno, n; + unsigned i; + rtx *op_loc; + sloc_t loc; + + for (i = 0; scratches.iterate (i, &loc); i++) + { + /* Ignore already deleted insns. */ + if (NOTE_P (loc->insn) + && NOTE_KIND (loc->insn) == NOTE_INSN_DELETED) + continue; + extract_insn (loc->insn); + if (loc->icode != INSN_CODE (loc->insn)) + { + /* The icode doesn't match, which means the insn has been modified + (e.g. register elimination). The scratch cannot be restored. 
*/ + continue; + } + op_loc = recog_data.operand_loc[loc->nop]; + if (REG_P (*op_loc) + && ((regno = REGNO (*op_loc)) >= FIRST_PSEUDO_REGISTER) + && reg_renumber[regno] < 0) + { + /* It should be only case when scratch register with chosen + constraint 'X' did not get memory or hard register. */ + ira_assert (ira_former_scratch_p (regno)); + *op_loc = gen_rtx_SCRATCH (GET_MODE (*op_loc)); + for (n = 0; n < recog_data.n_dups; n++) + *recog_data.dup_loc[n] + = *recog_data.operand_loc[(int) recog_data.dup_num[n]]; + if (dump_file != NULL) + fprintf (dump_file, "Restoring SCRATCH in insn #%u(nop %d)\n", + INSN_UID (loc->insn), loc->nop); + } + } + for (i = 0; scratches.iterate (i, &loc); i++) + free (loc); + scratches.release (); + bitmap_clear (&scratch_bitmap); + bitmap_clear (&scratch_operand_bitmap); +} + + /* If the backend knows where to allocate pseudos for hard register initial values, register these allocations now. */ static void @@ -5182,8 +5366,10 @@ allocate_initial_values (void) &hreg, &preg)); } } + + /* True when we use LRA instead of reload pass for the current function. */ bool ira_use_lra_p; @@ -5204,6 +5390,17 @@ ira (FILE *f) bool saved_flag_caller_saves = flag_caller_saves; enum ira_region saved_flag_ira_region = flag_ira_region; + if (flag_ira_verbose < 10) + { + internal_flag_ira_verbose = flag_ira_verbose; + ira_dump_file = f; + } + else + { + internal_flag_ira_verbose = flag_ira_verbose - 10; + ira_dump_file = stderr; + } + clear_bb_flags (); /* Determine if the current function is a leaf before running IRA @@ -5250,17 +5447,6 @@ ira (FILE *f) if (flag_caller_saves && !ira_use_lra_p) init_caller_save (); - if (flag_ira_verbose < 10) - { - internal_flag_ira_verbose = flag_ira_verbose; - ira_dump_file = f; - } - else - { - internal_flag_ira_verbose = flag_ira_verbose - 10; - ira_dump_file = stderr; - } - setup_prohibited_mode_move_regs (); decrease_live_ranges_number (); df_note_add_problem (); @@ -5305,9 +5491,6 @@ ira (FILE *f) if (warn_clobbered) generate_setjmp_warnings (); - if (resize_reg_info () && flag_ira_loop_pressure) - ira_set_pseudo_classes (true, ira_dump_file); - init_alias_analysis (); loop_optimizer_init (AVOID_CFG_MODIFICATIONS); reg_equiv = XCNEWVEC (struct equivalence, max_reg_num ()); @@ -5331,6 +5514,12 @@ ira (FILE *f) end_alias_analysis (); free (reg_equiv); + if (ira_use_lra_p) + ira_remove_scratches (); + + if (resize_reg_info () && flag_ira_loop_pressure) + ira_set_pseudo_classes (true, ira_dump_file); + setup_reg_equiv (); grow_reg_equivs (); setup_reg_equiv_init (); diff --git a/gcc/ira.h b/gcc/ira.h index 09f40ef6a78..c30f36aecca 100644 --- a/gcc/ira.h +++ b/gcc/ira.h @@ -207,6 +207,13 @@ extern bool ira_bad_reload_regno (int, rtx, rtx); extern void ira_adjust_equiv_reg_cost (unsigned, int); +extern bool ira_former_scratch_p (int regno); +extern bool ira_former_scratch_operand_p (rtx_insn *insn, int nop); +extern void ira_register_new_scratch_op (rtx_insn *insn, int nop, int icode); +extern bool ira_remove_insn_scratches (rtx_insn *insn, bool all_p, FILE *dump_file, + rtx (*get_reg) (rtx original)); +extern void ira_restore_scratches (FILE *dump_file); + /* ira-costs.c */ extern void ira_costs_c_finalize (void); diff --git a/gcc/lra-constraints.c b/gcc/lra-constraints.c index df75c7b947b..ccc638a7d57 100644 --- a/gcc/lra-constraints.c +++ b/gcc/lra-constraints.c @@ -2443,7 +2443,7 @@ process_alt_operands (int only_alternative) while ((p += len), c); scratch_p = (operand_reg[nop] != NULL_RTX - && lra_former_scratch_p (REGNO 
(operand_reg[nop]))); + && ira_former_scratch_p (REGNO (operand_reg[nop]))); /* Record which operands fit this alternative. */ if (win) { @@ -4297,8 +4297,8 @@ curr_insn_transform (bool check_only_p) assigment pass and the scratch pseudo will be spilled. Spilled scratch pseudos are transformed back to scratches at the LRA end. */ - && lra_former_scratch_operand_p (curr_insn, i) - && lra_former_scratch_p (REGNO (op))) + && ira_former_scratch_operand_p (curr_insn, i) + && ira_former_scratch_p (REGNO (op))) { int regno = REGNO (op); lra_change_class (regno, NO_REGS, " Change to", true); @@ -4319,7 +4319,7 @@ curr_insn_transform (bool check_only_p) && goal_alt[i] != NO_REGS && REG_P (op) && (regno = REGNO (op)) >= FIRST_PSEUDO_REGISTER && regno < new_regno_start - && ! lra_former_scratch_p (regno) + && ! ira_former_scratch_p (regno) && reg_renumber[regno] < 0 /* Check that the optional reload pseudo will be able to hold given mode value. */ diff --git a/gcc/lra-int.h b/gcc/lra-int.h index 01fcbfa2664..f9e99a28baa 100644 --- a/gcc/lra-int.h +++ b/gcc/lra-int.h @@ -319,9 +319,6 @@ extern struct lra_insn_reg *lra_get_insn_regs (int); extern void lra_free_copies (void); extern void lra_create_copy (int, int, int); extern lra_copy_t lra_get_copy (int); -extern bool lra_former_scratch_p (int); -extern bool lra_former_scratch_operand_p (rtx_insn *, int); -extern void lra_register_new_scratch_op (rtx_insn *, int, int); extern int lra_new_regno_start; extern int lra_constraint_new_regno_start; diff --git a/gcc/lra-remat.c b/gcc/lra-remat.c index 09c3975bc6c..4b6308bc0dc 100644 --- a/gcc/lra-remat.c +++ b/gcc/lra-remat.c @@ -1036,12 +1036,12 @@ update_scratch_ops (rtx_insn *remat_insn) if (! REG_P (*loc)) continue; int regno = REGNO (*loc); - if (! lra_former_scratch_p (regno)) + if (! ira_former_scratch_p (regno)) continue; *loc = lra_create_new_reg (GET_MODE (*loc), *loc, lra_get_allocno_class (regno), "scratch pseudo copy"); - lra_register_new_scratch_op (remat_insn, i, id->icode); + ira_register_new_scratch_op (remat_insn, i, id->icode); } } diff --git a/gcc/lra-spills.c b/gcc/lra-spills.c index 0caa4acd3b5..8082a5b489f 100644 --- a/gcc/lra-spills.c +++ b/gcc/lra-spills.c @@ -446,7 +446,7 @@ remove_pseudos (rtx *loc, rtx_insn *insn) it might result in an address reload for some targets. In any case we transform such pseudos not getting hard registers into scratches back. */ - && ! lra_former_scratch_p (i)) + && ! ira_former_scratch_p (i)) { if (lra_reg_info[i].nrefs == 0 && pseudo_slots[i].mem == NULL && spill_hard_reg[i] == NULL) @@ -494,7 +494,7 @@ spill_pseudos (void) for (i = FIRST_PSEUDO_REGISTER; i < regs_num; i++) { if (lra_reg_info[i].nrefs != 0 && lra_get_regno_hard_regno (i) < 0 - && ! lra_former_scratch_p (i)) + && ! ira_former_scratch_p (i)) { bitmap_set_bit (spilled_pseudos, i); bitmap_ior_into (changed_insns, &lra_reg_info[i].insn_bitmap); @@ -578,7 +578,7 @@ lra_need_for_scratch_reg_p (void) for (i = FIRST_PSEUDO_REGISTER; i < max_regno; i++) if (lra_reg_info[i].nrefs != 0 && lra_get_regno_hard_regno (i) < 0 - && lra_former_scratch_p (i)) + && ira_former_scratch_p (i)) return true; return false; } @@ -591,7 +591,7 @@ lra_need_for_spills_p (void) for (i = FIRST_PSEUDO_REGISTER; i < max_regno; i++) if (lra_reg_info[i].nrefs != 0 && lra_get_regno_hard_regno (i) < 0 - && ! lra_former_scratch_p (i)) + && ! 
ira_former_scratch_p (i))
       return true;
   return false;
 }
 
@@ -612,7 +612,7 @@ lra_spill (void)
   for (n = 0, i = FIRST_PSEUDO_REGISTER; i < regs_num; i++)
     if (lra_reg_info[i].nrefs != 0 && lra_get_regno_hard_regno (i) < 0
 	/* We do not want to assign memory for former scratches.  */
-	&& ! lra_former_scratch_p (i))
+	&& ! ira_former_scratch_p (i))
       pseudo_regnos[n++] = i;
   lra_assert (n > 0);
   pseudo_slots = XNEWVEC (struct pseudo_slot, regs_num);
diff --git a/gcc/lra.c b/gcc/lra.c
index 3543ce3993c..f97bb8e077b 100644
--- a/gcc/lra.c
+++ b/gcc/lra.c
@@ -160,8 +160,6 @@ static void invalidate_insn_recog_data (int);
 static int get_insn_freq (rtx_insn *);
 static void invalidate_insn_data_regno_info (lra_insn_recog_data_t,
 					     rtx_insn *, int);
-static void remove_scratches_1 (rtx_insn *);
-
 /* Expand all regno related info needed for LRA.  */
 static void
 expand_reg_data (int old)
@@ -482,6 +480,8 @@ lra_emit_add (rtx x, rtx y, rtx z)
 /* The number of emitted reload insns so far.  */
 int lra_curr_reload_num;
 
+static void remove_insn_scratches (rtx_insn *insn);
+
 /* Emit x := y, processing special case when y = u + v or y = u + v
    * scale + w through emit_add (Y can be an address which is base +
    index reg * scale + displacement in general case).  X may be used
@@ -503,7 +503,7 @@ lra_emit_move (rtx x, rtx y)
       /* The move pattern may require scratch registers, so convert
	 them into real registers now.  */
       if (insn != NULL_RTX)
-	remove_scratches_1 (insn);
+	remove_insn_scratches (insn);
       if (REG_P (x))
	lra_reg_info[ORIGINAL_REGNO (x)].last_reload = ++lra_curr_reload_num;
       /* Function emit_move can create pseudos -- so expand the pseudo
@@ -1988,170 +1988,35 @@ lra_substitute_pseudo_within_insn (rtx_insn *insn, int old_regno,
 
 
 
-/* This page contains code dealing with scratches (changing them onto
-   pseudos and restoring them from the pseudos).
-
-   We change scratches into pseudos at the beginning of LRA to
-   simplify dealing with them (conflicts, hard register assignments).
-
-   If the pseudo denoting scratch was spilled it means that we do need
-   a hard register for it.  Such pseudos are transformed back to
-   scratches at the end of LRA.  */
-
-/* Description of location of a former scratch operand.  */
-struct sloc
+/* Return new register of the same mode as ORIGINAL of class ALL_REGS.
+   Used in ira_remove_scratches.  */
+static rtx
+get_scratch_reg (rtx original)
 {
-  rtx_insn *insn; /* Insn where the scratch was.  */
-  int nop;  /* Number of the operand which was a scratch.  */
-  int icode;  /* Original icode from which scratch was removed.  */
-};
-
-typedef struct sloc *sloc_t;
-
-/* Locations of the former scratches.  */
-static vec<sloc_t> scratches;
-
-/* Bitmap of scratch regnos.  */
-static bitmap_head scratch_bitmap;
-
-/* Bitmap of scratch operands.  */
-static bitmap_head scratch_operand_bitmap;
-
-/* Return true if pseudo REGNO is made of SCRATCH.  */
-bool
-lra_former_scratch_p (int regno)
-{
-  return bitmap_bit_p (&scratch_bitmap, regno);
+  return lra_create_new_reg (GET_MODE (original), original, ALL_REGS, NULL);
 }
 
-/* Return true if the operand NOP of INSN is a former scratch.  */
-bool
-lra_former_scratch_operand_p (rtx_insn *insn, int nop)
-{
-  return bitmap_bit_p (&scratch_operand_bitmap,
-		       INSN_UID (insn) * MAX_RECOG_OPERANDS + nop) != 0;
-}
-
-/* Register operand NOP in INSN as a former scratch.  It will be
-   changed to scratch back, if it is necessary, at the LRA end. 
*/ -void -lra_register_new_scratch_op (rtx_insn *insn, int nop, int icode) -{ - lra_insn_recog_data_t id = lra_get_insn_recog_data (insn); - rtx op = *id->operand_loc[nop]; - sloc_t loc = XNEW (struct sloc); - lra_assert (REG_P (op)); - loc->insn = insn; - loc->nop = nop; - loc->icode = icode; - scratches.safe_push (loc); - bitmap_set_bit (&scratch_bitmap, REGNO (op)); - bitmap_set_bit (&scratch_operand_bitmap, - INSN_UID (insn) * MAX_RECOG_OPERANDS + nop); - add_reg_note (insn, REG_UNUSED, op); -} - -/* Change INSN's scratches into pseudos and save their location. */ +/* Remove all insn scratches in INSN. */ static void -remove_scratches_1 (rtx_insn *insn) +remove_insn_scratches (rtx_insn *insn) { - int i; - bool insn_changed_p; - rtx reg; - lra_insn_recog_data_t id; - struct lra_static_insn_data *static_id; - - id = lra_get_insn_recog_data (insn); - static_id = id->insn_static_data; - insn_changed_p = false; - for (i = 0; i < static_id->n_operands; i++) - if (GET_CODE (*id->operand_loc[i]) == SCRATCH - && GET_MODE (*id->operand_loc[i]) != VOIDmode) - { - insn_changed_p = true; - *id->operand_loc[i] = reg - = lra_create_new_reg (static_id->operand[i].mode, - *id->operand_loc[i], ALL_REGS, NULL); - lra_register_new_scratch_op (insn, i, id->icode); - if (lra_dump_file != NULL) - fprintf (lra_dump_file, - "Removing SCRATCH in insn #%u (nop %d)\n", - INSN_UID (insn), i); - } - if (insn_changed_p) - /* Because we might use DF right after caller-saves sub-pass - we need to keep DF info up to date. */ + if (ira_remove_insn_scratches (insn, true, lra_dump_file, get_scratch_reg)) df_insn_rescan (insn); } -/* Change scratches into pseudos and save their location. */ +/* Remove all insn scratches in the current function. */ static void remove_scratches (void) { basic_block bb; rtx_insn *insn; - scratches.create (get_max_uid ()); - bitmap_initialize (&scratch_bitmap, ®_obstack); - bitmap_initialize (&scratch_operand_bitmap, ®_obstack); FOR_EACH_BB_FN (bb, cfun) FOR_BB_INSNS (bb, insn) - if (INSN_P (insn)) - remove_scratches_1 (insn); -} - -/* Changes pseudos created by function remove_scratches onto scratches. */ -static void -restore_scratches (void) -{ - int regno; - unsigned i; - sloc_t loc; - rtx_insn *last = NULL; - lra_insn_recog_data_t id = NULL; - - for (i = 0; scratches.iterate (i, &loc); i++) - { - /* Ignore already deleted insns. */ - if (NOTE_P (loc->insn) - && NOTE_KIND (loc->insn) == NOTE_INSN_DELETED) - continue; - if (last != loc->insn) - { - last = loc->insn; - id = lra_get_insn_recog_data (last); - } - if (loc->icode != id->icode) - { - /* The icode doesn't match, which means the insn has been modified - (e.g. register elimination). The scratch cannot be restored. */ - continue; - } - if (REG_P (*id->operand_loc[loc->nop]) - && ((regno = REGNO (*id->operand_loc[loc->nop])) - >= FIRST_PSEUDO_REGISTER) - && lra_get_regno_hard_regno (regno) < 0) - { - /* It should be only case when scratch register with chosen - constraint 'X' did not get memory or hard register. 
*/ - lra_assert (lra_former_scratch_p (regno)); - *id->operand_loc[loc->nop] - = gen_rtx_SCRATCH (GET_MODE (*id->operand_loc[loc->nop])); - lra_update_dup (id, loc->nop); - if (lra_dump_file != NULL) - fprintf (lra_dump_file, "Restoring SCRATCH in insn #%u(nop %d)\n", - INSN_UID (loc->insn), loc->nop); - } - } - for (i = 0; scratches.iterate (i, &loc); i++) - free (loc); - scratches.release (); - bitmap_clear (&scratch_bitmap); - bitmap_clear (&scratch_operand_bitmap); + if (INSN_P (insn)) + remove_insn_scratches (insn); } - - /* Function checks RTL for correctness. If FINAL_P is true, it is done at the end of LRA and the check is more rigorous. */ static void @@ -2571,7 +2436,7 @@ lra (FILE *f) lra_bad_spill_regno_start = lra_constraint_new_regno_start; lra_assignment_iter_after_spill = 0; } - restore_scratches (); + ira_restore_scratches (lra_dump_file); lra_eliminate (true, false); lra_final_code_change (); lra_in_progress = 0; -- Gitee From d212a0f4960e7493cf69bc0dc3a34279b97f08b6 Mon Sep 17 00:00:00 2001 From: zhaowenyu <804544223@qq.com> Date: Mon, 20 Jun 2022 22:59:07 +0800 Subject: [PATCH 04/14] [Backport] Don't extract memory from operand for normal memory constraint. Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=2e0aa43fc6ae689c595902310baec604e7e0d695 Don't extract memory from operand for normal memory constraint. --- gcc/ira.c | 7 ++++++- gcc/lra-constraints.c | 3 ++- gcc/recog.c | 8 ++++++-- gcc/testsuite/gcc.target/i386/pr97540.c | 6 ++++++ 4 files changed, 20 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr97540.c diff --git a/gcc/ira.c b/gcc/ira.c index 573e5246dd5..af69a9fa10d 100644 --- a/gcc/ira.c +++ b/gcc/ira.c @@ -1845,6 +1845,7 @@ ira_setup_alts (rtx_insn *insn) default: { enum constraint_num cn = lookup_constraint (p); + rtx mem = NULL; switch (get_constraint_type (cn)) { case CT_REGISTER: @@ -1867,8 +1868,12 @@ ira_setup_alts (rtx_insn *insn) goto op_success; case CT_MEMORY: + mem = op; + /* Fall through. */ case CT_SPECIAL_MEMORY: - if (MEM_P (extract_mem_from_operand (op))) + if (!mem) + mem = extract_mem_from_operand (op); + if (MEM_P (mem)) goto op_success; win_p = true; break; diff --git a/gcc/lra-constraints.c b/gcc/lra-constraints.c index ccc638a7d57..ce9294d71e1 100644 --- a/gcc/lra-constraints.c +++ b/gcc/lra-constraints.c @@ -2364,7 +2364,8 @@ process_alt_operands (int only_alternative) break; case CT_MEMORY: - if (satisfies_memory_constraint_p (op, cn)) + if (MEM_P (op) + && satisfies_memory_constraint_p (op, cn)) win = true; else if (spilled_pseudo_p (op)) win = true; diff --git a/gcc/recog.c b/gcc/recog.c index 8674054b95f..6af34399958 100644 --- a/gcc/recog.c +++ b/gcc/recog.c @@ -1778,6 +1778,7 @@ asm_operand_ok (rtx op, const char *constraint, const char **constraints) /* FALLTHRU */ default: cn = lookup_constraint (constraint); + rtx mem = NULL; switch (get_constraint_type (cn)) { case CT_REGISTER: @@ -1796,10 +1797,13 @@ asm_operand_ok (rtx op, const char *constraint, const char **constraints) break; case CT_MEMORY: + mem = op; + /* Fall through. */ case CT_SPECIAL_MEMORY: /* Every memory operand can be reloaded to fit. 
*/ - result = result || memory_operand (extract_mem_from_operand (op), - VOIDmode); + if (!mem) + mem = extract_mem_from_operand (op); + result = result || memory_operand (mem, VOIDmode); break; case CT_ADDRESS: diff --git a/gcc/testsuite/gcc.target/i386/pr97540.c b/gcc/testsuite/gcc.target/i386/pr97540.c new file mode 100644 index 00000000000..20f8717372c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr97540.c @@ -0,0 +1,6 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +int mt7615_add_interface_dev_0; +int ffs(int x) { asm("" : : "rm"(x)); } +int mt7615_add_interface() { ffs(~mt7615_add_interface_dev_0); } -- Gitee From 0cfc5412a974427023115875665bfccb3deccc25 Mon Sep 17 00:00:00 2001 From: zhaowenyu <804544223@qq.com> Date: Mon, 20 Jun 2022 22:59:37 +0800 Subject: [PATCH 05/14] [Backport] Expand reg_equiv when scratches are removed. Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=3ceaafc95c77cded164cde58da41bed0cbe76489 Expand reg_equiv when scratches are removed. --- gcc/ira.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/gcc/ira.c b/gcc/ira.c index af69a9fa10d..c6b3edfb980 100644 --- a/gcc/ira.c +++ b/gcc/ira.c @@ -5219,7 +5219,8 @@ contains_X_constraint_p (const char *str) return false; } -/* Change INSN's scratches into pseudos and save their location. */ +/* Change INSN's scratches into pseudos and save their location. + Return true if we changed any scratch. */ bool ira_remove_insn_scratches (rtx_insn *insn, bool all_p, FILE *dump_file, rtx (*get_reg) (rtx original)) @@ -5250,17 +5251,19 @@ ira_remove_insn_scratches (rtx_insn *insn, bool all_p, FILE *dump_file, } /* Return new register of the same mode as ORIGINAL. Used in - ira_remove_scratches. */ + remove_scratches. */ static rtx get_scratch_reg (rtx original) { return gen_reg_rtx (GET_MODE (original)); } -/* Change scratches into pseudos and save their location. */ -void -ira_remove_scratches (void) +/* Change scratches into pseudos and save their location. Return true + if we changed any scratch. */ +static bool +remove_scratches (void) { + bool change_p = false; basic_block bb; rtx_insn *insn; @@ -5271,8 +5274,12 @@ ira_remove_scratches (void) FOR_BB_INSNS (bb, insn) if (INSN_P (insn) && ira_remove_insn_scratches (insn, false, ira_dump_file, get_scratch_reg)) - /* Because we might use DF, we need to keep DF info up to date. */ - df_insn_rescan (insn); + { + /* Because we might use DF, we need to keep DF info up to date. */ + df_insn_rescan (insn); + change_p = true; + } + return change_p; } /* Changes pseudos created by function remove_scratches onto scratches. */ @@ -5519,8 +5526,8 @@ ira (FILE *f) end_alias_analysis (); free (reg_equiv); - if (ira_use_lra_p) - ira_remove_scratches (); + if (ira_use_lra_p && remove_scratches ()) + ira_expand_reg_equiv (); if (resize_reg_info () && flag_ira_loop_pressure) ira_set_pseudo_classes (true, ira_dump_file); -- Gitee From 679299ff4f3067723d678fb37ac261dd5fc59ab2 Mon Sep 17 00:00:00 2001 From: zhaowenyu <804544223@qq.com> Date: Mon, 20 Jun 2022 23:00:06 +0800 Subject: [PATCH 06/14] [Backport] ira: Recompute regstat as max_regno changes [PR97705] Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=ce4ae1f4893e322495c5d24b2f0e807a7f7cf92f As PR97705 shows, the commit r11-4637 caused some dumping comparison difference error on pass ira. 
It exposed one issue with the newly introduced function
remove_scratches, which can increase the largest pseudo reg number when
it succeeds.  Later, some functions use max_reg_num () to get the
latest max_regno while iterating over pseudo numbers, but some data
structures are still allocated for the previous max_regno, so
out-of-bounds array accesses can occur.  The failures can be random,
since the values beyond the array bounds can be random.

This patch frees/reinits/recomputes the relevant data structures,
namely regstat_n_sets_and_refs and reg_info_p, to ensure we don't
access beyond the array bounds.
---
 gcc/ira.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/gcc/ira.c b/gcc/ira.c
index c6b3edfb980..053fdbff469 100644
--- a/gcc/ira.c
+++ b/gcc/ira.c
@@ -5526,8 +5526,26 @@ ira (FILE *f)
   end_alias_analysis ();
   free (reg_equiv);
 
+  /* Once max_regno changes, we need to free and re-init/re-compute
+     some data structures like regstat_n_sets_and_refs and reg_info_p.  */
+  auto regstat_recompute_for_max_regno = []() {
+    regstat_free_n_sets_and_refs ();
+    regstat_free_ri ();
+    regstat_init_n_sets_and_refs ();
+    regstat_compute_ri ();
+  };
+
+  int max_regno_before_rm = max_reg_num ();
   if (ira_use_lra_p && remove_scratches ())
-    ira_expand_reg_equiv ();
+    {
+      ira_expand_reg_equiv ();
+      /* For now remove_scratches is supposed to create pseudos when it
+	 succeeds, assert this happens all the time.  Once it doesn't
+	 hold, we should guard the regstat recompute for the case
+	 max_regno changes.  */
+      gcc_assert (max_regno_before_rm != max_reg_num ());
+      regstat_recompute_for_max_regno ();
+    }
 
   if (resize_reg_info () && flag_ira_loop_pressure)
     ira_set_pseudo_classes (true, ira_dump_file);
@@ -5654,12 +5672,7 @@ ira (FILE *f)
 #endif
 
   if (max_regno != max_regno_before_ira)
-    {
-      regstat_free_n_sets_and_refs ();
-      regstat_free_ri ();
-      regstat_init_n_sets_and_refs ();
-      regstat_compute_ri ();
-    }
+    regstat_recompute_for_max_regno ();
 
   overall_cost_before = ira_overall_cost;
   if (! ira_conflicts_p)
-- 
Gitee

From 45adcf5e021fb1bbf06304a9d5128c598f8b5669 Mon Sep 17 00:00:00 2001
From: zhaowenyu <804544223@qq.com>
Date: Mon, 20 Jun 2022 23:00:33 +0800
Subject: [PATCH 07/14] [Backport] ira: Add a ira_loop_border_costs class
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=bf37fd35a37985a0e19817f843d0bdd5ad504aa9

The final index into (ira_)memory_move_cost is 1 for loads and 0 for
stores.  Thus the combination:

  entry_freq * memory_cost[1] + exit_freq * memory_cost[0]

is the cost of loading a register on entry to a loop and storing it
back on exit from the loop.  This is the cost to use if the register is
successfully allocated within the loop but is spilled in the parent
loop.  Similarly:

  entry_freq * memory_cost[0] + exit_freq * memory_cost[1]

is the cost of storing a register on entry to the loop and restoring it
on exit from the loop.  This is the cost to use if the register is
spilled within the loop but is successfully allocated in the parent
loop.

The patch adds a helper class for calculating these values and
mechanically replaces the existing instances.  There is no attempt to
editorialise the choice between using “spill inside” and “spill
outside” costs.  (I think one of them is the wrong way round, but a
later patch deals with that.)

No functional change intended.
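As a self-contained sketch (not part of the patch; the function names
here are invented), the two costs described above can be written as:

  /* Index 1 of the memory-move cost is a load, index 0 a store.  */
  static int
  spill_outside_loop_cost_sketch (int entry_freq, int exit_freq,
                                  const int memory_cost[2])
  {
    /* Load on entry, store back on exit: allocated within the loop,
       spilled in the parent loop.  */
    return entry_freq * memory_cost[1] + exit_freq * memory_cost[0];
  }

  static int
  spill_inside_loop_cost_sketch (int entry_freq, int exit_freq,
                                 const int memory_cost[2])
  {
    /* Store on entry, restore on exit: spilled within the loop,
       allocated in the parent loop.  */
    return entry_freq * memory_cost[0] + exit_freq * memory_cost[1];
  }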
--- gcc/ira-color.c | 76 +++++++++++++++++++------------------------------ gcc/ira-int.h | 56 ++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 46 deletions(-) diff --git a/gcc/ira-color.c b/gcc/ira-color.c index b0fc159a849..b1e2e49e26e 100644 --- a/gcc/ira-color.c +++ b/gcc/ira-color.c @@ -2549,13 +2549,23 @@ ira_loop_edge_freq (ira_loop_tree_node_t loop_node, int regno, bool exit_p) return REG_FREQ_FROM_EDGE_FREQ (freq); } +/* Construct an object that describes the boundary between A and its + parent allocno. */ +ira_loop_border_costs::ira_loop_border_costs (ira_allocno_t a) + : m_mode (ALLOCNO_MODE (a)), + m_class (ALLOCNO_CLASS (a)), + m_entry_freq (ira_loop_edge_freq (ALLOCNO_LOOP_TREE_NODE (a), + ALLOCNO_REGNO (a), false)), + m_exit_freq (ira_loop_edge_freq (ALLOCNO_LOOP_TREE_NODE (a), + ALLOCNO_REGNO (a), true)) +{ +} + /* Calculate and return the cost of putting allocno A into memory. */ static int calculate_allocno_spill_cost (ira_allocno_t a) { int regno, cost; - machine_mode mode; - enum reg_class rclass; ira_allocno_t parent_allocno; ira_loop_tree_node_t parent_node, loop_node; @@ -2568,24 +2578,12 @@ calculate_allocno_spill_cost (ira_allocno_t a) return cost; if ((parent_allocno = parent_node->regno_allocno_map[regno]) == NULL) return cost; - mode = ALLOCNO_MODE (a); - rclass = ALLOCNO_CLASS (a); + ira_loop_border_costs border_costs (a); if (ALLOCNO_HARD_REGNO (parent_allocno) < 0) - cost -= (ira_memory_move_cost[mode][rclass][0] - * ira_loop_edge_freq (loop_node, regno, true) - + ira_memory_move_cost[mode][rclass][1] - * ira_loop_edge_freq (loop_node, regno, false)); + cost -= border_costs.spill_outside_loop_cost (); else - { - ira_init_register_move_cost_if_necessary (mode); - cost += ((ira_memory_move_cost[mode][rclass][1] - * ira_loop_edge_freq (loop_node, regno, true) - + ira_memory_move_cost[mode][rclass][0] - * ira_loop_edge_freq (loop_node, regno, false)) - - (ira_register_move_cost[mode][rclass][rclass] - * (ira_loop_edge_freq (loop_node, regno, false) - + ira_loop_edge_freq (loop_node, regno, true)))); - } + cost += (border_costs.spill_inside_loop_cost () + - border_costs.move_between_loops_cost ()); return cost; } @@ -3306,7 +3304,7 @@ static void color_pass (ira_loop_tree_node_t loop_tree_node) { int regno, hard_regno, index = -1, n; - int cost, exit_freq, enter_freq; + int cost; unsigned int j; bitmap_iterator bi; machine_mode mode; @@ -3430,8 +3428,6 @@ color_pass (ira_loop_tree_node_t loop_tree_node) } continue; } - exit_freq = ira_loop_edge_freq (subloop_node, regno, true); - enter_freq = ira_loop_edge_freq (subloop_node, regno, false); ira_assert (regno < ira_reg_equiv_len); if (ira_equiv_no_lvalue_p (regno)) { @@ -3447,16 +3443,16 @@ color_pass (ira_loop_tree_node_t loop_tree_node) } else if (hard_regno < 0) { + ira_loop_border_costs border_costs (subloop_allocno); ALLOCNO_UPDATED_MEMORY_COST (subloop_allocno) - -= ((ira_memory_move_cost[mode][rclass][1] * enter_freq) - + (ira_memory_move_cost[mode][rclass][0] * exit_freq)); + -= border_costs.spill_outside_loop_cost (); } else { + ira_loop_border_costs border_costs (subloop_allocno); aclass = ALLOCNO_CLASS (subloop_allocno); ira_init_register_move_cost_if_necessary (mode); - cost = (ira_register_move_cost[mode][rclass][rclass] - * (exit_freq + enter_freq)); + cost = border_costs.move_between_loops_cost (); ira_allocate_and_set_or_copy_costs (&ALLOCNO_UPDATED_HARD_REG_COSTS (subloop_allocno), aclass, ALLOCNO_UPDATED_CLASS_COST (subloop_allocno), @@ -3472,8 +3468,7 @@ color_pass 
(ira_loop_tree_node_t loop_tree_node) ALLOCNO_UPDATED_CLASS_COST (subloop_allocno) = ALLOCNO_UPDATED_HARD_REG_COSTS (subloop_allocno)[index]; ALLOCNO_UPDATED_MEMORY_COST (subloop_allocno) - += (ira_memory_move_cost[mode][rclass][0] * enter_freq - + ira_memory_move_cost[mode][rclass][1] * exit_freq); + += border_costs.spill_inside_loop_cost (); } } } @@ -3514,7 +3509,6 @@ move_spill_restore (void) { int cost, regno, hard_regno, hard_regno2, index; bool changed_p; - int enter_freq, exit_freq; machine_mode mode; enum reg_class rclass; ira_allocno_t a, parent_allocno, subloop_allocno; @@ -3569,38 +3563,28 @@ move_spill_restore (void) - (ALLOCNO_HARD_REG_COSTS (subloop_allocno) == NULL ? ALLOCNO_CLASS_COST (subloop_allocno) : ALLOCNO_HARD_REG_COSTS (subloop_allocno)[index])); - exit_freq = ira_loop_edge_freq (subloop_node, regno, true); - enter_freq = ira_loop_edge_freq (subloop_node, regno, false); + ira_loop_border_costs border_costs (subloop_allocno); if ((hard_regno2 = ALLOCNO_HARD_REGNO (subloop_allocno)) < 0) - cost -= (ira_memory_move_cost[mode][rclass][0] * exit_freq - + ira_memory_move_cost[mode][rclass][1] * enter_freq); + cost -= border_costs.spill_outside_loop_cost (); else { - cost - += (ira_memory_move_cost[mode][rclass][0] * exit_freq - + ira_memory_move_cost[mode][rclass][1] * enter_freq); + cost += border_costs.spill_outside_loop_cost (); if (hard_regno2 != hard_regno) - cost -= (ira_register_move_cost[mode][rclass][rclass] - * (exit_freq + enter_freq)); + cost -= border_costs.move_between_loops_cost (); } } if ((parent = loop_node->parent) != NULL && (parent_allocno = parent->regno_allocno_map[regno]) != NULL) { ira_assert (rclass == ALLOCNO_CLASS (parent_allocno)); - exit_freq = ira_loop_edge_freq (loop_node, regno, true); - enter_freq = ira_loop_edge_freq (loop_node, regno, false); + ira_loop_border_costs border_costs (a); if ((hard_regno2 = ALLOCNO_HARD_REGNO (parent_allocno)) < 0) - cost -= (ira_memory_move_cost[mode][rclass][0] * exit_freq - + ira_memory_move_cost[mode][rclass][1] * enter_freq); + cost -= border_costs.spill_outside_loop_cost (); else { - cost - += (ira_memory_move_cost[mode][rclass][1] * exit_freq - + ira_memory_move_cost[mode][rclass][0] * enter_freq); + cost += border_costs.spill_inside_loop_cost (); if (hard_regno2 != hard_regno) - cost -= (ira_register_move_cost[mode][rclass][rclass] - * (exit_freq + enter_freq)); + cost -= border_costs.move_between_loops_cost (); } } if (cost < 0) diff --git a/gcc/ira-int.h b/gcc/ira-int.h index 4bee4eec66e..e0e2cc6ad20 100644 --- a/gcc/ira-int.h +++ b/gcc/ira-int.h @@ -1539,4 +1539,60 @@ ira_need_caller_save_p (ira_allocno_t a, unsigned int regno) ALLOCNO_MODE (a), regno); } +/* Represents the boundary between an allocno in one loop and its parent + allocno in the enclosing loop. It is usually possible to change a + register's allocation on this boundary; the class provides routines + for calculating the cost of such changes. */ +class ira_loop_border_costs +{ +public: + ira_loop_border_costs (ira_allocno_t); + + int move_between_loops_cost () const; + int spill_outside_loop_cost () const; + int spill_inside_loop_cost () const; + +private: + /* The mode and class of the child allocno. */ + machine_mode m_mode; + reg_class m_class; + + /* Sums the frequencies of the entry edges and the exit edges. */ + int m_entry_freq, m_exit_freq; +}; + +/* Return the cost of storing the register on entry to the loop and + loading it back on exit from the loop. 
This is the cost to use if + the register is spilled within the loop but is successfully allocated + in the parent loop. */ +inline int +ira_loop_border_costs::spill_inside_loop_cost () const +{ + return (m_entry_freq * ira_memory_move_cost[m_mode][m_class][0] + + m_exit_freq * ira_memory_move_cost[m_mode][m_class][1]); +} + +/* Return the cost of loading the register on entry to the loop and + storing it back on exit from the loop. This is the cost to use if + the register is successfully allocated within the loop but is spilled + in the parent loop. */ +inline int +ira_loop_border_costs::spill_outside_loop_cost () const +{ + return (m_entry_freq * ira_memory_move_cost[m_mode][m_class][1] + + m_exit_freq * ira_memory_move_cost[m_mode][m_class][0]); +} + +/* Return the cost of moving the pseudo register between different hard + registers on entry and exit from the loop. This is the cost to use + if the register is successfully allocated within both this loop and + the parent loop, but the allocations for the loops differ. */ +inline int +ira_loop_border_costs::move_between_loops_cost () const +{ + ira_init_register_move_cost_if_necessary (m_mode); + auto move_cost = ira_register_move_cost[m_mode][m_class][m_class]; + return move_cost * (m_entry_freq + m_exit_freq); +} + #endif /* GCC_IRA_INT_H */ -- Gitee From 7f2b5adba58bfa66bce3869aeb528ebcd4c7cd4a Mon Sep 17 00:00:00 2001 From: zhaowenyu <804544223@qq.com> Date: Mon, 20 Jun 2022 23:01:47 +0800 Subject: [PATCH 08/14] [Backport] ira: Add comments and fix move_spill_restore calculation Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=909a4b4764c4f270f09ccb2a950c91b21ed7b33a This patch adds comments to describe each use of ira_loop_border_costs. I think this highlights that move_spill_restore was using the wrong cost in one case, which came from tranposing [0] and [1] in the original (pre-ira_loop_border_costs) ira_memory_move_cost expressions. The difference would only be noticeable on targets that distinguish between load and store costs. --- gcc/ira-color.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/gcc/ira-color.c b/gcc/ira-color.c index b1e2e49e26e..f89dd7728bc 100644 --- a/gcc/ira-color.c +++ b/gcc/ira-color.c @@ -3443,6 +3443,13 @@ color_pass (ira_loop_tree_node_t loop_tree_node) } else if (hard_regno < 0) { + /* If we allocate a register to SUBLOOP_ALLOCNO, we'll need + to load the register on entry to the subloop and store + the register back on exit from the subloop. This incurs + a fixed cost for all registers. Since UPDATED_MEMORY_COST + is (and should only be) used relative to the register costs + for the same allocno, we can subtract this shared register + cost from the memory cost. */ ira_loop_border_costs border_costs (subloop_allocno); ALLOCNO_UPDATED_MEMORY_COST (subloop_allocno) -= border_costs.spill_outside_loop_cost (); @@ -3467,6 +3474,9 @@ color_pass (ira_loop_tree_node_t loop_tree_node) > ALLOCNO_UPDATED_HARD_REG_COSTS (subloop_allocno)[index]) ALLOCNO_UPDATED_CLASS_COST (subloop_allocno) = ALLOCNO_UPDATED_HARD_REG_COSTS (subloop_allocno)[index]; + /* If we spill SUBLOOP_ALLOCNO, we'll need to store HARD_REGNO + on entry to the subloop and restore HARD_REGNO on exit from + the subloop. 
*/ ALLOCNO_UPDATED_MEMORY_COST (subloop_allocno) += border_costs.spill_inside_loop_cost (); } @@ -3565,9 +3575,17 @@ move_spill_restore (void) : ALLOCNO_HARD_REG_COSTS (subloop_allocno)[index])); ira_loop_border_costs border_costs (subloop_allocno); if ((hard_regno2 = ALLOCNO_HARD_REGNO (subloop_allocno)) < 0) - cost -= border_costs.spill_outside_loop_cost (); + /* The register was spilled in the subloop. If we spill + it in the outer loop too then we'll no longer need to + save the register on entry to the subloop and restore + the register on exit from the subloop. */ + cost -= border_costs.spill_inside_loop_cost (); else { + /* The register was also allocated in the subloop. If we + spill it in the outer loop then we'll need to load the + register on entry to the subloop and store the register + back on exit from the subloop. */ cost += border_costs.spill_outside_loop_cost (); if (hard_regno2 != hard_regno) cost -= border_costs.move_between_loops_cost (); @@ -3579,9 +3597,17 @@ move_spill_restore (void) ira_assert (rclass == ALLOCNO_CLASS (parent_allocno)); ira_loop_border_costs border_costs (a); if ((hard_regno2 = ALLOCNO_HARD_REGNO (parent_allocno)) < 0) + /* The register was spilled in the parent loop. If we spill + it in this loop too then we'll no longer need to load the + register on entry to this loop and save the register back + on exit from this loop. */ cost -= border_costs.spill_outside_loop_cost (); else { + /* The register was also allocated in the parent loop. + If we spill it in this loop then we'll need to save + the register on entry to this loop and restore the + register on exit from this loop. */ cost += border_costs.spill_inside_loop_cost (); if (hard_regno2 != hard_regno) cost -= border_costs.move_between_loops_cost (); -- Gitee From aeb4f2931e7a42a398b343894ffac2b4ac272638 Mon Sep 17 00:00:00 2001 From: zhaowenyu <804544223@qq.com> Date: Mon, 20 Jun 2022 23:02:27 +0800 Subject: [PATCH 09/14] [Backport] ira: Add ira_subloop_allocnos_can_differ_p Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=d54565d87ff79b882208dfb29af50232033c233d color_pass has two instances of the same code for propagating non-cap assignments from parent loops to subloops. This patch adds a helper function for testing when such propagations are required for correctness and uses it to remove the duplicated code. A later patch will use this in ira-build.c too, which is why the function is exported to ira-int.h. No functional change intended. --- gcc/ira-color.c | 21 +-------------------- gcc/ira-int.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/gcc/ira-color.c b/gcc/ira-color.c index f89dd7728bc..565971255da 100644 --- a/gcc/ira-color.c +++ b/gcc/ira-color.c @@ -3410,26 +3410,7 @@ color_pass (ira_loop_tree_node_t loop_tree_node) if ((flag_ira_region == IRA_REGION_MIXED && (loop_tree_node->reg_pressure[pclass] <= ira_class_hard_regs_num[pclass])) - || (pic_offset_table_rtx != NULL - && regno == (int) REGNO (pic_offset_table_rtx)) - /* Avoid overlapped multi-registers. Moves between them - might result in wrong code generation. */ - || (hard_regno >= 0 - && ira_reg_class_max_nregs[pclass][mode] > 1)) - { - if (! ALLOCNO_ASSIGNED_P (subloop_allocno)) - { - ALLOCNO_HARD_REGNO (subloop_allocno) = hard_regno; - ALLOCNO_ASSIGNED_P (subloop_allocno) = true; - if (hard_regno >= 0) - update_costs_from_copies (subloop_allocno, true, true); - /* We don't need updated costs anymore. 
*/ - ira_free_allocno_updated_costs (subloop_allocno); - } - continue; - } - ira_assert (regno < ira_reg_equiv_len); - if (ira_equiv_no_lvalue_p (regno)) + || !ira_subloop_allocnos_can_differ_p (a, hard_regno >= 0)) { if (! ALLOCNO_ASSIGNED_P (subloop_allocno)) { diff --git a/gcc/ira-int.h b/gcc/ira-int.h index e0e2cc6ad20..77e96d00758 100644 --- a/gcc/ira-int.h +++ b/gcc/ira-int.h @@ -1595,4 +1595,32 @@ ira_loop_border_costs::move_between_loops_cost () const return move_cost * (m_entry_freq + m_exit_freq); } +/* Return true if subloops that contain allocnos for A's register can + use a different assignment from A. ALLOCATED_P is true for the case + in which allocation succeeded for A. */ +inline bool +ira_subloop_allocnos_can_differ_p (ira_allocno_t a, bool allocated_p = true) +{ + auto regno = ALLOCNO_REGNO (a); + + if (pic_offset_table_rtx != NULL + && regno == (int) REGNO (pic_offset_table_rtx)) + return false; + + ira_assert (regno < ira_reg_equiv_len); + if (ira_equiv_no_lvalue_p (regno)) + return false; + + /* Avoid overlapping multi-registers. Moves between them might result + in wrong code generation. */ + if (allocated_p) + { + auto pclass = ira_pressure_class_translate[ALLOCNO_CLASS (a)]; + if (ira_reg_class_max_nregs[pclass][ALLOCNO_MODE (a)] > 1) + return false; + } + + return true; +} + #endif /* GCC_IRA_INT_H */ -- Gitee From 1aef9b230a191484bbbc8b8d0116025c3c3125f1 Mon Sep 17 00:00:00 2001 From: zhaowenyu <804544223@qq.com> Date: Mon, 20 Jun 2022 23:05:17 +0800 Subject: [PATCH 10/14] [Backport] ira: Try to avoid propagating conflicts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=8e7a23728f66d2da88b47e34224410457fdefbf5 Suppose that: - an inner loop L contains an allocno A - L clobbers hard register R while A is live - A's parent allocno is AP Previously, propagate_allocno_info would propagate conflict sets up the loop tree, so that the conflict between A and R would become a conflict between AP and R (and so on for ancestors of AP). However, when IRA treats loops as separate allocation regions, it can decide on a loop-by-loop basis whether to allocate a register or spill to memory. Conflicts in inner loops therefore don't need to become hard conflicts in parent loops. Instead we can record that using the “conflicting” registers for the parent allocnos has a higher cost. In the example above, this higher cost is the sum of: - the cost of saving R on entry to L - the cost of keeping the pseudo register in memory throughout L - the cost of reloading R on exit from L This value is also a cap on the hard register cost that A can contribute to AP in general (not just for conflicts). Whatever allocation we pick for AP, there is always the option of spilling that register to memory throughout L, so the cost to A of allocating a register to AP can't be more than the cost of spilling A. To take an extreme example: if allocating a register R2 to A is more expensive than spilling A to memory, ALLOCNO_HARD_REG_COSTS (A)[R2] could be (say) 2 times greater than ALLOCNO_MEMORY_COST (A) or 100 times greater than ALLOCNO_MEMORY_COST (A). But this scale factor doesn't matter to AP. All that matters is that R2 is more expensive than memory for A, so that allocating R2 to AP should be costed as spilling A to memory (again assuming that A and AP are in different allocation regions). Propagating a factor of 100 would distort the register costs for AP. 
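The capping rule can be sketched as follows (illustration only; this
mirrors ira_propagate_hard_reg_costs in the diff below, with invented
parameter names):

  /* Cost that subloop allocno A contributes to its parent for one
     hard register R.  SPILL_COST is the cost of spilling A's register
     throughout the loop; COST is A's own cost for R.  */
  static int
  propagated_cost_sketch (bool conflicts_with_r, int cost, int spill_cost)
  {
    if (conflicts_with_r)
      /* Allocating R to the parent forces a spill around the loop.  */
      return spill_cost;
    /* Otherwise the spill option caps what A can charge the parent.  */
    return cost < spill_cost ? cost : spill_cost;
  }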
move_spill_restore tries to undo the propagation done by propagate_allocno_info, so we need some extra processing there. --- gcc/ira-build.c | 55 +++++++++++++++++-- gcc/ira-color.c | 53 ++++++++++++------ gcc/ira-int.h | 37 +++++++++++++ .../gcc.target/aarch64/reg-alloc-2.c | 47 ++++++++++++++++ 4 files changed, 169 insertions(+), 23 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/reg-alloc-2.c diff --git a/gcc/ira-build.c b/gcc/ira-build.c index 0bbdb4d0c4b..17aa2512ae9 100644 --- a/gcc/ira-build.c +++ b/gcc/ira-build.c @@ -499,6 +499,7 @@ ira_create_allocno (int regno, bool cap_p, bitmap_set_bit (loop_tree_node->all_allocnos, ALLOCNO_NUM (a)); ALLOCNO_NREFS (a) = 0; ALLOCNO_FREQ (a) = 0; + ALLOCNO_MIGHT_CONFLICT_WITH_PARENT_P (a) = false; ALLOCNO_HARD_REGNO (a) = -1; ALLOCNO_CALL_FREQ (a) = 0; ALLOCNO_CALLS_CROSSED_NUM (a) = 0; @@ -1990,6 +1991,35 @@ propagate_modified_regnos (ira_loop_tree_node_t loop_tree_node) loop_tree_node->modified_regnos); } +/* Propagate ALLOCNO_HARD_REG_COSTS from A to PARENT_A. Use SPILL_COST + as the cost of spilling a register throughout A (which we have to do + for PARENT_A allocations that conflict with A). */ +static void +ira_propagate_hard_reg_costs (ira_allocno_t parent_a, ira_allocno_t a, + int spill_cost) +{ + HARD_REG_SET conflicts = ira_total_conflict_hard_regs (a); + conflicts &= ~ira_total_conflict_hard_regs (parent_a); + + auto costs = ALLOCNO_HARD_REG_COSTS (a); + if (!hard_reg_set_empty_p (conflicts)) + ALLOCNO_MIGHT_CONFLICT_WITH_PARENT_P (a) = true; + else if (!costs) + return; + + auto aclass = ALLOCNO_CLASS (a); + ira_allocate_and_set_costs (&ALLOCNO_HARD_REG_COSTS (parent_a), + aclass, ALLOCNO_CLASS_COST (parent_a)); + auto parent_costs = ALLOCNO_HARD_REG_COSTS (parent_a); + for (int i = 0; i < ira_class_hard_regs_num[aclass]; ++i) + if (TEST_HARD_REG_BIT (conflicts, ira_class_hard_regs[aclass][i])) + parent_costs[i] += spill_cost; + else if (costs) + /* The cost to A of allocating this register to PARENT_A can't + be more than the cost of spilling the register throughout A. */ + parent_costs[i] += MIN (costs[i], spill_cost); +} + /* Propagate new info about allocno A (see comments about accumulated info in allocno definition) to the corresponding allocno on upper loop tree level. So allocnos on upper levels accumulate @@ -2018,12 +2048,27 @@ propagate_allocno_info (void) && bitmap_bit_p (ALLOCNO_LOOP_TREE_NODE (a)->border_allocnos, ALLOCNO_NUM (a))) { + /* Calculate the cost of storing to memory on entry to A's loop, + referencing as memory within A's loop, and restoring from + memory on exit from A's loop. */ + ira_loop_border_costs border_costs (a); + int spill_cost = INT_MAX; + if (ira_subloop_allocnos_can_differ_p (parent_a)) + spill_cost = (border_costs.spill_inside_loop_cost () + + ALLOCNO_MEMORY_COST (a)); + if (! ALLOCNO_BAD_SPILL_P (a)) ALLOCNO_BAD_SPILL_P (parent_a) = false; ALLOCNO_NREFS (parent_a) += ALLOCNO_NREFS (a); ALLOCNO_FREQ (parent_a) += ALLOCNO_FREQ (a); + + /* If A's allocation can differ from PARENT_A's, we can if necessary + spill PARENT_A on entry to A's loop and restore it afterwards. + Doing that has cost SPILL_COST. 
*/ + if (!ira_subloop_allocnos_can_differ_p (parent_a)) + merge_hard_reg_conflicts (a, parent_a, true); + ALLOCNO_CALL_FREQ (parent_a) += ALLOCNO_CALL_FREQ (a); - merge_hard_reg_conflicts (a, parent_a, true); ALLOCNO_CALLS_CROSSED_NUM (parent_a) += ALLOCNO_CALLS_CROSSED_NUM (a); ALLOCNO_CHEAP_CALLS_CROSSED_NUM (parent_a) @@ -2036,15 +2081,15 @@ propagate_allocno_info (void) += ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (a); aclass = ALLOCNO_CLASS (a); ira_assert (aclass == ALLOCNO_CLASS (parent_a)); - ira_allocate_and_accumulate_costs - (&ALLOCNO_HARD_REG_COSTS (parent_a), aclass, - ALLOCNO_HARD_REG_COSTS (a)); + ira_propagate_hard_reg_costs (parent_a, a, spill_cost); ira_allocate_and_accumulate_costs (&ALLOCNO_CONFLICT_HARD_REG_COSTS (parent_a), aclass, ALLOCNO_CONFLICT_HARD_REG_COSTS (a)); + /* The cost to A of allocating a register to PARENT_A can't be + more than the cost of spilling the register throughout A. */ ALLOCNO_CLASS_COST (parent_a) - += ALLOCNO_CLASS_COST (a); + += MIN (ALLOCNO_CLASS_COST (a), spill_cost); ALLOCNO_MEMORY_COST (parent_a) += ALLOCNO_MEMORY_COST (a); } } diff --git a/gcc/ira-color.c b/gcc/ira-color.c index 565971255da..4989a3b14eb 100644 --- a/gcc/ira-color.c +++ b/gcc/ira-color.c @@ -3308,7 +3308,7 @@ color_pass (ira_loop_tree_node_t loop_tree_node) unsigned int j; bitmap_iterator bi; machine_mode mode; - enum reg_class rclass, aclass, pclass; + enum reg_class rclass, aclass; ira_allocno_t a, subloop_allocno; ira_loop_tree_node_t subloop_node; @@ -3353,10 +3353,9 @@ color_pass (ira_loop_tree_node_t loop_tree_node) /* Remove from processing in the next loop. */ bitmap_clear_bit (consideration_allocno_bitmap, j); rclass = ALLOCNO_CLASS (a); - pclass = ira_pressure_class_translate[rclass]; - if (flag_ira_region == IRA_REGION_MIXED - && (loop_tree_node->reg_pressure[pclass] - <= ira_class_hard_regs_num[pclass])) + subloop_allocno = ALLOCNO_CAP_MEMBER (a); + subloop_node = ALLOCNO_LOOP_TREE_NODE (subloop_allocno); + if (ira_single_region_allocno_p (a, subloop_allocno)) { mode = ALLOCNO_MODE (a); hard_regno = ALLOCNO_HARD_REGNO (a); @@ -3366,8 +3365,6 @@ color_pass (ira_loop_tree_node_t loop_tree_node) ira_assert (index >= 0); } regno = ALLOCNO_REGNO (a); - subloop_allocno = ALLOCNO_CAP_MEMBER (a); - subloop_node = ALLOCNO_LOOP_TREE_NODE (subloop_allocno); ira_assert (!ALLOCNO_ASSIGNED_P (subloop_allocno)); ALLOCNO_HARD_REGNO (subloop_allocno) = hard_regno; ALLOCNO_ASSIGNED_P (subloop_allocno) = true; @@ -3390,7 +3387,6 @@ color_pass (ira_loop_tree_node_t loop_tree_node) ira_assert (ALLOCNO_CAP_MEMBER (a) == NULL); mode = ALLOCNO_MODE (a); rclass = ALLOCNO_CLASS (a); - pclass = ira_pressure_class_translate[rclass]; hard_regno = ALLOCNO_HARD_REGNO (a); /* Use hard register class here. ??? */ if (hard_regno >= 0) @@ -3407,11 +3403,11 @@ color_pass (ira_loop_tree_node_t loop_tree_node) ira_assert (ALLOCNO_CLASS (subloop_allocno) == rclass); ira_assert (bitmap_bit_p (subloop_node->all_allocnos, ALLOCNO_NUM (subloop_allocno))); - if ((flag_ira_region == IRA_REGION_MIXED - && (loop_tree_node->reg_pressure[pclass] - <= ira_class_hard_regs_num[pclass])) + if (ira_single_region_allocno_p (a, subloop_allocno) || !ira_subloop_allocnos_can_differ_p (a, hard_regno >= 0)) { + gcc_assert (!ALLOCNO_MIGHT_CONFLICT_WITH_PARENT_P + (subloop_allocno)); if (! 
ALLOCNO_ASSIGNED_P (subloop_allocno)) { ALLOCNO_HARD_REGNO (subloop_allocno) = hard_regno; @@ -3547,14 +3543,35 @@ move_spill_restore (void) if (subloop_allocno == NULL) continue; ira_assert (rclass == ALLOCNO_CLASS (subloop_allocno)); - /* We have accumulated cost. To get the real cost of - allocno usage in the loop we should subtract costs of - the subloop allocnos. */ - cost -= (ALLOCNO_MEMORY_COST (subloop_allocno) - - (ALLOCNO_HARD_REG_COSTS (subloop_allocno) == NULL - ? ALLOCNO_CLASS_COST (subloop_allocno) - : ALLOCNO_HARD_REG_COSTS (subloop_allocno)[index])); ira_loop_border_costs border_costs (subloop_allocno); + + /* We have accumulated cost. To get the real cost of + allocno usage in the loop we should subtract the costs + added by propagate_allocno_info for the subloop allocnos. */ + int reg_cost + = (ALLOCNO_HARD_REG_COSTS (subloop_allocno) == NULL + ? ALLOCNO_CLASS_COST (subloop_allocno) + : ALLOCNO_HARD_REG_COSTS (subloop_allocno)[index]); + + int spill_cost + = (border_costs.spill_inside_loop_cost () + + ALLOCNO_MEMORY_COST (subloop_allocno)); + + /* If HARD_REGNO conflicts with SUBLOOP_A then + propagate_allocno_info will have propagated + the cost of spilling HARD_REGNO in SUBLOOP_NODE. + (ira_subloop_allocnos_can_differ_p must be true + in that case.) Otherwise, SPILL_COST acted as + a cap on the propagated register cost, in cases + where the allocations can differ. */ + auto conflicts = ira_total_conflict_hard_regs (subloop_allocno); + if (TEST_HARD_REG_BIT (conflicts, hard_regno)) + reg_cost = spill_cost; + else if (ira_subloop_allocnos_can_differ_p (a)) + reg_cost = MIN (reg_cost, spill_cost); + + cost -= ALLOCNO_MEMORY_COST (subloop_allocno) - reg_cost; + if ((hard_regno2 = ALLOCNO_HARD_REGNO (subloop_allocno)) < 0) /* The register was spilled in the subloop. If we spill it in the outer loop too then we'll no longer need to diff --git a/gcc/ira-int.h b/gcc/ira-int.h index 77e96d00758..59e9dab9166 100644 --- a/gcc/ira-int.h +++ b/gcc/ira-int.h @@ -314,6 +314,13 @@ struct ira_allocno vector where a bit with given index represents allocno with the same number. */ unsigned int conflict_vec_p : 1; + /* True if the parent loop has an allocno for the same register and + if the parent allocno's assignment might not be valid in this loop. + This means that we cannot merge this allocno and the parent allocno + together. + + This is only ever true for non-cap allocnos. */ + unsigned int might_conflict_with_parent_p : 1; /* Hard register assigned to given allocno. Negative value means that memory was allocated to the allocno. During the reload, spilled allocno has value equal to the corresponding stack slot @@ -423,6 +430,8 @@ struct ira_allocno #define ALLOCNO_CAP_MEMBER(A) ((A)->cap_member) #define ALLOCNO_NREFS(A) ((A)->nrefs) #define ALLOCNO_FREQ(A) ((A)->freq) +#define ALLOCNO_MIGHT_CONFLICT_WITH_PARENT_P(A) \ + ((A)->might_conflict_with_parent_p) #define ALLOCNO_HARD_REGNO(A) ((A)->hard_regno) #define ALLOCNO_CALL_FREQ(A) ((A)->call_freq) #define ALLOCNO_CALLS_CROSSED_NUM(A) ((A)->calls_crossed_num) @@ -1623,4 +1632,32 @@ ira_subloop_allocnos_can_differ_p (ira_allocno_t a, bool allocated_p = true) return true; } +/* Return true if we should treat A and SUBLOOP_A as belonging to a + single region. 
 */
+inline bool
+ira_single_region_allocno_p (ira_allocno_t a, ira_allocno_t subloop_a)
+{
+  if (flag_ira_region != IRA_REGION_MIXED)
+    return false;
+
+  if (ALLOCNO_MIGHT_CONFLICT_WITH_PARENT_P (subloop_a))
+    return false;
+
+  auto rclass = ALLOCNO_CLASS (a);
+  auto pclass = ira_pressure_class_translate[rclass];
+  auto loop_used_regs = ALLOCNO_LOOP_TREE_NODE (a)->reg_pressure[pclass];
+  return loop_used_regs <= ira_class_hard_regs_num[pclass];
+}
+
+/* Return the set of all hard registers that conflict with A.  */
+inline HARD_REG_SET
+ira_total_conflict_hard_regs (ira_allocno_t a)
+{
+  auto obj_0 = ALLOCNO_OBJECT (a, 0);
+  HARD_REG_SET conflicts = OBJECT_TOTAL_CONFLICT_HARD_REGS (obj_0);
+  for (int i = 1; i < ALLOCNO_NUM_OBJECTS (a); i++)
+    conflicts |= OBJECT_TOTAL_CONFLICT_HARD_REGS (ALLOCNO_OBJECT (a, i));
+  return conflicts;
+}
+
 #endif /* GCC_IRA_INT_H */
diff --git a/gcc/testsuite/gcc.target/aarch64/reg-alloc-2.c b/gcc/testsuite/gcc.target/aarch64/reg-alloc-2.c
new file mode 100644
index 00000000000..d4d260c96b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/reg-alloc-2.c
@@ -0,0 +1,47 @@
+/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#define PROB 0.1
+
+struct L
+{
+  int data;
+  volatile struct L *next;
+  volatile struct L *inner;
+};
+
+/* The thing we're testing here is that the !head->inner path of the outer loop
+   body has no stack accesses.  It's possible that we'll need to update this
+   pattern for unrelated code changes, but the test should be XFAILed rather
+   than changed if any new stack accesses occur on the !head->inner path.  */
+/*
+** foo:
+** ...
+**	ldr	(w[0-9]+), \[(x[0-9]+)\]
+**	add	(w[0-9]+), (?:\3, \1|\1, \3)
+**	ldr	(x[0-9]+), \[\2, #?16\]
+**	str	\3, \[\2\]
+**	ldr	\2, \[\2, #?8\]
+**	cbn?z	\4, .*
+** ...
+**	ret
+*/
+void
+foo (volatile struct L *head, int inc)
+{
+  while (head)
+    {
+      inc = head->data + inc;
+      volatile struct L *inner = head->inner;
+      head->data = inc;
+      head = head->next;
+      if (__builtin_expect_with_probability (inner != 0, 0, PROB))
+	for (int i = 0; i < 1000; ++i)
+	  /* Leave x30 for i.  */
+	  asm volatile ("// foo" :::
+			"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+			"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
+			"x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
+			"x24", "x25", "x26", "x27", "x28");
    }
}
--
Gitee

From f7636d03a7087b505c85270b214fb77791573ce6 Mon Sep 17 00:00:00 2001
From: zhaowenyu <804544223@qq.com>
Date: Mon, 20 Jun 2022 23:06:12 +0800
Subject: [PATCH 11/14] [Backport] ira: Consider modelling caller-save
 allocations as loop spills

Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=01f3e6a40e7202310abbeb41c345d325bd69554f

If an allocno A in an inner loop L spans a call, a parent allocno AP
can choose to handle a call-clobbered/caller-saved hard register R
in one of two ways:

(1) save R before each call in L and restore R after each call
(2) spill R to memory throughout L

(2) can be cheaper than (1) in some cases, particularly if L does
not reference A.

Before the patch we always did (1).  The patch adds support for
picking (2) instead, when it seems cheaper.  It builds on the earlier
support for not propagating conflicts to parent allocnos.
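As a rough illustration of the trade-off (an editorial sketch, not part
of the backported patch; ext and use are hypothetical externs):

  /* x is live across the loop but unused inside it.  Under (1), the
     register R holding x is saved before and restored after every call
     to ext; under (2), x is spilled to memory once on entry to the loop
     and reloaded once on exit.  */
  extern void ext (void);
  extern void use (int);

  void
  f (int x, int n)
  {
    for (int i = 0; i < n; i++)
      ext ();
    use (x);
  }

Under (2) the cost is a single store/load pair at the loop border,
while (1) pays a save/restore pair per call, so (2) is likely cheaper
whenever the loop iterates more than once.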
--- gcc/ira-build.c | 23 ++++--- gcc/ira-color.c | 13 ++-- gcc/ira-costs.c | 7 +- gcc/ira-int.h | 39 +++++++++++ .../gcc.target/aarch64/reg-alloc-3.c | 65 +++++++++++++++++++ 5 files changed, 129 insertions(+), 18 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/reg-alloc-3.c diff --git a/gcc/ira-build.c b/gcc/ira-build.c index 17aa2512ae9..4dbe23dfe70 100644 --- a/gcc/ira-build.c +++ b/gcc/ira-build.c @@ -1999,6 +1999,8 @@ ira_propagate_hard_reg_costs (ira_allocno_t parent_a, ira_allocno_t a, int spill_cost) { HARD_REG_SET conflicts = ira_total_conflict_hard_regs (a); + if (ira_caller_save_loop_spill_p (parent_a, a, spill_cost)) + conflicts |= ira_need_caller_save_regs (a); conflicts &= ~ira_total_conflict_hard_regs (parent_a); auto costs = ALLOCNO_HARD_REG_COSTS (a); @@ -2068,15 +2070,18 @@ propagate_allocno_info (void) if (!ira_subloop_allocnos_can_differ_p (parent_a)) merge_hard_reg_conflicts (a, parent_a, true); - ALLOCNO_CALL_FREQ (parent_a) += ALLOCNO_CALL_FREQ (a); - ALLOCNO_CALLS_CROSSED_NUM (parent_a) - += ALLOCNO_CALLS_CROSSED_NUM (a); - ALLOCNO_CHEAP_CALLS_CROSSED_NUM (parent_a) - += ALLOCNO_CHEAP_CALLS_CROSSED_NUM (a); - ALLOCNO_CROSSED_CALLS_ABIS (parent_a) - |= ALLOCNO_CROSSED_CALLS_ABIS (a); - ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (parent_a) - |= ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a); + if (!ira_caller_save_loop_spill_p (parent_a, a, spill_cost)) + { + ALLOCNO_CALL_FREQ (parent_a) += ALLOCNO_CALL_FREQ (a); + ALLOCNO_CALLS_CROSSED_NUM (parent_a) + += ALLOCNO_CALLS_CROSSED_NUM (a); + ALLOCNO_CHEAP_CALLS_CROSSED_NUM (parent_a) + += ALLOCNO_CHEAP_CALLS_CROSSED_NUM (a); + ALLOCNO_CROSSED_CALLS_ABIS (parent_a) + |= ALLOCNO_CROSSED_CALLS_ABIS (a); + ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (parent_a) + |= ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a); + } ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (parent_a) += ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (a); aclass = ALLOCNO_CLASS (a); diff --git a/gcc/ira-color.c b/gcc/ira-color.c index 4989a3b14eb..4cee4637b44 100644 --- a/gcc/ira-color.c +++ b/gcc/ira-color.c @@ -3561,11 +3561,16 @@ move_spill_restore (void) propagate_allocno_info will have propagated the cost of spilling HARD_REGNO in SUBLOOP_NODE. (ira_subloop_allocnos_can_differ_p must be true - in that case.) Otherwise, SPILL_COST acted as - a cap on the propagated register cost, in cases - where the allocations can differ. */ + in that case.) If HARD_REGNO is a caller-saved + register, we might have modelled it in the same way. + + Otherwise, SPILL_COST acted as a cap on the propagated + register cost, in cases where the allocations can differ. 
*/ auto conflicts = ira_total_conflict_hard_regs (subloop_allocno); - if (TEST_HARD_REG_BIT (conflicts, hard_regno)) + if (TEST_HARD_REG_BIT (conflicts, hard_regno) + || (ira_need_caller_save_p (subloop_allocno, hard_regno) + && ira_caller_save_loop_spill_p (a, subloop_allocno, + spill_cost))) reg_cost = spill_cost; else if (ira_subloop_allocnos_can_differ_p (a)) reg_cost = MIN (reg_cost, spill_cost); diff --git a/gcc/ira-costs.c b/gcc/ira-costs.c index aeda6588bcd..4ac2b788892 100644 --- a/gcc/ira-costs.c +++ b/gcc/ira-costs.c @@ -2328,7 +2328,7 @@ ira_tune_allocno_costs (void) { int j, n, regno; int cost, min_cost, *reg_costs; - enum reg_class aclass, rclass; + enum reg_class aclass; machine_mode mode; ira_allocno_t a; ira_allocno_iterator ai; @@ -2367,12 +2367,9 @@ ira_tune_allocno_costs (void) } if (skip_p) continue; - rclass = REGNO_REG_CLASS (regno); cost = 0; if (ira_need_caller_save_p (a, regno)) - cost += (ALLOCNO_CALL_FREQ (a) - * (ira_memory_move_cost[mode][rclass][0] - + ira_memory_move_cost[mode][rclass][1])); + cost += ira_caller_save_cost (a); #ifdef IRA_HARD_REGNO_ADD_COST_MULTIPLIER cost += ((ira_memory_move_cost[mode][rclass][0] + ira_memory_move_cost[mode][rclass][1]) diff --git a/gcc/ira-int.h b/gcc/ira-int.h index 59e9dab9166..1bacb255b81 100644 --- a/gcc/ira-int.h +++ b/gcc/ira-int.h @@ -1660,4 +1660,43 @@ ira_total_conflict_hard_regs (ira_allocno_t a) return conflicts; } +/* Return the cost of saving a caller-saved register before each call + in A's live range and restoring the same register after each call. */ +inline int +ira_caller_save_cost (ira_allocno_t a) +{ + auto mode = ALLOCNO_MODE (a); + auto rclass = ALLOCNO_CLASS (a); + return (ALLOCNO_CALL_FREQ (a) + * (ira_memory_move_cost[mode][rclass][0] + + ira_memory_move_cost[mode][rclass][1])); +} + +/* A and SUBLOOP_A are allocnos for the same pseudo register, with A's + loop immediately enclosing SUBLOOP_A's loop. If we allocate to A a + hard register R that is clobbered by a call in SUBLOOP_A, decide + which of the following approaches should be used for handling the + conflict: + + (1) Spill R on entry to SUBLOOP_A's loop, assign memory to SUBLOOP_A, + and restore R on exit from SUBLOOP_A's loop. + + (2) Spill R before each necessary call in SUBLOOP_A's live range and + restore R after each such call. + + Return true if (1) is better than (2). SPILL_COST is the cost of + doing (1). */ +inline bool +ira_caller_save_loop_spill_p (ira_allocno_t a, ira_allocno_t subloop_a, + int spill_cost) +{ + if (!ira_subloop_allocnos_can_differ_p (a)) + return false; + + /* Calculate the cost of saving a call-clobbered register + before each call and restoring it afterwards. */ + int call_cost = ira_caller_save_cost (subloop_a); + return call_cost && call_cost >= spill_cost; +} + #endif /* GCC_IRA_INT_H */ diff --git a/gcc/testsuite/gcc.target/aarch64/reg-alloc-3.c b/gcc/testsuite/gcc.target/aarch64/reg-alloc-3.c new file mode 100644 index 00000000000..7acdc432b0c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/reg-alloc-3.c @@ -0,0 +1,65 @@ +/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */ + +#define PROB 0.1 + +struct L +{ + int data; + volatile struct L *next; + volatile struct L *inner; +}; + +void ext(); + +/* The thing we're testing here is that the !head->inner path of the outer loop + body has no stack accesses. It's possible that we'll need to update this + pattern for unrelated code changes. 
But the test should be XFAILed rather
+   than changed if any new stack accesses creep into the !head->inner path.  */
+/*
+** foo:
+** ...
+**	ldr	(w[0-9]+), \[(x[0-9]+)\]
+**	add	(w[0-9]+), (?:\3, \1|\1, \3)
+**	ldr	(x[0-9]+), \[\2, #?16\]
+**	str	\3, \[\2\]
+**	ldr	\2, \[\2, #?8\]
+**	cbn?z	\4, .*
+** ...
+**	ret
+*/
+void
+foo (volatile struct L *head, int inc, double *ptr)
+{
+  double d = *ptr;
+  while (head)
+    {
+      /* Clobber all call-preserved GPRs, so that the loop has to use
+	 call-clobbered GPRs if it is to avoid spilling.  */
+      asm volatile ("" :::
+		    "x19", "x20", "x21", "x22", "x23",
+		    "x24", "x25", "x26", "x27", "x28");
+      inc = head->data + inc;
+      volatile struct L *inner = head->inner;
+      head->data = inc;
+      head = head->next;
+      if (__builtin_expect_with_probability (inner != 0, 0, PROB))
+	for (int i = 0; i < 1000; ++i)
+	  {
+	    ext ();
+	    /* Hack to create high register pressure, so that IRA doesn't
+	       collapse this loop into the parent loop.  */
+	    d += 1;
+	    asm volatile ("// foo" :::
+			  "d0", "d1", "d2", "d3",
+			  "d4", "d5", "d6", "d7",
+			  "d8", "d9", "d10", "d11",
+			  "d12", "d13", "d14", "d15",
+			  "d16", "d17", "d18", "d19",
+			  "d20", "d21", "d22", "d23",
+			  "d24", "d25", "d26", "d27",
+			  "d28", "d29", "d30", "d31");
+	  }
+    }
+  *ptr = d;
+}
--
Gitee

From 3370bfc6942fcee137f1f11ecbc3d15d0aad16d8 Mon Sep 17 00:00:00 2001
From: zhaowenyu <804544223@qq.com>
Date: Mon, 20 Jun 2022 23:06:55 +0800
Subject: [PATCH 12/14] [Backport] ira: Handle "soft" conflicts between cap
 and non-cap allocnos

Reference: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=037cc0b4a6646cc86549247a3590215ebd5c4c43

This patch looks for allocno conflicts of the following form:

- One allocno (X) is a cap allocno for some non-cap allocno X2.

- X2 belongs to some loop L2.

- The other allocno (Y) is a non-cap allocno.

- Y is an ancestor of some allocno Y2 in L2.

- Y2 is not referenced in L2 (that is, ALLOCNO_NREFS (Y2) == 0).

- Y can use a different allocation from Y2.

In this case, Y's register is live across L2 but is not used within it,
whereas X's register is used only within L2.  The conflict is therefore
only "soft", in that it can easily be avoided by spilling Y2 inside L2
without affecting any insn references.

In principle we could do this for ALLOCNO_NREFS (Y2) != 0 too, with the
callers then taking Y2's ALLOCNO_MEMORY_COST into account.  There would
then be no "cliff edge" between a Y2 that has no references and a Y2
that has (say) a single cold reference.  However, doing that isn't
necessary for the PR and seems to give variable results in practice.
(fotonik3d_r improves slightly but namd_r regresses slightly.)  It
therefore seemed better to start with the higher-value zero-reference
case and see how things go.

On top of the previous patches in the series, this fixes the exchange2
regression seen in GCC 11.
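As a concrete sketch of this shape (hypothetical code, not from the
patch; g, h and use are assumed externs):

  extern int g (void);
  extern void h (int);
  extern void use (int);

  void
  f (int n)
  {
    int y = g ();               /* Y: live across the inner loop.  */
    for (int i = 0; i < n; i++) /* loop L2 */
      {
        int x = g ();           /* X2: used only inside L2, so its cap X
                                   conflicts with Y at the outer level.  */
        h (x);
        /* y is never referenced inside the loop, so L2's allocno for it
           (Y2) has ALLOCNO_NREFS == 0; spilling Y2 within L2 frees a
           register for x without changing any insn.  */
      }
    use (y);
  }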
---
 gcc/ira-color.c                      | 284 ++++++++++++++++--
 gcc/ira-int.h                        |   1 +
 gcc/ira.c                            |   2 +
 .../gcc.target/aarch64/reg-alloc-4.c |  69 +++++
 4 files changed, 326 insertions(+), 30 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/reg-alloc-4.c

diff --git a/gcc/ira-color.c b/gcc/ira-color.c
index 4cee4637b44..b340e3ab7fd 100644
--- a/gcc/ira-color.c
+++ b/gcc/ira-color.c
@@ -36,6 +36,11 @@ along with GCC; see the file COPYING3.  If not see
 #include "reload.h"
 #include "cfgloop.h"
 
+/* To prevent soft conflict detection becoming quadratic in the
+   loop depth.  Only for very pathological cases, so it hardly
+   seems worth a --param.  */
+const int max_soft_conflict_loop_depth = 64;
+
 typedef struct allocno_hard_regs *allocno_hard_regs_t;
 
 /* The structure contains information about hard registers can be
@@ -1698,6 +1703,167 @@ calculate_saved_nregs (int hard_regno, machine_mode mode)
   return nregs;
 }
 
+/* Allocnos A1 and A2 are known to conflict.  Check whether, in some loop L
+   that is either the current loop or a nested subloop, the conflict is of
+   the following form:
+
+   - One allocno (X) is a cap allocno for some non-cap allocno X2.
+
+   - X2 belongs to some loop L2.
+
+   - The other allocno (Y) is a non-cap allocno.
+
+   - Y is an ancestor of some allocno Y2 in L2.  (Note that such a Y2
+     must exist, given that X and Y conflict.)
+
+   - Y2 is not referenced in L2 (that is, ALLOCNO_NREFS (Y2) == 0).
+
+   - Y can use a different allocation from Y2.
+
+   In this case, Y's register is live across L2 but is not used within it,
+   whereas X's register is used only within L2.  The conflict is therefore
+   only "soft", in that it can easily be avoided by spilling Y2 inside L2
+   without affecting any insn references.
+
+   If the conflict does have this form, return the Y2 that would need to be
+   spilled in order to allow X and Y (and thus A1 and A2) to use the same
+   register.  Return null otherwise.  Returning null is conservatively
+   correct; any nonnull return value is an optimization.  */
+ira_allocno_t
+ira_soft_conflict (ira_allocno_t a1, ira_allocno_t a2)
+{
+  /* Search for the loop L and its associated allocnos X and Y.  */
+  int search_depth = 0;
+  while (ALLOCNO_CAP_MEMBER (a1) && ALLOCNO_CAP_MEMBER (a2))
+    {
+      a1 = ALLOCNO_CAP_MEMBER (a1);
+      a2 = ALLOCNO_CAP_MEMBER (a2);
+      if (search_depth++ > max_soft_conflict_loop_depth)
+	return nullptr;
+    }
+  /* This must be true if A1 and A2 conflict.  */
+  ira_assert (ALLOCNO_LOOP_TREE_NODE (a1) == ALLOCNO_LOOP_TREE_NODE (a2));
+
+  /* Make A1 the cap allocno (X in the comment above) and A2 the
+     non-cap allocno (Y in the comment above).  */
+  if (ALLOCNO_CAP_MEMBER (a2))
+    std::swap (a1, a2);
+  if (!ALLOCNO_CAP_MEMBER (a1))
+    return nullptr;
+
+  /* Search for the real allocno that A1 caps (X2 in the comment above).  */
+  do
+    {
+      a1 = ALLOCNO_CAP_MEMBER (a1);
+      if (search_depth++ > max_soft_conflict_loop_depth)
+	return nullptr;
+    }
+  while (ALLOCNO_CAP_MEMBER (a1));
+
+  /* Find the associated allocno for A2 (Y2 in the comment above).  */
+  auto node = ALLOCNO_LOOP_TREE_NODE (a1);
+  auto local_a2 = node->regno_allocno_map[ALLOCNO_REGNO (a2)];
+
+  /* Find the parent of LOCAL_A2/Y2.  LOCAL_A2 must be a descendant of A2
+     for the conflict query to make sense, so this parent lookup must succeed.
+
+     If the parent allocno has no references, it is usually cheaper to
+     spill at that loop level instead.  Keep searching until we find
+     a parent allocno that does have references (but don't look past
+     the starting allocno).  */
+  ira_allocno_t local_parent_a2;
+  for (;;)
+    {
+      local_parent_a2 = ira_parent_allocno (local_a2);
+      if (local_parent_a2 == a2 || ALLOCNO_NREFS (local_parent_a2) != 0)
+	break;
+      local_a2 = local_parent_a2;
+    }
+  if (CHECKING_P)
+    {
+      /* Sanity check to make sure that the conflict we've been given
+	 makes sense.  */
+      auto test_a2 = local_parent_a2;
+      while (test_a2 != a2)
+	{
+	  test_a2 = ira_parent_allocno (test_a2);
+	  ira_assert (test_a2);
+	}
+    }
+  if (local_a2
+      && ALLOCNO_NREFS (local_a2) == 0
+      && ira_subloop_allocnos_can_differ_p (local_parent_a2))
+    return local_a2;
+  return nullptr;
+}
+
+/* The caller has decided to allocate HREGNO to A and has proved that
+   this is safe.
However, the allocation might require the kind of + spilling described in the comment above ira_soft_conflict. + The caller has recorded that: + + - The allocnos in ALLOCNOS_TO_SPILL are the ones that would need + to be spilled to satisfy soft conflicts for at least one allocation + (not necessarily HREGNO). + + - The soft conflicts apply only to A allocations that overlap + SOFT_CONFLICT_REGS. + + If allocating HREGNO is subject to any soft conflicts, record the + subloop allocnos that need to be spilled. */ +static void +spill_soft_conflicts (ira_allocno_t a, bitmap allocnos_to_spill, + HARD_REG_SET soft_conflict_regs, int hregno) +{ + auto nregs = hard_regno_nregs (hregno, ALLOCNO_MODE (a)); + bitmap_iterator bi; + unsigned int i; + EXECUTE_IF_SET_IN_BITMAP (allocnos_to_spill, 0, i, bi) + { + /* SPILL_A needs to be spilled for at least one allocation + (not necessarily this one). */ + auto spill_a = ira_allocnos[i]; + + /* Find the corresponding allocno for this loop. */ + auto conflict_a = spill_a; + do + { + conflict_a = ira_parent_or_cap_allocno (conflict_a); + ira_assert (conflict_a); + } + while (ALLOCNO_LOOP_TREE_NODE (conflict_a)->level + > ALLOCNO_LOOP_TREE_NODE (a)->level); + + ira_assert (ALLOCNO_LOOP_TREE_NODE (conflict_a) + == ALLOCNO_LOOP_TREE_NODE (a)); + + if (conflict_a == a) + { + /* SPILL_A is a descendant of A. We don't know (and don't need + to know) which cap allocnos have a soft conflict with A. + All we need to do is test whether the soft conflict applies + to the chosen allocation. */ + if (ira_hard_reg_set_intersection_p (hregno, ALLOCNO_MODE (a), + soft_conflict_regs)) + ALLOCNO_MIGHT_CONFLICT_WITH_PARENT_P (spill_a) = true; + } + else + { + /* SPILL_A is a descendant of CONFLICT_A, which has a soft conflict + with A. Test whether the soft conflict applies to the current + allocation. */ + ira_assert (ira_soft_conflict (a, conflict_a) == spill_a); + auto conflict_hregno = ALLOCNO_HARD_REGNO (conflict_a); + ira_assert (conflict_hregno >= 0); + auto conflict_nregs = hard_regno_nregs (conflict_hregno, + ALLOCNO_MODE (conflict_a)); + if (hregno + nregs > conflict_hregno + && conflict_hregno + conflict_nregs > hregno) + ALLOCNO_MIGHT_CONFLICT_WITH_PARENT_P (spill_a) = true; + } + } +} + /* Choose a hard register for allocno A. If RETRY_P is TRUE, it means that the function called from function `ira_reassign_conflict_allocnos' and `allocno_reload_assign'. In @@ -1737,6 +1903,8 @@ assign_hard_reg (ira_allocno_t a, bool retry_p) #ifdef STACK_REGS bool no_stack_reg_p; #endif + auto_bitmap allocnos_to_spill; + HARD_REG_SET soft_conflict_regs = {}; ira_assert (! ALLOCNO_ASSIGNED_P (a)); get_conflict_and_start_profitable_regs (a, retry_p, @@ -1824,23 +1992,56 @@ assign_hard_reg (ira_allocno_t a, bool retry_p) mode = ALLOCNO_MODE (conflict_a); conflict_nregs = hard_regno_nregs (hard_regno, mode); - if (conflict_nregs == n_objects && conflict_nregs > 1) + auto spill_a = (retry_p + ? 
nullptr + : ira_soft_conflict (a, conflict_a)); + if (spill_a) { - int num = OBJECT_SUBWORD (conflict_obj); - - if (REG_WORDS_BIG_ENDIAN) - SET_HARD_REG_BIT (conflicting_regs[word], - hard_regno + n_objects - num - 1); - else - SET_HARD_REG_BIT (conflicting_regs[word], - hard_regno + num); + if (bitmap_set_bit (allocnos_to_spill, + ALLOCNO_NUM (spill_a))) + { + ira_loop_border_costs border_costs (spill_a); + auto cost = border_costs.spill_inside_loop_cost (); + auto note_conflict = [&](int r) + { + SET_HARD_REG_BIT (soft_conflict_regs, r); + auto hri = ira_class_hard_reg_index[aclass][r]; + if (hri >= 0) + { + costs[hri] += cost; + full_costs[hri] += cost; + } + }; + for (int r = hard_regno; + r >= 0 && (int) end_hard_regno (mode, r) > hard_regno; + r--) + note_conflict (r); + for (int r = hard_regno + 1; + r < hard_regno + conflict_nregs; + r++) + note_conflict (r); + } } else - conflicting_regs[word] - |= ira_reg_mode_hard_regset[hard_regno][mode]; - if (hard_reg_set_subset_p (profitable_hard_regs, - conflicting_regs[word])) - goto fail; + { + if (conflict_nregs == n_objects && conflict_nregs > 1) + { + int num = OBJECT_SUBWORD (conflict_obj); + + if (REG_WORDS_BIG_ENDIAN) + SET_HARD_REG_BIT (conflicting_regs[word], + hard_regno + n_objects - num - 1); + else + SET_HARD_REG_BIT (conflicting_regs[word], + hard_regno + num); + } + else + conflicting_regs[word] + |= ira_reg_mode_hard_regset[hard_regno][mode]; + if (hard_reg_set_subset_p (profitable_hard_regs, + conflicting_regs[word])) + goto fail; + } } } else if (! retry_p @@ -1951,6 +2152,8 @@ assign_hard_reg (ira_allocno_t a, bool retry_p) { for (i = hard_regno_nregs (best_hard_regno, mode) - 1; i >= 0; i--) allocated_hardreg_p[best_hard_regno + i] = true; + spill_soft_conflicts (a, allocnos_to_spill, soft_conflict_regs, + best_hard_regno); } if (! retry_p) restore_costs_from_copies (a); @@ -2946,6 +3149,8 @@ improve_allocation (void) assigning hard register to allocno A even without spilling conflicting allocnos. 
*/ continue; + auto_bitmap allocnos_to_spill; + HARD_REG_SET soft_conflict_regs = {}; mode = ALLOCNO_MODE (a); nwords = ALLOCNO_NUM_OBJECTS (a); /* Process each allocno conflicting with A and update the cost @@ -2971,31 +3176,49 @@ improve_allocation (void) ALLOCNO_COLOR_DATA (conflict_a)->temp = check; if ((conflict_hregno = ALLOCNO_HARD_REGNO (conflict_a)) < 0) continue; - spill_cost = ALLOCNO_UPDATED_MEMORY_COST (conflict_a); - k = (ira_class_hard_reg_index - [ALLOCNO_CLASS (conflict_a)][conflict_hregno]); - ira_assert (k >= 0); - if ((allocno_costs = ALLOCNO_HARD_REG_COSTS (conflict_a)) - != NULL) - spill_cost -= allocno_costs[k]; + auto spill_a = ira_soft_conflict (a, conflict_a); + if (spill_a) + { + if (!bitmap_set_bit (allocnos_to_spill, + ALLOCNO_NUM (spill_a))) + continue; + ira_loop_border_costs border_costs (spill_a); + spill_cost = border_costs.spill_inside_loop_cost (); + } else - spill_cost -= ALLOCNO_UPDATED_CLASS_COST (conflict_a); - spill_cost - += allocno_copy_cost_saving (conflict_a, conflict_hregno); + { + spill_cost = ALLOCNO_UPDATED_MEMORY_COST (conflict_a); + k = (ira_class_hard_reg_index + [ALLOCNO_CLASS (conflict_a)][conflict_hregno]); + ira_assert (k >= 0); + if ((allocno_costs = ALLOCNO_HARD_REG_COSTS (conflict_a)) + != NULL) + spill_cost -= allocno_costs[k]; + else + spill_cost -= ALLOCNO_UPDATED_CLASS_COST (conflict_a); + spill_cost + += allocno_copy_cost_saving (conflict_a, conflict_hregno); + } conflict_nregs = hard_regno_nregs (conflict_hregno, ALLOCNO_MODE (conflict_a)); + auto note_conflict = [&](int r) + { + if (check_hard_reg_p (a, r, + conflicting_regs, profitable_hard_regs)) + { + if (spill_a) + SET_HARD_REG_BIT (soft_conflict_regs, r); + costs[r] += spill_cost; + } + }; for (r = conflict_hregno; r >= 0 && (int) end_hard_regno (mode, r) > conflict_hregno; r--) - if (check_hard_reg_p (a, r, - conflicting_regs, profitable_hard_regs)) - costs[r] += spill_cost; + note_conflict (r); for (r = conflict_hregno + 1; r < conflict_hregno + conflict_nregs; r++) - if (check_hard_reg_p (a, r, - conflicting_regs, profitable_hard_regs)) - costs[r] += spill_cost; + note_conflict (r); } } min_cost = INT_MAX; @@ -3018,6 +3241,7 @@ improve_allocation (void) by spilling some conflicting allocnos does not improve the allocation cost. */ continue; + spill_soft_conflicts (a, allocnos_to_spill, soft_conflict_regs, best); nregs = hard_regno_nregs (best, mode); /* Now spill conflicting allocnos which contain a hard register of A when we assign the best chosen hard register to it. 
 */
diff --git a/gcc/ira-int.h b/gcc/ira-int.h
index 1bacb255b81..121147cbb1e 100644
--- a/gcc/ira-int.h
+++ b/gcc/ira-int.h
@@ -1067,6 +1067,7 @@ extern void ira_debug_conflicts (bool);
 extern void ira_build_conflicts (void);
 
 /* ira-color.c */
+extern ira_allocno_t ira_soft_conflict (ira_allocno_t, ira_allocno_t);
 extern void ira_debug_hard_regs_forest (void);
 extern int ira_loop_edge_freq (ira_loop_tree_node_t, int, bool);
 extern void ira_reassign_conflict_allocnos (int);
diff --git a/gcc/ira.c b/gcc/ira.c
index 053fdbff469..b7551c1c4e9 100644
--- a/gcc/ira.c
+++ b/gcc/ira.c
@@ -2531,6 +2531,8 @@ check_allocation (void)
 	      int conflict_hard_regno = ALLOCNO_HARD_REGNO (conflict_a);
 	      if (conflict_hard_regno < 0)
 		continue;
+	      if (ira_soft_conflict (a, conflict_a))
+		continue;
 
 	      conflict_nregs = hard_regno_nregs (conflict_hard_regno,
 						 ALLOCNO_MODE (conflict_a));
diff --git a/gcc/testsuite/gcc.target/aarch64/reg-alloc-4.c b/gcc/testsuite/gcc.target/aarch64/reg-alloc-4.c
new file mode 100644
index 00000000000..ceb6f50de2d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/reg-alloc-4.c
@@ -0,0 +1,69 @@
+/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#define PROB 0.1
+
+struct L
+{
+  int data;
+  volatile struct L *next;
+  volatile struct L *inner;
+};
+
+/* The thing we're testing here is that the !head->inner path of the outer loop
+   body has no stack accesses.  It's possible that we'll need to update this
+   pattern for unrelated code changes, but the test should be XFAILed rather
+   than changed if any new stack accesses occur on the !head->inner path.  */
+/*
+** foo:
+** ...
+**	ldr	(w[0-9]+), \[(x[0-9]+)\]
+**	add	(w[0-9]+), (?:\3, \1|\1, \3)
+**	ldr	(x[0-9]+), \[\2, #?16\]
+**	str	\3, \[\2\]
+**	ldr	\2, \[\2, #?8\]
+**	cbn?z	\4, .*
+** ...
+**	ret
+*/
+void
+foo (volatile struct L *head, int inc)
+{
+  while (head)
+    {
+      /* Clobber all call-preserved GPRs, so that the loop has to use
+	 call-clobbered GPRs if it is to avoid spilling.  */
+      asm volatile ("" :::
+		    "x19", "x20", "x21", "x22", "x23",
+		    "x24", "x25", "x26", "x27", "x28");
+      inc = head->data + inc;
+      volatile struct L *inner = head->inner;
+      head->data = inc;
+      head = head->next;
+      if (__builtin_expect_with_probability (inner != 0, 0, PROB))
+	for (int i = 0; i < 1000; ++i)
+	  asm volatile ("" :: /* example allocation: */
+			"r" (i), /* x0 */
+			"r" (inner), /* x1 */
+			"r" (inner->next), /* x2 */
+			"r" (inner->next), /* x3 */
+			"r" (inner->next), /* x4 */
+			"r" (inner->next), /* x5 */
+			"r" (inner->next), /* x6 */
+			"r" (inner->next), /* x7 */
+			"r" (inner->next), /* x8 */
+			"r" (inner->next), /* x9 */
+			"r" (inner->next), /* x10 */
+			"r" (inner->next), /* x11 */
+			"r" (inner->next), /* x12 */
+			"r" (inner->next), /* x13 */
+			"r" (inner->next), /* x14 */
+			"r" (inner->next), /* x15 */
+			"r" (inner->next), /* x16 */
+			"r" (inner->next), /* x17 */
+			"r" (inner->next), /* x18 */
+			"r" (inner->next) : /* x30 */
+			"x19", "x20", "x21", "x22", "x23",
+			"x24", "x25", "x26", "x27", "x28");
    }
}
--
Gitee

From 9e20a3ecdf89cb66e4e1c967fc3783f66e92624b Mon Sep 17 00:00:00 2001
From: zhaowenyu <804544223@qq.com>
Date: Mon, 20 Jun 2022 23:08:48 +0800
Subject: [PATCH 13/14] [loop invariant] Add option to limit the count check
 in loop-invariant motion (backport)

The earlier loop-invariant-motion change, which keeps invariants inside
the loop when they sit in a basic block colder than the loop preheader,
introduces performance regressions on 500.perlbench_r (-3.06%) and
525.x264_r (-0.76%).  Add the option -flim-count-check (off by default)
so that this count check only takes effect when explicitly requested.
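For reference, the check being gated is the one in find_invariants_bb
that refuses to hoist an invariant out of a basic block colder than the
loop preheader.  A hedged example of code it affects (editorial, not
from the patch):

  /* With -flim-count-check, a * b stays inside the loop because the
     block computing it is expected to be cold; with the option left at
     its default (off), the invariant is hoisted to the preheader as
     before.  */
  int
  f (int *p, int n, int a, int b, int rare)
  {
    int sum = 0;
    for (int i = 0; i < n; i++)
      {
        if (__builtin_expect (rare, 0))
          sum += a * b;   /* loop-invariant, but in a cold block */
        sum += p[i];
      }
    return sum;
  }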
---
 gcc/common.opt       | 5 +++++
 gcc/loop-invariant.c | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/gcc/common.opt b/gcc/common.opt
index b5ea3c7a127..4a122cc82ed 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3511,4 +3511,9 @@ fipa-ra
 Common Report Var(flag_ipa_ra) Optimization
 Use caller save register across calls if possible.
 
+flim-count-check
+Common Report Var(flag_lim_count_check) Init(0) Optimization
+Limit count check in loop-invariant.
+
+
 ; This comment is to ensure we retain the blank line above.
diff --git a/gcc/loop-invariant.c b/gcc/loop-invariant.c
index 24b9bcb11dc..262683f0189 100644
--- a/gcc/loop-invariant.c
+++ b/gcc/loop-invariant.c
@@ -1192,7 +1192,7 @@ find_invariants_bb (class loop *loop, basic_block bb, bool always_reached,
 
   /* Don't move insn of cold BB out of loop to preheader to reduce calculations
      and register live range in hot loop with cold BB.  */
-  if (!always_executed && preheader->count > bb->count)
+  if (!always_executed && preheader->count > bb->count && flag_lim_count_check)
    {
      if (dump_file)
	fprintf (dump_file, "Don't move invariant from bb: %d out of loop %d\n",
--
Gitee

From aa89a8fac045aff4c2c07e83e90c374a5ca1ef6a Mon Sep 17 00:00:00 2001
From: zhaowenyu <804544223@qq.com>
Date: Tue, 21 Jun 2022 00:09:46 +0800
Subject: [PATCH 14/14] [ira] Add option to avoid propagating conflicts.

Introduce a new option, -favoid-propagating-conflicts, to gate the
backported changes that avoid propagating conflicts to parent allocnos,
because those changes cause performance regressions on some SPEC2017
benchmarks.  The option is off by default.
---
 gcc/common.opt                       |  3 +
 gcc/ira-build.c                      | 64 +++++++++++++++----
 gcc/ira-color.c                      | 63 +++++++++++++++---
 .../gcc.target/aarch64/reg-alloc-2.c |  2 +-
 .../gcc.target/aarch64/reg-alloc-3.c |  2 +-
 5 files changed, 111 insertions(+), 23 deletions(-)

diff --git a/gcc/common.opt b/gcc/common.opt
index 4a122cc82ed..5e6e32b8f4a 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3515,5 +3515,8 @@ flim-count-check
 Common Report Var(flag_lim_count_check) Init(0) Optimization
 Limit count check in loop-invariant.
 
+favoid-propagating-conflicts
+Common Report Var(flag_avoid_propagating_conflicts) Init(0) Optimization
+Avoid propagating conflicts in IRA.
 
 ; This comment is to ensure we retain the blank line above.
diff --git a/gcc/ira-build.c b/gcc/ira-build.c
index 4dbe23dfe70..734e89fe870 100644
--- a/gcc/ira-build.c
+++ b/gcc/ira-build.c
@@ -2050,25 +2050,64 @@ propagate_allocno_info (void)
	      && bitmap_bit_p (ALLOCNO_LOOP_TREE_NODE (a)->border_allocnos,
			       ALLOCNO_NUM (a)))
	    {
-	      /* Calculate the cost of storing to memory on entry to A's loop,
-		 referencing as memory within A's loop, and restoring from
-		 memory on exit from A's loop.  */
-	      ira_loop_border_costs border_costs (a);
+
	      int spill_cost = INT_MAX;
-	      if (ira_subloop_allocnos_can_differ_p (parent_a))
-		spill_cost = (border_costs.spill_inside_loop_cost ()
-			      + ALLOCNO_MEMORY_COST (a));
+	      if (flag_avoid_propagating_conflicts)
+		{
+		  /* Calculate the cost of storing to memory on entry to A's
+		     loop, referencing as memory within A's loop, and
+		     restoring from memory on exit from A's loop.  */
+		  ira_loop_border_costs border_costs (a);
+		  if (ira_subloop_allocnos_can_differ_p (parent_a))
+		    spill_cost = (border_costs.spill_inside_loop_cost ()
+				  + ALLOCNO_MEMORY_COST (a));
+		}
	      if (! ALLOCNO_BAD_SPILL_P (a))
		ALLOCNO_BAD_SPILL_P (parent_a) = false;
	      ALLOCNO_NREFS (parent_a) += ALLOCNO_NREFS (a);
	      ALLOCNO_FREQ (parent_a) += ALLOCNO_FREQ (a);
-
-	      /* If A's allocation can differ from PARENT_A's, we can if
-		 necessary spill PARENT_A on entry to A's loop and restore it
-		 afterwards.  Doing that has cost SPILL_COST.  */
-	      if (!ira_subloop_allocnos_can_differ_p (parent_a))
-		merge_hard_reg_conflicts (a, parent_a, true);
+	      if (flag_avoid_propagating_conflicts)
+		{
+		  /* If A's allocation can differ from PARENT_A's, we can if
+		     necessary spill PARENT_A on entry to A's loop and restore
+		     it afterwards.  Doing that has cost SPILL_COST.  */
+		  if (!ira_subloop_allocnos_can_differ_p (parent_a))
+		    merge_hard_reg_conflicts (a, parent_a, true);
+		}
+	      if (!flag_avoid_propagating_conflicts)
+		{
+		  ALLOCNO_CALL_FREQ (parent_a) += ALLOCNO_CALL_FREQ (a);
+		  merge_hard_reg_conflicts (a, parent_a, true);
+		  ALLOCNO_CALLS_CROSSED_NUM (parent_a)
+		    += ALLOCNO_CALLS_CROSSED_NUM (a);
+		  ALLOCNO_CHEAP_CALLS_CROSSED_NUM (parent_a)
+		    += ALLOCNO_CHEAP_CALLS_CROSSED_NUM (a);
+		  ALLOCNO_CROSSED_CALLS_ABIS (parent_a)
+		    |= ALLOCNO_CROSSED_CALLS_ABIS (a);
+		  ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (parent_a)
+		    |= ALLOCNO_CROSSED_CALLS_CLOBBERED_REGS (a);
+		  ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (parent_a)
+		    += ALLOCNO_EXCESS_PRESSURE_POINTS_NUM (a);
+		  aclass = ALLOCNO_CLASS (a);
+		  ira_assert (aclass == ALLOCNO_CLASS (parent_a));
+		  ira_allocate_and_accumulate_costs
+		    (&ALLOCNO_HARD_REG_COSTS (parent_a), aclass,
+		     ALLOCNO_HARD_REG_COSTS (a));
+		  ira_allocate_and_accumulate_costs
+		    (&ALLOCNO_CONFLICT_HARD_REG_COSTS (parent_a),
+		     aclass,
+		     ALLOCNO_CONFLICT_HARD_REG_COSTS (a));
+		  ALLOCNO_CLASS_COST (parent_a)
+		    += ALLOCNO_CLASS_COST (a);
+		  ALLOCNO_MEMORY_COST (parent_a) += ALLOCNO_MEMORY_COST (a);
+		}
+	      else
+		{
	      if (!ira_caller_save_loop_spill_p (parent_a, a, spill_cost))
		{
@@ -2097,6 +2136,7 @@ propagate_allocno_info (void)
	    += MIN (ALLOCNO_CLASS_COST (a), spill_cost);
	  ALLOCNO_MEMORY_COST (parent_a) += ALLOCNO_MEMORY_COST (a);
	}
+		}
    }

/* Create allocnos corresponding to pseudo-registers in the current
diff --git a/gcc/ira-color.c b/gcc/ira-color.c
index b340e3ab7fd..263de6fae8f 100644
--- a/gcc/ira-color.c
+++ b/gcc/ira-color.c
@@ -3532,7 +3532,7 @@ color_pass (ira_loop_tree_node_t loop_tree_node)
   unsigned int j;
   bitmap_iterator bi;
   machine_mode mode;
-  enum reg_class rclass, aclass;
+  enum reg_class rclass, aclass, pclass;
   ira_allocno_t a, subloop_allocno;
   ira_loop_tree_node_t subloop_node;
 
@@ -3577,9 +3577,34 @@ color_pass (ira_loop_tree_node_t loop_tree_node)
	  /* Remove from processing in the next loop.  */
	  bitmap_clear_bit (consideration_allocno_bitmap, j);
	  rclass = ALLOCNO_CLASS (a);
-	  subloop_allocno = ALLOCNO_CAP_MEMBER (a);
-	  subloop_node = ALLOCNO_LOOP_TREE_NODE (subloop_allocno);
-	  if (ira_single_region_allocno_p (a, subloop_allocno))
+	  pclass = ira_pressure_class_translate[rclass];
+	  if (!flag_avoid_propagating_conflicts
+	      && flag_ira_region == IRA_REGION_MIXED
+	      && (loop_tree_node->reg_pressure[pclass]
+		  <= ira_class_hard_regs_num[pclass]))
+	    {
+	      mode = ALLOCNO_MODE (a);
+	      hard_regno = ALLOCNO_HARD_REGNO (a);
+	      if (hard_regno >= 0)
+		{
+		  index = ira_class_hard_reg_index[rclass][hard_regno];
+		  ira_assert (index >= 0);
+		}
+	      regno = ALLOCNO_REGNO (a);
+	      subloop_allocno = ALLOCNO_CAP_MEMBER (a);
+	      subloop_node = ALLOCNO_LOOP_TREE_NODE (subloop_allocno);
+	      ira_assert (!ALLOCNO_ASSIGNED_P (subloop_allocno));
+	      ALLOCNO_HARD_REGNO (subloop_allocno) = hard_regno;
+	      ALLOCNO_ASSIGNED_P (subloop_allocno) = true;
+	      if (hard_regno >= 0)
+		update_costs_from_copies (subloop_allocno, true, true);
+	      /* We don't need updated costs anymore.  */
+	      ira_free_allocno_updated_costs (subloop_allocno);
+	    }
+	  else
+	    {
+	      subloop_allocno = ALLOCNO_CAP_MEMBER (a);
+	      subloop_node = ALLOCNO_LOOP_TREE_NODE (subloop_allocno);
+	      if (flag_avoid_propagating_conflicts
+		  && ira_single_region_allocno_p (a, subloop_allocno))
	    {
	      mode = ALLOCNO_MODE (a);
	      hard_regno = ALLOCNO_HARD_REGNO (a);
@@ -3597,6 +3622,7 @@ color_pass (ira_loop_tree_node_t loop_tree_node)
	      /* We don't need updated costs anymore.  */
	      ira_free_allocno_updated_costs (subloop_allocno);
	    }
+	    }
	}

      /* Update costs of the corresponding allocnos (not caps) in
	 the subloops.  */
@@ -3611,6 +3637,9 @@ color_pass (ira_loop_tree_node_t loop_tree_node)
	ira_assert (ALLOCNO_CAP_MEMBER (a) == NULL);
	mode = ALLOCNO_MODE (a);
	rclass = ALLOCNO_CLASS (a);
+	pclass = ira_pressure_class_translate[rclass];
	hard_regno = ALLOCNO_HARD_REGNO (a);
	/* Use hard register class here.  ??? */
	if (hard_regno >= 0)
@@ -3627,11 +3656,16 @@ color_pass (ira_loop_tree_node_t loop_tree_node)
	    ira_assert (ALLOCNO_CLASS (subloop_allocno) == rclass);
	    ira_assert (bitmap_bit_p (subloop_node->all_allocnos,
				      ALLOCNO_NUM (subloop_allocno)));
-	    if (ira_single_region_allocno_p (a, subloop_allocno)
-		|| !ira_subloop_allocnos_can_differ_p (a, hard_regno >= 0))
+	    if (flag_avoid_propagating_conflicts
+		? (ira_single_region_allocno_p (a, subloop_allocno)
+		   || !ira_subloop_allocnos_can_differ_p (a, hard_regno >= 0))
+		: ((flag_ira_region == IRA_REGION_MIXED
+		    && (loop_tree_node->reg_pressure[pclass]
+			<= ira_class_hard_regs_num[pclass]))
+		   || !ira_subloop_allocnos_can_differ_p (a, hard_regno >= 0)))
	      {
-		gcc_assert (!ALLOCNO_MIGHT_CONFLICT_WITH_PARENT_P
-			    (subloop_allocno));
+		if (flag_avoid_propagating_conflicts)
+		  gcc_assert (!ALLOCNO_MIGHT_CONFLICT_WITH_PARENT_P
+			      (subloop_allocno));
		if (! ALLOCNO_ASSIGNED_P (subloop_allocno))
		  {
		    ALLOCNO_HARD_REGNO (subloop_allocno) = hard_regno;
@@ -3769,7 +3803,15 @@ move_spill_restore (void)
	  ira_assert (rclass == ALLOCNO_CLASS (subloop_allocno));
	  ira_loop_border_costs border_costs (subloop_allocno);

-	  /* We have accumulated cost.  To get the real cost of
+	  if (!flag_avoid_propagating_conflicts)
+	    {
+	      cost -= (ALLOCNO_MEMORY_COST (subloop_allocno)
+		       - (ALLOCNO_HARD_REG_COSTS (subloop_allocno) == NULL
+			  ? ALLOCNO_CLASS_COST (subloop_allocno)
+			  : ALLOCNO_HARD_REG_COSTS (subloop_allocno)[index]));
+	    }
+	  else
+	    {
+	      /* We have accumulated cost.  To get the real cost of
	     allocno usage in the loop we should subtract the costs
	     added by propagate_allocno_info for the subloop allocnos.  */
	  int reg_cost
	    = (ALLOCNO_HARD_REG_COSTS (subloop_allocno) == NULL
	       ? ALLOCNO_CLASS_COST (subloop_allocno)
	       : ALLOCNO_HARD_REG_COSTS (subloop_allocno)[index]);

	  int spill_cost
	    = (border_costs.spill_inside_loop_cost ()
	       + ALLOCNO_MEMORY_COST (subloop_allocno));

	  /* If HARD_REGNO conflicts with SUBLOOP_A then
	     propagate_allocno_info will have propagated
	     the cost of spilling HARD_REGNO in SUBLOOP_NODE.
	     (ira_subloop_allocnos_can_differ_p must be true
	     in that case.)  If HARD_REGNO is a caller-saved
	     register, we might have modelled it in the same way.

	     Otherwise, SPILL_COST acted as a cap on the propagated
	     register cost, in cases where the allocations can differ.  */
	  auto conflicts = ira_total_conflict_hard_regs (subloop_allocno);
	  if (TEST_HARD_REG_BIT (conflicts, hard_regno)
	      || (ira_need_caller_save_p (subloop_allocno, hard_regno)
		  && ira_caller_save_loop_spill_p (a, subloop_allocno,
						   spill_cost)))
	    reg_cost = spill_cost;
	  else if (ira_subloop_allocnos_can_differ_p (a))
	    reg_cost = MIN (reg_cost, spill_cost);

	  cost -= ALLOCNO_MEMORY_COST (subloop_allocno) - reg_cost;
+	    }

	  if ((hard_regno2 = ALLOCNO_HARD_REGNO (subloop_allocno)) < 0)
	    /* The register was spilled in the subloop.  If we spill
diff --git a/gcc/testsuite/gcc.target/aarch64/reg-alloc-2.c b/gcc/testsuite/gcc.target/aarch64/reg-alloc-2.c
index d4d260c96b5..7991ba7982f 100644
--- a/gcc/testsuite/gcc.target/aarch64/reg-alloc-2.c
+++ b/gcc/testsuite/gcc.target/aarch64/reg-alloc-2.c
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2 -favoid-propagating-conflicts" } */
 /* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
 
 #define PROB 0.1
diff --git a/gcc/testsuite/gcc.target/aarch64/reg-alloc-3.c b/gcc/testsuite/gcc.target/aarch64/reg-alloc-3.c
index 7acdc432b0c..ae5910ae35a 100644
--- a/gcc/testsuite/gcc.target/aarch64/reg-alloc-3.c
+++ b/gcc/testsuite/gcc.target/aarch64/reg-alloc-3.c
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2 -favoid-propagating-conflicts" } */
 /* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
 
 #define PROB 0.1
--
Gitee
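A closing usage note (editorial, not part of the series): both options
added by these patches are declared with Init(0), so the backport
leaves code generation unchanged unless they are requested explicitly,
for example by compiling with
-O2 -flim-count-check -favoid-propagating-conflicts, mirroring the
-favoid-propagating-conflicts that the final patch adds to the
dg-options of the aarch64 reg-alloc-2.c and reg-alloc-3.c tests.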