diff --git a/0153-LLC-add-extending-outer-loop.patch b/0153-LLC-add-extending-outer-loop.patch new file mode 100644 index 0000000000000000000000000000000000000000..fef87dca3774ea0ae0c5213ddca30f2feebc38fa --- /dev/null +++ b/0153-LLC-add-extending-outer-loop.patch @@ -0,0 +1,1285 @@ +From 4a365290cd9563385d32a22f7b1532c50b69e063 Mon Sep 17 00:00:00 2001 +From: zhaoshujian +Date: Mon, 11 Dec 2023 15:06:28 +0800 +Subject: [PATCH] LLC add extending outer loop + + +diff --git a/gcc/params.opt b/gcc/params.opt +index c429359e3..227175eef 100644 +--- a/gcc/params.opt ++++ b/gcc/params.opt +@@ -1058,4 +1058,10 @@ Common Joined UInteger Var(param_filter_kernels) Init(1) IntegerRange(0, 1) Para + Allow LLC allocate pass to greedily filter kernels by traversing the corresponding basic blocks + through edges with branch probability no less than param_branch_prob_threshold. + ++-param=outer-loop-nums= ++Common Joined UInteger Var(param_outer_loop_num) Init(1) IntegerRange(1, 10) Param ++Maximum number of outer loops allowed to extend outer loops for loops that ++cannot recognize inner loop boundaries. ++ ++ + ; This comment is to ensure we retain the blank line above. +diff --git a/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp +new file mode 100644 +index 000000000..9e98191ed +--- /dev/null ++++ b/gcc/testsuite/g++.dg/llc-allocate/llc-allocate.exp +@@ -0,0 +1,27 @@ ++# Copyright (C) 1997-2022 Free Software Foundation, Inc. ++ ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; either version 3 of the License, or ++# (at your option) any later version. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ++# GNU General Public License for more details. ++# ++# You should have received a copy of the GNU General Public License ++# along with GCC; see the file COPYING3. If not see ++# . ++ ++load_lib g++-dg.exp ++load_lib target-supports.exp ++ ++# Initialize `dg'. ++dg-init ++ ++dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.C]] \ ++ "" "-fllc-allocate" ++ ++# All done. ++dg-finish +\ No newline at end of file +diff --git a/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C b/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C +new file mode 100644 +index 000000000..44a9d7c66 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/llc-allocate/llc-relion-expand-kernels.C +@@ -0,0 +1,53 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param branch-prob-threshold=50 --param filter-kernels=0 --param mem-access-num=2 --param issue-topn=1 --param force-issue=1" } */ ++#include "multidim_array.h" ++ ++class Input ++{ ++ public: ++ int metadata_offset = 13; ++ int exp_nr_images = 1; ++ MultidimArray exp_Mweight; ++ void convertAllSquaredDifferencesToWeights(); ++}; ++ ++int main() ++{ ++ clock_t start = clock(); ++ Input input; ++ int testIter = 2; ++ ++ for (int i = 0; i < testIter; ++i) ++ { ++ input.convertAllSquaredDifferencesToWeights(); ++ } ++ return 0; ++} ++ ++void Input::convertAllSquaredDifferencesToWeights() ++{ ++ for (int img_id = 0; img_id < exp_nr_images; img_id++) ++ { ++ int my_metadata_offset = metadata_offset + img_id; ++ MultidimArray sorted_weight; ++ ++ exp_Mweight.getRow(img_id, sorted_weight); ++ long int np = 0; ++ FOR_ALL_DIRECT_ELEMENTS_IN_MULTIDIMARRAY(sorted_weight) ++ { ++ if (DIRECT_MULTIDIM_ELEM(sorted_weight, n) > 0.) 
++ { ++ DIRECT_MULTIDIM_ELEM(sorted_weight, np) = DIRECT_MULTIDIM_ELEM( \ ++ sorted_weight, n); ++ np++; ++ } ++ } ++ } ++} ++ ++ ++ ++/* { dg-final { scan-tree-dump-times "dense memory access" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "__builtin_prefetch" 1 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "static issue" 1 "llc_allocate" } } */ ++ +diff --git a/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h b/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h +new file mode 100644 +index 000000000..d65066ebf +--- /dev/null ++++ b/gcc/testsuite/g++.dg/llc-allocate/multidim_array.h +@@ -0,0 +1,186 @@ ++#ifndef MULTIDIM_ARRAY_H ++#define MULTIDIM_ARRAY_H ++ ++#include ++ ++#define RELION_ALIGNED_MALLOC malloc ++#define RELION_ALIGNED_FREE free ++ ++#define STARTINGX(v) ((v).xinit) ++#define STARTINGY(v) ((v).yinit) ++#define NZYXSIZE(v) ((v).nzyxdim) ++ ++#define DIRECT_MULTIDIM_ELEM(v,n) ((v).data[(n)]) ++#define FOR_ALL_DIRECT_ELEMENTS_IN_MULTIDIMARRAY(v) \ ++ for (long int n=0; n ++class MultidimArray ++{ ++public: ++ T* data; ++ bool destroyData; ++ long int ndim; ++ long int zdim; ++ long int ydim; ++ long int xdim; ++ long int yxdim; ++ long int zyxdim; ++ long int nzyxdim; ++ long int zinit; ++ long int yinit; ++ long int xinit; ++ long int nzyxdimAlloc; ++ ++public: ++ void clear() ++ { ++ coreDeallocate(); ++ coreInit(); ++ } ++ ++ void coreInit() ++ { ++ xdim=0; ++ yxdim=0; ++ zyxdim=0; ++ nzyxdim=0; ++ ydim=1; ++ zdim=1; ++ ndim=1; ++ zinit=0; ++ yinit=0; ++ xinit=0; ++ data=NULL; ++ nzyxdimAlloc = 0; ++ destroyData=true; ++ } ++ ++ void coreAllocate(long int _ndim, long int _zdim, long int _ydim, long int _xdim) ++ { ++ if (_ndim <= 0 || _zdim <= 0 || _ydim<=0 || _xdim<=0) ++ { ++ clear(); ++ return; ++ } ++ ++ ndim=_ndim; ++ zdim=_zdim; ++ ydim=_ydim; ++ xdim=_xdim; ++ yxdim=ydim*xdim; ++ zyxdim=zdim*yxdim; ++ nzyxdim=ndim*zyxdim; ++ ++ coreAllocate(); ++ } ++ ++ void coreAllocate() ++ { ++ data = 
(T*)RELION_ALIGNED_MALLOC(sizeof(T) * nzyxdim); ++ nzyxdimAlloc = nzyxdim; ++ } ++ ++ void coreDeallocate() ++ { ++ if (data != NULL && destroyData) ++ { ++ RELION_ALIGNED_FREE(data); ++ } ++ data=NULL; ++ nzyxdimAlloc = 0; ++ } ++ ++ void resize(long int Ndim, long int Zdim, long int Ydim, long int Xdim) ++ { ++ if (Ndim*Zdim*Ydim*Xdim == nzyxdimAlloc && data != NULL) ++ { ++ ndim = Ndim; ++ xdim = Xdim; ++ ydim = Ydim; ++ zdim = Zdim; ++ yxdim = Ydim * Xdim; ++ zyxdim = Zdim * yxdim; ++ nzyxdim = Ndim * zyxdim; ++ nzyxdimAlloc = nzyxdim; ++ return; ++ } ++ ++ if (Xdim <= 0 || Ydim <= 0 || Zdim <= 0 || Ndim <= 0) ++ { ++ clear(); ++ return; ++ } ++ ++ if (NZYXSIZE(*this) > 0 && data == NULL) ++ { ++ coreAllocate(); ++ return; ++ } ++ ++ size_t YXdim=Ydim*Xdim; ++ size_t ZYXdim=Zdim*YXdim; ++ size_t NZYXdim=Ndim*ZYXdim; ++ ++ T * new_data = (T*)RELION_ALIGNED_MALLOC(sizeof(T) * NZYXdim); ++ for (long int l = 0; l < Ndim; l++) ++ for (long int k = 0; k < Zdim; k++) ++ for (long int i = 0; i < Ydim; i++) ++ for (long int j = 0; j < Xdim; j++) ++ { ++ T val; ++ new_data[l*ZYXdim + k*YXdim+i*Xdim+j] = val; ++ } ++ coreDeallocate(); ++ ++ data = new_data; ++ ndim = Ndim; ++ xdim = Xdim; ++ ydim = Ydim; ++ zdim = Zdim; ++ yxdim = Ydim * Xdim; ++ zyxdim = Zdim * yxdim; ++ nzyxdim = Ndim * zyxdim; ++ nzyxdimAlloc = nzyxdim; ++ } ++ ++ void resize(long int Xdim) ++ { ++ resize(1, 1, 1, Xdim); ++ } ++ ++ inline T& operator()(long int i, long int j) const ++ { ++ return A2D_ELEM(*this, i, j); ++ } ++ ++ inline T& operator()(long int i) const ++ { ++ return A1D_ELEM(*this, i); ++ } ++ ++ void getRow(long int i, MultidimArray& v) const ++ { ++ if (xdim == 0 || ydim == 0) ++ { ++ v.clear(); ++ return; ++ } ++ ++ v.resize(xdim); ++ for (long int j = 0; j < xdim; j++) ++ v(j) = (*this)(i, j); ++ } ++}; ++ ++#endif /* MULTIDIM_ARRAY_H */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c +index 9bc6cc32b..9f8a5c307 100644 +--- 
a/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-2.c +@@ -39,13 +39,13 @@ main (int argc, char *argv[]) + return 0; + } + +-/* { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 2 "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times "Tracing succeeded" 6 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "ref_count = (?:\[3-9\]|\[1-9\]\\d{1,}), ninsns = \[1-9\]\\d*, mem_to_insn_ratio = 0.\[2-9\]\\d*" 4 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "Tracing succeeded" 14 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-not "Tracing failed" "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-not "static_data_size:" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 4 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times "\{ (?:\\d+\\(\\d+\\) ){1}\}" 2 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-not ", size: (?!(0\.000000))" "llc_allocate" } } */ +-/* { dg-final { scan-tree-dump-times ", size: 0\.000000" 6 "llc_allocate" } } */ ++/* { dg-final { scan-tree-dump-times ", size: 0\.000000" 8 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "\\d x_data \\(0.000000, 1, 0\\) : 3" 2 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "\\d A_j \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } */ + /* { dg-final { scan-tree-dump-times "\\d A_data \\(0.000000, 1, 0\\) : 2" 2 "llc_allocate" } } */ +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp +index 4f34e722f..05a3bf842 100644 +--- a/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-allocate.exp +@@ -24,4 +24,4 @@ dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.c]] \ + "" "-fllc-allocate" + + # All done. 
+-dg-finish ++dg-finish +\ No newline at end of file +diff --git a/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c b/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c +new file mode 100644 +index 000000000..9b2b656fd +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/llc-allocate/llc-extend-outer-loop.c +@@ -0,0 +1,61 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -march=armv8.2-a+sve -static -fllc-allocate -fdump-tree-llc_allocate-details-lineno --param=outer-loop-nums=10 --param=issue-topn=4 --param=force-issue=1 --param=filter-kernels=0" } */ ++#include ++#define N 131590 ++#define F 384477 ++ ++int ownStartPtr[F]; ++double bPrimePtr[N]; ++double diagPtr[N]; ++double psiPtr[N]; ++double upperPtr[F]; ++double lowerPtr[F]; ++int uPtr[F]; ++ ++void SMOOTH(int *ownStartPtr, double *bPrimePtr, double *diagPtr, double *psiPtr, int *uPtr, double *lowerPtr, double *upperPtr, int nCells); ++ ++int main(int argc, char *argv[]) ++{ ++ int nCells = N; ++ int nFaces = F; ++ int testIter = 2; ++ for (int i = 0; i < testIter; i++) ++ { ++ SMOOTH(ownStartPtr, bPrimePtr, diagPtr, psiPtr, uPtr, lowerPtr, upperPtr, nCells); ++ } ++ return 0; ++} ++ ++ ++void SMOOTH(int *ownStartPtr, double *bPrimePtr, double *diagPtr, double *psiPtr, int *uPtr, double *lowerPtr, double *upperPtr, int nCells) ++{ ++ double psii; ++ int fStart; ++ int fEnd = ownStartPtr[0]; ++ ++ for (int celli = 0; celli < nCells; celli++) ++ { ++ fStart = fEnd; ++ fEnd = ownStartPtr[celli + 1]; ++ psii = bPrimePtr[celli]; ++ ++ for (int facei = fStart; facei &references) + + struct loop_filter_out_flag + { +- /* Use external gimple. */ +- bool use_ext_gimple; +- + /* Use external call. */ + bool use_ext_call; + +@@ -358,21 +355,7 @@ bool + filter_out_loop_by_stmt_p (loop_filter_out_flag &loop_filter, gimple *stmt, + const vector &references, unsigned int &start) + { +- /* check use_ext_gimple. 
*/ +- expanded_location cfun_xloc +- = expand_location (DECL_SOURCE_LOCATION (current_function_decl)); + expanded_location xloc = expand_location (stmt->location); +- if (xloc.file && filename_cmp (cfun_xloc.file, xloc.file)) +- { +- if (dump_file && (dump_flags & TDF_DETAILS)) +- { +- fprintf (dump_file, "use_ext_gimple: "); +- print_gimple_stmt (dump_file, stmt, 0, TDF_LINENO); +- } +- loop_filter.use_ext_gimple = true; +- return true; +- } +- + /* check use_ext_call. */ + if (gimple_code (stmt) == GIMPLE_CALL && !gimple_call_internal_p (stmt)) + { +@@ -421,11 +404,6 @@ filter_out_loop_by_stmt_p (loop_filter_out_flag &loop_filter, gimple *stmt, + void + dump_loop_filter_out_flag (loop_filter_out_flag &loop_filter) + { +- if (loop_filter.use_ext_gimple) +- { +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "non-dense mem access: use_ext_gimple\n"); +- } + if (loop_filter.use_ext_call) + { + if (dump_file && (dump_flags & TDF_DETAILS)) +@@ -493,45 +471,6 @@ get_references_in_loop (vector &references, + return !filter_out_loop; + } + +-/* Determine whether the loop is a single path. */ +- +-bool +-single_path_p (class loop *loop, basic_block bb) +-{ +- if (bb == NULL) +- return false; +- if (bb == loop->latch) +- return true; +- +- gimple *stmt = last_stmt (bb); +- bool res = false; +- +- if (stmt && gimple_code (stmt) == GIMPLE_COND) +- { +- gcc_assert (EDGE_COUNT (bb->succs) == 2); +- edge true_edge = NULL; +- edge false_edge = NULL; +- extract_true_false_edges_from_block (bb, &true_edge, &false_edge); +- +- /* Returns false, if a branch occurs. 
*/ +- if (true_edge->dest->loop_father == loop +- && false_edge->dest->loop_father == loop) +- return false; +- +- if (true_edge->dest->loop_father == loop) +- res = single_path_p (loop, true_edge->dest); +- else +- res = single_path_p (loop, false_edge->dest); +- } +- else +- { +- edge e = find_fallthru_edge (bb->succs); +- if (e) +- res = single_path_p (loop, e->dest); +- } +- return res; +-} +- + /* Computes an estimated number of insns in LOOP, weighted by WEIGHTS. + Assume that the HPC data reading and calculation process does not involve + adding branches in loops. Therefore, all bbs of loops are directly used for +@@ -611,6 +550,45 @@ dense_memory_p (const vector &references, class loop *loop) + + /* Analyze the inner loop and get the loop with dense memory access. */ + ++void ++analyze_loop_dense_memory (vector &kernels, ++ map > &kernels_refs, ++ class loop *loop) ++{ ++ vector references; ++ number_of_latch_executions (loop); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "\n========== Processing loop %d: ==========\n", ++ loop->num); ++ loop_dump (dump_file, loop); ++ flow_loop_dump (loop, dump_file, NULL, 1); ++ fprintf (dump_file, "loop unroll: %d\n", loop->unroll); ++ } ++ ++ if (get_loop_exit_edges (loop).length () != 1) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "non-dense mem access: loop_branching\n"); ++ return; ++ } ++ ++ loop_filter_out_flag loop_filter = {false, false, true, false}; ++ ++ if (!get_references_in_loop (references, loop_filter, loop)) ++ { ++ dump_loop_filter_out_flag (loop_filter); ++ return; ++ } ++ ++ if (dense_memory_p (references, loop)) ++ { ++ kernels_refs[loop] = references; ++ kernels.push_back (loop); ++ } ++} ++/* Analyze the inner loop and get the loop with dense memory access. 
*/ ++ + bool + get_dense_memory_kernels (vector &kernels, + map > &kernels_refs) +@@ -619,40 +597,7 @@ get_dense_memory_kernels (vector &kernels, + fprintf (dump_file, "\nPhase 1: get_dense_memory_kernels\n\n"); + class loop *loop = NULL; + FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) +- { +- number_of_latch_executions (loop); +- if (dump_file && (dump_flags & TDF_DETAILS)) +- { +- fprintf (dump_file, "\n========== Processing loop %d: ==========\n", +- loop->num); +- loop_dump (dump_file, loop); +- flow_loop_dump (loop, dump_file, NULL, 1); +- fprintf (dump_file, "loop unroll: %d\n", loop->unroll); +- } +- +- if (get_loop_exit_edges (loop).length () != 1 +- || !single_path_p (loop, loop->header)) +- { +- if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "non-dense mem access: loop_branching\n"); +- continue; +- } +- +- vector references; +- loop_filter_out_flag loop_filter = {false, false, false, true, false}; +- +- if (!get_references_in_loop (references, loop_filter, loop)) +- { +- dump_loop_filter_out_flag (loop_filter); +- continue; +- } +- +- if (dense_memory_p (references, loop)) +- { +- kernels_refs[loop] = references; +- kernels.push_back (loop); +- } +- } ++ analyze_loop_dense_memory (kernels, kernels_refs, loop); + return kernels.size () > 0; + } + +@@ -1094,33 +1039,41 @@ trace_ref_info (data_ref &mem_ref, set &traced_ref_stmt) + mem_ref.trace_status_p = true; + } + ++/* Trace all references in the loop. */ ++ ++void ++trace_loop_refs_info (vector &refs, set &traced_ref_stmt) ++{ ++ for (unsigned i = 0; i < refs.size (); ++i) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "trace_references_base_info %d:\n", i); ++ print_generic_expr (dump_file, refs[i].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ trace_ref_info (refs[i], traced_ref_stmt); ++ } ++} ++ + /* Tracing and sorting reference groups. 
*/ + + void + trace_data_refs_info (vector &kernels, +- map > &loop_refs) ++ map > &loop_refs, ++ set &traced_ref_stmt) + { + if (dump_file) + fprintf (dump_file, "\nPhase 2: trace_all_references_info\n\n"); + +- set traced_ref_stmt; +- + for (unsigned i = 0; i < kernels.size (); ++i) + { +- class loop* loop = kernels[i]; +- ++ class loop *loop = kernels[i]; ++ if (loop_refs.count (loop) == 0) ++ continue; + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "loop header %d:\n", loop->header->index); +- for (unsigned j = 0; j < loop_refs[loop].size (); ++j) +- { +- if (dump_file && (dump_flags & TDF_DETAILS)) +- { +- fprintf (dump_file, "trace_references_base_info %d:\n", j); +- print_generic_expr (dump_file, loop_refs[loop][j].ref, TDF_SLIM); +- fprintf (dump_file, "\n"); +- } +- trace_ref_info (loop_refs[loop][j], traced_ref_stmt); +- } ++ trace_loop_refs_info (loop_refs[loop], traced_ref_stmt); + } + } + +@@ -1205,7 +1158,7 @@ void + check_bound_iv_and_add_worklist (vector &worklist, set &walked, + tree t, data_ref &mem_ref) + { +- if (TREE_CODE (t) != SSA_NAME) ++ if (t == NULL_TREE || TREE_CODE (t) != SSA_NAME) + return; + + gimple *def_stmt = SSA_NAME_DEF_STMT (t); +@@ -1278,8 +1231,13 @@ trace_loop_bound_iv (data_ref &mem_ref) + } + + if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "\nmem_ref access dimension: %ld\n", +- mem_ref.loop_bounds.size ()); ++ { ++ fprintf (dump_file, "\nmem_ref access dimension: %ld\n", ++ mem_ref.loop_bounds.size ()); ++ fprintf (dump_file, "Traced variables: "); ++ print_generic_expr (dump_file, mem_ref.base, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } + + return mem_ref.loop_bounds.size () > 0; + } +@@ -1487,7 +1445,7 @@ trace_and_create_dominate_loop_bounds (data_ref &mem_ref) + if (dump_file && (dump_flags & TDF_DETAILS)) + { + print_generic_expr (dump_file, mem_ref.ref, TDF_SLIM); +- fprintf (dump_file, "Tracing loop bound failed at dimension %d", ++ fprintf (dump_file, "Tracing loop 
bound failed at dimension %d\n", + i); + } + mem_ref.calc_by = UNHANDLE_CALC; +@@ -1565,42 +1523,246 @@ trace_ref_dimension_and_loop_bounds (data_ref &mem_ref) + static_calculate_data_size (mem_ref); + } + +-/* analyze nested kernels. +- 1. multidimension loop analyze. +- 2. extended outer loop analyze. +- Later we will extend outer loop analysis. ++/* Get the loop's niters tree. ++ Return NULL_TREE if not found. */ ++ ++tree ++get_cur_loop_niters (map > &loop_refs, ++ class loop* loop) ++{ ++ if (loop_refs.count (loop) == 0) ++ return NULL_TREE; ++ vector bounds = loop_refs[loop][0].loop_bounds; ++ return bounds.size () ? bounds[0].niters : NULL_TREE; ++} ++ ++/* Trace the sources of the niters tree and return the ++ outermost depth of the loops containing them. ++ Return start_depth if not found. ++ ++ example: ++ niters:(long) (((int) i_end_417 - (int) i_start_452) + 1) ++ operand_num: 1, subtree:(long) (((int) i_end_417 - (int) i_start_452) + 1) ++ operand_num: 2, subtree:((int) i_end_417 - (int) i_start_452) + 1 ++ operand_num: 2, subtree:(int) i_end_417 - (int) i_start_452 ++ operand_num: 1, subtree:(int) i_end_417 ++ SSA_NAME of niters: i_end_417 ++ gimple of SSA: i_end_417 = PHI ++ return gimple depth; ++*/ ++ ++unsigned ++trace_outer_loop_depth (tree niters, unsigned start_depth) ++{ ++ /* If niter does not exist or the type is INTEGER_CST, ++ the loop bound is determined and return start_depth. */ ++ if (niters == NULL_TREE || TREE_CODE (niters) == INTEGER_CST) ++ return start_depth; ++ ++ gimple *def_stmt = NULL; ++ /* niters examples: i_start_452, fEnd_35, fEnd_100. */ ++ enum tree_code niter_code = TREE_CODE (niters); ++ if (niter_code == SSA_NAME) ++ { ++ /* Trace the SSA that define this niter. 
*/ ++ def_stmt = SSA_NAME_DEF_STMT (niters); ++ enum gimple_code stmt_code = gimple_code (def_stmt); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "ssa_name of niters: "); ++ print_generic_expr (dump_file, niters); ++ fprintf (dump_file, "\ngimple of ssa: \n"); ++ print_gimple_stmt (dump_file, def_stmt, 0, TDF_LINENO); ++ fprintf (dump_file, "\n"); ++ } ++ /* Termination condition of dfs. Return the depth of the bb block. */ ++ if (stmt_code == GIMPLE_PHI || stmt_code == GIMPLE_NOP) ++ { ++ basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (niters)); ++ if (def_bb == NULL || def_bb->loop_father == NULL) ++ return start_depth; ++ unsigned ret_depth = loop_depth (def_bb->loop_father); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "Stop tracing the outer loop depth, "); ++ fprintf (dump_file, "current depth: %d, current bb: %d\n", \ ++ ret_depth, def_bb->index); ++ } ++ return ret_depth; ++ } ++ /* 'ASSIGN': Use dfs to trace the rhs of the assignment statement. */ ++ else if (stmt_code == GIMPLE_ASSIGN) ++ { ++ tree rhs = gimple_assign_rhs1 (def_stmt); ++ if (TREE_CODE (rhs) == TARGET_MEM_REF) ++ /* fEnd_35 = MEM[base: _19, index: ivtmp.96, step: 4, ++ offset: 0B] */ ++ return trace_outer_loop_depth (TREE_OPERAND (rhs, 2), start_depth); ++ else ++ { ++ /* M.218_658 = MIN_EXPR <_631, _657> */ ++ unsigned min_depth = start_depth; ++ unsigned operand_num = gimple_num_ops (def_stmt); ++ /* 'ASSIGN': start from 1 because op[0] is the lhs. */ ++ for (unsigned i = 1; i < operand_num; i++) ++ { ++ tree subtree = GIMPLE_CHECK2 ++ (def_stmt)->op[i]; ++ if (subtree == NULL) ++ continue; ++ unsigned depth = trace_outer_loop_depth (subtree, \ ++ start_depth); ++ min_depth = MIN (min_depth, depth); ++ } ++ return min_depth; ++ } ++ } ++ else ++ { ++ /* Adding termination conditions: ++ 1. Niters is MEM variable; ++ 2. Niters is a runtime value (smooth_uPtr), and consider \ ++ finding footprint in other mem_ref; ++ 3. 
Niters is loop variable (i_start/i_end), and the boundary in \ ++ the outer loop depends on the variable j_start/j_end. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "The loop termination condition"); ++ fprintf (dump_file, "is to be extended.\n"); ++ } ++ return start_depth; ++ } ++ } ++ /* The operand nums can be obtained when the tree code is as follows. */ ++ else if (niter_code == NOP_EXPR || niter_code == MEM_REF ++ || niter_code == ARRAY_REF || niter_code == COND_EXPR ++ || niter_code == PLUS_EXPR || niter_code == MINUS_EXPR ++ || niter_code == TARGET_MEM_REF || niter_code == POINTER_PLUS_EXPR) ++ { ++ /* operand_num is the operand in the niters statement. ++ example: In the following niter statement, operand_num = 3. ++ (unsigned int) fEnd_35 - (unsigned int) fEnd_100 + 4294967295. */ ++ unsigned operand_num = TREE_OPERAND_LENGTH (niters); ++ unsigned min_depth = start_depth; ++ for (unsigned i = 0; i < operand_num; i++) ++ { ++ tree subtree = TREE_OPERAND (niters, i); ++ if (subtree == NULL) ++ continue; ++ unsigned depth = trace_outer_loop_depth (subtree, start_depth); ++ min_depth = MIN (min_depth, depth); ++ } ++ return min_depth; ++ } ++ else ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "niters is another tree code: %s\n", \ ++ get_tree_code_name (niter_code)); ++ print_generic_expr (dump_file, niters, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ return start_depth; ++ } ++} ++ ++/* Traces the ref dimension information in each loop. 
*/ ++ ++void ++analyze_loop_refs_dimension (vector &refs) ++{ ++ for (unsigned i = 0; i < refs.size (); ++i) ++ { ++ if (refs[i].trace_status_p == false) ++ continue; ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "trace_reference_dimension %d:\n", i); ++ print_generic_expr (dump_file, refs[i].ref, TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } ++ trace_ref_dimension_and_loop_bounds (refs[i]); ++ } ++} ++/* analyze nested kernels ++ 1. multidimension loop analyze ++ 2. extended outer loop analyze + */ + + bool + analyze_nested_kernels (vector &kernels, +- map > &loop_refs) ++ map > &loop_refs, ++ set &traced_ref_stmt) + { + if (dump_file) + fprintf (dump_file, "\nPhase 3: analyze_nested_kernels\n\n"); + +- for (unsigned i = 0; i < kernels.size (); ++i) ++ /* `kernels` may be added in during outer loop extension phase, ++ thus using initial size to avoid repeatedly analyzing. */ ++ unsigned init_kernels_size = kernels.size (); ++ for (unsigned i = 0; i < init_kernels_size; ++i) + { + class loop* loop = kernels[i]; + if (loop_refs.count (loop) == 0) + continue; + + if (dump_file && (dump_flags & TDF_DETAILS)) +- fprintf (dump_file, "\n\nloop header %d:\n", loop->header->index); +- for (unsigned j = 0; j < loop_refs[loop].size (); ++j) ++ fprintf (dump_file, "loop header %d:\n", loop->header->index); ++ analyze_loop_refs_dimension (loop_refs[loop]); ++ ++ unsigned depth = loop_depth (loop); ++ unsigned outer_depth = trace_outer_loop_depth (get_cur_loop_niters \ ++ (loop_refs, loop), depth); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "cur_depth: %d, outer_depth: %d\n", \ ++ depth, outer_depth); ++ /* param_outer_loop_num: number of loops of the extended outer loop. ++ Outermost loop should not be extended when outer_depth = 0. ++ `outer_depth == depth` means the current loop is the loop which ++ boundary is known, so there is no need to extend the outer loop. 
*/ ++ if (outer_depth == 0 || outer_depth == depth ++ || depth > outer_depth + param_outer_loop_num) ++ continue; ++ /* Extend outer loop. */ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "\nStart extending outer loop\n"); ++ /* Superloops of the loop, start from the loop closest to the \ ++ current loop in the outermost loop. */ ++ for (unsigned j = 0; j < param_outer_loop_num && --depth; ++j) + { +- if (loop_refs[loop][j].trace_status_p == false) ++ class loop* outer_loop = (*loop->superloops)[depth]; ++ /* The outer loop may be added when analyzing previous inner loops, ++ i.e. the outer loop contains two or more inner loops. */ ++ if (loop_refs.count (outer_loop)) + continue; +- +- if (dump_file && (dump_flags & TDF_DETAILS)) ++ /* phase1~phase3 analysis on the extended outer loop. */ ++ analyze_loop_dense_memory (kernels, loop_refs, outer_loop); ++ if (loop_refs.count (outer_loop) == 0) ++ continue; ++ for (unsigned k = 0; k < loop_refs[outer_loop].size (); ++k) + { +- fprintf (dump_file, "\ntrace_reference_dimension at mem_ref " +- "index %d in loop %d:\n", j, loop->num); +- print_generic_expr (dump_file, loop_refs[loop][j].ref, TDF_SLIM); +- fprintf (dump_file, "\n"); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "outer_analyze_nested_kernels %d: ", k); ++ print_generic_expr (dump_file, loop_refs[outer_loop][k].ref,\ ++ TDF_SLIM); ++ fprintf (dump_file, "\n"); ++ } + } +- trace_ref_dimension_and_loop_bounds (loop_refs[loop][j]); ++ trace_loop_refs_info (loop_refs[outer_loop], traced_ref_stmt); ++ analyze_loop_refs_dimension (loop_refs[outer_loop]); ++ outer_depth = trace_outer_loop_depth (get_cur_loop_niters \ ++ (loop_refs, outer_loop), depth); ++ /* `outer_depth == depth` means the current loop is the loop which ++ boundary is known, so there is no need to extend the outer loop. 
*/ ++ if (outer_depth == depth) ++ break; ++ else ++ /* The outer loop cannot find the current loop boundary, ++ Remove the record of outer_loop from the loop_refs. */ ++ loop_refs.erase (outer_loop); + } +- + } + return true; + } +@@ -2694,9 +2856,10 @@ llc_allocate (void) + if (!get_dense_memory_kernels (kernels, kernels_refs)) + return; + +- trace_data_refs_info (kernels, kernels_refs); ++ set traced_ref_stmt; ++ trace_data_refs_info (kernels, kernels_refs, traced_ref_stmt); + +- if (!analyze_nested_kernels (kernels, kernels_refs)) ++ if (!analyze_nested_kernels (kernels, kernels_refs, traced_ref_stmt)) + return; + + vector sorted_kernels; +-- +2.33.0 + diff --git a/gcc.spec b/gcc.spec index d80bb4ce36e0a6ca3936f733bad65d6f4064042d..926d7d8e4783a173d9f7377c9aa1bb43739363d6 100644 --- a/gcc.spec +++ b/gcc.spec @@ -61,7 +61,7 @@ Summary: Various compilers (C, C++, Objective-C, ...) Name: gcc Version: %{gcc_version} -Release: 41 +Release: 42 License: GPLv3+ and GPLv3+ with exceptions and GPLv2+ with exceptions and LGPLv2+ and BSD URL: https://gcc.gnu.org @@ -260,6 +260,7 @@ Patch149: 0149-Add-more-flexible-check-for-pointer-aliasing-during-.patch Patch150: 0150-Implement-propagation-of-permutations-in-fwprop.patch Patch151: 0151-Fix-bugs-and-add-tests-for-RTL-ifcvt.patch Patch152: 0152-Add-LLC-Allocation-Pass.patch +Patch153: 0153-LLC-add-extending-outer-loop.patch %global gcc_target_platform %{_arch}-linux-gnu @@ -865,6 +866,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch150 -p1 %patch151 -p1 %patch152 -p1 +%patch153 -p1 %build @@ -2889,6 +2891,12 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Tue Dec 12 2023 Shujian Zhao - 10.3.1-42 +- Type:Spec +- ID:NA +- SUG:NA +- DESC: Sync patch from openeuler/gcc, add LLC extending outer loop. + * Mon Dec 11 2023 Feiyang Liu - 10.3.1-41 - Type:Spec - ID:NA