From c3948da8f29a3227af844bc81614b951cd72ea25 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 17 Jan 2024 14:28:56 +0800 Subject: [PATCH 01/15] [Sync] x86: Add a new option -mdaz-ftz to enable FTZ and DAZ flags in MXCSR. if (mdaz-ftz) link crtfastmath.o else if ((Ofast || ffast-math || funsafe-math-optimizations) && !mno-daz-ftz) link crtfastmath.o else Don't link crtfastmath.o gcc/ChangeLog: * config/i386/cygwin.h (ENDFILE_SPEC): Link crtfastmath.o whenever -mdaz-ftz is specified. Don't link crtfastmath.o when -mno-daz-ftz is specified. * config/i386/darwin.h (ENDFILE_SPEC): Ditto. * config/i386/gnu-user-common.h (GNU_USER_TARGET_MATHFILE_SPEC): Ditto. * config/i386/mingw32.h (ENDFILE_SPEC): Ditto. * config/i386/i386.opt (mdaz-ftz): New option. * doc/invoke.texi (x86 options): Document mftz-daz. --- ...ption-mdaz-ftz-to-enable-FTZ-and-DAZ.patch | 135 ++++++++++++++++++ gcc.spec | 8 +- 2 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch diff --git a/0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch b/0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch new file mode 100644 index 0000000..5148d7e --- /dev/null +++ b/0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch @@ -0,0 +1,135 @@ +From 75d05d4e2cb6ac0b85391c51a59925a97eaee85f Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Wed, 10 May 2023 15:16:58 +0800 +Subject: x86: Add a new option -mdaz-ftz to enable FTZ and DAZ + flags in MXCSR. + + if (mdaz-ftz) + link crtfastmath.o + else if ((Ofast || ffast-math || funsafe-math-optimizations) + && !mno-daz-ftz) + link crtfastmath.o + else + Don't link crtfastmath.o + +gcc/ChangeLog: + + * config/i386/cygwin.h (ENDFILE_SPEC): Link crtfastmath.o + whenever -mdaz-ftz is specified. Don't link crtfastmath.o + when -mno-daz-ftz is specified. + * config/i386/darwin.h (ENDFILE_SPEC): Ditto. + * config/i386/gnu-user-common.h + (GNU_USER_TARGET_MATHFILE_SPEC): Ditto. + * config/i386/mingw32.h (ENDFILE_SPEC): Ditto. + * config/i386/i386.opt (mdaz-ftz): New option. + * doc/invoke.texi (x86 options): Document mftz-daz. +--- + gcc/config/i386/cygwin.h | 2 +- + gcc/config/i386/darwin.h | 4 ++-- + gcc/config/i386/gnu-user-common.h | 2 +- + gcc/config/i386/i386.opt | 4 ++++ + gcc/config/i386/mingw32.h | 2 +- + gcc/doc/invoke.texi | 11 ++++++++++- + 6 files changed, 19 insertions(+), 6 deletions(-) + +diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h +index d06eda369cf..5412c5d4479 100644 +--- a/gcc/config/i386/cygwin.h ++++ b/gcc/config/i386/cygwin.h +@@ -57,7 +57,7 @@ along with GCC; see the file COPYING3. If not see + + #undef ENDFILE_SPEC + #define ENDFILE_SPEC \ +- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}\ ++ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \ + %{!shared:%:if-exists(default-manifest.o%s)}\ + %{fvtable-verify=none:%s; \ + fvtable-verify=preinit:vtv_end.o%s; \ +diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h +index a55f6b2b874..2f773924d6e 100644 +--- a/gcc/config/i386/darwin.h ++++ b/gcc/config/i386/darwin.h +@@ -109,8 +109,8 @@ along with GCC; see the file COPYING3. If not see + "%{!force_cpusubtype_ALL:-force_cpusubtype_ALL} " + + #undef ENDFILE_SPEC +-#define ENDFILE_SPEC \ +- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ ++#define ENDFILE_SPEC ++\ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \ + %{mpc32:crtprec32.o%s} \ + %{mpc64:crtprec64.o%s} \ + %{mpc80:crtprec80.o%s}" TM_DESTRUCTOR +diff --git a/gcc/config/i386/gnu-user-common.h b/gcc/config/i386/gnu-user-common.h +index 23b54c5be52..3d2a33f1714 100644 +--- a/gcc/config/i386/gnu-user-common.h ++++ b/gcc/config/i386/gnu-user-common.h +@@ -47,7 +47,7 @@ along with GCC; see the file COPYING3. If not see + + /* Similar to standard GNU userspace, but adding -ffast-math support. */ + #define GNU_USER_TARGET_MATHFILE_SPEC \ +- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ ++ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \ + %{mpc32:crtprec32.o%s} \ + %{mpc64:crtprec64.o%s} \ + %{mpc80:crtprec80.o%s}" +diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt +index fc1b944acb8..498fb454d01 100644 +--- a/gcc/config/i386/i386.opt ++++ b/gcc/config/i386/i386.opt +@@ -420,6 +420,10 @@ mpc80 + Target RejectNegative + Set 80387 floating-point precision to 80-bit. + ++mdaz-ftz ++Target ++Set the FTZ and DAZ Flags. ++ + mpreferred-stack-boundary= + Target RejectNegative Joined UInteger Var(ix86_preferred_stack_boundary_arg) + Attempt to keep stack aligned to this power of 2. +diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h +index d3ca0cd0279..ddbe6a4054b 100644 +--- a/gcc/config/i386/mingw32.h ++++ b/gcc/config/i386/mingw32.h +@@ -197,7 +197,7 @@ along with GCC; see the file COPYING3. If not see + + #undef ENDFILE_SPEC + #define ENDFILE_SPEC \ +- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ ++ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \ + %{!shared:%:if-exists(default-manifest.o%s)}\ + %{fvtable-verify=none:%s; \ + fvtable-verify=preinit:vtv_end.o%s; \ +diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +index 2b376e0e995..3a48655e588 100644 +--- a/gcc/doc/invoke.texi ++++ b/gcc/doc/invoke.texi +@@ -1437,7 +1437,7 @@ See RS/6000 and PowerPC Options. + -m96bit-long-double -mlong-double-64 -mlong-double-80 -mlong-double-128 @gol + -mregparm=@var{num} -msseregparm @gol + -mveclibabi=@var{type} -mvect8-ret-in-mem @gol +--mpc32 -mpc64 -mpc80 -mstackrealign @gol ++-mpc32 -mpc64 -mpc80 -mdaz-ftz -mstackrealign @gol + -momit-leaf-frame-pointer -mno-red-zone -mno-tls-direct-seg-refs @gol + -mcmodel=@var{code-model} -mabi=@var{name} -maddress-mode=@var{mode} @gol + -m32 -m64 -mx32 -m16 -miamcu -mlarge-data-threshold=@var{num} @gol +@@ -32122,6 +32122,15 @@ are enabled by default; routines in such libraries could suffer significant + loss of accuracy, typically through so-called ``catastrophic cancellation'', + when this option is used to set the precision to less than extended precision. + ++@item -mdaz-ftz ++@opindex mdaz-ftz ++ ++The flush-to-zero (FTZ) and denormals-are-zero (DAZ) flags in the MXCSR register ++are used to control floating-point calculations.SSE and AVX instructions ++including scalar and vector instructions could benefit from enabling the FTZ ++and DAZ flags when @option{-mdaz-ftz} is specified. Don't set FTZ/DAZ flags ++when @option{-mno-daz-ftz} is specified. ++ + @item -mstackrealign + @opindex mstackrealign + Realign the stack at entry. On the x86, the @option{-mstackrealign} +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index 937761f..ff7b64f 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 16 +%global gcc_release 17 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -163,6 +163,7 @@ Patch25: 0025-AArch64-Rewrite-the-tsv110-option.patch Patch26: 0026-GOMP-Enabling-moutline-atomics-improves-libgomp-perf.patch Patch27: 0027-LoopElim-Redundant-loop-elimination-optimization.patch Patch28: 0028-Array-widen-compare-Fix-the-return-value-match-after.patch +Patch29: 0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch # On ARM EABI systems, we do want -gnueabi to be part of the # target triple. @@ -657,6 +658,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch26 -p1 %patch27 -p1 %patch28 -p1 +%patch29 -p1 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2888,6 +2890,10 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Jan 17 2024 liuhongt 12.3.1-17 +- Type: Sync +- DESC: x86: Add a new option -mdaz-ftz to enable FTZ and DAZ flags in MXCSR. + * Mon Sep 11 2023 dingguangya 12.3.1-16 - Type: Sync - DESC: Sync patch from openeuler/gcc -- Gitee From 5d9af8463edaf55215cd9b36e777057b02e6f392 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 17 Jan 2024 14:37:25 +0800 Subject: [PATCH 02/15] [Sync] Explicitly view_convert_expr mask to signed type when folding pblendvb builtins. Since mask < 0 will be always false for vector char when -funsigned-char, but vpblendvb needs to check the most significant bit. The patch explicitly VCE to vector signed char. gcc/ChangeLog: PR target/110108 * config/i386/i386.cc (ix86_gimple_fold_builtin): Explicitly view_convert_expr mask to signed type when folding pblendvb builtins. gcc/testsuite/ChangeLog: * gcc.target/i386/pr110108-2.c: New test. --- ...convert_expr-mask-to-signed-type-whe.patch | 65 +++++++++++++++++++ gcc.spec | 9 ++- 2 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch diff --git a/0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch b/0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch new file mode 100644 index 0000000..4853877 --- /dev/null +++ b/0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch @@ -0,0 +1,65 @@ +From 91c609109438b970dd32b4bd7eefcfab1be7fed9 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Mon, 5 Jun 2023 12:38:41 +0800 +Subject: Explicitly view_convert_expr mask to signed type when + folding pblendvb builtins. + +Since mask < 0 will be always false for vector char when +-funsigned-char, but vpblendvb needs to check the most significant +bit. The patch explicitly VCE to vector signed char. + +gcc/ChangeLog: + + PR target/110108 + * config/i386/i386.cc (ix86_gimple_fold_builtin): Explicitly + view_convert_expr mask to signed type when folding pblendvb + builtins. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr110108-2.c: New test. +--- + gcc/config/i386/i386.cc | 4 +++- + gcc/testsuite/gcc.target/i386/pr110108-2.c | 14 ++++++++++++++ + 2 files changed, 17 insertions(+), 1 deletion(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr110108-2.c + +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc +index 462dce10e5c..479fc601049 100644 +--- a/gcc/config/i386/i386.cc ++++ b/gcc/config/i386/i386.cc +@@ -18396,8 +18396,10 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) + tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode + ? intSI_type_node : intDI_type_node; + type = get_same_sized_vectype (itype, type); +- arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2); + } ++ else ++ type = signed_type_for (type); ++ arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2); + tree zero_vec = build_zero_cst (type); + tree cmp_type = truth_type_for (type); + tree cmp = gimple_build (&stmts, LT_EXPR, cmp_type, arg2, zero_vec); +diff --git a/gcc/testsuite/gcc.target/i386/pr110108-2.c b/gcc/testsuite/gcc.target/i386/pr110108-2.c +new file mode 100644 +index 00000000000..2d1d2fd4991 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr110108-2.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx2 -O2 -funsigned-char" } */ ++/* { dg-final { scan-assembler-times "vpblendvb" 2 } } */ ++ ++#include ++__m128i do_stuff_128(__m128i X0, __m128i X1, __m128i X2) { ++ __m128i Result = _mm_blendv_epi8(X0, X1, X2); ++ return Result; ++} ++ ++__m256i do_stuff_256(__m256i X0, __m256i X1, __m256i X2) { ++ __m256i Result = _mm256_blendv_epi8(X0, X1, X2); ++ return Result; ++} +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index ff7b64f..1163542 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 17 +%global gcc_release 18 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -164,6 +164,7 @@ Patch26: 0026-GOMP-Enabling-moutline-atomics-improves-libgomp-perf.patch Patch27: 0027-LoopElim-Redundant-loop-elimination-optimization.patch Patch28: 0028-Array-widen-compare-Fix-the-return-value-match-after.patch Patch29: 0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch +Patch30: 0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch # On ARM EABI systems, we do want -gnueabi to be part of the # target triple. @@ -659,6 +660,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch27 -p1 %patch28 -p1 %patch29 -p1 +%patch30 -p1 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2890,6 +2892,11 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Jan 17 2024 liuhongt 12.3.1-18 +- Type: Sync +- DESC: Explicitly view_convert_expr mask to signed type when folding + pblendvb builtins. + * Wed Jan 17 2024 liuhongt 12.3.1-17 - Type: Sync - DESC: x86: Add a new option -mdaz-ftz to enable FTZ and DAZ flags in MXCSR. -- Gitee From 3fd344c8f493de6f5ad65bd7327c304079421ace Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 17 Jan 2024 14:41:36 +0800 Subject: [PATCH 03/15] [Sync] Make option mvzeroupper independent of optimization level. pass_insert_vzeroupper is under condition TARGET_AVX && TARGET_VZEROUPPER && flag_expensive_optimizations && !optimize_size But the document of mvzeroupper doesn't mention the insertion required -O2 and above, it may confuse users when they explicitly use -Os -mvzeroupper. ------------ mvzeroupper Target Mask(VZEROUPPER) Save Generate vzeroupper instruction before a transfer of control flow out of the function. ------------ The patch moves flag_expensive_optimizations && !optimize_size to ix86_option_override_internal. It makes -mvzeroupper independent of optimization level, but still keeps the behavior of architecture tuning(emit_vzeroupper) unchanged. gcc/ChangeLog: * config/i386/i386-features.cc (pass_insert_vzeroupper:gate): Move flag_expensive_optimizations && !optimize_size to .. * config/i386/i386-options.cc (ix86_option_override_internal): .. this, it makes -mvzeroupper independent of optimization level, but still keeps the behavior of architecture tuning(emit_vzeroupper) unchanged. gcc/testsuite/ChangeLog: * gcc.target/i386/avx-vzeroupper-29.c: New testcase. * gcc.target/i386/avx-vzeroupper-12.c: Adjust testcase. * gcc.target/i386/avx-vzeroupper-7.c: Ditto. * gcc.target/i386/avx-vzeroupper-9.c: Ditto. --- ...eroupper-independent-of-optimization.patch | 137 ++++++++++++++++++ gcc.spec | 8 +- 2 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 0031-Make-option-mvzeroupper-independent-of-optimization.patch diff --git a/0031-Make-option-mvzeroupper-independent-of-optimization.patch b/0031-Make-option-mvzeroupper-independent-of-optimization.patch new file mode 100644 index 0000000..0a7aacd --- /dev/null +++ b/0031-Make-option-mvzeroupper-independent-of-optimization.patch @@ -0,0 +1,137 @@ +From 38364a48b8439a9b16717136835d28690d2a2dd6 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Mon, 26 Jun 2023 09:50:25 +0800 +Subject: [Sync] Make option mvzeroupper independent of optimization level. + +pass_insert_vzeroupper is under condition + +TARGET_AVX && TARGET_VZEROUPPER +&& flag_expensive_optimizations && !optimize_size + +But the document of mvzeroupper doesn't mention the insertion +required -O2 and above, it may confuse users when they explicitly +use -Os -mvzeroupper. + +------------ +mvzeroupper +Target Mask(VZEROUPPER) Save +Generate vzeroupper instruction before a transfer of control flow out of +the function. +------------ + +The patch moves flag_expensive_optimizations && !optimize_size to +ix86_option_override_internal. It makes -mvzeroupper independent of +optimization level, but still keeps the behavior of architecture +tuning(emit_vzeroupper) unchanged. + +gcc/ChangeLog: + + * config/i386/i386-features.cc (pass_insert_vzeroupper:gate): + Move flag_expensive_optimizations && !optimize_size to .. + * config/i386/i386-options.cc (ix86_option_override_internal): + .. this, it makes -mvzeroupper independent of optimization + level, but still keeps the behavior of architecture + tuning(emit_vzeroupper) unchanged. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/avx-vzeroupper-29.c: New testcase. + * gcc.target/i386/avx-vzeroupper-12.c: Adjust testcase. + * gcc.target/i386/avx-vzeroupper-7.c: Ditto. + * gcc.target/i386/avx-vzeroupper-9.c: Ditto. +--- + gcc/config/i386/i386-features.cc | 3 +-- + gcc/config/i386/i386-options.cc | 4 +++- + gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c | 3 ++- + gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c | 14 ++++++++++++++ + gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c | 3 ++- + gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c | 3 ++- + 6 files changed, 24 insertions(+), 6 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c + +diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc +index 6fe41c3c24f..6a2444eb6b6 100644 +--- a/gcc/config/i386/i386-features.cc ++++ b/gcc/config/i386/i386-features.cc +@@ -1875,8 +1875,7 @@ public: + /* opt_pass methods: */ + virtual bool gate (function *) + { +- return TARGET_AVX && TARGET_VZEROUPPER +- && flag_expensive_optimizations && !optimize_size; ++ return TARGET_AVX && TARGET_VZEROUPPER; + } + + virtual unsigned int execute (function *) +diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc +index ff44ad4e03c..74e969b6896 100644 +--- a/gcc/config/i386/i386-options.cc ++++ b/gcc/config/i386/i386-options.cc +@@ -2702,7 +2702,9 @@ ix86_option_override_internal (bool main_args_p, + sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH"); + + if (!(opts_set->x_target_flags & MASK_VZEROUPPER) +- && TARGET_EMIT_VZEROUPPER) ++ && TARGET_EMIT_VZEROUPPER ++ && flag_expensive_optimizations ++ && !optimize_size) + opts->x_target_flags |= MASK_VZEROUPPER; + if (!(opts_set->x_target_flags & MASK_STV)) + opts->x_target_flags |= MASK_STV; +diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c +index e694d4048bd..5a40e87832c 100644 +--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c ++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c +@@ -16,5 +16,6 @@ foo () + _mm256_zeroupper (); + } + +-/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 { target ia32 } } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 5 { target { ! ia32 } } } } */ + /* { dg-final { scan-assembler-times "\\*avx_vzeroall" 1 } } */ +diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c +new file mode 100644 +index 00000000000..4af637757f7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O0 -mavx -mtune=generic -mvzeroupper -dp" } */ ++ ++#include ++ ++extern __m256 x, y; ++ ++void ++foo () ++{ ++ x = y; ++} ++ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */ +diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c +index ab6d68779b3..75fe5889783 100644 +--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c ++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c +@@ -12,4 +12,5 @@ foo () + _mm256_zeroupper (); + } + +-/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 { target ia32 } } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 2 { target { ! ia32 } } } } */ +diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c +index 974e1626a6d..fa0a6dfcaac 100644 +--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c ++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c +@@ -15,4 +15,5 @@ foo () + _mm256_zeroupper (); + } + +-/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 { target ia32 } } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 5 { target { ! ia32 } } } } */ +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index 1163542..4a1279a 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 18 +%global gcc_release 19 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -165,6 +165,7 @@ Patch27: 0027-LoopElim-Redundant-loop-elimination-optimization.patch Patch28: 0028-Array-widen-compare-Fix-the-return-value-match-after.patch Patch29: 0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch Patch30: 0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch +Patch31: 0031-Make-option-mvzeroupper-independent-of-optimization.patch # On ARM EABI systems, we do want -gnueabi to be part of the # target triple. @@ -661,6 +662,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch28 -p1 %patch29 -p1 %patch30 -p1 +%patch31 -p1 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2892,6 +2894,10 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Jan 17 2024 liuhongt 12.3.1-19 +- Type: Sync +- DESC: Make option mvzeroupper independent of optimization level. + * Wed Jan 17 2024 liuhongt 12.3.1-18 - Type: Sync - DESC: Explicitly view_convert_expr mask to signed type when folding -- Gitee From 676ad4656ff3b132e1956905dfb84e936d64677d Mon Sep 17 00:00:00 2001 From: Hongyu Wang Date: Wed, 17 Jan 2024 15:08:50 +0800 Subject: [PATCH 04/15] [Sync] i386: Sync tune_string with arch_string for target attribute arch=* For function with target attribute arch=*, current logic will set its tune to -mtune from command line so all target_clones will get same tuning flags which would affect the performance for each clone. Override tune with arch if tune was not explicitly specified to get proper tuning flags for target_clones. gcc/ChangeLog: * config/i386/i386-options.cc (ix86_valid_target_attribute_tree): Override tune_string with arch_string if tune_string is not explicitly specified. gcc/testsuite/ChangeLog: * gcc.target/i386/mvc17.c: New test. --- ...tring-with-arch_string-for-target-at.patch | 66 +++++++++++++++++++ gcc.spec | 8 ++- 2 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch diff --git a/0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch b/0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch new file mode 100644 index 0000000..793ac8c --- /dev/null +++ b/0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch @@ -0,0 +1,66 @@ +From 60c432aac9e2d66b0c0ad09513f7c3b98f53ec35 Mon Sep 17 00:00:00 2001 +From: Hongyu Wang +Date: Sun, 25 Jun 2023 09:50:21 +0800 +Subject: [Sync] i386: Sync tune_string with arch_string for target attribute +arch=* + +For function with target attribute arch=*, current logic will set its +tune to -mtune from command line so all target_clones will get same +tuning flags which would affect the performance for each clone. Override +tune with arch if tune was not explicitly specified to get proper tuning +flags for target_clones. + +gcc/ChangeLog: + + * config/i386/i386-options.cc (ix86_valid_target_attribute_tree): + Override tune_string with arch_string if tune_string is not + explicitly specified. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/mvc17.c: New test. + +(cherry picked from commit 2916278d14e9ac28c361c396a67256acbebda6e8) +--- + gcc/config/i386/i386-options.cc | 6 +++++- + gcc/testsuite/gcc.target/i386/mvc17.c | 11 +++++++++++ + 2 files changed, 16 insertions(+), 1 deletion(-) + create mode 100644 gcc/testsuite/gcc.target/i386/mvc17.c + +diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc +index 74e969b6896..fb2ed942f67 100644 +--- a/gcc/config/i386/i386-options.cc ++++ b/gcc/config/i386/i386-options.cc +@@ -1378,7 +1378,11 @@ ix86_valid_target_attribute_tree (tree fndecl, tree args, + if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]) + opts->x_ix86_tune_string + = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]); +- else if (orig_tune_defaulted) ++ /* If we have explicit arch string and no tune string specified, set ++ tune_string to NULL and later it will be overriden by arch_string ++ so target clones can get proper optimization. */ ++ else if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH] ++ || orig_tune_defaulted) + opts->x_ix86_tune_string = NULL; + + /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */ +diff --git a/gcc/testsuite/gcc.target/i386/mvc17.c b/gcc/testsuite/gcc.target/i386/mvc17.c +new file mode 100644 +index 00000000000..8b83c1aecb3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/mvc17.c +@@ -0,0 +1,11 @@ ++/* { dg-do compile } */ ++/* { dg-require-ifunc "" } */ ++/* { dg-options "-O2 -march=x86-64" } */ ++/* { dg-final { scan-assembler-times "rep mov" 1 } } */ ++ ++__attribute__((target_clones("default","arch=icelake-server"))) ++void ++foo (char *a, char *b, int size) ++{ ++ __builtin_memcpy (a, b, size & 0x7F); ++} +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index 4a1279a..c5df18e 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 19 +%global gcc_release 20 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -166,6 +166,7 @@ Patch28: 0028-Array-widen-compare-Fix-the-return-value-match-after.patch Patch29: 0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch Patch30: 0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch Patch31: 0031-Make-option-mvzeroupper-independent-of-optimization.patch +Patch32: 0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch # On ARM EABI systems, we do want -gnueabi to be part of the # target triple. @@ -663,6 +664,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch29 -p1 %patch30 -p1 %patch31 -p1 +%patch32 -p1 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2894,6 +2896,10 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Jan 17 2024 Hongyu Wang 12.3.1-20 +- Type: Sync +- DESC: i386: Sync tune_string with arch_string for target attribute arch=* + * Wed Jan 17 2024 liuhongt 12.3.1-19 - Type: Sync - DESC: Make option mvzeroupper independent of optimization level. -- Gitee From 0cd760a99de83978de370fca3341e959b8823868 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 17 Jan 2024 15:17:03 +0800 Subject: [PATCH 05/15] [Sync] Refine maskloadmn pattern with UNSPEC_MASKLOAD. If mem_addr points to a memory region with less than whole vector size bytes of accessible memory and k is a mask that would prevent reading the inaccessible bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd. gcc/ChangeLog: PR target/110309 * config/i386/sse.md (maskload): Refine pattern with UNSPEC_MASKLOAD. (maskload): Ditto. (*_load_mask): Extend mode iterator to VI12HF_AVX512VL. (*_load): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/pr110309.c: New test. --- ...kloadmn-pattern-with-UNSPEC_MASKLOAD.patch | 111 ++++++++++++++++++ gcc.spec | 8 +- 2 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch diff --git a/0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch b/0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch new file mode 100644 index 0000000..286f26e --- /dev/null +++ b/0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch @@ -0,0 +1,111 @@ +From d2db85f2ce63c84137fa8f01f6eddce810db8901 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Tue, 20 Jun 2023 15:41:00 +0800 +Subject: [Sync] Refine maskloadmn pattern with UNSPEC_MASKLOAD. + +If mem_addr points to a memory region with less than whole vector size +bytes of accessible memory and k is a mask that would prevent reading +the inaccessible bytes from mem_addr, add UNSPEC_MASKLOAD to prevent +it to be transformed to vpblendd. + +gcc/ChangeLog: + + PR target/110309 + * config/i386/sse.md (maskload): + Refine pattern with UNSPEC_MASKLOAD. + (maskload): Ditto. + (*_load_mask): Extend mode iterator to + VI12HF_AVX512VL. + (*_load): Ditto. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr110309.c: New test. +--- + gcc/config/i386/sse.md | 32 +++++++++++++----------- + gcc/testsuite/gcc.target/i386/pr110309.c | 10 ++++++++ + 2 files changed, 28 insertions(+), 14 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr110309.c + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index eb767e56ca4..b30e96cb1ab 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -1411,12 +1411,12 @@ + }) + + (define_insn "*_load_mask" +- [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v") +- (vec_merge:VI12_AVX512VL +- (unspec:VI12_AVX512VL +- [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")] ++ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v") ++ (vec_merge:VI12HF_AVX512VL ++ (unspec:VI12HF_AVX512VL ++ [(match_operand:VI12HF_AVX512VL 1 "memory_operand" "m")] + UNSPEC_MASKLOAD) +- (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C") ++ (match_operand:VI12HF_AVX512VL 2 "nonimm_or_0_operand" "0C") + (match_operand: 3 "register_operand" "Yk")))] + "TARGET_AVX512BW" + "vmovdqu\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" +@@ -1425,9 +1425,9 @@ + (set_attr "mode" "")]) + + (define_insn_and_split "*_load" +- [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v") +- (unspec:VI12_AVX512VL +- [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")] ++ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v") ++ (unspec:VI12HF_AVX512VL ++ [(match_operand:VI12HF_AVX512VL 1 "memory_operand" "m")] + UNSPEC_MASKLOAD))] + "TARGET_AVX512BW" + "#" +@@ -25973,17 +25973,21 @@ + "TARGET_AVX") + + (define_expand "maskload" +- [(set (match_operand:V48H_AVX512VL 0 "register_operand") +- (vec_merge:V48H_AVX512VL +- (match_operand:V48H_AVX512VL 1 "memory_operand") ++ [(set (match_operand:V48_AVX512VL 0 "register_operand") ++ (vec_merge:V48_AVX512VL ++ (unspec:V48_AVX512VL ++ [(match_operand:V48_AVX512VL 1 "memory_operand")] ++ UNSPEC_MASKLOAD) + (match_dup 0) + (match_operand: 2 "register_operand")))] + "TARGET_AVX512F") + + (define_expand "maskload" +- [(set (match_operand:VI12_AVX512VL 0 "register_operand") +- (vec_merge:VI12_AVX512VL +- (match_operand:VI12_AVX512VL 1 "memory_operand") ++ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand") ++ (vec_merge:VI12HF_AVX512VL ++ (unspec:VI12HF_AVX512VL ++ [(match_operand:VI12HF_AVX512VL 1 "memory_operand")] ++ UNSPEC_MASKLOAD) + (match_dup 0) + (match_operand: 2 "register_operand")))] + "TARGET_AVX512BW") +diff --git a/gcc/testsuite/gcc.target/i386/pr110309.c b/gcc/testsuite/gcc.target/i386/pr110309.c +new file mode 100644 +index 00000000000..f6e9e9c3c61 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr110309.c +@@ -0,0 +1,10 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3 --param vect-partial-vector-usage=1 -march=znver4 -mprefer-vector-width=256" } */ ++/* { dg-final { scan-assembler-not {(?n)vpblendd.*ymm} } } */ ++ ++ ++void foo (int * __restrict a, int *b) ++{ ++ for (int i = 0; i < 6; ++i) ++ a[i] = b[i] + 42; ++} +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index c5df18e..c62cb37 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 20 +%global gcc_release 21 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -167,6 +167,7 @@ Patch29: 0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch Patch30: 0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch Patch31: 0031-Make-option-mvzeroupper-independent-of-optimization.patch Patch32: 0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch +Patch33: 0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch # On ARM EABI systems, we do want -gnueabi to be part of the # target triple. @@ -665,6 +666,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch30 -p1 %patch31 -p1 %patch32 -p1 +%patch33 -p1 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2896,6 +2898,10 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Jan 17 2024 liuhongt 12.3.1-21 +- Type: Sync +- DESC: Refine maskloadmn pattern with UNSPEC_MASKLOAD. + * Wed Jan 17 2024 Hongyu Wang 12.3.1-20 - Type: Sync - DESC: i386: Sync tune_string with arch_string for target attribute arch=* -- Gitee From d0c367fe563778ba55b07e3eb0080b88c6a97290 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 17 Jan 2024 15:20:19 +0800 Subject: [PATCH 06/15] [Sync] Refine maskstore patterns with UNSPEC_MASKMOV. Similar like r14-2070-gc79476da46728e If mem_addr points to a memory region with less than whole vector size bytes of accessible memory and k is a mask that would prevent reading the inaccessible bytes from mem_addr, add UNSPEC_MASKMOV to prevent it to be transformed to any other whole memory access instructions. gcc/ChangeLog: PR rtl-optimization/110237 * config/i386/sse.md (_store_mask): Refine with UNSPEC_MASKMOV. (maskstore_store_mask): New define_insn, it's renamed from original _store_mask. --- ...skstore-patterns-with-UNSPEC_MASKMOV.patch | 126 ++++++++++++++++++ gcc.spec | 8 +- 2 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch diff --git a/0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch b/0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch new file mode 100644 index 0000000..81c4819 --- /dev/null +++ b/0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch @@ -0,0 +1,126 @@ +From 389fe721ab32b154fb07e4865fa32be0608bd5d2 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Mon, 26 Jun 2023 21:07:09 +0800 +Subject: [Sync] Refine maskstore patterns with UNSPEC_MASKMOV. + +Similar like r14-2070-gc79476da46728e + +If mem_addr points to a memory region with less than whole vector size +bytes of accessible memory and k is a mask that would prevent reading +the inaccessible bytes from mem_addr, add UNSPEC_MASKMOV to prevent +it to be transformed to any other whole memory access instructions. + +gcc/ChangeLog: + + PR rtl-optimization/110237 + * config/i386/sse.md (_store_mask): Refine with + UNSPEC_MASKMOV. + (maskstore_store_mask): New define_insn, it's renamed + from original _store_mask. +--- + gcc/config/i386/sse.md | 69 ++++++++++++++++++++++++++++++++++-------- + 1 file changed, 57 insertions(+), 12 deletions(-) + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index b30e96cb1ab..3af15989631 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -1554,7 +1554,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + +-(define_insn "_store_mask" ++(define_insn "*_store_mask" + [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m") + (vec_merge:V48_AVX512VL + (match_operand:V48_AVX512VL 1 "register_operand" "v") +@@ -1582,7 +1582,7 @@ + (set_attr "memory" "store") + (set_attr "mode" "")]) + +-(define_insn "_store_mask" ++(define_insn "*_store_mask" + [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m") + (vec_merge:VI12HF_AVX512VL + (match_operand:VI12HF_AVX512VL 1 "register_operand" "v") +@@ -26002,21 +26002,66 @@ + "TARGET_AVX") + + (define_expand "maskstore" +- [(set (match_operand:V48H_AVX512VL 0 "memory_operand") +- (vec_merge:V48H_AVX512VL +- (match_operand:V48H_AVX512VL 1 "register_operand") +- (match_dup 0) +- (match_operand: 2 "register_operand")))] ++ [(set (match_operand:V48_AVX512VL 0 "memory_operand") ++ (unspec:V48_AVX512VL ++ [(match_operand:V48_AVX512VL 1 "register_operand") ++ (match_dup 0) ++ (match_operand: 2 "register_operand")] ++ UNSPEC_MASKMOV))] + "TARGET_AVX512F") + + (define_expand "maskstore" +- [(set (match_operand:VI12_AVX512VL 0 "memory_operand") +- (vec_merge:VI12_AVX512VL +- (match_operand:VI12_AVX512VL 1 "register_operand") +- (match_dup 0) +- (match_operand: 2 "register_operand")))] ++ [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand") ++ (unspec:VI12HF_AVX512VL ++ [(match_operand:VI12HF_AVX512VL 1 "register_operand") ++ (match_dup 0) ++ (match_operand: 2 "register_operand")] ++ UNSPEC_MASKMOV))] + "TARGET_AVX512BW") + ++(define_insn "_store_mask" ++ [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m") ++ (unspec:V48_AVX512VL ++ [(match_operand:V48_AVX512VL 1 "register_operand" "v") ++ (match_dup 0) ++ (match_operand: 2 "register_operand" "Yk")] ++ UNSPEC_MASKMOV))] ++ "TARGET_AVX512F" ++{ ++ if (FLOAT_MODE_P (GET_MODE_INNER (mode))) ++ { ++ if (misaligned_operand (operands[0], mode)) ++ return "vmovu\t{%1, %0%{%2%}|%0%{%2%}, %1}"; ++ else ++ return "vmova\t{%1, %0%{%2%}|%0%{%2%}, %1}"; ++ } ++ else ++ { ++ if (misaligned_operand (operands[0], mode)) ++ return "vmovdqu\t{%1, %0%{%2%}|%0%{%2%}, %1}"; ++ else ++ return "vmovdqa\t{%1, %0%{%2%}|%0%{%2%}, %1}"; ++ } ++} ++ [(set_attr "type" "ssemov") ++ (set_attr "prefix" "evex") ++ (set_attr "memory" "store") ++ (set_attr "mode" "")]) ++ ++(define_insn "_store_mask" ++ [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m") ++ (unspec:VI12HF_AVX512VL ++ [(match_operand:VI12HF_AVX512VL 1 "register_operand" "v") ++ (match_dup 0) ++ (match_operand: 2 "register_operand" "Yk")] ++ UNSPEC_MASKMOV))] ++ "TARGET_AVX512BW" ++ "vmovdqu\t{%1, %0%{%2%}|%0%{%2%}, %1}" ++ [(set_attr "type" "ssemov") ++ (set_attr "prefix" "evex") ++ (set_attr "memory" "store") ++ (set_attr "mode" "")]) ++ + (define_expand "cbranch4" + [(set (reg:CC FLAGS_REG) + (compare:CC (match_operand:VI48_AVX 1 "register_operand") +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index c62cb37..157ef1c 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 21 +%global gcc_release 22 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -168,6 +168,7 @@ Patch30: 0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch Patch31: 0031-Make-option-mvzeroupper-independent-of-optimization.patch Patch32: 0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch Patch33: 0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch +Patch34: 0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch # On ARM EABI systems, we do want -gnueabi to be part of the # target triple. @@ -667,6 +668,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch31 -p1 %patch32 -p1 %patch33 -p1 +%patch34 -p1 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2898,6 +2900,10 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Jan 17 2024 liuhongt 12.3.1-22 +- Type: Sync +- DESC: Refine maskstore patterns with UNSPEC_MASKMOV. + * Wed Jan 17 2024 liuhongt 12.3.1-21 - Type: Sync - DESC: Refine maskloadmn pattern with UNSPEC_MASKLOAD. -- Gitee From 379e84de84cdac4ee4ad5a0f203e4c8c06cf226c Mon Sep 17 00:00:00 2001 From: "Cui, Lili" Date: Wed, 17 Jan 2024 15:24:09 +0800 Subject: [PATCH 07/15] [Sync] x86: Update model values for Alderlake and Rocketlake. Update model values for Alderlake and Rocketlake according to SDM. gcc/ChangeLog * common/config/i386/cpuinfo.h (get_intel_cpu): Remove model value 0xa8 from Rocketlake, remove model value 0xbf from Alderlake. --- ...-values-for-Alderlake-and-Rocketlake.patch | 38 +++++++++++++++++++ gcc.spec | 8 +++- 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch diff --git a/0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch b/0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch new file mode 100644 index 0000000..938135a --- /dev/null +++ b/0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch @@ -0,0 +1,38 @@ +From 62cef9aea12f343cf86697f56884f585191b9545 Mon Sep 17 00:00:00 2001 +From: "Cui, Lili" +Date: Thu, 29 Jun 2023 03:10:35 +0000 +Subject: [Sync] x86: Update model values for Alderlake and Rocketlake. + +Update model values for Alderlake and Rocketlake according to SDM. + +gcc/ChangeLog + + * common/config/i386/cpuinfo.h (get_intel_cpu): Remove model value 0xa8 + from Rocketlake, remove model value 0xbf from Alderlake. +--- + gcc/common/config/i386/cpuinfo.h | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h +index 0333da56ba5..28b2ff0b033 100644 +--- a/gcc/common/config/i386/cpuinfo.h ++++ b/gcc/common/config/i386/cpuinfo.h +@@ -435,7 +435,6 @@ get_intel_cpu (struct __processor_model *cpu_model, + cpu_model->__cpu_subtype = INTEL_COREI7_SKYLAKE; + break; + case 0xa7: +- case 0xa8: + /* Rocket Lake. */ + cpu = "rocketlake"; + CHECK___builtin_cpu_is ("corei7"); +@@ -508,7 +507,6 @@ get_intel_cpu (struct __processor_model *cpu_model, + break; + case 0x97: + case 0x9a: +- case 0xbf: + /* Alder Lake. */ + cpu = "alderlake"; + CHECK___builtin_cpu_is ("corei7"); +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index 157ef1c..ade70b6 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 22 +%global gcc_release 23 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -169,6 +169,7 @@ Patch31: 0031-Make-option-mvzeroupper-independent-of-optimization.patch Patch32: 0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch Patch33: 0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch Patch34: 0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch +Patch35: 0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch # On ARM EABI systems, we do want -gnueabi to be part of the # target triple. @@ -669,6 +670,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch32 -p1 %patch33 -p1 %patch34 -p1 +%patch35 -p1 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2900,6 +2902,10 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Jan 17 2024 Cui, Lili 12.3.1-23 +- Type: Sync +- DESC: x86: Update model values for Alderlake and Rocketlake. + * Wed Jan 17 2024 liuhongt 12.3.1-22 - Type: Sync - DESC: Refine maskstore patterns with UNSPEC_MASKMOV. -- Gitee From 550fdbff301ff12712abd4fc1d972f1676c8da31 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 17 Jan 2024 15:27:32 +0800 Subject: [PATCH 08/15] [Sync] Workaround possible CPUID bug in Sandy Bridge. Don't access leaf 7 subleaf 1 unless subleaf 0 says it is supported via EAX. Intel documentation says invalid subleaves return 0. We had been relying on that behavior instead of checking the max sublef number. It appears that some Sandy Bridge CPUs return at least the subleaf 0 EDX value for subleaf 1. Best guess is that this is a bug in a microcode patch since all of the bits we're seeing set in EDX were introduced after Sandy Bridge was originally released. This is causing avxvnniint16 to be incorrectly enabled with -march=native on these CPUs. gcc/ChangeLog: * common/config/i386/cpuinfo.h (get_available_features): Check max_subleaf_level for valid subleaf before use CPUID. --- ...d-possible-CPUID-bug-in-Sandy-Bridge.patch | 78 +++++++++++++++++++ gcc.spec | 8 +- 2 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch diff --git a/0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch b/0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch new file mode 100644 index 0000000..f3c7900 --- /dev/null +++ b/0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch @@ -0,0 +1,78 @@ +From 161c0972243cf967196f4e9361f0736000637bfe Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Fri, 4 Aug 2023 09:27:39 +0800 +Subject: [Sync] Workaround possible CPUID bug in Sandy Bridge. + +Don't access leaf 7 subleaf 1 unless subleaf 0 says it is +supported via EAX. + +Intel documentation says invalid subleaves return 0. We had been +relying on that behavior instead of checking the max sublef number. + +It appears that some Sandy Bridge CPUs return at least the subleaf 0 +EDX value for subleaf 1. Best guess is that this is a bug in a +microcode patch since all of the bits we're seeing set in EDX were +introduced after Sandy Bridge was originally released. + +This is causing avxvnniint16 to be incorrectly enabled with +-march=native on these CPUs. + +gcc/ChangeLog: + + * common/config/i386/cpuinfo.h (get_available_features): Check + max_subleaf_level for valid subleaf before use CPUID. +--- + gcc/common/config/i386/cpuinfo.h | 29 +++++++++++++++++------------ + 1 file changed, 17 insertions(+), 12 deletions(-) + +diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h +index 28b2ff0b033..316ad3cb3e9 100644 +--- a/gcc/common/config/i386/cpuinfo.h ++++ b/gcc/common/config/i386/cpuinfo.h +@@ -647,7 +647,9 @@ get_available_features (struct __processor_model *cpu_model, + /* Get Advanced Features at level 7 (eax = 7, ecx = 0/1). */ + if (max_cpuid_level >= 7) + { +- __cpuid_count (7, 0, eax, ebx, ecx, edx); ++ unsigned int max_subleaf_level; ++ ++ __cpuid_count (7, 0, max_subleaf_level, ebx, ecx, edx); + if (ebx & bit_BMI) + set_feature (FEATURE_BMI); + if (ebx & bit_SGX) +@@ -759,18 +761,21 @@ get_available_features (struct __processor_model *cpu_model, + set_feature (FEATURE_AVX512FP16); + } + +- __cpuid_count (7, 1, eax, ebx, ecx, edx); +- if (eax & bit_HRESET) +- set_feature (FEATURE_HRESET); +- if (avx_usable) +- { +- if (eax & bit_AVXVNNI) +- set_feature (FEATURE_AVXVNNI); +- } +- if (avx512_usable) ++ if (max_subleaf_level >= 1) + { +- if (eax & bit_AVX512BF16) +- set_feature (FEATURE_AVX512BF16); ++ __cpuid_count (7, 1, eax, ebx, ecx, edx); ++ if (eax & bit_HRESET) ++ set_feature (FEATURE_HRESET); ++ if (avx_usable) ++ { ++ if (eax & bit_AVXVNNI) ++ set_feature (FEATURE_AVXVNNI); ++ } ++ if (avx512_usable) ++ { ++ if (eax & bit_AVX512BF16) ++ set_feature (FEATURE_AVX512BF16); ++ } + } + } + +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index ade70b6..2915ca4 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 23 +%global gcc_release 24 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -170,6 +170,7 @@ Patch32: 0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch Patch33: 0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch Patch34: 0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch Patch35: 0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch +Patch36: 0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch # On ARM EABI systems, we do want -gnueabi to be part of the # target triple. @@ -671,6 +672,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch33 -p1 %patch34 -p1 %patch35 -p1 +%patch36 -p1 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2902,6 +2904,10 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Jan 17 2024 liuhongt 12.3.1-24 +- Type: Sync +- DESC: Workaround possible CPUID bug in Sandy Bridge. + * Wed Jan 17 2024 Cui, Lili 12.3.1-23 - Type: Sync - DESC: x86: Update model values for Alderlake and Rocketlake. -- Gitee From d76e96b448f6ab57a66ae21895a5246ca9f7118d Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 17 Jan 2024 15:30:02 +0800 Subject: [PATCH 09/15] [Sync] Software mitigation: Disable gather generation in vectorization for GDS affected Intel Processors. For more details of GDS (Gather Data Sampling), refer to https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/gather-data-sampling.html After microcode update, there's performance regression. To avoid that, the patch disables gather generation in autovectorization but uses gather scalar emulation instead. gcc/ChangeLog: * config/i386/i386-options.cc (m_GDS): New macro. * config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Don't enable for m_GDS. (X86_TUNE_USE_GATHER_4PARTS): Ditto. (X86_TUNE_USE_GATHER): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/avx2-gather-2.c: Adjust options to keep gather vectorization. * gcc.target/i386/avx2-gather-6.c: Ditto. * gcc.target/i386/avx512f-pr88464-1.c: Ditto. * gcc.target/i386/avx512f-pr88464-5.c: Ditto. * gcc.target/i386/avx512vl-pr88464-1.c: Ditto. * gcc.target/i386/avx512vl-pr88464-11.c: Ditto. * gcc.target/i386/avx512vl-pr88464-3.c: Ditto. * gcc.target/i386/avx512vl-pr88464-9.c: Ditto. * gcc.target/i386/pr88531-1b.c: Ditto. * gcc.target/i386/pr88531-1c.c: Ditto. --- ...ion-Disable-gather-generation-in-vec.patch | 220 ++++++++++++++++++ gcc.spec | 9 +- 2 files changed, 228 insertions(+), 1 deletion(-) create mode 100644 0037-Software-mitigation-Disable-gather-generation-in-vec.patch diff --git a/0037-Software-mitigation-Disable-gather-generation-in-vec.patch b/0037-Software-mitigation-Disable-gather-generation-in-vec.patch new file mode 100644 index 0000000..224eeaf --- /dev/null +++ b/0037-Software-mitigation-Disable-gather-generation-in-vec.patch @@ -0,0 +1,220 @@ +From 49e9bb4de0ad94e8636d796b16aa09917145b0d3 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Thu, 10 Aug 2023 11:41:39 +0800 +Subject: [Sync] Software mitigation: Disable gather generation in + vectorization for GDS affected Intel Processors. + +For more details of GDS (Gather Data Sampling), refer to +https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/gather-data-sampling.html + +After microcode update, there's performance regression. To avoid that, +the patch disables gather generation in autovectorization but uses +gather scalar emulation instead. + +gcc/ChangeLog: + + * config/i386/i386-options.cc (m_GDS): New macro. + * config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Don't + enable for m_GDS. + (X86_TUNE_USE_GATHER_4PARTS): Ditto. + (X86_TUNE_USE_GATHER): Ditto. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/avx2-gather-2.c: Adjust options to keep + gather vectorization. + * gcc.target/i386/avx2-gather-6.c: Ditto. + * gcc.target/i386/avx512f-pr88464-1.c: Ditto. + * gcc.target/i386/avx512f-pr88464-5.c: Ditto. + * gcc.target/i386/avx512vl-pr88464-1.c: Ditto. + * gcc.target/i386/avx512vl-pr88464-11.c: Ditto. + * gcc.target/i386/avx512vl-pr88464-3.c: Ditto. + * gcc.target/i386/avx512vl-pr88464-9.c: Ditto. + * gcc.target/i386/pr88531-1b.c: Ditto. + * gcc.target/i386/pr88531-1c.c: Ditto. + +(cherry picked from commit 3064d1f5c48cb6ce1b4133570dd08ecca8abb52d) +--- + gcc/config/i386/i386-options.cc | 5 +++++ + gcc/config/i386/x86-tune.def | 9 ++++++--- + gcc/testsuite/gcc.target/i386/avx2-gather-2.c | 2 +- + gcc/testsuite/gcc.target/i386/avx2-gather-6.c | 2 +- + gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c | 2 +- + gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c | 2 +- + gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c | 2 +- + gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c | 2 +- + gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c | 2 +- + gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c | 2 +- + gcc/testsuite/gcc.target/i386/pr88531-1b.c | 2 +- + gcc/testsuite/gcc.target/i386/pr88531-1c.c | 2 +- + 12 files changed, 21 insertions(+), 13 deletions(-) + +diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc +index fb2ed942f67..9617fc162e0 100644 +--- a/gcc/config/i386/i386-options.cc ++++ b/gcc/config/i386/i386-options.cc +@@ -137,6 +137,11 @@ along with GCC; see the file COPYING3. If not see + #define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U< gcc/DEV-PHASE @@ -2904,6 +2906,11 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Jan 17 2024 liuhongt 12.3.1-25 +- Type: Sync +- DESC: Software mitigation: Disable gather generation in vectorization for GDS + affected Intel Processors. + * Wed Jan 17 2024 liuhongt 12.3.1-24 - Type: Sync - DESC: Workaround possible CPUID bug in Sandy Bridge. -- Gitee From 302d17b522bb7f4b37af1abb35b44772a3058274 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 17 Jan 2024 15:36:45 +0800 Subject: [PATCH 10/15] [Sync] Support -m[no-]gather -m[no-]scatter to enable/disable vectorization for all gather/scatter instructions Rename original use_gather to use_gather_8parts, Support -mtune-ctrl={,^}use_gather to set/clear tune features use_gather_{2parts, 4parts, 8parts}. Support the new option -mgather as alias of -mtune-ctrl=, use_gather, ^use_gather. Similar for use_scatter. gcc/ChangeLog: * config/i386/i386-builtins.cc (ix86_vectorize_builtin_gather): Adjust for use_gather_8parts. * config/i386/i386-options.cc (parse_mtune_ctrl_str): Set/Clear tune features use_{gather,scatter}_{2parts, 4parts, 8parts} for -mtune-crtl={,^}{use_gather,use_scatter}. * config/i386/i386.cc (ix86_vectorize_builtin_scatter): Adjust for use_scatter_8parts * config/i386/i386.h (TARGET_USE_GATHER): Rename to .. (TARGET_USE_GATHER_8PARTS): .. this. (TARGET_USE_SCATTER): Rename to .. (TARGET_USE_SCATTER_8PARTS): .. this. * config/i386/x86-tune.def (X86_TUNE_USE_GATHER): Rename to (X86_TUNE_USE_GATHER_8PARTS): .. this. (X86_TUNE_USE_SCATTER): Rename to (X86_TUNE_USE_SCATTER_8PARTS): .. this. * config/i386/i386.opt: Add new options mgather, mscatter. --- ...her-m-no-scatter-to-enable-disable-v.patch | 187 ++++++++++++++++++ gcc.spec | 9 +- 2 files changed, 195 insertions(+), 1 deletion(-) create mode 100644 0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch diff --git a/0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch b/0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch new file mode 100644 index 0000000..ddf2c4d --- /dev/null +++ b/0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch @@ -0,0 +1,187 @@ +From 1981bc7e6c4149d590190162939f911cb53547fd Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Thu, 10 Aug 2023 16:26:13 +0800 +Subject: [Sync] Support -m[no-]gather -m[no-]scatter to enable/disable + vectorization for all gather/scatter instructions + +Rename original use_gather to use_gather_8parts, Support +-mtune-ctrl={,^}use_gather to set/clear tune features +use_gather_{2parts, 4parts, 8parts}. Support the new option -mgather +as alias of -mtune-ctrl=, use_gather, ^use_gather. + +Similar for use_scatter. + +gcc/ChangeLog: + + * config/i386/i386-builtins.cc + (ix86_vectorize_builtin_gather): Adjust for use_gather_8parts. + * config/i386/i386-options.cc (parse_mtune_ctrl_str): + Set/Clear tune features use_{gather,scatter}_{2parts, 4parts, + 8parts} for -mtune-crtl={,^}{use_gather,use_scatter}. + * config/i386/i386.cc (ix86_vectorize_builtin_scatter): Adjust + for use_scatter_8parts + * config/i386/i386.h (TARGET_USE_GATHER): Rename to .. + (TARGET_USE_GATHER_8PARTS): .. this. + (TARGET_USE_SCATTER): Rename to .. + (TARGET_USE_SCATTER_8PARTS): .. this. + * config/i386/x86-tune.def (X86_TUNE_USE_GATHER): Rename to + (X86_TUNE_USE_GATHER_8PARTS): .. this. + (X86_TUNE_USE_SCATTER): Rename to + (X86_TUNE_USE_SCATTER_8PARTS): .. this. + * config/i386/i386.opt: Add new options mgather, mscatter. + +(cherry picked from commit b2a927fb5343db363ea4361da0d6bcee227b6737) +--- + gcc/config/i386/i386-builtins.cc | 2 +- + gcc/config/i386/i386-options.cc | 54 +++++++++++++++++++++++--------- + gcc/config/i386/i386.cc | 2 +- + gcc/config/i386/i386.h | 8 ++--- + gcc/config/i386/i386.opt | 4 +++ + gcc/config/i386/x86-tune.def | 4 +-- + 6 files changed, 52 insertions(+), 22 deletions(-) + +diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc +index 050c6228a18..8ed32e14f0a 100644 +--- a/gcc/config/i386/i386-builtins.cc ++++ b/gcc/config/i386/i386-builtins.cc +@@ -1790,7 +1790,7 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype, + ? !TARGET_USE_GATHER_2PARTS + : (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), 4u) + ? !TARGET_USE_GATHER_4PARTS +- : !TARGET_USE_GATHER))) ++ : !TARGET_USE_GATHER_8PARTS))) + return NULL_TREE; + + if ((TREE_CODE (index_type) != INTEGER_TYPE +diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc +index 9617fc162e0..3df1f0c41c3 100644 +--- a/gcc/config/i386/i386-options.cc ++++ b/gcc/config/i386/i386-options.cc +@@ -1705,20 +1705,46 @@ parse_mtune_ctrl_str (struct gcc_options *opts, bool dump) + curr_feature_string++; + clear = true; + } +- for (i = 0; i < X86_TUNE_LAST; i++) +- { +- if (!strcmp (curr_feature_string, ix86_tune_feature_names[i])) +- { +- ix86_tune_features[i] = !clear; +- if (dump) +- fprintf (stderr, "Explicitly %s feature %s\n", +- clear ? "clear" : "set", ix86_tune_feature_names[i]); +- break; +- } +- } +- if (i == X86_TUNE_LAST) +- error ("unknown parameter to option %<-mtune-ctrl%>: %s", +- clear ? curr_feature_string - 1 : curr_feature_string); ++ ++ if (!strcmp (curr_feature_string, "use_gather")) ++ { ++ ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS] = !clear; ++ ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS] = !clear; ++ ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS] = !clear; ++ if (dump) ++ fprintf (stderr, "Explicitly %s features use_gather_2parts," ++ " use_gather_4parts, use_gather_8parts\n", ++ clear ? "clear" : "set"); ++ ++ } ++ else if (!strcmp (curr_feature_string, "use_scatter")) ++ { ++ ix86_tune_features[X86_TUNE_USE_SCATTER_2PARTS] = !clear; ++ ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS] = !clear; ++ ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS] = !clear; ++ if (dump) ++ fprintf (stderr, "Explicitly %s features use_scatter_2parts," ++ " use_scatter_4parts, use_scatter_8parts\n", ++ clear ? "clear" : "set"); ++ } ++ else ++ { ++ for (i = 0; i < X86_TUNE_LAST; i++) ++ { ++ if (!strcmp (curr_feature_string, ix86_tune_feature_names[i])) ++ { ++ ix86_tune_features[i] = !clear; ++ if (dump) ++ fprintf (stderr, "Explicitly %s feature %s\n", ++ clear ? "clear" : "set", ix86_tune_feature_names[i]); ++ break; ++ } ++ } ++ ++ if (i == X86_TUNE_LAST) ++ error ("unknown parameter to option %<-mtune-ctrl%>: %s", ++ clear ? curr_feature_string - 1 : curr_feature_string); ++ } + curr_feature_string = next_feature_string; + } + while (curr_feature_string); +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc +index 479fc601049..e75d3702338 100644 +--- a/gcc/config/i386/i386.cc ++++ b/gcc/config/i386/i386.cc +@@ -18937,7 +18937,7 @@ ix86_vectorize_builtin_scatter (const_tree vectype, + ? !TARGET_USE_SCATTER_2PARTS + : (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u) + ? !TARGET_USE_SCATTER_4PARTS +- : !TARGET_USE_SCATTER)) ++ : !TARGET_USE_SCATTER_8PARTS)) + return NULL_TREE; + + if ((TREE_CODE (index_type) != INTEGER_TYPE +diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h +index 688aaabd3f8..aaa136ba0bf 100644 +--- a/gcc/config/i386/i386.h ++++ b/gcc/config/i386/i386.h +@@ -403,10 +403,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; + ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS] + #define TARGET_USE_SCATTER_4PARTS \ + ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS] +-#define TARGET_USE_GATHER \ +- ix86_tune_features[X86_TUNE_USE_GATHER] +-#define TARGET_USE_SCATTER \ +- ix86_tune_features[X86_TUNE_USE_SCATTER] ++#define TARGET_USE_GATHER_8PARTS \ ++ ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS] ++#define TARGET_USE_SCATTER_8PARTS \ ++ ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS] + #define TARGET_FUSE_CMP_AND_BRANCH_32 \ + ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32] + #define TARGET_FUSE_CMP_AND_BRANCH_64 \ +diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt +index 498fb454d01..b154110d813 100644 +--- a/gcc/config/i386/i386.opt ++++ b/gcc/config/i386/i386.opt +@@ -1222,3 +1222,7 @@ Instructions number above which STFL stall penalty can be compensated. + munroll-only-small-loops + Target Var(ix86_unroll_only_small_loops) Init(0) Save + Enable conservative small loop unrolling. ++ ++mscatter ++Target Alias(mtune-ctrl=, use_scatter, ^use_scatter) ++Enable vectorization for scatter instruction. +diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def +index 4392709fce2..bdb455d20ba 100644 +--- a/gcc/config/i386/x86-tune.def ++++ b/gcc/config/i386/x86-tune.def +@@ -488,13 +488,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts", + + /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more + elements. */ +-DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather", ++DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts", + ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE + | m_GENERIC | m_GDS)) + + /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more + elements. */ +-DEF_TUNE (X86_TUNE_USE_SCATTER, "use_scatter", ++DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts", + ~(m_ZNVER4)) + + /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index 6c852bb..3388b91 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 25 +%global gcc_release 26 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -172,6 +172,7 @@ Patch34: 0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch Patch35: 0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch Patch36: 0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch Patch37: 0037-Software-mitigation-Disable-gather-generation-in-vec.patch +Patch38: 0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch # On ARM EABI systems, we do want -gnueabi to be part of the # target triple. @@ -675,6 +676,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch35 -p1 %patch36 -p1 %patch37 -p1 +%patch38 -p1 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2906,6 +2908,11 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Jan 17 2024 liuhongt 12.3.1-26 +- Type: Sync +- DESC: Support -m[no-]gather -m[no-]scatter to enable/disable vectorization + for all gather/scatter instructions + * Wed Jan 17 2024 liuhongt 12.3.1-25 - Type: Sync - DESC: Software mitigation: Disable gather generation in vectorization for GDS -- Gitee From 30529fc0f4adf20160ded2cc81d557805b41d52f Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 17 Jan 2024 15:39:05 +0800 Subject: [PATCH 11/15] [Sync] Remove constraint modifier % for fcmaddcph/fmaddcph/fcmulcph since there're not commutative. gcc/ChangeLog: PR target/111306 PR target/111335 * config/i386/sse.md (int_comm): New int_attr. (fma__): Remove % for Complex conjugate operations since they're not commutative. (fma___pair): Ditto. (___mask): Ditto. (cmul3): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/pr111306.c: New test. --- ...t-modifier-for-fcmaddcph-fmaddcph-fc.patch | 129 ++++++++++++++++++ gcc.spec | 9 +- 2 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch diff --git a/0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch b/0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch new file mode 100644 index 0000000..b50adc2 --- /dev/null +++ b/0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch @@ -0,0 +1,129 @@ +From eefab6a2d901314a2c94b597e1d7766ae34529d0 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Fri, 8 Sep 2023 09:22:43 +0800 +Subject: [Sync] Remove constraint modifier % for + fcmaddcph/fmaddcph/fcmulcph since there're not commutative. + +gcc/ChangeLog: + + PR target/111306 + PR target/111335 + * config/i386/sse.md (int_comm): New int_attr. + (fma__): + Remove % for Complex conjugate operations since they're not + commutative. + (fma___pair): Ditto. + (___mask): Ditto. + (cmul3): Ditto. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr111306.c: New test. + +(cherry picked from commit f197392a16ffb1327f1d12ff8ff05f9295e015cb) +--- + gcc/config/i386/sse.md | 16 ++++++++--- + gcc/testsuite/gcc.target/i386/pr111306.c | 36 ++++++++++++++++++++++++ + 2 files changed, 48 insertions(+), 4 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr111306.c + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index 3af15989631..f25dd5f2bc4 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -6318,6 +6318,14 @@ + [(UNSPEC_COMPLEX_FMA_PAIR "fmaddc") + (UNSPEC_COMPLEX_FCMA_PAIR "fcmaddc")]) + ++(define_int_attr int_comm ++ [(UNSPEC_COMPLEX_FMA "") ++ (UNSPEC_COMPLEX_FMA_PAIR "") ++ (UNSPEC_COMPLEX_FCMA "") ++ (UNSPEC_COMPLEX_FCMA_PAIR "") ++ (UNSPEC_COMPLEX_FMUL "%") ++ (UNSPEC_COMPLEX_FCMUL "")]) ++ + (define_int_attr conj_op + [(UNSPEC_COMPLEX_FMA "") + (UNSPEC_COMPLEX_FCMA "_conj") +@@ -6431,7 +6439,7 @@ + (define_insn "fma__" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v") + (unspec:VF_AVX512FP16VL +- [(match_operand:VF_AVX512FP16VL 1 "" "%v") ++ [(match_operand:VF_AVX512FP16VL 1 "" "v") + (match_operand:VF_AVX512FP16VL 2 "" "") + (match_operand:VF_AVX512FP16VL 3 "" "0")] + UNSPEC_COMPLEX_F_C_MA))] +@@ -6495,7 +6503,7 @@ + (define_insn "fma___pair" + [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=&v") + (unspec:VF1_AVX512VL +- [(match_operand:VF1_AVX512VL 1 "vector_operand" "%v") ++ [(match_operand:VF1_AVX512VL 1 "vector_operand" "v") + (match_operand:VF1_AVX512VL 2 "bcst_vector_operand" "vmBr") + (match_operand:VF1_AVX512VL 3 "vector_operand" "0")] + UNSPEC_COMPLEX_F_C_MA_PAIR))] +@@ -6562,7 +6570,7 @@ + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v") + (vec_merge:VF_AVX512FP16VL + (unspec:VF_AVX512FP16VL +- [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v") ++ [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "v") + (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "") + (match_operand:VF_AVX512FP16VL 3 "register_operand" "0")] + UNSPEC_COMPLEX_F_C_MA) +@@ -6586,7 +6594,7 @@ + (define_insn "__" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v") + (unspec:VF_AVX512FP16VL +- [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v") ++ [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "v") + (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "")] + UNSPEC_COMPLEX_F_C_MUL))] + "TARGET_AVX512FP16 && " +diff --git a/gcc/testsuite/gcc.target/i386/pr111306.c b/gcc/testsuite/gcc.target/i386/pr111306.c +new file mode 100644 +index 00000000000..541725ebdad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr111306.c +@@ -0,0 +1,36 @@ ++/* { dg-do run } */ ++/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */ ++/* { dg-require-effective-target avx512fp16 } */ ++ ++#define AVX512FP16 ++#include "avx512f-helper.h" ++ ++__attribute__((optimize("O2"),noipa)) ++void func1(_Float16 *a, _Float16 *b, int n, _Float16 *c) { ++ __m512h rA = _mm512_loadu_ph(a); ++ for (int i = 0; i < n; i += 32) { ++ __m512h rB = _mm512_loadu_ph(b + i); ++ _mm512_storeu_ph(c + i, _mm512_fcmul_pch(rB, rA)); ++ } ++} ++ ++void ++test_512 (void) ++{ ++ int n = 32; ++ _Float16 a[n], b[n], c[n]; ++ _Float16 exp[n]; ++ for (int i = 1; i <= n; i++) { ++ a[i - 1] = i & 1 ? -i : i; ++ b[i - 1] = i; ++ } ++ ++ func1(a, b, n, c); ++ for (int i = 0; i < n / 32; i += 2) { ++ if (c[i] != a[i] * b[i] + a[i+1] * b[i+1] ++ || c[i+1] != a[i] * b[i+1] - a[i+1]*b[i]) ++ __builtin_abort (); ++ } ++} ++ ++ +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index 3388b91..219a9a4 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 26 +%global gcc_release 27 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -173,6 +173,7 @@ Patch35: 0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch Patch36: 0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch Patch37: 0037-Software-mitigation-Disable-gather-generation-in-vec.patch Patch38: 0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch +Patch39: 0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch # On ARM EABI systems, we do want -gnueabi to be part of the # target triple. @@ -677,6 +678,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch36 -p1 %patch37 -p1 %patch38 -p1 +%patch39 -p1 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2908,6 +2910,11 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Jan 17 2024 liuhongt 12.3.1-27 +- Type: Sync +- DESC: Remove constraint modifier % for fcmaddcph/fmaddcph/fcmulcph since + there're not commutative. + * Wed Jan 17 2024 liuhongt 12.3.1-26 - Type: Sync - DESC: Support -m[no-]gather -m[no-]scatter to enable/disable vectorization -- Gitee From 955517eb53245da57dd56591653a8bec3fca57be Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 17 Jan 2024 15:41:23 +0800 Subject: [PATCH 12/15] [Sync] Disparage slightly for the alternative which move DFmode between SSE_REGS and GENERAL_REGS. For testcase void __cond_swap(double* __x, double* __y) { bool __r = (*__x < *__y); auto __tmp = __r ? *__x : *__y; *__y = __r ? *__y : *__x; *__x = __tmp; } GCC-14 with -O2 and -march=x86-64 options generates the following code: __cond_swap(double*, double*): movsd xmm1, QWORD PTR [rdi] movsd xmm0, QWORD PTR [rsi] comisd xmm0, xmm1 jbe .L2 movq rax, xmm1 movapd xmm1, xmm0 movq xmm0, rax .L2: movsd QWORD PTR [rsi], xmm1 movsd QWORD PTR [rdi], xmm0 ret rax is used to save and restore DFmode value. In RA both GENERAL_REGS and SSE_REGS cost zero since we didn't disparage the alternative in movdf_internal pattern, according to register allocation order, GENERAL_REGS is allocated. The patch add ? for alternative (r,v) and (v,r) just like we did for movsf/hf/bf_internal pattern, after that we get optimal RA. __cond_swap: .LFB0: .cfi_startproc movsd (%rdi), %xmm1 movsd (%rsi), %xmm0 comisd %xmm1, %xmm0 jbe .L2 movapd %xmm1, %xmm2 movapd %xmm0, %xmm1 movapd %xmm2, %xmm0 .L2: movsd %xmm1, (%rsi) movsd %xmm0, (%rdi) ret gcc/ChangeLog: PR target/110170 * config/i386/i386.md (movdf_internal): Disparage slightly for 2 alternatives (r,v) and (v,r) by adding constraint modifier '?'. gcc/testsuite/ChangeLog: * gcc.target/i386/pr110170-3.c: New test. --- ...ly-for-the-alternative-which-move-DF.patch | 106 ++++++++++++++++++ gcc.spec | 9 +- 2 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 0040-Disparage-slightly-for-the-alternative-which-move-DF.patch diff --git a/0040-Disparage-slightly-for-the-alternative-which-move-DF.patch b/0040-Disparage-slightly-for-the-alternative-which-move-DF.patch new file mode 100644 index 0000000..6ca7b70 --- /dev/null +++ b/0040-Disparage-slightly-for-the-alternative-which-move-DF.patch @@ -0,0 +1,106 @@ +From 655f62a56a76b99db90faecaa44a4335d674d91c Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Wed, 5 Jul 2023 13:45:11 +0800 +Subject: [Sync] Disparage slightly for the alternative which move + DFmode between SSE_REGS and GENERAL_REGS. + +For testcase + +void __cond_swap(double* __x, double* __y) { + bool __r = (*__x < *__y); + auto __tmp = __r ? *__x : *__y; + *__y = __r ? *__y : *__x; + *__x = __tmp; +} + +GCC-14 with -O2 and -march=x86-64 options generates the following code: + +__cond_swap(double*, double*): + movsd xmm1, QWORD PTR [rdi] + movsd xmm0, QWORD PTR [rsi] + comisd xmm0, xmm1 + jbe .L2 + movq rax, xmm1 + movapd xmm1, xmm0 + movq xmm0, rax +.L2: + movsd QWORD PTR [rsi], xmm1 + movsd QWORD PTR [rdi], xmm0 + ret + +rax is used to save and restore DFmode value. In RA both GENERAL_REGS +and SSE_REGS cost zero since we didn't disparage the +alternative in movdf_internal pattern, according to register +allocation order, GENERAL_REGS is allocated. The patch add ? for +alternative (r,v) and (v,r) just like we did for movsf/hf/bf_internal +pattern, after that we get optimal RA. + +__cond_swap: +.LFB0: + .cfi_startproc + movsd (%rdi), %xmm1 + movsd (%rsi), %xmm0 + comisd %xmm1, %xmm0 + jbe .L2 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm1 + movapd %xmm2, %xmm0 +.L2: + movsd %xmm1, (%rsi) + movsd %xmm0, (%rdi) + ret + +gcc/ChangeLog: + + PR target/110170 + * config/i386/i386.md (movdf_internal): Disparage slightly for + 2 alternatives (r,v) and (v,r) by adding constraint modifier + '?'. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr110170-3.c: New test. + +(cherry picked from commit 37a231cc7594d12ba0822077018aad751a6fb94e) +--- + gcc/config/i386/i386.md | 4 ++-- + gcc/testsuite/gcc.target/i386/pr110170-3.c | 11 +++++++++++ + 2 files changed, 13 insertions(+), 2 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr110170-3.c + +diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md +index be07be10d8a..71691f59894 100644 +--- a/gcc/config/i386/i386.md ++++ b/gcc/config/i386/i386.md +@@ -3582,9 +3582,9 @@ + ;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 7. + (define_insn "*movdf_internal" + [(set (match_operand:DF 0 "nonimmediate_operand" +- "=Yf*f,m ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,v,r ,o ,r ,m") ++ "=Yf*f,m ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,?r,?v,r ,o ,r ,m") + (match_operand:DF 1 "general_operand" +- "Yf*fm,Yf*f,G ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,v,r ,roF,rF,rmF,rC"))] ++ "Yf*fm,Yf*f,G ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x, v, r,roF,rF,rmF,rC"))] + "!(MEM_P (operands[0]) && MEM_P (operands[1])) + && (lra_in_progress || reload_completed + || !CONST_DOUBLE_P (operands[1]) +diff --git a/gcc/testsuite/gcc.target/i386/pr110170-3.c b/gcc/testsuite/gcc.target/i386/pr110170-3.c +new file mode 100644 +index 00000000000..70daa89e9aa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr110170-3.c +@@ -0,0 +1,11 @@ ++/* { dg-do compile { target { ! ia32 } } } */ ++/* { dg-options "-O2 -fno-if-conversion -fno-if-conversion2" } */ ++/* { dg-final { scan-assembler-not {(?n)movq.*r} } } */ ++ ++void __cond_swap(double* __x, double* __y) { ++ _Bool __r = (*__x < *__y); ++ double __tmp = __r ? *__x : *__y; ++ *__y = __r ? *__y : *__x; ++ *__x = __tmp; ++} ++ +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index 219a9a4..f69adcc 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 27 +%global gcc_release 28 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -174,6 +174,7 @@ Patch36: 0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch Patch37: 0037-Software-mitigation-Disable-gather-generation-in-vec.patch Patch38: 0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch Patch39: 0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch +Patch40: 0040-Disparage-slightly-for-the-alternative-which-move-DF.patch # On ARM EABI systems, we do want -gnueabi to be part of the # target triple. @@ -679,6 +680,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch37 -p1 %patch38 -p1 %patch39 -p1 +%patch40 -p1 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2910,6 +2912,11 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Jan 17 2024 liuhongt 12.3.1-28 +- Type: Sync +- DESC: Disparage slightly for the alternative which move DFmode between + SSE_REGS and GENERAL_REGS. + * Wed Jan 17 2024 liuhongt 12.3.1-27 - Type: Sync - DESC: Remove constraint modifier % for fcmaddcph/fmaddcph/fcmulcph since -- Gitee From ff74d335e4d6c7727dd3bcc36908ba131cf8f5b2 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 17 Jan 2024 15:43:36 +0800 Subject: [PATCH 13/15] [Sync] Fix wrong code due to vec_merge + pcmp to blendvb splitter. gcc/ChangeLog: PR target/112443 * config/i386/sse.md (*avx2_pcmp3_4): Fix swap condition from LT to GT since there's not in the pattern. (*avx2_pcmp3_5): Ditto. gcc/testsuite/ChangeLog: * g++.target/i386/pr112443.C: New test. --- ...ue-to-vec_merge-pcmp-to-blendvb-spli.patch | 163 ++++++++++++++++++ gcc.spec | 8 +- 2 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 0041-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch diff --git a/0041-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch b/0041-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch new file mode 100644 index 0000000..5a5c5ee --- /dev/null +++ b/0041-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch @@ -0,0 +1,163 @@ +From 87d4ac895c32f894f0efd7efc82fcde65161a38d Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Thu, 9 Nov 2023 13:20:05 +0800 +Subject: [Sync] Fix wrong code due to vec_merge + pcmp to blendvb + splitter. + +gcc/ChangeLog: + + PR target/112443 + * config/i386/sse.md (*avx2_pcmp3_4): Fix swap condition + from LT to GT since there's not in the pattern. + (*avx2_pcmp3_5): Ditto. + +gcc/testsuite/ChangeLog: + + * g++.target/i386/pr112443.C: New test. + +(cherry picked from commit 9a0cc04b9c9b02426762892b88efc5c44ba546bd) +--- + gcc/config/i386/sse.md | 4 +- + gcc/testsuite/g++.target/i386/pr112443.C | 108 +++++++++++++++++++++++ + 2 files changed, 110 insertions(+), 2 deletions(-) + create mode 100644 gcc/testsuite/g++.target/i386/pr112443.C + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index f25dd5f2bc4..23b858ab21c 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -16358,7 +16358,7 @@ + (match_dup 4))] + UNSPEC_BLENDV))] + { +- if (INTVAL (operands[5]) == 1) ++ if (INTVAL (operands[5]) == 5) + std::swap (operands[1], operands[2]); + operands[3] = gen_lowpart (mode, operands[3]); + }) +@@ -16388,7 +16388,7 @@ + (match_dup 4))] + UNSPEC_BLENDV))] + { +- if (INTVAL (operands[5]) == 1) ++ if (INTVAL (operands[5]) == 5) + std::swap (operands[1], operands[2]); + }) + +diff --git a/gcc/testsuite/g++.target/i386/pr112443.C b/gcc/testsuite/g++.target/i386/pr112443.C +new file mode 100644 +index 00000000000..ebfa9b4a753 +--- /dev/null ++++ b/gcc/testsuite/g++.target/i386/pr112443.C +@@ -0,0 +1,108 @@ ++/* { dg-do run } */ ++/* { dg-require-effective-target avx512bw } */ ++/* { dg-require-effective-target avx512vl } */ ++/* { dg-options "-O2 -std=c++17 -mavx512bw -mavx512vl" } */ ++ ++#include ++#include ++#include ++#include ++ ++#define AVX512BW ++#define AVX512VL ++ ++#include "avx512f-helper.h" ++ ++struct TensorIteratorBase{ ++ char* in; ++ char* out; ++ ++ void for_each(std::function loop){ ++ loop(out, in, 32); ++ } ++}; ++ ++class Vectorized { ++protected: ++ __m256i values; ++ ++ static inline __m256i invert(const __m256i& v) { ++ const auto ones = _mm256_set1_epi64x(-1); ++ return _mm256_xor_si256(ones, v); ++ } ++public: ++ operator __m256i() const { ++ return values; ++ } ++ ++ static constexpr int size() { ++ return 32; ++ } ++ ++ Vectorized() {} ++ Vectorized(__m256i v) : values(v) {} ++ Vectorized(uint8_t v) { values = _mm256_set1_epi8(v); } ++ static Vectorized blendv(const Vectorized& a, const Vectorized& b, ++ const Vectorized& mask) { ++ return _mm256_blendv_epi8(a, b, mask); ++ } ++ static Vectorized loadu(const void* ptr) { ++ return _mm256_loadu_si256(reinterpret_cast(ptr)); ++ } ++ void store(void* ptr) const { ++ _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); ++ } ++ ++ Vectorized operator<(const Vectorized& other) const { ++ __m256i max = _mm256_max_epu8(values, other); ++ return invert(_mm256_cmpeq_epi8(max, values)); ++ } ++ Vectorized operator-(const Vectorized& b) { ++ return _mm256_sub_epi8(values, b); ++ } ++}; ++ ++std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { ++ uint8_t buf[Vectorized::size()]; ++ vec.store(buf); ++ stream << "vec["; ++ for (int i = 0; i != Vectorized::size(); i++) { ++ if (i != 0) ++ stream << ", "; ++ stream << buf[i]*1; ++ } ++ stream << "]"; ++ return stream; ++} ++ ++void run(TensorIteratorBase iter){ ++ Vectorized zero_vec(0); ++ Vectorized one_vec(1); ++ ++ iter.for_each([=](char* out, char* in, int64_t size) { ++ for (int64_t i = 0; i <= size - Vectorized::size(); i += Vectorized::size()) { ++ auto self_vec = Vectorized::loadu(in + i); ++ auto left = Vectorized::blendv(zero_vec, one_vec, zero_vec < self_vec); ++ auto right = Vectorized::blendv(zero_vec, one_vec, self_vec < zero_vec); ++ auto outv = left - right; ++ outv.store(out + i); ++ } ++ }); ++} ++ ++void ++test_256 (){ ++ char in[32]; ++ char out[32]; ++ for(auto& x: in) x = 1; ++ run(TensorIteratorBase{in, out}); ++ Vectorized::loadu (out); ++ for (int i = 0; i != 32; i++) ++ if (out[i] != 1) ++ __builtin_abort (); ++} ++ ++void ++test_128 () ++{ ++} +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index f69adcc..08e5ef7 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 28 +%global gcc_release 29 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -175,6 +175,7 @@ Patch37: 0037-Software-mitigation-Disable-gather-generation-in-vec.patch Patch38: 0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch Patch39: 0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch Patch40: 0040-Disparage-slightly-for-the-alternative-which-move-DF.patch +Patch41: 0041-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch # On ARM EABI systems, we do want -gnueabi to be part of the # target triple. @@ -681,6 +682,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch38 -p1 %patch39 -p1 %patch40 -p1 +%patch41 -p1 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2912,6 +2914,10 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Jan 17 2024 liuhongt 12.3.1-29 +- Type: Sync +- DESC: Fix wrong code due to vec_merge + pcmp to blendvb splitter. + * Wed Jan 17 2024 liuhongt 12.3.1-28 - Type: Sync - DESC: Disparage slightly for the alternative which move DFmode between -- Gitee From 715c8e17de7f532a5bf86166a22832ee144cfa3a Mon Sep 17 00:00:00 2001 From: liuhongt Date: Wed, 17 Jan 2024 15:45:31 +0800 Subject: [PATCH 14/15] [Sync] Don't assume it's AVX_U128_CLEAN after call_insn whose abi.mode_clobber(V4DImode) deosn't contains all SSE_REGS. If the function desn't clobber any sse registers or only clobber 128-bit part, then vzeroupper isn't issued before the function exit. the status not CLEAN but ANY after the function. Also for sibling_call, it's safe to issue an vzeroupper. Also there could be missing vzeroupper since there's no mode_exit for sibling_call_p. gcc/ChangeLog: PR target/112891 * config/i386/i386.cc (ix86_avx_u128_mode_after): Return AVX_U128_ANY if callee_abi doesn't clobber all_sse_regs to align with ix86_avx_u128_mode_needed. (ix86_avx_u128_mode_needed): Return AVX_U128_ClEAN for sibling_call. gcc/testsuite/ChangeLog: * gcc.target/i386/pr112891.c: New test. * gcc.target/i386/pr112891-2.c: New test. --- ...s-AVX_U128_CLEAN-after-call_insn-who.patch | 151 ++++++++++++++++++ gcc.spec | 9 +- 2 files changed, 159 insertions(+), 1 deletion(-) create mode 100644 0042-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch diff --git a/0042-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch b/0042-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch new file mode 100644 index 0000000..043b521 --- /dev/null +++ b/0042-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch @@ -0,0 +1,151 @@ +From 068c5ee54e6275fab3fcb501c4d6221f86abae68 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Thu, 7 Dec 2023 09:17:27 +0800 +Subject: [Sync] Don't assume it's AVX_U128_CLEAN after call_insn whose + abi.mode_clobber(V4DImode) deosn't contains all SSE_REGS. + +If the function desn't clobber any sse registers or only clobber +128-bit part, then vzeroupper isn't issued before the function exit. +the status not CLEAN but ANY after the function. + +Also for sibling_call, it's safe to issue an vzeroupper. Also there +could be missing vzeroupper since there's no mode_exit for +sibling_call_p. + +gcc/ChangeLog: + + PR target/112891 + * config/i386/i386.cc (ix86_avx_u128_mode_after): Return + AVX_U128_ANY if callee_abi doesn't clobber all_sse_regs to + align with ix86_avx_u128_mode_needed. + (ix86_avx_u128_mode_needed): Return AVX_U128_ClEAN for + sibling_call. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr112891.c: New test. + * gcc.target/i386/pr112891-2.c: New test. + +(cherry picked from commit fc189a08f5b7ad5889bd4c6b320c1dd99dd5d642) +--- + gcc/config/i386/i386.cc | 22 +++++++++++++--- + gcc/testsuite/gcc.target/i386/pr112891-2.c | 30 ++++++++++++++++++++++ + gcc/testsuite/gcc.target/i386/pr112891.c | 29 +++++++++++++++++++++ + 3 files changed, 78 insertions(+), 3 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr112891-2.c + create mode 100644 gcc/testsuite/gcc.target/i386/pr112891.c + +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc +index e75d3702338..60f3296b00c 100644 +--- a/gcc/config/i386/i386.cc ++++ b/gcc/config/i386/i386.cc +@@ -14416,8 +14416,12 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) + modes wider than 256 bits. It's only safe to issue a + vzeroupper if all SSE registers are clobbered. */ + const function_abi &abi = insn_callee_abi (insn); +- if (!hard_reg_set_subset_p (reg_class_contents[SSE_REGS], +- abi.mode_clobbers (V4DImode))) ++ /* Should be safe to issue an vzeroupper before sibling_call_p. ++ Also there not mode_exit for sibling_call, so there could be ++ missing vzeroupper for that. */ ++ if (!(SIBLING_CALL_P (insn) ++ || hard_reg_set_subset_p (reg_class_contents[SSE_REGS], ++ abi.mode_clobbers (V4DImode)))) + return AVX_U128_ANY; + + return AVX_U128_CLEAN; +@@ -14555,7 +14559,19 @@ ix86_avx_u128_mode_after (int mode, rtx_insn *insn) + bool avx_upper_reg_found = false; + note_stores (insn, ix86_check_avx_upper_stores, &avx_upper_reg_found); + +- return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN; ++ if (avx_upper_reg_found) ++ return AVX_U128_DIRTY; ++ ++ /* If the function desn't clobber any sse registers or only clobber ++ 128-bit part, Then vzeroupper isn't issued before the function exit. ++ the status not CLEAN but ANY after the function. */ ++ const function_abi &abi = insn_callee_abi (insn); ++ if (!(SIBLING_CALL_P (insn) ++ || hard_reg_set_subset_p (reg_class_contents[SSE_REGS], ++ abi.mode_clobbers (V4DImode)))) ++ return AVX_U128_ANY; ++ ++ return AVX_U128_CLEAN; + } + + /* Otherwise, return current mode. Remember that if insn +diff --git a/gcc/testsuite/gcc.target/i386/pr112891-2.c b/gcc/testsuite/gcc.target/i386/pr112891-2.c +new file mode 100644 +index 00000000000..164c3985d50 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr112891-2.c +@@ -0,0 +1,30 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx2 -O3" } */ ++/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ ++ ++void ++__attribute__((noinline)) ++bar (double* a) ++{ ++ a[0] = 1.0; ++ a[1] = 2.0; ++} ++ ++double ++__attribute__((noinline)) ++foo (double* __restrict a, double* b) ++{ ++ a[0] += b[0]; ++ a[1] += b[1]; ++ a[2] += b[2]; ++ a[3] += b[3]; ++ bar (b); ++ return a[5] + b[5]; ++} ++ ++double ++foo1 (double* __restrict a, double* b) ++{ ++ double c = foo (a, b); ++ return __builtin_exp (c); ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr112891.c b/gcc/testsuite/gcc.target/i386/pr112891.c +new file mode 100644 +index 00000000000..dbf6c67948a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr112891.c +@@ -0,0 +1,29 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx2 -O3" } */ ++/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ ++ ++void ++__attribute__((noinline)) ++bar (double* a) ++{ ++ a[0] = 1.0; ++ a[1] = 2.0; ++} ++ ++void ++__attribute__((noinline)) ++foo (double* __restrict a, double* b) ++{ ++ a[0] += b[0]; ++ a[1] += b[1]; ++ a[2] += b[2]; ++ a[3] += b[3]; ++ bar (b); ++} ++ ++double ++foo1 (double* __restrict a, double* b) ++{ ++ foo (a, b); ++ return __builtin_exp (b[1]); ++} +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index 08e5ef7..50a1a79 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 29 +%global gcc_release 30 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -176,6 +176,7 @@ Patch38: 0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch Patch39: 0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch Patch40: 0040-Disparage-slightly-for-the-alternative-which-move-DF.patch Patch41: 0041-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch +Patch42: 0042-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch # On ARM EABI systems, we do want -gnueabi to be part of the # target triple. @@ -683,6 +684,7 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch39 -p1 %patch40 -p1 %patch41 -p1 +%patch42 -p1 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE @@ -2914,6 +2916,11 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Jan 17 2024 liuhongt 12.3.1-30 +- Type: Sync +- DESC: Don't assume it's AVX_U128_CLEAN after call_insn whose + abi.mode_clobber(V4DImode) deosn't contains all SSE_REGS. + * Wed Jan 17 2024 liuhongt 12.3.1-29 - Type: Sync - DESC: Fix wrong code due to vec_merge + pcmp to blendvb splitter. -- Gitee From 9022a215a645cae1022a02ac616abecf979e3cc2 Mon Sep 17 00:00:00 2001 From: Jan Hubicka Date: Wed, 17 Jan 2024 15:47:46 +0800 Subject: [PATCH 15/15] [Sync] Disable FMADD in chains for Zen4 and generic this patch disables use of FMA in matrix multiplication loop for generic (for x86-64-v3) and zen4. I tested this on zen4 and Xenon Gold Gold 6212U. For Intel this is neutral both on the matrix multiplication microbenchmark (attached) and spec2k17 where the difference was within noise for Core. On core the micro-benchmark runs as follows: With FMA: 578,500,241 cycles:u # 3.645 GHz ( +- 0.12% ) 753,318,477 instructions:u # 1.30 insn per cycle ( +- 0.00% ) 125,417,701 branches:u # 790.227 M/sec ( +- 0.00% ) 0.159146 +- 0.000363 seconds time elapsed ( +- 0.23% ) No FMA: 577,573,960 cycles:u # 3.514 GHz ( +- 0.15% ) 878,318,479 instructions:u # 1.52 insn per cycle ( +- 0.00% ) 125,417,702 branches:u # 763.035 M/sec ( +- 0.00% ) 0.164734 +- 0.000321 seconds time elapsed ( +- 0.19% ) So the cycle count is unchanged and discrete multiply+add takes same time as FMA. While on zen: With FMA: 484875179 cycles:u # 3.599 GHz ( +- 0.05% ) (82.11%) 752031517 instructions:u # 1.55 insn per cycle 125106525 branches:u # 928.712 M/sec ( +- 0.03% ) (85.09%) 128356 branch-misses:u # 0.10% of all branches ( +- 0.06% ) (83.58%) No FMA: 375875209 cycles:u # 3.592 GHz ( +- 0.08% ) (80.74%) 875725341 instructions:u # 2.33 insn per cycle 124903825 branches:u # 1.194 G/sec ( +- 0.04% ) (84.59%) 0.105203 +- 0.000188 seconds time elapsed ( +- 0.18% ) The diffrerence is that Cores understand the fact that fmadd does not need all three parameters to start computation, while Zen cores doesn't. Since this seems noticeable win on zen and not loss on Core it seems like good default for generic. float a[SIZE][SIZE]; float b[SIZE][SIZE]; float c[SIZE][SIZE]; void init(void) { int i, j, k; for(i=0; i +Date: Fri, 29 Dec 2023 23:51:03 +0100 +Subject: [Sync] Disable FMADD in chains for Zen4 and generic + +this patch disables use of FMA in matrix multiplication loop for generic (for +x86-64-v3) and zen4. I tested this on zen4 and Xenon Gold Gold 6212U. + +For Intel this is neutral both on the matrix multiplication microbenchmark +(attached) and spec2k17 where the difference was within noise for Core. + +On core the micro-benchmark runs as follows: + +With FMA: + + 578,500,241 cycles:u # 3.645 GHz + ( +- 0.12% ) + 753,318,477 instructions:u # 1.30 insn per +cycle ( +- 0.00% ) + 125,417,701 branches:u # 790.227 M/sec + ( +- 0.00% ) + 0.159146 +- 0.000363 seconds time elapsed ( +- 0.23% ) + +No FMA: + + 577,573,960 cycles:u # 3.514 GHz + ( +- 0.15% ) + 878,318,479 instructions:u # 1.52 insn per +cycle ( +- 0.00% ) + 125,417,702 branches:u # 763.035 M/sec + ( +- 0.00% ) + 0.164734 +- 0.000321 seconds time elapsed ( +- 0.19% ) + +So the cycle count is unchanged and discrete multiply+add takes same time as +FMA. + +While on zen: + +With FMA: + 484875179 cycles:u # 3.599 GHz + ( +- 0.05% ) (82.11%) + 752031517 instructions:u # 1.55 insn per +cycle + 125106525 branches:u # 928.712 M/sec + ( +- 0.03% ) (85.09%) + 128356 branch-misses:u # 0.10% of all +branches ( +- 0.06% ) (83.58%) + +No FMA: + 375875209 cycles:u # 3.592 GHz + ( +- 0.08% ) (80.74%) + 875725341 instructions:u # 2.33 insn per +cycle + 124903825 branches:u # 1.194 G/sec + ( +- 0.04% ) (84.59%) + 0.105203 +- 0.000188 seconds time elapsed ( +- 0.18% ) + +The diffrerence is that Cores understand the fact that fmadd does not need +all three parameters to start computation, while Zen cores doesn't. + +Since this seems noticeable win on zen and not loss on Core it seems like good +default for generic. + +float a[SIZE][SIZE]; +float b[SIZE][SIZE]; +float c[SIZE][SIZE]; + +void init(void) +{ + int i, j, k; + for(i=0; i gcc/DEV-PHASE @@ -2916,6 +2918,10 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Jan 17 2024 Jan Hubicka 12.3.1-31 +- Type: Sync +- DESC: Disable FMADD in chains for Zen4 and generic + * Wed Jan 17 2024 liuhongt 12.3.1-30 - Type: Sync - DESC: Don't assume it's AVX_U128_CLEAN after call_insn whose -- Gitee