From c3948da8f29a3227af844bc81614b951cd72ea25 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 17 Jan 2024 14:28:56 +0800
Subject: [PATCH 01/15] [Sync] x86: Add a new option -mdaz-ftz to enable FTZ
 and DAZ flags in MXCSR.

    if (mdaz-ftz)
      link crtfastmath.o
    else if ((Ofast || ffast-math || funsafe-math-optimizations)
             && !mno-daz-ftz)
      link crtfastmath.o
    else
      Don't link crtfastmath.o

gcc/ChangeLog:

	* config/i386/cygwin.h (ENDFILE_SPEC): Link crtfastmath.o
	whenever -mdaz-ftz is specified. Don't link crtfastmath.o
	when -mno-daz-ftz is specified.
	* config/i386/darwin.h (ENDFILE_SPEC): Ditto.
	* config/i386/gnu-user-common.h
	(GNU_USER_TARGET_MATHFILE_SPEC): Ditto.
	* config/i386/mingw32.h (ENDFILE_SPEC): Ditto.
	* config/i386/i386.opt (mdaz-ftz): New option.
	* doc/invoke.texi (x86 options): Document mftz-daz.
---
 ...ption-mdaz-ftz-to-enable-FTZ-and-DAZ.patch | 135 ++++++++++++++++++
 gcc.spec                                      |   8 +-
 2 files changed, 142 insertions(+), 1 deletion(-)
 create mode 100644 0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch

diff --git a/0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch b/0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch
new file mode 100644
index 0000000..5148d7e
--- /dev/null
+++ b/0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch
@@ -0,0 +1,135 @@
+From 75d05d4e2cb6ac0b85391c51a59925a97eaee85f Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Wed, 10 May 2023 15:16:58 +0800
+Subject: x86: Add a new option -mdaz-ftz to enable FTZ and DAZ
+ flags in MXCSR.
+
+    if (mdaz-ftz)
+      link crtfastmath.o
+    else if ((Ofast || ffast-math || funsafe-math-optimizations)
+             && !mno-daz-ftz)
+      link crtfastmath.o
+    else
+      Don't link crtfastmath.o
+
+gcc/ChangeLog:
+
+	* config/i386/cygwin.h (ENDFILE_SPEC): Link crtfastmath.o
+	whenever -mdaz-ftz is specified. Don't link crtfastmath.o
+	when -mno-daz-ftz is specified.
+	* config/i386/darwin.h (ENDFILE_SPEC): Ditto.
+	* config/i386/gnu-user-common.h
+	(GNU_USER_TARGET_MATHFILE_SPEC): Ditto.
+	* config/i386/mingw32.h (ENDFILE_SPEC): Ditto.
+	* config/i386/i386.opt (mdaz-ftz): New option.
+	* doc/invoke.texi (x86 options): Document mftz-daz.
+---
+ gcc/config/i386/cygwin.h          |  2 +-
+ gcc/config/i386/darwin.h          |  4 ++--
+ gcc/config/i386/gnu-user-common.h |  2 +-
+ gcc/config/i386/i386.opt          |  4 ++++
+ gcc/config/i386/mingw32.h         |  2 +-
+ gcc/doc/invoke.texi               | 11 ++++++++++-
+ 6 files changed, 19 insertions(+), 6 deletions(-)
+
+diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h
+index d06eda369cf..5412c5d4479 100644
+--- a/gcc/config/i386/cygwin.h
++++ b/gcc/config/i386/cygwin.h
+@@ -57,7 +57,7 @@ along with GCC; see the file COPYING3.  If not see
+ 
+ #undef ENDFILE_SPEC
+ #define ENDFILE_SPEC \
+-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}\
++  "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \
+    %{!shared:%:if-exists(default-manifest.o%s)}\
+    %{fvtable-verify=none:%s; \
+     fvtable-verify=preinit:vtv_end.o%s; \
+diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h
+index a55f6b2b874..2f773924d6e 100644
+--- a/gcc/config/i386/darwin.h
++++ b/gcc/config/i386/darwin.h
+@@ -109,8 +109,8 @@ along with GCC; see the file COPYING3.  If not see
+ "%{!force_cpusubtype_ALL:-force_cpusubtype_ALL} "
+ 
+ #undef ENDFILE_SPEC
+-#define ENDFILE_SPEC \
+-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
++#define ENDFILE_SPEC
++\  "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \
+    %{mpc32:crtprec32.o%s} \
+    %{mpc64:crtprec64.o%s} \
+    %{mpc80:crtprec80.o%s}" TM_DESTRUCTOR
+diff --git a/gcc/config/i386/gnu-user-common.h b/gcc/config/i386/gnu-user-common.h
+index 23b54c5be52..3d2a33f1714 100644
+--- a/gcc/config/i386/gnu-user-common.h
++++ b/gcc/config/i386/gnu-user-common.h
+@@ -47,7 +47,7 @@ along with GCC; see the file COPYING3.  If not see
+ 
+ /* Similar to standard GNU userspace, but adding -ffast-math support.  */
+ #define GNU_USER_TARGET_MATHFILE_SPEC \
+-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
++  "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \
+    %{mpc32:crtprec32.o%s} \
+    %{mpc64:crtprec64.o%s} \
+    %{mpc80:crtprec80.o%s}"
+diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
+index fc1b944acb8..498fb454d01 100644
+--- a/gcc/config/i386/i386.opt
++++ b/gcc/config/i386/i386.opt
+@@ -420,6 +420,10 @@ mpc80
+ Target RejectNegative
+ Set 80387 floating-point precision to 80-bit.
+ 
++mdaz-ftz
++Target
++Set the FTZ and DAZ Flags.
++
+ mpreferred-stack-boundary=
+ Target RejectNegative Joined UInteger Var(ix86_preferred_stack_boundary_arg)
+ Attempt to keep stack aligned to this power of 2.
+diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h
+index d3ca0cd0279..ddbe6a4054b 100644
+--- a/gcc/config/i386/mingw32.h
++++ b/gcc/config/i386/mingw32.h
+@@ -197,7 +197,7 @@ along with GCC; see the file COPYING3.  If not see
+ 
+ #undef ENDFILE_SPEC
+ #define ENDFILE_SPEC \
+-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
++  "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \
+    %{!shared:%:if-exists(default-manifest.o%s)}\
+    %{fvtable-verify=none:%s; \
+     fvtable-verify=preinit:vtv_end.o%s; \
+diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
+index 2b376e0e995..3a48655e588 100644
+--- a/gcc/doc/invoke.texi
++++ b/gcc/doc/invoke.texi
+@@ -1437,7 +1437,7 @@ See RS/6000 and PowerPC Options.
+ -m96bit-long-double  -mlong-double-64  -mlong-double-80  -mlong-double-128 @gol
+ -mregparm=@var{num}  -msseregparm @gol
+ -mveclibabi=@var{type}  -mvect8-ret-in-mem @gol
+--mpc32  -mpc64  -mpc80  -mstackrealign @gol
++-mpc32  -mpc64  -mpc80 -mdaz-ftz -mstackrealign @gol
+ -momit-leaf-frame-pointer  -mno-red-zone  -mno-tls-direct-seg-refs @gol
+ -mcmodel=@var{code-model}  -mabi=@var{name}  -maddress-mode=@var{mode} @gol
+ -m32  -m64  -mx32  -m16  -miamcu  -mlarge-data-threshold=@var{num} @gol
+@@ -32122,6 +32122,15 @@ are enabled by default; routines in such libraries could suffer significant
+ loss of accuracy, typically through so-called ``catastrophic cancellation'',
+ when this option is used to set the precision to less than extended precision.
+ 
++@item -mdaz-ftz
++@opindex mdaz-ftz
++
++The flush-to-zero (FTZ) and denormals-are-zero (DAZ) flags in the MXCSR register
++are used to control floating-point calculations.SSE and AVX instructions
++including scalar and vector instructions could benefit from enabling the FTZ
++and DAZ flags when @option{-mdaz-ftz} is specified. Don't set FTZ/DAZ flags
++when @option{-mno-daz-ftz} is specified.
++
+ @item -mstackrealign
+ @opindex mstackrealign
+ Realign the stack at entry.  On the x86, the @option{-mstackrealign}
+-- 
+2.31.1
+
diff --git a/gcc.spec b/gcc.spec
index 937761f..ff7b64f 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 16
+%global gcc_release 17
 
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -163,6 +163,7 @@ Patch25: 0025-AArch64-Rewrite-the-tsv110-option.patch
 Patch26: 0026-GOMP-Enabling-moutline-atomics-improves-libgomp-perf.patch
 Patch27: 0027-LoopElim-Redundant-loop-elimination-optimization.patch
 Patch28: 0028-Array-widen-compare-Fix-the-return-value-match-after.patch
+Patch29: 0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
 # target triple.
@@ -657,6 +658,7 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch26 -p1
 %patch27 -p1
 %patch28 -p1
+%patch29 -p1
 
 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE
 
@@ -2888,6 +2890,10 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-17
+- Type: Sync
+- DESC: x86: Add a new option -mdaz-ftz to enable FTZ and DAZ flags in MXCSR.
+
 * Mon Sep 11 2023 dingguangya <dingguangya1@huawei.com> 12.3.1-16
 - Type: Sync
 - DESC: Sync patch from openeuler/gcc
-- 
Gitee


From 5d9af8463edaf55215cd9b36e777057b02e6f392 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 17 Jan 2024 14:37:25 +0800
Subject: [PATCH 02/15] [Sync] Explicitly view_convert_expr mask to signed type
 when folding pblendvb builtins.

Since mask < 0 will be always false for vector char when
-funsigned-char, but vpblendvb needs to check the most significant
bit. The patch explicitly VCE to vector signed char.

gcc/ChangeLog:

	PR target/110108
	* config/i386/i386.cc (ix86_gimple_fold_builtin): Explicitly
	view_convert_expr mask to signed type when folding pblendvb
	builtins.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr110108-2.c: New test.
---
 ...convert_expr-mask-to-signed-type-whe.patch | 65 +++++++++++++++++++
 gcc.spec                                      |  9 ++-
 2 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch

diff --git a/0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch b/0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch
new file mode 100644
index 0000000..4853877
--- /dev/null
+++ b/0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch
@@ -0,0 +1,65 @@
+From 91c609109438b970dd32b4bd7eefcfab1be7fed9 Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Mon, 5 Jun 2023 12:38:41 +0800
+Subject: Explicitly view_convert_expr mask to signed type when
+ folding pblendvb builtins.
+
+Since mask < 0 will be always false for vector char when
+-funsigned-char, but vpblendvb needs to check the most significant
+bit. The patch explicitly VCE to vector signed char.
+
+gcc/ChangeLog:
+
+	PR target/110108
+	* config/i386/i386.cc (ix86_gimple_fold_builtin): Explicitly
+	view_convert_expr mask to signed type when folding pblendvb
+	builtins.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.target/i386/pr110108-2.c: New test.
+---
+ gcc/config/i386/i386.cc                    |  4 +++-
+ gcc/testsuite/gcc.target/i386/pr110108-2.c | 14 ++++++++++++++
+ 2 files changed, 17 insertions(+), 1 deletion(-)
+ create mode 100644 gcc/testsuite/gcc.target/i386/pr110108-2.c
+
+diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
+index 462dce10e5c..479fc601049 100644
+--- a/gcc/config/i386/i386.cc
++++ b/gcc/config/i386/i386.cc
+@@ -18396,8 +18396,10 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
+ 	      tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode
+ 		? intSI_type_node : intDI_type_node;
+ 	      type = get_same_sized_vectype (itype, type);
+-	      arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2);
+ 	    }
++	  else
++	    type = signed_type_for (type);
++	  arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2);
+ 	  tree zero_vec = build_zero_cst (type);
+ 	  tree cmp_type = truth_type_for (type);
+ 	  tree cmp = gimple_build (&stmts, LT_EXPR, cmp_type, arg2, zero_vec);
+diff --git a/gcc/testsuite/gcc.target/i386/pr110108-2.c b/gcc/testsuite/gcc.target/i386/pr110108-2.c
+new file mode 100644
+index 00000000000..2d1d2fd4991
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/i386/pr110108-2.c
+@@ -0,0 +1,14 @@
++/* { dg-do compile } */
++/* { dg-options "-mavx2 -O2 -funsigned-char" } */
++/* { dg-final { scan-assembler-times "vpblendvb" 2 } } */
++
++#include <immintrin.h>
++__m128i do_stuff_128(__m128i X0, __m128i X1, __m128i X2) {
++  __m128i Result = _mm_blendv_epi8(X0, X1, X2);
++  return Result;
++}
++
++__m256i do_stuff_256(__m256i X0, __m256i X1, __m256i X2) {
++  __m256i Result = _mm256_blendv_epi8(X0, X1, X2);
++  return Result;
++}
+-- 
+2.31.1
+
diff --git a/gcc.spec b/gcc.spec
index ff7b64f..1163542 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 17
+%global gcc_release 18
 
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -164,6 +164,7 @@ Patch26: 0026-GOMP-Enabling-moutline-atomics-improves-libgomp-perf.patch
 Patch27: 0027-LoopElim-Redundant-loop-elimination-optimization.patch
 Patch28: 0028-Array-widen-compare-Fix-the-return-value-match-after.patch
 Patch29: 0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch
+Patch30: 0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
 # target triple.
@@ -659,6 +660,7 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch27 -p1
 %patch28 -p1
 %patch29 -p1
+%patch30 -p1
 
 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE
 
@@ -2890,6 +2892,11 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-18
+- Type: Sync
+- DESC: Explicitly view_convert_expr mask to signed type when folding
+  pblendvb builtins.
+
 * Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-17
 - Type: Sync
 - DESC: x86: Add a new option -mdaz-ftz to enable FTZ and DAZ flags in MXCSR.
-- 
Gitee


From 3fd344c8f493de6f5ad65bd7327c304079421ace Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 17 Jan 2024 14:41:36 +0800
Subject: [PATCH 03/15] [Sync] Make option mvzeroupper independent of
 optimization level.

pass_insert_vzeroupper is under condition

TARGET_AVX && TARGET_VZEROUPPER
&& flag_expensive_optimizations && !optimize_size

But the document of mvzeroupper doesn't mention the insertion
required -O2 and above, it may confuse users when they explicitly
use -Os -mvzeroupper.

------------
mvzeroupper
Target Mask(VZEROUPPER) Save
Generate vzeroupper instruction before a transfer of control flow out of
the function.
------------

The patch moves flag_expensive_optimizations && !optimize_size to
ix86_option_override_internal. It makes -mvzeroupper independent of
optimization level, but still keeps the behavior of architecture
tuning(emit_vzeroupper) unchanged.

gcc/ChangeLog:

	* config/i386/i386-features.cc (pass_insert_vzeroupper:gate):
	Move flag_expensive_optimizations && !optimize_size to ..
	* config/i386/i386-options.cc (ix86_option_override_internal):
	.. this, it makes -mvzeroupper independent of optimization
	level, but still keeps the behavior of architecture
	tuning(emit_vzeroupper) unchanged.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx-vzeroupper-29.c: New testcase.
	* gcc.target/i386/avx-vzeroupper-12.c: Adjust testcase.
	* gcc.target/i386/avx-vzeroupper-7.c: Ditto.
	* gcc.target/i386/avx-vzeroupper-9.c: Ditto.
---
 ...eroupper-independent-of-optimization.patch | 137 ++++++++++++++++++
 gcc.spec                                      |   8 +-
 2 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 0031-Make-option-mvzeroupper-independent-of-optimization.patch

diff --git a/0031-Make-option-mvzeroupper-independent-of-optimization.patch b/0031-Make-option-mvzeroupper-independent-of-optimization.patch
new file mode 100644
index 0000000..0a7aacd
--- /dev/null
+++ b/0031-Make-option-mvzeroupper-independent-of-optimization.patch
@@ -0,0 +1,137 @@
+From 38364a48b8439a9b16717136835d28690d2a2dd6 Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Mon, 26 Jun 2023 09:50:25 +0800
+Subject: [Sync] Make option mvzeroupper independent of optimization level.
+
+pass_insert_vzeroupper is under condition
+
+TARGET_AVX && TARGET_VZEROUPPER
+&& flag_expensive_optimizations && !optimize_size
+
+But the document of mvzeroupper doesn't mention the insertion
+required -O2 and above, it may confuse users when they explicitly
+use -Os -mvzeroupper.
+
+------------
+mvzeroupper
+Target Mask(VZEROUPPER) Save
+Generate vzeroupper instruction before a transfer of control flow out of
+the function.
+------------
+
+The patch moves flag_expensive_optimizations && !optimize_size to
+ix86_option_override_internal. It makes -mvzeroupper independent of
+optimization level, but still keeps the behavior of architecture
+tuning(emit_vzeroupper) unchanged.
+
+gcc/ChangeLog:
+
+	* config/i386/i386-features.cc (pass_insert_vzeroupper:gate):
+	Move flag_expensive_optimizations && !optimize_size to ..
+	* config/i386/i386-options.cc (ix86_option_override_internal):
+	.. this, it makes -mvzeroupper independent of optimization
+	level, but still keeps the behavior of architecture
+	tuning(emit_vzeroupper) unchanged.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.target/i386/avx-vzeroupper-29.c: New testcase.
+	* gcc.target/i386/avx-vzeroupper-12.c: Adjust testcase.
+	* gcc.target/i386/avx-vzeroupper-7.c: Ditto.
+	* gcc.target/i386/avx-vzeroupper-9.c: Ditto.
+---
+ gcc/config/i386/i386-features.cc                  |  3 +--
+ gcc/config/i386/i386-options.cc                   |  4 +++-
+ gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c |  3 ++-
+ gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c | 14 ++++++++++++++
+ gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c  |  3 ++-
+ gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c  |  3 ++-
+ 6 files changed, 24 insertions(+), 6 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c
+
+diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
+index 6fe41c3c24f..6a2444eb6b6 100644
+--- a/gcc/config/i386/i386-features.cc
++++ b/gcc/config/i386/i386-features.cc
+@@ -1875,8 +1875,7 @@ public:
+   /* opt_pass methods: */
+   virtual bool gate (function *)
+     {
+-      return TARGET_AVX && TARGET_VZEROUPPER
+-	&& flag_expensive_optimizations && !optimize_size;
++      return TARGET_AVX && TARGET_VZEROUPPER;
+     }
+ 
+   virtual unsigned int execute (function *)
+diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
+index ff44ad4e03c..74e969b6896 100644
+--- a/gcc/config/i386/i386-options.cc
++++ b/gcc/config/i386/i386-options.cc
+@@ -2702,7 +2702,9 @@ ix86_option_override_internal (bool main_args_p,
+     sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH");
+ 
+   if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
+-      && TARGET_EMIT_VZEROUPPER)
++      && TARGET_EMIT_VZEROUPPER
++      && flag_expensive_optimizations
++      && !optimize_size)
+     opts->x_target_flags |= MASK_VZEROUPPER;
+   if (!(opts_set->x_target_flags & MASK_STV))
+     opts->x_target_flags |= MASK_STV;
+diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c
+index e694d4048bd..5a40e87832c 100644
+--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c
++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c
+@@ -16,5 +16,6 @@ foo ()
+   _mm256_zeroupper ();
+ }
+ 
+-/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 } } */
++/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 { target ia32 } } } */
++/* { dg-final { scan-assembler-times "avx_vzeroupper" 5 { target { ! ia32 } } } } */
+ /* { dg-final { scan-assembler-times "\\*avx_vzeroall" 1 } } */
+diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c
+new file mode 100644
+index 00000000000..4af637757f7
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c
+@@ -0,0 +1,14 @@
++/* { dg-do compile } */
++/* { dg-options "-O0 -mavx -mtune=generic -mvzeroupper -dp" } */
++
++#include <immintrin.h>
++
++extern __m256 x, y;
++
++void
++foo ()
++{
++  x = y;
++}
++
++/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
+diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c
+index ab6d68779b3..75fe5889783 100644
+--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c
++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c
+@@ -12,4 +12,5 @@ foo ()
+   _mm256_zeroupper ();
+ }
+ 
+-/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
++/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 { target ia32 } } } */
++/* { dg-final { scan-assembler-times "avx_vzeroupper" 2 { target { ! ia32 } } } } */
+diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c
+index 974e1626a6d..fa0a6dfcaac 100644
+--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c
++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c
+@@ -15,4 +15,5 @@ foo ()
+   _mm256_zeroupper ();
+ }
+ 
+-/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 } } */
++/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 { target ia32 } } } */
++/* { dg-final { scan-assembler-times "avx_vzeroupper" 5 { target { ! ia32 } } } } */
+-- 
+2.31.1
+
diff --git a/gcc.spec b/gcc.spec
index 1163542..4a1279a 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 18
+%global gcc_release 19
 
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -165,6 +165,7 @@ Patch27: 0027-LoopElim-Redundant-loop-elimination-optimization.patch
 Patch28: 0028-Array-widen-compare-Fix-the-return-value-match-after.patch
 Patch29: 0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch
 Patch30: 0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch
+Patch31: 0031-Make-option-mvzeroupper-independent-of-optimization.patch
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
 # target triple.
@@ -661,6 +662,7 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch28 -p1
 %patch29 -p1
 %patch30 -p1
+%patch31 -p1
 
 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE
 
@@ -2892,6 +2894,10 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-19
+- Type: Sync
+- DESC: Make option mvzeroupper independent of optimization level.
+
 * Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-18
 - Type: Sync
 - DESC: Explicitly view_convert_expr mask to signed type when folding
-- 
Gitee


From 676ad4656ff3b132e1956905dfb84e936d64677d Mon Sep 17 00:00:00 2001
From: Hongyu Wang <hongyu.wang@intel.com>
Date: Wed, 17 Jan 2024 15:08:50 +0800
Subject: [PATCH 04/15] [Sync] i386: Sync tune_string with arch_string for
 target attribute arch=*

For function with target attribute arch=*, current logic will set its
tune to -mtune from command line so all target_clones will get same
tuning flags which would affect the performance for each clone. Override
tune with arch if tune was not explicitly specified to get proper tuning
flags for target_clones.

gcc/ChangeLog:

	* config/i386/i386-options.cc (ix86_valid_target_attribute_tree):
	Override tune_string with arch_string if tune_string is not
	explicitly specified.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/mvc17.c: New test.
---
 ...tring-with-arch_string-for-target-at.patch | 66 +++++++++++++++++++
 gcc.spec                                      |  8 ++-
 2 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch

diff --git a/0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch b/0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch
new file mode 100644
index 0000000..793ac8c
--- /dev/null
+++ b/0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch
@@ -0,0 +1,66 @@
+From 60c432aac9e2d66b0c0ad09513f7c3b98f53ec35 Mon Sep 17 00:00:00 2001
+From: Hongyu Wang <hongyu.wang@intel.com>
+Date: Sun, 25 Jun 2023 09:50:21 +0800
+Subject: [Sync] i386: Sync tune_string with arch_string for target attribute
+arch=*
+
+For function with target attribute arch=*, current logic will set its
+tune to -mtune from command line so all target_clones will get same
+tuning flags which would affect the performance for each clone. Override
+tune with arch if tune was not explicitly specified to get proper tuning
+flags for target_clones.
+
+gcc/ChangeLog:
+
+	* config/i386/i386-options.cc (ix86_valid_target_attribute_tree):
+	Override tune_string with arch_string if tune_string is not
+	explicitly specified.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.target/i386/mvc17.c: New test.
+
+(cherry picked from commit 2916278d14e9ac28c361c396a67256acbebda6e8)
+---
+ gcc/config/i386/i386-options.cc       |  6 +++++-
+ gcc/testsuite/gcc.target/i386/mvc17.c | 11 +++++++++++
+ 2 files changed, 16 insertions(+), 1 deletion(-)
+ create mode 100644 gcc/testsuite/gcc.target/i386/mvc17.c
+
+diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
+index 74e969b6896..fb2ed942f67 100644
+--- a/gcc/config/i386/i386-options.cc
++++ b/gcc/config/i386/i386-options.cc
+@@ -1378,7 +1378,11 @@ ix86_valid_target_attribute_tree (tree fndecl, tree args,
+       if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
+ 	opts->x_ix86_tune_string
+ 	  = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
+-      else if (orig_tune_defaulted)
++      /* If we have explicit arch string and no tune string specified, set
++	 tune_string to NULL and later it will be overriden by arch_string
++	 so target clones can get proper optimization.  */
++      else if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
++	       || orig_tune_defaulted)
+ 	opts->x_ix86_tune_string = NULL;
+ 
+       /* If fpmath= is not set, and we now have sse2 on 32-bit, use it.  */
+diff --git a/gcc/testsuite/gcc.target/i386/mvc17.c b/gcc/testsuite/gcc.target/i386/mvc17.c
+new file mode 100644
+index 00000000000..8b83c1aecb3
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/i386/mvc17.c
+@@ -0,0 +1,11 @@
++/* { dg-do compile } */
++/* { dg-require-ifunc "" } */
++/* { dg-options "-O2 -march=x86-64" } */
++/* { dg-final { scan-assembler-times "rep mov" 1 } } */
++
++__attribute__((target_clones("default","arch=icelake-server")))
++void
++foo (char *a, char *b, int size)
++{
++  __builtin_memcpy (a, b, size & 0x7F);
++}
+-- 
+2.31.1
+
diff --git a/gcc.spec b/gcc.spec
index 4a1279a..c5df18e 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 19
+%global gcc_release 20
 
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -166,6 +166,7 @@ Patch28: 0028-Array-widen-compare-Fix-the-return-value-match-after.patch
 Patch29: 0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch
 Patch30: 0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch
 Patch31: 0031-Make-option-mvzeroupper-independent-of-optimization.patch
+Patch32: 0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
 # target triple.
@@ -663,6 +664,7 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch29 -p1
 %patch30 -p1
 %patch31 -p1
+%patch32 -p1
 
 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE
 
@@ -2894,6 +2896,10 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Wed Jan 17 2024 Hongyu Wang <hongyu.wang@intel.com> 12.3.1-20
+- Type: Sync
+- DESC: i386: Sync tune_string with arch_string for target attribute arch=*
+
 * Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-19
 - Type: Sync
 - DESC: Make option mvzeroupper independent of optimization level.
-- 
Gitee


From 0cd760a99de83978de370fca3341e959b8823868 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 17 Jan 2024 15:17:03 +0800
Subject: [PATCH 05/15] [Sync] Refine maskloadmn pattern with UNSPEC_MASKLOAD.

If mem_addr points to a memory region with less than whole vector size
bytes of accessible memory and k is a mask that would prevent reading
the inaccessible bytes from mem_addr, add UNSPEC_MASKLOAD to prevent
it to be transformed to vpblendd.

gcc/ChangeLog:

	PR target/110309
	* config/i386/sse.md (maskload<mode><avx512fmaskmodelower>):
	Refine pattern with UNSPEC_MASKLOAD.
	(maskload<mode><avx512fmaskmodelower>): Ditto.
	(*<avx512>_load<mode>_mask): Extend mode iterator to
	VI12HF_AVX512VL.
	(*<avx512>_load<mode>): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr110309.c: New test.
---
 ...kloadmn-pattern-with-UNSPEC_MASKLOAD.patch | 111 ++++++++++++++++++
 gcc.spec                                      |   8 +-
 2 files changed, 118 insertions(+), 1 deletion(-)
 create mode 100644 0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch

diff --git a/0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch b/0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch
new file mode 100644
index 0000000..286f26e
--- /dev/null
+++ b/0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch
@@ -0,0 +1,111 @@
+From d2db85f2ce63c84137fa8f01f6eddce810db8901 Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Tue, 20 Jun 2023 15:41:00 +0800
+Subject: [Sync] Refine maskloadmn pattern with UNSPEC_MASKLOAD.
+
+If mem_addr points to a memory region with less than whole vector size
+bytes of accessible memory and k is a mask that would prevent reading
+the inaccessible bytes from mem_addr, add UNSPEC_MASKLOAD to prevent
+it to be transformed to vpblendd.
+
+gcc/ChangeLog:
+
+	PR target/110309
+	* config/i386/sse.md (maskload<mode><avx512fmaskmodelower>):
+	Refine pattern with UNSPEC_MASKLOAD.
+	(maskload<mode><avx512fmaskmodelower>): Ditto.
+	(*<avx512>_load<mode>_mask): Extend mode iterator to
+	VI12HF_AVX512VL.
+	(*<avx512>_load<mode>): Ditto.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.target/i386/pr110309.c: New test.
+---
+ gcc/config/i386/sse.md                   | 32 +++++++++++++-----------
+ gcc/testsuite/gcc.target/i386/pr110309.c | 10 ++++++++
+ 2 files changed, 28 insertions(+), 14 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/i386/pr110309.c
+
+diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
+index eb767e56ca4..b30e96cb1ab 100644
+--- a/gcc/config/i386/sse.md
++++ b/gcc/config/i386/sse.md
+@@ -1411,12 +1411,12 @@
+ })
+ 
+ (define_insn "*<avx512>_load<mode>_mask"
+-  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
+-	(vec_merge:VI12_AVX512VL
+-	  (unspec:VI12_AVX512VL
+-	    [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")]
++  [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v")
++	(vec_merge:VI12HF_AVX512VL
++	  (unspec:VI12HF_AVX512VL
++	    [(match_operand:VI12HF_AVX512VL 1 "memory_operand" "m")]
+ 	    UNSPEC_MASKLOAD)
+-	  (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C")
++	  (match_operand:VI12HF_AVX512VL 2 "nonimm_or_0_operand" "0C")
+ 	  (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))]
+   "TARGET_AVX512BW"
+   "vmovdqu<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
+@@ -1425,9 +1425,9 @@
+    (set_attr "mode" "<sseinsnmode>")])
+ 
+ (define_insn_and_split "*<avx512>_load<mode>"
+-  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
+-	(unspec:VI12_AVX512VL
+-	  [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")]
++  [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v")
++	(unspec:VI12HF_AVX512VL
++	  [(match_operand:VI12HF_AVX512VL 1 "memory_operand" "m")]
+ 	  UNSPEC_MASKLOAD))]
+   "TARGET_AVX512BW"
+   "#"
+@@ -25973,17 +25973,21 @@
+   "TARGET_AVX")
+ 
+ (define_expand "maskload<mode><avx512fmaskmodelower>"
+-  [(set (match_operand:V48H_AVX512VL 0 "register_operand")
+-	(vec_merge:V48H_AVX512VL
+-	  (match_operand:V48H_AVX512VL 1 "memory_operand")
++  [(set (match_operand:V48_AVX512VL 0 "register_operand")
++	(vec_merge:V48_AVX512VL
++	  (unspec:V48_AVX512VL
++	    [(match_operand:V48_AVX512VL 1 "memory_operand")]
++	    UNSPEC_MASKLOAD)
+ 	  (match_dup 0)
+ 	  (match_operand:<avx512fmaskmode> 2 "register_operand")))]
+   "TARGET_AVX512F")
+ 
+ (define_expand "maskload<mode><avx512fmaskmodelower>"
+-  [(set (match_operand:VI12_AVX512VL 0 "register_operand")
+-	(vec_merge:VI12_AVX512VL
+-	  (match_operand:VI12_AVX512VL 1 "memory_operand")
++  [(set (match_operand:VI12HF_AVX512VL 0 "register_operand")
++	(vec_merge:VI12HF_AVX512VL
++	  (unspec:VI12HF_AVX512VL
++	    [(match_operand:VI12HF_AVX512VL 1 "memory_operand")]
++	    UNSPEC_MASKLOAD)
+ 	  (match_dup 0)
+ 	  (match_operand:<avx512fmaskmode> 2 "register_operand")))]
+   "TARGET_AVX512BW")
+diff --git a/gcc/testsuite/gcc.target/i386/pr110309.c b/gcc/testsuite/gcc.target/i386/pr110309.c
+new file mode 100644
+index 00000000000..f6e9e9c3c61
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/i386/pr110309.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-options "-O3 --param vect-partial-vector-usage=1 -march=znver4 -mprefer-vector-width=256" } */
++/* { dg-final { scan-assembler-not {(?n)vpblendd.*ymm} } } */
++
++
++void foo (int * __restrict a, int *b)
++{
++  for (int i = 0; i < 6; ++i)
++    a[i] = b[i] + 42;
++}
+-- 
+2.31.1
+
diff --git a/gcc.spec b/gcc.spec
index c5df18e..c62cb37 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 20
+%global gcc_release 21
 
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -167,6 +167,7 @@ Patch29: 0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch
 Patch30: 0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch
 Patch31: 0031-Make-option-mvzeroupper-independent-of-optimization.patch
 Patch32: 0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch
+Patch33: 0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
 # target triple.
@@ -665,6 +666,7 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch30 -p1
 %patch31 -p1
 %patch32 -p1
+%patch33 -p1
 
 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE
 
@@ -2896,6 +2898,10 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-21
+- Type: Sync
+- DESC: Refine maskloadmn pattern with UNSPEC_MASKLOAD.
+
 * Wed Jan 17 2024 Hongyu Wang <hongyu.wang@intel.com> 12.3.1-20
 - Type: Sync
 - DESC: i386: Sync tune_string with arch_string for target attribute arch=*
-- 
Gitee


From d0c367fe563778ba55b07e3eb0080b88c6a97290 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 17 Jan 2024 15:20:19 +0800
Subject: [PATCH 06/15] [Sync] Refine maskstore patterns with UNSPEC_MASKMOV.

Similar like r14-2070-gc79476da46728e

If mem_addr points to a memory region with less than whole vector size
bytes of accessible memory and k is a mask that would prevent reading
the inaccessible bytes from mem_addr, add UNSPEC_MASKMOV to prevent
it to be transformed to any other whole memory access instructions.

gcc/ChangeLog:

	PR rtl-optimization/110237
	* config/i386/sse.md (<avx512>_store<mode>_mask): Refine with
	UNSPEC_MASKMOV.
	(maskstore<mode><avx512fmaskmodelower): Ditto.
	(*<avx512>_store<mode>_mask): New define_insn, it's renamed
	from original <avx512>_store<mode>_mask.
---
 ...skstore-patterns-with-UNSPEC_MASKMOV.patch | 126 ++++++++++++++++++
 gcc.spec                                      |   8 +-
 2 files changed, 133 insertions(+), 1 deletion(-)
 create mode 100644 0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch

diff --git a/0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch b/0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch
new file mode 100644
index 0000000..81c4819
--- /dev/null
+++ b/0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch
@@ -0,0 +1,126 @@
+From 389fe721ab32b154fb07e4865fa32be0608bd5d2 Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Mon, 26 Jun 2023 21:07:09 +0800
+Subject: [Sync] Refine maskstore patterns with UNSPEC_MASKMOV.
+
+Similar like r14-2070-gc79476da46728e
+
+If mem_addr points to a memory region with less than whole vector size
+bytes of accessible memory and k is a mask that would prevent reading
+the inaccessible bytes from mem_addr, add UNSPEC_MASKMOV to prevent
+it to be transformed to any other whole memory access instructions.
+
+gcc/ChangeLog:
+
+	PR rtl-optimization/110237
+	* config/i386/sse.md (<avx512>_store<mode>_mask): Refine with
+	UNSPEC_MASKMOV.
+	(maskstore<mode><avx512fmaskmodelower): Ditto.
+	(*<avx512>_store<mode>_mask): New define_insn, it's renamed
+	from original <avx512>_store<mode>_mask.
+---
+ gcc/config/i386/sse.md | 69 ++++++++++++++++++++++++++++++++++--------
+ 1 file changed, 57 insertions(+), 12 deletions(-)
+
+diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
+index b30e96cb1ab..3af15989631 100644
+--- a/gcc/config/i386/sse.md
++++ b/gcc/config/i386/sse.md
+@@ -1554,7 +1554,7 @@
+    (set_attr "prefix" "evex")
+    (set_attr "mode" "<sseinsnmode>")])
+ 
+-(define_insn "<avx512>_store<mode>_mask"
++(define_insn "*<avx512>_store<mode>_mask"
+   [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m")
+ 	(vec_merge:V48_AVX512VL
+ 	  (match_operand:V48_AVX512VL 1 "register_operand" "v")
+@@ -1582,7 +1582,7 @@
+    (set_attr "memory" "store")
+    (set_attr "mode" "<sseinsnmode>")])
+ 
+-(define_insn "<avx512>_store<mode>_mask"
++(define_insn "*<avx512>_store<mode>_mask"
+   [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m")
+ 	(vec_merge:VI12HF_AVX512VL
+ 	  (match_operand:VI12HF_AVX512VL 1 "register_operand" "v")
+@@ -26002,21 +26002,66 @@
+   "TARGET_AVX")
+ 
+ (define_expand "maskstore<mode><avx512fmaskmodelower>"
+-  [(set (match_operand:V48H_AVX512VL 0 "memory_operand")
+-	(vec_merge:V48H_AVX512VL
+-	  (match_operand:V48H_AVX512VL 1 "register_operand")
+-	  (match_dup 0)
+-	  (match_operand:<avx512fmaskmode> 2 "register_operand")))]
++  [(set (match_operand:V48_AVX512VL 0 "memory_operand")
++	(unspec:V48_AVX512VL
++	  [(match_operand:V48_AVX512VL 1 "register_operand")
++	   (match_dup 0)
++	   (match_operand:<avx512fmaskmode> 2 "register_operand")]
++	  UNSPEC_MASKMOV))]
+   "TARGET_AVX512F")
+ 
+ (define_expand "maskstore<mode><avx512fmaskmodelower>"
+-  [(set (match_operand:VI12_AVX512VL 0 "memory_operand")
+-	(vec_merge:VI12_AVX512VL
+-	  (match_operand:VI12_AVX512VL 1 "register_operand")
+-	  (match_dup 0)
+-	  (match_operand:<avx512fmaskmode> 2 "register_operand")))]
++  [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand")
++	(unspec:VI12HF_AVX512VL
++	  [(match_operand:VI12HF_AVX512VL 1 "register_operand")
++	   (match_dup 0)
++	   (match_operand:<avx512fmaskmode> 2 "register_operand")]
++	  UNSPEC_MASKMOV))]
+   "TARGET_AVX512BW")
+ 
++(define_insn "<avx512>_store<mode>_mask"
++  [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m")
++	(unspec:V48_AVX512VL
++	  [(match_operand:V48_AVX512VL 1 "register_operand" "v")
++	   (match_dup 0)
++	   (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")]
++	  UNSPEC_MASKMOV))]
++  "TARGET_AVX512F"
++{
++  if (FLOAT_MODE_P (GET_MODE_INNER (<MODE>mode)))
++    {
++      if (misaligned_operand (operands[0], <MODE>mode))
++	return "vmovu<ssemodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
++      else
++	return "vmova<ssemodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
++    }
++  else
++    {
++      if (misaligned_operand (operands[0], <MODE>mode))
++	return "vmovdqu<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
++      else
++	return "vmovdqa<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
++    }
++}
++  [(set_attr "type" "ssemov")
++   (set_attr "prefix" "evex")
++   (set_attr "memory" "store")
++   (set_attr "mode" "<sseinsnmode>")])
++
++(define_insn "<avx512>_store<mode>_mask"
++  [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m")
++	(unspec:VI12HF_AVX512VL
++	  [(match_operand:VI12HF_AVX512VL 1 "register_operand" "v")
++	   (match_dup 0)
++	   (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")]
++	   UNSPEC_MASKMOV))]
++  "TARGET_AVX512BW"
++  "vmovdqu<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}"
++  [(set_attr "type" "ssemov")
++   (set_attr "prefix" "evex")
++   (set_attr "memory" "store")
++   (set_attr "mode" "<sseinsnmode>")])
++
+ (define_expand "cbranch<mode>4"
+   [(set (reg:CC FLAGS_REG)
+ 	(compare:CC (match_operand:VI48_AVX 1 "register_operand")
+-- 
+2.31.1
+
diff --git a/gcc.spec b/gcc.spec
index c62cb37..157ef1c 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 21
+%global gcc_release 22
 
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -168,6 +168,7 @@ Patch30: 0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch
 Patch31: 0031-Make-option-mvzeroupper-independent-of-optimization.patch
 Patch32: 0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch
 Patch33: 0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch
+Patch34: 0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
 # target triple.
@@ -667,6 +668,7 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch31 -p1
 %patch32 -p1
 %patch33 -p1
+%patch34 -p1
 
 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE
 
@@ -2898,6 +2900,10 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-22
+- Type: Sync
+- DESC: Refine maskstore patterns with UNSPEC_MASKMOV.
+
 * Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-21
 - Type: Sync
 - DESC: Refine maskloadmn pattern with UNSPEC_MASKLOAD.
-- 
Gitee


From 379e84de84cdac4ee4ad5a0f203e4c8c06cf226c Mon Sep 17 00:00:00 2001
From: "Cui, Lili" <lili.cui@intel.com>
Date: Wed, 17 Jan 2024 15:24:09 +0800
Subject: [PATCH 07/15] [Sync] x86: Update model values for Alderlake and
 Rocketlake.

Update model values for Alderlake and Rocketlake according to SDM.

gcc/ChangeLog

	* common/config/i386/cpuinfo.h (get_intel_cpu): Remove model value 0xa8
	from Rocketlake, remove model value 0xbf from Alderlake.
---
 ...-values-for-Alderlake-and-Rocketlake.patch | 38 +++++++++++++++++++
 gcc.spec                                      |  8 +++-
 2 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch

diff --git a/0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch b/0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch
new file mode 100644
index 0000000..938135a
--- /dev/null
+++ b/0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch
@@ -0,0 +1,38 @@
+From 62cef9aea12f343cf86697f56884f585191b9545 Mon Sep 17 00:00:00 2001
+From: "Cui, Lili" <lili.cui@intel.com>
+Date: Thu, 29 Jun 2023 03:10:35 +0000
+Subject: [Sync] x86: Update model values for Alderlake and Rocketlake.
+
+Update model values for Alderlake and Rocketlake according to SDM.
+
+gcc/ChangeLog
+
+	* common/config/i386/cpuinfo.h (get_intel_cpu): Remove model value 0xa8
+	from Rocketlake, remove model value 0xbf from Alderlake.
+---
+ gcc/common/config/i386/cpuinfo.h | 2 --
+ 1 file changed, 2 deletions(-)
+
+diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
+index 0333da56ba5..28b2ff0b033 100644
+--- a/gcc/common/config/i386/cpuinfo.h
++++ b/gcc/common/config/i386/cpuinfo.h
+@@ -435,7 +435,6 @@ get_intel_cpu (struct __processor_model *cpu_model,
+       cpu_model->__cpu_subtype = INTEL_COREI7_SKYLAKE;
+       break;
+     case 0xa7:
+-    case 0xa8:
+       /* Rocket Lake.  */
+       cpu = "rocketlake";
+       CHECK___builtin_cpu_is ("corei7");
+@@ -508,7 +507,6 @@ get_intel_cpu (struct __processor_model *cpu_model,
+       break;
+     case 0x97:
+     case 0x9a:
+-    case 0xbf:
+       /* Alder Lake.  */
+       cpu = "alderlake";
+       CHECK___builtin_cpu_is ("corei7");
+-- 
+2.31.1
+
diff --git a/gcc.spec b/gcc.spec
index 157ef1c..ade70b6 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 22
+%global gcc_release 23
 
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -169,6 +169,7 @@ Patch31: 0031-Make-option-mvzeroupper-independent-of-optimization.patch
 Patch32: 0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch
 Patch33: 0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch
 Patch34: 0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch
+Patch35: 0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
 # target triple.
@@ -669,6 +670,7 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch32 -p1
 %patch33 -p1
 %patch34 -p1
+%patch35 -p1
 
 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE
 
@@ -2900,6 +2902,10 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Wed Jan 17 2024 Cui, Lili <lili.cui@intel.com> 12.3.1-23
+- Type: Sync
+- DESC: x86: Update model values for Alderlake and Rocketlake.
+
 * Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-22
 - Type: Sync
 - DESC: Refine maskstore patterns with UNSPEC_MASKMOV.
-- 
Gitee


From 550fdbff301ff12712abd4fc1d972f1676c8da31 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 17 Jan 2024 15:27:32 +0800
Subject: [PATCH 08/15] [Sync] Workaround possible CPUID bug in Sandy Bridge.

Don't access leaf 7 subleaf 1 unless subleaf 0 says it is
supported via EAX.

Intel documentation says invalid subleaves return 0. We had been
relying on that behavior instead of checking the max sublef number.

It appears that some Sandy Bridge CPUs return at least the subleaf 0
EDX value for subleaf 1. Best guess is that this is a bug in a
microcode patch since all of the bits we're seeing set in EDX were
introduced after Sandy Bridge was originally released.

This is causing avxvnniint16 to be incorrectly enabled with
-march=native on these CPUs.

gcc/ChangeLog:

	* common/config/i386/cpuinfo.h (get_available_features): Check
	max_subleaf_level for valid subleaf before use CPUID.
---
 ...d-possible-CPUID-bug-in-Sandy-Bridge.patch | 78 +++++++++++++++++++
 gcc.spec                                      |  8 +-
 2 files changed, 85 insertions(+), 1 deletion(-)
 create mode 100644 0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch

diff --git a/0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch b/0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch
new file mode 100644
index 0000000..f3c7900
--- /dev/null
+++ b/0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch
@@ -0,0 +1,78 @@
+From 161c0972243cf967196f4e9361f0736000637bfe Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Fri, 4 Aug 2023 09:27:39 +0800
+Subject: [Sync] Workaround possible CPUID bug in Sandy Bridge.
+
+Don't access leaf 7 subleaf 1 unless subleaf 0 says it is
+supported via EAX.
+
+Intel documentation says invalid subleaves return 0. We had been
+relying on that behavior instead of checking the max sublef number.
+
+It appears that some Sandy Bridge CPUs return at least the subleaf 0
+EDX value for subleaf 1. Best guess is that this is a bug in a
+microcode patch since all of the bits we're seeing set in EDX were
+introduced after Sandy Bridge was originally released.
+
+This is causing avxvnniint16 to be incorrectly enabled with
+-march=native on these CPUs.
+
+gcc/ChangeLog:
+
+	* common/config/i386/cpuinfo.h (get_available_features): Check
+	max_subleaf_level for valid subleaf before use CPUID.
+---
+ gcc/common/config/i386/cpuinfo.h | 29 +++++++++++++++++------------
+ 1 file changed, 17 insertions(+), 12 deletions(-)
+
+diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
+index 28b2ff0b033..316ad3cb3e9 100644
+--- a/gcc/common/config/i386/cpuinfo.h
++++ b/gcc/common/config/i386/cpuinfo.h
+@@ -647,7 +647,9 @@ get_available_features (struct __processor_model *cpu_model,
+   /* Get Advanced Features at level 7 (eax = 7, ecx = 0/1). */
+   if (max_cpuid_level >= 7)
+     {
+-      __cpuid_count (7, 0, eax, ebx, ecx, edx);
++      unsigned int max_subleaf_level;
++
++      __cpuid_count (7, 0, max_subleaf_level, ebx, ecx, edx);
+       if (ebx & bit_BMI)
+ 	set_feature (FEATURE_BMI);
+       if (ebx & bit_SGX)
+@@ -759,18 +761,21 @@ get_available_features (struct __processor_model *cpu_model,
+ 	    set_feature (FEATURE_AVX512FP16);
+ 	}
+ 
+-      __cpuid_count (7, 1, eax, ebx, ecx, edx);
+-      if (eax & bit_HRESET)
+-	set_feature (FEATURE_HRESET);
+-      if (avx_usable)
+-	{
+-	  if (eax & bit_AVXVNNI)
+-	    set_feature (FEATURE_AVXVNNI);
+-	}
+-      if (avx512_usable)
++      if (max_subleaf_level >= 1)
+ 	{
+-	  if (eax & bit_AVX512BF16)
+-	    set_feature (FEATURE_AVX512BF16);
++	  __cpuid_count (7, 1, eax, ebx, ecx, edx);
++	  if (eax & bit_HRESET)
++	    set_feature (FEATURE_HRESET);
++	  if (avx_usable)
++	    {
++	      if (eax & bit_AVXVNNI)
++		set_feature (FEATURE_AVXVNNI);
++	    }
++	  if (avx512_usable)
++	    {
++	      if (eax & bit_AVX512BF16)
++		set_feature (FEATURE_AVX512BF16);
++	    }
+ 	}
+     }
+ 
+-- 
+2.31.1
+
diff --git a/gcc.spec b/gcc.spec
index ade70b6..2915ca4 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 23
+%global gcc_release 24
 
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -170,6 +170,7 @@ Patch32: 0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch
 Patch33: 0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch
 Patch34: 0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch
 Patch35: 0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch
+Patch36: 0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
 # target triple.
@@ -671,6 +672,7 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch33 -p1
 %patch34 -p1
 %patch35 -p1
+%patch36 -p1
 
 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE
 
@@ -2902,6 +2904,10 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-24
+- Type: Sync
+- DESC: Workaround possible CPUID bug in Sandy Bridge.
+
 * Wed Jan 17 2024 Cui, Lili <lili.cui@intel.com> 12.3.1-23
 - Type: Sync
 - DESC: x86: Update model values for Alderlake and Rocketlake.
-- 
Gitee


From d76e96b448f6ab57a66ae21895a5246ca9f7118d Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 17 Jan 2024 15:30:02 +0800
Subject: [PATCH 09/15] [Sync] Software mitigation: Disable gather generation
 in vectorization for GDS affected Intel Processors.

For more details of GDS (Gather Data Sampling), refer to
https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/gather-data-sampling.html

After microcode update, there's performance regression. To avoid that,
the patch disables gather generation in autovectorization but uses
gather scalar emulation instead.

gcc/ChangeLog:

	* config/i386/i386-options.cc (m_GDS): New macro.
	* config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Don't
	enable for m_GDS.
	(X86_TUNE_USE_GATHER_4PARTS): Ditto.
	(X86_TUNE_USE_GATHER): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx2-gather-2.c: Adjust options to keep
	gather vectorization.
	* gcc.target/i386/avx2-gather-6.c: Ditto.
	* gcc.target/i386/avx512f-pr88464-1.c: Ditto.
	* gcc.target/i386/avx512f-pr88464-5.c: Ditto.
	* gcc.target/i386/avx512vl-pr88464-1.c: Ditto.
	* gcc.target/i386/avx512vl-pr88464-11.c: Ditto.
	* gcc.target/i386/avx512vl-pr88464-3.c: Ditto.
	* gcc.target/i386/avx512vl-pr88464-9.c: Ditto.
	* gcc.target/i386/pr88531-1b.c: Ditto.
	* gcc.target/i386/pr88531-1c.c: Ditto.
---
 ...ion-Disable-gather-generation-in-vec.patch | 220 ++++++++++++++++++
 gcc.spec                                      |   9 +-
 2 files changed, 228 insertions(+), 1 deletion(-)
 create mode 100644 0037-Software-mitigation-Disable-gather-generation-in-vec.patch

diff --git a/0037-Software-mitigation-Disable-gather-generation-in-vec.patch b/0037-Software-mitigation-Disable-gather-generation-in-vec.patch
new file mode 100644
index 0000000..224eeaf
--- /dev/null
+++ b/0037-Software-mitigation-Disable-gather-generation-in-vec.patch
@@ -0,0 +1,220 @@
+From 49e9bb4de0ad94e8636d796b16aa09917145b0d3 Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Thu, 10 Aug 2023 11:41:39 +0800
+Subject: [Sync] Software mitigation: Disable gather generation in
+ vectorization for GDS affected Intel Processors.
+
+For more details of GDS (Gather Data Sampling), refer to
+https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/gather-data-sampling.html
+
+After microcode update, there's performance regression. To avoid that,
+the patch disables gather generation in autovectorization but uses
+gather scalar emulation instead.
+
+gcc/ChangeLog:
+
+	* config/i386/i386-options.cc (m_GDS): New macro.
+	* config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Don't
+	enable for m_GDS.
+	(X86_TUNE_USE_GATHER_4PARTS): Ditto.
+	(X86_TUNE_USE_GATHER): Ditto.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.target/i386/avx2-gather-2.c: Adjust options to keep
+	gather vectorization.
+	* gcc.target/i386/avx2-gather-6.c: Ditto.
+	* gcc.target/i386/avx512f-pr88464-1.c: Ditto.
+	* gcc.target/i386/avx512f-pr88464-5.c: Ditto.
+	* gcc.target/i386/avx512vl-pr88464-1.c: Ditto.
+	* gcc.target/i386/avx512vl-pr88464-11.c: Ditto.
+	* gcc.target/i386/avx512vl-pr88464-3.c: Ditto.
+	* gcc.target/i386/avx512vl-pr88464-9.c: Ditto.
+	* gcc.target/i386/pr88531-1b.c: Ditto.
+	* gcc.target/i386/pr88531-1c.c: Ditto.
+
+(cherry picked from commit 3064d1f5c48cb6ce1b4133570dd08ecca8abb52d)
+---
+ gcc/config/i386/i386-options.cc                     | 5 +++++
+ gcc/config/i386/x86-tune.def                        | 9 ++++++---
+ gcc/testsuite/gcc.target/i386/avx2-gather-2.c       | 2 +-
+ gcc/testsuite/gcc.target/i386/avx2-gather-6.c       | 2 +-
+ gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c   | 2 +-
+ gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c   | 2 +-
+ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c  | 2 +-
+ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c | 2 +-
+ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c  | 2 +-
+ gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c  | 2 +-
+ gcc/testsuite/gcc.target/i386/pr88531-1b.c          | 2 +-
+ gcc/testsuite/gcc.target/i386/pr88531-1c.c          | 2 +-
+ 12 files changed, 21 insertions(+), 13 deletions(-)
+
+diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
+index fb2ed942f67..9617fc162e0 100644
+--- a/gcc/config/i386/i386-options.cc
++++ b/gcc/config/i386/i386-options.cc
+@@ -137,6 +137,11 @@ along with GCC; see the file COPYING3.  If not see
+ #define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS)
+ #define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT)
+ #define m_INTEL (HOST_WIDE_INT_1U<<PROCESSOR_INTEL)
++/* Gather Data Sampling / CVE-2022-40982 / INTEL-SA-00828.
++   Software mitigation.  */
++#define m_GDS (m_SKYLAKE | m_SKYLAKE_AVX512 | m_CANNONLAKE \
++	       | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_CASCADELAKE \
++	       | m_TIGERLAKE | m_COOPERLAKE | m_ROCKETLAKE)
+ 
+ #define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE)
+ #define m_K6 (HOST_WIDE_INT_1U<<PROCESSOR_K6)
+diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
+index e6b9e21250f..4392709fce2 100644
+--- a/gcc/config/i386/x86-tune.def
++++ b/gcc/config/i386/x86-tune.def
+@@ -467,7 +467,8 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
+ /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
+    elements.  */
+ DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
+-	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | m_GENERIC))
++	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE
++	    | m_GENERIC | m_GDS))
+ 
+ /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
+    elements.  */
+@@ -477,7 +478,8 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
+ /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
+    elements.  */
+ DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
+-	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 |  m_ALDERLAKE | m_GENERIC))
++	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE
++	    | m_GENERIC | m_GDS))
+ 
+ /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
+    elements.  */
+@@ -487,7 +489,8 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
+ /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
+    elements.  */
+ DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
+-	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE | m_GENERIC))
++	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE
++	    | m_GENERIC | m_GDS))
+ 
+ /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
+    elements.  */
+diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-2.c b/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
+index ad5ef73107c..978924b0f57 100644
+--- a/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
++++ b/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -fdump-tree-vect-details -march=skylake" } */
++/* { dg-options "-O3 -fdump-tree-vect-details -march=skylake -mtune=haswell" } */
+ 
+ #include "avx2-gather-1.c"
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-6.c b/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
+index b9119581ae2..067b251e393 100644
+--- a/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
++++ b/gcc/testsuite/gcc.target/i386/avx2-gather-6.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -mavx2 -fno-common -fdump-tree-vect-details -mtune=skylake" } */
++/* { dg-options "-O3 -mavx2 -fno-common -fdump-tree-vect-details  -mtune=haswell" } */
+ 
+ #include "avx2-gather-5.c"
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c
+index 06d21bb0129..d1a2298618e 100644
+--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c
++++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c
+@@ -1,6 +1,6 @@
+ /* PR tree-optimization/88464 */
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
++/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=haswell -fdump-tree-vect-details" } */
+ /* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c
+index 462e951fdc1..d7b0b2b28cb 100644
+--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c
++++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c
+@@ -1,6 +1,6 @@
+ /* PR tree-optimization/88464 */
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
++/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=haswell -fdump-tree-vect-details" } */
+ /* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c
+index 55a28dddbf8..07439185ec1 100644
+--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c
++++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c
+@@ -1,6 +1,6 @@
+ /* PR tree-optimization/88464 */
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
++/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=haswell -fdump-tree-vect-details" } */
+ /* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" 4 "vect" } } */
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c
+index 9696008855d..3a98108279a 100644
+--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c
++++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c
+@@ -1,6 +1,6 @@
+ /* PR tree-optimization/88464 */
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
++/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=haswell -fdump-tree-vect-details" } */
+ /* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" 4 "vect" } } */
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c
+index 6b0c8a85957..ac669e04812 100644
+--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c
++++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c
+@@ -1,6 +1,6 @@
+ /* PR tree-optimization/88464 */
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
++/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=128 -mtune=haswell -fdump-tree-vect-details" } */
+ /* { dg-final { scan-tree-dump-times "loop vectorized using 16 byte vectors" 4 "vect" } } */
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c
+index 3af568ab323..14a1083b6d1 100644
+--- a/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c
++++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c
+@@ -1,6 +1,6 @@
+ /* PR tree-optimization/88464 */
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
++/* { dg-options "-O3 -mavx512vl -mprefer-vector-width=256 -mtune=haswell -fdump-tree-vect-details" } */
+ /* { dg-final { scan-tree-dump-times "loop vectorized using 32 byte vectors" 4 "vect" } } */
+ /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/pr88531-1b.c b/gcc/testsuite/gcc.target/i386/pr88531-1b.c
+index 812c8a10fab..e6df789de90 100644
+--- a/gcc/testsuite/gcc.target/i386/pr88531-1b.c
++++ b/gcc/testsuite/gcc.target/i386/pr88531-1b.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -march=skylake -mfpmath=sse" } */
++/* { dg-options "-O3 -march=skylake -mfpmath=sse -mtune=haswell" } */
+ 
+ #include "pr88531-1a.c"
+ 
+diff --git a/gcc/testsuite/gcc.target/i386/pr88531-1c.c b/gcc/testsuite/gcc.target/i386/pr88531-1c.c
+index 43fc5913ed3..a093c87c01f 100644
+--- a/gcc/testsuite/gcc.target/i386/pr88531-1c.c
++++ b/gcc/testsuite/gcc.target/i386/pr88531-1c.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile } */
+-/* { dg-options "-O3 -march=skylake-avx512 -mfpmath=sse" } */
++/* { dg-options "-O3 -march=skylake-avx512 -mfpmath=sse -mtune=haswell" } */
+ 
+ #include "pr88531-1a.c"
+ 
+-- 
+2.31.1
+
diff --git a/gcc.spec b/gcc.spec
index 2915ca4..6c852bb 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 24
+%global gcc_release 25
 
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -171,6 +171,7 @@ Patch33: 0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch
 Patch34: 0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch
 Patch35: 0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch
 Patch36: 0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch
+Patch37: 0037-Software-mitigation-Disable-gather-generation-in-vec.patch
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
 # target triple.
@@ -673,6 +674,7 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch34 -p1
 %patch35 -p1
 %patch36 -p1
+%patch37 -p1
 
 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE
 
@@ -2904,6 +2906,11 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-25
+- Type: Sync
+- DESC: Software mitigation: Disable gather generation in vectorization for GDS
+  affected Intel Processors.
+
 * Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-24
 - Type: Sync
 - DESC: Workaround possible CPUID bug in Sandy Bridge.
-- 
Gitee


From 302d17b522bb7f4b37af1abb35b44772a3058274 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 17 Jan 2024 15:36:45 +0800
Subject: [PATCH 10/15] [Sync] Support -m[no-]gather -m[no-]scatter to
 enable/disable vectorization for all gather/scatter instructions

Rename original use_gather to use_gather_8parts, Support
-mtune-ctrl={,^}use_gather to set/clear tune features
use_gather_{2parts, 4parts, 8parts}. Support the new option -mgather
as alias of -mtune-ctrl=, use_gather, ^use_gather.

Similar for use_scatter.

gcc/ChangeLog:

	* config/i386/i386-builtins.cc
	(ix86_vectorize_builtin_gather): Adjust for use_gather_8parts.
	* config/i386/i386-options.cc (parse_mtune_ctrl_str):
	Set/Clear tune features use_{gather,scatter}_{2parts, 4parts,
	8parts} for -mtune-crtl={,^}{use_gather,use_scatter}.
	* config/i386/i386.cc (ix86_vectorize_builtin_scatter): Adjust
	for use_scatter_8parts
	* config/i386/i386.h (TARGET_USE_GATHER): Rename to ..
	(TARGET_USE_GATHER_8PARTS): .. this.
	(TARGET_USE_SCATTER): Rename to ..
	(TARGET_USE_SCATTER_8PARTS): .. this.
	* config/i386/x86-tune.def (X86_TUNE_USE_GATHER): Rename to
	(X86_TUNE_USE_GATHER_8PARTS): .. this.
	(X86_TUNE_USE_SCATTER): Rename to
	(X86_TUNE_USE_SCATTER_8PARTS): .. this.
	* config/i386/i386.opt: Add new options mgather, mscatter.
---
 ...her-m-no-scatter-to-enable-disable-v.patch | 187 ++++++++++++++++++
 gcc.spec                                      |   9 +-
 2 files changed, 195 insertions(+), 1 deletion(-)
 create mode 100644 0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch

diff --git a/0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch b/0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch
new file mode 100644
index 0000000..ddf2c4d
--- /dev/null
+++ b/0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch
@@ -0,0 +1,187 @@
+From 1981bc7e6c4149d590190162939f911cb53547fd Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Thu, 10 Aug 2023 16:26:13 +0800
+Subject: [Sync] Support -m[no-]gather -m[no-]scatter to enable/disable
+ vectorization for all gather/scatter instructions
+
+Rename original use_gather to use_gather_8parts, Support
+-mtune-ctrl={,^}use_gather to set/clear tune features
+use_gather_{2parts, 4parts, 8parts}. Support the new option -mgather
+as alias of -mtune-ctrl=, use_gather, ^use_gather.
+
+Similar for use_scatter.
+
+gcc/ChangeLog:
+
+	* config/i386/i386-builtins.cc
+	(ix86_vectorize_builtin_gather): Adjust for use_gather_8parts.
+	* config/i386/i386-options.cc (parse_mtune_ctrl_str):
+	Set/Clear tune features use_{gather,scatter}_{2parts, 4parts,
+	8parts} for -mtune-crtl={,^}{use_gather,use_scatter}.
+	* config/i386/i386.cc (ix86_vectorize_builtin_scatter): Adjust
+	for use_scatter_8parts
+	* config/i386/i386.h (TARGET_USE_GATHER): Rename to ..
+	(TARGET_USE_GATHER_8PARTS): .. this.
+	(TARGET_USE_SCATTER): Rename to ..
+	(TARGET_USE_SCATTER_8PARTS): .. this.
+	* config/i386/x86-tune.def (X86_TUNE_USE_GATHER): Rename to
+	(X86_TUNE_USE_GATHER_8PARTS): .. this.
+	(X86_TUNE_USE_SCATTER): Rename to
+	(X86_TUNE_USE_SCATTER_8PARTS): .. this.
+	* config/i386/i386.opt: Add new options mgather, mscatter.
+
+(cherry picked from commit b2a927fb5343db363ea4361da0d6bcee227b6737)
+---
+ gcc/config/i386/i386-builtins.cc |  2 +-
+ gcc/config/i386/i386-options.cc  | 54 +++++++++++++++++++++++---------
+ gcc/config/i386/i386.cc          |  2 +-
+ gcc/config/i386/i386.h           |  8 ++---
+ gcc/config/i386/i386.opt         |  4 +++
+ gcc/config/i386/x86-tune.def     |  4 +--
+ 6 files changed, 52 insertions(+), 22 deletions(-)
+
+diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc
+index 050c6228a18..8ed32e14f0a 100644
+--- a/gcc/config/i386/i386-builtins.cc
++++ b/gcc/config/i386/i386-builtins.cc
+@@ -1790,7 +1790,7 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
+ 	  ? !TARGET_USE_GATHER_2PARTS
+ 	  : (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), 4u)
+ 	     ? !TARGET_USE_GATHER_4PARTS
+-	     : !TARGET_USE_GATHER)))
++	     : !TARGET_USE_GATHER_8PARTS)))
+     return NULL_TREE;
+ 
+   if ((TREE_CODE (index_type) != INTEGER_TYPE
+diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
+index 9617fc162e0..3df1f0c41c3 100644
+--- a/gcc/config/i386/i386-options.cc
++++ b/gcc/config/i386/i386-options.cc
+@@ -1705,20 +1705,46 @@ parse_mtune_ctrl_str (struct gcc_options *opts, bool dump)
+           curr_feature_string++;
+           clear = true;
+         }
+-      for (i = 0; i < X86_TUNE_LAST; i++)
+-        {
+-          if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
+-            {
+-              ix86_tune_features[i] = !clear;
+-              if (dump)
+-                fprintf (stderr, "Explicitly %s feature %s\n",
+-                         clear ? "clear" : "set", ix86_tune_feature_names[i]);
+-              break;
+-            }
+-        }
+-      if (i == X86_TUNE_LAST)
+-	error ("unknown parameter to option %<-mtune-ctrl%>: %s",
+-	       clear ? curr_feature_string - 1 : curr_feature_string);
++
++      if (!strcmp (curr_feature_string, "use_gather"))
++	{
++	  ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS] = !clear;
++	  ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS] = !clear;
++	  ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS] = !clear;
++	  if (dump)
++	    fprintf (stderr, "Explicitly %s features use_gather_2parts,"
++		     " use_gather_4parts, use_gather_8parts\n",
++		     clear ? "clear" : "set");
++
++	}
++      else if (!strcmp (curr_feature_string, "use_scatter"))
++	{
++	  ix86_tune_features[X86_TUNE_USE_SCATTER_2PARTS] = !clear;
++	  ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS] = !clear;
++	  ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS] = !clear;
++	  if (dump)
++	    fprintf (stderr, "Explicitly %s features use_scatter_2parts,"
++		     " use_scatter_4parts, use_scatter_8parts\n",
++		     clear ? "clear" : "set");
++	}
++      else
++	{
++	  for (i = 0; i < X86_TUNE_LAST; i++)
++	    {
++	      if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
++		{
++		  ix86_tune_features[i] = !clear;
++		  if (dump)
++		    fprintf (stderr, "Explicitly %s feature %s\n",
++			     clear ? "clear" : "set", ix86_tune_feature_names[i]);
++		  break;
++		}
++	    }
++
++	  if (i == X86_TUNE_LAST)
++	    error ("unknown parameter to option %<-mtune-ctrl%>: %s",
++		   clear ? curr_feature_string - 1 : curr_feature_string);
++	}
+       curr_feature_string = next_feature_string;
+     }
+   while (curr_feature_string);
+diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
+index 479fc601049..e75d3702338 100644
+--- a/gcc/config/i386/i386.cc
++++ b/gcc/config/i386/i386.cc
+@@ -18937,7 +18937,7 @@ ix86_vectorize_builtin_scatter (const_tree vectype,
+       ? !TARGET_USE_SCATTER_2PARTS
+       : (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
+ 	 ? !TARGET_USE_SCATTER_4PARTS
+-	 : !TARGET_USE_SCATTER))
++	 : !TARGET_USE_SCATTER_8PARTS))
+     return NULL_TREE;
+ 
+   if ((TREE_CODE (index_type) != INTEGER_TYPE
+diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
+index 688aaabd3f8..aaa136ba0bf 100644
+--- a/gcc/config/i386/i386.h
++++ b/gcc/config/i386/i386.h
+@@ -403,10 +403,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
+ 	ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS]
+ #define TARGET_USE_SCATTER_4PARTS \
+ 	ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS]
+-#define TARGET_USE_GATHER \
+-	ix86_tune_features[X86_TUNE_USE_GATHER]
+-#define TARGET_USE_SCATTER \
+-	ix86_tune_features[X86_TUNE_USE_SCATTER]
++#define TARGET_USE_GATHER_8PARTS \
++	ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS]
++#define TARGET_USE_SCATTER_8PARTS \
++	ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS]
+ #define TARGET_FUSE_CMP_AND_BRANCH_32 \
+ 	ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32]
+ #define TARGET_FUSE_CMP_AND_BRANCH_64 \
+diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
+index 498fb454d01..b154110d813 100644
+--- a/gcc/config/i386/i386.opt
++++ b/gcc/config/i386/i386.opt
+@@ -1222,3 +1222,7 @@ Instructions number above which STFL stall penalty can be compensated.
+ munroll-only-small-loops
+ Target Var(ix86_unroll_only_small_loops) Init(0) Save
+ Enable conservative small loop unrolling.
++
++mscatter
++Target Alias(mtune-ctrl=, use_scatter, ^use_scatter)
++Enable vectorization for scatter instruction.
+diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
+index 4392709fce2..bdb455d20ba 100644
+--- a/gcc/config/i386/x86-tune.def
++++ b/gcc/config/i386/x86-tune.def
+@@ -488,13 +488,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
+ 
+ /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
+    elements.  */
+-DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
++DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
+ 	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE
+ 	    | m_GENERIC | m_GDS))
+ 
+ /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
+    elements.  */
+-DEF_TUNE (X86_TUNE_USE_SCATTER, "use_scatter",
++DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
+ 	  ~(m_ZNVER4))
+ 
+ /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
+-- 
+2.31.1
+
diff --git a/gcc.spec b/gcc.spec
index 6c852bb..3388b91 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 25
+%global gcc_release 26
 
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -172,6 +172,7 @@ Patch34: 0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch
 Patch35: 0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch
 Patch36: 0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch
 Patch37: 0037-Software-mitigation-Disable-gather-generation-in-vec.patch
+Patch38: 0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
 # target triple.
@@ -675,6 +676,7 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch35 -p1
 %patch36 -p1
 %patch37 -p1
+%patch38 -p1
 
 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE
 
@@ -2906,6 +2908,11 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-26
+- Type: Sync
+- DESC: Support -m[no-]gather -m[no-]scatter to enable/disable vectorization
+  for all gather/scatter instructions
+
 * Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-25
 - Type: Sync
 - DESC: Software mitigation: Disable gather generation in vectorization for GDS
-- 
Gitee


From 30529fc0f4adf20160ded2cc81d557805b41d52f Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 17 Jan 2024 15:39:05 +0800
Subject: [PATCH 11/15] [Sync] Remove constraint modifier % for
 fcmaddcph/fmaddcph/fcmulcph since there're not commutative.

gcc/ChangeLog:

	PR target/111306
	PR target/111335
	* config/i386/sse.md (int_comm): New int_attr.
	(fma_<complexopname>_<mode><sdc_maskz_name><round_name>):
	Remove % for Complex conjugate operations since they're not
	commutative.
	(fma_<complexpairopname>_<mode>_pair): Ditto.
	(<avx512>_<complexopname>_<mode>_mask<round_name>): Ditto.
	(cmul<conj_op><mode>3): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr111306.c: New test.
---
 ...t-modifier-for-fcmaddcph-fmaddcph-fc.patch | 129 ++++++++++++++++++
 gcc.spec                                      |   9 +-
 2 files changed, 137 insertions(+), 1 deletion(-)
 create mode 100644 0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch

diff --git a/0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch b/0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch
new file mode 100644
index 0000000..b50adc2
--- /dev/null
+++ b/0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch
@@ -0,0 +1,129 @@
+From eefab6a2d901314a2c94b597e1d7766ae34529d0 Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Fri, 8 Sep 2023 09:22:43 +0800
+Subject: [Sync] Remove constraint modifier % for
+ fcmaddcph/fmaddcph/fcmulcph since there're not commutative.
+
+gcc/ChangeLog:
+
+	PR target/111306
+	PR target/111335
+	* config/i386/sse.md (int_comm): New int_attr.
+	(fma_<complexopname>_<mode><sdc_maskz_name><round_name>):
+	Remove % for Complex conjugate operations since they're not
+	commutative.
+	(fma_<complexpairopname>_<mode>_pair): Ditto.
+	(<avx512>_<complexopname>_<mode>_mask<round_name>): Ditto.
+	(cmul<conj_op><mode>3): Ditto.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.target/i386/pr111306.c: New test.
+
+(cherry picked from commit f197392a16ffb1327f1d12ff8ff05f9295e015cb)
+---
+ gcc/config/i386/sse.md                   | 16 ++++++++---
+ gcc/testsuite/gcc.target/i386/pr111306.c | 36 ++++++++++++++++++++++++
+ 2 files changed, 48 insertions(+), 4 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/i386/pr111306.c
+
+diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
+index 3af15989631..f25dd5f2bc4 100644
+--- a/gcc/config/i386/sse.md
++++ b/gcc/config/i386/sse.md
+@@ -6318,6 +6318,14 @@
+ 	[(UNSPEC_COMPLEX_FMA_PAIR "fmaddc")
+ 	 (UNSPEC_COMPLEX_FCMA_PAIR "fcmaddc")])
+ 
++(define_int_attr int_comm
++	[(UNSPEC_COMPLEX_FMA "")
++	 (UNSPEC_COMPLEX_FMA_PAIR "")
++	 (UNSPEC_COMPLEX_FCMA "")
++	 (UNSPEC_COMPLEX_FCMA_PAIR "")
++	 (UNSPEC_COMPLEX_FMUL "%")
++	 (UNSPEC_COMPLEX_FCMUL "")])
++
+ (define_int_attr conj_op
+ 	[(UNSPEC_COMPLEX_FMA "")
+ 	 (UNSPEC_COMPLEX_FCMA "_conj")
+@@ -6431,7 +6439,7 @@
+ (define_insn "fma_<complexopname>_<mode><sdc_maskz_name><round_name>"
+   [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
+ 	(unspec:VF_AVX512FP16VL
+-	  [(match_operand:VF_AVX512FP16VL 1 "<round_nimm_predicate>" "%v")
++	  [(match_operand:VF_AVX512FP16VL 1 "<round_nimm_predicate>" "<int_comm>v")
+ 	   (match_operand:VF_AVX512FP16VL 2 "<round_nimm_predicate>" "<round_constraint>")
+ 	   (match_operand:VF_AVX512FP16VL 3 "<round_nimm_predicate>" "0")]
+ 	   UNSPEC_COMPLEX_F_C_MA))]
+@@ -6495,7 +6503,7 @@
+ (define_insn "fma_<complexpairopname>_<mode>_pair"
+  [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=&v")
+        (unspec:VF1_AVX512VL
+-	 [(match_operand:VF1_AVX512VL 1 "vector_operand" "%v")
++	 [(match_operand:VF1_AVX512VL 1 "vector_operand" "<int_comm>v")
+ 	  (match_operand:VF1_AVX512VL 2 "bcst_vector_operand" "vmBr")
+ 	  (match_operand:VF1_AVX512VL 3 "vector_operand" "0")]
+ 	  UNSPEC_COMPLEX_F_C_MA_PAIR))]
+@@ -6562,7 +6570,7 @@
+   [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
+ 	(vec_merge:VF_AVX512FP16VL
+ 	  (unspec:VF_AVX512FP16VL
+-	    [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v")
++	    [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "<int_comm>v")
+ 	     (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")
+ 	     (match_operand:VF_AVX512FP16VL 3 "register_operand" "0")]
+ 	     UNSPEC_COMPLEX_F_C_MA)
+@@ -6586,7 +6594,7 @@
+ (define_insn "<avx512>_<complexopname>_<mode><maskc_name><round_name>"
+   [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
+ 	  (unspec:VF_AVX512FP16VL
+-	    [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v")
++	    [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "<int_comm>v")
+ 	     (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "<round_constraint>")]
+ 	     UNSPEC_COMPLEX_F_C_MUL))]
+   "TARGET_AVX512FP16 && <round_mode512bit_condition>"
+diff --git a/gcc/testsuite/gcc.target/i386/pr111306.c b/gcc/testsuite/gcc.target/i386/pr111306.c
+new file mode 100644
+index 00000000000..541725ebdad
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/i386/pr111306.c
+@@ -0,0 +1,36 @@
++/* { dg-do run } */
++/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
++/* { dg-require-effective-target avx512fp16 } */
++
++#define AVX512FP16
++#include "avx512f-helper.h"
++
++__attribute__((optimize("O2"),noipa))
++void func1(_Float16 *a, _Float16 *b, int n, _Float16 *c) {
++  __m512h rA = _mm512_loadu_ph(a);
++  for (int i = 0; i < n; i += 32) {
++    __m512h rB = _mm512_loadu_ph(b + i);
++    _mm512_storeu_ph(c + i, _mm512_fcmul_pch(rB, rA));
++  }
++}
++
++void
++test_512 (void)
++{
++  int n = 32;
++  _Float16 a[n], b[n], c[n];
++  _Float16 exp[n];
++  for (int i = 1; i <= n; i++) {
++    a[i - 1] = i & 1 ? -i : i;
++    b[i - 1] = i;
++  }
++
++  func1(a, b, n, c);
++  for (int i = 0; i < n / 32; i += 2) {
++    if (c[i] != a[i] * b[i] + a[i+1] * b[i+1]
++	|| c[i+1] != a[i] * b[i+1] - a[i+1]*b[i])
++      __builtin_abort ();
++    }
++}
++
++
+-- 
+2.31.1
+
diff --git a/gcc.spec b/gcc.spec
index 3388b91..219a9a4 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 26
+%global gcc_release 27
 
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -173,6 +173,7 @@ Patch35: 0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch
 Patch36: 0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch
 Patch37: 0037-Software-mitigation-Disable-gather-generation-in-vec.patch
 Patch38: 0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch
+Patch39: 0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
 # target triple.
@@ -677,6 +678,7 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch36 -p1
 %patch37 -p1
 %patch38 -p1
+%patch39 -p1
 
 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE
 
@@ -2908,6 +2910,11 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-27
+- Type: Sync
+- DESC: Remove constraint modifier % for fcmaddcph/fmaddcph/fcmulcph since
+  there're not commutative.
+
 * Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-26
 - Type: Sync
 - DESC: Support -m[no-]gather -m[no-]scatter to enable/disable vectorization
-- 
Gitee


From 955517eb53245da57dd56591653a8bec3fca57be Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 17 Jan 2024 15:41:23 +0800
Subject: [PATCH 12/15] [Sync] Disparage slightly for the alternative which
 move DFmode between SSE_REGS and GENERAL_REGS.

For testcase

void __cond_swap(double* __x, double* __y) {
  bool __r = (*__x < *__y);
  auto __tmp = __r ? *__x : *__y;
  *__y = __r ? *__y : *__x;
  *__x = __tmp;
}

GCC-14 with -O2 and -march=x86-64 options generates the following code:

__cond_swap(double*, double*):
        movsd   xmm1, QWORD PTR [rdi]
        movsd   xmm0, QWORD PTR [rsi]
        comisd  xmm0, xmm1
        jbe     .L2
        movq    rax, xmm1
        movapd  xmm1, xmm0
        movq    xmm0, rax
.L2:
        movsd   QWORD PTR [rsi], xmm1
        movsd   QWORD PTR [rdi], xmm0
        ret

rax is used to save and restore DFmode value. In RA both GENERAL_REGS
and SSE_REGS cost zero since we didn't disparage the
alternative in movdf_internal pattern, according to register
allocation order, GENERAL_REGS is allocated. The patch add ? for
alternative (r,v) and (v,r) just like we did for movsf/hf/bf_internal
pattern, after that we get optimal RA.

__cond_swap:
.LFB0:
	.cfi_startproc
	movsd	(%rdi), %xmm1
	movsd	(%rsi), %xmm0
	comisd	%xmm1, %xmm0
	jbe	.L2
	movapd	%xmm1, %xmm2
	movapd	%xmm0, %xmm1
	movapd	%xmm2, %xmm0
.L2:
	movsd	%xmm1, (%rsi)
	movsd	%xmm0, (%rdi)
	ret

gcc/ChangeLog:

	PR target/110170
	* config/i386/i386.md (movdf_internal): Disparage slightly for
	2 alternatives (r,v) and (v,r) by adding constraint modifier
	'?'.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr110170-3.c: New test.
---
 ...ly-for-the-alternative-which-move-DF.patch | 106 ++++++++++++++++++
 gcc.spec                                      |   9 +-
 2 files changed, 114 insertions(+), 1 deletion(-)
 create mode 100644 0040-Disparage-slightly-for-the-alternative-which-move-DF.patch

diff --git a/0040-Disparage-slightly-for-the-alternative-which-move-DF.patch b/0040-Disparage-slightly-for-the-alternative-which-move-DF.patch
new file mode 100644
index 0000000..6ca7b70
--- /dev/null
+++ b/0040-Disparage-slightly-for-the-alternative-which-move-DF.patch
@@ -0,0 +1,106 @@
+From 655f62a56a76b99db90faecaa44a4335d674d91c Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Wed, 5 Jul 2023 13:45:11 +0800
+Subject: [Sync] Disparage slightly for the alternative which move
+ DFmode between SSE_REGS and GENERAL_REGS.
+
+For testcase
+
+void __cond_swap(double* __x, double* __y) {
+  bool __r = (*__x < *__y);
+  auto __tmp = __r ? *__x : *__y;
+  *__y = __r ? *__y : *__x;
+  *__x = __tmp;
+}
+
+GCC-14 with -O2 and -march=x86-64 options generates the following code:
+
+__cond_swap(double*, double*):
+        movsd   xmm1, QWORD PTR [rdi]
+        movsd   xmm0, QWORD PTR [rsi]
+        comisd  xmm0, xmm1
+        jbe     .L2
+        movq    rax, xmm1
+        movapd  xmm1, xmm0
+        movq    xmm0, rax
+.L2:
+        movsd   QWORD PTR [rsi], xmm1
+        movsd   QWORD PTR [rdi], xmm0
+        ret
+
+rax is used to save and restore DFmode value. In RA both GENERAL_REGS
+and SSE_REGS cost zero since we didn't disparage the
+alternative in movdf_internal pattern, according to register
+allocation order, GENERAL_REGS is allocated. The patch add ? for
+alternative (r,v) and (v,r) just like we did for movsf/hf/bf_internal
+pattern, after that we get optimal RA.
+
+__cond_swap:
+.LFB0:
+	.cfi_startproc
+	movsd	(%rdi), %xmm1
+	movsd	(%rsi), %xmm0
+	comisd	%xmm1, %xmm0
+	jbe	.L2
+	movapd	%xmm1, %xmm2
+	movapd	%xmm0, %xmm1
+	movapd	%xmm2, %xmm0
+.L2:
+	movsd	%xmm1, (%rsi)
+	movsd	%xmm0, (%rdi)
+	ret
+
+gcc/ChangeLog:
+
+	PR target/110170
+	* config/i386/i386.md (movdf_internal): Disparage slightly for
+	2 alternatives (r,v) and (v,r) by adding constraint modifier
+	'?'.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.target/i386/pr110170-3.c: New test.
+
+(cherry picked from commit 37a231cc7594d12ba0822077018aad751a6fb94e)
+---
+ gcc/config/i386/i386.md                    |  4 ++--
+ gcc/testsuite/gcc.target/i386/pr110170-3.c | 11 +++++++++++
+ 2 files changed, 13 insertions(+), 2 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/i386/pr110170-3.c
+
+diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
+index be07be10d8a..71691f59894 100644
+--- a/gcc/config/i386/i386.md
++++ b/gcc/config/i386/i386.md
+@@ -3582,9 +3582,9 @@
+ ;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 7.
+ (define_insn "*movdf_internal"
+   [(set (match_operand:DF 0 "nonimmediate_operand"
+-    "=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,v,r  ,o ,r  ,m")
++    "=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,?r,?v,r  ,o ,r  ,m")
+ 	(match_operand:DF 1 "general_operand"
+-    "Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,v,r ,roF,rF,rmF,rC"))]
++    "Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x, v, r,roF,rF,rmF,rC"))]
+   "!(MEM_P (operands[0]) && MEM_P (operands[1]))
+    && (lra_in_progress || reload_completed
+        || !CONST_DOUBLE_P (operands[1])
+diff --git a/gcc/testsuite/gcc.target/i386/pr110170-3.c b/gcc/testsuite/gcc.target/i386/pr110170-3.c
+new file mode 100644
+index 00000000000..70daa89e9aa
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/i386/pr110170-3.c
+@@ -0,0 +1,11 @@
++/* { dg-do compile { target { ! ia32 } } } */
++/* { dg-options "-O2 -fno-if-conversion -fno-if-conversion2" } */
++/* { dg-final { scan-assembler-not {(?n)movq.*r} } } */
++
++void __cond_swap(double* __x, double* __y) {
++  _Bool __r = (*__x < *__y);
++  double __tmp = __r ? *__x : *__y;
++  *__y = __r ? *__y : *__x;
++  *__x = __tmp;
++}
++
+-- 
+2.31.1
+
diff --git a/gcc.spec b/gcc.spec
index 219a9a4..f69adcc 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 27
+%global gcc_release 28
 
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -174,6 +174,7 @@ Patch36: 0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch
 Patch37: 0037-Software-mitigation-Disable-gather-generation-in-vec.patch
 Patch38: 0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch
 Patch39: 0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch
+Patch40: 0040-Disparage-slightly-for-the-alternative-which-move-DF.patch
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
 # target triple.
@@ -679,6 +680,7 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch37 -p1
 %patch38 -p1
 %patch39 -p1
+%patch40 -p1
 
 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE
 
@@ -2910,6 +2912,11 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-28
+- Type: Sync
+- DESC: Disparage slightly for the alternative which move DFmode between
+  SSE_REGS and GENERAL_REGS.
+
 * Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-27
 - Type: Sync
 - DESC: Remove constraint modifier % for fcmaddcph/fmaddcph/fcmulcph since
-- 
Gitee


From ff74d335e4d6c7727dd3bcc36908ba131cf8f5b2 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 17 Jan 2024 15:43:36 +0800
Subject: [PATCH 13/15] [Sync] Fix wrong code due to vec_merge + pcmp to
 blendvb splitter.

gcc/ChangeLog:

	PR target/112443
	* config/i386/sse.md (*avx2_pcmp<mode>3_4): Fix swap condition
	from LT to GT since there's not in the pattern.
	(*avx2_pcmp<mode>3_5): Ditto.

gcc/testsuite/ChangeLog:

	* g++.target/i386/pr112443.C: New test.
---
 ...ue-to-vec_merge-pcmp-to-blendvb-spli.patch | 163 ++++++++++++++++++
 gcc.spec                                      |   8 +-
 2 files changed, 170 insertions(+), 1 deletion(-)
 create mode 100644 0041-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch

diff --git a/0041-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch b/0041-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch
new file mode 100644
index 0000000..5a5c5ee
--- /dev/null
+++ b/0041-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch
@@ -0,0 +1,163 @@
+From 87d4ac895c32f894f0efd7efc82fcde65161a38d Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Thu, 9 Nov 2023 13:20:05 +0800
+Subject: [Sync] Fix wrong code due to vec_merge + pcmp to blendvb
+ splitter.
+
+gcc/ChangeLog:
+
+	PR target/112443
+	* config/i386/sse.md (*avx2_pcmp<mode>3_4): Fix swap condition
+	from LT to GT since there's not in the pattern.
+	(*avx2_pcmp<mode>3_5): Ditto.
+
+gcc/testsuite/ChangeLog:
+
+	* g++.target/i386/pr112443.C: New test.
+
+(cherry picked from commit 9a0cc04b9c9b02426762892b88efc5c44ba546bd)
+---
+ gcc/config/i386/sse.md                   |   4 +-
+ gcc/testsuite/g++.target/i386/pr112443.C | 108 +++++++++++++++++++++++
+ 2 files changed, 110 insertions(+), 2 deletions(-)
+ create mode 100644 gcc/testsuite/g++.target/i386/pr112443.C
+
+diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
+index f25dd5f2bc4..23b858ab21c 100644
+--- a/gcc/config/i386/sse.md
++++ b/gcc/config/i386/sse.md
+@@ -16358,7 +16358,7 @@
+ 	     (match_dup 4))]
+ 	     UNSPEC_BLENDV))]
+ {
+-  if (INTVAL (operands[5]) == 1)
++  if (INTVAL (operands[5]) == 5)
+     std::swap (operands[1], operands[2]);
+   operands[3] = gen_lowpart (<MODE>mode, operands[3]);
+ })
+@@ -16388,7 +16388,7 @@
+ 	     (match_dup 4))]
+ 	     UNSPEC_BLENDV))]
+ {
+-  if (INTVAL (operands[5]) == 1)
++  if (INTVAL (operands[5]) == 5)
+     std::swap (operands[1], operands[2]);
+ })
+ 
+diff --git a/gcc/testsuite/g++.target/i386/pr112443.C b/gcc/testsuite/g++.target/i386/pr112443.C
+new file mode 100644
+index 00000000000..ebfa9b4a753
+--- /dev/null
++++ b/gcc/testsuite/g++.target/i386/pr112443.C
+@@ -0,0 +1,108 @@
++/* { dg-do run } */
++/* { dg-require-effective-target avx512bw } */
++/* { dg-require-effective-target avx512vl } */
++/* { dg-options "-O2 -std=c++17 -mavx512bw -mavx512vl" } */
++
++#include <cstdint>
++#include <x86intrin.h>
++#include <functional>
++#include <ostream>
++
++#define AVX512BW
++#define AVX512VL
++
++#include "avx512f-helper.h"
++
++struct TensorIteratorBase{
++  char* in;
++  char* out;
++
++  void for_each(std::function<void(char*, char*, int64_t size)> loop){
++    loop(out, in, 32);
++  }    
++};
++
++class Vectorized {
++protected:
++  __m256i values;
++
++  static inline __m256i invert(const __m256i& v) {
++    const auto ones = _mm256_set1_epi64x(-1);
++    return _mm256_xor_si256(ones, v);
++  }
++public:
++  operator __m256i() const {
++    return values;
++  }
++
++  static constexpr int size() {
++    return 32;
++  }
++
++  Vectorized() {}
++  Vectorized(__m256i v) : values(v) {}
++  Vectorized(uint8_t v) { values = _mm256_set1_epi8(v); }
++  static Vectorized blendv(const Vectorized& a, const Vectorized& b,
++			   const Vectorized& mask) {
++    return _mm256_blendv_epi8(a, b, mask);
++  }
++  static Vectorized loadu(const void* ptr) {
++    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
++  }
++  void store(void* ptr) const {
++    _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
++  }
++
++  Vectorized operator<(const Vectorized& other) const {
++    __m256i max = _mm256_max_epu8(values, other);
++    return invert(_mm256_cmpeq_epi8(max, values));
++  }
++  Vectorized operator-(const Vectorized& b) {
++    return _mm256_sub_epi8(values, b);
++  }
++};
++
++std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) {
++  uint8_t buf[Vectorized::size()];
++  vec.store(buf);
++  stream << "vec[";
++  for (int i = 0; i != Vectorized::size(); i++) {
++    if (i != 0)
++      stream << ", ";
++    stream << buf[i]*1;
++  }
++  stream << "]";
++  return stream;
++}
++
++void run(TensorIteratorBase iter){
++  Vectorized zero_vec(0);
++  Vectorized one_vec(1);
++
++  iter.for_each([=](char* out, char* in, int64_t size) {
++    for (int64_t i = 0; i <= size - Vectorized::size(); i += Vectorized::size()) {
++      auto self_vec = Vectorized::loadu(in + i);
++      auto left = Vectorized::blendv(zero_vec, one_vec, zero_vec < self_vec);
++      auto right = Vectorized::blendv(zero_vec, one_vec, self_vec < zero_vec);
++      auto outv = left - right;
++      outv.store(out + i);
++    }
++  });
++}
++
++void
++test_256 (){
++  char in[32];
++  char out[32];
++  for(auto& x: in) x = 1;
++  run(TensorIteratorBase{in, out});
++  Vectorized::loadu (out);
++  for (int i = 0; i != 32; i++)
++    if (out[i] != 1)
++      __builtin_abort ();
++}
++
++void
++test_128 ()
++{
++}
+-- 
+2.31.1
+
diff --git a/gcc.spec b/gcc.spec
index f69adcc..08e5ef7 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 28
+%global gcc_release 29
 
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -175,6 +175,7 @@ Patch37: 0037-Software-mitigation-Disable-gather-generation-in-vec.patch
 Patch38: 0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch
 Patch39: 0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch
 Patch40: 0040-Disparage-slightly-for-the-alternative-which-move-DF.patch
+Patch41: 0041-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
 # target triple.
@@ -681,6 +682,7 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch38 -p1
 %patch39 -p1
 %patch40 -p1
+%patch41 -p1
 
 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE
 
@@ -2912,6 +2914,10 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-29
+- Type: Sync
+- DESC: Fix wrong code due to vec_merge + pcmp to blendvb splitter.
+
 * Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-28
 - Type: Sync
 - DESC: Disparage slightly for the alternative which move DFmode between
-- 
Gitee


From 715c8e17de7f532a5bf86166a22832ee144cfa3a Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 17 Jan 2024 15:45:31 +0800
Subject: [PATCH 14/15] [Sync] Don't assume it's AVX_U128_CLEAN after call_insn
 whose abi.mode_clobber(V4DImode) deosn't contains all SSE_REGS.

If the function desn't clobber any sse registers or only clobber
128-bit part, then vzeroupper isn't issued before the function exit.
the status not CLEAN but ANY after the function.

Also for sibling_call, it's safe to issue an vzeroupper. Also there
could be missing vzeroupper since there's no mode_exit for
sibling_call_p.

gcc/ChangeLog:

	PR target/112891
	* config/i386/i386.cc (ix86_avx_u128_mode_after): Return
	AVX_U128_ANY if callee_abi doesn't clobber all_sse_regs to
	align with ix86_avx_u128_mode_needed.
	(ix86_avx_u128_mode_needed): Return AVX_U128_ClEAN for
	sibling_call.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr112891.c: New test.
	* gcc.target/i386/pr112891-2.c: New test.
---
 ...s-AVX_U128_CLEAN-after-call_insn-who.patch | 151 ++++++++++++++++++
 gcc.spec                                      |   9 +-
 2 files changed, 159 insertions(+), 1 deletion(-)
 create mode 100644 0042-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch

diff --git a/0042-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch b/0042-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch
new file mode 100644
index 0000000..043b521
--- /dev/null
+++ b/0042-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch
@@ -0,0 +1,151 @@
+From 068c5ee54e6275fab3fcb501c4d6221f86abae68 Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Thu, 7 Dec 2023 09:17:27 +0800
+Subject: [Sync] Don't assume it's AVX_U128_CLEAN after call_insn whose
+ abi.mode_clobber(V4DImode) deosn't contains all SSE_REGS.
+
+If the function desn't clobber any sse registers or only clobber
+128-bit part, then vzeroupper isn't issued before the function exit.
+the status not CLEAN but ANY after the function.
+
+Also for sibling_call, it's safe to issue an vzeroupper. Also there
+could be missing vzeroupper since there's no mode_exit for
+sibling_call_p.
+
+gcc/ChangeLog:
+
+	PR target/112891
+	* config/i386/i386.cc (ix86_avx_u128_mode_after): Return
+	AVX_U128_ANY if callee_abi doesn't clobber all_sse_regs to
+	align with ix86_avx_u128_mode_needed.
+	(ix86_avx_u128_mode_needed): Return AVX_U128_ClEAN for
+	sibling_call.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.target/i386/pr112891.c: New test.
+	* gcc.target/i386/pr112891-2.c: New test.
+
+(cherry picked from commit fc189a08f5b7ad5889bd4c6b320c1dd99dd5d642)
+---
+ gcc/config/i386/i386.cc                    | 22 +++++++++++++---
+ gcc/testsuite/gcc.target/i386/pr112891-2.c | 30 ++++++++++++++++++++++
+ gcc/testsuite/gcc.target/i386/pr112891.c   | 29 +++++++++++++++++++++
+ 3 files changed, 78 insertions(+), 3 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/i386/pr112891-2.c
+ create mode 100644 gcc/testsuite/gcc.target/i386/pr112891.c
+
+diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
+index e75d3702338..60f3296b00c 100644
+--- a/gcc/config/i386/i386.cc
++++ b/gcc/config/i386/i386.cc
+@@ -14416,8 +14416,12 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
+ 	 modes wider than 256 bits.  It's only safe to issue a
+ 	 vzeroupper if all SSE registers are clobbered.  */
+       const function_abi &abi = insn_callee_abi (insn);
+-      if (!hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
+-				  abi.mode_clobbers (V4DImode)))
++      /* Should be safe to issue an vzeroupper before sibling_call_p.
++	 Also there not mode_exit for sibling_call, so there could be
++	 missing vzeroupper for that.  */
++      if (!(SIBLING_CALL_P (insn)
++	    || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
++				      abi.mode_clobbers (V4DImode))))
+ 	return AVX_U128_ANY;
+ 
+       return AVX_U128_CLEAN;
+@@ -14555,7 +14559,19 @@ ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
+       bool avx_upper_reg_found = false;
+       note_stores (insn, ix86_check_avx_upper_stores, &avx_upper_reg_found);
+ 
+-      return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
++      if (avx_upper_reg_found)
++	return AVX_U128_DIRTY;
++
++      /* If the function desn't clobber any sse registers or only clobber
++	 128-bit part, Then vzeroupper isn't issued before the function exit.
++	 the status not CLEAN but ANY after the function.  */
++      const function_abi &abi = insn_callee_abi (insn);
++      if (!(SIBLING_CALL_P (insn)
++	    || hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
++				      abi.mode_clobbers (V4DImode))))
++	return AVX_U128_ANY;
++
++      return  AVX_U128_CLEAN;
+     }
+ 
+   /* Otherwise, return current mode.  Remember that if insn
+diff --git a/gcc/testsuite/gcc.target/i386/pr112891-2.c b/gcc/testsuite/gcc.target/i386/pr112891-2.c
+new file mode 100644
+index 00000000000..164c3985d50
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/i386/pr112891-2.c
+@@ -0,0 +1,30 @@
++/* { dg-do compile } */
++/* { dg-options "-mavx2 -O3" } */
++/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
++
++void
++__attribute__((noinline))
++bar (double* a)
++{
++  a[0] = 1.0;
++  a[1] = 2.0;
++}
++
++double
++__attribute__((noinline))
++foo (double* __restrict a, double* b)
++{
++  a[0] += b[0];
++  a[1] += b[1];
++  a[2] += b[2];
++  a[3] += b[3];
++  bar (b);
++  return a[5] + b[5];
++}
++
++double
++foo1 (double* __restrict a, double* b)
++{
++  double c = foo (a, b);
++  return __builtin_exp (c);
++}
+diff --git a/gcc/testsuite/gcc.target/i386/pr112891.c b/gcc/testsuite/gcc.target/i386/pr112891.c
+new file mode 100644
+index 00000000000..dbf6c67948a
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/i386/pr112891.c
+@@ -0,0 +1,29 @@
++/* { dg-do compile } */
++/* { dg-options "-mavx2 -O3" } */
++/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */
++
++void
++__attribute__((noinline))
++bar (double* a)
++{
++  a[0] = 1.0;
++  a[1] = 2.0;
++}
++
++void
++__attribute__((noinline))
++foo (double* __restrict a, double* b)
++{
++  a[0] += b[0];
++  a[1] += b[1];
++  a[2] += b[2];
++  a[3] += b[3];
++  bar (b);
++}
++
++double
++foo1 (double* __restrict a, double* b)
++{
++  foo (a, b);
++  return __builtin_exp (b[1]);
++}
+-- 
+2.31.1
+
diff --git a/gcc.spec b/gcc.spec
index 08e5ef7..50a1a79 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 29
+%global gcc_release 30
 
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -176,6 +176,7 @@ Patch38: 0038-Support-m-no-gather-m-no-scatter-to-enable-disable-v.patch
 Patch39: 0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch
 Patch40: 0040-Disparage-slightly-for-the-alternative-which-move-DF.patch
 Patch41: 0041-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch
+Patch42: 0042-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
 # target triple.
@@ -683,6 +684,7 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch39 -p1
 %patch40 -p1
 %patch41 -p1
+%patch42 -p1
 
 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE
 
@@ -2914,6 +2916,11 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-30
+- Type: Sync
+- DESC: Don't assume it's AVX_U128_CLEAN after call_insn whose
+  abi.mode_clobber(V4DImode) deosn't contains all SSE_REGS.
+
 * Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-29
 - Type: Sync
 - DESC: Fix wrong code due to vec_merge + pcmp to blendvb splitter.
-- 
Gitee


From 9022a215a645cae1022a02ac616abecf979e3cc2 Mon Sep 17 00:00:00 2001
From: Jan Hubicka <jh@suse.cz>
Date: Wed, 17 Jan 2024 15:47:46 +0800
Subject: [PATCH 15/15] [Sync] Disable FMADD in chains for Zen4 and generic

this patch disables use of FMA in matrix multiplication loop for generic (for
x86-64-v3) and zen4.  I tested this on zen4 and Xenon Gold Gold 6212U.

For Intel this is neutral both on the matrix multiplication microbenchmark
(attached) and spec2k17 where the difference was within noise for Core.

On core the micro-benchmark runs as follows:

With FMA:

       578,500,241      cycles:u                         #    3.645 GHz
                ( +-  0.12% )
       753,318,477      instructions:u                   #    1.30  insn per
cycle              ( +-  0.00% )
       125,417,701      branches:u                       #  790.227 M/sec
                ( +-  0.00% )
          0.159146 +- 0.000363 seconds time elapsed  ( +-  0.23% )

No FMA:

       577,573,960      cycles:u                         #    3.514 GHz
                ( +-  0.15% )
       878,318,479      instructions:u                   #    1.52  insn per
cycle              ( +-  0.00% )
       125,417,702      branches:u                       #  763.035 M/sec
                ( +-  0.00% )
          0.164734 +- 0.000321 seconds time elapsed  ( +-  0.19% )

So the cycle count is unchanged and discrete multiply+add takes same time as
FMA.

While on zen:

With FMA:
         484875179      cycles:u                         #    3.599 GHz
             ( +-  0.05% )  (82.11%)
         752031517      instructions:u                   #    1.55  insn per
cycle
         125106525      branches:u                       #  928.712 M/sec
             ( +-  0.03% )  (85.09%)
            128356      branch-misses:u                  #    0.10% of all
branches          ( +-  0.06% )  (83.58%)

No FMA:
         375875209      cycles:u                         #    3.592 GHz
             ( +-  0.08% )  (80.74%)
         875725341      instructions:u                   #    2.33  insn per
cycle
         124903825      branches:u                       #    1.194 G/sec
             ( +-  0.04% )  (84.59%)
          0.105203 +- 0.000188 seconds time elapsed  ( +-  0.18% )

The diffrerence is that Cores understand the fact that fmadd does not need
all three parameters to start computation, while Zen cores doesn't.

Since this seems noticeable win on zen and not loss on Core it seems like good
default for generic.

float a[SIZE][SIZE];
float b[SIZE][SIZE];
float c[SIZE][SIZE];

void init(void)
{
   int i, j, k;
   for(i=0; i<SIZE; ++i)
   {
      for(j=0; j<SIZE; ++j)
      {
         a[i][j] = (float)i + j;
         b[i][j] = (float)i - j;
         c[i][j] = 0.0f;
      }
   }
}

void mult(void)
{
   int i, j, k;

   for(i=0; i<SIZE; ++i)
   {
      for(j=0; j<SIZE; ++j)
      {
         for(k=0; k<SIZE; ++k)
         {
            c[i][j] += a[i][k] * b[k][j];
         }
      }
   }
}

int main(void)
{
   clock_t s, e;

   init();
   s=clock();
   mult();
   e=clock();
   printf("        mult took %10d clocks\n", (int)(e-s));

   return 0;

}

gcc/ChangeLog:

	* config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS,
	X86_TUNE_AVOID_256FMA_CHAINS): Enable for znver4 and Core.
---
 ...FMADD-in-chains-for-Zen4-and-generic.patch | 142 ++++++++++++++++++
 gcc.spec                                      |   8 +-
 2 files changed, 149 insertions(+), 1 deletion(-)
 create mode 100644 0043-Disable-FMADD-in-chains-for-Zen4-and-generic.patch

diff --git a/0043-Disable-FMADD-in-chains-for-Zen4-and-generic.patch b/0043-Disable-FMADD-in-chains-for-Zen4-and-generic.patch
new file mode 100644
index 0000000..a32a924
--- /dev/null
+++ b/0043-Disable-FMADD-in-chains-for-Zen4-and-generic.patch
@@ -0,0 +1,142 @@
+From 16ee6e4d87bb484f92a5d971bbf24177a6ab6b25 Mon Sep 17 00:00:00 2001
+From: Jan Hubicka <jh@suse.cz>
+Date: Fri, 29 Dec 2023 23:51:03 +0100
+Subject: [Sync] Disable FMADD in chains for Zen4 and generic
+
+this patch disables use of FMA in matrix multiplication loop for generic (for
+x86-64-v3) and zen4.  I tested this on zen4 and Xenon Gold Gold 6212U.
+
+For Intel this is neutral both on the matrix multiplication microbenchmark
+(attached) and spec2k17 where the difference was within noise for Core.
+
+On core the micro-benchmark runs as follows:
+
+With FMA:
+
+       578,500,241      cycles:u                         #    3.645 GHz
+                ( +-  0.12% )
+       753,318,477      instructions:u                   #    1.30  insn per
+cycle              ( +-  0.00% )
+       125,417,701      branches:u                       #  790.227 M/sec
+                ( +-  0.00% )
+          0.159146 +- 0.000363 seconds time elapsed  ( +-  0.23% )
+
+No FMA:
+
+       577,573,960      cycles:u                         #    3.514 GHz
+                ( +-  0.15% )
+       878,318,479      instructions:u                   #    1.52  insn per
+cycle              ( +-  0.00% )
+       125,417,702      branches:u                       #  763.035 M/sec
+                ( +-  0.00% )
+          0.164734 +- 0.000321 seconds time elapsed  ( +-  0.19% )
+
+So the cycle count is unchanged and discrete multiply+add takes same time as
+FMA.
+
+While on zen:
+
+With FMA:
+         484875179      cycles:u                         #    3.599 GHz
+             ( +-  0.05% )  (82.11%)
+         752031517      instructions:u                   #    1.55  insn per
+cycle
+         125106525      branches:u                       #  928.712 M/sec
+             ( +-  0.03% )  (85.09%)
+            128356      branch-misses:u                  #    0.10% of all
+branches          ( +-  0.06% )  (83.58%)
+
+No FMA:
+         375875209      cycles:u                         #    3.592 GHz
+             ( +-  0.08% )  (80.74%)
+         875725341      instructions:u                   #    2.33  insn per
+cycle
+         124903825      branches:u                       #    1.194 G/sec
+             ( +-  0.04% )  (84.59%)
+          0.105203 +- 0.000188 seconds time elapsed  ( +-  0.18% )
+
+The diffrerence is that Cores understand the fact that fmadd does not need
+all three parameters to start computation, while Zen cores doesn't.
+
+Since this seems noticeable win on zen and not loss on Core it seems like good
+default for generic.
+
+float a[SIZE][SIZE];
+float b[SIZE][SIZE];
+float c[SIZE][SIZE];
+
+void init(void)
+{
+   int i, j, k;
+   for(i=0; i<SIZE; ++i)
+   {
+      for(j=0; j<SIZE; ++j)
+      {
+         a[i][j] = (float)i + j;
+         b[i][j] = (float)i - j;
+         c[i][j] = 0.0f;
+      }
+   }
+}
+
+void mult(void)
+{
+   int i, j, k;
+
+   for(i=0; i<SIZE; ++i)
+   {
+      for(j=0; j<SIZE; ++j)
+      {
+         for(k=0; k<SIZE; ++k)
+         {
+            c[i][j] += a[i][k] * b[k][j];
+         }
+      }
+   }
+}
+
+int main(void)
+{
+   clock_t s, e;
+
+   init();
+   s=clock();
+   mult();
+   e=clock();
+   printf("        mult took %10d clocks\n", (int)(e-s));
+
+   return 0;
+
+}
+
+gcc/ChangeLog:
+
+	* config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS,
+	X86_TUNE_AVOID_256FMA_CHAINS): Enable for znver4 and Core.
+---
+ gcc/config/i386/x86-tune.def | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
+index bdb455d20ba..fd095f3ec99 100644
+--- a/gcc/config/i386/x86-tune.def
++++ b/gcc/config/i386/x86-tune.def
+@@ -499,12 +499,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
+ 
+ /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
+    smaller FMA chain.  */
+-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3)
++DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2
++	  | m_ZNVER3 | m_ZNVER4 | m_GENERIC)
+ 
+ /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
+    smaller FMA chain.  */
+ DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3
+-	  | m_ALDERLAKE | m_SAPPHIRERAPIDS)
++	  | m_ZNVER4 | m_ALDERLAKE | m_SAPPHIRERAPIDS | m_GENERIC)
+ 
+ /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
+    smaller FMA chain.  */
+-- 
+2.31.1
+
diff --git a/gcc.spec b/gcc.spec
index 50a1a79..89da2b6 100644
--- a/gcc.spec
+++ b/gcc.spec
@@ -2,7 +2,7 @@
 %global gcc_major 12
 # Note, gcc_release must be integer, if you want to add suffixes to
 # %%{release}, append them after %%{gcc_release} on Release: line.
-%global gcc_release 30
+%global gcc_release 31
 
 %global _unpackaged_files_terminate_build 0
 %global _performance_build 1
@@ -177,6 +177,7 @@ Patch39: 0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch
 Patch40: 0040-Disparage-slightly-for-the-alternative-which-move-DF.patch
 Patch41: 0041-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch
 Patch42: 0042-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch
+Patch43: 0043-Disable-FMADD-in-chains-for-Zen4-and-generic.patch
 
 # On ARM EABI systems, we do want -gnueabi to be part of the
 # target triple.
@@ -685,6 +686,7 @@ not stable, so plugins must be rebuilt any time GCC is updated.
 %patch40 -p1
 %patch41 -p1
 %patch42 -p1
+%patch43 -p1
 
 echo '%{_vendor} %{version}-%{release}' > gcc/DEV-PHASE
 
@@ -2916,6 +2918,10 @@ end
 %doc rpm.doc/changelogs/libcc1/ChangeLog*
 
 %changelog
+* Wed Jan 17 2024 Jan Hubicka <jh@suse.cz> 12.3.1-31
+- Type: Sync
+- DESC: Disable FMADD in chains for Zen4 and generic
+
 * Wed Jan 17 2024 liuhongt <hongtao.liu@intel.com> 12.3.1-30
 - Type: Sync
 - DESC: Don't assume it's AVX_U128_CLEAN after call_insn whose
-- 
Gitee