diff --git a/0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch b/0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch new file mode 100644 index 0000000000000000000000000000000000000000..5148d7e05582b592e05dc3b9c3b3a1c25c83a89d --- /dev/null +++ b/0029-x86-Add-a-new-option-mdaz-ftz-to-enable-FTZ-and-DAZ.patch @@ -0,0 +1,135 @@ +From 75d05d4e2cb6ac0b85391c51a59925a97eaee85f Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Wed, 10 May 2023 15:16:58 +0800 +Subject: x86: Add a new option -mdaz-ftz to enable FTZ and DAZ + flags in MXCSR. + + if (mdaz-ftz) + link crtfastmath.o + else if ((Ofast || ffast-math || funsafe-math-optimizations) + && !mno-daz-ftz) + link crtfastmath.o + else + Don't link crtfastmath.o + +gcc/ChangeLog: + + * config/i386/cygwin.h (ENDFILE_SPEC): Link crtfastmath.o + whenever -mdaz-ftz is specified. Don't link crtfastmath.o + when -mno-daz-ftz is specified. + * config/i386/darwin.h (ENDFILE_SPEC): Ditto. + * config/i386/gnu-user-common.h + (GNU_USER_TARGET_MATHFILE_SPEC): Ditto. + * config/i386/mingw32.h (ENDFILE_SPEC): Ditto. + * config/i386/i386.opt (mdaz-ftz): New option. + * doc/invoke.texi (x86 options): Document -mdaz-ftz. +--- + gcc/config/i386/cygwin.h | 2 +- + gcc/config/i386/darwin.h | 4 ++-- + gcc/config/i386/gnu-user-common.h | 2 +- + gcc/config/i386/i386.opt | 4 ++++ + gcc/config/i386/mingw32.h | 2 +- + gcc/doc/invoke.texi | 11 ++++++++++- + 6 files changed, 19 insertions(+), 6 deletions(-) + +diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h +index d06eda369cf..5412c5d4479 100644 +--- a/gcc/config/i386/cygwin.h ++++ b/gcc/config/i386/cygwin.h +@@ -57,7 +57,7 @@ along with GCC; see the file COPYING3. 
If not see + + #undef ENDFILE_SPEC + #define ENDFILE_SPEC \ +- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}\ ++ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \ + %{!shared:%:if-exists(default-manifest.o%s)}\ + %{fvtable-verify=none:%s; \ + fvtable-verify=preinit:vtv_end.o%s; \ +diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h +index a55f6b2b874..2f773924d6e 100644 +--- a/gcc/config/i386/darwin.h ++++ b/gcc/config/i386/darwin.h +@@ -109,8 +109,8 @@ along with GCC; see the file COPYING3. If not see + "%{!force_cpusubtype_ALL:-force_cpusubtype_ALL} " + + #undef ENDFILE_SPEC +-#define ENDFILE_SPEC \ +- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ ++#define ENDFILE_SPEC \ ++ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \ + %{mpc32:crtprec32.o%s} \ + %{mpc64:crtprec64.o%s} \ + %{mpc80:crtprec80.o%s}" TM_DESTRUCTOR +diff --git a/gcc/config/i386/gnu-user-common.h b/gcc/config/i386/gnu-user-common.h +index 23b54c5be52..3d2a33f1714 100644 +--- a/gcc/config/i386/gnu-user-common.h ++++ b/gcc/config/i386/gnu-user-common.h +@@ -47,7 +47,7 @@ along with GCC; see the file COPYING3. If not see + + /* Similar to standard GNU userspace, but adding -ffast-math support. */ + #define GNU_USER_TARGET_MATHFILE_SPEC \ +- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ ++ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \ + %{mpc32:crtprec32.o%s} \ + %{mpc64:crtprec64.o%s} \ + %{mpc80:crtprec80.o%s}" +diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt +index fc1b944acb8..498fb454d01 100644 +--- a/gcc/config/i386/i386.opt ++++ b/gcc/config/i386/i386.opt +@@ -420,6 +420,10 @@ mpc80 + Target RejectNegative + Set 80387 floating-point precision to 80-bit. + ++mdaz-ftz ++Target ++Set the FTZ and DAZ Flags. 
++ + mpreferred-stack-boundary= + Target RejectNegative Joined UInteger Var(ix86_preferred_stack_boundary_arg) + Attempt to keep stack aligned to this power of 2. +diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h +index d3ca0cd0279..ddbe6a4054b 100644 +--- a/gcc/config/i386/mingw32.h ++++ b/gcc/config/i386/mingw32.h +@@ -197,7 +197,7 @@ along with GCC; see the file COPYING3. If not see + + #undef ENDFILE_SPEC + #define ENDFILE_SPEC \ +- "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \ ++ "%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}} \ + %{!shared:%:if-exists(default-manifest.o%s)}\ + %{fvtable-verify=none:%s; \ + fvtable-verify=preinit:vtv_end.o%s; \ +diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +index 2b376e0e995..3a48655e588 100644 +--- a/gcc/doc/invoke.texi ++++ b/gcc/doc/invoke.texi +@@ -1437,7 +1437,7 @@ See RS/6000 and PowerPC Options. + -m96bit-long-double -mlong-double-64 -mlong-double-80 -mlong-double-128 @gol + -mregparm=@var{num} -msseregparm @gol + -mveclibabi=@var{type} -mvect8-ret-in-mem @gol +--mpc32 -mpc64 -mpc80 -mstackrealign @gol ++-mpc32 -mpc64 -mpc80 -mdaz-ftz -mstackrealign @gol + -momit-leaf-frame-pointer -mno-red-zone -mno-tls-direct-seg-refs @gol + -mcmodel=@var{code-model} -mabi=@var{name} -maddress-mode=@var{mode} @gol + -m32 -m64 -mx32 -m16 -miamcu -mlarge-data-threshold=@var{num} @gol +@@ -32122,6 +32122,15 @@ are enabled by default; routines in such libraries could suffer significant + loss of accuracy, typically through so-called ``catastrophic cancellation'', + when this option is used to set the precision to less than extended precision. 
+ ++@item -mdaz-ftz ++@opindex mdaz-ftz ++ ++The flush-to-zero (FTZ) and denormals-are-zero (DAZ) flags in the MXCSR register ++are used to control floating-point calculations. SSE and AVX instructions ++including scalar and vector instructions could benefit from enabling the FTZ ++and DAZ flags when @option{-mdaz-ftz} is specified. Don't set FTZ/DAZ flags ++when @option{-mno-daz-ftz} is specified. ++ + @item -mstackrealign + @opindex mstackrealign + Realign the stack at entry. On the x86, the @option{-mstackrealign} +-- +2.31.1 + diff --git a/0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch b/0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch new file mode 100644 index 0000000000000000000000000000000000000000..4853877e042dceba02c761d9247ff84118fe905c --- /dev/null +++ b/0030-Explicitly-view_convert_expr-mask-to-signed-type-whe.patch @@ -0,0 +1,65 @@ +From 91c609109438b970dd32b4bd7eefcfab1be7fed9 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Mon, 5 Jun 2023 12:38:41 +0800 +Subject: Explicitly view_convert_expr mask to signed type when + folding pblendvb builtins. + +Since mask < 0 will be always false for vector char when +-funsigned-char, but vpblendvb needs to check the most significant +bit. The patch explicitly VCE to vector signed char. + +gcc/ChangeLog: + + PR target/110108 + * config/i386/i386.cc (ix86_gimple_fold_builtin): Explicitly + view_convert_expr mask to signed type when folding pblendvb + builtins. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr110108-2.c: New test. 
+--- + gcc/config/i386/i386.cc | 4 +++- + gcc/testsuite/gcc.target/i386/pr110108-2.c | 14 ++++++++++++++ + 2 files changed, 17 insertions(+), 1 deletion(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr110108-2.c + +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc +index 462dce10e5c..479fc601049 100644 +--- a/gcc/config/i386/i386.cc ++++ b/gcc/config/i386/i386.cc +@@ -18396,8 +18396,10 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) + tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode + ? intSI_type_node : intDI_type_node; + type = get_same_sized_vectype (itype, type); +- arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2); + } ++ else ++ type = signed_type_for (type); ++ arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2); + tree zero_vec = build_zero_cst (type); + tree cmp_type = truth_type_for (type); + tree cmp = gimple_build (&stmts, LT_EXPR, cmp_type, arg2, zero_vec); +diff --git a/gcc/testsuite/gcc.target/i386/pr110108-2.c b/gcc/testsuite/gcc.target/i386/pr110108-2.c +new file mode 100644 +index 00000000000..2d1d2fd4991 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr110108-2.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx2 -O2 -funsigned-char" } */ ++/* { dg-final { scan-assembler-times "vpblendvb" 2 } } */ ++ ++#include <immintrin.h> ++__m128i do_stuff_128(__m128i X0, __m128i X1, __m128i X2) { ++ __m128i Result = _mm_blendv_epi8(X0, X1, X2); ++ return Result; ++} ++ ++__m256i do_stuff_256(__m256i X0, __m256i X1, __m256i X2) { ++ __m256i Result = _mm256_blendv_epi8(X0, X1, X2); ++ return Result; ++} +-- +2.31.1 + diff --git a/0031-Make-option-mvzeroupper-independent-of-optimization.patch b/0031-Make-option-mvzeroupper-independent-of-optimization.patch new file mode 100644 index 0000000000000000000000000000000000000000..0a7aacd772641b14676ee216e2f562b8ab597517 --- /dev/null +++ b/0031-Make-option-mvzeroupper-independent-of-optimization.patch @@ -0,0 +1,137 @@ +From 
38364a48b8439a9b16717136835d28690d2a2dd6 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Mon, 26 Jun 2023 09:50:25 +0800 +Subject: [Sync] Make option mvzeroupper independent of optimization level. + +pass_insert_vzeroupper is under condition + +TARGET_AVX && TARGET_VZEROUPPER +&& flag_expensive_optimizations && !optimize_size + +But the document of mvzeroupper doesn't mention the insertion +required -O2 and above, it may confuse users when they explicitly +use -Os -mvzeroupper. + +------------ +mvzeroupper +Target Mask(VZEROUPPER) Save +Generate vzeroupper instruction before a transfer of control flow out of +the function. +------------ + +The patch moves flag_expensive_optimizations && !optimize_size to +ix86_option_override_internal. It makes -mvzeroupper independent of +optimization level, but still keeps the behavior of architecture +tuning(emit_vzeroupper) unchanged. + +gcc/ChangeLog: + + * config/i386/i386-features.cc (pass_insert_vzeroupper:gate): + Move flag_expensive_optimizations && !optimize_size to .. + * config/i386/i386-options.cc (ix86_option_override_internal): + .. this, it makes -mvzeroupper independent of optimization + level, but still keeps the behavior of architecture + tuning(emit_vzeroupper) unchanged. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/avx-vzeroupper-29.c: New testcase. + * gcc.target/i386/avx-vzeroupper-12.c: Adjust testcase. + * gcc.target/i386/avx-vzeroupper-7.c: Ditto. + * gcc.target/i386/avx-vzeroupper-9.c: Ditto. 
+--- + gcc/config/i386/i386-features.cc | 3 +-- + gcc/config/i386/i386-options.cc | 4 +++- + gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c | 3 ++- + gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c | 14 ++++++++++++++ + gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c | 3 ++- + gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c | 3 ++- + 6 files changed, 24 insertions(+), 6 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c + +diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc +index 6fe41c3c24f..6a2444eb6b6 100644 +--- a/gcc/config/i386/i386-features.cc ++++ b/gcc/config/i386/i386-features.cc +@@ -1875,8 +1875,7 @@ public: + /* opt_pass methods: */ + virtual bool gate (function *) + { +- return TARGET_AVX && TARGET_VZEROUPPER +- && flag_expensive_optimizations && !optimize_size; ++ return TARGET_AVX && TARGET_VZEROUPPER; + } + + virtual unsigned int execute (function *) +diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc +index ff44ad4e03c..74e969b6896 100644 +--- a/gcc/config/i386/i386-options.cc ++++ b/gcc/config/i386/i386-options.cc +@@ -2702,7 +2702,9 @@ ix86_option_override_internal (bool main_args_p, + sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH"); + + if (!(opts_set->x_target_flags & MASK_VZEROUPPER) +- && TARGET_EMIT_VZEROUPPER) ++ && TARGET_EMIT_VZEROUPPER ++ && flag_expensive_optimizations ++ && !optimize_size) + opts->x_target_flags |= MASK_VZEROUPPER; + if (!(opts_set->x_target_flags & MASK_STV)) + opts->x_target_flags |= MASK_STV; +diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c +index e694d4048bd..5a40e87832c 100644 +--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c ++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c +@@ -16,5 +16,6 @@ foo () + _mm256_zeroupper (); + } + +-/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 } } */ ++/* { dg-final { 
scan-assembler-times "avx_vzeroupper" 4 { target ia32 } } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 5 { target { ! ia32 } } } } */ + /* { dg-final { scan-assembler-times "\\*avx_vzeroall" 1 } } */ +diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c +new file mode 100644 +index 00000000000..4af637757f7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O0 -mavx -mtune=generic -mvzeroupper -dp" } */ ++ ++#include <immintrin.h> ++ ++extern __m256 x, y; ++ ++void ++foo () ++{ ++ x = y; ++} ++ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */ +diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c +index ab6d68779b3..75fe5889783 100644 +--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c ++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c +@@ -12,4 +12,5 @@ foo () + _mm256_zeroupper (); + } + +-/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 { target ia32 } } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 2 { target { ! ia32 } } } } */ +diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c +index 974e1626a6d..fa0a6dfcaac 100644 +--- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c ++++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c +@@ -15,4 +15,5 @@ foo () + _mm256_zeroupper (); + } + +-/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 4 { target ia32 } } } */ ++/* { dg-final { scan-assembler-times "avx_vzeroupper" 5 { target { ! 
ia32 } } } } */ +-- +2.31.1 + diff --git a/0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch b/0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch new file mode 100644 index 0000000000000000000000000000000000000000..793ac8c60d561afb0c9812153c5dacf1f3fe08b2 --- /dev/null +++ b/0032-i386-Sync-tune_string-with-arch_string-for-target-at.patch @@ -0,0 +1,66 @@ +From 60c432aac9e2d66b0c0ad09513f7c3b98f53ec35 Mon Sep 17 00:00:00 2001 +From: Hongyu Wang +Date: Sun, 25 Jun 2023 09:50:21 +0800 +Subject: [Sync] i386: Sync tune_string with arch_string for target attribute +arch=* + +For function with target attribute arch=*, current logic will set its +tune to -mtune from command line so all target_clones will get same +tuning flags which would affect the performance for each clone. Override +tune with arch if tune was not explicitly specified to get proper tuning +flags for target_clones. + +gcc/ChangeLog: + + * config/i386/i386-options.cc (ix86_valid_target_attribute_tree): + Override tune_string with arch_string if tune_string is not + explicitly specified. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/mvc17.c: New test. 
+ +(cherry picked from commit 2916278d14e9ac28c361c396a67256acbebda6e8) +--- + gcc/config/i386/i386-options.cc | 6 +++++- + gcc/testsuite/gcc.target/i386/mvc17.c | 11 +++++++++++ + 2 files changed, 16 insertions(+), 1 deletion(-) + create mode 100644 gcc/testsuite/gcc.target/i386/mvc17.c + +diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc +index 74e969b6896..fb2ed942f67 100644 +--- a/gcc/config/i386/i386-options.cc ++++ b/gcc/config/i386/i386-options.cc +@@ -1378,7 +1378,11 @@ ix86_valid_target_attribute_tree (tree fndecl, tree args, + if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]) + opts->x_ix86_tune_string + = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]); +- else if (orig_tune_defaulted) ++ /* If we have explicit arch string and no tune string specified, set ++ tune_string to NULL and later it will be overridden by arch_string ++ so target clones can get proper optimization. */ ++ else if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH] ++ || orig_tune_defaulted) + opts->x_ix86_tune_string = NULL; + + /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. 
*/ +diff --git a/gcc/testsuite/gcc.target/i386/mvc17.c b/gcc/testsuite/gcc.target/i386/mvc17.c +new file mode 100644 +index 00000000000..8b83c1aecb3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/mvc17.c +@@ -0,0 +1,11 @@ ++/* { dg-do compile } */ ++/* { dg-require-ifunc "" } */ ++/* { dg-options "-O2 -march=x86-64" } */ ++/* { dg-final { scan-assembler-times "rep mov" 1 } } */ ++ ++__attribute__((target_clones("default","arch=icelake-server"))) ++void ++foo (char *a, char *b, int size) ++{ ++ __builtin_memcpy (a, b, size & 0x7F); ++} +-- +2.31.1 + diff --git a/0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch b/0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch new file mode 100644 index 0000000000000000000000000000000000000000..286f26e4c250e689c577f81a74643d0650f93289 --- /dev/null +++ b/0033-Refine-maskloadmn-pattern-with-UNSPEC_MASKLOAD.patch @@ -0,0 +1,111 @@ +From d2db85f2ce63c84137fa8f01f6eddce810db8901 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Tue, 20 Jun 2023 15:41:00 +0800 +Subject: [Sync] Refine maskloadmn pattern with UNSPEC_MASKLOAD. + +If mem_addr points to a memory region with less than whole vector size +bytes of accessible memory and k is a mask that would prevent reading +the inaccessible bytes from mem_addr, add UNSPEC_MASKLOAD to prevent +it to be transformed to vpblendd. + +gcc/ChangeLog: + + PR target/110309 + * config/i386/sse.md (maskload): + Refine pattern with UNSPEC_MASKLOAD. + (maskload): Ditto. + (*_load_mask): Extend mode iterator to + VI12HF_AVX512VL. + (*_load): Ditto. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr110309.c: New test. 
+--- + gcc/config/i386/sse.md | 32 +++++++++++++----------- + gcc/testsuite/gcc.target/i386/pr110309.c | 10 ++++++++ + 2 files changed, 28 insertions(+), 14 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr110309.c + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index eb767e56ca4..b30e96cb1ab 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -1411,12 +1411,12 @@ + }) + + (define_insn "*_load_mask" +- [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v") +- (vec_merge:VI12_AVX512VL +- (unspec:VI12_AVX512VL +- [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")] ++ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v") ++ (vec_merge:VI12HF_AVX512VL ++ (unspec:VI12HF_AVX512VL ++ [(match_operand:VI12HF_AVX512VL 1 "memory_operand" "m")] + UNSPEC_MASKLOAD) +- (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C") ++ (match_operand:VI12HF_AVX512VL 2 "nonimm_or_0_operand" "0C") + (match_operand: 3 "register_operand" "Yk")))] + "TARGET_AVX512BW" + "vmovdqu\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}" +@@ -1425,9 +1425,9 @@ + (set_attr "mode" "")]) + + (define_insn_and_split "*_load" +- [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v") +- (unspec:VI12_AVX512VL +- [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")] ++ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand" "=v") ++ (unspec:VI12HF_AVX512VL ++ [(match_operand:VI12HF_AVX512VL 1 "memory_operand" "m")] + UNSPEC_MASKLOAD))] + "TARGET_AVX512BW" + "#" +@@ -25973,17 +25973,21 @@ + "TARGET_AVX") + + (define_expand "maskload" +- [(set (match_operand:V48H_AVX512VL 0 "register_operand") +- (vec_merge:V48H_AVX512VL +- (match_operand:V48H_AVX512VL 1 "memory_operand") ++ [(set (match_operand:V48_AVX512VL 0 "register_operand") ++ (vec_merge:V48_AVX512VL ++ (unspec:V48_AVX512VL ++ [(match_operand:V48_AVX512VL 1 "memory_operand")] ++ UNSPEC_MASKLOAD) + (match_dup 0) + (match_operand: 2 "register_operand")))] + "TARGET_AVX512F") + + 
(define_expand "maskload" +- [(set (match_operand:VI12_AVX512VL 0 "register_operand") +- (vec_merge:VI12_AVX512VL +- (match_operand:VI12_AVX512VL 1 "memory_operand") ++ [(set (match_operand:VI12HF_AVX512VL 0 "register_operand") ++ (vec_merge:VI12HF_AVX512VL ++ (unspec:VI12HF_AVX512VL ++ [(match_operand:VI12HF_AVX512VL 1 "memory_operand")] ++ UNSPEC_MASKLOAD) + (match_dup 0) + (match_operand: 2 "register_operand")))] + "TARGET_AVX512BW") +diff --git a/gcc/testsuite/gcc.target/i386/pr110309.c b/gcc/testsuite/gcc.target/i386/pr110309.c +new file mode 100644 +index 00000000000..f6e9e9c3c61 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr110309.c +@@ -0,0 +1,10 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O3 --param vect-partial-vector-usage=1 -march=znver4 -mprefer-vector-width=256" } */ ++/* { dg-final { scan-assembler-not {(?n)vpblendd.*ymm} } } */ ++ ++ ++void foo (int * __restrict a, int *b) ++{ ++ for (int i = 0; i < 6; ++i) ++ a[i] = b[i] + 42; ++} +-- +2.31.1 + diff --git a/0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch b/0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch new file mode 100644 index 0000000000000000000000000000000000000000..81c48191af3dc4cb79e4b803a54f52ecbe5acdc6 --- /dev/null +++ b/0034-Refine-maskstore-patterns-with-UNSPEC_MASKMOV.patch @@ -0,0 +1,126 @@ +From 389fe721ab32b154fb07e4865fa32be0608bd5d2 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Mon, 26 Jun 2023 21:07:09 +0800 +Subject: [Sync] Refine maskstore patterns with UNSPEC_MASKMOV. + +Similar like r14-2070-gc79476da46728e + +If mem_addr points to a memory region with less than whole vector size +bytes of accessible memory and k is a mask that would prevent reading +the inaccessible bytes from mem_addr, add UNSPEC_MASKMOV to prevent +it to be transformed to any other whole memory access instructions. + +gcc/ChangeLog: + + PR rtl-optimization/110237 + * config/i386/sse.md (_store_mask): Refine with + UNSPEC_MASKMOV. 
+ (maskstore_store_mask): New define_insn, it's renamed + from original _store_mask. +--- + gcc/config/i386/sse.md | 69 ++++++++++++++++++++++++++++++++++-------- + 1 file changed, 57 insertions(+), 12 deletions(-) + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index b30e96cb1ab..3af15989631 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -1554,7 +1554,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + +-(define_insn "_store_mask" ++(define_insn "*_store_mask" + [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m") + (vec_merge:V48_AVX512VL + (match_operand:V48_AVX512VL 1 "register_operand" "v") +@@ -1582,7 +1582,7 @@ + (set_attr "memory" "store") + (set_attr "mode" "")]) + +-(define_insn "_store_mask" ++(define_insn "*_store_mask" + [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m") + (vec_merge:VI12HF_AVX512VL + (match_operand:VI12HF_AVX512VL 1 "register_operand" "v") +@@ -26002,21 +26002,66 @@ + "TARGET_AVX") + + (define_expand "maskstore" +- [(set (match_operand:V48H_AVX512VL 0 "memory_operand") +- (vec_merge:V48H_AVX512VL +- (match_operand:V48H_AVX512VL 1 "register_operand") +- (match_dup 0) +- (match_operand: 2 "register_operand")))] ++ [(set (match_operand:V48_AVX512VL 0 "memory_operand") ++ (unspec:V48_AVX512VL ++ [(match_operand:V48_AVX512VL 1 "register_operand") ++ (match_dup 0) ++ (match_operand: 2 "register_operand")] ++ UNSPEC_MASKMOV))] + "TARGET_AVX512F") + + (define_expand "maskstore" +- [(set (match_operand:VI12_AVX512VL 0 "memory_operand") +- (vec_merge:VI12_AVX512VL +- (match_operand:VI12_AVX512VL 1 "register_operand") +- (match_dup 0) +- (match_operand: 2 "register_operand")))] ++ [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand") ++ (unspec:VI12HF_AVX512VL ++ [(match_operand:VI12HF_AVX512VL 1 "register_operand") ++ (match_dup 0) ++ (match_operand: 2 "register_operand")] ++ UNSPEC_MASKMOV))] + "TARGET_AVX512BW") + ++(define_insn "_store_mask" ++ [(set 
(match_operand:V48_AVX512VL 0 "memory_operand" "=m") ++ (unspec:V48_AVX512VL ++ [(match_operand:V48_AVX512VL 1 "register_operand" "v") ++ (match_dup 0) ++ (match_operand: 2 "register_operand" "Yk")] ++ UNSPEC_MASKMOV))] ++ "TARGET_AVX512F" ++{ ++ if (FLOAT_MODE_P (GET_MODE_INNER (mode))) ++ { ++ if (misaligned_operand (operands[0], mode)) ++ return "vmovu\t{%1, %0%{%2%}|%0%{%2%}, %1}"; ++ else ++ return "vmova\t{%1, %0%{%2%}|%0%{%2%}, %1}"; ++ } ++ else ++ { ++ if (misaligned_operand (operands[0], mode)) ++ return "vmovdqu\t{%1, %0%{%2%}|%0%{%2%}, %1}"; ++ else ++ return "vmovdqa\t{%1, %0%{%2%}|%0%{%2%}, %1}"; ++ } ++} ++ [(set_attr "type" "ssemov") ++ (set_attr "prefix" "evex") ++ (set_attr "memory" "store") ++ (set_attr "mode" "")]) ++ ++(define_insn "_store_mask" ++ [(set (match_operand:VI12HF_AVX512VL 0 "memory_operand" "=m") ++ (unspec:VI12HF_AVX512VL ++ [(match_operand:VI12HF_AVX512VL 1 "register_operand" "v") ++ (match_dup 0) ++ (match_operand: 2 "register_operand" "Yk")] ++ UNSPEC_MASKMOV))] ++ "TARGET_AVX512BW" ++ "vmovdqu\t{%1, %0%{%2%}|%0%{%2%}, %1}" ++ [(set_attr "type" "ssemov") ++ (set_attr "prefix" "evex") ++ (set_attr "memory" "store") ++ (set_attr "mode" "")]) ++ + (define_expand "cbranch4" + [(set (reg:CC FLAGS_REG) + (compare:CC (match_operand:VI48_AVX 1 "register_operand") +-- +2.31.1 + diff --git a/0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch b/0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch new file mode 100644 index 0000000000000000000000000000000000000000..938135ab54f049548fe669317ac5678830c214f3 --- /dev/null +++ b/0035-x86-Update-model-values-for-Alderlake-and-Rocketlake.patch @@ -0,0 +1,38 @@ +From 62cef9aea12f343cf86697f56884f585191b9545 Mon Sep 17 00:00:00 2001 +From: "Cui, Lili" +Date: Thu, 29 Jun 2023 03:10:35 +0000 +Subject: [Sync] x86: Update model values for Alderlake and Rocketlake. + +Update model values for Alderlake and Rocketlake according to SDM. 
+ +gcc/ChangeLog: + + * common/config/i386/cpuinfo.h (get_intel_cpu): Remove model value 0xa8 + from Rocketlake, remove model value 0xbf from Alderlake. +--- + gcc/common/config/i386/cpuinfo.h | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h +index 0333da56ba5..28b2ff0b033 100644 +--- a/gcc/common/config/i386/cpuinfo.h ++++ b/gcc/common/config/i386/cpuinfo.h +@@ -435,7 +435,6 @@ get_intel_cpu (struct __processor_model *cpu_model, + cpu_model->__cpu_subtype = INTEL_COREI7_SKYLAKE; + break; + case 0xa7: +- case 0xa8: + /* Rocket Lake. */ + cpu = "rocketlake"; + CHECK___builtin_cpu_is ("corei7"); +@@ -508,7 +507,6 @@ get_intel_cpu (struct __processor_model *cpu_model, + break; + case 0x97: + case 0x9a: +- case 0xbf: + /* Alder Lake. */ + cpu = "alderlake"; + CHECK___builtin_cpu_is ("corei7"); +-- +2.31.1 + diff --git a/0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch b/0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch new file mode 100644 index 0000000000000000000000000000000000000000..f3c790018f818b0919311a1596e705bc188070ff --- /dev/null +++ b/0036-Workaround-possible-CPUID-bug-in-Sandy-Bridge.patch @@ -0,0 +1,78 @@ +From 161c0972243cf967196f4e9361f0736000637bfe Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Fri, 4 Aug 2023 09:27:39 +0800 +Subject: [Sync] Workaround possible CPUID bug in Sandy Bridge. + +Don't access leaf 7 subleaf 1 unless subleaf 0 says it is +supported via EAX. + +Intel documentation says invalid subleaves return 0. We had been +relying on that behavior instead of checking the max subleaf number. + +It appears that some Sandy Bridge CPUs return at least the subleaf 0 +EDX value for subleaf 1. Best guess is that this is a bug in a +microcode patch since all of the bits we're seeing set in EDX were +introduced after Sandy Bridge was originally released. + +This is causing avxvnniint16 to be incorrectly enabled with +-march=native on these CPUs. 
+ +gcc/ChangeLog: + + * common/config/i386/cpuinfo.h (get_available_features): Check + max_subleaf_level for valid subleaf before use CPUID. +--- + gcc/common/config/i386/cpuinfo.h | 29 +++++++++++++++++------------ + 1 file changed, 17 insertions(+), 12 deletions(-) + +diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h +index 28b2ff0b033..316ad3cb3e9 100644 +--- a/gcc/common/config/i386/cpuinfo.h ++++ b/gcc/common/config/i386/cpuinfo.h +@@ -647,7 +647,9 @@ get_available_features (struct __processor_model *cpu_model, + /* Get Advanced Features at level 7 (eax = 7, ecx = 0/1). */ + if (max_cpuid_level >= 7) + { +- __cpuid_count (7, 0, eax, ebx, ecx, edx); ++ unsigned int max_subleaf_level; ++ ++ __cpuid_count (7, 0, max_subleaf_level, ebx, ecx, edx); + if (ebx & bit_BMI) + set_feature (FEATURE_BMI); + if (ebx & bit_SGX) +@@ -759,18 +761,21 @@ get_available_features (struct __processor_model *cpu_model, + set_feature (FEATURE_AVX512FP16); + } + +- __cpuid_count (7, 1, eax, ebx, ecx, edx); +- if (eax & bit_HRESET) +- set_feature (FEATURE_HRESET); +- if (avx_usable) +- { +- if (eax & bit_AVXVNNI) +- set_feature (FEATURE_AVXVNNI); +- } +- if (avx512_usable) ++ if (max_subleaf_level >= 1) + { +- if (eax & bit_AVX512BF16) +- set_feature (FEATURE_AVX512BF16); ++ __cpuid_count (7, 1, eax, ebx, ecx, edx); ++ if (eax & bit_HRESET) ++ set_feature (FEATURE_HRESET); ++ if (avx_usable) ++ { ++ if (eax & bit_AVXVNNI) ++ set_feature (FEATURE_AVXVNNI); ++ } ++ if (avx512_usable) ++ { ++ if (eax & bit_AVX512BF16) ++ set_feature (FEATURE_AVX512BF16); ++ } + } + } + +-- +2.31.1 + diff --git a/0037-Software-mitigation-Disable-gather-generation-in-vec.patch b/0037-Software-mitigation-Disable-gather-generation-in-vec.patch new file mode 100644 index 0000000000000000000000000000000000000000..224eeaff0d8d2397f838a5e4b18807071b74bb26 --- /dev/null +++ b/0037-Software-mitigation-Disable-gather-generation-in-vec.patch @@ -0,0 +1,220 @@ +From 
49e9bb4de0ad94e8636d796b16aa09917145b0d3 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Thu, 10 Aug 2023 11:41:39 +0800 +Subject: [Sync] Software mitigation: Disable gather generation in + vectorization for GDS affected Intel Processors. + +For more details of GDS (Gather Data Sampling), refer to +https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/gather-data-sampling.html + +After microcode update, there's performance regression. To avoid that, +the patch disables gather generation in autovectorization but uses +gather scalar emulation instead. + +gcc/ChangeLog: + + * config/i386/i386-options.cc (m_GDS): New macro. + * config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Don't + enable for m_GDS. + (X86_TUNE_USE_GATHER_4PARTS): Ditto. + (X86_TUNE_USE_GATHER): Ditto. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/avx2-gather-2.c: Adjust options to keep + gather vectorization. + * gcc.target/i386/avx2-gather-6.c: Ditto. + * gcc.target/i386/avx512f-pr88464-1.c: Ditto. + * gcc.target/i386/avx512f-pr88464-5.c: Ditto. + * gcc.target/i386/avx512vl-pr88464-1.c: Ditto. + * gcc.target/i386/avx512vl-pr88464-11.c: Ditto. + * gcc.target/i386/avx512vl-pr88464-3.c: Ditto. + * gcc.target/i386/avx512vl-pr88464-9.c: Ditto. + * gcc.target/i386/pr88531-1b.c: Ditto. + * gcc.target/i386/pr88531-1c.c: Ditto. 
+ +(cherry picked from commit 3064d1f5c48cb6ce1b4133570dd08ecca8abb52d) +--- + gcc/config/i386/i386-options.cc | 5 +++++ + gcc/config/i386/x86-tune.def | 9 ++++++--- + gcc/testsuite/gcc.target/i386/avx2-gather-2.c | 2 +- + gcc/testsuite/gcc.target/i386/avx2-gather-6.c | 2 +- + gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c | 2 +- + gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c | 2 +- + gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c | 2 +- + gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c | 2 +- + gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c | 2 +- + gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c | 2 +- + gcc/testsuite/gcc.target/i386/pr88531-1b.c | 2 +- + gcc/testsuite/gcc.target/i386/pr88531-1c.c | 2 +- + 12 files changed, 21 insertions(+), 13 deletions(-) + +diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc +index fb2ed942f67..9617fc162e0 100644 +--- a/gcc/config/i386/i386-options.cc ++++ b/gcc/config/i386/i386-options.cc +@@ -137,6 +137,11 @@ along with GCC; see the file COPYING3. If not see + #define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U< +Date: Thu, 10 Aug 2023 16:26:13 +0800 +Subject: [Sync] Support -m[no-]gather -m[no-]scatter to enable/disable + vectorization for all gather/scatter instructions + +Rename original use_gather to use_gather_8parts. Support +-mtune-ctrl={,^}use_gather to set/clear tune features +use_gather_{2parts, 4parts, 8parts}. Support the new option -m[no-]gather +as an alias of -mtune-ctrl={,^}use_gather. + +Similar for use_scatter. + +gcc/ChangeLog: + + * config/i386/i386-builtins.cc + (ix86_vectorize_builtin_gather): Adjust for use_gather_8parts. + * config/i386/i386-options.cc (parse_mtune_ctrl_str): + Set/Clear tune features use_{gather,scatter}_{2parts, 4parts, + 8parts} for -mtune-ctrl={,^}{use_gather,use_scatter}. + * config/i386/i386.cc (ix86_vectorize_builtin_scatter): Adjust + for use_scatter_8parts. + * config/i386/i386.h (TARGET_USE_GATHER): Rename to .. 
+ (TARGET_USE_GATHER_8PARTS): .. this. + (TARGET_USE_SCATTER): Rename to .. + (TARGET_USE_SCATTER_8PARTS): .. this. + * config/i386/x86-tune.def (X86_TUNE_USE_GATHER): Rename to + (X86_TUNE_USE_GATHER_8PARTS): .. this. + (X86_TUNE_USE_SCATTER): Rename to + (X86_TUNE_USE_SCATTER_8PARTS): .. this. + * config/i386/i386.opt: Add new options mgather, mscatter. + +(cherry picked from commit b2a927fb5343db363ea4361da0d6bcee227b6737) +--- + gcc/config/i386/i386-builtins.cc | 2 +- + gcc/config/i386/i386-options.cc | 54 +++++++++++++++++++++++--------- + gcc/config/i386/i386.cc | 2 +- + gcc/config/i386/i386.h | 8 ++--- + gcc/config/i386/i386.opt | 4 +++ + gcc/config/i386/x86-tune.def | 4 +-- + 6 files changed, 52 insertions(+), 22 deletions(-) + +diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc +index 050c6228a18..8ed32e14f0a 100644 +--- a/gcc/config/i386/i386-builtins.cc ++++ b/gcc/config/i386/i386-builtins.cc +@@ -1790,7 +1790,7 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype, + ? !TARGET_USE_GATHER_2PARTS + : (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), 4u) + ? !TARGET_USE_GATHER_4PARTS +- : !TARGET_USE_GATHER))) ++ : !TARGET_USE_GATHER_8PARTS))) + return NULL_TREE; + + if ((TREE_CODE (index_type) != INTEGER_TYPE +diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc +index 9617fc162e0..3df1f0c41c3 100644 +--- a/gcc/config/i386/i386-options.cc ++++ b/gcc/config/i386/i386-options.cc +@@ -1705,20 +1705,46 @@ parse_mtune_ctrl_str (struct gcc_options *opts, bool dump) + curr_feature_string++; + clear = true; + } +- for (i = 0; i < X86_TUNE_LAST; i++) +- { +- if (!strcmp (curr_feature_string, ix86_tune_feature_names[i])) +- { +- ix86_tune_features[i] = !clear; +- if (dump) +- fprintf (stderr, "Explicitly %s feature %s\n", +- clear ? "clear" : "set", ix86_tune_feature_names[i]); +- break; +- } +- } +- if (i == X86_TUNE_LAST) +- error ("unknown parameter to option %<-mtune-ctrl%>: %s", +- clear ? 
curr_feature_string - 1 : curr_feature_string); ++ ++ if (!strcmp (curr_feature_string, "use_gather")) ++ { ++ ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS] = !clear; ++ ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS] = !clear; ++ ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS] = !clear; ++ if (dump) ++ fprintf (stderr, "Explicitly %s features use_gather_2parts," ++ " use_gather_4parts, use_gather_8parts\n", ++ clear ? "clear" : "set"); ++ ++ } ++ else if (!strcmp (curr_feature_string, "use_scatter")) ++ { ++ ix86_tune_features[X86_TUNE_USE_SCATTER_2PARTS] = !clear; ++ ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS] = !clear; ++ ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS] = !clear; ++ if (dump) ++ fprintf (stderr, "Explicitly %s features use_scatter_2parts," ++ " use_scatter_4parts, use_scatter_8parts\n", ++ clear ? "clear" : "set"); ++ } ++ else ++ { ++ for (i = 0; i < X86_TUNE_LAST; i++) ++ { ++ if (!strcmp (curr_feature_string, ix86_tune_feature_names[i])) ++ { ++ ix86_tune_features[i] = !clear; ++ if (dump) ++ fprintf (stderr, "Explicitly %s feature %s\n", ++ clear ? "clear" : "set", ix86_tune_feature_names[i]); ++ break; ++ } ++ } ++ ++ if (i == X86_TUNE_LAST) ++ error ("unknown parameter to option %<-mtune-ctrl%>: %s", ++ clear ? curr_feature_string - 1 : curr_feature_string); ++ } + curr_feature_string = next_feature_string; + } + while (curr_feature_string); +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc +index 479fc601049..e75d3702338 100644 +--- a/gcc/config/i386/i386.cc ++++ b/gcc/config/i386/i386.cc +@@ -18937,7 +18937,7 @@ ix86_vectorize_builtin_scatter (const_tree vectype, + ? !TARGET_USE_SCATTER_2PARTS + : (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u) + ? 
!TARGET_USE_SCATTER_4PARTS +- : !TARGET_USE_SCATTER)) ++ : !TARGET_USE_SCATTER_8PARTS)) + return NULL_TREE; + + if ((TREE_CODE (index_type) != INTEGER_TYPE +diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h +index 688aaabd3f8..aaa136ba0bf 100644 +--- a/gcc/config/i386/i386.h ++++ b/gcc/config/i386/i386.h +@@ -403,10 +403,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; + ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS] + #define TARGET_USE_SCATTER_4PARTS \ + ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS] +-#define TARGET_USE_GATHER \ +- ix86_tune_features[X86_TUNE_USE_GATHER] +-#define TARGET_USE_SCATTER \ +- ix86_tune_features[X86_TUNE_USE_SCATTER] ++#define TARGET_USE_GATHER_8PARTS \ ++ ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS] ++#define TARGET_USE_SCATTER_8PARTS \ ++ ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS] + #define TARGET_FUSE_CMP_AND_BRANCH_32 \ + ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32] + #define TARGET_FUSE_CMP_AND_BRANCH_64 \ +diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt +index 498fb454d01..b154110d813 100644 +--- a/gcc/config/i386/i386.opt ++++ b/gcc/config/i386/i386.opt +@@ -1222,3 +1222,7 @@ Instructions number above which STFL stall penalty can be compensated. + munroll-only-small-loops + Target Var(ix86_unroll_only_small_loops) Init(0) Save + Enable conservative small loop unrolling. ++ ++mscatter ++Target Alias(mtune-ctrl=, use_scatter, ^use_scatter) ++Enable vectorization for scatter instruction. +diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def +index 4392709fce2..bdb455d20ba 100644 +--- a/gcc/config/i386/x86-tune.def ++++ b/gcc/config/i386/x86-tune.def +@@ -488,13 +488,13 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts", + + /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more + elements. 
*/ +-DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather", ++DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts", + ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE + | m_GENERIC | m_GDS)) + + /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more + elements. */ +-DEF_TUNE (X86_TUNE_USE_SCATTER, "use_scatter", ++DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts", + ~(m_ZNVER4)) + + /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or +-- +2.31.1 + diff --git a/0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch b/0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch new file mode 100644 index 0000000000000000000000000000000000000000..b50adc295a61168c9f339d094d77a1979b98aa43 --- /dev/null +++ b/0039-Remove-constraint-modifier-for-fcmaddcph-fmaddcph-fc.patch @@ -0,0 +1,129 @@ +From eefab6a2d901314a2c94b597e1d7766ae34529d0 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Fri, 8 Sep 2023 09:22:43 +0800 +Subject: [Sync] Remove constraint modifier % for + fcmaddcph/fmaddcph/fcmulcph since there're not commutative. + +gcc/ChangeLog: + + PR target/111306 + PR target/111335 + * config/i386/sse.md (int_comm): New int_attr. + (fma__): + Remove % for Complex conjugate operations since they're not + commutative. + (fma___pair): Ditto. + (___mask): Ditto. + (cmul3): Ditto. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr111306.c: New test. 
+ +(cherry picked from commit f197392a16ffb1327f1d12ff8ff05f9295e015cb) +--- + gcc/config/i386/sse.md | 16 ++++++++--- + gcc/testsuite/gcc.target/i386/pr111306.c | 36 ++++++++++++++++++++++++ + 2 files changed, 48 insertions(+), 4 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr111306.c + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index 3af15989631..f25dd5f2bc4 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -6318,6 +6318,14 @@ + [(UNSPEC_COMPLEX_FMA_PAIR "fmaddc") + (UNSPEC_COMPLEX_FCMA_PAIR "fcmaddc")]) + ++(define_int_attr int_comm ++ [(UNSPEC_COMPLEX_FMA "") ++ (UNSPEC_COMPLEX_FMA_PAIR "") ++ (UNSPEC_COMPLEX_FCMA "") ++ (UNSPEC_COMPLEX_FCMA_PAIR "") ++ (UNSPEC_COMPLEX_FMUL "%") ++ (UNSPEC_COMPLEX_FCMUL "")]) ++ + (define_int_attr conj_op + [(UNSPEC_COMPLEX_FMA "") + (UNSPEC_COMPLEX_FCMA "_conj") +@@ -6431,7 +6439,7 @@ + (define_insn "fma__" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v") + (unspec:VF_AVX512FP16VL +- [(match_operand:VF_AVX512FP16VL 1 "" "%v") ++ [(match_operand:VF_AVX512FP16VL 1 "" "v") + (match_operand:VF_AVX512FP16VL 2 "" "") + (match_operand:VF_AVX512FP16VL 3 "" "0")] + UNSPEC_COMPLEX_F_C_MA))] +@@ -6495,7 +6503,7 @@ + (define_insn "fma___pair" + [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=&v") + (unspec:VF1_AVX512VL +- [(match_operand:VF1_AVX512VL 1 "vector_operand" "%v") ++ [(match_operand:VF1_AVX512VL 1 "vector_operand" "v") + (match_operand:VF1_AVX512VL 2 "bcst_vector_operand" "vmBr") + (match_operand:VF1_AVX512VL 3 "vector_operand" "0")] + UNSPEC_COMPLEX_F_C_MA_PAIR))] +@@ -6562,7 +6570,7 @@ + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v") + (vec_merge:VF_AVX512FP16VL + (unspec:VF_AVX512FP16VL +- [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v") ++ [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "v") + (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "") + (match_operand:VF_AVX512FP16VL 3 
"register_operand" "0")] + UNSPEC_COMPLEX_F_C_MA) +@@ -6586,7 +6594,7 @@ + (define_insn "__" + [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v") + (unspec:VF_AVX512FP16VL +- [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "%v") ++ [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "v") + (match_operand:VF_AVX512FP16VL 2 "nonimmediate_operand" "")] + UNSPEC_COMPLEX_F_C_MUL))] + "TARGET_AVX512FP16 && " +diff --git a/gcc/testsuite/gcc.target/i386/pr111306.c b/gcc/testsuite/gcc.target/i386/pr111306.c +new file mode 100644 +index 00000000000..541725ebdad +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr111306.c +@@ -0,0 +1,36 @@ ++/* { dg-do run } */ ++/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */ ++/* { dg-require-effective-target avx512fp16 } */ ++ ++#define AVX512FP16 ++#include "avx512f-helper.h" ++ ++__attribute__((optimize("O2"),noipa)) ++void func1(_Float16 *a, _Float16 *b, int n, _Float16 *c) { ++ __m512h rA = _mm512_loadu_ph(a); ++ for (int i = 0; i < n; i += 32) { ++ __m512h rB = _mm512_loadu_ph(b + i); ++ _mm512_storeu_ph(c + i, _mm512_fcmul_pch(rB, rA)); ++ } ++} ++ ++void ++test_512 (void) ++{ ++ int n = 32; ++ _Float16 a[n], b[n], c[n]; ++ _Float16 exp[n]; ++ for (int i = 1; i <= n; i++) { ++ a[i - 1] = i & 1 ? 
-i : i; ++ b[i - 1] = i; ++ } ++ ++ func1(a, b, n, c); ++ for (int i = 0; i < n / 32; i += 2) { ++ if (c[i] != a[i] * b[i] + a[i+1] * b[i+1] ++ || c[i+1] != a[i] * b[i+1] - a[i+1]*b[i]) ++ __builtin_abort (); ++ } ++} ++ ++ +-- +2.31.1 + diff --git a/0040-Disparage-slightly-for-the-alternative-which-move-DF.patch b/0040-Disparage-slightly-for-the-alternative-which-move-DF.patch new file mode 100644 index 0000000000000000000000000000000000000000..6ca7b706e8feed73b20301f4efa97928e34ddaf9 --- /dev/null +++ b/0040-Disparage-slightly-for-the-alternative-which-move-DF.patch @@ -0,0 +1,106 @@ +From 655f62a56a76b99db90faecaa44a4335d674d91c Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Wed, 5 Jul 2023 13:45:11 +0800 +Subject: [Sync] Disparage slightly for the alternative which move + DFmode between SSE_REGS and GENERAL_REGS. + +For testcase + +void __cond_swap(double* __x, double* __y) { + bool __r = (*__x < *__y); + auto __tmp = __r ? *__x : *__y; + *__y = __r ? *__y : *__x; + *__x = __tmp; +} + +GCC-14 with -O2 and -march=x86-64 options generates the following code: + +__cond_swap(double*, double*): + movsd xmm1, QWORD PTR [rdi] + movsd xmm0, QWORD PTR [rsi] + comisd xmm0, xmm1 + jbe .L2 + movq rax, xmm1 + movapd xmm1, xmm0 + movq xmm0, rax +.L2: + movsd QWORD PTR [rsi], xmm1 + movsd QWORD PTR [rdi], xmm0 + ret + +rax is used to save and restore DFmode value. In RA both GENERAL_REGS +and SSE_REGS cost zero since we didn't disparage the +alternative in movdf_internal pattern, according to register +allocation order, GENERAL_REGS is allocated. The patch add ? for +alternative (r,v) and (v,r) just like we did for movsf/hf/bf_internal +pattern, after that we get optimal RA. 
+ +__cond_swap: +.LFB0: + .cfi_startproc + movsd (%rdi), %xmm1 + movsd (%rsi), %xmm0 + comisd %xmm1, %xmm0 + jbe .L2 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm1 + movapd %xmm2, %xmm0 +.L2: + movsd %xmm1, (%rsi) + movsd %xmm0, (%rdi) + ret + +gcc/ChangeLog: + + PR target/110170 + * config/i386/i386.md (movdf_internal): Disparage slightly for + 2 alternatives (r,v) and (v,r) by adding constraint modifier + '?'. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr110170-3.c: New test. + +(cherry picked from commit 37a231cc7594d12ba0822077018aad751a6fb94e) +--- + gcc/config/i386/i386.md | 4 ++-- + gcc/testsuite/gcc.target/i386/pr110170-3.c | 11 +++++++++++ + 2 files changed, 13 insertions(+), 2 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr110170-3.c + +diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md +index be07be10d8a..71691f59894 100644 +--- a/gcc/config/i386/i386.md ++++ b/gcc/config/i386/i386.md +@@ -3582,9 +3582,9 @@ + ;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 7. + (define_insn "*movdf_internal" + [(set (match_operand:DF 0 "nonimmediate_operand" +- "=Yf*f,m ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,v,r ,o ,r ,m") ++ "=Yf*f,m ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,?r,?v,r ,o ,r ,m") + (match_operand:DF 1 "general_operand" +- "Yf*fm,Yf*f,G ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,v,r ,roF,rF,rmF,rC"))] ++ "Yf*fm,Yf*f,G ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x, v, r,roF,rF,rmF,rC"))] + "!(MEM_P (operands[0]) && MEM_P (operands[1])) + && (lra_in_progress || reload_completed + || !CONST_DOUBLE_P (operands[1]) +diff --git a/gcc/testsuite/gcc.target/i386/pr110170-3.c b/gcc/testsuite/gcc.target/i386/pr110170-3.c +new file mode 100644 +index 00000000000..70daa89e9aa +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr110170-3.c +@@ -0,0 +1,11 @@ ++/* { dg-do compile { target { ! 
ia32 } } } */ ++/* { dg-options "-O2 -fno-if-conversion -fno-if-conversion2" } */ ++/* { dg-final { scan-assembler-not {(?n)movq.*r} } } */ ++ ++void __cond_swap(double* __x, double* __y) { ++ _Bool __r = (*__x < *__y); ++ double __tmp = __r ? *__x : *__y; ++ *__y = __r ? *__y : *__x; ++ *__x = __tmp; ++} ++ +-- +2.31.1 + diff --git a/0041-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch b/0041-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch new file mode 100644 index 0000000000000000000000000000000000000000..5a5c5eefaf5c07e7aff22e553fffaa8e871b6082 --- /dev/null +++ b/0041-Fix-wrong-code-due-to-vec_merge-pcmp-to-blendvb-spli.patch @@ -0,0 +1,163 @@ +From 87d4ac895c32f894f0efd7efc82fcde65161a38d Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Thu, 9 Nov 2023 13:20:05 +0800 +Subject: [Sync] Fix wrong code due to vec_merge + pcmp to blendvb + splitter. + +gcc/ChangeLog: + + PR target/112443 + * config/i386/sse.md (*avx2_pcmp3_4): Fix swap condition + from LT to GT since there's not in the pattern. + (*avx2_pcmp3_5): Ditto. + +gcc/testsuite/ChangeLog: + + * g++.target/i386/pr112443.C: New test. 
+ +(cherry picked from commit 9a0cc04b9c9b02426762892b88efc5c44ba546bd) +--- + gcc/config/i386/sse.md | 4 +- + gcc/testsuite/g++.target/i386/pr112443.C | 108 +++++++++++++++++++++++ + 2 files changed, 110 insertions(+), 2 deletions(-) + create mode 100644 gcc/testsuite/g++.target/i386/pr112443.C + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index f25dd5f2bc4..23b858ab21c 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -16358,7 +16358,7 @@ + (match_dup 4))] + UNSPEC_BLENDV))] + { +- if (INTVAL (operands[5]) == 1) ++ if (INTVAL (operands[5]) == 5) + std::swap (operands[1], operands[2]); + operands[3] = gen_lowpart (mode, operands[3]); + }) +@@ -16388,7 +16388,7 @@ + (match_dup 4))] + UNSPEC_BLENDV))] + { +- if (INTVAL (operands[5]) == 1) ++ if (INTVAL (operands[5]) == 5) + std::swap (operands[1], operands[2]); + }) + +diff --git a/gcc/testsuite/g++.target/i386/pr112443.C b/gcc/testsuite/g++.target/i386/pr112443.C +new file mode 100644 +index 00000000000..ebfa9b4a753 +--- /dev/null ++++ b/gcc/testsuite/g++.target/i386/pr112443.C +@@ -0,0 +1,108 @@ ++/* { dg-do run } */ ++/* { dg-require-effective-target avx512bw } */ ++/* { dg-require-effective-target avx512vl } */ ++/* { dg-options "-O2 -std=c++17 -mavx512bw -mavx512vl" } */ ++ ++#include ++#include ++#include ++#include ++ ++#define AVX512BW ++#define AVX512VL ++ ++#include "avx512f-helper.h" ++ ++struct TensorIteratorBase{ ++ char* in; ++ char* out; ++ ++ void for_each(std::function loop){ ++ loop(out, in, 32); ++ } ++}; ++ ++class Vectorized { ++protected: ++ __m256i values; ++ ++ static inline __m256i invert(const __m256i& v) { ++ const auto ones = _mm256_set1_epi64x(-1); ++ return _mm256_xor_si256(ones, v); ++ } ++public: ++ operator __m256i() const { ++ return values; ++ } ++ ++ static constexpr int size() { ++ return 32; ++ } ++ ++ Vectorized() {} ++ Vectorized(__m256i v) : values(v) {} ++ Vectorized(uint8_t v) { values = _mm256_set1_epi8(v); } ++ static Vectorized 
blendv(const Vectorized& a, const Vectorized& b, ++ const Vectorized& mask) { ++ return _mm256_blendv_epi8(a, b, mask); ++ } ++ static Vectorized loadu(const void* ptr) { ++ return _mm256_loadu_si256(reinterpret_cast(ptr)); ++ } ++ void store(void* ptr) const { ++ _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); ++ } ++ ++ Vectorized operator<(const Vectorized& other) const { ++ __m256i max = _mm256_max_epu8(values, other); ++ return invert(_mm256_cmpeq_epi8(max, values)); ++ } ++ Vectorized operator-(const Vectorized& b) { ++ return _mm256_sub_epi8(values, b); ++ } ++}; ++ ++std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { ++ uint8_t buf[Vectorized::size()]; ++ vec.store(buf); ++ stream << "vec["; ++ for (int i = 0; i != Vectorized::size(); i++) { ++ if (i != 0) ++ stream << ", "; ++ stream << buf[i]*1; ++ } ++ stream << "]"; ++ return stream; ++} ++ ++void run(TensorIteratorBase iter){ ++ Vectorized zero_vec(0); ++ Vectorized one_vec(1); ++ ++ iter.for_each([=](char* out, char* in, int64_t size) { ++ for (int64_t i = 0; i <= size - Vectorized::size(); i += Vectorized::size()) { ++ auto self_vec = Vectorized::loadu(in + i); ++ auto left = Vectorized::blendv(zero_vec, one_vec, zero_vec < self_vec); ++ auto right = Vectorized::blendv(zero_vec, one_vec, self_vec < zero_vec); ++ auto outv = left - right; ++ outv.store(out + i); ++ } ++ }); ++} ++ ++void ++test_256 (){ ++ char in[32]; ++ char out[32]; ++ for(auto& x: in) x = 1; ++ run(TensorIteratorBase{in, out}); ++ Vectorized::loadu (out); ++ for (int i = 0; i != 32; i++) ++ if (out[i] != 1) ++ __builtin_abort (); ++} ++ ++void ++test_128 () ++{ ++} +-- +2.31.1 + diff --git a/0042-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch b/0042-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch new file mode 100644 index 0000000000000000000000000000000000000000..043b521693a682b04654b20ea42a1e968dcfd289 --- /dev/null +++ 
b/0042-Don-t-assume-it-s-AVX_U128_CLEAN-after-call_insn-who.patch @@ -0,0 +1,151 @@ +From 068c5ee54e6275fab3fcb501c4d6221f86abae68 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Thu, 7 Dec 2023 09:17:27 +0800 +Subject: [Sync] Don't assume it's AVX_U128_CLEAN after call_insn whose + abi.mode_clobber(V4DImode) deosn't contains all SSE_REGS. + +If the function desn't clobber any sse registers or only clobber +128-bit part, then vzeroupper isn't issued before the function exit. +the status not CLEAN but ANY after the function. + +Also for sibling_call, it's safe to issue an vzeroupper. Also there +could be missing vzeroupper since there's no mode_exit for +sibling_call_p. + +gcc/ChangeLog: + + PR target/112891 + * config/i386/i386.cc (ix86_avx_u128_mode_after): Return + AVX_U128_ANY if callee_abi doesn't clobber all_sse_regs to + align with ix86_avx_u128_mode_needed. + (ix86_avx_u128_mode_needed): Return AVX_U128_ClEAN for + sibling_call. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr112891.c: New test. + * gcc.target/i386/pr112891-2.c: New test. + +(cherry picked from commit fc189a08f5b7ad5889bd4c6b320c1dd99dd5d642) +--- + gcc/config/i386/i386.cc | 22 +++++++++++++--- + gcc/testsuite/gcc.target/i386/pr112891-2.c | 30 ++++++++++++++++++++++ + gcc/testsuite/gcc.target/i386/pr112891.c | 29 +++++++++++++++++++++ + 3 files changed, 78 insertions(+), 3 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr112891-2.c + create mode 100644 gcc/testsuite/gcc.target/i386/pr112891.c + +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc +index e75d3702338..60f3296b00c 100644 +--- a/gcc/config/i386/i386.cc ++++ b/gcc/config/i386/i386.cc +@@ -14416,8 +14416,12 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) + modes wider than 256 bits. It's only safe to issue a + vzeroupper if all SSE registers are clobbered. 
*/ + const function_abi &abi = insn_callee_abi (insn); +- if (!hard_reg_set_subset_p (reg_class_contents[SSE_REGS], +- abi.mode_clobbers (V4DImode))) ++ /* Should be safe to issue an vzeroupper before sibling_call_p. ++ Also there not mode_exit for sibling_call, so there could be ++ missing vzeroupper for that. */ ++ if (!(SIBLING_CALL_P (insn) ++ || hard_reg_set_subset_p (reg_class_contents[SSE_REGS], ++ abi.mode_clobbers (V4DImode)))) + return AVX_U128_ANY; + + return AVX_U128_CLEAN; +@@ -14555,7 +14559,19 @@ ix86_avx_u128_mode_after (int mode, rtx_insn *insn) + bool avx_upper_reg_found = false; + note_stores (insn, ix86_check_avx_upper_stores, &avx_upper_reg_found); + +- return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN; ++ if (avx_upper_reg_found) ++ return AVX_U128_DIRTY; ++ ++ /* If the function desn't clobber any sse registers or only clobber ++ 128-bit part, Then vzeroupper isn't issued before the function exit. ++ the status not CLEAN but ANY after the function. */ ++ const function_abi &abi = insn_callee_abi (insn); ++ if (!(SIBLING_CALL_P (insn) ++ || hard_reg_set_subset_p (reg_class_contents[SSE_REGS], ++ abi.mode_clobbers (V4DImode)))) ++ return AVX_U128_ANY; ++ ++ return AVX_U128_CLEAN; + } + + /* Otherwise, return current mode. 
Remember that if insn +diff --git a/gcc/testsuite/gcc.target/i386/pr112891-2.c b/gcc/testsuite/gcc.target/i386/pr112891-2.c +new file mode 100644 +index 00000000000..164c3985d50 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr112891-2.c +@@ -0,0 +1,30 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx2 -O3" } */ ++/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ ++ ++void ++__attribute__((noinline)) ++bar (double* a) ++{ ++ a[0] = 1.0; ++ a[1] = 2.0; ++} ++ ++double ++__attribute__((noinline)) ++foo (double* __restrict a, double* b) ++{ ++ a[0] += b[0]; ++ a[1] += b[1]; ++ a[2] += b[2]; ++ a[3] += b[3]; ++ bar (b); ++ return a[5] + b[5]; ++} ++ ++double ++foo1 (double* __restrict a, double* b) ++{ ++ double c = foo (a, b); ++ return __builtin_exp (c); ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr112891.c b/gcc/testsuite/gcc.target/i386/pr112891.c +new file mode 100644 +index 00000000000..dbf6c67948a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr112891.c +@@ -0,0 +1,29 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx2 -O3" } */ ++/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ ++ ++void ++__attribute__((noinline)) ++bar (double* a) ++{ ++ a[0] = 1.0; ++ a[1] = 2.0; ++} ++ ++void ++__attribute__((noinline)) ++foo (double* __restrict a, double* b) ++{ ++ a[0] += b[0]; ++ a[1] += b[1]; ++ a[2] += b[2]; ++ a[3] += b[3]; ++ bar (b); ++} ++ ++double ++foo1 (double* __restrict a, double* b) ++{ ++ foo (a, b); ++ return __builtin_exp (b[1]); ++} +-- +2.31.1 + diff --git a/0043-Disable-FMADD-in-chains-for-Zen4-and-generic.patch b/0043-Disable-FMADD-in-chains-for-Zen4-and-generic.patch new file mode 100644 index 0000000000000000000000000000000000000000..a32a924ab93acd86941b0e99a282b732e57fd547 --- /dev/null +++ b/0043-Disable-FMADD-in-chains-for-Zen4-and-generic.patch @@ -0,0 +1,142 @@ +From 16ee6e4d87bb484f92a5d971bbf24177a6ab6b25 Mon Sep 17 00:00:00 2001 +From: Jan Hubicka +Date: Fri, 29 Dec 2023 23:51:03 +0100 +Subject: 
[Sync] Disable FMADD in chains for Zen4 and generic + +this patch disables use of FMA in matrix multiplication loop for generic (for +x86-64-v3) and zen4. I tested this on zen4 and Xenon Gold Gold 6212U. + +For Intel this is neutral both on the matrix multiplication microbenchmark +(attached) and spec2k17 where the difference was within noise for Core. + +On core the micro-benchmark runs as follows: + +With FMA: + + 578,500,241 cycles:u # 3.645 GHz + ( +- 0.12% ) + 753,318,477 instructions:u # 1.30 insn per +cycle ( +- 0.00% ) + 125,417,701 branches:u # 790.227 M/sec + ( +- 0.00% ) + 0.159146 +- 0.000363 seconds time elapsed ( +- 0.23% ) + +No FMA: + + 577,573,960 cycles:u # 3.514 GHz + ( +- 0.15% ) + 878,318,479 instructions:u # 1.52 insn per +cycle ( +- 0.00% ) + 125,417,702 branches:u # 763.035 M/sec + ( +- 0.00% ) + 0.164734 +- 0.000321 seconds time elapsed ( +- 0.19% ) + +So the cycle count is unchanged and discrete multiply+add takes same time as +FMA. + +While on zen: + +With FMA: + 484875179 cycles:u # 3.599 GHz + ( +- 0.05% ) (82.11%) + 752031517 instructions:u # 1.55 insn per +cycle + 125106525 branches:u # 928.712 M/sec + ( +- 0.03% ) (85.09%) + 128356 branch-misses:u # 0.10% of all +branches ( +- 0.06% ) (83.58%) + +No FMA: + 375875209 cycles:u # 3.592 GHz + ( +- 0.08% ) (80.74%) + 875725341 instructions:u # 2.33 insn per +cycle + 124903825 branches:u # 1.194 G/sec + ( +- 0.04% ) (84.59%) + 0.105203 +- 0.000188 seconds time elapsed ( +- 0.18% ) + +The diffrerence is that Cores understand the fact that fmadd does not need +all three parameters to start computation, while Zen cores doesn't. + +Since this seems noticeable win on zen and not loss on Core it seems like good +default for generic. 
+ +float a[SIZE][SIZE]; +float b[SIZE][SIZE]; +float c[SIZE][SIZE]; + +void init(void) +{ + int i, j, k; + for(i=0; i gcc/DEV-PHASE @@ -2888,6 +2918,72 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Wed Jan 17 2024 Jan Hubicka 12.3.1-31 +- Type: Sync +- DESC: Disable FMADD in chains for Zen4 and generic + +* Wed Jan 17 2024 liuhongt 12.3.1-30 +- Type: Sync +- DESC: Don't assume it's AVX_U128_CLEAN after call_insn whose + abi.mode_clobber(V4DImode) deosn't contains all SSE_REGS. + +* Wed Jan 17 2024 liuhongt 12.3.1-29 +- Type: Sync +- DESC: Fix wrong code due to vec_merge + pcmp to blendvb splitter. + +* Wed Jan 17 2024 liuhongt 12.3.1-28 +- Type: Sync +- DESC: Disparage slightly for the alternative which move DFmode between + SSE_REGS and GENERAL_REGS. + +* Wed Jan 17 2024 liuhongt 12.3.1-27 +- Type: Sync +- DESC: Remove constraint modifier % for fcmaddcph/fmaddcph/fcmulcph since + there're not commutative. + +* Wed Jan 17 2024 liuhongt 12.3.1-26 +- Type: Sync +- DESC: Support -m[no-]gather -m[no-]scatter to enable/disable vectorization + for all gather/scatter instructions + +* Wed Jan 17 2024 liuhongt 12.3.1-25 +- Type: Sync +- DESC: Software mitigation: Disable gather generation in vectorization for GDS + affected Intel Processors. + +* Wed Jan 17 2024 liuhongt 12.3.1-24 +- Type: Sync +- DESC: Workaround possible CPUID bug in Sandy Bridge. + +* Wed Jan 17 2024 Cui, Lili 12.3.1-23 +- Type: Sync +- DESC: x86: Update model values for Alderlake and Rocketlake. + +* Wed Jan 17 2024 liuhongt 12.3.1-22 +- Type: Sync +- DESC: Refine maskstore patterns with UNSPEC_MASKMOV. + +* Wed Jan 17 2024 liuhongt 12.3.1-21 +- Type: Sync +- DESC: Refine maskloadmn pattern with UNSPEC_MASKLOAD. + +* Wed Jan 17 2024 Hongyu Wang 12.3.1-20 +- Type: Sync +- DESC: i386: Sync tune_string with arch_string for target attribute arch=* + +* Wed Jan 17 2024 liuhongt 12.3.1-19 +- Type: Sync +- DESC: Make option mvzeroupper independent of optimization level. 
+ +* Wed Jan 17 2024 liuhongt 12.3.1-18 +- Type: Sync +- DESC: Explicitly view_convert_expr mask to signed type when folding + pblendvb builtins. + +* Wed Jan 17 2024 liuhongt 12.3.1-17 +- Type: Sync +- DESC: x86: Add a new option -mdaz-ftz to enable FTZ and DAZ flags in MXCSR. + * Mon Sep 11 2023 dingguangya 12.3.1-16 - Type: Sync - DESC: Sync patch from openeuler/gcc