From 544aa777a4b74deb74d9d6efb384a3586d6930b2 Mon Sep 17 00:00:00 2001 From: "Hu, Lin1" Date: Mon, 17 Feb 2025 14:46:48 +0800 Subject: [PATCH] [Sync] Sync patches from openeuler/gcc Sync some patches from openeuler/gcc, mainly patches for x86 bugs fixes. --- 0338-i386-Fix-AVX512-intrin-macro-typo.patch | 268 +++++++++++ ...zero_ps-d-instead-of-_mm_avx512_setz.patch | 46 ++ ...t-Bk-to-define_special_memory_constr.patch | 107 +++++ ..._-move_max-store_max-with-vectorizer.patch | 232 +++++++++ 0342-Fix-testcase-failure.patch | 120 +++++ ...heck-avx-upper-register-for-parallel.patch | 148 ++++++ ...Fix-vfpclassph-non-optimizied-intrin.patch | 134 ++++++ ...as-option-and-reorder-Intel-CPU-marc.patch | 278 +++++++++++ ...-related-to-combine-vpcmpuw-zero_ext.patch | 416 ++++++++++++++++ ...due-to-isa-mismatch-for-the-builtins.patch | 95 ++++ 0348-Fix-ICE-due-to-subreg-us_truncate.patch | 444 ++++++++++++++++++ ...-32-bit-address-to-64-bit-with-optio.patch | 104 ++++ ...ed-operands-2-in-vec_unpacks_hi_v4sf.patch | 37 ++ 0351-GCC13-GCC12-Fix-testcase.patch | 34 ++ gcc.spec | 36 +- 15 files changed, 2498 insertions(+), 1 deletion(-) create mode 100644 0338-i386-Fix-AVX512-intrin-macro-typo.patch create mode 100644 0339-i386-Use-_mm_setzero_ps-d-instead-of-_mm_avx512_setz.patch create mode 100644 0340-Refine-constraint-Bk-to-define_special_memory_constr.patch create mode 100644 0341-Align-ix86_-move_max-store_max-with-vectorizer.patch create mode 100644 0342-Fix-testcase-failure.patch create mode 100644 0343-Check-avx-upper-register-for-parallel.patch create mode 100644 0344-i386-Fix-vfpclassph-non-optimizied-intrin.patch create mode 100644 0345-doc-Add-more-alias-option-and-reorder-Intel-CPU-marc.patch create mode 100644 0346-Refine-splitters-related-to-combine-vpcmpuw-zero_ext.patch create mode 100644 0347-Fix-ICE-due-to-isa-mismatch-for-the-builtins.patch create mode 100644 0348-Fix-ICE-due-to-subreg-us_truncate.patch create mode 100644 0349-i386-Zero-extend-32-bit-address-to-64-bit-with-optio.patch create mode 100644 0350-Fix-uninitialized-operands-2-in-vec_unpacks_hi_v4sf.patch create mode 100644 0351-GCC13-GCC12-Fix-testcase.patch diff --git a/0338-i386-Fix-AVX512-intrin-macro-typo.patch b/0338-i386-Fix-AVX512-intrin-macro-typo.patch new file mode 100644 index 0000000..d2a7072 --- /dev/null +++ b/0338-i386-Fix-AVX512-intrin-macro-typo.patch @@ -0,0 +1,268 @@ +From c511b753a24ba48bbe4cdec5cf98e0f33cdb86ad Mon Sep 17 00:00:00 2001 +From: Haochen Jiang +Date: Thu, 25 Jul 2024 16:12:20 +0800 +Subject: [PATCH 01/14] i386: Fix AVX512 intrin macro typo + +There are several typo in AVX512 intrins macro define. Correct them to solve +errors when compiled with -O0. + +gcc/ChangeLog: + + * config/i386/avx512dqintrin.h + (_mm_mask_fpclass_ss_mask): Correct operand order. + (_mm_mask_fpclass_sd_mask): Ditto. + (_mm256_maskz_reduce_round_ss): Use __builtin_ia32_reducess_mask_round + instead of __builtin_ia32_reducesd_mask_round. + (_mm_reduce_round_sd): Use -1 as mask since it is non-mask. + (_mm_reduce_round_ss): Ditto. + * config/i386/avx512vlbwintrin.h + (_mm256_mask_alignr_epi8): Correct operand usage. + (_mm_mask_alignr_epi8): Ditto. + * config/i386/avx512vlintrin.h (_mm_mask_alignr_epi64): Ditto. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/avx512bw-vpalignr-1b.c: New test. + * gcc.target/i386/avx512dq-vfpclasssd-1b.c: Ditto. + * gcc.target/i386/avx512dq-vfpclassss-1b.c: Ditto. + * gcc.target/i386/avx512dq-vreducesd-1b.c: Ditto. + * gcc.target/i386/avx512dq-vreducess-1b.c: Ditto. + * gcc.target/i386/avx512vl-valignq-1b.c: Ditto. + +(cherry picked from commit 16daeb262af4566e665a941368cb15bc2cba3f07) +--- + gcc/config/i386/avx512dqintrin.h | 16 +++++++++------- + gcc/config/i386/avx512vlbwintrin.h | 4 ++-- + gcc/config/i386/avx512vlintrin.h | 2 +- + .../gcc.target/i386/avx512bw-vpalignr-1b.c | 18 ++++++++++++++++++ + .../gcc.target/i386/avx512dq-vfpclasssd-1b.c | 14 ++++++++++++++ + .../gcc.target/i386/avx512dq-vfpclassss-1b.c | 14 ++++++++++++++ + .../gcc.target/i386/avx512dq-vreducesd-1b.c | 16 ++++++++++++++++ + .../gcc.target/i386/avx512dq-vreducess-1b.c | 16 ++++++++++++++++ + .../gcc.target/i386/avx512vl-valignq-1b.c | 15 +++++++++++++++ + 9 files changed, 105 insertions(+), 10 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-vpalignr-1b.c + create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-vfpclasssd-1b.c + create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-vfpclassss-1b.c + create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-1b.c + create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-vreducess-1b.c + create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-valignq-1b.c + +diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h +index e924250a4ad..4f9451e949b 100644 +--- a/gcc/config/i386/avx512dqintrin.h ++++ b/gcc/config/i386/avx512dqintrin.h +@@ -2800,11 +2800,11 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) + ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X), \ + (int) (C), (__mmask8) (-1))) \ + +-#define _mm_mask_fpclass_ss_mask(X, C, U) \ ++#define _mm_mask_fpclass_ss_mask(U, X, C) \ + ((__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) (__m128) (X), \ + (int) (C), (__mmask8) (U))) + +-#define _mm_mask_fpclass_sd_mask(X, C, U) \ ++#define _mm_mask_fpclass_sd_mask(U, X, C) \ + ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X), \ + (int) (C), (__mmask8) (U))) + +@@ -2839,8 +2839,9 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) + (__mmask8)(U))) + + #define _mm_reduce_round_sd(A, B, C, R) \ +- ((__m128d) __builtin_ia32_reducesd_round ((__v2df)(__m128d)(A), \ +- (__v2df)(__m128d)(B), (int)(C), (__mmask8)(U), (int)(R))) ++ ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \ ++ (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), \ ++ (__mmask8)(-1), (int)(R))) + + #define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \ + ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \ +@@ -2867,8 +2868,9 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) + (__mmask8)(U))) + + #define _mm_reduce_round_ss(A, B, C, R) \ +- ((__m128) __builtin_ia32_reducess_round ((__v4sf)(__m128)(A), \ +- (__v4sf)(__m128)(B), (int)(C), (__mmask8)(U), (int)(R))) ++ ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), \ ++ (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (), \ ++ (__mmask8)(-1), (int)(R))) + + #define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \ + ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), \ +@@ -2876,7 +2878,7 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) + (__mmask8)(U), (int)(R))) + + #define _mm_maskz_reduce_round_ss(U, A, B, C, R) \ +- ((__m128) __builtin_ia32_reducesd_mask_round ((__v4sf)(__m128)(A), \ ++ ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \ + (__mmask8)(U), (int)(R))) + +diff --git a/gcc/config/i386/avx512vlbwintrin.h b/gcc/config/i386/avx512vlbwintrin.h +index 192d54e743f..c918ed520c5 100644 +--- a/gcc/config/i386/avx512vlbwintrin.h ++++ b/gcc/config/i386/avx512vlbwintrin.h +@@ -1839,7 +1839,7 @@ _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B) + #define _mm256_mask_alignr_epi8(W, U, X, Y, N) \ + ((__m256i) __builtin_ia32_palignr256_mask ((__v4di)(__m256i)(X), \ + (__v4di)(__m256i)(Y), (int)((N) * 8), \ +- (__v4di)(__m256i)(X), (__mmask32)(U))) ++ (__v4di)(__m256i)(W), (__mmask32)(U))) + + #define _mm256_mask_srli_epi16(W, U, A, B) \ + ((__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi)(__m256i)(A), \ +@@ -1922,7 +1922,7 @@ _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B) + #define _mm_mask_alignr_epi8(W, U, X, Y, N) \ + ((__m128i) __builtin_ia32_palignr128_mask ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (int)((N) * 8), \ +- (__v2di)(__m128i)(X), (__mmask16)(U))) ++ (__v2di)(__m128i)(W), (__mmask16)(U))) + + #define _mm_maskz_alignr_epi8(U, X, Y, N) \ + ((__m128i) __builtin_ia32_palignr128_mask ((__v2di)(__m128i)(X), \ +diff --git a/gcc/config/i386/avx512vlintrin.h b/gcc/config/i386/avx512vlintrin.h +index 26b286eae6b..c6f3f35a009 100644 +--- a/gcc/config/i386/avx512vlintrin.h ++++ b/gcc/config/i386/avx512vlintrin.h +@@ -13609,7 +13609,7 @@ _mm256_permutex_pd (__m256d __X, const int __M) + + #define _mm_mask_alignr_epi64(W, U, X, Y, C) \ + ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), \ +- (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(X), (__mmask8)-1)) ++ (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(W), (__mmask8)(U))) + + #define _mm_maskz_alignr_epi64(U, X, Y, C) \ + ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), \ +diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-vpalignr-1b.c b/gcc/testsuite/gcc.target/i386/avx512bw-vpalignr-1b.c +new file mode 100644 +index 00000000000..2b42aa90b91 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512bw-vpalignr-1b.c +@@ -0,0 +1,18 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O0 -mavx512bw -mavx512vl" } */ ++/* { dg-final { scan-assembler-times "vpalignr\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ ++/* { dg-final { scan-assembler-times "vpalignr\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ ++ ++#include ++ ++volatile __m256i y; ++volatile __m128i x; ++volatile __mmask32 m2; ++volatile __mmask16 m3; ++ ++void extern ++avx512bw_test (void) ++{ ++ y = _mm256_mask_alignr_epi8 (y, m2, y, y, 10); ++ x = _mm_mask_alignr_epi8 (x, m3, x, x, 10); ++} +diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-vfpclasssd-1b.c b/gcc/testsuite/gcc.target/i386/avx512dq-vfpclasssd-1b.c +new file mode 100644 +index 00000000000..8c7f96fb7a7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512dq-vfpclasssd-1b.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx512dq -O0" } */ ++/* { dg-final { scan-assembler-times "vfpclasssd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n^k\]*%k\[0-7\]\{%k\[0-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ ++ ++#include ++ ++volatile __m128d x128; ++volatile __mmask8 m8; ++ ++void extern ++avx512dq_test (void) ++{ ++ m8 = _mm_mask_fpclass_sd_mask (m8, x128, 13); ++} +diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-vfpclassss-1b.c b/gcc/testsuite/gcc.target/i386/avx512dq-vfpclassss-1b.c +new file mode 100644 +index 00000000000..3196fd60d64 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512dq-vfpclassss-1b.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx512dq -O0" } */ ++/* { dg-final { scan-assembler-times "vfpclassss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n^k\]*%k\[0-7\]\{%k\[0-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ ++ ++#include ++ ++volatile __m128 x128; ++volatile __mmask8 m8; ++ ++void extern ++avx512dq_test (void) ++{ ++ m8 = _mm_mask_fpclass_ss_mask (m8, x128, 13); ++} +diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-1b.c b/gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-1b.c +new file mode 100644 +index 00000000000..9ae8259d373 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-1b.c +@@ -0,0 +1,16 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx512dq -O0" } */ ++/* { dg-final { scan-assembler-times "vreducesd\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ ++ ++#include ++ ++#define IMM 123 ++ ++volatile __m128d x1, x2, xx1, xx2; ++volatile __mmask8 m; ++ ++void extern ++avx512dq_test (void) ++{ ++ xx1 = _mm_reduce_round_sd (xx1, xx2, IMM, _MM_FROUND_NO_EXC); ++} +diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-vreducess-1b.c b/gcc/testsuite/gcc.target/i386/avx512dq-vreducess-1b.c +new file mode 100644 +index 00000000000..47bf48fb617 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512dq-vreducess-1b.c +@@ -0,0 +1,16 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx512dq -O0" } */ ++/* { dg-final { scan-assembler-times "vreducess\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ ++ ++#include ++ ++#define IMM 123 ++ ++volatile __m128 x1, x2, xx1, xx2; ++volatile __mmask8 m; ++ ++void extern ++avx512dq_test (void) ++{ ++ xx1 = _mm_reduce_round_ss (xx1, xx2, IMM, _MM_FROUND_NO_EXC); ++} +diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-valignq-1b.c b/gcc/testsuite/gcc.target/i386/avx512vl-valignq-1b.c +new file mode 100644 +index 00000000000..0ab16b27733 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512vl-valignq-1b.c +@@ -0,0 +1,15 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O0 -mavx512vl" } */ ++/* { dg-final { scan-assembler-times "valignq\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ ++ ++#include ++ ++volatile __m256i y; ++volatile __m128i x; ++volatile __mmask8 m; ++ ++void extern ++avx512vl_test (void) ++{ ++ x = _mm_mask_alignr_epi64 (x, m, x, x, 1); ++} +-- +2.31.1 + diff --git a/0339-i386-Use-_mm_setzero_ps-d-instead-of-_mm_avx512_setz.patch b/0339-i386-Use-_mm_setzero_ps-d-instead-of-_mm_avx512_setz.patch new file mode 100644 index 0000000..0863492 --- /dev/null +++ b/0339-i386-Use-_mm_setzero_ps-d-instead-of-_mm_avx512_setz.patch @@ -0,0 +1,46 @@ +From 22584572ff9a1c3256da20f5438cacc6102fa2ac Mon Sep 17 00:00:00 2001 +From: Haochen Jiang +Date: Mon, 29 Jul 2024 14:10:49 +0800 +Subject: [PATCH 02/14] i386: Use _mm_setzero_ps/d instead of + _mm_avx512_setzero_ps/d for GCC13/12 + +In GCC13/12, there is no _mm_avx512_setzero_ps/d since it is introduced +in GCC14. + +gcc/ChangeLog: + + * config/i386/avx512dqintrin.h (_mm_reduce_round_sd): Use + _mm_setzero_pd instead of _mm_avx512_setzero_pd. + (_mm_reduce_round_ss): Use _mm_setzero_ps instead of + _mm_avx512_setzero_ps. + +(cherry picked from commit 77ad22e4eaa97bb10068c6170f53caca77c99392) (gcc-12) +--- + gcc/config/i386/avx512dqintrin.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h +index 4f9451e949b..e8f8efe3be8 100644 +--- a/gcc/config/i386/avx512dqintrin.h ++++ b/gcc/config/i386/avx512dqintrin.h +@@ -2840,7 +2840,7 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) + + #define _mm_reduce_round_sd(A, B, C, R) \ + ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \ +- (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), \ ++ (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \ + (__mmask8)(-1), (int)(R))) + + #define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \ +@@ -2869,7 +2869,7 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) + + #define _mm_reduce_round_ss(A, B, C, R) \ + ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), \ +- (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (), \ ++ (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \ + (__mmask8)(-1), (int)(R))) + + #define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \ +-- +2.31.1 + diff --git a/0340-Refine-constraint-Bk-to-define_special_memory_constr.patch b/0340-Refine-constraint-Bk-to-define_special_memory_constr.patch new file mode 100644 index 0000000..024209e --- /dev/null +++ b/0340-Refine-constraint-Bk-to-define_special_memory_constr.patch @@ -0,0 +1,107 @@ +From bdc11c30981f8954249aa534c9b5b2ea51efa042 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Wed, 24 Jul 2024 11:29:23 +0800 +Subject: [PATCH 03/14] Refine constraint "Bk" to + define_special_memory_constraint. + +For below pattern, RA may still allocate r162 as v/k register, try to +reload for address with leaq __libc_tsd_CTYPE_B@gottpoff(%rip), %rsi +which result a linker error. + +(set (reg:DI 162) + (mem/u/c:DI + (const:DI (unspec:DI + [(symbol_ref:DI ("a") [flags 0x60] )] + UNSPEC_GOTNTPOFF)) + +Quote from H.J for why linker issue an error. +>What do these do: +> +> leaq __libc_tsd_CTYPE_B@gottpoff(%rip), %rax +> vmovq (%rax), %xmm0 +> +>From x86-64 TLS psABI: +> +>The assembler generates for the x@gottpoff(%rip) expressions a R X86 +>64 GOTTPOFF relocation for the symbol x which requests the linker to +>generate a GOT entry with a R X86 64 TPOFF64 relocation. The offset of +>the GOT entry relative to the end of the instruction is then used in +>the instruction. The R X86 64 TPOFF64 relocation is pro- cessed at +>program startup time by the dynamic linker by looking up the symbol x +>in the modules loaded at that point. The offset is written in the GOT +>entry and later loaded by the addq instruction. +> +>The above code sequence looks wrong to me. + +gcc/ChangeLog: + + PR target/116043 + * config/i386/constraints.md (Bk): Refine to + define_special_memory_constraint. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr116043.c: New test. + +(cherry picked from commit bc1fda00d5f20e2f3e77a50b2822562b6e0040b2) +--- + gcc/config/i386/constraints.md | 2 +- + gcc/testsuite/gcc.target/i386/pr116043.c | 33 ++++++++++++++++++++++++ + 2 files changed, 34 insertions(+), 1 deletion(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr116043.c + +diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md +index 7361687632f..e4b66340589 100644 +--- a/gcc/config/i386/constraints.md ++++ b/gcc/config/i386/constraints.md +@@ -187,7 +187,7 @@ + (and (match_operand 0 "memory_operand") + (match_test "constant_address_p (XEXP (op, 0))"))) + +-(define_memory_constraint "Bk" ++(define_special_memory_constraint "Bk" + "@internal TLS address that allows insn using non-integer registers." + (and (match_operand 0 "memory_operand") + (not (match_test "ix86_gpr_tls_address_pattern_p (op)")))) +diff --git a/gcc/testsuite/gcc.target/i386/pr116043.c b/gcc/testsuite/gcc.target/i386/pr116043.c +new file mode 100644 +index 00000000000..76553496c10 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr116043.c +@@ -0,0 +1,33 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx512bf16 -O3" } */ ++/* { dg-final { scan-assembler-not {(?n)lea.*@gottpoff} } } */ ++ ++extern __thread int a, c, i, j, k, l; ++int *b; ++struct d { ++ int e; ++} f, g; ++char *h; ++ ++void m(struct d *n) { ++ b = &k; ++ for (; n->e; b++, n--) { ++ i = b && a; ++ if (i) ++ j = c; ++ } ++} ++ ++char *o(struct d *n) { ++ for (; n->e;) ++ return h; ++} ++ ++int q() { ++ if (l) ++ return 1; ++ int p = *o(&g); ++ m(&f); ++ m(&g); ++ l = p; ++} +-- +2.31.1 + diff --git a/0341-Align-ix86_-move_max-store_max-with-vectorizer.patch b/0341-Align-ix86_-move_max-store_max-with-vectorizer.patch new file mode 100644 index 0000000..36876a3 --- /dev/null +++ b/0341-Align-ix86_-move_max-store_max-with-vectorizer.patch @@ -0,0 +1,232 @@ +From 002e45c7f46a0f8dd2b5381cd1ee1341f8987fca Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Thu, 15 Aug 2024 12:54:07 +0800 +Subject: [PATCH 04/14] Align ix86_{move_max,store_max} with vectorizer. + +When none of mprefer-vector-width, avx256_optimal/avx128_optimal, +avx256_store_by_pieces/avx512_store_by_pieces is specified, GCC will +set ix86_{move_max,store_max} as max available vector length except +for AVX part. + + if (TARGET_AVX512F_P (opts->x_ix86_isa_flags) + && TARGET_EVEX512_P (opts->x_ix86_isa_flags2)) + opts->x_ix86_move_max = PVW_AVX512; + else + opts->x_ix86_move_max = PVW_AVX128; + +So for -mavx2, vectorizer will choose 256-bit for vectorization, but +128-bit is used for struct copy, there could be a potential STLF issue +due to this "misalign". + +The patch fixes that. + +gcc/ChangeLog: + + * config/i386/i386-options.cc (ix86_option_override_internal): + set ix86_{move_max,store_max} to PVW_AVX256 when TARGET_AVX + instead of PVW_AVX128. + +gcc/testsuite/ChangeLog: + * gcc.target/i386/pieces-memcpy-10.c: Add -mprefer-vector-width=128. + * gcc.target/i386/pieces-memcpy-6.c: Ditto. + * gcc.target/i386/pieces-memset-38.c: Ditto. + * gcc.target/i386/pieces-memset-40.c: Ditto. + * gcc.target/i386/pieces-memset-41.c: Ditto. + * gcc.target/i386/pieces-memset-42.c: Ditto. + * gcc.target/i386/pieces-memset-43.c: Ditto. + * gcc.target/i386/pieces-strcpy-2.c: Ditto. + * gcc.target/i386/pieces-memcpy-22.c: New test. + * gcc.target/i386/pieces-memset-51.c: New test. + * gcc.target/i386/pieces-strcpy-3.c: New test. + +(cherry picked from commit aea374238cec1a1e53fb79575d2f998e16926999) +--- + gcc/config/i386/i386-options.cc | 6 ++++++ + gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c | 12 ++++++++++++ + gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-38.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-40.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-41.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-42.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-43.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-51.c | 12 ++++++++++++ + gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-strcpy-3.c | 15 +++++++++++++++ + 12 files changed, 53 insertions(+), 8 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c + create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-51.c + create mode 100644 gcc/testsuite/gcc.target/i386/pieces-strcpy-3.c + +diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc +index 991661fe4a2..061a1584318 100644 +--- a/gcc/config/i386/i386-options.cc ++++ b/gcc/config/i386/i386-options.cc +@@ -2802,6 +2802,9 @@ ix86_option_override_internal (bool main_args_p, + { + if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)) + opts->x_ix86_move_max = PVW_AVX512; ++ /* Align with vectorizer to avoid potential STLF issue. */ ++ else if (TARGET_AVX_P (opts->x_ix86_isa_flags)) ++ opts->x_ix86_move_max = PVW_AVX256; + else + opts->x_ix86_move_max = PVW_AVX128; + } +@@ -2823,6 +2826,9 @@ ix86_option_override_internal (bool main_args_p, + { + if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)) + opts->x_ix86_store_max = PVW_AVX512; ++ /* Align with vectorizer to avoid potential STLF issue. */ ++ else if (TARGET_AVX_P (opts->x_ix86_isa_flags)) ++ opts->x_ix86_store_max = PVW_AVX256; + else + opts->x_ix86_store_max = PVW_AVX128; + } +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c +index 5faee21f9b9..53ad0b3be44 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ + + extern char *dst, *src; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c +new file mode 100644 +index 00000000000..605b3623ffc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c +@@ -0,0 +1,12 @@ ++/* { dg-do compile { target { ! ia32 } } } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */ ++ ++extern char *dst, *src; ++ ++void ++foo (void) ++{ ++ __builtin_memcpy (dst, src, 33); ++} ++ ++/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */ +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c +index 5f99cc98c47..cfd2a86cf33 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c +@@ -1,5 +1,5 @@ + /* { dg-do compile { target { ! ia32 } } } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ + + extern char *dst, *src; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-38.c b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c +index ed4a24a54fd..ddd194debd5 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-38.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx512f -mavx2 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx512f -mavx2 -mprefer-vector-width=128 -mtune=sandybridge" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-40.c b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c +index 4eda73ead59..9c206465d46 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-40.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx512f -mavx2 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx512f -mavx2 -mprefer-vector-width=128 -mtune=sandybridge" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-41.c b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c +index 93df8101e4d..b0756182e35 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-41.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge -mno-stackrealign" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge -mno-stackrealign" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-42.c b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c +index df0c122aae7..103da699ae5 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-42.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-43.c b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c +index 2f2179c2df9..f1494e17610 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-43.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-51.c b/gcc/testsuite/gcc.target/i386/pieces-memset-51.c +new file mode 100644 +index 00000000000..192ec0d1647 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-51.c +@@ -0,0 +1,12 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */ ++ ++extern char *dst; ++ ++void ++foo (int x) ++{ ++ __builtin_memset (dst, x, 64); ++} ++ ++/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */ +diff --git a/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c b/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c +index 90446edb4f3..9bb94b7419b 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c +@@ -1,5 +1,5 @@ + /* { dg-do compile { target { ! ia32 } } } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ + + extern char *strcpy (char *, const char *); + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-strcpy-3.c b/gcc/testsuite/gcc.target/i386/pieces-strcpy-3.c +new file mode 100644 +index 00000000000..df7571b547f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pieces-strcpy-3.c +@@ -0,0 +1,15 @@ ++/* { dg-do compile { target { ! ia32 } } } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */ ++ ++extern char *strcpy (char *, const char *); ++ ++void ++foo (char *s) ++{ ++ strcpy (s, ++ "1234567890abcdef123456abcdef5678123456abcdef567abcdef678" ++ "1234567"); ++} ++ ++/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\[^\n\]*%ymm" 2 } } */ ++/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */ +-- +2.31.1 + diff --git a/0342-Fix-testcase-failure.patch b/0342-Fix-testcase-failure.patch new file mode 100644 index 0000000..a1d3ed6 --- /dev/null +++ b/0342-Fix-testcase-failure.patch @@ -0,0 +1,120 @@ +From c5c6183ab3132d40fb0f10c57c26c6ef4f69bfda Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Thu, 22 Aug 2024 14:31:40 +0800 +Subject: [PATCH 05/14] Fix testcase failure. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pieces-memcpy-10.c: Use -mmove-max=256 and + -mstore-max=256. + * gcc.target/i386/pieces-memcpy-6.c: Ditto. + * gcc.target/i386/pieces-memset-38.c: Ditto. + * gcc.target/i386/pieces-memset-40.c: Ditto. + * gcc.target/i386/pieces-memset-41.c: Ditto. + * gcc.target/i386/pieces-memset-42.c: Ditto. + * gcc.target/i386/pieces-memset-43.c: Ditto. + * gcc.target/i386/pieces-strcpy-2.c: Ditto. + +(cherry picked from commit ea9c508927ec032c6d67a24df59ffa429e4d3d95) +--- + gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-38.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-40.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-41.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-42.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-43.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c | 2 +- + 8 files changed, 8 insertions(+), 8 deletions(-) + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c +index 53ad0b3be44..78f92ac5197 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 -mtune=sandybridge" } */ + + extern char *dst, *src; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c +index cfd2a86cf33..57b74ae4b23 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c +@@ -1,5 +1,5 @@ + /* { dg-do compile { target { ! ia32 } } } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 -mtune=sandybridge" } */ + + extern char *dst, *src; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-38.c b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c +index ddd194debd5..d9443678735 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-38.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx512f -mavx2 -mprefer-vector-width=128 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx512f -mavx2 -mmove-max=128 -mstore-max=128 -mtune=sandybridge" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-40.c b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c +index 9c206465d46..8ad6ad7e494 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-40.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx512f -mavx2 -mprefer-vector-width=128 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx512f -mavx2 -mmove-max=128 -mstore-max=128 -mtune=sandybridge" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-41.c b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c +index b0756182e35..08fd6e9a927 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-41.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge -mno-stackrealign" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 -mtune=sandybridge -mno-stackrealign" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-42.c b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c +index 103da699ae5..6b73bb256af 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-42.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 -mtune=sandybridge" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-43.c b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c +index f1494e17610..c6c7ff234da 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-43.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 -mtune=sandybridge" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c b/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c +index 9bb94b7419b..40ada119625 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c +@@ -1,5 +1,5 @@ + /* { dg-do compile { target { ! ia32 } } } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 -mtune=sandybridge" } */ + + extern char *strcpy (char *, const char *); + +-- +2.31.1 + diff --git a/0343-Check-avx-upper-register-for-parallel.patch b/0343-Check-avx-upper-register-for-parallel.patch new file mode 100644 index 0000000..6112e87 --- /dev/null +++ b/0343-Check-avx-upper-register-for-parallel.patch @@ -0,0 +1,148 @@ +From 8d7562fe6bd0284dc15cae8f1cd1b59ee940064a Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Thu, 29 Aug 2024 11:39:20 +0800 +Subject: [PATCH 06/14] Check avx upper register for parallel. + +For function arguments/return, when it's BLK mode, it's put in a +parallel with an expr_list, and the expr_list contains the real mode +and registers. +Current ix86_check_avx_upper_register only checked for SSE_REG_P, and +failed to handle that. The patch extend the handle to each subrtx. + +gcc/ChangeLog: + + PR target/116512 + * config/i386/i386.cc (ix86_check_avx_upper_register): Iterate + subrtx to scan for avx upper register. + (ix86_check_avx_upper_stores): Inline old + ix86_check_avx_upper_register. + (ix86_avx_u128_mode_needed): Ditto, and replace + FOR_EACH_SUBRTX with call to new + ix86_check_avx_upper_register. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr116512.c: New test. + +(cherry picked from commit ab214ef734bfc3dcffcf79ff9e1dd651c2b40566) +--- + gcc/config/i386/i386.cc | 36 +++++++++++++++--------- + gcc/testsuite/gcc.target/i386/pr116512.c | 26 +++++++++++++++++ + 2 files changed, 49 insertions(+), 13 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr116512.c + +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc +index ade965927ac..e2743e0bd5c 100644 +--- a/gcc/config/i386/i386.cc ++++ b/gcc/config/i386/i386.cc +@@ -14359,9 +14359,19 @@ ix86_dirflag_mode_needed (rtx_insn *insn) + static bool + ix86_check_avx_upper_register (const_rtx exp) + { +- return (SSE_REG_P (exp) +- && !EXT_REX_SSE_REG_P (exp) +- && GET_MODE_BITSIZE (GET_MODE (exp)) > 128); ++ /* construct_container may return a parallel with expr_list ++ which contains the real reg and mode */ ++ subrtx_iterator::array_type array; ++ FOR_EACH_SUBRTX (iter, array, exp, NONCONST) ++ { ++ const_rtx x = *iter; ++ if (SSE_REG_P (x) ++ && !EXT_REX_SSE_REG_P (x) ++ && GET_MODE_BITSIZE (GET_MODE (x)) > 128) ++ return true; ++ } ++ ++ return false; + } + + /* Check if a 256bit or 512bit AVX register is referenced in stores. */ +@@ -14369,7 +14379,9 @@ ix86_check_avx_upper_register (const_rtx exp) + static void + ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data) + { +- if (ix86_check_avx_upper_register (dest)) ++ if (SSE_REG_P (dest) ++ && !EXT_REX_SSE_REG_P (dest) ++ && GET_MODE_BITSIZE (GET_MODE (dest)) > 128) + { + bool *used = (bool *) data; + *used = true; +@@ -14427,14 +14439,14 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) + return AVX_U128_CLEAN; + } + +- subrtx_iterator::array_type array; +- + rtx set = single_set (insn); + if (set) + { + rtx dest = SET_DEST (set); + rtx src = SET_SRC (set); +- if (ix86_check_avx_upper_register (dest)) ++ if (SSE_REG_P (dest) ++ && !EXT_REX_SSE_REG_P (dest) ++ && GET_MODE_BITSIZE (GET_MODE (dest)) > 128) + { + /* This is an YMM/ZMM load. Return AVX_U128_DIRTY if the + source isn't zero. */ +@@ -14445,9 +14457,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) + } + else + { +- FOR_EACH_SUBRTX (iter, array, src, NONCONST) +- if (ix86_check_avx_upper_register (*iter)) +- return AVX_U128_DIRTY; ++ if (ix86_check_avx_upper_register (src)) ++ return AVX_U128_DIRTY; + } + + /* This isn't YMM/ZMM load/store. */ +@@ -14458,9 +14469,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) + Hardware changes state only when a 256bit register is written to, + but we need to prevent the compiler from moving optimal insertion + point above eventual read from 256bit or 512 bit register. */ +- FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST) +- if (ix86_check_avx_upper_register (*iter)) +- return AVX_U128_DIRTY; ++ if (ix86_check_avx_upper_register (PATTERN (insn))) ++ return AVX_U128_DIRTY; + + return AVX_U128_ANY; + } +diff --git a/gcc/testsuite/gcc.target/i386/pr116512.c b/gcc/testsuite/gcc.target/i386/pr116512.c +new file mode 100644 +index 00000000000..c2bc6c91b64 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr116512.c +@@ -0,0 +1,26 @@ ++/* { dg-do compile } */ ++/* { dg-options "-march=x86-64-v4 -O2" } */ ++/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ ++ ++#include ++ ++struct B { ++ union { ++ __m512 f; ++ __m512i s; ++ }; ++}; ++ ++struct B foo(int n) { ++ struct B res; ++ res.s = _mm512_set1_epi32(n); ++ ++ return res; ++} ++ ++__m512i bar(int n) { ++ struct B res; ++ res.s = _mm512_set1_epi32(n); ++ ++ return res.s; ++} +-- +2.31.1 + diff --git a/0344-i386-Fix-vfpclassph-non-optimizied-intrin.patch b/0344-i386-Fix-vfpclassph-non-optimizied-intrin.patch new file mode 100644 index 0000000..77a60bb --- /dev/null +++ b/0344-i386-Fix-vfpclassph-non-optimizied-intrin.patch @@ -0,0 +1,134 @@ +From 9cb8d824a580c1ea79718300deed14b8ec5cc1e2 Mon Sep 17 00:00:00 2001 +From: Haochen Jiang +Date: Mon, 2 Sep 2024 15:00:22 +0800 +Subject: [PATCH 07/14] i386: Fix vfpclassph non-optimizied intrin + +The intrin for non-optimized got a typo in mask type, which will cause +the high bits of __mmask32 being unexpectedly zeroed. + +The test does not fail under O0 with current 1b since the testcase is +wrong. We need to include avx512-mask-type.h after SIZE is defined, or +it will always be __mmask8. That problem also happened in AVX10.2 testcases. +I will write a seperate patch to fix that. + +gcc/ChangeLog: + + * config/i386/avx512fp16intrin.h + (_mm512_mask_fpclass_ph_mask): Correct mask type to __mmask32. + (_mm512_fpclass_ph_mask): Ditto. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/avx512fp16-vfpclassph-1c.c: New test. + +(cherry picked from commit 6e59b188c4a051d4f2de5220d30681e6963d96c0) (gcc-12) +--- + gcc/config/i386/avx512fp16intrin.h | 4 +- + .../i386/avx512fp16-vfpclassph-1c.c | 77 +++++++++++++++++++ + 2 files changed, 79 insertions(+), 2 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c + +diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h +index b16ccfcb7f1..6330e57ebb8 100644 +--- a/gcc/config/i386/avx512fp16intrin.h ++++ b/gcc/config/i386/avx512fp16intrin.h +@@ -2321,11 +2321,11 @@ _mm512_fpclass_ph_mask (__m512h __A, const int __imm) + #else + #define _mm512_mask_fpclass_ph_mask(u, x, c) \ + ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \ +- (int) (c),(__mmask8)(u))) ++ (int) (c),(__mmask32)(u))) + + #define _mm512_fpclass_ph_mask(x, c) \ + ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \ +- (int) (c),(__mmask8)-1)) ++ (int) (c),(__mmask32)-1)) + #endif /* __OPIMTIZE__ */ + + /* Intrinsics vgetexpph, vgetexpsh. */ +diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c +new file mode 100644 +index 00000000000..4739f1228e3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c +@@ -0,0 +1,77 @@ ++/* { dg-do run } */ ++/* { dg-options "-O0 -mavx512fp16" } */ ++/* { dg-require-effective-target avx512fp16 } */ ++ ++#define AVX512FP16 ++#include "avx512f-helper.h" ++ ++#include ++#include ++#include ++#define SIZE (AVX512F_LEN / 16) ++#include "avx512f-mask-type.h" ++ ++#ifndef __FPCLASSPH__ ++#define __FPCLASSPH__ ++int check_fp_class_hp (_Float16 src, int imm) ++{ ++ int qNaN_res = isnan (src); ++ int sNaN_res = isnan (src); ++ int Pzero_res = (src == 0.0); ++ int Nzero_res = (src == -0.0); ++ int PInf_res = (isinf (src) == 1); ++ int NInf_res = (isinf (src) == -1); ++ int Denorm_res = (fpclassify (src) == FP_SUBNORMAL); ++ int FinNeg_res = __builtin_finite (src) && (src < 0); ++ ++ int result = (((imm & 1) && qNaN_res) ++ || (((imm >> 1) & 1) && Pzero_res) ++ || (((imm >> 2) & 1) && Nzero_res) ++ || (((imm >> 3) & 1) && PInf_res) ++ || (((imm >> 4) & 1) && NInf_res) ++ || (((imm >> 5) & 1) && Denorm_res) ++ || (((imm >> 6) & 1) && FinNeg_res) ++ || (((imm >> 7) & 1) && sNaN_res)); ++ return result; ++} ++#endif ++ ++MASK_TYPE ++CALC (_Float16 *s1, int imm) ++{ ++ int i; ++ MASK_TYPE res = 0; ++ ++ for (i = 0; i < SIZE; i++) ++ if (check_fp_class_hp(s1[i], imm)) ++ res = res | (1 << i); ++ ++ return res; ++} ++ ++void ++TEST (void) ++{ ++ int i; ++ UNION_TYPE (AVX512F_LEN, h) src; ++ MASK_TYPE res1, res2, res_ref = 0; ++ MASK_TYPE mask = MASK_VALUE; ++ ++ src.a[SIZE - 1] = NAN; ++ src.a[SIZE - 2] = 1.0 / 0.0; ++ for (i = 0; i < SIZE - 2; i++) ++ { ++ src.a[i] = -24.43 + 0.6 * i; ++ } ++ ++ res1 = INTRINSIC (_fpclass_ph_mask) (src.x, 0xFF); ++ res2 = INTRINSIC (_mask_fpclass_ph_mask) (mask, src.x, 0xFF); ++ ++ res_ref = CALC (src.a, 0xFF); ++ ++ if (res_ref != res1) ++ abort (); ++ ++ if ((mask & res_ref) != res2) ++ abort (); ++} +-- +2.31.1 + diff --git a/0345-doc-Add-more-alias-option-and-reorder-Intel-CPU-marc.patch b/0345-doc-Add-more-alias-option-and-reorder-Intel-CPU-marc.patch new file mode 100644 index 0000000..b691700 --- /dev/null +++ b/0345-doc-Add-more-alias-option-and-reorder-Intel-CPU-marc.patch @@ -0,0 +1,278 @@ +From 3bdecde26f8edffbd0e981c280e3fb8519709ea4 Mon Sep 17 00:00:00 2001 +From: Haochen Jiang +Date: Wed, 18 Sep 2024 11:20:15 +0800 +Subject: [PATCH 08/14] doc: Add more alias option and reorder Intel CPU -march + documentation + +This patch is backported from GCC15 with some tweaks. + +Since r15-3539, there are requests coming in to add other alias option +documentation. This patch will add all of them, including corei7, corei7-avx, +core-avx-i, core-avx2, atom and slm. + +Also in the patch, I reordered that part of documentation, currently all +the CPUs/products are just all over the place. I regrouped them by +date-to-now products (since the very first CPU to latest Panther Lake), P-core +(since the clients become hybrid cores, starting from Sapphire Rapids) and +E-core (since Bonnell). In GCC14 and eariler GCC, Xeon Phi CPUs are still +there, I put them after E-core CPUs. + +And in the patch, I refined the product names in documentation. + +gcc/ChangeLog: + + * doc/invoke.texi: Add corei7, corei7-avx, core-avx-i, + core-avx2, atom, and slm. Reorder the -march documentation by + splitting them into date-to-now products, P-core, E-core and + Xeon Phi. Refine the product names in documentation. + +(cherry picked from commit 8483527158024d200b3a9e4edecbe188fa22fdaa) +--- + gcc/doc/invoke.texi | 162 +++++++++++++++++++++++--------------------- + 1 file changed, 84 insertions(+), 78 deletions(-) + +diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +index 109858f7666..90073ac9832 100644 +--- a/gcc/doc/invoke.texi ++++ b/gcc/doc/invoke.texi +@@ -31441,6 +31441,7 @@ Intel Core 2 CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, CX16, + SAHF and FXSR instruction set support. + + @item nehalem ++@itemx corei7 + Intel Nehalem CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, + SSE4.1, SSE4.2, POPCNT, CX16, SAHF and FXSR instruction set support. + +@@ -31449,17 +31450,20 @@ Intel Westmere CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, + SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR and PCLMUL instruction set support. + + @item sandybridge ++@itemx corei7-avx + Intel Sandy Bridge CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, + SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE and PCLMUL instruction set + support. + + @item ivybridge ++@itemx core-avx-i + Intel Ivy Bridge CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, + SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND + and F16C instruction set support. + + @item haswell +-Intel Haswell CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, ++@itemx core-avx2 ++Intel Haswell CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, + SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, + F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE and HLE instruction set support. + +@@ -31475,47 +31479,6 @@ SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, + F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, + CLFLUSHOPT, XSAVEC, XSAVES and SGX instruction set support. + +-@item bonnell +-Intel Bonnell CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3 and SSSE3 +-instruction set support. +- +-@item silvermont +-Intel Silvermont CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, +-SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, PCLMUL, PREFETCHW and RDRND +-instruction set support. +- +-@item goldmont +-Intel Goldmont CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, +-SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, PCLMUL, PREFETCHW, RDRND, AES, SHA, +-RDSEED, XSAVE, XSAVEC, XSAVES, XSAVEOPT, CLFLUSHOPT and FSGSBASE instruction +-set support. +- +-@item goldmont-plus +-Intel Goldmont Plus CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, +-SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, PCLMUL, PREFETCHW, RDRND, AES, +-SHA, RDSEED, XSAVE, XSAVEC, XSAVES, XSAVEOPT, CLFLUSHOPT, FSGSBASE, PTWRITE, +-RDPID and SGX instruction set support. +- +-@item tremont +-Intel Tremont CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, +-SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, PCLMUL, PREFETCHW, RDRND, AES, SHA, +-RDSEED, XSAVE, XSAVEC, XSAVES, XSAVEOPT, CLFLUSHOPT, FSGSBASE, PTWRITE, RDPID, +-SGX, CLWB, GFNI-SSE, MOVDIRI, MOVDIR64B, CLDEMOTE and WAITPKG instruction set +-support. +- +-@item knl +-Intel Knight's Landing CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, +-SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, +-RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, +-AVX512PF, AVX512ER, AVX512F, AVX512CD and PREFETCHWT1 instruction set support. +- +-@item knm +-Intel Knights Mill CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, +-SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, +-RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, +-AVX512PF, AVX512ER, AVX512F, AVX512CD and PREFETCHWT1, AVX5124VNNIW, +-AVX5124FMAPS and AVX512VPOPCNTDQ instruction set support. +- + @item skylake-avx512 + Intel Skylake Server CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, + SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, +@@ -31523,16 +31486,30 @@ RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, + AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, CLWB, AVX512VL, AVX512BW, + AVX512DQ and AVX512CD instruction set support. + ++@item cascadelake ++Intel Cascade Lake CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, ++SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, ++F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, ++CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, CLWB, AVX512VL, AVX512BW, AVX512DQ, ++AVX512CD and AVX512VNNI instruction set support. ++ + @item cannonlake +-Intel Cannonlake Server CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, ++Intel Cannon Lake Server CPU with 64-bit extensions, MMX, SSE, SSE2, + SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, + FSGSBASE, RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, + PREFETCHW, AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, + AVX512DQ, AVX512CD, PKU, AVX512VBMI, AVX512IFMA and SHA instruction set + support. + ++@item cooperlake ++Intel Cooper Lake CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, ++SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, ++F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, ++CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, CLWB, AVX512VL, AVX512BW, AVX512DQ, ++AVX512CD, AVX512VNNI and AVX512BF16 instruction set support. ++ + @item icelake-client +-Intel Icelake Client CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, ++Intel Ice Lake Client CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, + SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, + RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, + AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ, +@@ -31540,7 +31517,7 @@ AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2 + , VPCLMULQDQ, AVX512BITALG, RDPID and AVX512VPOPCNTDQ instruction set support. + + @item icelake-server +-Intel Icelake Server CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, ++Intel Ice Lake Server CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, + SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, + RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, + AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ, +@@ -31548,55 +31525,84 @@ AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2 + , VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, PCONFIG, WBNOINVD and CLWB + instruction set support. + +-@item cascadelake +-Intel Cascadelake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, +-SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, +-F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, +-CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, CLWB, AVX512VL, AVX512BW, AVX512DQ, +-AVX512CD and AVX512VNNI instruction set support. +- +-@item cooperlake +-Intel cooperlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, ++@item tigerlake ++Intel Tiger Lake CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, + SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, + F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, +-CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, CLWB, AVX512VL, AVX512BW, AVX512DQ, +-AVX512CD, AVX512VNNI and AVX512BF16 instruction set support. ++CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ, ++AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2, ++VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, MOVDIRI, MOVDIR64B, CLWB, ++AVX512VP2INTERSECT and KEYLOCKER instruction set support. + +-@item tigerlake +-Intel Tigerlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, ++@item rocketlake ++Intel Rocket Lake CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, + SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, + F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, +-CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ, AVX512CD ++CLFLUSHOPT, XSAVEC, XSAVES, AVX512F, AVX512VL, AVX512BW, AVX512DQ, AVX512CD, + PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2, +-VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, MOVDIRI, MOVDIR64B, CLWB, +-AVX512VP2INTERSECT and KEYLOCKER instruction set support. ++VPCLMULQDQ, AVX512BITALG, RDPID and AVX512VPOPCNTDQ instruction set support. ++ ++@item alderlake ++Intel Alder Lake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, ++SSE4.1, SSE4.2, POPCNT, AES, PREFETCHW, PCLMUL, RDRND, XSAVE, XSAVEC, XSAVES, ++XSAVEOPT, FSGSBASE, PTWRITE, RDPID, SGX, GFNI-SSE, CLWB, MOVDIRI, MOVDIR64B, ++CLDEMOTE, WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA, LZCNT, PCONFIG, PKU, ++VAES, VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL and AVX-VNNI instruction set ++support. + + @item sapphirerapids +-Intel sapphirerapids CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, +-SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, +-RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, +-AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ, ++Intel Sapphire Rapids CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, ++SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, ++F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, ++CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ, + AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2, + VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, PCONFIG, WBNOINVD, CLWB, + MOVDIRI, MOVDIR64B, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG, SERIALIZE, TSXLDTRK, + UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512-FP16 and AVX512BF16 + instruction set support. + +-@item alderlake +-Intel Alderlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, +-SSE4.1, SSE4.2, POPCNT, AES, PREFETCHW, PCLMUL, RDRND, XSAVE, XSAVEC, XSAVES, +-XSAVEOPT, FSGSBASE, PTWRITE, RDPID, SGX, GFNI-SSE, CLWB, MOVDIRI, MOVDIR64B, +-CLDEMOTE, WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA, LZCNT, PCONFIG, PKU, +-VAES, VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL and AVX-VNNI instruction set ++@item bonnell ++@itemx atom ++Intel Bonnell CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3 and SSSE3 ++instruction set support. ++ ++@item silvermont ++@itemx slm ++Intel Silvermont CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, ++SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, PCLMUL, PREFETCHW and RDRND ++instruction set support. ++ ++@item goldmont ++Intel Goldmont CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, ++SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, PCLMUL, PREFETCHW, RDRND, AES, SHA, ++RDSEED, XSAVE, XSAVEC, XSAVES, XSAVEOPT, CLFLUSHOPT and FSGSBASE instruction ++set support. ++ ++@item goldmont-plus ++Intel Goldmont Plus CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, ++SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, PCLMUL, PREFETCHW, RDRND, AES, ++SHA, RDSEED, XSAVE, XSAVEC, XSAVES, XSAVEOPT, CLFLUSHOPT, FSGSBASE, PTWRITE, ++RDPID and SGX instruction set support. ++ ++@item tremont ++Intel Tremont CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, ++SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, PCLMUL, PREFETCHW, RDRND, AES, SHA, ++RDSEED, XSAVE, XSAVEC, XSAVES, XSAVEOPT, CLFLUSHOPT, FSGSBASE, PTWRITE, RDPID, ++SGX, CLWB, GFNI-SSE, MOVDIRI, MOVDIR64B, CLDEMOTE and WAITPKG instruction set + support. + +-@item rocketlake +-Intel Rocketlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3 +-, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, +-F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, +-CLFLUSHOPT, XSAVEC, XSAVES, AVX512F, AVX512VL, AVX512BW, AVX512DQ, AVX512CD +-PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2, +-VPCLMULQDQ, AVX512BITALG, RDPID and AVX512VPOPCNTDQ instruction set support. ++@item knl ++Intel Knights Landing CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, ++SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, ++RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, ++AVX512PF, AVX512ER, AVX512F, AVX512CD and PREFETCHWT1 instruction set support. ++ ++@item knm ++Intel Knights Mill CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, ++SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, ++RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, ++AVX512PF, AVX512ER, AVX512F, AVX512CD and PREFETCHWT1, AVX5124VNNIW, ++AVX5124FMAPS and AVX512VPOPCNTDQ instruction set support. + + @item graniterapids + Intel graniterapids CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, +-- +2.31.1 + diff --git a/0346-Refine-splitters-related-to-combine-vpcmpuw-zero_ext.patch b/0346-Refine-splitters-related-to-combine-vpcmpuw-zero_ext.patch new file mode 100644 index 0000000..ede3e8f --- /dev/null +++ b/0346-Refine-splitters-related-to-combine-vpcmpuw-zero_ext.patch @@ -0,0 +1,416 @@ +From 45b9ba8abc0379b5f83e9209325f9c9a31faec8e Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Wed, 16 Oct 2024 13:43:48 +0800 +Subject: [PATCH 09/14] Refine splitters related to "combine vpcmpuw + + zero_extend to vpcmpuw" + +r12-6103-g1a7ce8570997eb combines vpcmpuw + zero_extend to vpcmpuw +with the pre_reload splitter, but the splitter transforms the +zero_extend into a subreg which make reload think the upper part is +garbage, it's not correct. + +The patch adjusts the zero_extend define_insn_and_split to +define_insn to keep zero_extend. + +gcc/ChangeLog: + + PR target/117159 + * config/i386/sse.md + (*_cmp3_zero_extend): + Change from define_insn_and_split to define_insn. + (*_cmp3_zero_extend): + Ditto. + (*_ucmp3_zero_extend): + Ditto. + (*_ucmp3_zero_extend): + Ditto. + (*_cmp3_zero_extend_2): + Split to the zero_extend pattern. + (*_cmp3_zero_extend_2): + Ditto. + (*_ucmp3_zero_extend_2): + Ditto. + (*_ucmp3_zero_extend_2): + Ditto. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr117159.c: New test. + * gcc.target/i386/avx512bw-pr103750-1.c: Remove xfail. + * gcc.target/i386/avx512bw-pr103750-2.c: Remove xfail. + +(cherry picked from commit 5259d3927c1c8e3a15b4b844adef59b48c241233) +--- + gcc/config/i386/sse.md | 196 +++++++----------- + .../gcc.target/i386/avx512bw-pr103750-1.c | 3 +- + .../gcc.target/i386/avx512bw-pr103750-2.c | 3 +- + gcc/testsuite/gcc.target/i386/pr117159.c | 42 ++++ + 4 files changed, 124 insertions(+), 120 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr117159.c + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index 23b858ab21c..7d01c00a848 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -3724,32 +3724,19 @@ + + ;; Since vpcmpd implicitly clear the upper bits of dest, transform + ;; vpcmpd + zero_extend to vpcmpd since the instruction +-(define_insn_and_split "*_cmp3_zero_extend" +- [(set (match_operand:SWI248x 0 "register_operand") ++(define_insn "*_cmp3_zero_extend" ++ [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec: +- [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand") +- (match_operand:V48H_AVX512VL 2 "nonimmediate_operand") ++ [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand" "v") ++ (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "const_0_to_7_operand" "n")] + UNSPEC_PCMP)))] + "TARGET_AVX512F + && (!VALID_MASK_AVX512BW_MODE (mode) || TARGET_AVX512BW) +- && ix86_pre_reload_split () + && (GET_MODE_NUNITS (mode) + < GET_MODE_PRECISION (mode))" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (unspec: +- [(match_dup 1) +- (match_dup 2) +- (match_dup 3)] +- UNSPEC_PCMP))] +-{ +- operands[1] = force_reg (mode, operands[1]); +- operands[0] = lowpart_subreg (mode, +- operands[0], mode); +-} ++ "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") +@@ -3777,21 +3764,22 @@ + "#" + "&& 1" + [(set (match_dup 0) +- (unspec: +- [(match_dup 1) +- (match_dup 2) +- (match_dup 3)] +- UNSPEC_PCMP)) +- (set (match_dup 4) (match_dup 0))] ++ (zero_extend:SWI248x ++ (unspec: ++ [(match_dup 1) ++ (match_dup 2) ++ (match_dup 3)] ++ UNSPEC_PCMP))) ++ (set (match_dup 4) (match_dup 5))] + { +- operands[1] = force_reg (mode, operands[1]); +- operands[0] = lowpart_subreg (mode, ++ operands[5] = lowpart_subreg (mode, + operands[0], mode); +-} +- [(set_attr "type" "ssecmp") +- (set_attr "length_immediate" "1") +- (set_attr "prefix" "evex") +- (set_attr "mode" "")]) ++ if (SUBREG_P (operands[5])) ++ { ++ SUBREG_PROMOTED_VAR_P (operands[5]) = 1; ++ SUBREG_PROMOTED_SET (operands[5], 1); ++ } ++}) + + (define_insn_and_split "*_cmp3" + [(set (match_operand: 0 "register_operand") +@@ -3826,31 +3814,18 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + +-(define_insn_and_split "*_cmp3_zero_extend" +- [(set (match_operand:SWI248x 0 "register_operand") ++(define_insn "*_cmp3_zero_extend" ++ [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec: +- [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand") +- (match_operand:VI12_AVX512VL 2 "nonimmediate_operand") +- (match_operand:SI 3 "const_0_to_7_operand")] ++ [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "v") ++ (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") ++ (match_operand:SI 3 "const_0_to_7_operand" "n")] + UNSPEC_PCMP)))] + "TARGET_AVX512BW +- && ix86_pre_reload_split () +- && (GET_MODE_NUNITS (mode) +- < GET_MODE_PRECISION (mode))" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (unspec: +- [(match_dup 1) +- (match_dup 2) +- (match_dup 3)] +- UNSPEC_PCMP))] +-{ +- operands[1] = force_reg (mode, operands[1]); +- operands[0] = lowpart_subreg (mode, +- operands[0], mode); +-} ++ && (GET_MODE_NUNITS (mode) ++ < GET_MODE_PRECISION (mode))" ++ "vpcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") +@@ -3877,16 +3852,21 @@ + "#" + "&& 1" + [(set (match_dup 0) +- (unspec: +- [(match_dup 1) +- (match_dup 2) +- (match_dup 3)] +- UNSPEC_PCMP)) +- (set (match_dup 4) (match_dup 0))] ++ (zero_extend:SWI248x ++ (unspec: ++ [(match_dup 1) ++ (match_dup 2) ++ (match_dup 3)] ++ UNSPEC_PCMP))) ++ (set (match_dup 4) (match_dup 5))] + { +- operands[1] = force_reg (mode, operands[1]); +- operands[0] = lowpart_subreg (mode, ++ operands[5] = lowpart_subreg (mode, + operands[0], mode); ++ if (SUBREG_P (operands[5])) ++ { ++ SUBREG_PROMOTED_VAR_P (operands[5]) = 1; ++ SUBREG_PROMOTED_SET (operands[5], 1); ++ } + } + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") +@@ -3945,31 +3925,18 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + +-(define_insn_and_split "*_ucmp3_zero_extend" +- [(set (match_operand:SWI248x 0 "register_operand") ++(define_insn "*_ucmp3_zero_extend" ++ [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec: +- [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand") +- (match_operand:VI12_AVX512VL 2 "nonimmediate_operand") +- (match_operand:SI 3 "const_0_to_7_operand")] ++ [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "v") ++ (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") ++ (match_operand:SI 3 "const_0_to_7_operand" "n")] + UNSPEC_UNSIGNED_PCMP)))] + "TARGET_AVX512BW +- && ix86_pre_reload_split () + && (GET_MODE_NUNITS (mode) + < GET_MODE_PRECISION (mode))" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (unspec: +- [(match_dup 1) +- (match_dup 2) +- (match_dup 3)] +- UNSPEC_UNSIGNED_PCMP))] +-{ +- operands[1] = force_reg (mode, operands[1]); +- operands[0] = lowpart_subreg (mode, +- operands[0], mode); +-} ++ "vpcmpu\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") +@@ -3997,16 +3964,21 @@ + "#" + "&& 1" + [(set (match_dup 0) +- (unspec: +- [(match_dup 1) +- (match_dup 2) +- (match_dup 3)] +- UNSPEC_UNSIGNED_PCMP)) +- (set (match_dup 4) (match_dup 0))] +-{ +- operands[1] = force_reg (mode, operands[1]); +- operands[0] = lowpart_subreg (mode, ++ (zero_extend:SWI248x ++ (unspec: ++ [(match_dup 1) ++ (match_dup 2) ++ (match_dup 3)] ++ UNSPEC_UNSIGNED_PCMP))) ++ (set (match_dup 4) (match_dup 5))] ++{ ++ operands[5] = lowpart_subreg (mode, + operands[0], mode); ++ if (SUBREG_P (operands[5])) ++ { ++ SUBREG_PROMOTED_VAR_P (operands[5]) = 1; ++ SUBREG_PROMOTED_SET (operands[5], 1); ++ } + } + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") +@@ -4043,32 +4015,19 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + +-(define_insn_and_split "*_ucmp3_zero_extend" +- [(set (match_operand:SWI248x 0 "register_operand") ++(define_insn "*_ucmp3_zero_extend" ++ [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec: +- [(match_operand:VI48_AVX512VL 1 "nonimmediate_operand") +- (match_operand:VI48_AVX512VL 2 "nonimmediate_operand") +- (match_operand:SI 3 "const_0_to_7_operand")] ++ [(match_operand:VI48_AVX512VL 1 "nonimmediate_operand" "v") ++ (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") ++ (match_operand:SI 3 "const_0_to_7_operand" "n")] + UNSPEC_UNSIGNED_PCMP)))] + "TARGET_AVX512F + && (!VALID_MASK_AVX512BW_MODE (mode) || TARGET_AVX512BW) +- && ix86_pre_reload_split () + && (GET_MODE_NUNITS (mode) + < GET_MODE_PRECISION (mode))" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (unspec: +- [(match_dup 1) +- (match_dup 2) +- (match_dup 3)] +- UNSPEC_UNSIGNED_PCMP))] +-{ +- operands[1] = force_reg (mode, operands[1]); +- operands[0] = lowpart_subreg (mode, +- operands[0], mode); +-} ++ "vpcmpu\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") +@@ -4096,16 +4055,21 @@ + "#" + "&& 1" + [(set (match_dup 0) +- (unspec: +- [(match_dup 1) +- (match_dup 2) +- (match_dup 3)] +- UNSPEC_UNSIGNED_PCMP)) +- (set (match_dup 4) (match_dup 0))] +-{ +- operands[1] = force_reg (mode, operands[1]); +- operands[0] = lowpart_subreg (mode, ++ (zero_extend:SWI248x ++ (unspec: ++ [(match_dup 1) ++ (match_dup 2) ++ (match_dup 3)] ++ UNSPEC_UNSIGNED_PCMP))) ++ (set (match_dup 4) (match_dup 5))] ++{ ++ operands[5] = lowpart_subreg (mode, + operands[0], mode); ++ if (SUBREG_P (operands[5])) ++ { ++ SUBREG_PROMOTED_VAR_P (operands[5]) = 1; ++ SUBREG_PROMOTED_SET (operands[5], 1); ++ } + } + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") +diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-1.c +index b1165f069bb..e7d6183232b 100644 +--- a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-1.c ++++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-1.c +@@ -1,8 +1,7 @@ + /* PR target/103750 */ + /* { dg-do compile } */ + /* { dg-options "-O2 -mavx512bw -mavx512vl" } */ +-/* { dg-final { scan-assembler-not "kmov" { xfail ia32 } } } */ +-/* xfail need to be fixed. */ ++/* { dg-final { scan-assembler-not "kmov" } } */ + + #include + extern __m128i* pi128; +diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c +index 7303f5403ba..3392e193222 100644 +--- a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c ++++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c +@@ -1,8 +1,7 @@ + /* PR target/103750 */ + /* { dg-do compile } */ + /* { dg-options "-O2 -mavx512dq -mavx512bw -mavx512vl" } */ +-/* { dg-final { scan-assembler-not "kmov" { xfail ia32 } } } */ +-/* xfail need to be fixed. */ ++/* { dg-final { scan-assembler-not "kmov" } } */ + + #include + extern __m128i* pi128; +diff --git a/gcc/testsuite/gcc.target/i386/pr117159.c b/gcc/testsuite/gcc.target/i386/pr117159.c +new file mode 100644 +index 00000000000..b67d682ecef +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr117159.c +@@ -0,0 +1,42 @@ ++/* { dg-do run } */ ++/* { dg-options "-Os -mavx512bw" } */ ++/* { dg-require-effective-target avx512bw } */ ++ ++typedef __attribute__((__vector_size__ (4))) unsigned char W; ++typedef __attribute__((__vector_size__ (64))) int V; ++typedef __attribute__((__vector_size__ (64))) long long Vq; ++ ++W w; ++V v; ++Vq vq; ++ ++static inline W ++foo (short m) ++{ ++ unsigned k = __builtin_ia32_pcmpgtq512_mask ((Vq) { }, vq, m); ++ W r = (W) k + w; ++ return r; ++} ++ ++static inline W ++foo1 (short m) ++{ ++ unsigned k = __builtin_ia32_pcmpgtd512_mask ((V) {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, v, m); ++ W r = (W) k + w; ++ return r; ++} ++ ++int ++main () ++{ ++ if (!__builtin_cpu_supports ("avx512bw")) ++ return 0; ++ W y = foo1 (65535); ++ if (!y[0] || !y[1] || y[2] || y[3]) ++ __builtin_abort(); ++ W x = foo (65535); ++ if (x[0] || x[1] || x[2] || x[3]) ++ __builtin_abort(); ++ ++ return 0; ++} +-- +2.31.1 + diff --git a/0347-Fix-ICE-due-to-isa-mismatch-for-the-builtins.patch b/0347-Fix-ICE-due-to-isa-mismatch-for-the-builtins.patch new file mode 100644 index 0000000..7488bb8 --- /dev/null +++ b/0347-Fix-ICE-due-to-isa-mismatch-for-the-builtins.patch @@ -0,0 +1,95 @@ +From 6e8e4260a895298d27660783aaac45bb2e13941f Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Tue, 22 Oct 2024 01:54:40 -0700 +Subject: [PATCH 10/14] Fix ICE due to isa mismatch for the builtins. + +gcc/ChangeLog: + + PR target/117240 + * config/i386/i386-builtin.def: Add avx/avx512f to vaes + ymm/zmm builtins. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr117240_avx.c: New test. + * gcc.target/i386/pr117240_avx512f.c: New test. + +(cherry picked from commit 403e361d5aa620e77c9832578b2409a0fdd79d96) +--- + gcc/config/i386/i386-builtin.def | 24 +++++++++---------- + gcc/testsuite/gcc.target/i386/pr117240_avx.c | 10 ++++++++ + .../gcc.target/i386/pr117240_avx512f.c | 10 ++++++++ + 3 files changed, 32 insertions(+), 12 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr117240_avx.c + create mode 100644 gcc/testsuite/gcc.target/i386/pr117240_avx512f.c + +diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def +index d3ab21eeac3..d1713b70e07 100644 +--- a/gcc/config/i386/i386-builtin.def ++++ b/gcc/config/i386/i386-builtin.def +@@ -2751,18 +2751,18 @@ BDESC (0, OPTION_MASK_ISA2_AVX5124VNNIW, CODE_FOR_avx5124vnniw_vp4dpwssds_mask, + BDESC (0, OPTION_MASK_ISA2_RDPID, CODE_FOR_rdpid, "__builtin_ia32_rdpid", IX86_BUILTIN_RDPID, UNKNOWN, (int) UNSIGNED_FTYPE_VOID) + + /* VAES. */ +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v16qi, "__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, "__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v64qi, "__builtin_ia32_vaesdec_v64qi", IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v16qi, "__builtin_ia32_vaesdeclast_v16qi", IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, "__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v64qi, "__builtin_ia32_vaesdeclast_v64qi", IX86_BUILTIN_VAESDECLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v16qi, "__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, "__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v64qi, "__builtin_ia32_vaesenc_v64qi", IX86_BUILTIN_VAESENC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaesenclast_v16qi", IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) ++BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v16qi, "__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) ++BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, "__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) ++BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v64qi, "__builtin_ia32_vaesdec_v64qi", IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) ++BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v16qi, "__builtin_ia32_vaesdeclast_v16qi", IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) ++BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, "__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) ++BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v64qi, "__builtin_ia32_vaesdeclast_v64qi", IX86_BUILTIN_VAESDECLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) ++BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v16qi, "__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) ++BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, "__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) ++BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v64qi, "__builtin_ia32_vaesenc_v64qi", IX86_BUILTIN_VAESENC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) ++BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaesenclast_v16qi", IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) ++BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) ++BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) + + /* BF16 */ + BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi, "__builtin_ia32_cvtne2ps2bf16_v32hi", IX86_BUILTIN_CVTNE2PS2HI16_V32HI, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF) +diff --git a/gcc/testsuite/gcc.target/i386/pr117240_avx.c b/gcc/testsuite/gcc.target/i386/pr117240_avx.c +new file mode 100644 +index 00000000000..24a97a9f74c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr117240_avx.c +@@ -0,0 +1,10 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mvaes -mno-xsave -Wno-psabi -Wno-implicit-function-declaration" } */ ++ ++typedef __attribute__((__vector_size__(32))) char V; ++ ++V ++foo(V v) ++{ ++ return __builtin_ia32_vaesenc_v32qi(v, v);/* { dg-error "incompatible types when returning" } */ ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr117240_avx512f.c b/gcc/testsuite/gcc.target/i386/pr117240_avx512f.c +new file mode 100644 +index 00000000000..1e7b5a88d7a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr117240_avx512f.c +@@ -0,0 +1,10 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mvaes -mno-xsave -Wno-psabi -Wno-implicit-function-declaration" } */ ++ ++typedef __attribute__((__vector_size__(64))) char V; ++ ++V ++foo(V v) ++{ ++ return __builtin_ia32_vaesenc_v64qi(v, v);/* { dg-error "incompatible types when returning" } */ ++} +-- +2.31.1 + diff --git a/0348-Fix-ICE-due-to-subreg-us_truncate.patch b/0348-Fix-ICE-due-to-subreg-us_truncate.patch new file mode 100644 index 0000000..ca4578d --- /dev/null +++ b/0348-Fix-ICE-due-to-subreg-us_truncate.patch @@ -0,0 +1,444 @@ +From b817cad361eac1754101114b9beb7abc1aab3435 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Tue, 29 Oct 2024 02:09:39 -0700 +Subject: [PATCH 11/14] Fix ICE due to subreg:us_truncate. + +Force_operand issues an ICE when input +is (subreg:DI (us_truncate:V8QI)), it's probably because it's an +invalid rtx, So refine backend patterns for that. + +gcc/ChangeLog: + + PR target/117318 + * config/i386/sse.md (*avx512vl_v2div2qi2_mask_store_1): + Rename to .. + (avx512vl_v2div2qi2_mask_store_1): .. this. + (avx512vl_v2div2qi2_mask_store_2): Change to + define_expand. + (*avx512vl_v4qi2_mask_store_1): Rename to .. + (avx512vl_v4qi2_mask_store_1): .. this. + (avx512vl_v4qi2_mask_store_2): Change to + define_expand. + (*avx512vl_v8qi2_mask_store_1): Rename to .. + (avx512vl_v8qi2_mask_store_1): .. this. + (avx512vl_v8qi2_mask_store_2): Change to + define_expand. + (*avx512vl_v4hi2_mask_store_1): Rename to .. + (avx512vl_v4hi2_mask_store_1): .. this. + (avx512vl_v4hi2_mask_store_2): Change to + define_expand. + (*avx512vl_v2div2hi2_mask_store_1): Rename to .. + (avx512vl_v2div2hi2_mask_store_1): .. this. + (avx512vl_v2div2hi2_mask_store_2): Change to + define_expand. + (*avx512vl_v2div2si2_mask_store_1): Rename to .. + (avx512vl_v2div2si2_mask_store_1): .. this. + (avx512vl_v2div2si2_mask_store_2): Change to + define_expand. + (*avx512f_v8div16qi2_mask_store_1): Rename to .. + (avx512f_v8div16qi2_mask_store_1): .. this. + (avx512f_v8div16qi2_mask_store_2): Change to + define_expand. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr117318.c: New test. + +(cherry picked from commit bc0eeccf27a084461a2d5661e23468350acb43da) +--- + gcc/config/i386/sse.md | 268 +++++++++-------------- + gcc/testsuite/gcc.target/i386/pr117318.c | 12 + + 2 files changed, 110 insertions(+), 170 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr117318.c + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index 7d01c00a848..a7d61bf0044 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -13850,7 +13850,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn "*avx512vl_v2div2qi2_mask_store_1" ++(define_insn "avx512vl_v2div2qi2_mask_store_1" + [(set (match_operand:V2QI 0 "memory_operand" "=m") + (vec_merge:V2QI + (any_truncate:V2QI +@@ -13864,28 +13864,19 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn_and_split "avx512vl_v2div2qi2_mask_store_2" +- [(set (match_operand:HI 0 "memory_operand") +- (subreg:HI +- (vec_merge:V2QI +- (any_truncate:V2QI +- (match_operand:V2DI 1 "register_operand")) +- (vec_select:V2QI +- (subreg:V4QI +- (vec_concat:V2HI +- (match_dup 0) +- (const_int 0)) 0) +- (parallel [(const_int 0) (const_int 1)])) +- (match_operand:QI 2 "register_operand")) 0))] +- "TARGET_AVX512VL && ix86_pre_reload_split ()" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (vec_merge:V2QI +- (any_truncate:V2QI (match_dup 1)) +- (match_dup 0) +- (match_dup 2)))] +- "operands[0] = adjust_address_nv (operands[0], V2QImode, 0);") ++(define_expand "avx512vl_v2div2qi2_mask_store_2" ++ [(match_operand:HI 0 "memory_operand") ++ (any_truncate:V2QI ++ (match_operand:V2DI 1 "register_operand")) ++ (match_operand:QI 2 "register_operand")] ++ "TARGET_AVX512VL" ++{ ++ operands[0] = adjust_address_nv (operands[0], V2QImode, 0); ++ emit_insn (gen_avx512vl_v2div2qi2_mask_store_1 (operands[0], ++ operands[1], ++ operands[2])); ++ DONE; ++}) + + (define_insn "*avx512vl_v4qi2_store_1" + [(set (match_operand:V4QI 0 "memory_operand" "=m") +@@ -13954,7 +13945,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn "*avx512vl_v4qi2_mask_store_1" ++(define_insn "avx512vl_v4qi2_mask_store_1" + [(set (match_operand:V4QI 0 "memory_operand" "=m") + (vec_merge:V4QI + (any_truncate:V4QI +@@ -13968,29 +13959,19 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn_and_split "avx512vl_v4qi2_mask_store_2" +- [(set (match_operand:SI 0 "memory_operand") +- (subreg:SI +- (vec_merge:V4QI +- (any_truncate:V4QI +- (match_operand:VI4_128_8_256 1 "register_operand")) +- (vec_select:V4QI +- (subreg:V8QI +- (vec_concat:V2SI +- (match_dup 0) +- (const_int 0)) 0) +- (parallel [(const_int 0) (const_int 1) +- (const_int 2) (const_int 3)])) +- (match_operand:QI 2 "register_operand")) 0))] +- "TARGET_AVX512VL && ix86_pre_reload_split ()" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (vec_merge:V4QI +- (any_truncate:V4QI (match_dup 1)) +- (match_dup 0) +- (match_dup 2)))] +- "operands[0] = adjust_address_nv (operands[0], V4QImode, 0);") ++(define_expand "avx512vl_v4qi2_mask_store_2" ++ [(match_operand:SI 0 "memory_operand") ++ (any_truncate:V4QI ++ (match_operand:VI4_128_8_256 1 "register_operand")) ++ (match_operand:QI 2 "register_operand")] ++ "TARGET_AVX512VL" ++{ ++ operands[0] = adjust_address_nv (operands[0], V4QImode, 0); ++ emit_insn (gen_avx512vl_v4qi2_mask_store_1 (operands[0], ++ operands[1], ++ operands[2])); ++ DONE; ++}) + + (define_mode_iterator VI2_128_BW_4_256 + [(V8HI "TARGET_AVX512BW") V8SI]) +@@ -14062,7 +14043,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn "*avx512vl_v8qi2_mask_store_1" ++(define_insn "avx512vl_v8qi2_mask_store_1" + [(set (match_operand:V8QI 0 "memory_operand" "=m") + (vec_merge:V8QI + (any_truncate:V8QI +@@ -14076,31 +14057,19 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn_and_split "avx512vl_v8qi2_mask_store_2" +- [(set (match_operand:DI 0 "memory_operand") +- (subreg:DI +- (vec_merge:V8QI +- (any_truncate:V8QI +- (match_operand:VI2_128_BW_4_256 1 "register_operand")) +- (vec_select:V8QI +- (subreg:V16QI +- (vec_concat:V2DI +- (match_dup 0) +- (const_int 0)) 0) +- (parallel [(const_int 0) (const_int 1) +- (const_int 2) (const_int 3) +- (const_int 4) (const_int 5) +- (const_int 6) (const_int 7)])) +- (match_operand:QI 2 "register_operand")) 0))] +- "TARGET_AVX512VL && ix86_pre_reload_split ()" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (vec_merge:V8QI +- (any_truncate:V8QI (match_dup 1)) +- (match_dup 0) +- (match_dup 2)))] +- "operands[0] = adjust_address_nv (operands[0], V8QImode, 0);") ++(define_expand "avx512vl_v8qi2_mask_store_2" ++ [(match_operand:DI 0 "memory_operand") ++ (any_truncate:V8QI ++ (match_operand:VI2_128_BW_4_256 1 "register_operand")) ++ (match_operand:QI 2 "register_operand")] ++ "TARGET_AVX512VL" ++{ ++ operands[0] = adjust_address_nv (operands[0], V8QImode, 0); ++ emit_insn (gen_avx512vl_v8qi2_mask_store_1 (operands[0], ++ operands[1], ++ operands[2])); ++ DONE; ++}) + + (define_mode_iterator PMOV_SRC_MODE_4 [V4DI V2DI V4SI]) + (define_mode_attr pmov_dst_4 +@@ -14222,7 +14191,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn "*avx512vl_v4hi2_mask_store_1" ++(define_insn "avx512vl_v4hi2_mask_store_1" + [(set (match_operand:V4HI 0 "memory_operand" "=m") + (vec_merge:V4HI + (any_truncate:V4HI +@@ -14240,30 +14209,19 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn_and_split "avx512vl_v4hi2_mask_store_2" +- [(set (match_operand:DI 0 "memory_operand") +- (subreg:DI +- (vec_merge:V4HI +- (any_truncate:V4HI +- (match_operand:VI4_128_8_256 1 "register_operand")) +- (vec_select:V4HI +- (subreg:V8HI +- (vec_concat:V2DI +- (match_dup 0) +- (const_int 0)) 0) +- (parallel [(const_int 0) (const_int 1) +- (const_int 2) (const_int 3)])) +- (match_operand:QI 2 "register_operand")) 0))] +- "TARGET_AVX512VL && ix86_pre_reload_split ()" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (vec_merge:V4HI +- (any_truncate:V4HI (match_dup 1)) +- (match_dup 0) +- (match_dup 2)))] +- "operands[0] = adjust_address_nv (operands[0], V4HImode, 0);") +- ++(define_expand "avx512vl_v4hi2_mask_store_2" ++ [(match_operand:DI 0 "memory_operand") ++ (any_truncate:V4HI ++ (match_operand:VI4_128_8_256 1 "register_operand")) ++ (match_operand:QI 2 "register_operand")] ++ "TARGET_AVX512VL" ++{ ++ operands[0] = adjust_address_nv (operands[0], V4HImode, 0); ++ emit_insn (gen_avx512vl_v4hi2_mask_store_1 (operands[0], ++ operands[1], ++ operands[2])); ++ DONE; ++}) + + (define_insn "*avx512vl_v2div2hi2_store_1" + [(set (match_operand:V2HI 0 "memory_operand" "=m") +@@ -14324,7 +14282,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn "*avx512vl_v2div2hi2_mask_store_1" ++(define_insn "avx512vl_v2div2hi2_mask_store_1" + [(set (match_operand:V2HI 0 "memory_operand" "=m") + (vec_merge:V2HI + (any_truncate:V2HI +@@ -14338,28 +14296,19 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn_and_split "avx512vl_v2div2hi2_mask_store_2" +- [(set (match_operand:SI 0 "memory_operand") +- (subreg:SI +- (vec_merge:V2HI +- (any_truncate:V2HI +- (match_operand:V2DI 1 "register_operand")) +- (vec_select:V2HI +- (subreg:V4HI +- (vec_concat:V2SI +- (match_dup 0) +- (const_int 0)) 0) +- (parallel [(const_int 0) (const_int 1)])) +- (match_operand:QI 2 "register_operand")) 0))] +- "TARGET_AVX512VL && ix86_pre_reload_split ()" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (vec_merge:V2HI +- (any_truncate:V2HI (match_dup 1)) +- (match_dup 0) +- (match_dup 2)))] +- "operands[0] = adjust_address_nv (operands[0], V2HImode, 0);") ++(define_expand "avx512vl_v2div2hi2_mask_store_2" ++ [(match_operand:SI 0 "memory_operand") ++ (any_truncate:V2HI ++ (match_operand:V2DI 1 "register_operand")) ++ (match_operand:QI 2 "register_operand")] ++ "TARGET_AVX512VL" ++{ ++ operands[0] = adjust_address_nv (operands[0], V2HImode, 0); ++ emit_insn (gen_avx512vl_v2div2hi2_mask_store_1 (operands[0], ++ operands[1], ++ operands[2])); ++ DONE; ++}) + + (define_expand "truncv2div2si2" + [(set (match_operand:V2SI 0 "register_operand") +@@ -14467,7 +14416,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn "*avx512vl_v2div2si2_mask_store_1" ++(define_insn "avx512vl_v2div2si2_mask_store_1" + [(set (match_operand:V2SI 0 "memory_operand" "=m") + (vec_merge:V2SI + (any_truncate:V2SI +@@ -14481,28 +14430,19 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn_and_split "avx512vl_v2div2si2_mask_store_2" +- [(set (match_operand:DI 0 "memory_operand") +- (subreg:DI +- (vec_merge:V2SI +- (any_truncate:V2SI +- (match_operand:V2DI 1 "register_operand")) +- (vec_select:V2SI +- (subreg:V4SI +- (vec_concat:V2DI +- (match_dup 0) +- (const_int 0)) 0) +- (parallel [(const_int 0) (const_int 1)])) +- (match_operand:QI 2 "register_operand")) 0))] +- "TARGET_AVX512VL && ix86_pre_reload_split ()" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (vec_merge:V2SI +- (any_truncate:V2SI (match_dup 1)) +- (match_dup 0) +- (match_dup 2)))] +- "operands[0] = adjust_address_nv (operands[0], V2SImode, 0);") ++(define_expand "avx512vl_v2div2si2_mask_store_2" ++ [(match_operand:DI 0 "memory_operand") ++ (any_truncate:V2SI ++ (match_operand:V2DI 1 "register_operand")) ++ (match_operand:QI 2 "register_operand")] ++ "TARGET_AVX512VL" ++{ ++ operands[0] = adjust_address_nv (operands[0], V2SImode, 0); ++ emit_insn (gen_avx512vl_v2div2si2_mask_store_1 (operands[0], ++ operands[1], ++ operands[2])); ++ DONE; ++}) + + (define_expand "truncv8div8qi2" + [(set (match_operand:V8QI 0 "register_operand") +@@ -14601,7 +14541,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn "*avx512f_v8div16qi2_mask_store_1" ++(define_insn "avx512f_v8div16qi2_mask_store_1" + [(set (match_operand:V8QI 0 "memory_operand" "=m") + (vec_merge:V8QI + (any_truncate:V8QI +@@ -14615,31 +14555,19 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn_and_split "avx512f_v8div16qi2_mask_store_2" +- [(set (match_operand:DI 0 "memory_operand") +- (subreg:DI +- (vec_merge:V8QI +- (any_truncate:V8QI +- (match_operand:V8DI 1 "register_operand")) +- (vec_select:V8QI +- (subreg:V16QI +- (vec_concat:V2DI +- (match_dup 0) +- (const_int 0)) 0) +- (parallel [(const_int 0) (const_int 1) +- (const_int 2) (const_int 3) +- (const_int 4) (const_int 5) +- (const_int 6) (const_int 7)])) +- (match_operand:QI 2 "register_operand")) 0))] +- "TARGET_AVX512F && ix86_pre_reload_split ()" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (vec_merge:V8QI +- (any_truncate:V8QI (match_dup 1)) +- (match_dup 0) +- (match_dup 2)))] +- "operands[0] = adjust_address_nv (operands[0], V8QImode, 0);") ++(define_expand "avx512f_v8div16qi2_mask_store_2" ++ [(match_operand:DI 0 "memory_operand") ++ (any_truncate:V8QI ++ (match_operand:V8DI 1 "register_operand")) ++ (match_operand:QI 2 "register_operand")] ++ "TARGET_AVX512F" ++{ ++ operands[0] = adjust_address_nv (operands[0], V8QImode, 0); ++ emit_insn (gen_avx512f_v8div16qi2_mask_store_1 (operands[0], ++ operands[1], ++ operands[2])); ++ DONE; ++}) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; +diff --git a/gcc/testsuite/gcc.target/i386/pr117318.c b/gcc/testsuite/gcc.target/i386/pr117318.c +new file mode 100644 +index 00000000000..3d316ad04cf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr117318.c +@@ -0,0 +1,12 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx512f -O" } */ ++ ++typedef __attribute__((__vector_size__ (64))) long long V; ++unsigned long long x; ++ ++unsigned long long ++foo() ++{ ++ __builtin_ia32_pmovusqb512mem_mask (&x, (V){8000000000000000}, 255); ++ return x; ++} +-- +2.31.1 + diff --git a/0349-i386-Zero-extend-32-bit-address-to-64-bit-with-optio.patch b/0349-i386-Zero-extend-32-bit-address-to-64-bit-with-optio.patch new file mode 100644 index 0000000..6fd40e3 --- /dev/null +++ b/0349-i386-Zero-extend-32-bit-address-to-64-bit-with-optio.patch @@ -0,0 +1,104 @@ +From a23da8527f517e439ab634d9995b44740cbbc05b Mon Sep 17 00:00:00 2001 +From: "Hu, Lin1" +Date: Wed, 6 Nov 2024 15:42:13 +0800 +Subject: [PATCH 12/14] i386: Zero extend 32-bit address to 64-bit with option + -mx32 -maddress-mode=long. [PR 117418] + +-maddress-mode=long let Pmode = DI_mode, so zero extend 32-bit address to +64-bit and uses a 64-bit register as a pointer for avoid raise an ICE. + +gcc/ChangeLog: + + PR target/117418 + * config/i386/i386-expand.cc (ix86_expand_builtin): Convert + pointer's mode according to Pmode. + +gcc/testsuite/ChangeLog: + + PR target/117418 + * gcc.target/i386/pr117418-1.c: New test. + +(cherry picked from commit 2272cd2508f1854c880082f792de15e76ec09a99) +--- + gcc/config/i386/i386-expand.cc | 12 +++++++++++ + gcc/testsuite/gcc.target/i386/pr117418-1.c | 24 ++++++++++++++++++++++ + 2 files changed, 36 insertions(+) + create mode 100644 gcc/testsuite/gcc.target/i386/pr117418-1.c + +diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc +index bc2e6198007..52e32749928 100644 +--- a/gcc/config/i386/i386-expand.cc ++++ b/gcc/config/i386/i386-expand.cc +@@ -12730,6 +12730,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + ++ if (GET_MODE (op1) != Pmode) ++ op1 = convert_to_mode (Pmode, op1, 1); ++ + if (!address_operand (op2, VOIDmode)) + { + op2 = convert_memory_address (Pmode, op2); +@@ -12765,6 +12768,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, + emit_label (ok_label); + emit_insn (gen_rtx_SET (target, pat)); + ++ if (GET_MODE (op0) != Pmode) ++ op0 = convert_to_mode (Pmode, op0, 1); ++ + for (i = 0; i < 8; i++) + { + op = gen_rtx_MEM (V2DImode, +@@ -12789,6 +12795,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, + if (!REG_P (op0)) + op0 = copy_to_mode_reg (SImode, op0); + ++ if (GET_MODE (op2) != Pmode) ++ op2 = convert_to_mode (Pmode, op2, 1); ++ + op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0)); + emit_move_insn (op, op1); + +@@ -12826,6 +12835,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, + if (!REG_P (op0)) + op0 = copy_to_mode_reg (SImode, op0); + ++ if (GET_MODE (op3) != Pmode) ++ op3 = convert_to_mode (Pmode, op3, 1); ++ + /* Force to use xmm0, xmm1 for keylow, keyhi*/ + op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0)); + emit_move_insn (op, op1); +diff --git a/gcc/testsuite/gcc.target/i386/pr117418-1.c b/gcc/testsuite/gcc.target/i386/pr117418-1.c +new file mode 100644 +index 00000000000..4839b139b79 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr117418-1.c +@@ -0,0 +1,24 @@ ++/* PR target/117418 */ ++/* { dg-do compile { target { ! ia32 } } } */ ++/* { dg-options "-maddress-mode=long -mwidekl -mx32" } */ ++/* { dg-require-effective-target maybe_x32 } */ ++/* { dg-final { scan-assembler-times "aesdec128kl" 1 } } */ ++/* { dg-final { scan-assembler-times "aesdec256kl" 1 } } */ ++/* { dg-final { scan-assembler-times "aesenc128kl" 1 } } */ ++/* { dg-final { scan-assembler-times "aesenc256kl" 1 } } */ ++/* { dg-final { scan-assembler-times "encodekey128" 1 } } */ ++/* { dg-final { scan-assembler-times "encodekey256" 1 } } */ ++ ++typedef __attribute__((__vector_size__(16))) long long V; ++V a; ++ ++void ++foo() ++{ ++ __builtin_ia32_aesdec128kl_u8 (&a, a, &a); ++ __builtin_ia32_aesdec256kl_u8 (&a, a, &a); ++ __builtin_ia32_aesenc128kl_u8 (&a, a, &a); ++ __builtin_ia32_aesenc256kl_u8 (&a, a, &a); ++ __builtin_ia32_encodekey128_u32 (0, a, &a); ++ __builtin_ia32_encodekey256_u32 (0, a, a, &a); ++} +-- +2.31.1 + diff --git a/0350-Fix-uninitialized-operands-2-in-vec_unpacks_hi_v4sf.patch b/0350-Fix-uninitialized-operands-2-in-vec_unpacks_hi_v4sf.patch new file mode 100644 index 0000000..b7e6f15 --- /dev/null +++ b/0350-Fix-uninitialized-operands-2-in-vec_unpacks_hi_v4sf.patch @@ -0,0 +1,37 @@ +From 94ab46d9486464b3158a9fc9bc1c463dd4d62d72 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Thu, 21 Nov 2024 23:57:38 -0800 +Subject: [PATCH 13/14] Fix uninitialized operands[2] in vec_unpacks_hi_v4sf. + +It could cause weired spill in RA when register pressure is high. + +gcc/ChangeLog: + + PR target/117562 + * config/i386/sse.md (vec_unpacks_hi_v4sf): Initialize + operands[2] with CONST0_RTX. + +(cherry picked from commit ba4cf2e296d8d5950c3d356fa6b6efcad00d0189) +--- + gcc/config/i386/sse.md | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index a7d61bf0044..c6a8e301145 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -9126,7 +9126,10 @@ + (match_dup 2) + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_SSE2" +- "operands[2] = gen_reg_rtx (V4SFmode);") ++{ ++ operands[2] = gen_reg_rtx (V4SFmode); ++ emit_move_insn (operands[2], CONST0_RTX (V4SFmode)); ++}) + + (define_expand "vec_unpacks_hi_v8sf" + [(set (match_dup 2) +-- +2.31.1 + diff --git a/0351-GCC13-GCC12-Fix-testcase.patch b/0351-GCC13-GCC12-Fix-testcase.patch new file mode 100644 index 0000000..4d746f2 --- /dev/null +++ b/0351-GCC13-GCC12-Fix-testcase.patch @@ -0,0 +1,34 @@ +From 6494fd12311561551bcf8d8529108fba79c45fd7 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Tue, 22 Oct 2024 11:24:23 +0800 +Subject: [PATCH 14/14] [GCC13/GCC12] Fix testcase. + +The optimization relies on other patterns which are only available at +GCC14 and obove, so restore the xfail for GCC13/12 branch. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/avx512bw-pr103750-2.c: Add xfail for ia32. + +(cherry picked from commit 8b43518a01cbbbafe042b85a48fa09a32948380a) +--- + gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c +index 3392e193222..7303f5403ba 100644 +--- a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c ++++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c +@@ -1,7 +1,8 @@ + /* PR target/103750 */ + /* { dg-do compile } */ + /* { dg-options "-O2 -mavx512dq -mavx512bw -mavx512vl" } */ +-/* { dg-final { scan-assembler-not "kmov" } } */ ++/* { dg-final { scan-assembler-not "kmov" { xfail ia32 } } } */ ++/* xfail need to be fixed. */ + + #include + extern __m128i* pi128; +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index 1a8e1e7..8d2f2c1 100644 --- a/gcc.spec +++ b/gcc.spec @@ -2,7 +2,7 @@ %global gcc_major 12 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 69 +%global gcc_release 70 %global _unpackaged_files_terminate_build 0 %global _performance_build 1 @@ -443,6 +443,20 @@ Patch334: 0334-Dont-use-local_detect_cpu-when-cross-build.patch Patch335: 0335-fix-costs-for-hip09.patch Patch336: 0336-sfc-Add-struct-static-field-compression-optimization.patch Patch337: 0337-Reduce-ipa-inline-warning-output.patch +Patch338: 0338-i386-Fix-AVX512-intrin-macro-typo.patch +Patch339: 0339-i386-Use-_mm_setzero_ps-d-instead-of-_mm_avx512_setz.patch +Patch340: 0340-Refine-constraint-Bk-to-define_special_memory_constr.patch +Patch341: 0341-Align-ix86_-move_max-store_max-with-vectorizer.patch +Patch342: 0342-Fix-testcase-failure.patch +Patch343: 0343-Check-avx-upper-register-for-parallel.patch +Patch344: 0344-i386-Fix-vfpclassph-non-optimizied-intrin.patch +Patch345: 0345-doc-Add-more-alias-option-and-reorder-Intel-CPU-marc.patch +Patch346: 0346-Refine-splitters-related-to-combine-vpcmpuw-zero_ext.patch +Patch347: 0347-Fix-ICE-due-to-isa-mismatch-for-the-builtins.patch +Patch348: 0348-Fix-ICE-due-to-subreg-us_truncate.patch +Patch349: 0349-i386-Zero-extend-32-bit-address-to-64-bit-with-optio.patch +Patch350: 0350-Fix-uninitialized-operands-2-in-vec_unpacks_hi_v4sf.patch +Patch351: 0351-GCC13-GCC12-Fix-testcase.patch # Part 1001-1999 %ifarch sw_64 @@ -1566,6 +1580,20 @@ not stable, so plugins must be rebuilt any time GCC is updated. %patch -P335 -p1 %patch -P336 -p1 %patch -P337 -p1 +%patch -P338 -p1 +%patch -P339 -p1 +%patch -P340 -p1 +%patch -P341 -p1 +%patch -P342 -p1 +%patch -P343 -p1 +%patch -P344 -p1 +%patch -P345 -p1 +%patch -P346 -p1 +%patch -P347 -p1 +%patch -P348 -p1 +%patch -P349 -p1 +%patch -P350 -p1 +%patch -P351 -p1 %ifarch sw_64 %patch -P1001 -p1 @@ -4193,6 +4221,12 @@ end %doc rpm.doc/changelogs/libcc1/ChangeLog* %changelog +* Mon Feb 17 2025 Hu,Lin1 - 12.3.1-70 +- Type:Sync +- ID:NA +- SUG:NA +- DESC:Sync patches from openeuler/gcc. + * Wed Feb 12 2025 huang-xiaoquan - 12.3.1-69 - Type:Bugfix - ID:NA -- Gitee