From 910c9e3e36557af913251a7ac2ae4e0a3f189750 Mon Sep 17 00:00:00 2001
From: Xie jiamei
Date: Wed, 18 Jun 2025 12:57:41 +0800
Subject: [PATCH] [Feature] Add support for Hygon processors

Signed-off-by: Xie jiamei
---
 ...chitecture-type-for-Hygon-processors.patch | 69 +++++
 ...formation-support-for-Hygon-processo.patch | 97 +++++++
 ...ix-Zen3-Zen4-ERMS-selection-BZ-30994.patch | 152 ++++++++++
 ...rate-non-temporal-tunable-for-memset.patch | 215 ++++++++++++++
 ...-non-temporal-memset-tunable-for-AMD.patch | 47 ++++
 ...r-x86_memset_non_temporal_threshold-.patch | 41 +++
 ...on-temporal-memset-on-Skylake-Server.patch | 262 ++++++++++++++++++
 ...n_Temporal_Memset-to-control-non-tem.patch | 95 +++++++
 ...OSB-tunable-to-allow-NT-memset-witho.patch | 221 +++++++++++++++
 ...temporal-memset-for-Hygon-processors.patch | 92 ++++++
 ...e-memset-perf-with-non-temporal-stor.patch | 248 +++++++++++++++++
 glibc.spec | 27 +-
 12 files changed, 1560 insertions(+), 6 deletions(-)
 create mode 100644 0001-x86-Add-new-architecture-type-for-Hygon-processors.patch
 create mode 100644 0002-x86-Add-cache-information-support-for-Hygon-processo.patch
 create mode 100644 0003-x86-Fix-Zen3-Zen4-ERMS-selection-BZ-30994.patch
 create mode 100644 0004-x86-Add-seperate-non-temporal-tunable-for-memset.patch
 create mode 100644 0005-x86-Enable-non-temporal-memset-tunable-for-AMD.patch
 create mode 100644 0006-x86-Fix-value-for-x86_memset_non_temporal_threshold-.patch
 create mode 100644 0007-x86-Disable-non-temporal-memset-on-Skylake-Server.patch
 create mode 100644 0008-x86-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch
 create mode 100644 0009-x86-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch
 create mode 100644 0010-x86-Enable-non-temporal-memset-for-Hygon-processors.patch
 create mode 100644 0011-x86-Improve-large-memset-perf-with-non-temporal-stor.patch

diff --git a/0001-x86-Add-new-architecture-type-for-Hygon-processors.patch b/0001-x86-Add-new-architecture-type-for-Hygon-processors.patch
new file mode 100644
index 0000000..45165a0
--- /dev/null
+++ b/0001-x86-Add-new-architecture-type-for-Hygon-processors.patch
@@ -0,0 +1,69 @@
+From 5e1c0ca3aacae059f1971162d5a9f586265e72d3 Mon Sep 17 00:00:00 2001
+From: Feifei Wang
+Date: Mon, 19 Aug 2024 14:57:53 +0800
+Subject: [PATCH 01/11] x86: Add new architecture type for Hygon processors
+
+Add a new architecture type arch_kind_hygon to split the Hygon branch
+from AMD. This makes it easier for Hygon processors to use settings
+suited to their own characteristics.
+
+Signed-off-by: Feifei Wang
+Reviewed-by: Jing Li
+Reviewed-by: H.J. Lu
+---
+ sysdeps/x86/cpu-features.c | 19 ++++++++++++++++---
+ sysdeps/x86/include/cpu-features.h | 1 +
+ 2 files changed, 17 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index f752ebd24d..c4dd85145e 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -851,9 +851,8 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
+ cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
+ |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
+ }
+- /* This spells out "AuthenticAMD" or "HygonGenuine". */
+- else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
+- || (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e))
++ /* This spells out "AuthenticAMD". */
++ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
+ {
+ unsigned int extended_model;
+
+@@ -963,6 +962,20 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
+ }
+ }
+ }
++ /* This spells out "HygonGenuine". */
++ else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)
++ {
++ unsigned int extended_model;
++
++ kind = arch_kind_hygon;
++
++ get_common_indices (cpu_features, &family, &model, &extended_model,
++ &stepping);
++
++ get_extended_indices (cpu_features);
++
++ update_active (cpu_features);
++ }
+ else
+ {
+ kind = arch_kind_other;
+diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
+index eb30d342a6..594feeb2f4 100644
+--- a/sysdeps/x86/include/cpu-features.h
++++ b/sysdeps/x86/include/cpu-features.h
+@@ -856,6 +856,7 @@ enum cpu_features_kind
+ arch_kind_intel,
+ arch_kind_amd,
+ arch_kind_zhaoxin,
++ arch_kind_hygon,
+ arch_kind_other
+ };
+
+--
+2.17.1
+
diff --git a/0002-x86-Add-cache-information-support-for-Hygon-processo.patch b/0002-x86-Add-cache-information-support-for-Hygon-processo.patch
new file mode 100644
index 0000000..69affcf
--- /dev/null
+++ b/0002-x86-Add-cache-information-support-for-Hygon-processo.patch
@@ -0,0 +1,97 @@
+From 8a14035ba9574c26b6d504fda99e630a8bcaf5c7 Mon Sep 17 00:00:00 2001
+From: Feifei Wang
+Date: Mon, 19 Aug 2024 14:57:54 +0800
+Subject: [PATCH 02/11] x86: Add cache information support for Hygon processors
+
+Add a Hygon branch to the dl_init_cacheinfo function to initialize
+cache size variables for Hygon processors. In addition, add a
+handle_hygon() function to get cache information.
+
+Signed-off-by: Feifei Wang
+Reviewed-by: Jing Li
+Reviewed-by: H.J. Lu
+---
+ sysdeps/x86/dl-cacheinfo.h | 60 ++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 60 insertions(+)
+
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index 2c5b6d6980..8f141e7634 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
++++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -406,6 +406,48 @@ handle_zhaoxin (int name)
+ return 0;
+ }
+
++static long int __attribute__ ((noinline))
++handle_hygon (int name)
++{
++ unsigned int eax;
++ unsigned int ebx;
++ unsigned int ecx;
++ unsigned int edx;
++ unsigned int count = 0x1;
++
++ if (name >= _SC_LEVEL3_CACHE_SIZE)
++ count = 0x3;
++ else if (name >= _SC_LEVEL2_CACHE_SIZE)
++ count = 0x2;
++ else if (name >= _SC_LEVEL1_DCACHE_SIZE)
++ count = 0x0;
++
++ /* Use __cpuid__ '0x8000_001D' to compute cache details. */
++ __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
++
++ switch (name)
++ {
++ case _SC_LEVEL1_ICACHE_ASSOC:
++ case _SC_LEVEL1_DCACHE_ASSOC:
++ case _SC_LEVEL2_CACHE_ASSOC:
++ case _SC_LEVEL3_CACHE_ASSOC:
++ return ((ebx >> 22) & 0x3ff) + 1;
++ case _SC_LEVEL1_ICACHE_LINESIZE:
++ case _SC_LEVEL1_DCACHE_LINESIZE:
++ case _SC_LEVEL2_CACHE_LINESIZE:
++ case _SC_LEVEL3_CACHE_LINESIZE:
++ return (ebx & 0xfff) + 1;
++ case _SC_LEVEL1_ICACHE_SIZE:
++ case _SC_LEVEL1_DCACHE_SIZE:
++ case _SC_LEVEL2_CACHE_SIZE:
++ case _SC_LEVEL3_CACHE_SIZE:
++ return (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1);
++ default:
++ __builtin_unreachable ();
++ }
++ return -1;
++}
++
+ static void
+ get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
+ long int core)
+@@ -724,6 +766,24 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ if (shared_per_thread <= 0)
+ shared_per_thread = shared;
+ }
++ else if (cpu_features->basic.kind == arch_kind_hygon)
++ {
++ data = handle_hygon (_SC_LEVEL1_DCACHE_SIZE);
++ shared = handle_hygon (_SC_LEVEL3_CACHE_SIZE);
++ shared_per_thread = shared;
++
++ level1_icache_size = handle_hygon (_SC_LEVEL1_ICACHE_SIZE);
++ level1_icache_linesize = handle_hygon (_SC_LEVEL1_ICACHE_LINESIZE);
++ level1_dcache_size = data;
++ level1_dcache_assoc = handle_hygon (_SC_LEVEL1_DCACHE_ASSOC);
++ level1_dcache_linesize = handle_hygon (_SC_LEVEL1_DCACHE_LINESIZE);
++ level2_cache_size = handle_hygon (_SC_LEVEL2_CACHE_SIZE);
++ level2_cache_assoc = handle_hygon (_SC_LEVEL2_CACHE_ASSOC);
++ level2_cache_linesize = handle_hygon (_SC_LEVEL2_CACHE_LINESIZE);
++ level3_cache_size = shared;
++ level3_cache_assoc = handle_hygon (_SC_LEVEL3_CACHE_ASSOC);
++ level3_cache_linesize = handle_hygon (_SC_LEVEL3_CACHE_LINESIZE);
++ }
+
+ cpu_features->level1_icache_size = level1_icache_size;
+ cpu_features->level1_icache_linesize = level1_icache_linesize;
+--
+2.17.1
+
diff --git a/0003-x86-Fix-Zen3-Zen4-ERMS-selection-BZ-30994.patch b/0003-x86-Fix-Zen3-Zen4-ERMS-selection-BZ-30994.patch
new file mode 100644
index 0000000..7ef1413
--- /dev/null
+++ b/0003-x86-Fix-Zen3-Zen4-ERMS-selection-BZ-30994.patch
@@ -0,0 +1,152 @@
+From 9aae95da1d78018c1961c60ee80c95192131020c Mon Sep 17 00:00:00 2001
+From: Adhemerval Zanella
+Date: Thu, 8 Feb 2024 10:08:38 -0300
+Subject: [PATCH 03/11] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
+
+The REP MOVSB usage on memcpy/memmove does not show much performance
+improvement on Zen3/Zen4 cores compared to the vectorized loops. Also,
+as reported in BZ 30994, if the source is aligned and the destination
+is not, the performance can be 20x slower.
+
+The performance difference is noticeable with small buffer sizes, closer
+to the lower bound limits where memcpy/memmove starts to use ERMS. The
+performance of REP MOVSB is similar to the vectorized loops at the
+size limit (the L2 cache). Also, there is no drawback to multiple cores
+sharing the cache.
+
+Checked on x86_64-linux-gnu on Zen3.
+Reviewed-by: H.J. Lu
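+
+As a rough sketch (illustrative pseudocode only, with made-up helper
+names; the real logic lives in sysdeps/x86/dl-cacheinfo.h and the
+memmove dispatch code), the size-based selection after this change
+behaves as:
+
+  if (size >= rep_movsb_threshold && size < rep_movsb_stop_threshold)
+    /* REP MOVSB window.  On AMD Zen3+ rep_movsb_threshold is now
+       raised to non_temporal_threshold, so this window closes and the
+       vectorized loops are used instead.  */
+    rep_movsb (dst, src, size);
+  else if (size >= rep_movsb_stop_threshold)
+    memmove_non_temporal (dst, src, size);
+  else
+    memmove_vectorized (dst, src, size);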
+---
+ sysdeps/x86/dl-cacheinfo.h | 45 +++++++++++++++++++++-----------------
+ 1 file changed, 25 insertions(+), 20 deletions(-)
+
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index 8f141e7634..a8ad6cbefa 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
++++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -673,7 +673,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ long int data = -1;
+ long int shared = -1;
+ long int shared_per_thread = -1;
+- long int core = -1;
+ unsigned int threads = 0;
+ unsigned long int level1_icache_size = -1;
+ unsigned long int level1_icache_linesize = -1;
+@@ -691,7 +690,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ if (cpu_features->basic.kind == arch_kind_intel)
+ {
+ data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
+- core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
+ shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
+ shared_per_thread = shared;
+
+@@ -704,7 +702,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
+ level1_dcache_linesize
+ = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
+- level2_cache_size = core;
++ level2_cache_size
++ = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
+ level2_cache_assoc
+ = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
+ level2_cache_linesize
+@@ -717,12 +716,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ level4_cache_size
+ = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
+
+- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
++ get_common_cache_info (&shared, &shared_per_thread, &threads,
++ level2_cache_size);
+ }
+ else if (cpu_features->basic.kind == arch_kind_zhaoxin)
+ {
+ data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
+- core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
+ shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
+ shared_per_thread = shared;
+
+@@ -731,19 +730,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ level1_dcache_size = data;
+ level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
+ level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
+- level2_cache_size = core;
++ level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
+ level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
+ level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
+ level3_cache_size = shared;
+ level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
+ level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
+
+- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
++ get_common_cache_info (&shared, &shared_per_thread, &threads,
++ level2_cache_size);
+ }
+ else if (cpu_features->basic.kind == arch_kind_amd)
+ {
+ data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
+- core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
+ shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
+ shared_per_thread = shared;
+
+@@ -752,7 +751,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ level1_dcache_size = data;
+ level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
+ level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
+- level2_cache_size = core;
++ level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);
+ level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
+ level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
+ level3_cache_size = shared;
+@@ -760,8 +759,15 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);
+
+ if (shared <= 0)
+- /* No shared L3 cache. All we have is the L2 cache. */
+- shared = core;
++ {
++ /* No shared L3 cache. All we have is the L2 cache. */
++ shared = level2_cache_size;
++ }
++ else if (cpu_features->basic.family < 0x17)
++ {
++ /* Account for exclusive L2 and L3 caches. */
++ shared += level2_cache_size;
++ }
+
+ if (shared_per_thread <= 0)
+ shared_per_thread = shared;
+@@ -883,6 +889,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
+ rep_movsb_threshold = 2112;
+
++ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
++ cases slower than the vectorized path (and for some alignments,
++ it is really slow, check BZ #30994). */
++ if (cpu_features->basic.kind == arch_kind_amd)
++ rep_movsb_threshold = non_temporal_threshold;
++
+ /* The default threshold to use Enhanced REP STOSB. */
+ unsigned long int rep_stosb_threshold = 2048;
+
+@@ -924,16 +936,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ SIZE_MAX);
+
+ unsigned long int rep_movsb_stop_threshold;
+- /* ERMS feature is implemented from AMD Zen3 architecture and it is
+- performing poorly for data above L2 cache size. Henceforth, adding
+- an upper bound threshold parameter to limit the usage of Enhanced
+- REP MOVSB operations and setting its value to L2 cache size. */
+- if (cpu_features->basic.kind == arch_kind_amd)
+- rep_movsb_stop_threshold = core;
+ /* Setting the upper bound of ERMS to the computed value of
+- non-temporal threshold for architectures other than AMD. */
+- else
+- rep_movsb_stop_threshold = non_temporal_threshold;
++ non-temporal threshold for all architectures. */
++ rep_movsb_stop_threshold = non_temporal_threshold;
+
+ cpu_features->data_cache_size = data;
+ cpu_features->shared_cache_size = shared;
+--
+2.17.1
+
diff --git a/0004-x86-Add-seperate-non-temporal-tunable-for-memset.patch b/0004-x86-Add-seperate-non-temporal-tunable-for-memset.patch
new file mode 100644
index 0000000..b5ebe58
--- /dev/null
+++ b/0004-x86-Add-seperate-non-temporal-tunable-for-memset.patch
@@ -0,0 +1,215 @@
+From 57ce020adf1acf50d67f4693de5c3e786ce195ec Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Fri, 24 May 2024 12:38:51 -0500
+Subject: [PATCH 04/11] x86: Add seperate non-temporal tunable for memset
+
+The tuning for non-temporal stores for memset vs memcpy is not always
+the same. This includes both the exact value and whether non-temporal
+stores are profitable at all for a given arch.
+
+This patch adds `x86_memset_non_temporal_threshold`. Currently we
+disable non-temporal stores for non-Intel vendors as the only
+benchmarks showing its benefit have been on Intel hardware.
+Reviewed-by: H.J. Lu
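+
+Once in place, the new tunable can be set like any other glibc tunable
+through the GLIBC_TUNABLES environment variable. For example (the value
+and program name here are purely illustrative, not a recommendation):
+
+  GLIBC_TUNABLES=glibc.cpu.x86_memset_non_temporal_threshold=16777216 \
+    ./my-app
+
+sets the threshold to 16 MiB, so only memsets of at least that size
+take the non-temporal path.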
+---
+ manual/tunables.texi | 16 +++++++++++++++-
+ sysdeps/x86/cacheinfo.h | 8 +++++++-
+ sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++
+ sysdeps/x86/dl-diagnostics-cpu.c | 2 ++
+ sysdeps/x86/dl-tunables.list | 3 +++
+ sysdeps/x86/include/cpu-features.h | 4 +++-
+ .../x86_64/multiarch/memset-vec-unaligned-erms.S | 11 +++++++++--
+ 7 files changed, 55 insertions(+), 5 deletions(-)
+
+diff --git a/manual/tunables.texi b/manual/tunables.texi
+index bdd3bacb2a..eaef1604c7 100644
+--- a/manual/tunables.texi
++++ b/manual/tunables.texi
+@@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
+ glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
+ glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
+ glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
++glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
+ glibc.cpu.x86_shstk:
+ glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
+ glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
+@@ -486,7 +487,8 @@ thread stack originally backup by Huge Pages to default pages.
+ @cindex shared_cache_size tunables
+ @cindex tunables, shared_cache_size
+ @cindex non_temporal_threshold tunables
+-@cindex tunables, non_temporal_threshold
++@cindex memset_non_temporal_threshold tunables
++@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold
+
+ @deftp {Tunable namespace} glibc.cpu
+ Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
+@@ -562,6 +564,18 @@ like memmove and memcpy.
+ This tunable is specific to i386 and x86-64.
+ @end deftp
+
++@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold
++The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows
++the user to set the threshold in bytes for non temporal stores in
++memset. Non temporal stores give a hint to the hardware to move data
++directly to memory without displacing other data from the cache. This
++tunable is used by some platforms to determine when to use non
++temporal stores in memset.
++
++This tunable is specific to i386 and x86-64.
++@end deftp
++
++
+ @deftp Tunable glibc.cpu.x86_rep_movsb_threshold
+ The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
+ set threshold in bytes to start using "rep movsb". The value must be
+diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
+index ec1bc142c4..fd2b2ae66b 100644
+--- a/sysdeps/x86/cacheinfo.h
++++ b/sysdeps/x86/cacheinfo.h
+@@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024;
+ long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
+ long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
+
+-/* Threshold to use non temporal store. */
++/* Threshold to use non temporal store in memmove. */
+ long int __x86_shared_non_temporal_threshold attribute_hidden;
+
++/* Threshold to use non temporal store in memset. */
++long int __x86_memset_non_temporal_threshold attribute_hidden;
++
+ /* Threshold to use Enhanced REP MOVSB. 
*/ + long int __x86_rep_movsb_threshold attribute_hidden = 2048; + +@@ -77,6 +80,9 @@ init_cacheinfo (void) + __x86_shared_non_temporal_threshold + = cpu_features->non_temporal_threshold; + ++ __x86_memset_non_temporal_threshold ++ = cpu_features->memset_non_temporal_threshold; ++ + __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold; + __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold; + __x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold; +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index a8ad6cbefa..cbcc154e24 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -889,6 +889,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) + rep_movsb_threshold = 2112; + ++ /* Non-temporal stores in memset have only been tested on Intel hardware. ++ Until we benchmark data on other x86 processor, disable non-temporal ++ stores in memset. */ ++ unsigned long int memset_non_temporal_threshold = SIZE_MAX; ++ if (cpu_features->basic.kind == arch_kind_intel) ++ memset_non_temporal_threshold = non_temporal_threshold; ++ + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of + cases slower than the vectorized path (and for some alignments, + it is really slow, check BZ #30994). */ +@@ -915,6 +922,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + && tunable_size <= maximum_non_temporal_threshold) + non_temporal_threshold = tunable_size; + ++ tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); ++ if (tunable_size > minimum_non_temporal_threshold ++ && tunable_size <= maximum_non_temporal_threshold) ++ memset_non_temporal_threshold = tunable_size; ++ + tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL); + if (tunable_size > minimum_rep_movsb_threshold) + rep_movsb_threshold = tunable_size; +@@ -930,6 +942,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, + minimum_non_temporal_threshold, + maximum_non_temporal_threshold); ++ TUNABLE_SET_WITH_BOUNDS ( ++ x86_memset_non_temporal_threshold, memset_non_temporal_threshold, ++ minimum_non_temporal_threshold, maximum_non_temporal_threshold); + TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, + minimum_rep_movsb_threshold, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, +@@ -943,6 +958,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + cpu_features->data_cache_size = data; + cpu_features->shared_cache_size = shared; + cpu_features->non_temporal_threshold = non_temporal_threshold; ++ cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold; + cpu_features->rep_movsb_threshold = rep_movsb_threshold; + cpu_features->rep_stosb_threshold = rep_stosb_threshold; + cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold; +diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c +index 5aab63e532..05d54b5eba 100644 +--- a/sysdeps/x86/dl-diagnostics-cpu.c ++++ b/sysdeps/x86/dl-diagnostics-cpu.c +@@ -83,6 +83,8 @@ _dl_diagnostics_cpu (void) + cpu_features->shared_cache_size); + print_cpu_features_value ("non_temporal_threshold", + cpu_features->non_temporal_threshold); ++ print_cpu_features_value ("memset_non_temporal_threshold", ++ cpu_features->memset_non_temporal_threshold); + print_cpu_features_value ("rep_movsb_threshold", + 
cpu_features->rep_movsb_threshold);
+ print_cpu_features_value ("rep_movsb_stop_threshold",
+diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
+index feb7004036..f334a2ad6a 100644
+--- a/sysdeps/x86/dl-tunables.list
++++ b/sysdeps/x86/dl-tunables.list
+@@ -30,6 +30,9 @@ glibc {
+ x86_non_temporal_threshold {
+ type: SIZE_T
+ }
++ x86_memset_non_temporal_threshold {
++ type: SIZE_T
++ }
+ x86_rep_movsb_threshold {
+ type: SIZE_T
+ # Since there is overhead to set up REP MOVSB operation, REP
+diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
+index 594feeb2f4..e2d641dcd0 100644
+--- a/sysdeps/x86/include/cpu-features.h
++++ b/sysdeps/x86/include/cpu-features.h
+@@ -918,8 +918,10 @@ struct cpu_features
+ /* Shared cache size for use in memory and string routines, typically
+ L2 or L3 size. */
+ unsigned long int shared_cache_size;
+- /* Threshold to use non temporal store. */
++ /* Threshold to use non temporal store in memmove. */
+ unsigned long int non_temporal_threshold;
++ /* Threshold to use non temporal store in memset. */
++ unsigned long int memset_non_temporal_threshold;
+ /* Threshold to use "rep movsb". */
+ unsigned long int rep_movsb_threshold;
+ /* Threshold to stop using "rep movsb". */
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 3d9ad49cb9..eb9cbf0da9 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -21,8 +21,13 @@
+ 2. If size is less than VEC, use integer register stores.
+ 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
+ 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
+- 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
+- 4 VEC stores and store 4 * VEC at a time until done. */
++ 5. If size is more than 4 * VEC_SIZE, align to 1 * VEC_SIZE with
++ 4 VEC stores and store 4 * VEC at a time until done.
++ 6. On machines with the ERMS feature, if size is in range
++ [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
++ then REP STOSB will be used.
++ 7. If size >= __x86_memset_non_temporal_threshold, use
++ non-temporal stores. */
+
+ #include 
+
+@@ -322,6 +327,8 @@ L(return_vzeroupper):
+ /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
+ range for 2-byte jump encoding. */
+L(stosb_local):
++ cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP
++ jae L(nt_memset)
+ movzbl %sil, %eax
+ mov %RDX_LP, %RCX_LP
+ mov %RDI_LP, %RDX_LP
+--
+2.17.1
+
diff --git a/0005-x86-Enable-non-temporal-memset-tunable-for-AMD.patch b/0005-x86-Enable-non-temporal-memset-tunable-for-AMD.patch
new file mode 100644
index 0000000..6ab5ad9
--- /dev/null
+++ b/0005-x86-Enable-non-temporal-memset-tunable-for-AMD.patch
@@ -0,0 +1,47 @@
+From fb7889b5c34b85a9ac9b50c252b6cd6f81c8630b Mon Sep 17 00:00:00 2001
+From: Joe Damato
+Date: Fri, 7 Jun 2024 23:04:47 +0000
+Subject: [PATCH 05/11] x86: Enable non-temporal memset tunable for AMD
+
+In commit 46b5e98ef6f1 ("x86: Add seperate non-temporal tunable for
+memset") a tunable threshold for enabling non-temporal memset was added,
+but only for Intel hardware.
+
+Since that commit, new benchmark results suggest that non-temporal
+memset is beneficial on AMD as well, so allow this tunable to be set
+for AMD.
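+
+With this change in place, the effective threshold on a given machine
+can be inspected through the dynamic loader's diagnostics output, which
+the earlier patch in this series extended (the loader path below is the
+usual x86_64 one and may differ per system):
+
+  /lib64/ld-linux-x86-64.so.2 --list-diagnostics \
+    | grep memset_non_temporal_threshold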
+
+See:
+https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
+which has been updated to include data using different strategies for
+large memset on AMD Zen2, Zen3, and Zen4.
+
+Signed-off-by: Joe Damato
+Reviewed-by: Noah Goldstein
+---
+ sysdeps/x86/dl-cacheinfo.h | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index cbcc154e24..362e9bc24e 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
++++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -889,11 +889,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
+ rep_movsb_threshold = 2112;
+
+- /* Non-temporal stores in memset have only been tested on Intel hardware.
+- Until we benchmark data on other x86 processor, disable non-temporal
+- stores in memset. */
++ /* Non-temporal stores are more performant on Intel and AMD hardware above
++ non_temporal_threshold. Enable this for both Intel and AMD hardware. */
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+- if (cpu_features->basic.kind == arch_kind_intel)
++ if (cpu_features->basic.kind == arch_kind_intel
++ || cpu_features->basic.kind == arch_kind_amd)
+ memset_non_temporal_threshold = non_temporal_threshold;
+
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+--
+2.17.1
+
diff --git a/0006-x86-Fix-value-for-x86_memset_non_temporal_threshold-.patch b/0006-x86-Fix-value-for-x86_memset_non_temporal_threshold-.patch
new file mode 100644
index 0000000..aa9ce9b
--- /dev/null
+++ b/0006-x86-Fix-value-for-x86_memset_non_temporal_threshold-.patch
@@ -0,0 +1,41 @@
+From 00cbe2f60e51acb8230032e71e2910492f132ed5 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Fri, 14 Jun 2024 13:01:58 -0500
+Subject: [PATCH 06/11] x86: Fix value for `x86_memset_non_temporal_threshold`
+ when it is undesirable
+
+When we don't want to use non-temporal stores for memset, we set
+`x86_memset_non_temporal_threshold` to SIZE_MAX.
+
+The current code, however, was using `maximum_non_temporal_threshold`
+as the upper bound, which is `SIZE_MAX >> 4`, so we ended up with a
+value of `0`.
+
+The fix is to just use `SIZE_MAX` as the upper bound when setting the
+tunable.
+Tested-by: Borislav Petkov (AMD)
+Reviewed-by: H.J. Lu
+---
+ sysdeps/x86/dl-cacheinfo.h | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index 362e9bc24e..8e2fd8799f 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
++++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -942,9 +942,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
+ minimum_non_temporal_threshold,
+ maximum_non_temporal_threshold);
+- TUNABLE_SET_WITH_BOUNDS (
+- x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
+- minimum_non_temporal_threshold, maximum_non_temporal_threshold);
++ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
++ memset_non_temporal_threshold,
++ minimum_non_temporal_threshold, SIZE_MAX);
+ TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
+ minimum_rep_movsb_threshold, SIZE_MAX);
+ TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
+--
+2.17.1
+
diff --git a/0007-x86-Disable-non-temporal-memset-on-Skylake-Server.patch b/0007-x86-Disable-non-temporal-memset-on-Skylake-Server.patch
new file mode 100644
index 0000000..f3220f4
--- /dev/null
+++ b/0007-x86-Disable-non-temporal-memset-on-Skylake-Server.patch
@@ -0,0 +1,262 @@
+From 5431ab7ca77eb3b89e18ad6174230e83e15494fd Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Mon, 15 Jul 2024 16:19:17 +0800
+Subject: [PATCH 07/11] x86: Disable non-temporal memset on Skylake Server
+
+The original commit enabling non-temporal memset on Skylake Server had
+erroneous benchmarks (actually done on ICX).
+
+Further benchmarks indicate non-temporal stores may in fact be a
+regression on Skylake Server.
+
+This commit may be over-cautious in some cases, but should avoid any
+regressions for 2.40.
+
+Tested using qemu on all x86_64 cpu archs supported by both qemu and
+glibc.
+
+Reviewed-by: DJ Delorie
+Reviewed-by: H.J. Lu
+---
+ sysdeps/x86/cpu-features.c | 13 +-
+ sysdeps/x86/cpu-tunables.c | 5 +
+ sysdeps/x86/dl-cacheinfo.h | 15 +-
+ ...cpu-features-preferred_feature_index_1.def | 1 +
+ sysdeps/x86/tst-hwcap-tunables.c | 148 ++++++++++++++++++
+ 5 files changed, 172 insertions(+), 10 deletions(-)
+ create mode 100644 sysdeps/x86/tst-hwcap-tunables.c
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index c4dd85145e..b4030776a7 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -754,11 +754,18 @@ init_cpu_features (struct cpu_features *cpu_features)
+
+ /* Newer Bigcore microarch (larger non-temporal store
+ threshold). */
+- case INTEL_BIGCORE_SKYLAKE:
+- case INTEL_BIGCORE_KABYLAKE:
+- case INTEL_BIGCORE_COMETLAKE:
+ case INTEL_BIGCORE_SKYLAKE_AVX512:
+ case INTEL_BIGCORE_CANNONLAKE:
++ /* Benchmarks indicate non-temporal memset is not
++ necessarily profitable on SKX (and in some cases much
++ worse). This is likely unique to SKX due to its unique
++ mesh interconnect (not present on ICX or BWD). Disable
++ non-temporal on all Skylake servers. 
*/ ++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] ++ |= bit_arch_Avoid_Non_Temporal_Memset; ++ case INTEL_BIGCORE_COMETLAKE: ++ case INTEL_BIGCORE_SKYLAKE: ++ case INTEL_BIGCORE_KABYLAKE: + case INTEL_BIGCORE_ICELAKE: + case INTEL_BIGCORE_TIGERLAKE: + case INTEL_BIGCORE_ROCKETLAKE: +diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c +index 0d4f328585..4b62d697a2 100644 +--- a/sysdeps/x86/cpu-tunables.c ++++ b/sysdeps/x86/cpu-tunables.c +@@ -272,6 +272,11 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + disable, 24); + } + break; ++ case 25: ++ { ++ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, ++ Avoid_Non_Temporal_Memset, 25); ++ } + case 26: + { + CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index 8e2fd8799f..13923e4f1e 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -892,13 +892,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + /* Non-temporal stores are more performant on Intel and AMD hardware above + non_temporal_threshold. Enable this for both Intel and AMD hardware. */ + unsigned long int memset_non_temporal_threshold = SIZE_MAX; +- if (cpu_features->basic.kind == arch_kind_intel +- || cpu_features->basic.kind == arch_kind_amd) +- memset_non_temporal_threshold = non_temporal_threshold; +- +- /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of +- cases slower than the vectorized path (and for some alignments, +- it is really slow, check BZ #30994). */ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) ++ && (cpu_features->basic.kind == arch_kind_intel ++ || cpu_features->basic.kind == arch_kind_amd)) ++ memset_non_temporal_threshold = non_temporal_threshold; ++ ++ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of ++ cases slower than the vectorized path (and for some alignments, ++ it is really slow, check BZ #30994). */ + if (cpu_features->basic.kind == arch_kind_amd) + rep_movsb_threshold = non_temporal_threshold; + +diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +index d20c5b3196..aae1c85551 100644 +--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def ++++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +@@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512) + BIT (MathVec_Prefer_No_AVX512) + BIT (Prefer_FSRM) + BIT (Avoid_Short_Distance_REP_MOVSB) ++BIT (Avoid_Non_Temporal_Memset) +diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c +new file mode 100644 +index 0000000000..94307283d7 +--- /dev/null ++++ b/sysdeps/x86/tst-hwcap-tunables.c +@@ -0,0 +1,148 @@ ++/* Tests for x86 GLIBC_TUNABLES=glibc.cpu.hwcaps filter. ++ Copyright (C) 2023-2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. 
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library; if not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++
++/* Nonzero if the program gets called via `exec'. */
++#define CMDLINE_OPTIONS \
++ { "restart", no_argument, &restart, 1 },
++static int restart;
++
++/* Disable everything. */
++static const char *test_1[] =
++{
++ "__memcpy_avx512_no_vzeroupper",
++ "__memcpy_avx512_unaligned",
++ "__memcpy_avx512_unaligned_erms",
++ "__memcpy_evex_unaligned",
++ "__memcpy_evex_unaligned_erms",
++ "__memcpy_avx_unaligned",
++ "__memcpy_avx_unaligned_erms",
++ "__memcpy_avx_unaligned_rtm",
++ "__memcpy_avx_unaligned_erms_rtm",
++ "__memcpy_ssse3",
++};
++
++static const struct test_t
++{
++ const char *env;
++ const char *const *funcs;
++ size_t nfuncs;
++} tests[] =
++{
++ {
++ /* Disable everything. */
++ "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
++ "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
++ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
++ test_1,
++ array_length (test_1)
++ },
++ {
++ /* Same as before, but with some empty suboptions. */
++ ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
++ "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
++ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
++ test_1,
++ array_length (test_1)
++ }
++};
++
++/* Called on process re-execution. */
++_Noreturn static void
++handle_restart (int ntest)
++{
++ struct libc_ifunc_impl impls[32];
++ int cnt = __libc_ifunc_impl_list ("memcpy", impls, array_length (impls));
++ if (cnt == 0)
++ _exit (EXIT_SUCCESS);
++ TEST_VERIFY_EXIT (cnt >= 1);
++ for (int i = 0; i < cnt; i++)
++ {
++ for (int f = 0; f < tests[ntest].nfuncs; f++)
++ {
++ if (strcmp (impls[i].name, tests[ntest].funcs[f]) == 0)
++ TEST_COMPARE (impls[i].usable, false);
++ }
++ }
++
++ _exit (EXIT_SUCCESS);
++}
++
++static int
++do_test (int argc, char *argv[])
++{
++ /* We must have either:
++ - One or four parameters left if called initially:
++ + path to ld.so optional
++ + "--library-path" optional
++ + the library path optional
++ + the application name
++ + the test to check */
++
++ TEST_VERIFY_EXIT (argc == 2 || argc == 5);
++
++ if (restart)
++ handle_restart (atoi (argv[1]));
++
++ char nteststr[INT_BUFSIZE_BOUND (int)];
++
++ char *spargv[10];
++ {
++ int i = 0;
++ for (; i < argc - 1; i++)
++ spargv[i] = argv[i + 1];
++ spargv[i++] = (char *) "--direct";
++ spargv[i++] = (char *) "--restart";
++ spargv[i++] = nteststr;
++ spargv[i] = NULL;
++ }
++
++ for (int i = 0; i < array_length (tests); i++)
++ {
++ snprintf (nteststr, sizeof nteststr, "%d", i);
++
++ printf ("[%d] Spawned test for %s\n", i, tests[i].env);
++ char *tunable = xasprintf ("glibc.cpu.hwcaps=%s", tests[i].env);
++ setenv ("GLIBC_TUNABLES", tunable, 1);
++
++ struct support_capture_subprocess result
++ = support_capture_subprogram (spargv[0], spargv, NULL);
++ support_capture_subprocess_check (&result, "tst-tunables", 0,
++ sc_allow_stderr);
++ support_capture_subprocess_free (&result);
++
++ free (tunable);
++ }
++
++ return 0;
++}
++
++#define TEST_FUNCTION_ARGV do_test
++#include 
+--
+2.17.1
+
diff --git a/0008-x86-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch b/0008-x86-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch
new file mode 100644
index 0000000..92cdb5c
--- /dev/null
+++ b/0008-x86-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch
@@ -0,0 +1,95 @@
+From c2a035a04585ec554490e6500ab04011df8c883d Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Wed, 14 Aug 2024 14:37:30 +0800
+Subject: [PATCH 08/11] x86: Use `Avoid_Non_Temporal_Memset` to control
+ non-temporal path
+
+This is just a refactor and there should be no behavioral change from
+this commit.
+
+The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob
+for controlling whether we use non-temporal memset rather than having
+extra logic based on vendor.
+Reviewed-by: H.J. Lu
+---
+ sysdeps/x86/cpu-features.c | 16 ++++++++++++++++
+ sysdeps/x86/dl-cacheinfo.h | 15 +++++++--------
+ 2 files changed, 23 insertions(+), 8 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index b4030776a7..c9f2297524 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -640,6 +640,12 @@ init_cpu_features (struct cpu_features *cpu_features)
+ unsigned int stepping = 0;
+ enum cpu_features_kind kind;
+
++ /* Default is to avoid non-temporal memset for non Intel/AMD hardware. This is
++ because, as of writing this, we only have benchmarks indicating its
++ profitability on Intel/AMD. */
++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
++ |= bit_arch_Avoid_Non_Temporal_Memset;
++
+ cpu_features->cachesize_non_temporal_divisor = 4;
+ #if !HAS_CPUID
+ if (__get_cpuid_max (0, 0) == 0)
+@@ -665,6 +671,11 @@ init_cpu_features (struct cpu_features *cpu_features)
+
+ update_active (cpu_features);
+
++ /* Benchmarks indicate non-temporal memset can be profitable on Intel
++ hardware. */
++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
++ &= ~bit_arch_Avoid_Non_Temporal_Memset;
++
+ if (family == 0x06)
+ {
+ model += extended_model;
+@@ -874,6 +885,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
+
+ ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
+
++ /* Benchmarks indicate non-temporal memset can be profitable on AMD
++ hardware. */
++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
++ &= ~bit_arch_Avoid_Non_Temporal_Memset;
++
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
+ {
+ /* Since the FMA4 bit is in CPUID_INDEX_80000001 and
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index 13923e4f1e..d1ea14b2c7 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
++++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -889,14 +889,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
+ rep_movsb_threshold = 2112;
+
+- /* Non-temporal stores are more performant on Intel and AMD hardware above
+- non_temporal_threshold. Enable this for both Intel and AMD hardware. */
+- unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+- if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
+- && (cpu_features->basic.kind == arch_kind_intel
+- || cpu_features->basic.kind == arch_kind_amd))
+- memset_non_temporal_threshold = non_temporal_threshold;
+-
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+ cases slower than the vectorized path (and for some alignments,
+ it is really slow, check BZ #30994). */
+ if (cpu_features->basic.kind == arch_kind_amd)
+ rep_movsb_threshold = non_temporal_threshold;
+
+@@ -918,6 +910,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ if (tunable_size != 0)
+ shared = tunable_size;
+
++ /* Non-temporal stores are more performant on some hardware above
++ non_temporal_threshold. Currently Prefer_Non_Temporal is set for both
++ Intel and AMD hardware. 
*/
++ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
++ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
++ memset_non_temporal_threshold = non_temporal_threshold;
++
+ tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
+ if (tunable_size > minimum_non_temporal_threshold
+ && tunable_size <= maximum_non_temporal_threshold)
+--
+2.17.1
+
diff --git a/0009-x86-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch b/0009-x86-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch
new file mode 100644
index 0000000..4bd4e39
--- /dev/null
+++ b/0009-x86-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch
@@ -0,0 +1,221 @@
+From f42d4cd6e73a9881b1323456dd8698caebec9a9a Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Wed, 14 Aug 2024 14:37:31 +0800
+Subject: [PATCH 09/11] x86: Add `Avoid_STOSB` tunable to allow NT memset
+ without ERMS
+
+The goal of this flag is to allow targets which don't prefer/have ERMS
+to still access the non-temporal memset implementation.
+
+There are 4 cases for tuning memset:
+ 1) `Avoid_STOSB && Avoid_Non_Temporal_Memset`
+ - Memset with temporal stores
+ 2) `Avoid_STOSB && !Avoid_Non_Temporal_Memset`
+ - Memset with temporal/non-temporal stores. Non-temporal path
+ goes through `rep stosb` path. We accomplish this by setting
+ `x86_rep_stosb_threshold` to
+ `x86_memset_non_temporal_threshold`.
+ 3) `!Avoid_STOSB && Avoid_Non_Temporal_Memset`
+ - Memset with temporal stores/`rep stosb`
+ 4) `!Avoid_STOSB && !Avoid_Non_Temporal_Memset`
+ - Memset with temporal stores/`rep stosb`/non-temporal stores.
+Reviewed-by: H.J. Lu
+---
+ sysdeps/x86/cpu-features.c | 4 +++
+ sysdeps/x86/cpu-tunables.c | 6 +++-
+ sysdeps/x86/dl-cacheinfo.h | 34 ++++++++++++++++---
+ ...cpu-features-preferred_feature_index_1.def | 1 +
+ sysdeps/x86/tst-hwcap-tunables.c | 6 ++--
+ sysdeps/x86_64/multiarch/ifunc-memset.h | 18 +++++++---
+ 6 files changed, 56 insertions(+), 13 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index c9f2297524..287edc5b08 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -1014,6 +1014,10 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
+ if (CPU_FEATURES_CPU_P (cpu_features, CMOV))
+ cpu_features->preferred[index_arch_I686] |= bit_arch_I686;
+
++ /* No ERMS, we want to avoid stosb for memset. 
*/ ++ if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ cpu_features->preferred[index_arch_Avoid_STOSB] |= bit_arch_Avoid_STOSB; ++ + #if !HAS_CPUID + no_cpuid: + #endif +diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c +index 4b62d697a2..e8bf8ea731 100644 +--- a/sysdeps/x86/cpu-tunables.c ++++ b/sysdeps/x86/cpu-tunables.c +@@ -214,6 +214,10 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, + Prefer_FSRM, + disable, 11); ++ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, ++ Avoid_STOSB, ++ disable, 11); ++ + CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH (n, cpu_features, + Slow_SSE4_2, + SSE4_2, +@@ -275,7 +279,7 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + case 25: + { + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, +- Avoid_Non_Temporal_Memset, 25); ++ Avoid_Non_Temporal_Memset, disable, 25); + } + case 26: + { +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index d1ea14b2c7..efe1bb7e4a 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -937,18 +937,42 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold, + long int, NULL); + ++ /* ++ For memset, the non-temporal implementation is only accessed through the ++ stosb code. ie: ++ ``` ++ if (size >= rep_stosb_thresh) ++ { ++ if (size >= non_temporal_thresh) ++ { ++ do_non_temporal (); ++ } ++ do_stosb (); ++ } ++ do_normal_vec_loop (); ++ ``` ++ So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh` ++ to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`, ++ `rep stosb` will never be used. ++ */ ++ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, ++ memset_non_temporal_threshold, ++ minimum_non_temporal_threshold, SIZE_MAX); ++ /* Do `rep_stosb_thresh = non_temporal_thresh` after setting/getting the ++ final value of `x86_memset_non_temporal_threshold`. In some cases this can ++ be a matter of correctness. 
*/ ++ if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_STOSB)) ++ rep_stosb_threshold ++ = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); ++ TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, ++ SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, + minimum_non_temporal_threshold, + maximum_non_temporal_threshold); +- TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, +- memset_non_temporal_threshold, +- minimum_non_temporal_threshold, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, + minimum_rep_movsb_threshold, SIZE_MAX); +- TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, +- SIZE_MAX); + + unsigned long int rep_movsb_stop_threshold; + /* Setting the upper bound of ERMS to the computed value of +diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +index aae1c85551..38a0c9226c 100644 +--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def ++++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +@@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512) + BIT (Prefer_FSRM) + BIT (Avoid_Short_Distance_REP_MOVSB) + BIT (Avoid_Non_Temporal_Memset) ++BIT (Avoid_STOSB) +diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c +index 94307283d7..1920f5057e 100644 +--- a/sysdeps/x86/tst-hwcap-tunables.c ++++ b/sysdeps/x86/tst-hwcap-tunables.c +@@ -60,7 +60,8 @@ static const struct test_t + /* Disable everything. */ + "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," + "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," +- "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", ++ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," ++ "-Avoid_STOSB", + test_1, + array_length (test_1) + }, +@@ -68,7 +69,8 @@ static const struct test_t + /* Same as before, but with some empty suboptions. 
*/ + ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," + "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," +- "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", ++ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," ++ "-Avoid_STOSB,-,", + test_1, + array_length (test_1) + } +diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h +index 5c5096ec5a..6b3b9a17a2 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h +@@ -46,6 +46,13 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) + attribute_hidden; + ++static inline int ++prefer_erms_nt_impl (const struct cpu_features *cpu_features) ++{ ++ return CPU_FEATURE_USABLE_P (cpu_features, ERMS) ++ || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset); ++} ++ + static inline void * + IFUNC_SELECTOR (void) + { +@@ -61,7 +68,7 @@ IFUNC_SELECTOR (void) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ if (prefer_erms_nt_impl (cpu_features)) + return OPTIMIZE (avx512_unaligned_erms); + + return OPTIMIZE (avx512_unaligned); +@@ -76,7 +83,7 @@ IFUNC_SELECTOR (void) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ if (prefer_erms_nt_impl (cpu_features)) + return OPTIMIZE (evex_unaligned_erms); + + return OPTIMIZE (evex_unaligned); +@@ -84,7 +91,7 @@ IFUNC_SELECTOR (void) + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ if (prefer_erms_nt_impl (cpu_features)) + return OPTIMIZE (avx2_unaligned_erms_rtm); + + return OPTIMIZE (avx2_unaligned_rtm); +@@ -93,14 +100,15 @@ IFUNC_SELECTOR (void) + if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, + Prefer_No_VZEROUPPER, !)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ if (prefer_erms_nt_impl (cpu_features)) + return OPTIMIZE (avx2_unaligned_erms); + + return OPTIMIZE (avx2_unaligned); + } + } + +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) ++ || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)) + return OPTIMIZE (sse2_unaligned_erms); + + return OPTIMIZE (sse2_unaligned); +-- +2.17.1 + diff --git a/0010-x86-Enable-non-temporal-memset-for-Hygon-processors.patch b/0010-x86-Enable-non-temporal-memset-for-Hygon-processors.patch new file mode 100644 index 0000000..3f39f06 --- /dev/null +++ b/0010-x86-Enable-non-temporal-memset-for-Hygon-processors.patch @@ -0,0 +1,92 @@ +From ddfc6cc086ce3fd4b76c5fc981eba96290d1501c Mon Sep 17 00:00:00 2001 +From: Feifei Wang +Date: Mon, 19 Aug 2024 14:57:55 +0800 +Subject: [PATCH 10/11] x86: Enable non-temporal memset for Hygon processors + +This patch uses 'Avoid_Non_Temporal_Memset' flag to access +the non-temporal memset implementation for hygon processors. 
+
+Test Results:
+
+hygon1 arch
+x86_memset_non_temporal_threshold = 8MB
+size new performance time / old performance time
+1MB 0.994
+4MB 0.996
+8MB 0.670
+16MB 0.343
+32MB 0.355
+
+hygon2 arch
+x86_memset_non_temporal_threshold = 8MB
+size new performance time / old performance time
+1MB 1
+4MB 1
+8MB 1.312
+16MB 0.822
+32MB 0.830
+
+hygon3 arch
+x86_memset_non_temporal_threshold = 8MB
+size new performance time / old performance time
+1MB 1
+4MB 0.990
+8MB 0.737
+16MB 0.390
+32MB 0.401
+
+For the Hygon arch with this patch, non-temporal stores can improve
+performance by 20% - 65%.
+
+Signed-off-by: Feifei Wang
+Reviewed-by: Jing Li
+Reviewed-by: H.J. Lu
+---
+ sysdeps/x86/cpu-features.c | 9 +++++++--
+ sysdeps/x86/dl-cacheinfo.h | 2 +-
+ 2 files changed, 8 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 287edc5b08..f5539aea6f 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -640,9 +640,9 @@ init_cpu_features (struct cpu_features *cpu_features)
+ unsigned int stepping = 0;
+ enum cpu_features_kind kind;
+
+- /* Default is to avoid non-temporal memset for non Intel/AMD hardware. This is
++ /* Default is to avoid non-temporal memset for non Intel/AMD/Hygon hardware. This is
+ because, as of writing this, we only have benchmarks indicating its
+- profitability on Intel/AMD. */
++ profitability on Intel/AMD/Hygon. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ |= bit_arch_Avoid_Non_Temporal_Memset;
+
+@@ -998,6 +998,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
+ get_extended_indices (cpu_features);
+
+ update_active (cpu_features);
++
++ /* Benchmarks indicate non-temporal memset can be profitable on Hygon
++ hardware. */
++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
++ &= ~bit_arch_Avoid_Non_Temporal_Memset;
+ }
+ else
+ {
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index efe1bb7e4a..2a837eb24b 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
++++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -912,7 +912,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+
+ /* Non-temporal stores are more performant on some hardware above
+ non_temporal_threshold. Currently Prefer_Non_Temporal is set for both
+- Intel and AMD hardware. */
++ Intel, AMD and Hygon hardware. */
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
+ memset_non_temporal_threshold = non_temporal_threshold;
+--
+2.17.1
+
diff --git a/0011-x86-Improve-large-memset-perf-with-non-temporal-stor.patch b/0011-x86-Improve-large-memset-perf-with-non-temporal-stor.patch
new file mode 100644
index 0000000..091c73a
--- /dev/null
+++ b/0011-x86-Improve-large-memset-perf-with-non-temporal-stor.patch
@@ -0,0 +1,248 @@
+From 708e5218e65602e8a3495b908e86cfbb50f65126 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Fri, 24 May 2024 12:38:50 -0500
+Subject: [PATCH 11/11] x86: Improve large memset perf with non-temporal stores
+ [RHEL-29312]
+
+Previously we used `rep stosb` for all medium/large memsets. This is
+notably worse than non-temporal stores for large (above a
+few MBs) memsets.
+See:
+https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
+for data using different strategies for large memset on ICX and SKX.
+
+Using non-temporal stores can be up to 3x faster on ICX and 2x faster
+on SKX.
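+
+The non-temporal strategy itself is a streaming-store loop. As a
+minimal C sketch of the idea (SSE2 intrinsics standing in for the
+hand-written assembly this patch actually adds; head/tail alignment
+handling omitted):
+
+  #include <emmintrin.h>
+  #include <stddef.h>
+
+  static void
+  memset_nt_sketch (void *dst, int c, size_t n)
+  {
+    /* Assumes dst is 16-byte aligned and n is a multiple of 16; the
+       real code covers the unaligned head/tail with regular stores.  */
+    __m128i v = _mm_set1_epi8 ((char) c);
+    for (size_t i = 0; i < n; i += 16)
+      _mm_stream_si128 ((__m128i *) ((char *) dst + i), v);
+    _mm_sfence ();  /* Order the weakly-ordered streaming stores.  */
+  }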
+Historically, these numbers would not have been so good
+because of the zero-over-zero writeback optimization that `rep stosb`
+is able to do. But, the zero-over-zero writeback optimization has been
+removed as a potential side-channel attack, so there is no longer any
+good reason to only rely on `rep stosb` for large memsets. On the flip
+side, non-temporal writes can avoid the data fetch in their RFO
+requests, saving memory bandwidth.
+
+All of the other changes to the file are to re-organize the
+code-blocks to maintain "good" alignment given the new code added in
+the `L(stosb_local)` case.
+
+The results from running the GLIBC memset benchmarks on TGL-client for
+N=20 runs:
+
+Geometric Mean across the suite New / Old EVEX256: 0.979
+Geometric Mean across the suite New / Old EVEX512: 0.979
+Geometric Mean across the suite New / Old AVX2 : 0.986
+Geometric Mean across the suite New / Old SSE2 : 0.979
+
+Most of the cases are essentially unchanged; this is mostly to show
+that adding the non-temporal case didn't add any regressions to the
+other cases.
+
+The results on the memset-large benchmark suite on TGL-client for N=20
+runs:
+
+Geometric Mean across the suite New / Old EVEX256: 0.926
+Geometric Mean across the suite New / Old EVEX512: 0.925
+Geometric Mean across the suite New / Old AVX2 : 0.928
+Geometric Mean across the suite New / Old SSE2 : 0.924
+
+So roughly a 7.5% speedup. This is lower than what we see on servers
+(likely because clients typically have faster single-core bandwidth so
+saving bandwidth on RFOs is less impactful), but still advantageous.
+
+Full test-suite passes on x86_64 w/ and w/o multiarch.
+Reviewed-by: H.J. Lu
+---
+ .../multiarch/memset-vec-unaligned-erms.S | 142 +++++++++++-------
+ 1 file changed, 85 insertions(+), 57 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index eb9cbf0da9..f08c8646b1 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -24,10 +24,10 @@
+ 5. If size is more than 4 * VEC_SIZE, align to 1 * VEC_SIZE with
+ 4 VEC stores and store 4 * VEC at a time until done.
+ 6. On machines with the ERMS feature, if size is in range
+- [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
+- then REP STOSB will be used.
++ [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
++ then REP STOSB will be used.
+ 7. If size >= __x86_memset_non_temporal_threshold, use
+- non-temporal stores. */
++ non-temporal stores. */
+
+ #include 
+
+@@ -150,6 +150,41 @@ L(entry_from_wmemset):
+ VMOVU %VMM(0), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VMM(0), (%rdi)
+ VZEROUPPER_RETURN
++
++ /* If have AVX512 mask instructions put L(less_vec) close to
++ entry as it doesn't take much space and is likely a hot target. */
++#ifdef USE_LESS_VEC_MASK_STORE
++ /* Align to ensure the L(less_vec) logic all fits in 1x cache lines. */
++ .p2align 6,, 47
++ .p2align 4
++L(less_vec):
++L(less_vec_from_wmemset):
++ /* Less than 1 VEC. */
++# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
++# error Unsupported VEC_SIZE!
++# endif
++ /* Clear high bits from edi. Only keeping bits relevant to page
++ cross check. Note that we are using rax which is set in
++ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
++ andl $(PAGE_SIZE - 1), %edi
++ /* Check if VEC_SIZE store cross page. Mask stores suffer
++ serious performance degradation when it has to fault suppress. 
++ cmpl $(PAGE_SIZE - VEC_SIZE), %edi
++ /* This is generally considered a cold target. */
++ ja L(cross_page)
++# if VEC_SIZE > 32
++ movq $-1, %rcx
++ bzhiq %rdx, %rcx, %rcx
++ kmovq %rcx, %k1
++# else
++ movl $-1, %ecx
++ bzhil %edx, %ecx, %ecx
++ kmovd %ecx, %k1
++# endif
++ vmovdqu8 %VMM(0), (%rax){%k1}
++ VZEROUPPER_RETURN
++#endif
++
+ #if defined USE_MULTIARCH && IS_IN (libc)
+ END (MEMSET_SYMBOL (__memset, unaligned))
+
+@@ -188,54 +223,6 @@ L(last_2x_vec):
+ #endif
+ VZEROUPPER_RETURN
+
+- /* If have AVX512 mask instructions put L(less_vec) close to
+- entry as it doesn't take much space and is likely a hot target.
+- */
+-#ifdef USE_LESS_VEC_MASK_STORE
+- .p2align 4,, 10
+-L(less_vec):
+-L(less_vec_from_wmemset):
+- /* Less than 1 VEC. */
+-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+-# error Unsupported VEC_SIZE!
+-# endif
+- /* Clear high bits from edi. Only keeping bits relevant to page
+- cross check. Note that we are using rax which is set in
+- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
+- andl $(PAGE_SIZE - 1), %edi
+- /* Check if VEC_SIZE store cross page. Mask stores suffer
+- serious performance degradation when it has to fault suppress.
+- */
+- cmpl $(PAGE_SIZE - VEC_SIZE), %edi
+- /* This is generally considered a cold target. */
+- ja L(cross_page)
+-# if VEC_SIZE > 32
+- movq $-1, %rcx
+- bzhiq %rdx, %rcx, %rcx
+- kmovq %rcx, %k1
+-# else
+- movl $-1, %ecx
+- bzhil %edx, %ecx, %ecx
+- kmovd %ecx, %k1
+-# endif
+- vmovdqu8 %VMM(0), (%rax){%k1}
+- VZEROUPPER_RETURN
+-
+-# if defined USE_MULTIARCH && IS_IN (libc)
+- /* Include L(stosb_local) here if including L(less_vec) between
+- L(stosb_more_2x_vec) and ENTRY. This is to cache align the
+- L(stosb_more_2x_vec) target. */
+- .p2align 4,, 10
+-L(stosb_local):
+- movzbl %sil, %eax
+- mov %RDX_LP, %RCX_LP
+- mov %RDI_LP, %RDX_LP
+- rep stosb
+- mov %RDX_LP, %RAX_LP
+- VZEROUPPER_RETURN
+-# endif
+-#endif
+-
+ #if defined USE_MULTIARCH && IS_IN (libc)
+ .p2align 4
+ L(stosb_more_2x_vec):
+@@ -321,9 +308,13 @@ L(return_vzeroupper):
+ ret
+ #endif
+
+- .p2align 4,, 10
+-#ifndef USE_LESS_VEC_MASK_STORE
+-# if defined USE_MULTIARCH && IS_IN (libc)
++#ifdef USE_WITH_AVX2
++ .p2align 4
++#else
++ .p2align 4,, 4
++#endif
++
++#if defined USE_MULTIARCH && IS_IN (libc)
+ /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
+ range for 2-byte jump encoding. */
+ L(stosb_local):
+@@ -333,11 +324,17 @@ L(stosb_local):
+ mov %RDX_LP, %RCX_LP
+ mov %RDI_LP, %RDX_LP
+ rep stosb
++# if (defined USE_WITH_SSE2) || (defined USE_WITH_AVX512)
++ /* Use xchg to save 1-byte (this helps align targets below). */
++ xchg %RDX_LP, %RAX_LP
++# else
+ mov %RDX_LP, %RAX_LP
+- VZEROUPPER_RETURN
+ # endif
++ VZEROUPPER_RETURN
++#endif
++#ifndef USE_LESS_VEC_MASK_STORE
+ /* Define L(less_vec) only if not otherwise defined. */
+- .p2align 4
++ .p2align 4,, 12
+ L(less_vec):
+ /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+ xmm). This is only does anything for AVX2. */
+@@ -428,4 +425,35 @@ L(between_2_3):
+ movb %SET_REG8, -1(%LESS_VEC_REG, %rdx)
+ #endif
+ ret
+-END (MEMSET_SYMBOL (__memset, unaligned_erms))
++
++#if defined USE_MULTIARCH && IS_IN (libc)
++# ifdef USE_WITH_AVX512
++ /* Force align so the loop doesn't cross a cache-line. */
++ .p2align 4
++# endif
++ .p2align 4,, 7
++ /* Memset using non-temporal stores. */
++L(nt_memset):
++ VMOVU %VMM(0), (VEC_SIZE * 0)(%rdi)
++ leaq (VEC_SIZE * -4)(%rdi, %rdx), %rdx
++ /* Align DST. */
++ orq $(VEC_SIZE * 1 - 1), %rdi
++ incq %rdi
++ .p2align 4,, 7
++L(nt_loop):
++ VMOVNT %VMM(0), (VEC_SIZE * 0)(%rdi)
++ VMOVNT %VMM(0), (VEC_SIZE * 1)(%rdi)
++ VMOVNT %VMM(0), (VEC_SIZE * 2)(%rdi)
++ VMOVNT %VMM(0), (VEC_SIZE * 3)(%rdi)
++ subq $(VEC_SIZE * -4), %rdi
++ cmpq %rdx, %rdi
++ jb L(nt_loop)
++ sfence
++ VMOVU %VMM(0), (VEC_SIZE * 0)(%rdx)
++ VMOVU %VMM(0), (VEC_SIZE * 1)(%rdx)
++ VMOVU %VMM(0), (VEC_SIZE * 2)(%rdx)
++ VMOVU %VMM(0), (VEC_SIZE * 3)(%rdx)
++ VZEROUPPER_RETURN
++#endif
++
++END (MEMSET_SYMBOL (__memset, unaligned_erms))
+--
+2.17.1
+
diff --git a/glibc.spec b/glibc.spec
index 4c81d8e..cfe4699 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -1,4 +1,4 @@
-%define anolis_release 10
+%define anolis_release 11
 
 %bcond_without testsuite
 %bcond_without benchtests
@@ -165,6 +165,18 @@ Patch3049: 0003-x86-Set-default-non_temporal_threshold-for-Zhaoxin-p.patch
 
 Patch3050: LoongArch-Force-SHMLBA-the-same-as-kernel.patch
 
+Patch3051: 0001-x86-Add-new-architecture-type-for-Hygon-processors.patch
+Patch3052: 0002-x86-Add-cache-information-support-for-Hygon-processo.patch
+Patch3053: 0003-x86-Fix-Zen3-Zen4-ERMS-selection-BZ-30994.patch
+Patch3054: 0004-x86-Add-seperate-non-temporal-tunable-for-memset.patch
+Patch3055: 0005-x86-Enable-non-temporal-memset-tunable-for-AMD.patch
+Patch3056: 0006-x86-Fix-value-for-x86_memset_non_temporal_threshold-.patch
+Patch3057: 0007-x86-Disable-non-temporal-memset-on-Skylake-Server.patch
+Patch3058: 0008-x86-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch
+Patch3059: 0009-x86-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch
+Patch3060: 0010-x86-Enable-non-temporal-memset-for-Hygon-processors.patch
+Patch3061: 0011-x86-Improve-large-memset-perf-with-non-temporal-stor.patch
+
 BuildRequires: audit-libs-devel >= 1.1.3 libcap-devel systemtap-sdt-devel
 BuildRequires: procps-ng util-linux gawk sed >= 3.95 gettext
 BuildRequires: python3 python3-devel
@@ -1127,22 +1139,25 @@ update_gconv_modules_cache ()
 %{_libdir}/libpthread_nonshared.a
 
 %changelog
+* Wed Jun 18 2025 xiajiamei - 2.38-11
+- Add patches to support Hygon
+
 * Wed May 28 2025 mgb01105731 - 2.38-10
 - Add patch to fix CVE-2025-4802
 
 * Tue May 27 2025 mgb01105731 - 2.38-9
 - Add patch to fix CVE-2025-0395
 
-* Thu Jan 02 2025 Peng Fan - 2.38-8
-- LoongArch: Force SHMLBA the same as kernel
-
-* Sat Apr 26 2025 Yihao Yan - 2.38-7
+* Sat Apr 26 2025 Yihao Yan - 2.38-8
 - Add support for riscv64
 - Add lp64d into glibc
 
-* Wed Jan 15 2025 MayShao - 2.38-6
+* Wed Jan 15 2025 MayShao - 2.38-7
 - x86: Set preferred CPU features and default NT threshold for Zhaoxin processors
 
+* Thu Jan 02 2025 Peng Fan - 2.38-6
+- LoongArch: Force SHMLBA the same as kernel
+
 * Fri Dec 20 2024 Zhao Hang - 2.38-5
 - fix CVE-2024-33602
-- Gitee
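
Note: the L(nt_memset)/L(nt_loop) sequence in patch 0011 is easier to
follow in C. The sketch below is an illustrative approximation, not
glibc's shipped code: it assumes the AVX2 build (VEC_SIZE == 32, compile
with -mavx2), the name nt_memset_sketch is invented for this note, and
the caller is assumed to have already guaranteed len >= 4 * VEC_SIZE,
as the dispatch logic does before reaching L(nt_memset).

    #include <immintrin.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative approximation of L(nt_memset) for VEC_SIZE == 32.
       Assumes len >= 4 * 32; glibc handles smaller sizes on other
       paths.  */
    static void *
    nt_memset_sketch (void *dst, int c, size_t len)
    {
      const size_t vec = 32;
      __m256i v = _mm256_set1_epi8 ((char) c);
      char *p = (char *) dst;
      /* Mirrors "leaq (VEC_SIZE * -4)(%rdi, %rdx), %rdx": the last 4
         vectors are always written with plain unaligned stores.  */
      char *tail = p + len - 4 * vec;

      /* Head: one unaligned store, like the first VMOVU.  */
      _mm256_storeu_si256 ((__m256i *) p, v);
      /* Round up to the next 32-byte boundary (the orq/incq pair).  */
      p = (char *) (((uintptr_t) p | (vec - 1)) + 1);

      /* Body: 4 weakly-ordered streaming stores per iteration, which
         bypass the cache and avoid RFO traffic (the nt_loop).  */
      while (p < tail)
        {
          _mm256_stream_si256 ((__m256i *) (p + 0 * vec), v);
          _mm256_stream_si256 ((__m256i *) (p + 1 * vec), v);
          _mm256_stream_si256 ((__m256i *) (p + 2 * vec), v);
          _mm256_stream_si256 ((__m256i *) (p + 3 * vec), v);
          p += 4 * vec;
        }
      /* Fence the streaming stores before ordinary stores touch the
         same region (the sfence instruction).  */
      _mm_sfence ();

      /* Tail: 4 unaligned stores ending exactly at dst + len; they may
         overlap bytes the loop already wrote, which is harmless.  */
      _mm256_storeu_si256 ((__m256i *) (tail + 0 * vec), v);
      _mm256_storeu_si256 ((__m256i *) (tail + 1 * vec), v);
      _mm256_storeu_si256 ((__m256i *) (tail + 2 * vec), v);
      _mm256_storeu_si256 ((__m256i *) (tail + 3 * vec), v);
      return dst;
    }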
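
The 8MB threshold used for the Hygon measurements in patch 0010 can be
pinned at run time through the tunable that patch 0004 introduces, so
the ratios above are reproducible without rebuilding glibc; the
benchmark name below is a placeholder for any large-memset workload:

    GLIBC_TUNABLES=glibc.cpu.x86_memset_non_temporal_threshold=8388608 \
        ./your-memset-benchmark

As the dl-cacheinfo.h hunk in patch 0010 shows, the threshold stays at
SIZE_MAX (so the non-temporal path is never taken) while
Avoid_Non_Temporal_Memset is set; clearing that bit, as the
cpu-features.c hunk does for Hygon, is what lets non_temporal_threshold
take effect at all.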