From 8dee2397c000f29f339a43e930e8d818a650a889 Mon Sep 17 00:00:00 2001 From: Feifei Wang1994 Date: Wed, 4 Sep 2024 06:01:47 +0000 Subject: [PATCH] x86: backport x86 optimization patches Backport 3 x86 optimization patches. The first patch, 'Reversing calculation of __x86_shared_non_temporal_threshold', reverts the calculation of __x86_shared_non_temporal_threshold from 3/4 of the entire shared cache size of a multi-threaded system to 3/4 of one thread's share of the cache size, which improves memcpy performance. The second patch, 'Optimizing memcpy for AMD Zen architecture', recomputes the shareable cache as 'L3 per CCX (Core-Complex)' and improves performance on AMD. The third patch, 'Add Hygon Dhyana support', fixes the Hygon Dhyana processor CPU Vendor ID detection problem in the glibc sysdep module. --- ...ng-calculation-of-__x86_shared_non_t.patch | 95 +++++++++++++++++++ backport-x86-Add-Hygon-support.patch | 38 ++++++++ ...imizing-memcpy-for-AMD-Zen-architect.patch | 61 ++++++++++++ glibc.spec | 13 ++- 4 files changed, 206 insertions(+), 1 deletion(-) create mode 100644 backport-Reversing-calculation-of-__x86_shared_non_t.patch create mode 100644 backport-x86-Add-Hygon-support.patch create mode 100644 backport-x86-Optimizing-memcpy-for-AMD-Zen-architect.patch diff --git a/backport-Reversing-calculation-of-__x86_shared_non_t.patch b/backport-Reversing-calculation-of-__x86_shared_non_t.patch new file mode 100644 index 0000000..905f255 --- /dev/null +++ b/backport-Reversing-calculation-of-__x86_shared_non_t.patch @@ -0,0 +1,95 @@ +From 4d1d91c7fdb52e847a6a7ff096736968e10c6509 Mon Sep 17 00:00:00 2001 +From: Patrick McGehearty +Date: Wed, 4 Sep 2024 05:47:15 +0000 +Subject: [PATCH] + backport-Reversing-calculation-of-__x86_shared_non_temporal_threshold + +The __x86_shared_non_temporal_threshold determines when memcpy on x86 +uses non_temporal stores to avoid pushing other data out of the last +level cache.
+ +This patch proposes to revert the calculation change made by H.J. Lu's +patch of June 2, 2017. + +H.J. Lu's patch selected a threshold suitable for a single thread +getting maximum performance. It was tuned using the single threaded +large memcpy micro benchmark on an 8 core processor. The last change +changes the threshold from using 3/4 of one thread's share of the +cache to using 3/4 of the entire cache of a multi-threaded system +before switching to non-temporal stores. Multi-threaded systems with +more than a few threads are server-class and typically have many +active threads. If one thread consumes 3/4 of the available cache for +all threads, it will cause other active threads to have data removed +from the cache. Two examples show the range of the effect. John +McCalpin's widely parallel Stream benchmark, which runs in parallel +and fetches data sequentially, saw a 20% slowdown with H.J. Lu's patch on +an internal system test of 128 threads. This regression was discovered +when comparing OL8 performance to OL7. An example that compares +normal stores to non-temporal stores may be found at +https://vgatherps.github.io/2018-09-02-nontemporal/. A simple test +shows performance loss of 400 to 500% due to a failure to use +nontemporal stores. These performance losses are most likely to occur +when the system load is heaviest and good performance is critical. + +The tunable x86_non_temporal_threshold can be used to override the +default for the knowledgeable user who really wants maximum cache +allocation to a single thread in a multi-threaded system. +The manual entry for the tunable has been expanded to provide +more information about its purpose.
+ +Origin backport: https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=d3c57027470 +--- + manual/tunables.texi | 6 +++++- + sysdeps/x86/cacheinfo.c | 16 +++++++++++----- + 2 files changed, 16 insertions(+), 6 deletions(-) + +diff --git a/manual/tunables.texi b/manual/tunables.texi +index 124b39b6..79347bf3 100644 +--- a/manual/tunables.texi ++++ b/manual/tunables.texi +@@ -352,7 +352,11 @@ set shared cache size in bytes for use in memory and string routines. + + @deftp Tunable glibc.tune.x86_non_temporal_threshold + The @code{glibc.tune.x86_non_temporal_threshold} tunable allows the user +-to set threshold in bytes for non temporal store. ++to set threshold in bytes for non temporal store. Non temporal stores ++give a hint to the hardware to move data directly to memory without ++displacing other data from the cache. This tunable is used by some ++platforms to determine when to use non temporal stores in operations ++like memmove and memcpy. + + This tunable is specific to i386 and x86-64. + @end deftp +diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c +index 28bcf2f6..5b43fa78 100644 +--- a/sysdeps/x86/cacheinfo.c ++++ b/sysdeps/x86/cacheinfo.c +@@ -784,14 +784,20 @@ intel_bug_no_cache_info: + __x86_shared_cache_size = shared; + } + +- /* The large memcpy micro benchmark in glibc shows that 6 times of +- shared cache size is the approximate value above which non-temporal +- store becomes faster on a 8-core processor. This is the 3/4 of the +- total shared cache size. */ ++ /* The default setting for the non_temporal threshold is 3/4 of one ++ thread's share of the chip's cache. For most Intel and AMD processors ++ with an initial release date between 2017 and 2020, a thread's typical ++ share of the cache is from 500 KBytes to 2 MBytes. 
Using the 3/4 ++ threshold leaves 125 KBytes to 500 KBytes of the thread's data ++ in cache after a maximum temporal copy, which will maintain ++ in cache a reasonable portion of the thread's stack and other ++ active data. If the threshold is set higher than one thread's ++ share of the cache, it has a substantial risk of negatively ++ impacting the performance of other threads running on the chip. */ + __x86_shared_non_temporal_threshold + = (cpu_features->non_temporal_threshold != 0 + ? cpu_features->non_temporal_threshold +- : __x86_shared_cache_size * threads * 3 / 4); ++ : __x86_shared_cache_size * 3 / 4); + } + + #endif +-- +2.27.0 + diff --git a/backport-x86-Add-Hygon-support.patch b/backport-x86-Add-Hygon-support.patch new file mode 100644 index 0000000..3625557 --- /dev/null +++ b/backport-x86-Add-Hygon-support.patch @@ -0,0 +1,38 @@ +From 2b46bc9b5a148f6da198321a8396a6c2c6a1b070 Mon Sep 17 00:00:00 2001 +From: Feifei Wang1994 +Date: Tue, 3 Sep 2024 08:30:43 +0000 +Subject: [PATCH] backport-x86-Add-Hygon-support + +This patch fixes the Hygon processor CPU Vendor ID detection problem +in the glibc sysdep module. The current glibc-2.28 doesn't recognize the +Hygon CPU Vendor ID ("HygonGenuine") and sets kind to arch_kind_other, +which results in an incorrect zero return value from __cache_sysconf(). + +This patch adds a Hygon CPU Vendor ID check, sets kind to arch_kind_amd +and reuses the AMD code path, so that __cache_sysconf() returns the correct value. +Test cases show no failures with this patch on Hygon hardware.
+ +Signed-off-by: Feifei Wang +--- + sysdeps/x86/cpu-features.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index ea0b64fd..4b1a0169 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -344,8 +344,9 @@ init_cpu_features (struct cpu_features *cpu_features) + cpu_features->feature[index_arch_Prefer_No_AVX512] + |= bit_arch_Prefer_No_AVX512; + } +- /* This spells out "AuthenticAMD". */ +- else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) ++ /* This spells out "AuthenticAMD" or "HygonGenuine". */ ++ else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) ++ || (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)) + { + unsigned int extended_model, stepping; + +-- +2.27.0 + diff --git a/backport-x86-Optimizing-memcpy-for-AMD-Zen-architect.patch b/backport-x86-Optimizing-memcpy-for-AMD-Zen-architect.patch new file mode 100644 index 0000000..a59c25d --- /dev/null +++ b/backport-x86-Optimizing-memcpy-for-AMD-Zen-architect.patch @@ -0,0 +1,61 @@ +From 8374ca9a2f66ad1b36dbd4b53abba9c692fccee6 Mon Sep 17 00:00:00 2001 +From: Sajan Karumanchi +Date: Tue, 3 Sep 2024 08:23:27 +0000 +Subject: [PATCH] backport-x86-Optimizing-memcpy-for-AMD-Zen-architecture + +Modifying the shareable cache '__x86_shared_cache_size', which is a +factor in computing the non-temporal threshold parameter +'__x86_shared_non_temporal_threshold' to optimize memcpy for AMD Zen +architectures. +In the existing implementation, the shareable cache is computed as 'L3 +per thread, L2 per core'. Recomputing this shareable cache as 'L3 per +CCX(Core-Complex)' has brought in performance gains. +As per the large bench variant results, this patch also addresses the +regression problem on AMD Zen architectures. 
+ +Origin backport: https://sourceware.org/git/?p=glibc.git;a=commit;h=8813b2682e4094e43b0cf1634e99619f1b8b2c62 +--- + sysdeps/x86/cacheinfo.c | 20 +++++++++++++++++--- + 1 file changed, 17 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c +index 5b43fa78..37a03af0 100644 +--- a/sysdeps/x86/cacheinfo.c ++++ b/sysdeps/x86/cacheinfo.c +@@ -728,7 +728,7 @@ intel_bug_no_cache_info: + threads = 1 << ((ecx >> 12) & 0x0f); + } + +- if (threads == 0) ++ if (threads == 0 || cpu_features->family >= 0x17) + { + /* If APIC ID width is not available, use logical + processor count. */ +@@ -743,8 +743,22 @@ intel_bug_no_cache_info: + if (threads > 0) + shared /= threads; + +- /* Account for exclusive L2 and L3 caches. */ +- shared += core; ++ /* Get shared cache per ccx for Zen architectures. */ ++ if (cpu_features->family >= 0x17) ++ { ++ unsigned int eax; ++ ++ /* Get number of threads share the L3 cache in CCX. */ ++ __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx); ++ ++ unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1; ++ shared *= threads_per_ccx; ++ } ++ else ++ { ++ /* Account for exclusive L2 and L3 caches. 
*/ ++ shared += core; ++ } + } + + #ifndef DISABLE_PREFETCHW +-- +2.27.0 + diff --git a/glibc.spec b/glibc.spec index 75d9227..2189263 100644 --- a/glibc.spec +++ b/glibc.spec @@ -62,7 +62,7 @@ ############################################################################## Name: glibc Version: 2.28 -Release: 101 +Release: 102 Summary: The GNU libc libraries License: %{all_license} URL: http://www.gnu.org/software/glibc/ @@ -167,6 +167,9 @@ Patch80: backport-CVE-2024-33601-CVE-2024-33602-nscd-Use-two-buffer-in-addgetnet Patch81: iconv-ISO-2022-CN-EXT-fix-out-of-bound-writes-when-w.patch Patch82: backport-Use-errval-not-errno-to-guide-cache-update.patch Patch83: backport-Skip-unusable-entries-in-first-pass-in-prune_cache.patch +Patch84: backport-Reversing-calculation-of-__x86_shared_non_t.patch +Patch85: backport-x86-Optimizing-memcpy-for-AMD-Zen-architect.patch +Patch86: backport-x86-Add-Hygon-support.patch Provides: ldconfig rtld(GNU_HASH) bundled(gnulib) @@ -1283,6 +1286,14 @@ fi %endif %changelog +* Tue Sep 03 2024 Feifei Wang - 2.28-102 +- x86: Reversing calculation of __x86_shared_non_temporal_threshold + x86: Optimizing memcpy for AMD Zen architecture + x86: Add Hygon Dhyana support + https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=d3c57027470b + https://sourceware.org/git/?p=glibc.git;a=commit;h=59803e81f96b479c17f583b31eac44b57591a1bf + https://sourceware.org/git/?p=glibc.git;a=commit;h=ade8b817fead73b302d08c88cd44ea2ea56793d4 + * Mon May 06 2024 chengyechun - 2.28-101 - Type:bugfix - ID: -- Gitee