From d5576a8feda207f06e46bcbcc1bdb566f0fd460a Mon Sep 17 00:00:00 2001
From: Qingqing Li
Date: Sun, 26 Jan 2025 10:05:30 +0800
Subject: [PATCH] backport from glibc upstream 2.38 branch; this includes the
 patches below:

- stdlib: Test using setenv with updated environ [BZ #32588]
- Fix underallocation of abort_msg_s struct (CVE-2025-0395)
- elf: Support recursive use of dynamic TLS in interposed malloc
- elf: Avoid some free (NULL) calls in _dl_update_slotinfo
- x86/string: Fixup alignment of main loop in str{n}cmp-evex [BZ #32212]
- x86: Improve large memset perf with non-temporal stores [RHEL-29312]
- x86_64: Fix missing wcsncat function definition without multiarch (x86-64-v4)
- sysdeps/x86/Makefile: Split and sort tests
- x86: Only align destination to 1x VEC_SIZE in memset 4x loop
- elf: Fix slow tls access after dlopen [BZ #19924]
- x86: Check the lower byte of EAX of CPUID leaf 2 [BZ #30643]
- x86_64: Add log1p with FMA
- x86_64: Add expm1 with FMA
- x86_64: Add log2 with FMA
- x86_64: Sort fpu/multiarch/Makefile
---
 ...ion-of-abort_msg_s-struct-CVE-2025-0.patch |  89 +++
 ...ree-NULL-calls-in-_dl_update_slotinf.patch |  50 ++
 ...low-tls-access-after-dlopen-BZ-19924.patch | 328 +++++++++++
 ...rsive-use-of-dynamic-TLS-in-interpos.patch | 521 ++++++++++++++++++
 glibc.spec                                    |  34 +-
 ...g-setenv-with-updated-environ-BZ-325.patch |  75 +++
 ...ps-x86-Makefile-Split-and-sort-tests.patch | 178 ++++++
 ...wer-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch |  77 +++
 ...e-memset-perf-with-non-temporal-stor.patch | 254 +++++++++
 ...estination-to-1x-VEC_SIZE-in-memset-.patch |  34 ++
 ...-alignment-of-main-loop-in-str-n-cmp.patch | 149 +++++
 x86_64-Add-expm1-with-FMA.patch               | 135 +++++
 x86_64-Add-log1p-with-FMA.patch               | 140 +++++
 x86_64-Add-log2-with-FMA.patch                | 102 ++++
 ...ng-wcsncat-function-definition-witho.patch |  44 ++
 x86_64-Sort-fpu-multiarch-Makefile.patch      | 144 +++++
 16 files changed, 2353 insertions(+), 1 deletion(-)
 create mode 100644 Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch
 create mode 100644 elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch
 create mode 100644 elf-Fix-slow-tls-access-after-dlopen-BZ-19924.patch
 create mode 100644 elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch
 create mode 100644 stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch
 create mode 100644 sysdeps-x86-Makefile-Split-and-sort-tests.patch
 create mode 100644 x86-Check-the-lower-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch
 create mode 100644 x86-Improve-large-memset-perf-with-non-temporal-stor.patch
 create mode 100644 x86-Only-align-destination-to-1x-VEC_SIZE-in-memset-.patch
 create mode 100644 x86-string-Fixup-alignment-of-main-loop-in-str-n-cmp.patch
 create mode 100644 x86_64-Add-expm1-with-FMA.patch
 create mode 100644 x86_64-Add-log1p-with-FMA.patch
 create mode 100644 x86_64-Add-log2-with-FMA.patch
 create mode 100644 x86_64-Fix-missing-wcsncat-function-definition-witho.patch
 create mode 100644 x86_64-Sort-fpu-multiarch-Makefile.patch

diff --git a/Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch b/Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch
new file mode 100644
index 0000000..64ab3bf
--- /dev/null
+++ b/Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch
@@ -0,0 +1,89 @@
+From c32fd59314c343db88c3ea4a203870481d33c3d2 Mon Sep 17 00:00:00 2001
+From: Siddhesh Poyarekar
+Date: Tue, 21 Jan 2025 16:11:06 -0500
+Subject: [PATCH] Fix underallocation of abort_msg_s struct
+ (CVE-2025-0395)
+
+Include the space needed to store the length of the message itself, in
+addition to the message string.  This resolves BZ #32582.
+
+Signed-off-by: Siddhesh Poyarekar
+Reviewed-by: Adhemerval Zanella
+(cherry picked from commit 68ee0f704cb81e9ad0a78c644a83e1e9cd2ee578)
+---
+ NEWS                       | 6 ++++++
+ assert/assert.c            | 4 +++-
+ sysdeps/posix/libc_fatal.c | 4 +++-
+ 3 files changed, 12 insertions(+), 2 deletions(-)
+
+diff --git a/NEWS b/NEWS
+index d0815514e0..3e511d6de4 100644
+--- a/NEWS
++++ b/NEWS
+@@ -34,6 +34,11 @@ Security related changes:
+   buffer overflow, which could be exploited to achieve escalated
+   privileges.  This flaw was introduced in glibc 2.34.
+
++  CVE-2025-0395: When the assert() function fails, it does not allocate
++  enough space for the assertion failure message string and size
++  information, which may lead to a buffer overflow if the message string
++  size aligns to page size.
++
+ The following bugs are resolved with this release:
+
+   [27821] ungetc: Fix backup buffer leak on program exit
+@@ -61,6 +66,7 @@ The following bugs are resolved with this release:
+   [32137] libio: Attempt wide backup free only for non-legacy code
+   [32231] elf: Change ldconfig auxcache magic number
+   [32470] x86: Avoid integer truncation with large cache sizes
++  [32582] Fix underallocation of abort_msg_s struct (CVE-2025-0395)
+
+ Version 2.38
+
+diff --git a/assert/assert.c b/assert/assert.c
+index b7c7a4a1ba..65a9fedf0d 100644
+--- a/assert/assert.c
++++ b/assert/assert.c
+@@ -18,6 +18,7 @@
+ #include
+ #include
+ #include
++#include <libc-pointer-arith.h>
+ #include
+ #include
+ #include
+@@ -64,7 +65,8 @@ __assert_fail_base (const char *fmt, const char *assertion, const char *file,
+       (void) __fxprintf (NULL, "%s", str);
+       (void) fflush (stderr);
+
+-      total = (total + 1 + GLRO(dl_pagesize) - 1) & ~(GLRO(dl_pagesize) - 1);
++      total = ALIGN_UP (total + sizeof (struct abort_msg_s) + 1,
++			GLRO(dl_pagesize));
+       struct abort_msg_s *buf = __mmap (NULL, total, PROT_READ | PROT_WRITE,
+					 MAP_ANON | MAP_PRIVATE, -1, 0);
+       if (__glibc_likely (buf != MAP_FAILED))
+diff --git a/sysdeps/posix/libc_fatal.c b/sysdeps/posix/libc_fatal.c
+index 70edcc10c1..5b9e4b7918 100644
+--- a/sysdeps/posix/libc_fatal.c
++++ b/sysdeps/posix/libc_fatal.c
+@@ -20,6 +20,7 @@
+ #include
+ #include
+ #include
++#include <libc-pointer-arith.h>
+ #include
+ #include
+ #include
+@@ -123,7 +124,8 @@ __libc_message (const char *fmt, ...)
+
+       WRITEV_FOR_FATAL (fd, iov, nlist, total);
+
+-      total = (total + 1 + GLRO(dl_pagesize) - 1) & ~(GLRO(dl_pagesize) - 1);
++      total = ALIGN_UP (total + sizeof (struct abort_msg_s) + 1,
++			GLRO(dl_pagesize));
+       struct abort_msg_s *buf = __mmap (NULL, total,
+					 PROT_READ | PROT_WRITE,
+					 MAP_ANON | MAP_PRIVATE, -1, 0);
+-- 
+2.27.0
+
diff --git a/elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch b/elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch
new file mode 100644
index 0000000..6cfe03e
--- /dev/null
+++ b/elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch
@@ -0,0 +1,50 @@
+From 48642ef1a5721e0a7694d84fe46d83b6086dfe75 Mon Sep 17 00:00:00 2001
+From: Florian Weimer
+Date: Mon, 3 Jun 2024 10:49:40 +0200
+Subject: [PATCH] elf: Avoid some free (NULL) calls in
+ _dl_update_slotinfo
+
+This has been confirmed to work around some interposed mallocs.  Here
+is a discussion of the impact test ust/libc-wrapper/test_libc-wrapper
+in lttng-tools:
+
+  New TLS usage in libgcc_s.so.1, compatibility impact
+
+
+Reportedly, this patch also papers over a similar issue when tcmalloc
+2.9.1 is not compiled with -ftls-model=initial-exec.
Of course the +goal really should be to compile mallocs with the initial-exec TLS +model, but this commit appears to be a useful interim workaround. + +Fixes commit d2123d68275acc0f061e73d5f86ca504e0d5a344 ("elf: Fix slow +tls access after dlopen [BZ #19924]"). + +Reviewed-by: Carlos O'Donell +(cherry picked from commit afe42e935b3ee97bac9a7064157587777259c60e) +--- + elf/dl-tls.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/elf/dl-tls.c b/elf/dl-tls.c +index 70446e71a8..de0168319c 100644 +--- a/elf/dl-tls.c ++++ b/elf/dl-tls.c +@@ -819,7 +819,14 @@ _dl_update_slotinfo (unsigned long int req_modid, size_t new_gen) + dtv entry free it. Note: this is not AS-safe. */ + /* XXX Ideally we will at some point create a memory + pool. */ +- free (dtv[modid].pointer.to_free); ++ /* Avoid calling free on a null pointer. Some mallocs ++ incorrectly use dynamic TLS, and depending on how the ++ free function was compiled, it could call ++ __tls_get_addr before the null pointer check in the ++ free implementation. Checking here papers over at ++ least some dynamic TLS usage by interposed mallocs. */ ++ if (dtv[modid].pointer.to_free != NULL) ++ free (dtv[modid].pointer.to_free); + dtv[modid].pointer.val = TLS_DTV_UNALLOCATED; + dtv[modid].pointer.to_free = NULL; + +-- +2.27.0 + diff --git a/elf-Fix-slow-tls-access-after-dlopen-BZ-19924.patch b/elf-Fix-slow-tls-access-after-dlopen-BZ-19924.patch new file mode 100644 index 0000000..d401ab1 --- /dev/null +++ b/elf-Fix-slow-tls-access-after-dlopen-BZ-19924.patch @@ -0,0 +1,328 @@ +From 7772f9358c9a947251196ea7844b339f0a423ff6 Mon Sep 17 00:00:00 2001 +From: Szabolcs Nagy +Date: Tue, 16 Feb 2021 12:55:13 +0000 +Subject: [PATCH] elf: Fix slow tls access after dlopen [BZ #19924] + +In short: __tls_get_addr checks the global generation counter and if +the current dtv is older then _dl_update_slotinfo updates dtv up to the +generation of the accessed module. So if the global generation is newer +than generation of the module then __tls_get_addr keeps hitting the +slow dtv update path. The dtv update path includes a number of checks +to see if any update is needed and this already causes measurable tls +access slow down after dlopen. + +It may be possible to detect up-to-date dtv faster. But if there are +many modules loaded (> TLS_SLOTINFO_SURPLUS) then this requires at +least walking the slotinfo list. + +This patch tries to update the dtv to the global generation instead, so +after a dlopen the tls access slow path is only hit once. The modules +with larger generation than the accessed one were not necessarily +synchronized before, so additional synchronization is needed. + +This patch uses acquire/release synchronization when accessing the +generation counter. + +Note: in the x86_64 version of dl-tls.c the generation is only loaded +once, since relaxed mo is not faster than acquire mo load. + +I have not benchmarked this. Tested by Adhemerval Zanella on aarch64, +powerpc, sparc, x86 who reported that it fixes the performance issue +of bug 19924. 
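+
+To illustrate the resulting shape of the fast path, here is a
+simplified C sketch of the __tls_get_addr change below (the GET_ADDR_*
+macro plumbing is replaced by a plain tls_index argument; this is an
+illustration, not the verbatim glibc code):
+
+  void *
+  __tls_get_addr (tls_index *ti)
+  {
+    dtv_t *dtv = THREAD_DTV ();
+
+    /* Fast path: a relaxed mo load is enough for the comparison.  */
+    size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
+    if (__glibc_unlikely (dtv[0].counter != gen))
+      {
+	/* Slow path, now hit at most once per dlopen: synchronize and
+	   update the DTV all the way to the global generation.  */
+	gen = atomic_load_acquire (&GL(dl_tls_generation));
+	return update_get_addr (ti, gen);
+      }
+
+    void *p = dtv[ti->ti_module].pointer.val;
+    if (__glibc_unlikely (p == TLS_DTV_UNALLOCATED))
+      return tls_get_addr_tail (ti, dtv, NULL);
+    return (char *) p + ti->ti_offset;
+  }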
+ +Reviewed-by: Adhemerval Zanella +(cherry picked from commit d2123d68275acc0f061e73d5f86ca504e0d5a344) +--- + elf/dl-close.c | 2 +- + elf/dl-open.c | 8 +-- + elf/dl-reloc.c | 6 +- + elf/dl-tls.c | 117 ++++++++++++++++++++----------------- + sysdeps/generic/ldsodefs.h | 3 +- + sysdeps/x86_64/dl-tls.c | 4 +- + 6 files changed, 74 insertions(+), 66 deletions(-) + +diff --git a/elf/dl-close.c b/elf/dl-close.c +index b887a44888..1c7a861db1 100644 +--- a/elf/dl-close.c ++++ b/elf/dl-close.c +@@ -703,7 +703,7 @@ _dl_close_worker (struct link_map *map, bool force) + if (__glibc_unlikely (newgen == 0)) + _dl_fatal_printf ("TLS generation counter wrapped! Please report as described in "REPORT_BUGS_TO".\n"); + /* Can be read concurrently. */ +- atomic_store_relaxed (&GL(dl_tls_generation), newgen); ++ atomic_store_release (&GL(dl_tls_generation), newgen); + + if (tls_free_end == GL(dl_tls_static_used)) + GL(dl_tls_static_used) = tls_free_start; +diff --git a/elf/dl-open.c b/elf/dl-open.c +index 2d985e21d8..351931af04 100644 +--- a/elf/dl-open.c ++++ b/elf/dl-open.c +@@ -405,7 +405,7 @@ update_tls_slotinfo (struct link_map *new) + _dl_fatal_printf (N_("\ + TLS generation counter wrapped! Please report this.")); + /* Can be read concurrently. */ +- atomic_store_relaxed (&GL(dl_tls_generation), newgen); ++ atomic_store_release (&GL(dl_tls_generation), newgen); + + /* We need a second pass for static tls data, because + _dl_update_slotinfo must not be run while calls to +@@ -422,8 +422,8 @@ TLS generation counter wrapped! Please report this.")); + now, but we can delay updating the DTV. */ + imap->l_need_tls_init = 0; + #ifdef SHARED +- /* Update the slot information data for at least the +- generation of the DSO we are allocating data for. */ ++ /* Update the slot information data for the current ++ generation. */ + + /* FIXME: This can terminate the process on memory + allocation failure. It is not possible to raise +@@ -431,7 +431,7 @@ TLS generation counter wrapped! Please report this.")); + _dl_update_slotinfo would have to be split into two + operations, similar to resize_scopes and update_scopes + above. This is related to bug 16134. */ +- _dl_update_slotinfo (imap->l_tls_modid); ++ _dl_update_slotinfo (imap->l_tls_modid, newgen); + #endif + + dl_init_static_tls (imap); +diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c +index 1d558c1e0c..e5c555d82c 100644 +--- a/elf/dl-reloc.c ++++ b/elf/dl-reloc.c +@@ -112,11 +112,11 @@ _dl_try_allocate_static_tls (struct link_map *map, bool optional) + if (map->l_real->l_relocated) + { + #ifdef SHARED ++ /* Update the DTV of the current thread. Note: GL(dl_load_tls_lock) ++ is held here so normal load of the generation counter is valid. */ + if (__builtin_expect (THREAD_DTV()[0].counter != GL(dl_tls_generation), + 0)) +- /* Update the slot information data for at least the generation of +- the DSO we are allocating data for. 
*/ +- (void) _dl_update_slotinfo (map->l_tls_modid); ++ (void) _dl_update_slotinfo (map->l_tls_modid, GL(dl_tls_generation)); + #endif + + dl_init_static_tls (map); +diff --git a/elf/dl-tls.c b/elf/dl-tls.c +index 1f6f820819..70446e71a8 100644 +--- a/elf/dl-tls.c ++++ b/elf/dl-tls.c +@@ -716,57 +716,57 @@ allocate_and_init (struct link_map *map) + + + struct link_map * +-_dl_update_slotinfo (unsigned long int req_modid) ++_dl_update_slotinfo (unsigned long int req_modid, size_t new_gen) + { + struct link_map *the_map = NULL; + dtv_t *dtv = THREAD_DTV (); + +- /* The global dl_tls_dtv_slotinfo array contains for each module +- index the generation counter current when the entry was created. ++ /* CONCURRENCY NOTES: ++ ++ The global dl_tls_dtv_slotinfo_list array contains for each module ++ index the generation counter current when that entry was updated. + This array never shrinks so that all module indices which were +- valid at some time can be used to access it. Before the first +- use of a new module index in this function the array was extended +- appropriately. Access also does not have to be guarded against +- modifications of the array. It is assumed that pointer-size +- values can be read atomically even in SMP environments. It is +- possible that other threads at the same time dynamically load +- code and therefore add to the slotinfo list. This is a problem +- since we must not pick up any information about incomplete work. +- The solution to this is to ignore all dtv slots which were +- created after the one we are currently interested. We know that +- dynamic loading for this module is completed and this is the last +- load operation we know finished. */ +- unsigned long int idx = req_modid; ++ valid at some time can be used to access it. Concurrent loading ++ and unloading of modules can update slotinfo entries or extend ++ the array. The updates happen under the GL(dl_load_tls_lock) and ++ finish with the release store of the generation counter to ++ GL(dl_tls_generation) which is synchronized with the load of ++ new_gen in the caller. So updates up to new_gen are synchronized ++ but updates for later generations may not be. ++ ++ Here we update the thread dtv from old_gen (== dtv[0].counter) to ++ new_gen generation. For this, each dtv[i] entry is either set to ++ an unallocated state (set), or left unmodified (nop). Where (set) ++ may resize the dtv first if modid i >= dtv[-1].counter. The rules ++ for the decision between (set) and (nop) are ++ ++ (1) If slotinfo entry i is concurrently updated then either (set) ++ or (nop) is valid: TLS access cannot use dtv[i] unless it is ++ synchronized with a generation > new_gen. ++ ++ Otherwise, if the generation of slotinfo entry i is gen and the ++ loaded module for this entry is map then ++ ++ (2) If gen <= old_gen then do (nop). ++ ++ (3) If old_gen < gen <= new_gen then ++ (3.1) if map != 0 then (set) ++ (3.2) if map == 0 then either (set) or (nop). ++ ++ Note that (1) cannot be reliably detected, but since both actions ++ are valid it does not have to be. Only (2) and (3.1) cases need ++ to be distinguished for which relaxed mo access of gen and map is ++ enough: their value is synchronized when it matters. ++ ++ Note that a relaxed mo load may give an out-of-thin-air value since ++ it is used in decisions that can affect concurrent stores. But this ++ should only happen if the OOTA value causes UB that justifies the ++ concurrent store of the value. This is not expected to be an issue ++ in practice. 
*/ + struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list); + +- while (idx >= listp->len) ++ if (dtv[0].counter < new_gen) + { +- idx -= listp->len; +- listp = listp->next; +- } +- +- if (dtv[0].counter < listp->slotinfo[idx].gen) +- { +- /* CONCURRENCY NOTES: +- +- Here the dtv needs to be updated to new_gen generation count. +- +- This code may be called during TLS access when GL(dl_load_tls_lock) +- is not held. In that case the user code has to synchronize with +- dlopen and dlclose calls of relevant modules. A module m is +- relevant if the generation of m <= new_gen and dlclose of m is +- synchronized: a memory access here happens after the dlopen and +- before the dlclose of relevant modules. The dtv entries for +- relevant modules need to be updated, other entries can be +- arbitrary. +- +- This e.g. means that the first part of the slotinfo list can be +- accessed race free, but the tail may be concurrently extended. +- Similarly relevant slotinfo entries can be read race free, but +- other entries are racy. However updating a non-relevant dtv +- entry does not affect correctness. For a relevant module m, +- max_modid >= modid of m. */ +- size_t new_gen = listp->slotinfo[idx].gen; + size_t total = 0; + size_t max_modid = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx)); + assert (max_modid >= req_modid); +@@ -779,31 +779,33 @@ _dl_update_slotinfo (unsigned long int req_modid) + { + size_t modid = total + cnt; + +- /* Later entries are not relevant. */ ++ /* Case (1) for all later modids. */ + if (modid > max_modid) + break; + + size_t gen = atomic_load_relaxed (&listp->slotinfo[cnt].gen); + ++ /* Case (1). */ + if (gen > new_gen) +- /* Not relevant. */ + continue; + +- /* If the entry is older than the current dtv layout we +- know we don't have to handle it. */ ++ /* Case (2) or (1). */ + if (gen <= dtv[0].counter) + continue; + ++ /* Case (3) or (1). */ ++ + /* If there is no map this means the entry is empty. */ + struct link_map *map + = atomic_load_relaxed (&listp->slotinfo[cnt].map); + /* Check whether the current dtv array is large enough. */ + if (dtv[-1].counter < modid) + { ++ /* Case (3.2) or (1). */ + if (map == NULL) + continue; + +- /* Resize the dtv. */ ++ /* Resizing the dtv aborts on failure: bug 16134. */ + dtv = _dl_resize_dtv (dtv, max_modid); + + assert (modid <= dtv[-1].counter); +@@ -814,7 +816,7 @@ _dl_update_slotinfo (unsigned long int req_modid) + } + + /* If there is currently memory allocate for this +- dtv entry free it. */ ++ dtv entry free it. Note: this is not AS-safe. */ + /* XXX Ideally we will at some point create a memory + pool. */ + free (dtv[modid].pointer.to_free); +@@ -909,9 +911,9 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t *dtv, struct link_map *the_map) + + static struct link_map * + __attribute_noinline__ +-update_get_addr (GET_ADDR_ARGS) ++update_get_addr (GET_ADDR_ARGS, size_t gen) + { +- struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE); ++ struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE, gen); + dtv_t *dtv = THREAD_DTV (); + + void *p = dtv[GET_ADDR_MODULE].pointer.val; +@@ -941,12 +943,17 @@ __tls_get_addr (GET_ADDR_ARGS) + dtv_t *dtv = THREAD_DTV (); + + /* Update is needed if dtv[0].counter < the generation of the accessed +- module. The global generation counter is used here as it is easier +- to check. Synchronization for the relaxed MO access is guaranteed +- by user code, see CONCURRENCY NOTES in _dl_update_slotinfo. 
*/
++   module, but the global generation counter is easier to check (which
++   must be synchronized up to the generation of the accessed module by
++   user code doing the TLS access so relaxed mo read is enough).  */
+   size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
+   if (__glibc_unlikely (dtv[0].counter != gen))
+-    return update_get_addr (GET_ADDR_PARAM);
++    {
++      /* Update DTV up to the global generation, see CONCURRENCY NOTES
++	 in _dl_update_slotinfo.  */
++      gen = atomic_load_acquire (&GL(dl_tls_generation));
++      return update_get_addr (GET_ADDR_PARAM, gen);
++    }
+
+   void *p = dtv[GET_ADDR_MODULE].pointer.val;
+
+diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
+index e8b7359b04..ed69c6babd 100644
+--- a/sysdeps/generic/ldsodefs.h
++++ b/sysdeps/generic/ldsodefs.h
+@@ -1251,7 +1251,8 @@ extern void _dl_add_to_slotinfo (struct link_map *l, bool do_add)
+
+ /* Update slot information data for at least the generation of the
+    module with the given index.  */
+-extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid)
++extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid,
++					     size_t gen)
+   attribute_hidden;
+
+ /* Look up the module's TLS block as for __tls_get_addr,
+diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
+index 7a7fe38625..e9b6ab9970 100644
+--- a/sysdeps/x86_64/dl-tls.c
++++ b/sysdeps/x86_64/dl-tls.c
+@@ -40,9 +40,9 @@ __tls_get_addr_slow (GET_ADDR_ARGS)
+ {
+   dtv_t *dtv = THREAD_DTV ();
+
+-  size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
++  size_t gen = atomic_load_acquire (&GL(dl_tls_generation));
+   if (__glibc_unlikely (dtv[0].counter != gen))
+-    return update_get_addr (GET_ADDR_PARAM);
++    return update_get_addr (GET_ADDR_PARAM, gen);
+
+   return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
+ }
+-- 
+2.27.0
+
diff --git a/elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch b/elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch
new file mode 100644
index 0000000..c60cbb7
--- /dev/null
+++ b/elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch
@@ -0,0 +1,521 @@
+From 549e7f7c5a94f5ccbab2ad5f1babca05028a31c7 Mon Sep 17 00:00:00 2001
+From: Florian Weimer
+Date: Mon, 1 Jul 2024 17:42:04 +0200
+Subject: [PATCH] elf: Support recursive use of dynamic TLS in interposed
+ malloc
+
+It turns out that quite a few applications use bundled mallocs that
+have been built to use global-dynamic TLS (instead of the recommended
+initial-exec TLS).  The previous workaround from
+commit afe42e935b3ee97bac9a7064157587777259c60e ("elf: Avoid some
+free (NULL) calls in _dl_update_slotinfo") does not fix all
+encountered cases, unfortunately.
+
+This change avoids the TLS generation update for recursive use
+of TLS from a malloc that was called during a TLS update.  This
+is possible because an interposed malloc has a fixed module ID and
+TLS slot.  (It cannot be unloaded.)  If an initially-loaded module ID
+is encountered in __tls_get_addr and the dynamic linker is already
+in the middle of a TLS update, use the outdated DTV, thus avoiding
+another call into malloc.  It's still necessary to update the
+DTV to the most recent generation, to get out of the slow path,
+which is why the check for recursion is needed.
+
+The bookkeeping is done using a global counter instead of a
+per-thread flag because TLS access in the dynamic linker is tricky.
+
+All this will go away once the dynamic linker stops using malloc
+for TLS, likely as part of a change that pre-allocates all TLS
+during pthread_create/dlopen.
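+
+As a sketch, the bookkeeping added below reduces to a global counter
+with three helpers (names as in the patch; bodies simplified):
+
+  /* Number of threads currently executing a dynamic TLS update.  Any
+     malloc/free performed on behalf of the DTV is bracketed by the
+     begin/end pair.  */
+  unsigned int _dl_tls_threads_in_update;
+
+  static inline void
+  _dl_tls_allocate_begin (void)
+  { atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, 1); }
+
+  static inline void
+  _dl_tls_allocate_end (void)
+  { atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, -1); }
+
+  static inline bool
+  _dl_tls_allocate_active (void)
+  { return atomic_load_relaxed (&_dl_tls_threads_in_update) > 0; }
+
+__tls_get_addr then serves a request for an initially-loaded module ID
+from the existing (possibly outdated) DTV whenever
+_dl_tls_allocate_active () reports an update in progress.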
+ +Fixes commit d2123d68275acc0f061e73d5f86ca504e0d5a344 ("elf: Fix slow +tls access after dlopen [BZ #19924]"). + +Reviewed-by: Szabolcs Nagy +(cherry picked from commit 018f0fc3b818d4d1460a4e2384c24802504b1d20) + +Conflict: adapt file "elf/Makefile" for patch "elf: Switch to main +malloc after final ld.so self-relocation" +--- + elf/Makefile | 26 +++++++++ + elf/dl-tls.c | 95 +++++++++++++++++++++++++++++--- + elf/rtld.c | 2 + + elf/tst-recursive-tls.c | 60 ++++++++++++++++++++ + elf/tst-recursive-tlsmallocmod.c | 64 +++++++++++++++++++++ + elf/tst-recursive-tlsmodN.c | 28 ++++++++++ + sysdeps/generic/ldsodefs.h | 14 +++++ + sysdeps/x86_64/dl-tls.c | 5 +- + 8 files changed, 284 insertions(+), 10 deletions(-) + create mode 100644 elf/tst-recursive-tls.c + create mode 100644 elf/tst-recursive-tlsmallocmod.c + create mode 100644 elf/tst-recursive-tlsmodN.c + +diff --git a/elf/Makefile b/elf/Makefile +index ea98cba8..391f29e9 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -433,6 +433,7 @@ tests += \ + tst-p_align1 \ + tst-p_align2 \ + tst-p_align3 \ ++ tst-recursive-tls \ + tst-relsort1 \ + tst-ro-dynamic \ + tst-rtld-no-malloc \ +@@ -865,6 +866,23 @@ modules-names += \ + tst-null-argv-lib \ + tst-p_alignmod-base \ + tst-p_alignmod3 \ ++ tst-recursive-tlsmallocmod \ ++ tst-recursive-tlsmod0 \ ++ tst-recursive-tlsmod1 \ ++ tst-recursive-tlsmod2 \ ++ tst-recursive-tlsmod3 \ ++ tst-recursive-tlsmod4 \ ++ tst-recursive-tlsmod5 \ ++ tst-recursive-tlsmod6 \ ++ tst-recursive-tlsmod7 \ ++ tst-recursive-tlsmod8 \ ++ tst-recursive-tlsmod9 \ ++ tst-recursive-tlsmod10 \ ++ tst-recursive-tlsmod11 \ ++ tst-recursive-tlsmod12 \ ++ tst-recursive-tlsmod13 \ ++ tst-recursive-tlsmod14 \ ++ tst-recursive-tlsmod15 \ + tst-relsort1mod1 \ + tst-relsort1mod2 \ + tst-ro-dynamic-mod \ +@@ -3042,6 +3060,14 @@ CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2 + CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2 + endif + ++$(objpfx)tst-recursive-tls: $(objpfx)tst-recursive-tlsmallocmod.so ++# More objects than DTV_SURPLUS, to trigger DTV reallocation. ++$(objpfx)tst-recursive-tls.out: \ ++ $(patsubst %,$(objpfx)tst-recursive-tlsmod%.so, \ ++ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) ++$(objpfx)tst-recursive-tlsmod%.os: tst-recursive-tlsmodN.c ++ $(compile-command.c) -DVAR=thread_$* -DFUNC=get_threadvar_$* ++ + # Reuse an audit module which provides ample debug logging. + tst-rtld-no-malloc-audit-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so + +diff --git a/elf/dl-tls.c b/elf/dl-tls.c +index de016831..59d4021e 100644 +--- a/elf/dl-tls.c ++++ b/elf/dl-tls.c +@@ -75,6 +75,31 @@ + /* Default for dl_tls_static_optional. */ + #define OPTIONAL_TLS 512 + ++/* Used to count the number of threads currently executing dynamic TLS ++ updates. Used to avoid recursive malloc calls in __tls_get_addr ++ for an interposed malloc that uses global-dynamic TLS (which is not ++ recommended); see _dl_tls_allocate_active checks. This could be a ++ per-thread flag, but would need TLS access in the dynamic linker. */ ++unsigned int _dl_tls_threads_in_update; ++ ++static inline void ++_dl_tls_allocate_begin (void) ++{ ++ atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, 1); ++} ++ ++static inline void ++_dl_tls_allocate_end (void) ++{ ++ atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, -1); ++} ++ ++static inline bool ++_dl_tls_allocate_active (void) ++{ ++ return atomic_load_relaxed (&_dl_tls_threads_in_update) > 0; ++} ++ + /* Compute the static TLS surplus based on the namespace count and the + TLS space that can be used for optimizations. 
*/ + static inline int +@@ -425,12 +450,18 @@ _dl_allocate_tls_storage (void) + size += TLS_PRE_TCB_SIZE; + #endif + +- /* Perform the allocation. Reserve space for the required alignment +- and the pointer to the original allocation. */ ++ /* Reserve space for the required alignment and the pointer to the ++ original allocation. */ + size_t alignment = GLRO (dl_tls_static_align); ++ ++ /* Perform the allocation. */ ++ _dl_tls_allocate_begin (); + void *allocated = malloc (size + alignment + sizeof (void *)); + if (__glibc_unlikely (allocated == NULL)) +- return NULL; ++ { ++ _dl_tls_allocate_end (); ++ return NULL; ++ } + + /* Perform alignment and allocate the DTV. */ + #if TLS_TCB_AT_TP +@@ -466,6 +497,8 @@ _dl_allocate_tls_storage (void) + result = allocate_dtv (result); + if (result == NULL) + free (allocated); ++ ++ _dl_tls_allocate_end (); + return result; + } + +@@ -483,6 +516,7 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid) + size_t newsize = max_modid + DTV_SURPLUS; + size_t oldsize = dtv[-1].counter; + ++ _dl_tls_allocate_begin (); + if (dtv == GL(dl_initial_dtv)) + { + /* This is the initial dtv that was either statically allocated in +@@ -502,6 +536,7 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid) + if (newp == NULL) + oom (); + } ++ _dl_tls_allocate_end (); + + newp[0].counter = newsize; + +@@ -676,7 +711,9 @@ allocate_dtv_entry (size_t alignment, size_t size) + if (powerof2 (alignment) && alignment <= _Alignof (max_align_t)) + { + /* The alignment is supported by malloc. */ ++ _dl_tls_allocate_begin (); + void *ptr = malloc (size); ++ _dl_tls_allocate_end (); + return (struct dtv_pointer) { ptr, ptr }; + } + +@@ -688,7 +725,10 @@ allocate_dtv_entry (size_t alignment, size_t size) + + /* Perform the allocation. This is the pointer we need to free + later. */ ++ _dl_tls_allocate_begin (); + void *start = malloc (alloc_size); ++ _dl_tls_allocate_end (); ++ + if (start == NULL) + return (struct dtv_pointer) {}; + +@@ -826,7 +866,11 @@ _dl_update_slotinfo (unsigned long int req_modid, size_t new_gen) + free implementation. Checking here papers over at + least some dynamic TLS usage by interposed mallocs. */ + if (dtv[modid].pointer.to_free != NULL) +- free (dtv[modid].pointer.to_free); ++ { ++ _dl_tls_allocate_begin (); ++ free (dtv[modid].pointer.to_free); ++ _dl_tls_allocate_end (); ++ } + dtv[modid].pointer.val = TLS_DTV_UNALLOCATED; + dtv[modid].pointer.to_free = NULL; + +@@ -956,10 +1000,22 @@ __tls_get_addr (GET_ADDR_ARGS) + size_t gen = atomic_load_relaxed (&GL(dl_tls_generation)); + if (__glibc_unlikely (dtv[0].counter != gen)) + { +- /* Update DTV up to the global generation, see CONCURRENCY NOTES +- in _dl_update_slotinfo. */ +- gen = atomic_load_acquire (&GL(dl_tls_generation)); +- return update_get_addr (GET_ADDR_PARAM, gen); ++ if (_dl_tls_allocate_active () ++ && GET_ADDR_MODULE < _dl_tls_initial_modid_limit) ++ /* This is a reentrant __tls_get_addr call, but we can ++ satisfy it because it's an initially-loaded module ID. ++ These TLS slotinfo slots do not change, so the ++ out-of-date generation counter does not matter. However, ++ if not in a TLS update, still update_get_addr below, to ++ get off the slow path eventually. */ ++ ; ++ else ++ { ++ /* Update DTV up to the global generation, see CONCURRENCY NOTES ++ in _dl_update_slotinfo. 
*/ ++ gen = atomic_load_acquire (&GL(dl_tls_generation)); ++ return update_get_addr (GET_ADDR_PARAM, gen); ++ } + } + + void *p = dtv[GET_ADDR_MODULE].pointer.val; +@@ -969,7 +1025,7 @@ __tls_get_addr (GET_ADDR_ARGS) + + return (char *) p + GET_ADDR_OFFSET; + } +-#endif ++#endif /* SHARED */ + + + /* Look up the module's TLS block as for __tls_get_addr, +@@ -1018,6 +1074,25 @@ _dl_tls_get_addr_soft (struct link_map *l) + return data; + } + ++size_t _dl_tls_initial_modid_limit; ++ ++void ++_dl_tls_initial_modid_limit_setup (void) ++{ ++ struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list); ++ size_t idx; ++ for (idx = 0; idx < listp->len; ++idx) ++ { ++ struct link_map *l = listp->slotinfo[idx].map; ++ if (l == NULL ++ /* The object can be unloaded, so its modid can be ++ reassociated. */ ++ || !(l->l_type == lt_executable || l->l_type == lt_library)) ++ break; ++ } ++ _dl_tls_initial_modid_limit = idx; ++} ++ + + void + _dl_add_to_slotinfo (struct link_map *l, bool do_add) +@@ -1050,9 +1125,11 @@ _dl_add_to_slotinfo (struct link_map *l, bool do_add) + the first slot. */ + assert (idx == 0); + ++ _dl_tls_allocate_begin (); + listp = (struct dtv_slotinfo_list *) + malloc (sizeof (struct dtv_slotinfo_list) + + TLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo)); ++ _dl_tls_allocate_end (); + if (listp == NULL) + { + /* We ran out of memory while resizing the dtv slotinfo list. */ +diff --git a/elf/rtld.c b/elf/rtld.c +index 558733b8..0a1e202c 100644 +--- a/elf/rtld.c ++++ b/elf/rtld.c +@@ -789,6 +789,8 @@ init_tls (size_t naudit) + _dl_fatal_printf ("\ + cannot allocate TLS data structures for initial thread\n"); + ++ _dl_tls_initial_modid_limit_setup (); ++ + /* Store for detection of the special case by __tls_get_addr + so it knows not to pass this dtv to the normal realloc. */ + GL(dl_initial_dtv) = GET_DTV (tcbp); +diff --git a/elf/tst-recursive-tls.c b/elf/tst-recursive-tls.c +new file mode 100644 +index 00000000..716d1f78 +--- /dev/null ++++ b/elf/tst-recursive-tls.c +@@ -0,0 +1,60 @@ ++/* Test with interposed malloc with dynamic TLS. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++ ++/* Defined in tst-recursive-tlsmallocmod.so. */ ++extern __thread unsigned int malloc_subsytem_counter; ++ ++static int ++do_test (void) ++{ ++ /* 16 is large enough to exercise the DTV resizing case. */ ++ void *handles[16]; ++ ++ for (unsigned int i = 0; i < array_length (handles); ++i) ++ { ++ /* Re-use the TLS slot for module 0. 
*/ ++ if (i > 0) ++ xdlclose (handles[0]); ++ ++ char soname[30]; ++ snprintf (soname, sizeof (soname), "tst-recursive-tlsmod%u.so", i); ++ handles[i] = xdlopen (soname, RTLD_NOW); ++ ++ if (i > 0) ++ { ++ handles[0] = xdlopen ("tst-recursive-tlsmod0.so", RTLD_NOW); ++ int (*fptr) (void) = xdlsym (handles[0], "get_threadvar_0"); ++ /* May trigger TLS storage allocation using malloc. */ ++ TEST_COMPARE (fptr (), 0); ++ } ++ } ++ ++ for (unsigned int i = 0; i < array_length (handles); ++i) ++ xdlclose (handles[i]); ++ ++ printf ("info: malloc subsystem calls: %u\n", malloc_subsytem_counter); ++ TEST_VERIFY (malloc_subsytem_counter > 0); ++ return 0; ++} ++ ++#include +diff --git a/elf/tst-recursive-tlsmallocmod.c b/elf/tst-recursive-tlsmallocmod.c +new file mode 100644 +index 00000000..c24e9945 +--- /dev/null ++++ b/elf/tst-recursive-tlsmallocmod.c +@@ -0,0 +1,64 @@ ++/* Interposed malloc with dynamic TLS. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++ ++__thread unsigned int malloc_subsytem_counter; ++ ++static __typeof (malloc) *malloc_fptr; ++static __typeof (free) *free_fptr; ++static __typeof (calloc) *calloc_fptr; ++static __typeof (realloc) *realloc_fptr; ++ ++static void __attribute__ ((constructor)) ++init (void) ++{ ++ malloc_fptr = dlsym (RTLD_NEXT, "malloc"); ++ free_fptr = dlsym (RTLD_NEXT, "free"); ++ calloc_fptr = dlsym (RTLD_NEXT, "calloc"); ++ realloc_fptr = dlsym (RTLD_NEXT, "realloc"); ++} ++ ++void * ++malloc (size_t size) ++{ ++ ++malloc_subsytem_counter; ++ return malloc_fptr (size); ++} ++ ++void ++free (void *ptr) ++{ ++ ++malloc_subsytem_counter; ++ return free_fptr (ptr); ++} ++ ++void * ++calloc (size_t a, size_t b) ++{ ++ ++malloc_subsytem_counter; ++ return calloc_fptr (a, b); ++} ++ ++void * ++realloc (void *ptr, size_t size) ++{ ++ ++malloc_subsytem_counter; ++ return realloc_fptr (ptr, size); ++} +diff --git a/elf/tst-recursive-tlsmodN.c b/elf/tst-recursive-tlsmodN.c +new file mode 100644 +index 00000000..bb7592ae +--- /dev/null ++++ b/elf/tst-recursive-tlsmodN.c +@@ -0,0 +1,28 @@ ++/* Test module with global-dynamic TLS. Used to trigger DTV reallocation. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. 
++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* Compiled with VAR and FUNC set via -D. FUNC requires some ++ relocation against TLS variable VAR. */ ++ ++__thread int VAR; ++ ++int ++FUNC (void) ++{ ++ return VAR; ++} +diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h +index 22fbbecd..ad271ae0 100644 +--- a/sysdeps/generic/ldsodefs.h ++++ b/sysdeps/generic/ldsodefs.h +@@ -1262,6 +1262,20 @@ extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid, + size_t gen) + attribute_hidden; + ++/* The last TLS module ID that is initially loaded, plus 1. TLS ++ addresses for modules with IDs lower than that can be obtained from ++ the DTV even if its generation is outdated. */ ++extern size_t _dl_tls_initial_modid_limit attribute_hidden attribute_relro; ++ ++/* Compute _dl_tls_initial_modid_limit. To be called after initial ++ relocation. */ ++void _dl_tls_initial_modid_limit_setup (void) attribute_hidden; ++ ++/* Number of threads currently in a TLS update. This is used to ++ detect reentrant __tls_get_addr calls without a per-thread ++ flag. */ ++extern unsigned int _dl_tls_threads_in_update attribute_hidden; ++ + /* Look up the module's TLS block as for __tls_get_addr, + but never touch anything. Return null if it's not allocated yet. */ + extern void *_dl_tls_get_addr_soft (struct link_map *l) attribute_hidden; +diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c +index e9b6ab99..c484f39e 100644 +--- a/sysdeps/x86_64/dl-tls.c ++++ b/sysdeps/x86_64/dl-tls.c +@@ -41,7 +41,10 @@ __tls_get_addr_slow (GET_ADDR_ARGS) + dtv_t *dtv = THREAD_DTV (); + + size_t gen = atomic_load_acquire (&GL(dl_tls_generation)); +- if (__glibc_unlikely (dtv[0].counter != gen)) ++ if (__glibc_unlikely (dtv[0].counter != gen) ++ /* See comment in __tls_get_addr in elf/dl-tls.c. 
*/ ++ && !(_dl_tls_allocate_active () ++ && GET_ADDR_MODULE < _dl_tls_initial_modid_limit)) + return update_get_addr (GET_ADDR_PARAM, gen); + + return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL); +-- +2.27.0 + diff --git a/glibc.spec b/glibc.spec index f35d480..5589315 100644 --- a/glibc.spec +++ b/glibc.spec @@ -67,7 +67,7 @@ ############################################################################## Name: glibc Version: 2.38 -Release: 51 +Release: 52 Summary: The GNU libc libraries License: %{all_license} URL: http://www.gnu.org/software/glibc/ @@ -252,6 +252,21 @@ Patch162: nptl-initialize-rseq-area-prior-to-registration.patch Patch163: nptl-initialize-cpu_id_start-prior-to-rseq-registrat.patch Patch164: x86-Avoid-integer-truncation-with-large-cache-sizes-.patch Patch165: LoongArch-Force-SHMLBA-the-same-as-kernel.patch +Patch166: x86_64-Sort-fpu-multiarch-Makefile.patch +Patch167: x86_64-Add-log2-with-FMA.patch +Patch168: x86_64-Add-expm1-with-FMA.patch +Patch169: x86_64-Add-log1p-with-FMA.patch +Patch170: x86-Check-the-lower-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch +Patch171: elf-Fix-slow-tls-access-after-dlopen-BZ-19924.patch +Patch172: x86-Only-align-destination-to-1x-VEC_SIZE-in-memset-.patch +Patch173: sysdeps-x86-Makefile-Split-and-sort-tests.patch +Patch174: x86_64-Fix-missing-wcsncat-function-definition-witho.patch +Patch175: x86-Improve-large-memset-perf-with-non-temporal-stor.patch +Patch176: x86-string-Fixup-alignment-of-main-loop-in-str-n-cmp.patch +Patch177: elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch +Patch178: elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch +Patch179: Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch +Patch180: stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch #openEuler patch list Patch9000: turn-default-value-of-x86_rep_stosb_threshold_form_2K_to_1M.patch @@ -1471,6 +1486,23 @@ fi %endif %changelog +* Sun Jan 26 2025 Qingqing Li - 2.38-52 +- stdlib: Test using setenv with updated environ [BZ #32588] +- Fix underallocation of abort_msg_s struct (CVE-2025-0395) +- elf: Support recursive use of dynamic TLS in interposed malloc +- elf: Avoid some free (NULL) calls in _dl_update_slotinfo +- x86/string: Fixup alignment of main loop in str{n}cmp-evex [BZ #32212] +- x86: Improve large memset perf with non-temporal stores [RHEL-29312] +- x86_64: Fix missing wcsncat function definition without multiarch (x86-64-v4) +- sysdeps/x86/Makefile: Split and sort tests +- x86: Only align destination to 1x VEC_SIZE in memset 4x loop +- elf: Fix slow tls access after dlopen [BZ #19924] +- x86: Check the lower byte of EAX of CPUID leaf 2 [BZ #30643] +- x86_64: Add log1p with FMA +- x86_64: Add expm1 with FMA +- x86_64: Add log2 with FMA +- x86_64: Sort fpu/multiarch/Makefile + * Wed Jan 15 2025 MayShao - 2.38-51 - x86: Set preferred CPU features and default NT threshold for Zhaoxin processors diff --git a/stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch b/stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch new file mode 100644 index 0000000..9bbb5c3 --- /dev/null +++ b/stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch @@ -0,0 +1,75 @@ +From 650a0aaaffa9ddb44732fa6156b31c5f30ee596f Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 24 Jan 2025 18:53:13 +0800 +Subject: [PATCH] stdlib: Test using setenv with updated environ [BZ + #32588] + +Add a test for setenv with updated environ. Verify that BZ #32588 is +fixed. + +Signed-off-by: H.J. 
Lu +Reviewed-by: Florian Weimer +(cherry picked from commit 8ab34497de14e35aff09b607222fe1309ef156da) +--- + stdlib/Makefile | 1 + + stdlib/tst-setenv-environ.c | 36 ++++++++++++++++++++++++++++++++++++ + 2 files changed, 37 insertions(+) + create mode 100644 stdlib/tst-setenv-environ.c + +diff --git a/stdlib/Makefile b/stdlib/Makefile +index 25e42a77e7..750810ee92 100644 +--- a/stdlib/Makefile ++++ b/stdlib/Makefile +@@ -232,6 +232,7 @@ tests := \ + tst-setcontext7 \ + tst-setcontext8 \ + tst-setcontext9 \ ++ tst-setenv-environ \ + tst-strfmon_l \ + tst-strfrom \ + tst-strfrom-locale \ +diff --git a/stdlib/tst-setenv-environ.c b/stdlib/tst-setenv-environ.c +new file mode 100644 +index 0000000000..02fcef96d0 +--- /dev/null ++++ b/stdlib/tst-setenv-environ.c +@@ -0,0 +1,36 @@ ++/* Test using setenv with updated environ. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++ ++extern char **environ; ++ ++int ++do_test (void) ++{ ++ char *valp; ++ static char *dummy_environ[] = { NULL }; ++ environ = dummy_environ; ++ setenv ("A", "1", 0); ++ valp = getenv ("A"); ++ TEST_VERIFY_EXIT (valp[0] == '1' && valp[1] == '\0'); ++ return 0; ++} ++ ++#include +-- +2.27.0 + diff --git a/sysdeps-x86-Makefile-Split-and-sort-tests.patch b/sysdeps-x86-Makefile-Split-and-sort-tests.patch new file mode 100644 index 0000000..6925e1a --- /dev/null +++ b/sysdeps-x86-Makefile-Split-and-sort-tests.patch @@ -0,0 +1,178 @@ +From 0d14bf0754ee8d8cf2bf3dad298fa5c5f97537db Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 7 Dec 2023 09:00:11 -0800 +Subject: [PATCH] sysdeps/x86/Makefile: Split and sort tests + +Put each test on a separate line and sort tests. 
+ +(cherry picked from commit 7e03e0de7e7c2de975b5c5e18f5a4b0c75816674) +--- + sysdeps/x86/Makefile | 110 ++++++++++++++++++++++++++++++------------- + 1 file changed, 78 insertions(+), 32 deletions(-) + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 917c26f116..5631a59a26 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -10,36 +10,51 @@ sysdep_headers += sys/platform/x86.h bits/platform/x86.h + CFLAGS-dl-get-cpu-features.os += $(rtld-early-cflags) + CFLAGS-get-cpuid-feature-leaf.o += $(no-stack-protector) + +-tests += tst-get-cpu-features tst-get-cpu-features-static \ +- tst-cpu-features-cpuinfo tst-cpu-features-cpuinfo-static \ +- tst-cpu-features-supports tst-cpu-features-supports-static +-tests-static += tst-get-cpu-features-static \ +- tst-cpu-features-cpuinfo-static \ +- tst-cpu-features-supports-static ++tests += \ ++ tst-get-cpu-features \ ++ tst-get-cpu-features-static \ ++ tst-cpu-features-cpuinfo \ ++ tst-cpu-features-cpuinfo-static \ ++ tst-cpu-features-supports \ ++ tst-cpu-features-supports-static \ ++# tests ++tests-static += \ ++ tst-get-cpu-features-static \ ++ tst-cpu-features-cpuinfo-static \ ++ tst-cpu-features-supports-static \ ++# tests-static + ifeq (yes,$(have-ifunc)) + ifeq (yes,$(have-gcc-ifunc)) + tests += \ + tst-ifunc-isa-1 \ +- tst-ifunc-isa-1-static ++ tst-ifunc-isa-1-static \ ++# tests + tests-static += \ +- tst-ifunc-isa-1-static ++ tst-ifunc-isa-1-static \ ++# tests-static + test-xfail-tst-ifunc-isa-1 = $(with-lld) + test-xfail-tst-ifunc-isa-1-static = $(with-lld) + tests += \ + tst-ifunc-isa-2 \ +- tst-ifunc-isa-2-static ++ tst-ifunc-isa-2-static \ ++# tests + tests-static += \ +- tst-ifunc-isa-2-static ++ tst-ifunc-isa-2-static \ ++# tests-static + test-xfail-tst-ifunc-isa-2 = $(with-lld) + test-xfail-tst-ifunc-isa-2-static = $(with-lld) + endif + endif + ifeq (yes,$(enable-x86-isa-level)) +-tests += tst-isa-level-1 +-modules-names += tst-isa-level-mod-1-baseline \ +- tst-isa-level-mod-1-v2 \ +- tst-isa-level-mod-1-v3 \ +- tst-isa-level-mod-1-v4 \ ++tests += \ ++ tst-isa-level-1 \ ++# tests ++modules-names += \ ++ tst-isa-level-mod-1-baseline \ ++ tst-isa-level-mod-1-v2 \ ++ tst-isa-level-mod-1-v3 \ ++ tst-isa-level-mod-1-v4 \ ++# modules-names + + # X86 ISA level baseline + CFLAGS-tst-isa-level-mod-1-baseline.c += -DINCLUDE_X86_ISA_LEVEL \ +@@ -68,14 +83,18 @@ tst-ifunc-isa-2-static-ENV = $(tst-ifunc-isa-2-ENV) + endif + + ifeq ($(subdir),math) +-tests += tst-ldbl-nonnormal-printf ++tests += \ ++ tst-ldbl-nonnormal-printf \ ++# tests + endif # $(subdir) == math + + ifeq ($(subdir),setjmp) + gen-as-const-headers += jmp_buf-ssp.sym + sysdep_routines += __longjmp_cancel + ifneq ($(enable-cet),no) +-tests += tst-setjmp-cet ++tests += \ ++ tst-setjmp-cet \ ++# tests + tst-setjmp-cet-ENV = GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on + endif + endif +@@ -122,20 +141,45 @@ ifneq ($(enable-cet),no) + ifeq ($(subdir),elf) + sysdep-dl-routines += dl-cet + +-tests += tst-cet-legacy-1 tst-cet-legacy-1a tst-cet-legacy-2 \ +- tst-cet-legacy-2a tst-cet-legacy-3 tst-cet-legacy-4 \ +- tst-cet-legacy-5a tst-cet-legacy-6a tst-cet-legacy-7 \ +- tst-cet-legacy-8 tst-cet-legacy-9 tst-cet-legacy-9-static \ +- tst-cet-legacy-10 tst-cet-legacy-10-static +-tests-static += tst-cet-legacy-9-static tst-cet-legacy-10-static ++tests += \ ++ tst-cet-legacy-1 \ ++ tst-cet-legacy-1a \ ++ tst-cet-legacy-2 \ ++ tst-cet-legacy-2a \ ++ tst-cet-legacy-3 \ ++ tst-cet-legacy-4 \ ++ tst-cet-legacy-5a \ ++ tst-cet-legacy-6a \ ++ tst-cet-legacy-7 
\ ++ tst-cet-legacy-8 \ ++ tst-cet-legacy-9 \ ++ tst-cet-legacy-9-static \ ++ tst-cet-legacy-10 \ ++ tst-cet-legacy-10-static \ ++# tests ++tests-static += \ ++ tst-cet-legacy-9-static \ ++ tst-cet-legacy-10-static \ ++# tests-static + tst-cet-legacy-1a-ARGS = -- $(host-test-program-cmd) +-tests += tst-cet-legacy-4a tst-cet-legacy-4b tst-cet-legacy-4c \ +- tst-cet-legacy-5b tst-cet-legacy-6b +-modules-names += tst-cet-legacy-mod-1 tst-cet-legacy-mod-2 \ +- tst-cet-legacy-mod-4 tst-cet-legacy-mod-5a \ +- tst-cet-legacy-mod-5b tst-cet-legacy-mod-5c \ +- tst-cet-legacy-mod-6a tst-cet-legacy-mod-6b \ +- tst-cet-legacy-mod-6c ++tests += \ ++ tst-cet-legacy-4a \ ++ tst-cet-legacy-4b \ ++ tst-cet-legacy-4c \ ++ tst-cet-legacy-5b \ ++ tst-cet-legacy-6b \ ++# tests ++modules-names += \ ++ tst-cet-legacy-mod-1 \ ++ tst-cet-legacy-mod-2 \ ++ tst-cet-legacy-mod-4 \ ++ tst-cet-legacy-mod-5a \ ++ tst-cet-legacy-mod-5b \ ++ tst-cet-legacy-mod-5c \ ++ tst-cet-legacy-mod-6a \ ++ tst-cet-legacy-mod-6b \ ++ tst-cet-legacy-mod-6c \ ++# modules-names + + CFLAGS-tst-cet-legacy-2.c += -fcf-protection=branch + CFLAGS-tst-cet-legacy-2a.c += -fcf-protection +@@ -243,7 +287,9 @@ endif + ifeq ($(subdir),posix) + tests += \ + tst-sysconf-cache-linesize \ +- tst-sysconf-cache-linesize-static ++ tst-sysconf-cache-linesize-static \ ++# tests + tests-static += \ +- tst-sysconf-cache-linesize-static ++ tst-sysconf-cache-linesize-static \ ++# tests-static + endif +-- +2.27.0 + diff --git a/x86-Check-the-lower-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch b/x86-Check-the-lower-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch new file mode 100644 index 0000000..701321b --- /dev/null +++ b/x86-Check-the-lower-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch @@ -0,0 +1,77 @@ +From 58822f954f6284c8687dfff43fa4e9e349eeccad Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 28 Aug 2023 12:08:14 -0700 +Subject: [PATCH] x86: Check the lower byte of EAX of CPUID leaf 2 [BZ + #30643] + +The old Intel software developer manual specified that the low byte of +EAX of CPUID leaf 2 returned 1 which indicated the number of rounds of +CPUDID leaf 2 was needed to retrieve the complete cache information. The +newer Intel manual has been changed to that it should always return 1 +and be ignored. If the lower byte isn't 1, CPUID leaf 2 can't be used. +In this case, we ignore CPUID leaf 2 and use CPUID leaf 4 instead. If +CPUID leaf 4 doesn't contain the cache information, cache information +isn't available at all. This addresses BZ #30643. + +(cherry picked from commit 1493622f4f9048ffede3fbedb64695efa49d662a) +--- + sysdeps/x86/dl-cacheinfo.h | 31 +++++++++++++------------------ + 1 file changed, 13 insertions(+), 18 deletions(-) + +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index 6c7740422a..400d15f208 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -187,7 +187,7 @@ intel_check_word (int name, unsigned int value, bool *has_level_2, + ++round; + } + /* There is no other cache information anywhere else. */ +- break; ++ return -1; + } + else + { +@@ -257,28 +257,23 @@ handle_intel (int name, const struct cpu_features *cpu_features) + + /* OK, we can use the CPUID instruction to get all info about the + caches. 
*/
+-  unsigned int cnt = 0;
+-  unsigned int max = 1;
+   long int result = 0;
+   bool no_level_2_or_3 = false;
+   bool has_level_2 = false;
++  unsigned int eax;
++  unsigned int ebx;
++  unsigned int ecx;
++  unsigned int edx;
++  __cpuid (2, eax, ebx, ecx, edx);
+
+-  while (cnt++ < max)
++  /* The low byte of EAX of CPUID leaf 2 should always return 1 and it
++     should be ignored.  If it isn't 1, use CPUID leaf 4 instead.  */
++  if ((eax & 0xff) != 1)
++    return intel_check_word (name, 0xff, &has_level_2, &no_level_2_or_3,
++			     cpu_features);
++  else
+     {
+-      unsigned int eax;
+-      unsigned int ebx;
+-      unsigned int ecx;
+-      unsigned int edx;
+-      __cpuid (2, eax, ebx, ecx, edx);
+-
+-      /* The low byte of EAX in the first round contain the number of
+-	 rounds we have to make.  At least one, the one we are already
+-	 doing.  */
+-      if (cnt == 1)
+-	{
+-	  max = eax & 0xff;
+-	  eax &= 0xffffff00;
+-	}
++      eax &= 0xffffff00;
+
+       /* Process the individual registers' value.  */
+       result = intel_check_word (name, eax, &has_level_2,
+-- 
+2.27.0
+
diff --git a/x86-Improve-large-memset-perf-with-non-temporal-stor.patch b/x86-Improve-large-memset-perf-with-non-temporal-stor.patch
new file mode 100644
index 0000000..abd7fdf
--- /dev/null
+++ b/x86-Improve-large-memset-perf-with-non-temporal-stor.patch
@@ -0,0 +1,254 @@
+From 04b8d484323b2ff18b3422c4b883ef4cb6281c53 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Fri, 24 May 2024 12:38:50 -0500
+Subject: [PATCH] x86: Improve large memset perf with non-temporal stores
+ [RHEL-29312]
+
+Previously we used `rep stosb` for all medium/large memsets.  This is
+notably worse than non-temporal stores for large (above a few MBs)
+memsets.  See:
+https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
+for data using different strategies for large memset on ICX and SKX.
+
+Using non-temporal stores can be up to 3x faster on ICX and 2x faster
+on SKX.  Historically, these numbers would not have been so good
+because of the zero-over-zero writeback optimization that `rep stosb`
+is able to do.  But, the zero-over-zero writeback optimization has been
+removed as a potential side-channel attack, so there is no longer any
+good reason to only rely on `rep stosb` for large memsets.  On the flip
+side, non-temporal writes can avoid data in their RFO requests, saving
+memory bandwidth.
+
+All of the other changes to the file are to re-organize the
+code-blocks to maintain "good" alignment given the new code added in
+the `L(stosb_local)` case.
+
+The results from running the GLIBC memset benchmarks on TGL-client for
+N=20 runs:
+
+Geometric Mean across the suite New / Old EVEX256: 0.979
+Geometric Mean across the suite New / Old EVEX512: 0.979
+Geometric Mean across the suite New / Old AVX2   : 0.986
+Geometric Mean across the suite New / Old SSE2   : 0.979
+
+Most of the cases are essentially unchanged; this is mostly to show
+that adding the non-temporal case didn't add any regressions to the
+other cases.
+
+The results on the memset-large benchmark suite on TGL-client for N=20
+runs:
+
+Geometric Mean across the suite New / Old EVEX256: 0.926
+Geometric Mean across the suite New / Old EVEX512: 0.925
+Geometric Mean across the suite New / Old AVX2   : 0.928
+Geometric Mean across the suite New / Old SSE2   : 0.924
+
+So roughly a 7.5% speedup.  This is lower than what we see on servers
+(likely because clients typically have faster single-core bandwidth so
+saving bandwidth on RFOs is less impactful), but still advantageous.
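+
+Conceptually, the new large-size path behaves like this C sketch using
+SSE2 intrinsics (illustration only, with a hypothetical helper name:
+it assumes dst is 16-byte aligned and n is a multiple of 16; the real
+code below is hand-written assembly parameterized over VEC_SIZE that
+also handles the unaligned head/tail with plain vector stores):
+
+  #include <emmintrin.h>
+
+  static void
+  memset_nontemporal (char *dst, int c, size_t n)
+  {
+    __m128i v = _mm_set1_epi8 ((char) c);
+    for (size_t i = 0; i < n; i += 16)
+      /* Streaming store: bypasses the cache hierarchy and avoids the
+	 RFO read of the destination lines.  */
+      _mm_stream_si128 ((__m128i *) (dst + i), v);
+    /* Make the weakly-ordered NT stores visible before returning.  */
+    _mm_sfence ();
+  }
+
+Selection follows the thresholds described in the comment added to the
+file: `rep stosb` in [__x86_rep_stosb_threshold,
+__x86_shared_non_temporal_threshold) and the non-temporal loop at or
+above __x86_shared_non_temporal_threshold.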
+ +Full test-suite passes on x86_64 w/ and w/o multiarch. +Reviewed-by: H.J. Lu + +(cherry picked from commit 5bf0ab80573d66e4ae5d94b094659094336da90f) +--- + .../multiarch/memset-vec-unaligned-erms.S | 147 +++++++++++------- + 1 file changed, 91 insertions(+), 56 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 0f0636b90f..aba45e3da0 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -21,8 +21,13 @@ + 2. If size is less than VEC, use integer register stores. + 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores. + 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores. +- 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with +- 4 VEC stores and store 4 * VEC at a time until done. */ ++ 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with ++ 4 VEC stores and store 4 * VEC at a time until done. ++ 6. On machines ERMS feature, if size is range ++ [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold) ++ then REP STOSB will be used. ++ 7. If size >= __x86_shared_non_temporal_threshold, use a ++ non-temporal stores. */ + + #include + +@@ -145,6 +150,41 @@ L(entry_from_wmemset): + VMOVU %VMM(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VMM(0), (%rdi) + VZEROUPPER_RETURN ++ ++ /* If have AVX512 mask instructions put L(less_vec) close to ++ entry as it doesn't take much space and is likely a hot target. */ ++#ifdef USE_LESS_VEC_MASK_STORE ++ /* Align to ensure the L(less_vec) logic all fits in 1x cache lines. */ ++ .p2align 6,, 47 ++ .p2align 4 ++L(less_vec): ++L(less_vec_from_wmemset): ++ /* Less than 1 VEC. */ ++# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 ++# error Unsupported VEC_SIZE! ++# endif ++ /* Clear high bits from edi. Only keeping bits relevant to page ++ cross check. Note that we are using rax which is set in ++ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */ ++ andl $(PAGE_SIZE - 1), %edi ++ /* Check if VEC_SIZE store cross page. Mask stores suffer ++ serious performance degradation when it has to fault suppress. */ ++ cmpl $(PAGE_SIZE - VEC_SIZE), %edi ++ /* This is generally considered a cold target. */ ++ ja L(cross_page) ++# if VEC_SIZE > 32 ++ movq $-1, %rcx ++ bzhiq %rdx, %rcx, %rcx ++ kmovq %rcx, %k1 ++# else ++ movl $-1, %ecx ++ bzhil %edx, %ecx, %ecx ++ kmovd %ecx, %k1 ++# endif ++ vmovdqu8 %VMM(0), (%rax){%k1} ++ VZEROUPPER_RETURN ++#endif ++ + #if defined USE_MULTIARCH && IS_IN (libc) + END (MEMSET_SYMBOL (__memset, unaligned)) + +@@ -183,54 +223,6 @@ L(last_2x_vec): + #endif + VZEROUPPER_RETURN + +- /* If have AVX512 mask instructions put L(less_vec) close to +- entry as it doesn't take much space and is likely a hot target. +- */ +-#ifdef USE_LESS_VEC_MASK_STORE +- .p2align 4,, 10 +-L(less_vec): +-L(less_vec_from_wmemset): +- /* Less than 1 VEC. */ +-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +-# error Unsupported VEC_SIZE! +-# endif +- /* Clear high bits from edi. Only keeping bits relevant to page +- cross check. Note that we are using rax which is set in +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */ +- andl $(PAGE_SIZE - 1), %edi +- /* Check if VEC_SIZE store cross page. Mask stores suffer +- serious performance degradation when it has to fault suppress. +- */ +- cmpl $(PAGE_SIZE - VEC_SIZE), %edi +- /* This is generally considered a cold target. 
*/
+-	ja	L(cross_page)
+-# if VEC_SIZE > 32
+-	movq	$-1, %rcx
+-	bzhiq	%rdx, %rcx, %rcx
+-	kmovq	%rcx, %k1
+-# else
+-	movl	$-1, %ecx
+-	bzhil	%edx, %ecx, %ecx
+-	kmovd	%ecx, %k1
+-# endif
+-	vmovdqu8 %VMM(0), (%rax){%k1}
+-	VZEROUPPER_RETURN
+-
+-# if defined USE_MULTIARCH && IS_IN (libc)
+-	/* Include L(stosb_local) here if including L(less_vec) between
+-	   L(stosb_more_2x_vec) and ENTRY. This is to cache align the
+-	   L(stosb_more_2x_vec) target. */
+-	.p2align 4,, 10
+-L(stosb_local):
+-	movzbl %sil, %eax
+-	mov	%RDX_LP, %RCX_LP
+-	mov	%RDI_LP, %RDX_LP
+-	rep	stosb
+-	mov	%RDX_LP, %RAX_LP
+-	VZEROUPPER_RETURN
+-# endif
+-#endif
+-
+ #if defined USE_MULTIARCH && IS_IN (libc)
+	.p2align 4
+ L(stosb_more_2x_vec):
+@@ -316,21 +308,33 @@ L(return_vzeroupper):
+	ret
+ #endif
+
+-	.p2align 4,, 10
+-#ifndef USE_LESS_VEC_MASK_STORE
+-# if defined USE_MULTIARCH && IS_IN (libc)
++#ifdef USE_WITH_AVX2
++	.p2align 4
++#else
++	.p2align 4,, 4
++#endif
++
++#if defined USE_MULTIARCH && IS_IN (libc)
+	/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
+	   range for 2-byte jump encoding. */
+ L(stosb_local):
++	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
++	jae	L(nt_memset)
+	movzbl	%sil, %eax
+	mov	%RDX_LP, %RCX_LP
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
++# if (defined USE_WITH_SSE2) || (defined USE_WITH_AVX512)
++	/* Use xchg to save 1-byte (this helps align targets below).  */
++	xchg	%RDX_LP, %RAX_LP
++# else
+	mov	%RDX_LP, %RAX_LP
+-	VZEROUPPER_RETURN
+ # endif
++	VZEROUPPER_RETURN
++#endif
++#ifndef USE_LESS_VEC_MASK_STORE
+	/* Define L(less_vec) only if not otherwise defined.  */
+-	.p2align 4
++	.p2align 4,, 12
+ L(less_vec):
+	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+	   xmm). This is only does anything for AVX2.  */
+@@ -421,4 +425,35 @@ L(between_2_3):
+	movb	%SET_REG8, -1(%LESS_VEC_REG, %rdx)
+ #endif
+	ret
+-END (MEMSET_SYMBOL (__memset, unaligned_erms))
++
++#if defined USE_MULTIARCH && IS_IN (libc)
++# ifdef USE_WITH_AVX512
++	/* Force align so the loop doesn't cross a cache-line.  */
++	.p2align 4
++# endif
++	.p2align 4,, 7
++	/* Memset using non-temporal stores.  */
++L(nt_memset):
++	VMOVU	%VMM(0), (VEC_SIZE * 0)(%rdi)
++	leaq	(VEC_SIZE * -4)(%rdi, %rdx), %rdx
++	/* Align DST.  */
++	orq	$(VEC_SIZE * 1 - 1), %rdi
++	incq	%rdi
++	.p2align 4,, 7
++L(nt_loop):
++	VMOVNT	%VMM(0), (VEC_SIZE * 0)(%rdi)
++	VMOVNT	%VMM(0), (VEC_SIZE * 1)(%rdi)
++	VMOVNT	%VMM(0), (VEC_SIZE * 2)(%rdi)
++	VMOVNT	%VMM(0), (VEC_SIZE * 3)(%rdi)
++	subq	$(VEC_SIZE * -4), %rdi
++	cmpq	%rdx, %rdi
++	jb	L(nt_loop)
++	sfence
++	VMOVU	%VMM(0), (VEC_SIZE * 0)(%rdx)
++	VMOVU	%VMM(0), (VEC_SIZE * 1)(%rdx)
++	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rdx)
++	VMOVU	%VMM(0), (VEC_SIZE * 3)(%rdx)
++	VZEROUPPER_RETURN
++#endif
++
++END(MEMSET_SYMBOL(__memset, unaligned_erms))
+-- 
+2.27.0
+
diff --git a/x86-Only-align-destination-to-1x-VEC_SIZE-in-memset-.patch b/x86-Only-align-destination-to-1x-VEC_SIZE-in-memset-.patch
new file mode 100644
index 0000000..0e870fe
--- /dev/null
+++ b/x86-Only-align-destination-to-1x-VEC_SIZE-in-memset-.patch
@@ -0,0 +1,34 @@
+From 5a64f933655384477d85122c6855dc6d84061810 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Wed, 1 Nov 2023 15:30:26 -0500
+Subject: [PATCH] x86: Only align destination to 1x VEC_SIZE in memset 4x
+ loop
+
+Current code aligns to 2x VEC_SIZE. Aligning to 2x has no effect on
+performance other than potentially resulting in an additional
+iteration of the loop. 
+
1x maintains aligned stores (the only reason to align in this case)
+and doesn't incur any unnecessary loop iterations.
+Reviewed-by: Sunil K Pandey
+
+(cherry picked from commit 9469261cf1924d350feeec64d2c80cafbbdcdd4d)
+---
+ sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 3d9ad49cb9..0f0636b90f 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -293,7 +293,7 @@ L(more_2x_vec):
+	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
+ #endif
+	/* Align dst for loop.  */
+-	andq	$(VEC_SIZE * -2), %LOOP_REG
++	andq	$(VEC_SIZE * -1), %LOOP_REG
+	.p2align 4
+ L(loop):
+	VMOVA	%VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
+-- 
+2.27.0
+
diff --git a/x86-string-Fixup-alignment-of-main-loop-in-str-n-cmp.patch b/x86-string-Fixup-alignment-of-main-loop-in-str-n-cmp.patch
new file mode 100644
index 0000000..5d5cf38
--- /dev/null
+++ b/x86-string-Fixup-alignment-of-main-loop-in-str-n-cmp.patch
@@ -0,0 +1,149 @@
+From 12fec8aae5e17cc4dc3bb079265c46ee78faeddb Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Fri, 27 Sep 2024 15:50:10 -0700
+Subject: [PATCH] x86/string: Fixup alignment of main loop in
+ str{n}cmp-evex [BZ #32212]
+
+The loop should be aligned to 32 bytes so that it can ideally run out
+of the DSB. This is particularly important on Skylake-Server, where
+deficiencies in its DSB implementation make it prone to not being
+able to run loops out of the DSB.
+
+For example, running strcmp-evex on a 200Mb string:
+
+32-byte aligned loop:
+ - 43,399,578,766 idq.dsb_uops
+not 32-byte aligned loop:
+ - 6,060,139,704 idq.dsb_uops
+
+This results in a 25% performance degradation for the non-aligned
+version.
+
+The fix is to just ensure the code layout is such that the loop is
+aligned. (Which was previously the case but was accidentally dropped
+in 84e7c46df).
+
+NB: The fix was actually 64-byte alignment. This is because 64-byte
+alignment generally produces more stable performance than 32-byte
+aligned code (cache line crosses can affect perf), so if we are going
+past 16-byte alignment, we might as well go to 64. 64-byte alignment
+also matches most other functions we over-align, so it creates a
+common point of optimization.
+
+Times are reported as the ratio of Time_With_Patch /
+Time_Without_Patch. Lower is better.
+
+The values reported are the geometric mean of the ratio across
+all tests in bench-strcmp and bench-strncmp.
+
+Note this patch is only attempting to improve the Skylake-Server
+strcmp for long strings. The rest of the numbers are only to test for
+regressions.
+
+Tigerlake Results Strings <= 512:
+ strcmp : 1.026
+ strncmp: 0.949
+
+Tigerlake Results Strings > 512:
+ strcmp : 0.994
+ strncmp: 0.998
+
+Skylake-Server Results Strings <= 512:
+ strcmp : 0.945
+ strncmp: 0.943
+
+Skylake-Server Results Strings > 512:
+ strcmp : 0.778
+ strncmp: 1.000
+
+The 2.6% regression on TGL-strcmp is due to slowdowns caused by
+changes in alignment of code handling small sizes (mostly in the
+page-cross logic). These should be safe to ignore because 1) we
+previously only 16-byte aligned the function, so this behavior is not
+new and was essentially up to chance before this patch, and 2) this
+type of alignment-related regression on small sizes really only comes
+up in tight micro-benchmark loops and is unlikely to have any effect
+on real-world performance. 
+ +Reviewed-by: H.J. Lu +(cherry picked from commit 483443d3211532903d7e790211af5a1d55fdb1f3) +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 26 +++++++++++++------------- + 1 file changed, 13 insertions(+), 13 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index ae39cdf217..6a7fec669e 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -209,7 +209,9 @@ + returned. */ + + .section SECTION(.text), "ax", @progbits +- .align 16 ++ /* Align 64 bytes here. This is to get the L(loop) block ideally ++ aligned for the DSB. */ ++ .align 64 + .type STRCMP, @function + .globl STRCMP + # ifdef USE_AS_STRCASECMP_L +@@ -509,9 +511,7 @@ L(ret4): + ret + # endif + +- /* 32 byte align here ensures the main loop is ideally aligned +- for DSB. */ +- .p2align 5 ++ .p2align 4,, 4 + L(more_3x_vec): + /* Safe to compare 4x vectors. */ + VMOVU (VEC_SIZE)(%rdi), %VMM(0) +@@ -1426,10 +1426,9 @@ L(less_32_till_page): + L(ret_zero_page_cross_slow_case0): + xorl %eax, %eax + ret +-# endif +- +- ++# else + .p2align 4,, 10 ++# endif + L(less_16_till_page): + cmpl $((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax + ja L(less_8_till_page) +@@ -1482,8 +1481,12 @@ L(less_16_till_page): + # endif + jmp L(prepare_loop_aligned) + +- +- ++# ifndef USE_AS_STRNCMP ++ /* Fits in aligning bytes. */ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++# endif + + .p2align 4,, 10 + L(less_8_till_page): +@@ -1554,6 +1557,7 @@ L(ret_less_8_wcs): + + # ifdef USE_AS_STRNCMP + .p2align 4,, 2 ++L(ret_zero_4_loop): + L(ret_zero_page_cross_slow_case1): + xorl %eax, %eax + ret +@@ -1586,10 +1590,6 @@ L(less_4_loop): + subq $-(CHAR_PER_VEC * 4), %rdx + # endif + jmp L(prepare_loop_aligned) +- +-L(ret_zero_4_loop): +- xorl %eax, %eax +- ret + L(ret_less_4_loop): + xorl %r8d, %eax + subl %r8d, %eax +-- +2.27.0 + diff --git a/x86_64-Add-expm1-with-FMA.patch b/x86_64-Add-expm1-with-FMA.patch new file mode 100644 index 0000000..eae04a5 --- /dev/null +++ b/x86_64-Add-expm1-with-FMA.patch @@ -0,0 +1,135 @@ +From b2a45f1eee39d67c1fff2d697d32857fb13c8c5d Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 11 Aug 2023 08:04:08 -0700 +Subject: [PATCH] x86_64: Add expm1 with FMA + +On Skylake, it improves expm1 bench performance by: + + Before After Improvement +max 70.204 68.054 3% +min 20.709 16.2 22% +mean 22.1221 16.7367 24% + +NB: Add + +extern long double __expm1l (long double); +extern long double __expm1f128 (long double); + +for __typeof (__expm1l) and __typeof (__expm1f128) when __expm1 is +defined since __expm1 may be expanded in their declarations which +causes the build failure. 
+ +(cherry picked from commit 1b214630ce6f7e0099b8b6f87246246739b079cf) +--- + sysdeps/ieee754/dbl-64/s_expm1.c | 7 +++++ + sysdeps/x86_64/fpu/multiarch/Makefile | 2 ++ + sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c | 10 ++++++ + sysdeps/x86_64/fpu/multiarch/s_expm1.c | 36 ++++++++++++++++++++++ + 4 files changed, 55 insertions(+) + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_expm1.c + +diff --git a/sysdeps/ieee754/dbl-64/s_expm1.c b/sysdeps/ieee754/dbl-64/s_expm1.c +index 8f1c95bd04..1cafeca9c0 100644 +--- a/sysdeps/ieee754/dbl-64/s_expm1.c ++++ b/sysdeps/ieee754/dbl-64/s_expm1.c +@@ -130,6 +130,11 @@ static const double + 4.00821782732936239552e-06, /* 3ED0CFCA 86E65239 */ + -2.01099218183624371326e-07 }; /* BE8AFDB7 6E09C32D */ + ++#ifndef SECTION ++# define SECTION ++#endif ++ ++SECTION + double + __expm1 (double x) + { +@@ -258,4 +263,6 @@ __expm1 (double x) + } + return y; + } ++#ifndef __expm1 + libm_alias_double (__expm1, expm1) ++#endif +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index f773255721..add339a876 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -37,6 +37,7 @@ libm-sysdep_routines += \ + e_log2-fma \ + e_pow-fma \ + s_atan-fma \ ++ s_expm1-fma \ + s_sin-fma \ + s_sincos-fma \ + s_tan-fma \ +@@ -49,6 +50,7 @@ CFLAGS-e_log-fma.c = -mfma -mavx2 + CFLAGS-e_log2-fma.c = -mfma -mavx2 + CFLAGS-e_pow-fma.c = -mfma -mavx2 + CFLAGS-s_atan-fma.c = -mfma -mavx2 ++CFLAGS-s_expm1-fma.c = -mfma -mavx2 + CFLAGS-s_sin-fma.c = -mfma -mavx2 + CFLAGS-s_tan-fma.c = -mfma -mavx2 + CFLAGS-s_sincos-fma.c = -mfma -mavx2 +diff --git a/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c b/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c +new file mode 100644 +index 0000000000..3ee2bd804e +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c +@@ -0,0 +1,10 @@ ++#define __expm1 __expm1_fma ++ ++/* NB: __expm1 may be expanded to __expm1_fma in the following ++ prototypes. */ ++extern long double __expm1l (long double); ++extern long double __expm1f128 (long double); ++ ++#define SECTION __attribute__ ((section (".text.fma"))) ++ ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/s_expm1.c b/sysdeps/x86_64/fpu/multiarch/s_expm1.c +new file mode 100644 +index 0000000000..2cae83fb7f +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_expm1.c +@@ -0,0 +1,36 @@ ++/* Multiple versions of expm1. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++extern double __redirect_expm1 (double); ++ ++#define SYMBOL_NAME expm1 ++#include "ifunc-fma.h" ++ ++libc_ifunc_redirected (__redirect_expm1, __expm1, IFUNC_SELECTOR ()); ++libm_alias_double (__expm1, expm1) ++ ++#define __expm1 __expm1_sse2 ++ ++/* NB: __expm1 may be expanded to __expm1_sse2 in the following ++ prototypes. */ ++extern long double __expm1l (long double); ++extern long double __expm1f128 (long double); ++ ++#include +-- +2.27.0 + diff --git a/x86_64-Add-log1p-with-FMA.patch b/x86_64-Add-log1p-with-FMA.patch new file mode 100644 index 0000000..64cb328 --- /dev/null +++ b/x86_64-Add-log1p-with-FMA.patch @@ -0,0 +1,140 @@ +From c92946d9b29956be78ca4487264848714fd5d505 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 17 Aug 2023 09:42:29 -0700 +Subject: [PATCH] x86_64: Add log1p with FMA + +On Skylake, it changes log1p bench performance by: + + Before After Improvement +max 63.349 58.347 8% +min 4.448 5.651 -30% +mean 12.0674 10.336 14% + +The minimum code path is + + if (hx < 0x3FDA827A) /* x < 0.41422 */ + { + if (__glibc_unlikely (ax >= 0x3ff00000)) /* x <= -1.0 */ + { + ... + } + if (__glibc_unlikely (ax < 0x3e200000)) /* |x| < 2**-29 */ + { + math_force_eval (two54 + x); /* raise inexact */ + if (ax < 0x3c900000) /* |x| < 2**-54 */ + { + ... + } + else + return x - x * x * 0.5; + +FMA and non-FMA code sequences look similar. Non-FMA version is slightly +faster. Since log1p is called by asinh and atanh, it improves asinh +performance by: + + Before After Improvement +max 75.645 63.135 16% +min 10.074 10.071 0% +mean 15.9483 14.9089 6% + +and improves atanh performance by: + + Before After Improvement +max 91.768 75.081 18% +min 15.548 13.883 10% +mean 18.3713 16.8011 8% + +(cherry picked from commit a8ecb126d4c26c52f4ad828c566afe4043a28155) +--- + sysdeps/ieee754/dbl-64/s_log1p.c | 5 ++++ + sysdeps/x86_64/fpu/multiarch/Makefile | 2 ++ + sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c | 4 +++ + sysdeps/x86_64/fpu/multiarch/s_log1p.c | 29 ++++++++++++++++++++++ + 4 files changed, 40 insertions(+) + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_log1p.c + +diff --git a/sysdeps/ieee754/dbl-64/s_log1p.c b/sysdeps/ieee754/dbl-64/s_log1p.c +index e6476a8260..eeb0af859f 100644 +--- a/sysdeps/ieee754/dbl-64/s_log1p.c ++++ b/sysdeps/ieee754/dbl-64/s_log1p.c +@@ -99,6 +99,11 @@ static const double + + static const double zero = 0.0; + ++#ifndef SECTION ++# define SECTION ++#endif ++ ++SECTION + double + __log1p (double x) + { +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index add339a876..ea81753b70 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -38,6 +38,7 @@ libm-sysdep_routines += \ + e_pow-fma \ + s_atan-fma \ + s_expm1-fma \ ++ s_log1p-fma \ + s_sin-fma \ + s_sincos-fma \ + s_tan-fma \ +@@ -51,6 +52,7 @@ CFLAGS-e_log2-fma.c = -mfma -mavx2 + CFLAGS-e_pow-fma.c = -mfma -mavx2 + CFLAGS-s_atan-fma.c = -mfma -mavx2 + CFLAGS-s_expm1-fma.c = -mfma -mavx2 ++CFLAGS-s_log1p-fma.c = -mfma -mavx2 + CFLAGS-s_sin-fma.c = -mfma -mavx2 + CFLAGS-s_tan-fma.c = -mfma -mavx2 + CFLAGS-s_sincos-fma.c = -mfma -mavx2 +diff --git a/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c b/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c +new file mode 100644 +index 0000000000..8952df8f9e +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c +@@ -0,0 +1,4 @@ ++#define __log1p __log1p_fma ++#define SECTION __attribute__ 
((section (".text.fma"))) ++ ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/s_log1p.c b/sysdeps/x86_64/fpu/multiarch/s_log1p.c +new file mode 100644 +index 0000000000..6ce5198d6d +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_log1p.c +@@ -0,0 +1,29 @@ ++/* Multiple versions of log1p. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++extern double __redirect_log1p (double); ++ ++#define SYMBOL_NAME log1p ++#include "ifunc-fma.h" ++ ++libc_ifunc_redirected (__redirect_log1p, __log1p, IFUNC_SELECTOR ()); ++ ++#define __log1p __log1p_sse2 ++#include +-- +2.27.0 + diff --git a/x86_64-Add-log2-with-FMA.patch b/x86_64-Add-log2-with-FMA.patch new file mode 100644 index 0000000..2439490 --- /dev/null +++ b/x86_64-Add-log2-with-FMA.patch @@ -0,0 +1,102 @@ +From 49016f2190693d5b2d4d6294d438ebae7a58d151 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 10 Aug 2023 11:24:30 -0700 +Subject: [PATCH] x86_64: Add log2 with FMA + +On Skylake, it improves log2 bench performance by: + + Before After Improvement +max 208.779 63.827 69% +min 9.977 6.55 34% +mean 10.366 6.8191 34% + +(cherry picked from commit f6b10ed8e9a00de49d0951e760cc2b5288862b47) +--- + sysdeps/x86_64/fpu/multiarch/Makefile | 2 ++ + sysdeps/x86_64/fpu/multiarch/e_log2-fma.c | 3 ++ + sysdeps/x86_64/fpu/multiarch/e_log2.c | 43 +++++++++++++++++++++++ + 3 files changed, 48 insertions(+) + create mode 100644 sysdeps/x86_64/fpu/multiarch/e_log2-fma.c + create mode 100644 sysdeps/x86_64/fpu/multiarch/e_log2.c + +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index e37e488c37..f773255721 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -34,6 +34,7 @@ libm-sysdep_routines += \ + e_atan2-fma \ + e_exp-fma \ + e_log-fma \ ++ e_log2-fma \ + e_pow-fma \ + s_atan-fma \ + s_sin-fma \ +@@ -45,6 +46,7 @@ CFLAGS-e_asin-fma.c = -mfma -mavx2 + CFLAGS-e_atan2-fma.c = -mfma -mavx2 + CFLAGS-e_exp-fma.c = -mfma -mavx2 + CFLAGS-e_log-fma.c = -mfma -mavx2 ++CFLAGS-e_log2-fma.c = -mfma -mavx2 + CFLAGS-e_pow-fma.c = -mfma -mavx2 + CFLAGS-s_atan-fma.c = -mfma -mavx2 + CFLAGS-s_sin-fma.c = -mfma -mavx2 +diff --git a/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c b/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c +new file mode 100644 +index 0000000000..9fbebc1b47 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c +@@ -0,0 +1,3 @@ ++#define __log2 __log2_fma ++ ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/e_log2.c b/sysdeps/x86_64/fpu/multiarch/e_log2.c +new file mode 100644 +index 0000000000..c0320caf36 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/e_log2.c +@@ -0,0 +1,43 @@ ++/* Multiple versions of log2. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++ ++extern double __redirect_log2 (double); ++ ++#define SYMBOL_NAME log2 ++#include "ifunc-fma.h" ++ ++libc_ifunc_redirected (__redirect_log2, __log2, IFUNC_SELECTOR ()); ++ ++#ifdef SHARED ++__hidden_ver1 (__log2, __GI___log2, __redirect_log2) ++ __attribute__ ((visibility ("hidden"))); ++ ++versioned_symbol (libm, __ieee754_log2, log2, GLIBC_2_29); ++libm_alias_double_other (__log2, log2) ++#else ++libm_alias_double (__log2, log2) ++#endif ++ ++strong_alias (__log2, __ieee754_log2) ++libm_alias_finite (__log2, __log2) ++ ++#define __log2 __log2_sse2 ++#include +-- +2.27.0 + diff --git a/x86_64-Fix-missing-wcsncat-function-definition-witho.patch b/x86_64-Fix-missing-wcsncat-function-definition-witho.patch new file mode 100644 index 0000000..a960466 --- /dev/null +++ b/x86_64-Fix-missing-wcsncat-function-definition-witho.patch @@ -0,0 +1,44 @@ +From dc1762113dbe40be832bedd41b52d9822d62c50f Mon Sep 17 00:00:00 2001 +From: Gabi Falk +Date: Tue, 7 May 2024 18:25:00 +0000 +Subject: [PATCH] x86_64: Fix missing wcsncat function definition without + multiarch (x86-64-v4) + +This code expects the WCSCAT preprocessor macro to be predefined in case +the evex implementation of the function should be defined with a name +different from __wcsncat_evex. However, when glibc is built for +x86-64-v4 without multiarch support, sysdeps/x86_64/wcsncat.S defines +WCSNCAT variable instead of WCSCAT to build it as wcsncat. Rename the +variable to WCSNCAT, as it is actually a better naming choice for the +variable in this case. + +Reported-by: Kenton Groombridge +Link: https://bugs.gentoo.org/921945 +Fixes: 64b8b6516b ("x86: Add evex optimized functions for the wchar_t strcpy family") +Signed-off-by: Gabi Falk +Reviewed-by: Sunil K Pandey +(cherry picked from commit dd5f891c1ad9f1b43b9db93afe2a55cbb7a6194e) +--- + sysdeps/x86_64/multiarch/wcsncat-evex.S | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/wcsncat-evex.S b/sysdeps/x86_64/multiarch/wcsncat-evex.S +index 392215950a..10bfb0a531 100644 +--- a/sysdeps/x86_64/multiarch/wcsncat-evex.S ++++ b/sysdeps/x86_64/multiarch/wcsncat-evex.S +@@ -1,9 +1,9 @@ +-#ifndef WCSCAT +-# define WCSCAT __wcsncat_evex ++#ifndef WCSNCAT ++# define WCSNCAT __wcsncat_evex + #endif + + #define USE_AS_WCSCPY + #define USE_AS_STRCAT + +-#define STRNCAT WCSCAT ++#define STRNCAT WCSNCAT + #include "strncat-evex.S" +-- +2.27.0 + diff --git a/x86_64-Sort-fpu-multiarch-Makefile.patch b/x86_64-Sort-fpu-multiarch-Makefile.patch new file mode 100644 index 0000000..08ca62f --- /dev/null +++ b/x86_64-Sort-fpu-multiarch-Makefile.patch @@ -0,0 +1,144 @@ +From 5c9be512ee25ceab92a284adc75fe22bbd94b179 Mon Sep 17 00:00:00 2001 +From: "H.J. 
Lu" +Date: Wed, 9 Aug 2023 11:08:52 -0700 +Subject: [PATCH] x86_64: Sort fpu/multiarch/Makefile + +Sort Makefile variables using scripts/sort-makefile-lines.py. + +No code generation changes observed in libm. No regressions on x86_64. + +(cherry picked from commit 881546979d0219c18337e1b4f4d00cfacab13c40) +--- + sysdeps/x86_64/fpu/multiarch/Makefile | 94 +++++++++++++++++++++------ + 1 file changed, 74 insertions(+), 20 deletions(-) + +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index 248162525b..e37e488c37 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -1,17 +1,45 @@ + ifeq ($(subdir),math) +-libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \ +- s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \ +- s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c ++libm-sysdep_routines += \ ++ s_ceil-c \ ++ s_ceilf-c \ ++ s_floor-c \ ++ s_floorf-c \ ++ s_rint-c \ ++ s_rintf-c \ ++ s_nearbyint-c \ ++ s_nearbyintf-c \ ++ s_roundeven-c \ ++ s_roundevenf-c \ ++ s_trunc-c \ ++ s_truncf-c \ ++# libm-sysdep_routines + +-libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \ +- s_floorf-sse4_1 s_nearbyint-sse4_1 \ +- s_nearbyintf-sse4_1 s_roundeven-sse4_1 \ +- s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \ +- s_trunc-sse4_1 s_truncf-sse4_1 ++libm-sysdep_routines += \ ++ s_ceil-sse4_1 \ ++ s_ceilf-sse4_1 \ ++ s_floor-sse4_1 \ ++ s_floorf-sse4_1 \ ++ s_nearbyint-sse4_1 \ ++ s_nearbyintf-sse4_1 \ ++ s_roundeven-sse4_1 \ ++ s_roundevenf-sse4_1 \ ++ s_rint-sse4_1 \ ++ s_rintf-sse4_1 \ ++ s_trunc-sse4_1 \ ++ s_truncf-sse4_1 \ ++# libm-sysdep_routines + +-libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \ +- e_asin-fma e_atan2-fma s_sin-fma s_tan-fma \ +- s_sincos-fma ++libm-sysdep_routines += \ ++ e_asin-fma \ ++ e_atan2-fma \ ++ e_exp-fma \ ++ e_log-fma \ ++ e_pow-fma \ ++ s_atan-fma \ ++ s_sin-fma \ ++ s_sincos-fma \ ++ s_tan-fma \ ++# libm-sysdep_routines + + CFLAGS-e_asin-fma.c = -mfma -mavx2 + CFLAGS-e_atan2-fma.c = -mfma -mavx2 +@@ -23,10 +51,22 @@ CFLAGS-s_sin-fma.c = -mfma -mavx2 + CFLAGS-s_tan-fma.c = -mfma -mavx2 + CFLAGS-s_sincos-fma.c = -mfma -mavx2 + +-libm-sysdep_routines += s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2 ++libm-sysdep_routines += \ ++ s_cosf-sse2 \ ++ s_sincosf-sse2 \ ++ s_sinf-sse2 \ ++# libm-sysdep_routines + +-libm-sysdep_routines += e_exp2f-fma e_expf-fma e_log2f-fma e_logf-fma \ +- e_powf-fma s_sinf-fma s_cosf-fma s_sincosf-fma ++libm-sysdep_routines += \ ++ e_exp2f-fma \ ++ e_expf-fma \ ++ e_log2f-fma \ ++ e_logf-fma \ ++ e_powf-fma \ ++ s_cosf-fma \ ++ s_sincosf-fma \ ++ s_sinf-fma \ ++# libm-sysdep_routines + + CFLAGS-e_exp2f-fma.c = -mfma -mavx2 + CFLAGS-e_expf-fma.c = -mfma -mavx2 +@@ -37,9 +77,17 @@ CFLAGS-s_sinf-fma.c = -mfma -mavx2 + CFLAGS-s_cosf-fma.c = -mfma -mavx2 + CFLAGS-s_sincosf-fma.c = -mfma -mavx2 + +-libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \ +- e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \ +- s_sincos-fma4 ++libm-sysdep_routines += \ ++ e_exp-fma4 \ ++ e_log-fma4 \ ++ e_pow-fma4 \ ++ e_asin-fma4 \ ++ s_atan-fma4 \ ++ e_atan2-fma4 \ ++ s_sin-fma4 \ ++ s_sincos-fma4 \ ++ s_tan-fma4 \ ++# libm-sysdep_routines + + CFLAGS-e_asin-fma4.c = -mfma4 + CFLAGS-e_atan2-fma4.c = -mfma4 +@@ -51,9 +99,15 @@ CFLAGS-s_sin-fma4.c = -mfma4 + CFLAGS-s_tan-fma4.c = -mfma4 + CFLAGS-s_sincos-fma4.c = -mfma4 + +-libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \ +- e_atan2-avx s_sin-avx s_tan-avx \ +- 
s_sincos-avx ++libm-sysdep_routines += \ ++ e_exp-avx \ ++ e_log-avx \ ++ s_atan-avx \ ++ e_atan2-avx \ ++ s_sin-avx \ ++ s_sincos-avx \ ++ s_tan-avx \ ++# libm-sysdep_routines + + CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX + CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX +-- +2.27.0 + -- Gitee
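
A note on the CPUID leaf 2 change ("x86: Check the lower byte of EAX of
CPUID leaf 2"): the new check reduces to a few lines of C. The sketch
below is a standalone illustration using GCC's <cpuid.h>, not the
glibc-internal code; the puts () calls stand in for the real descriptor
decoding and for the leaf 4 fallback done via intel_check_word.

  /* cpuid2_check.c - hypothetical standalone demo; build with gcc.  */
  #include <cpuid.h>
  #include <stdio.h>

  int
  main (void)
  {
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid (2, &eax, &ebx, &ecx, &edx))
      return 1;		/* CPUID leaf 2 not supported.  */

    /* The low byte of EAX encodes a legacy iteration count and should
       always be 1.  If it is not, the descriptor bytes cannot be
       trusted, and the deterministic cache parameters from leaf 4 are
       the reliable source; that is what the patch falls back to.  */
    if ((eax & 0xff) != 1)
      puts ("fall back to CPUID leaf 4");
    else
      puts ("decode the leaf 2 descriptor bytes");
    return 0;
  }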
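The strategy behind "x86: Improve large memset perf with non-temporal
stores" is easier to see in intrinsics than in the assembly diff. Below
is a minimal C sketch, assuming AVX2, n >= 64, and compilation with
-mavx2; memset_nt is a hypothetical name, and the real code keeps
`rep stosb` for sizes below __x86_shared_non_temporal_threshold.

  #include <immintrin.h>
  #include <stddef.h>
  #include <stdint.h>

  void
  memset_nt (void *dst, int c, size_t n)	/* assumes n >= 64 */
  {
    __m256i v = _mm256_set1_epi8 ((char) c);
    char *p = (char *) dst;
    char *end = p + n;

    /* Unaligned head with a regular store, then round up to a 32-byte
       boundary, as the patch's 4x loop rounds up to 1x VEC_SIZE.  */
    _mm256_storeu_si256 ((__m256i *) p, v);
    p = (char *) (((uintptr_t) p + 32) & ~(uintptr_t) 31);

    /* Aligned body with streaming stores: no RFO read of the old cache
       line, roughly halving memory traffic for large sizes.  */
    while (p + 32 <= end)
      {
        _mm256_stream_si256 ((__m256i *) p, v);
        p += 32;
      }

    /* Non-temporal stores are weakly ordered; fence before the
       normal-store tail, as the patch does before its VMOVU tail.  */
    _mm_sfence ();
    _mm256_storeu_si256 ((__m256i *) (end - 32), v);
  }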
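The "x86/string: Fixup alignment of main loop in str{n}cmp-evex" change
is pure code layout; the .align/.p2align directives in the diff do all
the work. When compiling C rather than hand-written assembly, the
closest equivalents are GCC's alignment flags, for example (hot_loop.c
is a hypothetical file, not part of glibc):

  gcc -O2 -falign-loops=32 -falign-functions=64 -c hot_loop.c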
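The three FMA backports (expm1, log1p, log2) share one dispatch
pattern: an ifunc resolver selects the -mfma -mavx2 build of a routine
at load time when the CPU supports it, and otherwise the SSE2 build.
Inside glibc this is spelled with libc_ifunc_redirected and the
IFUNC_SELECTOR () from ifunc-fma.h; outside glibc, the same pattern can
be sketched with GCC's ifunc attribute. Everything below (my_expm1, the
resolver, the stand-in bodies that just call libm's expm1) is a
hypothetical illustration, not glibc code; link with -lm.

  #include <math.h>

  double my_expm1_fma (double) __attribute__ ((target ("fma,avx2")));
  double my_expm1_sse2 (double);

  /* Stand-in bodies; the real variants are separate compilations of
     sysdeps/ieee754/dbl-64/s_expm1.c with different CFLAGS.  */
  double my_expm1_fma (double x) { return expm1 (x); }
  double my_expm1_sse2 (double x) { return expm1 (x); }

  /* The resolver runs once, at relocation time, like the glibc
     selector; later calls go straight to the chosen variant.  */
  static double (*resolve_my_expm1 (void)) (double)
  {
    __builtin_cpu_init ();
    if (__builtin_cpu_supports ("fma") && __builtin_cpu_supports ("avx2"))
      return my_expm1_fma;
    return my_expm1_sse2;
  }

  double my_expm1 (double) __attribute__ ((ifunc ("resolve_my_expm1")));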