From d5576a8feda207f06e46bcbcc1bdb566f0fd460a Mon Sep 17 00:00:00 2001
From: Qingqing Li
Date: Sun, 26 Jan 2025 10:05:30 +0800
Subject: [PATCH] backport from glibc upstream 2.38 branch; this includes the
 patches below:

- stdlib: Test using setenv with updated environ [BZ #32588]
- Fix underallocation of abort_msg_s struct (CVE-2025-0395)
- elf: Support recursive use of dynamic TLS in interposed malloc
- elf: Avoid some free (NULL) calls in _dl_update_slotinfo
- x86/string: Fixup alignment of main loop in str{n}cmp-evex [BZ #32212]
- x86: Improve large memset perf with non-temporal stores [RHEL-29312]
- x86_64: Fix missing wcsncat function definition without multiarch (x86-64-v4)
- sysdeps/x86/Makefile: Split and sort tests
- x86: Only align destination to 1x VEC_SIZE in memset 4x loop
- elf: Fix slow tls access after dlopen [BZ #19924]
- x86: Check the lower byte of EAX of CPUID leaf 2 [BZ #30643]
- x86_64: Add log1p with FMA
- x86_64: Add expm1 with FMA
- x86_64: Add log2 with FMA
- x86_64: Sort fpu/multiarch/Makefile
---
 ...ion-of-abort_msg_s-struct-CVE-2025-0.patch |  89 +++
 ...ree-NULL-calls-in-_dl_update_slotinf.patch |  50 ++
 ...low-tls-access-after-dlopen-BZ-19924.patch | 328 +++++++++++
 ...rsive-use-of-dynamic-TLS-in-interpos.patch | 521 ++++++++++++++++++
 glibc.spec                                    |  34 +-
 ...g-setenv-with-updated-environ-BZ-325.patch |  75 +++
 ...ps-x86-Makefile-Split-and-sort-tests.patch | 178 ++++++
 ...wer-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch |  77 +++
 ...e-memset-perf-with-non-temporal-stor.patch | 254 +++++++++
 ...estination-to-1x-VEC_SIZE-in-memset-.patch |  34 ++
 ...-alignment-of-main-loop-in-str-n-cmp.patch | 149 +++++
 x86_64-Add-expm1-with-FMA.patch               | 135 +++++
 x86_64-Add-log1p-with-FMA.patch               | 140 +++++
 x86_64-Add-log2-with-FMA.patch                | 102 ++++
 ...ng-wcsncat-function-definition-witho.patch |  44 ++
 x86_64-Sort-fpu-multiarch-Makefile.patch      | 144 +++++
 16 files changed, 2353 insertions(+), 1 deletion(-)
 create mode 100644 Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch
 create mode 100644 elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch
 create mode 100644 elf-Fix-slow-tls-access-after-dlopen-BZ-19924.patch
 create mode 100644 elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch
 create mode 100644 stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch
 create mode 100644 sysdeps-x86-Makefile-Split-and-sort-tests.patch
 create mode 100644 x86-Check-the-lower-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch
 create mode 100644 x86-Improve-large-memset-perf-with-non-temporal-stor.patch
 create mode 100644 x86-Only-align-destination-to-1x-VEC_SIZE-in-memset-.patch
 create mode 100644 x86-string-Fixup-alignment-of-main-loop-in-str-n-cmp.patch
 create mode 100644 x86_64-Add-expm1-with-FMA.patch
 create mode 100644 x86_64-Add-log1p-with-FMA.patch
 create mode 100644 x86_64-Add-log2-with-FMA.patch
 create mode 100644 x86_64-Fix-missing-wcsncat-function-definition-witho.patch
 create mode 100644 x86_64-Sort-fpu-multiarch-Makefile.patch

diff --git a/Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch b/Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch
new file mode 100644
index 0000000..64ab3bf
--- /dev/null
+++ b/Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch
@@ -0,0 +1,89 @@
+From c32fd59314c343db88c3ea4a203870481d33c3d2 Mon Sep 17 00:00:00 2001
+From: Siddhesh Poyarekar
+Date: Tue, 21 Jan 2025 16:11:06 -0500
+Subject: [PATCH] Fix underallocation of abort_msg_s struct
+ (CVE-2025-0395)
+
+Include the space needed to store the length of the message itself, in
+addition to the message string.  This resolves BZ #32582.
+
+Signed-off-by: Siddhesh Poyarekar
+Reviewed-by: Adhemerval Zanella
+(cherry picked from commit 68ee0f704cb81e9ad0a78c644a83e1e9cd2ee578)
+---
+ NEWS                       | 6 ++++++
+ assert/assert.c            | 4 +++-
+ sysdeps/posix/libc_fatal.c | 4 +++-
+ 3 files changed, 12 insertions(+), 2 deletions(-)
+
+diff --git a/NEWS b/NEWS
+index d0815514e0..3e511d6de4 100644
+--- a/NEWS
++++ b/NEWS
+@@ -34,6 +34,11 @@ Security related changes:
+   buffer overflow, which could be exploited to achieve escalated
+   privileges.  This flaw was introduced in glibc 2.34.
+
++  CVE-2025-0395: When the assert() function fails, it does not allocate
++  enough space for the assertion failure message string and size
++  information, which may lead to a buffer overflow if the message string
++  size aligns to page size.
++
+ The following bugs are resolved with this release:
+
+   [27821] ungetc: Fix backup buffer leak on program exit
+@@ -61,6 +66,7 @@ The following bugs are resolved with this release:
+   [32137] libio: Attempt wide backup free only for non-legacy code
+   [32231] elf: Change ldconfig auxcache magic number
+   [32470] x86: Avoid integer truncation with large cache sizes
++  [32582] Fix underallocation of abort_msg_s struct (CVE-2025-0395)
+
+ Version 2.38
+
+diff --git a/assert/assert.c b/assert/assert.c
+index b7c7a4a1ba..65a9fedf0d 100644
+--- a/assert/assert.c
++++ b/assert/assert.c
+@@ -18,6 +18,7 @@
+ #include
+ #include
+ #include
++#include <libc-pointer-arith.h>
+ #include
+ #include
+ #include
+@@ -64,7 +65,8 @@ __assert_fail_base (const char *fmt, const char *assertion, const char *file,
+       (void) __fxprintf (NULL, "%s", str);
+       (void) fflush (stderr);
+
+-      total = (total + 1 + GLRO(dl_pagesize) - 1) & ~(GLRO(dl_pagesize) - 1);
++      total = ALIGN_UP (total + sizeof (struct abort_msg_s) + 1,
++			GLRO(dl_pagesize));
+       struct abort_msg_s *buf = __mmap (NULL, total, PROT_READ | PROT_WRITE,
+					 MAP_ANON | MAP_PRIVATE, -1, 0);
+       if (__glibc_likely (buf != MAP_FAILED))
+diff --git a/sysdeps/posix/libc_fatal.c b/sysdeps/posix/libc_fatal.c
+index 70edcc10c1..5b9e4b7918 100644
+--- a/sysdeps/posix/libc_fatal.c
++++ b/sysdeps/posix/libc_fatal.c
+@@ -20,6 +20,7 @@
+ #include
+ #include
+ #include
++#include <libc-pointer-arith.h>
+ #include
+ #include
+ #include
+@@ -123,7 +124,8 @@ __libc_message (const char *fmt, ...)
+
+       WRITEV_FOR_FATAL (fd, iov, nlist, total);
+
+-      total = (total + 1 + GLRO(dl_pagesize) - 1) & ~(GLRO(dl_pagesize) - 1);
++      total = ALIGN_UP (total + sizeof (struct abort_msg_s) + 1,
++			GLRO(dl_pagesize));
+       struct abort_msg_s *buf = __mmap (NULL, total,
+					 PROT_READ | PROT_WRITE,
+					 MAP_ANON | MAP_PRIVATE, -1, 0);
+-- 
+2.27.0
+
diff --git a/elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch b/elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch
new file mode 100644
index 0000000..6cfe03e
--- /dev/null
+++ b/elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch
@@ -0,0 +1,50 @@
+From 48642ef1a5721e0a7694d84fe46d83b6086dfe75 Mon Sep 17 00:00:00 2001
+From: Florian Weimer
+Date: Mon, 3 Jun 2024 10:49:40 +0200
+Subject: [PATCH] elf: Avoid some free (NULL) calls in
+ _dl_update_slotinfo
+
+This has been confirmed to work around some interposed mallocs.  Here
+is a discussion of the impact test ust/libc-wrapper/test_libc-wrapper
+in lttng-tools:
+
+  New TLS usage in libgcc_s.so.1, compatibility impact
+
+
+Reportedly, this patch also papers over a similar issue when tcmalloc
+2.9.1 is not compiled with -ftls-model=initial-exec.
Of course the +goal really should be to compile mallocs with the initial-exec TLS +model, but this commit appears to be a useful interim workaround. + +Fixes commit d2123d68275acc0f061e73d5f86ca504e0d5a344 ("elf: Fix slow +tls access after dlopen [BZ #19924]"). + +Reviewed-by: Carlos O'Donell +(cherry picked from commit afe42e935b3ee97bac9a7064157587777259c60e) +--- + elf/dl-tls.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/elf/dl-tls.c b/elf/dl-tls.c +index 70446e71a8..de0168319c 100644 +--- a/elf/dl-tls.c ++++ b/elf/dl-tls.c +@@ -819,7 +819,14 @@ _dl_update_slotinfo (unsigned long int req_modid, size_t new_gen) + dtv entry free it. Note: this is not AS-safe. */ + /* XXX Ideally we will at some point create a memory + pool. */ +- free (dtv[modid].pointer.to_free); ++ /* Avoid calling free on a null pointer. Some mallocs ++ incorrectly use dynamic TLS, and depending on how the ++ free function was compiled, it could call ++ __tls_get_addr before the null pointer check in the ++ free implementation. Checking here papers over at ++ least some dynamic TLS usage by interposed mallocs. */ ++ if (dtv[modid].pointer.to_free != NULL) ++ free (dtv[modid].pointer.to_free); + dtv[modid].pointer.val = TLS_DTV_UNALLOCATED; + dtv[modid].pointer.to_free = NULL; + +-- +2.27.0 + diff --git a/elf-Fix-slow-tls-access-after-dlopen-BZ-19924.patch b/elf-Fix-slow-tls-access-after-dlopen-BZ-19924.patch new file mode 100644 index 0000000..d401ab1 --- /dev/null +++ b/elf-Fix-slow-tls-access-after-dlopen-BZ-19924.patch @@ -0,0 +1,328 @@ +From 7772f9358c9a947251196ea7844b339f0a423ff6 Mon Sep 17 00:00:00 2001 +From: Szabolcs Nagy +Date: Tue, 16 Feb 2021 12:55:13 +0000 +Subject: [PATCH] elf: Fix slow tls access after dlopen [BZ #19924] + +In short: __tls_get_addr checks the global generation counter and if +the current dtv is older then _dl_update_slotinfo updates dtv up to the +generation of the accessed module. So if the global generation is newer +than generation of the module then __tls_get_addr keeps hitting the +slow dtv update path. The dtv update path includes a number of checks +to see if any update is needed and this already causes measurable tls +access slow down after dlopen. + +It may be possible to detect up-to-date dtv faster. But if there are +many modules loaded (> TLS_SLOTINFO_SURPLUS) then this requires at +least walking the slotinfo list. + +This patch tries to update the dtv to the global generation instead, so +after a dlopen the tls access slow path is only hit once. The modules +with larger generation than the accessed one were not necessarily +synchronized before, so additional synchronization is needed. + +This patch uses acquire/release synchronization when accessing the +generation counter. + +Note: in the x86_64 version of dl-tls.c the generation is only loaded +once, since relaxed mo is not faster than acquire mo load. + +I have not benchmarked this. Tested by Adhemerval Zanella on aarch64, +powerpc, sparc, x86 who reported that it fixes the performance issue +of bug 19924. 
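+
+To illustrate the resulting shape of the fast path, here is a
+simplified C sketch of the __tls_get_addr change below (the GET_ADDR_*
+macro plumbing is replaced by a plain tls_index argument; this is an
+illustration, not the verbatim glibc code):
+
+  void *
+  __tls_get_addr (tls_index *ti)
+  {
+    dtv_t *dtv = THREAD_DTV ();
+
+    /* Fast path: a relaxed mo load is enough for the comparison.  */
+    size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
+    if (__glibc_unlikely (dtv[0].counter != gen))
+      {
+	/* Slow path, now hit at most once per dlopen: synchronize and
+	   update the DTV all the way to the global generation.  */
+	gen = atomic_load_acquire (&GL(dl_tls_generation));
+	return update_get_addr (ti, gen);
+      }
+
+    void *p = dtv[ti->ti_module].pointer.val;
+    if (__glibc_unlikely (p == TLS_DTV_UNALLOCATED))
+      return tls_get_addr_tail (ti, dtv, NULL);
+    return (char *) p + ti->ti_offset;
+  }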
+ +Reviewed-by: Adhemerval Zanella +(cherry picked from commit d2123d68275acc0f061e73d5f86ca504e0d5a344) +--- + elf/dl-close.c | 2 +- + elf/dl-open.c | 8 +-- + elf/dl-reloc.c | 6 +- + elf/dl-tls.c | 117 ++++++++++++++++++++----------------- + sysdeps/generic/ldsodefs.h | 3 +- + sysdeps/x86_64/dl-tls.c | 4 +- + 6 files changed, 74 insertions(+), 66 deletions(-) + +diff --git a/elf/dl-close.c b/elf/dl-close.c +index b887a44888..1c7a861db1 100644 +--- a/elf/dl-close.c ++++ b/elf/dl-close.c +@@ -703,7 +703,7 @@ _dl_close_worker (struct link_map *map, bool force) + if (__glibc_unlikely (newgen == 0)) + _dl_fatal_printf ("TLS generation counter wrapped! Please report as described in "REPORT_BUGS_TO".\n"); + /* Can be read concurrently. */ +- atomic_store_relaxed (&GL(dl_tls_generation), newgen); ++ atomic_store_release (&GL(dl_tls_generation), newgen); + + if (tls_free_end == GL(dl_tls_static_used)) + GL(dl_tls_static_used) = tls_free_start; +diff --git a/elf/dl-open.c b/elf/dl-open.c +index 2d985e21d8..351931af04 100644 +--- a/elf/dl-open.c ++++ b/elf/dl-open.c +@@ -405,7 +405,7 @@ update_tls_slotinfo (struct link_map *new) + _dl_fatal_printf (N_("\ + TLS generation counter wrapped! Please report this.")); + /* Can be read concurrently. */ +- atomic_store_relaxed (&GL(dl_tls_generation), newgen); ++ atomic_store_release (&GL(dl_tls_generation), newgen); + + /* We need a second pass for static tls data, because + _dl_update_slotinfo must not be run while calls to +@@ -422,8 +422,8 @@ TLS generation counter wrapped! Please report this.")); + now, but we can delay updating the DTV. */ + imap->l_need_tls_init = 0; + #ifdef SHARED +- /* Update the slot information data for at least the +- generation of the DSO we are allocating data for. */ ++ /* Update the slot information data for the current ++ generation. */ + + /* FIXME: This can terminate the process on memory + allocation failure. It is not possible to raise +@@ -431,7 +431,7 @@ TLS generation counter wrapped! Please report this.")); + _dl_update_slotinfo would have to be split into two + operations, similar to resize_scopes and update_scopes + above. This is related to bug 16134. */ +- _dl_update_slotinfo (imap->l_tls_modid); ++ _dl_update_slotinfo (imap->l_tls_modid, newgen); + #endif + + dl_init_static_tls (imap); +diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c +index 1d558c1e0c..e5c555d82c 100644 +--- a/elf/dl-reloc.c ++++ b/elf/dl-reloc.c +@@ -112,11 +112,11 @@ _dl_try_allocate_static_tls (struct link_map *map, bool optional) + if (map->l_real->l_relocated) + { + #ifdef SHARED ++ /* Update the DTV of the current thread. Note: GL(dl_load_tls_lock) ++ is held here so normal load of the generation counter is valid. */ + if (__builtin_expect (THREAD_DTV()[0].counter != GL(dl_tls_generation), + 0)) +- /* Update the slot information data for at least the generation of +- the DSO we are allocating data for. 
*/ +- (void) _dl_update_slotinfo (map->l_tls_modid); ++ (void) _dl_update_slotinfo (map->l_tls_modid, GL(dl_tls_generation)); + #endif + + dl_init_static_tls (map); +diff --git a/elf/dl-tls.c b/elf/dl-tls.c +index 1f6f820819..70446e71a8 100644 +--- a/elf/dl-tls.c ++++ b/elf/dl-tls.c +@@ -716,57 +716,57 @@ allocate_and_init (struct link_map *map) + + + struct link_map * +-_dl_update_slotinfo (unsigned long int req_modid) ++_dl_update_slotinfo (unsigned long int req_modid, size_t new_gen) + { + struct link_map *the_map = NULL; + dtv_t *dtv = THREAD_DTV (); + +- /* The global dl_tls_dtv_slotinfo array contains for each module +- index the generation counter current when the entry was created. ++ /* CONCURRENCY NOTES: ++ ++ The global dl_tls_dtv_slotinfo_list array contains for each module ++ index the generation counter current when that entry was updated. + This array never shrinks so that all module indices which were +- valid at some time can be used to access it. Before the first +- use of a new module index in this function the array was extended +- appropriately. Access also does not have to be guarded against +- modifications of the array. It is assumed that pointer-size +- values can be read atomically even in SMP environments. It is +- possible that other threads at the same time dynamically load +- code and therefore add to the slotinfo list. This is a problem +- since we must not pick up any information about incomplete work. +- The solution to this is to ignore all dtv slots which were +- created after the one we are currently interested. We know that +- dynamic loading for this module is completed and this is the last +- load operation we know finished. */ +- unsigned long int idx = req_modid; ++ valid at some time can be used to access it. Concurrent loading ++ and unloading of modules can update slotinfo entries or extend ++ the array. The updates happen under the GL(dl_load_tls_lock) and ++ finish with the release store of the generation counter to ++ GL(dl_tls_generation) which is synchronized with the load of ++ new_gen in the caller. So updates up to new_gen are synchronized ++ but updates for later generations may not be. ++ ++ Here we update the thread dtv from old_gen (== dtv[0].counter) to ++ new_gen generation. For this, each dtv[i] entry is either set to ++ an unallocated state (set), or left unmodified (nop). Where (set) ++ may resize the dtv first if modid i >= dtv[-1].counter. The rules ++ for the decision between (set) and (nop) are ++ ++ (1) If slotinfo entry i is concurrently updated then either (set) ++ or (nop) is valid: TLS access cannot use dtv[i] unless it is ++ synchronized with a generation > new_gen. ++ ++ Otherwise, if the generation of slotinfo entry i is gen and the ++ loaded module for this entry is map then ++ ++ (2) If gen <= old_gen then do (nop). ++ ++ (3) If old_gen < gen <= new_gen then ++ (3.1) if map != 0 then (set) ++ (3.2) if map == 0 then either (set) or (nop). ++ ++ Note that (1) cannot be reliably detected, but since both actions ++ are valid it does not have to be. Only (2) and (3.1) cases need ++ to be distinguished for which relaxed mo access of gen and map is ++ enough: their value is synchronized when it matters. ++ ++ Note that a relaxed mo load may give an out-of-thin-air value since ++ it is used in decisions that can affect concurrent stores. But this ++ should only happen if the OOTA value causes UB that justifies the ++ concurrent store of the value. This is not expected to be an issue ++ in practice. 
*/ + struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list); + +- while (idx >= listp->len) ++ if (dtv[0].counter < new_gen) + { +- idx -= listp->len; +- listp = listp->next; +- } +- +- if (dtv[0].counter < listp->slotinfo[idx].gen) +- { +- /* CONCURRENCY NOTES: +- +- Here the dtv needs to be updated to new_gen generation count. +- +- This code may be called during TLS access when GL(dl_load_tls_lock) +- is not held. In that case the user code has to synchronize with +- dlopen and dlclose calls of relevant modules. A module m is +- relevant if the generation of m <= new_gen and dlclose of m is +- synchronized: a memory access here happens after the dlopen and +- before the dlclose of relevant modules. The dtv entries for +- relevant modules need to be updated, other entries can be +- arbitrary. +- +- This e.g. means that the first part of the slotinfo list can be +- accessed race free, but the tail may be concurrently extended. +- Similarly relevant slotinfo entries can be read race free, but +- other entries are racy. However updating a non-relevant dtv +- entry does not affect correctness. For a relevant module m, +- max_modid >= modid of m. */ +- size_t new_gen = listp->slotinfo[idx].gen; + size_t total = 0; + size_t max_modid = atomic_load_relaxed (&GL(dl_tls_max_dtv_idx)); + assert (max_modid >= req_modid); +@@ -779,31 +779,33 @@ _dl_update_slotinfo (unsigned long int req_modid) + { + size_t modid = total + cnt; + +- /* Later entries are not relevant. */ ++ /* Case (1) for all later modids. */ + if (modid > max_modid) + break; + + size_t gen = atomic_load_relaxed (&listp->slotinfo[cnt].gen); + ++ /* Case (1). */ + if (gen > new_gen) +- /* Not relevant. */ + continue; + +- /* If the entry is older than the current dtv layout we +- know we don't have to handle it. */ ++ /* Case (2) or (1). */ + if (gen <= dtv[0].counter) + continue; + ++ /* Case (3) or (1). */ ++ + /* If there is no map this means the entry is empty. */ + struct link_map *map + = atomic_load_relaxed (&listp->slotinfo[cnt].map); + /* Check whether the current dtv array is large enough. */ + if (dtv[-1].counter < modid) + { ++ /* Case (3.2) or (1). */ + if (map == NULL) + continue; + +- /* Resize the dtv. */ ++ /* Resizing the dtv aborts on failure: bug 16134. */ + dtv = _dl_resize_dtv (dtv, max_modid); + + assert (modid <= dtv[-1].counter); +@@ -814,7 +816,7 @@ _dl_update_slotinfo (unsigned long int req_modid) + } + + /* If there is currently memory allocate for this +- dtv entry free it. */ ++ dtv entry free it. Note: this is not AS-safe. */ + /* XXX Ideally we will at some point create a memory + pool. */ + free (dtv[modid].pointer.to_free); +@@ -909,9 +911,9 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t *dtv, struct link_map *the_map) + + static struct link_map * + __attribute_noinline__ +-update_get_addr (GET_ADDR_ARGS) ++update_get_addr (GET_ADDR_ARGS, size_t gen) + { +- struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE); ++ struct link_map *the_map = _dl_update_slotinfo (GET_ADDR_MODULE, gen); + dtv_t *dtv = THREAD_DTV (); + + void *p = dtv[GET_ADDR_MODULE].pointer.val; +@@ -941,12 +943,17 @@ __tls_get_addr (GET_ADDR_ARGS) + dtv_t *dtv = THREAD_DTV (); + + /* Update is needed if dtv[0].counter < the generation of the accessed +- module. The global generation counter is used here as it is easier +- to check. Synchronization for the relaxed MO access is guaranteed +- by user code, see CONCURRENCY NOTES in _dl_update_slotinfo. 
*/
++   module, but the global generation counter is easier to check (which
++   must be synchronized up to the generation of the accessed module by
++   user code doing the TLS access so relaxed mo read is enough).  */
+   size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
+   if (__glibc_unlikely (dtv[0].counter != gen))
+-    return update_get_addr (GET_ADDR_PARAM);
++    {
++      /* Update DTV up to the global generation, see CONCURRENCY NOTES
++	 in _dl_update_slotinfo.  */
++      gen = atomic_load_acquire (&GL(dl_tls_generation));
++      return update_get_addr (GET_ADDR_PARAM, gen);
++    }
+
+   void *p = dtv[GET_ADDR_MODULE].pointer.val;
+
+diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
+index e8b7359b04..ed69c6babd 100644
+--- a/sysdeps/generic/ldsodefs.h
++++ b/sysdeps/generic/ldsodefs.h
+@@ -1251,7 +1251,8 @@ extern void _dl_add_to_slotinfo (struct link_map *l, bool do_add)
+
+ /* Update slot information data for at least the generation of the
+    module with the given index.  */
+-extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid)
++extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid,
++					     size_t gen)
+   attribute_hidden;
+
+ /* Look up the module's TLS block as for __tls_get_addr,
+diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c
+index 7a7fe38625..e9b6ab9970 100644
+--- a/sysdeps/x86_64/dl-tls.c
++++ b/sysdeps/x86_64/dl-tls.c
+@@ -40,9 +40,9 @@ __tls_get_addr_slow (GET_ADDR_ARGS)
+ {
+   dtv_t *dtv = THREAD_DTV ();
+
+-  size_t gen = atomic_load_relaxed (&GL(dl_tls_generation));
++  size_t gen = atomic_load_acquire (&GL(dl_tls_generation));
+   if (__glibc_unlikely (dtv[0].counter != gen))
+-    return update_get_addr (GET_ADDR_PARAM);
++    return update_get_addr (GET_ADDR_PARAM, gen);
+
+   return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL);
+ }
+-- 
+2.27.0
+
diff --git a/elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch b/elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch
new file mode 100644
index 0000000..c60cbb7
--- /dev/null
+++ b/elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch
@@ -0,0 +1,521 @@
+From 549e7f7c5a94f5ccbab2ad5f1babca05028a31c7 Mon Sep 17 00:00:00 2001
+From: Florian Weimer
+Date: Mon, 1 Jul 2024 17:42:04 +0200
+Subject: [PATCH] elf: Support recursive use of dynamic TLS in interposed
+ malloc
+
+It turns out that quite a few applications use bundled mallocs that
+have been built to use global-dynamic TLS (instead of the recommended
+initial-exec TLS).  The previous workaround from
+commit afe42e935b3ee97bac9a7064157587777259c60e ("elf: Avoid some
+free (NULL) calls in _dl_update_slotinfo") does not fix all
+encountered cases, unfortunately.
+
+This change avoids the TLS generation update for recursive use
+of TLS from a malloc that was called during a TLS update.  This
+is possible because an interposed malloc has a fixed module ID and
+TLS slot.  (It cannot be unloaded.)  If an initially-loaded module ID
+is encountered in __tls_get_addr and the dynamic linker is already
+in the middle of a TLS update, use the outdated DTV, thus avoiding
+another call into malloc.  It's still necessary to update the
+DTV to the most recent generation, to get out of the slow path,
+which is why the check for recursion is needed.
+
+The bookkeeping is done using a global counter instead of a
+per-thread flag because TLS access in the dynamic linker is tricky.
+
+All this will go away once the dynamic linker stops using malloc
+for TLS, likely as part of a change that pre-allocates all TLS
+during pthread_create/dlopen.
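+
+As a sketch, the bookkeeping added below reduces to a global counter
+with three helpers (names as in the patch; bodies simplified):
+
+  /* Number of threads currently executing a dynamic TLS update.  Any
+     malloc/free performed on behalf of the DTV is bracketed by the
+     begin/end pair.  */
+  unsigned int _dl_tls_threads_in_update;
+
+  static inline void
+  _dl_tls_allocate_begin (void)
+  { atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, 1); }
+
+  static inline void
+  _dl_tls_allocate_end (void)
+  { atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, -1); }
+
+  static inline bool
+  _dl_tls_allocate_active (void)
+  { return atomic_load_relaxed (&_dl_tls_threads_in_update) > 0; }
+
+__tls_get_addr then serves a request for an initially-loaded module ID
+from the existing (possibly outdated) DTV whenever
+_dl_tls_allocate_active () reports an update in progress.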
+ +Fixes commit d2123d68275acc0f061e73d5f86ca504e0d5a344 ("elf: Fix slow +tls access after dlopen [BZ #19924]"). + +Reviewed-by: Szabolcs Nagy +(cherry picked from commit 018f0fc3b818d4d1460a4e2384c24802504b1d20) + +Conflict: adapt file "elf/Makefile" for patch "elf: Switch to main +malloc after final ld.so self-relocation" +--- + elf/Makefile | 26 +++++++++ + elf/dl-tls.c | 95 +++++++++++++++++++++++++++++--- + elf/rtld.c | 2 + + elf/tst-recursive-tls.c | 60 ++++++++++++++++++++ + elf/tst-recursive-tlsmallocmod.c | 64 +++++++++++++++++++++ + elf/tst-recursive-tlsmodN.c | 28 ++++++++++ + sysdeps/generic/ldsodefs.h | 14 +++++ + sysdeps/x86_64/dl-tls.c | 5 +- + 8 files changed, 284 insertions(+), 10 deletions(-) + create mode 100644 elf/tst-recursive-tls.c + create mode 100644 elf/tst-recursive-tlsmallocmod.c + create mode 100644 elf/tst-recursive-tlsmodN.c + +diff --git a/elf/Makefile b/elf/Makefile +index ea98cba8..391f29e9 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -433,6 +433,7 @@ tests += \ + tst-p_align1 \ + tst-p_align2 \ + tst-p_align3 \ ++ tst-recursive-tls \ + tst-relsort1 \ + tst-ro-dynamic \ + tst-rtld-no-malloc \ +@@ -865,6 +866,23 @@ modules-names += \ + tst-null-argv-lib \ + tst-p_alignmod-base \ + tst-p_alignmod3 \ ++ tst-recursive-tlsmallocmod \ ++ tst-recursive-tlsmod0 \ ++ tst-recursive-tlsmod1 \ ++ tst-recursive-tlsmod2 \ ++ tst-recursive-tlsmod3 \ ++ tst-recursive-tlsmod4 \ ++ tst-recursive-tlsmod5 \ ++ tst-recursive-tlsmod6 \ ++ tst-recursive-tlsmod7 \ ++ tst-recursive-tlsmod8 \ ++ tst-recursive-tlsmod9 \ ++ tst-recursive-tlsmod10 \ ++ tst-recursive-tlsmod11 \ ++ tst-recursive-tlsmod12 \ ++ tst-recursive-tlsmod13 \ ++ tst-recursive-tlsmod14 \ ++ tst-recursive-tlsmod15 \ + tst-relsort1mod1 \ + tst-relsort1mod2 \ + tst-ro-dynamic-mod \ +@@ -3042,6 +3060,14 @@ CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2 + CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2 + endif + ++$(objpfx)tst-recursive-tls: $(objpfx)tst-recursive-tlsmallocmod.so ++# More objects than DTV_SURPLUS, to trigger DTV reallocation. ++$(objpfx)tst-recursive-tls.out: \ ++ $(patsubst %,$(objpfx)tst-recursive-tlsmod%.so, \ ++ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15) ++$(objpfx)tst-recursive-tlsmod%.os: tst-recursive-tlsmodN.c ++ $(compile-command.c) -DVAR=thread_$* -DFUNC=get_threadvar_$* ++ + # Reuse an audit module which provides ample debug logging. + tst-rtld-no-malloc-audit-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so + +diff --git a/elf/dl-tls.c b/elf/dl-tls.c +index de016831..59d4021e 100644 +--- a/elf/dl-tls.c ++++ b/elf/dl-tls.c +@@ -75,6 +75,31 @@ + /* Default for dl_tls_static_optional. */ + #define OPTIONAL_TLS 512 + ++/* Used to count the number of threads currently executing dynamic TLS ++ updates. Used to avoid recursive malloc calls in __tls_get_addr ++ for an interposed malloc that uses global-dynamic TLS (which is not ++ recommended); see _dl_tls_allocate_active checks. This could be a ++ per-thread flag, but would need TLS access in the dynamic linker. */ ++unsigned int _dl_tls_threads_in_update; ++ ++static inline void ++_dl_tls_allocate_begin (void) ++{ ++ atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, 1); ++} ++ ++static inline void ++_dl_tls_allocate_end (void) ++{ ++ atomic_fetch_add_relaxed (&_dl_tls_threads_in_update, -1); ++} ++ ++static inline bool ++_dl_tls_allocate_active (void) ++{ ++ return atomic_load_relaxed (&_dl_tls_threads_in_update) > 0; ++} ++ + /* Compute the static TLS surplus based on the namespace count and the + TLS space that can be used for optimizations. 
*/ + static inline int +@@ -425,12 +450,18 @@ _dl_allocate_tls_storage (void) + size += TLS_PRE_TCB_SIZE; + #endif + +- /* Perform the allocation. Reserve space for the required alignment +- and the pointer to the original allocation. */ ++ /* Reserve space for the required alignment and the pointer to the ++ original allocation. */ + size_t alignment = GLRO (dl_tls_static_align); ++ ++ /* Perform the allocation. */ ++ _dl_tls_allocate_begin (); + void *allocated = malloc (size + alignment + sizeof (void *)); + if (__glibc_unlikely (allocated == NULL)) +- return NULL; ++ { ++ _dl_tls_allocate_end (); ++ return NULL; ++ } + + /* Perform alignment and allocate the DTV. */ + #if TLS_TCB_AT_TP +@@ -466,6 +497,8 @@ _dl_allocate_tls_storage (void) + result = allocate_dtv (result); + if (result == NULL) + free (allocated); ++ ++ _dl_tls_allocate_end (); + return result; + } + +@@ -483,6 +516,7 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid) + size_t newsize = max_modid + DTV_SURPLUS; + size_t oldsize = dtv[-1].counter; + ++ _dl_tls_allocate_begin (); + if (dtv == GL(dl_initial_dtv)) + { + /* This is the initial dtv that was either statically allocated in +@@ -502,6 +536,7 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid) + if (newp == NULL) + oom (); + } ++ _dl_tls_allocate_end (); + + newp[0].counter = newsize; + +@@ -676,7 +711,9 @@ allocate_dtv_entry (size_t alignment, size_t size) + if (powerof2 (alignment) && alignment <= _Alignof (max_align_t)) + { + /* The alignment is supported by malloc. */ ++ _dl_tls_allocate_begin (); + void *ptr = malloc (size); ++ _dl_tls_allocate_end (); + return (struct dtv_pointer) { ptr, ptr }; + } + +@@ -688,7 +725,10 @@ allocate_dtv_entry (size_t alignment, size_t size) + + /* Perform the allocation. This is the pointer we need to free + later. */ ++ _dl_tls_allocate_begin (); + void *start = malloc (alloc_size); ++ _dl_tls_allocate_end (); ++ + if (start == NULL) + return (struct dtv_pointer) {}; + +@@ -826,7 +866,11 @@ _dl_update_slotinfo (unsigned long int req_modid, size_t new_gen) + free implementation. Checking here papers over at + least some dynamic TLS usage by interposed mallocs. */ + if (dtv[modid].pointer.to_free != NULL) +- free (dtv[modid].pointer.to_free); ++ { ++ _dl_tls_allocate_begin (); ++ free (dtv[modid].pointer.to_free); ++ _dl_tls_allocate_end (); ++ } + dtv[modid].pointer.val = TLS_DTV_UNALLOCATED; + dtv[modid].pointer.to_free = NULL; + +@@ -956,10 +1000,22 @@ __tls_get_addr (GET_ADDR_ARGS) + size_t gen = atomic_load_relaxed (&GL(dl_tls_generation)); + if (__glibc_unlikely (dtv[0].counter != gen)) + { +- /* Update DTV up to the global generation, see CONCURRENCY NOTES +- in _dl_update_slotinfo. */ +- gen = atomic_load_acquire (&GL(dl_tls_generation)); +- return update_get_addr (GET_ADDR_PARAM, gen); ++ if (_dl_tls_allocate_active () ++ && GET_ADDR_MODULE < _dl_tls_initial_modid_limit) ++ /* This is a reentrant __tls_get_addr call, but we can ++ satisfy it because it's an initially-loaded module ID. ++ These TLS slotinfo slots do not change, so the ++ out-of-date generation counter does not matter. However, ++ if not in a TLS update, still update_get_addr below, to ++ get off the slow path eventually. */ ++ ; ++ else ++ { ++ /* Update DTV up to the global generation, see CONCURRENCY NOTES ++ in _dl_update_slotinfo. 
*/ ++ gen = atomic_load_acquire (&GL(dl_tls_generation)); ++ return update_get_addr (GET_ADDR_PARAM, gen); ++ } + } + + void *p = dtv[GET_ADDR_MODULE].pointer.val; +@@ -969,7 +1025,7 @@ __tls_get_addr (GET_ADDR_ARGS) + + return (char *) p + GET_ADDR_OFFSET; + } +-#endif ++#endif /* SHARED */ + + + /* Look up the module's TLS block as for __tls_get_addr, +@@ -1018,6 +1074,25 @@ _dl_tls_get_addr_soft (struct link_map *l) + return data; + } + ++size_t _dl_tls_initial_modid_limit; ++ ++void ++_dl_tls_initial_modid_limit_setup (void) ++{ ++ struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list); ++ size_t idx; ++ for (idx = 0; idx < listp->len; ++idx) ++ { ++ struct link_map *l = listp->slotinfo[idx].map; ++ if (l == NULL ++ /* The object can be unloaded, so its modid can be ++ reassociated. */ ++ || !(l->l_type == lt_executable || l->l_type == lt_library)) ++ break; ++ } ++ _dl_tls_initial_modid_limit = idx; ++} ++ + + void + _dl_add_to_slotinfo (struct link_map *l, bool do_add) +@@ -1050,9 +1125,11 @@ _dl_add_to_slotinfo (struct link_map *l, bool do_add) + the first slot. */ + assert (idx == 0); + ++ _dl_tls_allocate_begin (); + listp = (struct dtv_slotinfo_list *) + malloc (sizeof (struct dtv_slotinfo_list) + + TLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo)); ++ _dl_tls_allocate_end (); + if (listp == NULL) + { + /* We ran out of memory while resizing the dtv slotinfo list. */ +diff --git a/elf/rtld.c b/elf/rtld.c +index 558733b8..0a1e202c 100644 +--- a/elf/rtld.c ++++ b/elf/rtld.c +@@ -789,6 +789,8 @@ init_tls (size_t naudit) + _dl_fatal_printf ("\ + cannot allocate TLS data structures for initial thread\n"); + ++ _dl_tls_initial_modid_limit_setup (); ++ + /* Store for detection of the special case by __tls_get_addr + so it knows not to pass this dtv to the normal realloc. */ + GL(dl_initial_dtv) = GET_DTV (tcbp); +diff --git a/elf/tst-recursive-tls.c b/elf/tst-recursive-tls.c +new file mode 100644 +index 00000000..716d1f78 +--- /dev/null ++++ b/elf/tst-recursive-tls.c +@@ -0,0 +1,60 @@ ++/* Test with interposed malloc with dynamic TLS. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++ ++/* Defined in tst-recursive-tlsmallocmod.so. */ ++extern __thread unsigned int malloc_subsytem_counter; ++ ++static int ++do_test (void) ++{ ++ /* 16 is large enough to exercise the DTV resizing case. */ ++ void *handles[16]; ++ ++ for (unsigned int i = 0; i < array_length (handles); ++i) ++ { ++ /* Re-use the TLS slot for module 0. 
*/ ++ if (i > 0) ++ xdlclose (handles[0]); ++ ++ char soname[30]; ++ snprintf (soname, sizeof (soname), "tst-recursive-tlsmod%u.so", i); ++ handles[i] = xdlopen (soname, RTLD_NOW); ++ ++ if (i > 0) ++ { ++ handles[0] = xdlopen ("tst-recursive-tlsmod0.so", RTLD_NOW); ++ int (*fptr) (void) = xdlsym (handles[0], "get_threadvar_0"); ++ /* May trigger TLS storage allocation using malloc. */ ++ TEST_COMPARE (fptr (), 0); ++ } ++ } ++ ++ for (unsigned int i = 0; i < array_length (handles); ++i) ++ xdlclose (handles[i]); ++ ++ printf ("info: malloc subsystem calls: %u\n", malloc_subsytem_counter); ++ TEST_VERIFY (malloc_subsytem_counter > 0); ++ return 0; ++} ++ ++#include +diff --git a/elf/tst-recursive-tlsmallocmod.c b/elf/tst-recursive-tlsmallocmod.c +new file mode 100644 +index 00000000..c24e9945 +--- /dev/null ++++ b/elf/tst-recursive-tlsmallocmod.c +@@ -0,0 +1,64 @@ ++/* Interposed malloc with dynamic TLS. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++ ++__thread unsigned int malloc_subsytem_counter; ++ ++static __typeof (malloc) *malloc_fptr; ++static __typeof (free) *free_fptr; ++static __typeof (calloc) *calloc_fptr; ++static __typeof (realloc) *realloc_fptr; ++ ++static void __attribute__ ((constructor)) ++init (void) ++{ ++ malloc_fptr = dlsym (RTLD_NEXT, "malloc"); ++ free_fptr = dlsym (RTLD_NEXT, "free"); ++ calloc_fptr = dlsym (RTLD_NEXT, "calloc"); ++ realloc_fptr = dlsym (RTLD_NEXT, "realloc"); ++} ++ ++void * ++malloc (size_t size) ++{ ++ ++malloc_subsytem_counter; ++ return malloc_fptr (size); ++} ++ ++void ++free (void *ptr) ++{ ++ ++malloc_subsytem_counter; ++ return free_fptr (ptr); ++} ++ ++void * ++calloc (size_t a, size_t b) ++{ ++ ++malloc_subsytem_counter; ++ return calloc_fptr (a, b); ++} ++ ++void * ++realloc (void *ptr, size_t size) ++{ ++ ++malloc_subsytem_counter; ++ return realloc_fptr (ptr, size); ++} +diff --git a/elf/tst-recursive-tlsmodN.c b/elf/tst-recursive-tlsmodN.c +new file mode 100644 +index 00000000..bb7592ae +--- /dev/null ++++ b/elf/tst-recursive-tlsmodN.c +@@ -0,0 +1,28 @@ ++/* Test module with global-dynamic TLS. Used to trigger DTV reallocation. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. 
++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* Compiled with VAR and FUNC set via -D. FUNC requires some ++ relocation against TLS variable VAR. */ ++ ++__thread int VAR; ++ ++int ++FUNC (void) ++{ ++ return VAR; ++} +diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h +index 22fbbecd..ad271ae0 100644 +--- a/sysdeps/generic/ldsodefs.h ++++ b/sysdeps/generic/ldsodefs.h +@@ -1262,6 +1262,20 @@ extern struct link_map *_dl_update_slotinfo (unsigned long int req_modid, + size_t gen) + attribute_hidden; + ++/* The last TLS module ID that is initially loaded, plus 1. TLS ++ addresses for modules with IDs lower than that can be obtained from ++ the DTV even if its generation is outdated. */ ++extern size_t _dl_tls_initial_modid_limit attribute_hidden attribute_relro; ++ ++/* Compute _dl_tls_initial_modid_limit. To be called after initial ++ relocation. */ ++void _dl_tls_initial_modid_limit_setup (void) attribute_hidden; ++ ++/* Number of threads currently in a TLS update. This is used to ++ detect reentrant __tls_get_addr calls without a per-thread ++ flag. */ ++extern unsigned int _dl_tls_threads_in_update attribute_hidden; ++ + /* Look up the module's TLS block as for __tls_get_addr, + but never touch anything. Return null if it's not allocated yet. */ + extern void *_dl_tls_get_addr_soft (struct link_map *l) attribute_hidden; +diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c +index e9b6ab99..c484f39e 100644 +--- a/sysdeps/x86_64/dl-tls.c ++++ b/sysdeps/x86_64/dl-tls.c +@@ -41,7 +41,10 @@ __tls_get_addr_slow (GET_ADDR_ARGS) + dtv_t *dtv = THREAD_DTV (); + + size_t gen = atomic_load_acquire (&GL(dl_tls_generation)); +- if (__glibc_unlikely (dtv[0].counter != gen)) ++ if (__glibc_unlikely (dtv[0].counter != gen) ++ /* See comment in __tls_get_addr in elf/dl-tls.c. 
*/ ++ && !(_dl_tls_allocate_active () ++ && GET_ADDR_MODULE < _dl_tls_initial_modid_limit)) + return update_get_addr (GET_ADDR_PARAM, gen); + + return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL); +-- +2.27.0 + diff --git a/glibc.spec b/glibc.spec index f35d480..5589315 100644 --- a/glibc.spec +++ b/glibc.spec @@ -67,7 +67,7 @@ ############################################################################## Name: glibc Version: 2.38 -Release: 51 +Release: 52 Summary: The GNU libc libraries License: %{all_license} URL: http://www.gnu.org/software/glibc/ @@ -252,6 +252,21 @@ Patch162: nptl-initialize-rseq-area-prior-to-registration.patch Patch163: nptl-initialize-cpu_id_start-prior-to-rseq-registrat.patch Patch164: x86-Avoid-integer-truncation-with-large-cache-sizes-.patch Patch165: LoongArch-Force-SHMLBA-the-same-as-kernel.patch +Patch166: x86_64-Sort-fpu-multiarch-Makefile.patch +Patch167: x86_64-Add-log2-with-FMA.patch +Patch168: x86_64-Add-expm1-with-FMA.patch +Patch169: x86_64-Add-log1p-with-FMA.patch +Patch170: x86-Check-the-lower-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch +Patch171: elf-Fix-slow-tls-access-after-dlopen-BZ-19924.patch +Patch172: x86-Only-align-destination-to-1x-VEC_SIZE-in-memset-.patch +Patch173: sysdeps-x86-Makefile-Split-and-sort-tests.patch +Patch174: x86_64-Fix-missing-wcsncat-function-definition-witho.patch +Patch175: x86-Improve-large-memset-perf-with-non-temporal-stor.patch +Patch176: x86-string-Fixup-alignment-of-main-loop-in-str-n-cmp.patch +Patch177: elf-Avoid-some-free-NULL-calls-in-_dl_update_slotinf.patch +Patch178: elf-Support-recursive-use-of-dynamic-TLS-in-interpos.patch +Patch179: Fix-underallocation-of-abort_msg_s-struct-CVE-2025-0.patch +Patch180: stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch #openEuler patch list Patch9000: turn-default-value-of-x86_rep_stosb_threshold_form_2K_to_1M.patch @@ -1471,6 +1486,23 @@ fi %endif %changelog +* Sun Jan 26 2025 Qingqing Li - 2.38-52 +- stdlib: Test using setenv with updated environ [BZ #32588] +- Fix underallocation of abort_msg_s struct (CVE-2025-0395) +- elf: Support recursive use of dynamic TLS in interposed malloc +- elf: Avoid some free (NULL) calls in _dl_update_slotinfo +- x86/string: Fixup alignment of main loop in str{n}cmp-evex [BZ #32212] +- x86: Improve large memset perf with non-temporal stores [RHEL-29312] +- x86_64: Fix missing wcsncat function definition without multiarch (x86-64-v4) +- sysdeps/x86/Makefile: Split and sort tests +- x86: Only align destination to 1x VEC_SIZE in memset 4x loop +- elf: Fix slow tls access after dlopen [BZ #19924] +- x86: Check the lower byte of EAX of CPUID leaf 2 [BZ #30643] +- x86_64: Add log1p with FMA +- x86_64: Add expm1 with FMA +- x86_64: Add log2 with FMA +- x86_64: Sort fpu/multiarch/Makefile + * Wed Jan 15 2025 MayShao - 2.38-51 - x86: Set preferred CPU features and default NT threshold for Zhaoxin processors diff --git a/stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch b/stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch new file mode 100644 index 0000000..9bbb5c3 --- /dev/null +++ b/stdlib-Test-using-setenv-with-updated-environ-BZ-325.patch @@ -0,0 +1,75 @@ +From 650a0aaaffa9ddb44732fa6156b31c5f30ee596f Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 24 Jan 2025 18:53:13 +0800 +Subject: [PATCH] stdlib: Test using setenv with updated environ [BZ + #32588] + +Add a test for setenv with updated environ. Verify that BZ #32588 is +fixed. + +Signed-off-by: H.J. 
Lu +Reviewed-by: Florian Weimer +(cherry picked from commit 8ab34497de14e35aff09b607222fe1309ef156da) +--- + stdlib/Makefile | 1 + + stdlib/tst-setenv-environ.c | 36 ++++++++++++++++++++++++++++++++++++ + 2 files changed, 37 insertions(+) + create mode 100644 stdlib/tst-setenv-environ.c + +diff --git a/stdlib/Makefile b/stdlib/Makefile +index 25e42a77e7..750810ee92 100644 +--- a/stdlib/Makefile ++++ b/stdlib/Makefile +@@ -232,6 +232,7 @@ tests := \ + tst-setcontext7 \ + tst-setcontext8 \ + tst-setcontext9 \ ++ tst-setenv-environ \ + tst-strfmon_l \ + tst-strfrom \ + tst-strfrom-locale \ +diff --git a/stdlib/tst-setenv-environ.c b/stdlib/tst-setenv-environ.c +new file mode 100644 +index 0000000000..02fcef96d0 +--- /dev/null ++++ b/stdlib/tst-setenv-environ.c +@@ -0,0 +1,36 @@ ++/* Test using setenv with updated environ. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++ ++extern char **environ; ++ ++int ++do_test (void) ++{ ++ char *valp; ++ static char *dummy_environ[] = { NULL }; ++ environ = dummy_environ; ++ setenv ("A", "1", 0); ++ valp = getenv ("A"); ++ TEST_VERIFY_EXIT (valp[0] == '1' && valp[1] == '\0'); ++ return 0; ++} ++ ++#include +-- +2.27.0 + diff --git a/sysdeps-x86-Makefile-Split-and-sort-tests.patch b/sysdeps-x86-Makefile-Split-and-sort-tests.patch new file mode 100644 index 0000000..6925e1a --- /dev/null +++ b/sysdeps-x86-Makefile-Split-and-sort-tests.patch @@ -0,0 +1,178 @@ +From 0d14bf0754ee8d8cf2bf3dad298fa5c5f97537db Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 7 Dec 2023 09:00:11 -0800 +Subject: [PATCH] sysdeps/x86/Makefile: Split and sort tests + +Put each test on a separate line and sort tests. 
+ +(cherry picked from commit 7e03e0de7e7c2de975b5c5e18f5a4b0c75816674) +--- + sysdeps/x86/Makefile | 110 ++++++++++++++++++++++++++++++------------- + 1 file changed, 78 insertions(+), 32 deletions(-) + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 917c26f116..5631a59a26 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -10,36 +10,51 @@ sysdep_headers += sys/platform/x86.h bits/platform/x86.h + CFLAGS-dl-get-cpu-features.os += $(rtld-early-cflags) + CFLAGS-get-cpuid-feature-leaf.o += $(no-stack-protector) + +-tests += tst-get-cpu-features tst-get-cpu-features-static \ +- tst-cpu-features-cpuinfo tst-cpu-features-cpuinfo-static \ +- tst-cpu-features-supports tst-cpu-features-supports-static +-tests-static += tst-get-cpu-features-static \ +- tst-cpu-features-cpuinfo-static \ +- tst-cpu-features-supports-static ++tests += \ ++ tst-get-cpu-features \ ++ tst-get-cpu-features-static \ ++ tst-cpu-features-cpuinfo \ ++ tst-cpu-features-cpuinfo-static \ ++ tst-cpu-features-supports \ ++ tst-cpu-features-supports-static \ ++# tests ++tests-static += \ ++ tst-get-cpu-features-static \ ++ tst-cpu-features-cpuinfo-static \ ++ tst-cpu-features-supports-static \ ++# tests-static + ifeq (yes,$(have-ifunc)) + ifeq (yes,$(have-gcc-ifunc)) + tests += \ + tst-ifunc-isa-1 \ +- tst-ifunc-isa-1-static ++ tst-ifunc-isa-1-static \ ++# tests + tests-static += \ +- tst-ifunc-isa-1-static ++ tst-ifunc-isa-1-static \ ++# tests-static + test-xfail-tst-ifunc-isa-1 = $(with-lld) + test-xfail-tst-ifunc-isa-1-static = $(with-lld) + tests += \ + tst-ifunc-isa-2 \ +- tst-ifunc-isa-2-static ++ tst-ifunc-isa-2-static \ ++# tests + tests-static += \ +- tst-ifunc-isa-2-static ++ tst-ifunc-isa-2-static \ ++# tests-static + test-xfail-tst-ifunc-isa-2 = $(with-lld) + test-xfail-tst-ifunc-isa-2-static = $(with-lld) + endif + endif + ifeq (yes,$(enable-x86-isa-level)) +-tests += tst-isa-level-1 +-modules-names += tst-isa-level-mod-1-baseline \ +- tst-isa-level-mod-1-v2 \ +- tst-isa-level-mod-1-v3 \ +- tst-isa-level-mod-1-v4 \ ++tests += \ ++ tst-isa-level-1 \ ++# tests ++modules-names += \ ++ tst-isa-level-mod-1-baseline \ ++ tst-isa-level-mod-1-v2 \ ++ tst-isa-level-mod-1-v3 \ ++ tst-isa-level-mod-1-v4 \ ++# modules-names + + # X86 ISA level baseline + CFLAGS-tst-isa-level-mod-1-baseline.c += -DINCLUDE_X86_ISA_LEVEL \ +@@ -68,14 +83,18 @@ tst-ifunc-isa-2-static-ENV = $(tst-ifunc-isa-2-ENV) + endif + + ifeq ($(subdir),math) +-tests += tst-ldbl-nonnormal-printf ++tests += \ ++ tst-ldbl-nonnormal-printf \ ++# tests + endif # $(subdir) == math + + ifeq ($(subdir),setjmp) + gen-as-const-headers += jmp_buf-ssp.sym + sysdep_routines += __longjmp_cancel + ifneq ($(enable-cet),no) +-tests += tst-setjmp-cet ++tests += \ ++ tst-setjmp-cet \ ++# tests + tst-setjmp-cet-ENV = GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on + endif + endif +@@ -122,20 +141,45 @@ ifneq ($(enable-cet),no) + ifeq ($(subdir),elf) + sysdep-dl-routines += dl-cet + +-tests += tst-cet-legacy-1 tst-cet-legacy-1a tst-cet-legacy-2 \ +- tst-cet-legacy-2a tst-cet-legacy-3 tst-cet-legacy-4 \ +- tst-cet-legacy-5a tst-cet-legacy-6a tst-cet-legacy-7 \ +- tst-cet-legacy-8 tst-cet-legacy-9 tst-cet-legacy-9-static \ +- tst-cet-legacy-10 tst-cet-legacy-10-static +-tests-static += tst-cet-legacy-9-static tst-cet-legacy-10-static ++tests += \ ++ tst-cet-legacy-1 \ ++ tst-cet-legacy-1a \ ++ tst-cet-legacy-2 \ ++ tst-cet-legacy-2a \ ++ tst-cet-legacy-3 \ ++ tst-cet-legacy-4 \ ++ tst-cet-legacy-5a \ ++ tst-cet-legacy-6a \ ++ tst-cet-legacy-7 
\ ++ tst-cet-legacy-8 \ ++ tst-cet-legacy-9 \ ++ tst-cet-legacy-9-static \ ++ tst-cet-legacy-10 \ ++ tst-cet-legacy-10-static \ ++# tests ++tests-static += \ ++ tst-cet-legacy-9-static \ ++ tst-cet-legacy-10-static \ ++# tests-static + tst-cet-legacy-1a-ARGS = -- $(host-test-program-cmd) +-tests += tst-cet-legacy-4a tst-cet-legacy-4b tst-cet-legacy-4c \ +- tst-cet-legacy-5b tst-cet-legacy-6b +-modules-names += tst-cet-legacy-mod-1 tst-cet-legacy-mod-2 \ +- tst-cet-legacy-mod-4 tst-cet-legacy-mod-5a \ +- tst-cet-legacy-mod-5b tst-cet-legacy-mod-5c \ +- tst-cet-legacy-mod-6a tst-cet-legacy-mod-6b \ +- tst-cet-legacy-mod-6c ++tests += \ ++ tst-cet-legacy-4a \ ++ tst-cet-legacy-4b \ ++ tst-cet-legacy-4c \ ++ tst-cet-legacy-5b \ ++ tst-cet-legacy-6b \ ++# tests ++modules-names += \ ++ tst-cet-legacy-mod-1 \ ++ tst-cet-legacy-mod-2 \ ++ tst-cet-legacy-mod-4 \ ++ tst-cet-legacy-mod-5a \ ++ tst-cet-legacy-mod-5b \ ++ tst-cet-legacy-mod-5c \ ++ tst-cet-legacy-mod-6a \ ++ tst-cet-legacy-mod-6b \ ++ tst-cet-legacy-mod-6c \ ++# modules-names + + CFLAGS-tst-cet-legacy-2.c += -fcf-protection=branch + CFLAGS-tst-cet-legacy-2a.c += -fcf-protection +@@ -243,7 +287,9 @@ endif + ifeq ($(subdir),posix) + tests += \ + tst-sysconf-cache-linesize \ +- tst-sysconf-cache-linesize-static ++ tst-sysconf-cache-linesize-static \ ++# tests + tests-static += \ +- tst-sysconf-cache-linesize-static ++ tst-sysconf-cache-linesize-static \ ++# tests-static + endif +-- +2.27.0 + diff --git a/x86-Check-the-lower-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch b/x86-Check-the-lower-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch new file mode 100644 index 0000000..701321b --- /dev/null +++ b/x86-Check-the-lower-byte-of-EAX-of-CPUID-leaf-2-BZ-3.patch @@ -0,0 +1,77 @@ +From 58822f954f6284c8687dfff43fa4e9e349eeccad Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 28 Aug 2023 12:08:14 -0700 +Subject: [PATCH] x86: Check the lower byte of EAX of CPUID leaf 2 [BZ + #30643] + +The old Intel software developer manual specified that the low byte of +EAX of CPUID leaf 2 returned 1 which indicated the number of rounds of +CPUDID leaf 2 was needed to retrieve the complete cache information. The +newer Intel manual has been changed to that it should always return 1 +and be ignored. If the lower byte isn't 1, CPUID leaf 2 can't be used. +In this case, we ignore CPUID leaf 2 and use CPUID leaf 4 instead. If +CPUID leaf 4 doesn't contain the cache information, cache information +isn't available at all. This addresses BZ #30643. + +(cherry picked from commit 1493622f4f9048ffede3fbedb64695efa49d662a) +--- + sysdeps/x86/dl-cacheinfo.h | 31 +++++++++++++------------------ + 1 file changed, 13 insertions(+), 18 deletions(-) + +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index 6c7740422a..400d15f208 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -187,7 +187,7 @@ intel_check_word (int name, unsigned int value, bool *has_level_2, + ++round; + } + /* There is no other cache information anywhere else. */ +- break; ++ return -1; + } + else + { +@@ -257,28 +257,23 @@ handle_intel (int name, const struct cpu_features *cpu_features) + + /* OK, we can use the CPUID instruction to get all info about the + caches. 
*/
+-  unsigned int cnt = 0;
+-  unsigned int max = 1;
+   long int result = 0;
+   bool no_level_2_or_3 = false;
+   bool has_level_2 = false;
++  unsigned int eax;
++  unsigned int ebx;
++  unsigned int ecx;
++  unsigned int edx;
++  __cpuid (2, eax, ebx, ecx, edx);
+
+-  while (cnt++ < max)
++  /* The low byte of EAX of CPUID leaf 2 should always return 1 and it
++     should be ignored.  If it isn't 1, use CPUID leaf 4 instead.  */
++  if ((eax & 0xff) != 1)
++    return intel_check_word (name, 0xff, &has_level_2, &no_level_2_or_3,
++			     cpu_features);
++  else
+     {
+-      unsigned int eax;
+-      unsigned int ebx;
+-      unsigned int ecx;
+-      unsigned int edx;
+-      __cpuid (2, eax, ebx, ecx, edx);
+-
+-      /* The low byte of EAX in the first round contain the number of
+-	 rounds we have to make.  At least one, the one we are already
+-	 doing.  */
+-      if (cnt == 1)
+-	{
+-	  max = eax & 0xff;
+-	  eax &= 0xffffff00;
+-	}
++      eax &= 0xffffff00;
+
+       /* Process the individual registers' value.  */
+       result = intel_check_word (name, eax, &has_level_2,
+-- 
+2.27.0
+
diff --git a/x86-Improve-large-memset-perf-with-non-temporal-stor.patch b/x86-Improve-large-memset-perf-with-non-temporal-stor.patch
new file mode 100644
index 0000000..abd7fdf
--- /dev/null
+++ b/x86-Improve-large-memset-perf-with-non-temporal-stor.patch
@@ -0,0 +1,254 @@
+From 04b8d484323b2ff18b3422c4b883ef4cb6281c53 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Fri, 24 May 2024 12:38:50 -0500
+Subject: [PATCH] x86: Improve large memset perf with non-temporal stores
+ [RHEL-29312]
+
+Previously we used `rep stosb` for all medium/large memsets.  This is
+notably worse than non-temporal stores for large (above a few MBs)
+memsets.  See:
+https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
+for data using different strategies for large memset on ICX and SKX.
+
+Using non-temporal stores can be up to 3x faster on ICX and 2x faster
+on SKX.  Historically, these numbers would not have been so good
+because of the zero-over-zero writeback optimization that `rep stosb`
+is able to do.  But, the zero-over-zero writeback optimization has been
+removed as a potential side-channel attack, so there is no longer any
+good reason to only rely on `rep stosb` for large memsets.  On the flip
+side, non-temporal writes can avoid data in their RFO requests, saving
+memory bandwidth.
+
+All of the other changes to the file are to re-organize the
+code-blocks to maintain "good" alignment given the new code added in
+the `L(stosb_local)` case.
+
+The results from running the GLIBC memset benchmarks on TGL-client for
+N=20 runs:
+
+Geometric Mean across the suite New / Old EVEX256: 0.979
+Geometric Mean across the suite New / Old EVEX512: 0.979
+Geometric Mean across the suite New / Old AVX2   : 0.986
+Geometric Mean across the suite New / Old SSE2   : 0.979
+
+Most of the cases are essentially unchanged; this is mostly to show
+that adding the non-temporal case didn't add any regressions to the
+other cases.
+
+The results on the memset-large benchmark suite on TGL-client for N=20
+runs:
+
+Geometric Mean across the suite New / Old EVEX256: 0.926
+Geometric Mean across the suite New / Old EVEX512: 0.925
+Geometric Mean across the suite New / Old AVX2   : 0.928
+Geometric Mean across the suite New / Old SSE2   : 0.924
+
+So roughly a 7.5% speedup.  This is lower than what we see on servers
+(likely because clients typically have faster single-core bandwidth so
+saving bandwidth on RFOs is less impactful), but still advantageous.
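+
+Conceptually, the new large-size path behaves like this C sketch using
+SSE2 intrinsics (illustration only, with a hypothetical helper name:
+it assumes dst is 16-byte aligned and n is a multiple of 16; the real
+code below is hand-written assembly parameterized over VEC_SIZE that
+also handles the unaligned head/tail with plain vector stores):
+
+  #include <emmintrin.h>
+
+  static void
+  memset_nontemporal (char *dst, int c, size_t n)
+  {
+    __m128i v = _mm_set1_epi8 ((char) c);
+    for (size_t i = 0; i < n; i += 16)
+      /* Streaming store: bypasses the cache hierarchy and avoids the
+	 RFO read of the destination lines.  */
+      _mm_stream_si128 ((__m128i *) (dst + i), v);
+    /* Make the weakly-ordered NT stores visible before returning.  */
+    _mm_sfence ();
+  }
+
+Selection follows the thresholds described in the comment added to the
+file: `rep stosb` in [__x86_rep_stosb_threshold,
+__x86_shared_non_temporal_threshold) and the non-temporal loop at or
+above __x86_shared_non_temporal_threshold.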
+ +Full test-suite passes on x86_64 w/ and w/o multiarch. +Reviewed-by: H.J. Lu + +(cherry picked from commit 5bf0ab80573d66e4ae5d94b094659094336da90f) +--- + .../multiarch/memset-vec-unaligned-erms.S | 147 +++++++++++------- + 1 file changed, 91 insertions(+), 56 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 0f0636b90f..aba45e3da0 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -21,8 +21,13 @@ + 2. If size is less than VEC, use integer register stores. + 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores. + 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores. +- 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with +- 4 VEC stores and store 4 * VEC at a time until done. */ ++ 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with ++ 4 VEC stores and store 4 * VEC at a time until done. ++ 6. On machines ERMS feature, if size is range ++ [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold) ++ then REP STOSB will be used. ++ 7. If size >= __x86_shared_non_temporal_threshold, use a ++ non-temporal stores. */ + + #include + +@@ -145,6 +150,41 @@ L(entry_from_wmemset): + VMOVU %VMM(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VMM(0), (%rdi) + VZEROUPPER_RETURN ++ ++ /* If have AVX512 mask instructions put L(less_vec) close to ++ entry as it doesn't take much space and is likely a hot target. */ ++#ifdef USE_LESS_VEC_MASK_STORE ++ /* Align to ensure the L(less_vec) logic all fits in 1x cache lines. */ ++ .p2align 6,, 47 ++ .p2align 4 ++L(less_vec): ++L(less_vec_from_wmemset): ++ /* Less than 1 VEC. */ ++# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 ++# error Unsupported VEC_SIZE! ++# endif ++ /* Clear high bits from edi. Only keeping bits relevant to page ++ cross check. Note that we are using rax which is set in ++ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */ ++ andl $(PAGE_SIZE - 1), %edi ++ /* Check if VEC_SIZE store cross page. Mask stores suffer ++ serious performance degradation when it has to fault suppress. */ ++ cmpl $(PAGE_SIZE - VEC_SIZE), %edi ++ /* This is generally considered a cold target. */ ++ ja L(cross_page) ++# if VEC_SIZE > 32 ++ movq $-1, %rcx ++ bzhiq %rdx, %rcx, %rcx ++ kmovq %rcx, %k1 ++# else ++ movl $-1, %ecx ++ bzhil %edx, %ecx, %ecx ++ kmovd %ecx, %k1 ++# endif ++ vmovdqu8 %VMM(0), (%rax){%k1} ++ VZEROUPPER_RETURN ++#endif ++ + #if defined USE_MULTIARCH && IS_IN (libc) + END (MEMSET_SYMBOL (__memset, unaligned)) + +@@ -183,54 +223,6 @@ L(last_2x_vec): + #endif + VZEROUPPER_RETURN + +- /* If have AVX512 mask instructions put L(less_vec) close to +- entry as it doesn't take much space and is likely a hot target. +- */ +-#ifdef USE_LESS_VEC_MASK_STORE +- .p2align 4,, 10 +-L(less_vec): +-L(less_vec_from_wmemset): +- /* Less than 1 VEC. */ +-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +-# error Unsupported VEC_SIZE! +-# endif +- /* Clear high bits from edi. Only keeping bits relevant to page +- cross check. Note that we are using rax which is set in +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */ +- andl $(PAGE_SIZE - 1), %edi +- /* Check if VEC_SIZE store cross page. Mask stores suffer +- serious performance degradation when it has to fault suppress. +- */ +- cmpl $(PAGE_SIZE - VEC_SIZE), %edi +- /* This is generally considered a cold target. 
*/
+-	ja	L(cross_page)
+-# if VEC_SIZE > 32
+-	movq	$-1, %rcx
+-	bzhiq	%rdx, %rcx, %rcx
+-	kmovq	%rcx, %k1
+-# else
+-	movl	$-1, %ecx
+-	bzhil	%edx, %ecx, %ecx
+-	kmovd	%ecx, %k1
+-# endif
+-	vmovdqu8 %VMM(0), (%rax){%k1}
+-	VZEROUPPER_RETURN
+-
+-# if defined USE_MULTIARCH && IS_IN (libc)
+-	/* Include L(stosb_local) here if including L(less_vec) between
+-	   L(stosb_more_2x_vec) and ENTRY. This is to cache align the
+-	   L(stosb_more_2x_vec) target. */
+-	.p2align 4,, 10
+-L(stosb_local):
+-	movzbl %sil, %eax
+-	mov	%RDX_LP, %RCX_LP
+-	mov	%RDI_LP, %RDX_LP
+-	rep	stosb
+-	mov	%RDX_LP, %RAX_LP
+-	VZEROUPPER_RETURN
+-# endif
+-#endif
+-
+ #if defined USE_MULTIARCH && IS_IN (libc)
+	.p2align 4
+ L(stosb_more_2x_vec):
+@@ -316,21 +308,33 @@ L(return_vzeroupper):
+	ret
+ #endif
+
+-	.p2align 4,, 10
+-#ifndef USE_LESS_VEC_MASK_STORE
+-# if defined USE_MULTIARCH && IS_IN (libc)
++#ifdef USE_WITH_AVX2
++	.p2align 4
++#else
++	.p2align 4,, 4
++#endif
++
++#if defined USE_MULTIARCH && IS_IN (libc)
+	/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
+	   range for 2-byte jump encoding. */
+ L(stosb_local):
++	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
++	jae	L(nt_memset)
+	movzbl	%sil, %eax
+	mov	%RDX_LP, %RCX_LP
+	mov	%RDI_LP, %RDX_LP
+	rep	stosb
++# if (defined USE_WITH_SSE2) || (defined USE_WITH_AVX512)
++	/* Use xchg to save 1-byte (this helps align targets below).  */
++	xchg	%RDX_LP, %RAX_LP
++# else
+	mov	%RDX_LP, %RAX_LP
+-	VZEROUPPER_RETURN
+ # endif
++	VZEROUPPER_RETURN
++#endif
++#ifndef USE_LESS_VEC_MASK_STORE
+	/* Define L(less_vec) only if not otherwise defined.  */
+-	.p2align 4
++	.p2align 4,, 12
+ L(less_vec):
+	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+	   xmm). This is only does anything for AVX2.  */
+@@ -421,4 +425,35 @@ L(between_2_3):
+	movb	%SET_REG8, -1(%LESS_VEC_REG, %rdx)
+ #endif
+	ret
+-END (MEMSET_SYMBOL (__memset, unaligned_erms))
++
++#if defined USE_MULTIARCH && IS_IN (libc)
++# ifdef USE_WITH_AVX512
++	/* Force align so the loop doesn't cross a cache-line.  */
++	.p2align 4
++# endif
++	.p2align 4,, 7
++	/* Memset using non-temporal stores.  */
++L(nt_memset):
++	VMOVU	%VMM(0), (VEC_SIZE * 0)(%rdi)
++	leaq	(VEC_SIZE * -4)(%rdi, %rdx), %rdx
++	/* Align DST.  */
++	orq	$(VEC_SIZE * 1 - 1), %rdi
++	incq	%rdi
++	.p2align 4,, 7
++L(nt_loop):
++	VMOVNT	%VMM(0), (VEC_SIZE * 0)(%rdi)
++	VMOVNT	%VMM(0), (VEC_SIZE * 1)(%rdi)
++	VMOVNT	%VMM(0), (VEC_SIZE * 2)(%rdi)
++	VMOVNT	%VMM(0), (VEC_SIZE * 3)(%rdi)
++	subq	$(VEC_SIZE * -4), %rdi
++	cmpq	%rdx, %rdi
++	jb	L(nt_loop)
++	sfence
++	VMOVU	%VMM(0), (VEC_SIZE * 0)(%rdx)
++	VMOVU	%VMM(0), (VEC_SIZE * 1)(%rdx)
++	VMOVU	%VMM(0), (VEC_SIZE * 2)(%rdx)
++	VMOVU	%VMM(0), (VEC_SIZE * 3)(%rdx)
++	VZEROUPPER_RETURN
++#endif
++
++END(MEMSET_SYMBOL(__memset, unaligned_erms))
+-- 
+2.27.0
+
diff --git a/x86-Only-align-destination-to-1x-VEC_SIZE-in-memset-.patch b/x86-Only-align-destination-to-1x-VEC_SIZE-in-memset-.patch
new file mode 100644
index 0000000..0e870fe
--- /dev/null
+++ b/x86-Only-align-destination-to-1x-VEC_SIZE-in-memset-.patch
@@ -0,0 +1,34 @@
+From 5a64f933655384477d85122c6855dc6d84061810 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Wed, 1 Nov 2023 15:30:26 -0500
+Subject: [PATCH] x86: Only align destination to 1x VEC_SIZE in memset 4x
+ loop
+
+Current code aligns to 2x VEC_SIZE. Aligning to 2x has no effect on
+performance other than potentially resulting in an additional
+iteration of the loop. 
+
1x maintains aligned stores (the only reason to align in this case)
+and doesn't incur any unnecessary loop iterations.
+Reviewed-by: Sunil K Pandey
+
+(cherry picked from commit 9469261cf1924d350feeec64d2c80cafbbdcdd4d)
+---
+ sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 3d9ad49cb9..0f0636b90f 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -293,7 +293,7 @@ L(more_2x_vec):
+	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
+ #endif
+	/* Align dst for loop.  */
+-	andq	$(VEC_SIZE * -2), %LOOP_REG
++	andq	$(VEC_SIZE * -1), %LOOP_REG
+	.p2align 4
+ L(loop):
+	VMOVA	%VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
+-- 
+2.27.0
+
diff --git a/x86-string-Fixup-alignment-of-main-loop-in-str-n-cmp.patch b/x86-string-Fixup-alignment-of-main-loop-in-str-n-cmp.patch
new file mode 100644
index 0000000..5d5cf38
--- /dev/null
+++ b/x86-string-Fixup-alignment-of-main-loop-in-str-n-cmp.patch
@@ -0,0 +1,149 @@
+From 12fec8aae5e17cc4dc3bb079265c46ee78faeddb Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Fri, 27 Sep 2024 15:50:10 -0700
+Subject: [PATCH] x86/string: Fixup alignment of main loop in
+ str{n}cmp-evex [BZ #32212]
+
+The loop should be aligned to 32 bytes so that it can ideally run out
+of the DSB. This is particularly important on Skylake-Server, where
+deficiencies in its DSB implementation make it prone to not being
+able to run loops out of the DSB.
+
+For example, running strcmp-evex on a 200Mb string:
+
+32-byte aligned loop:
+ - 43,399,578,766 idq.dsb_uops
+not 32-byte aligned loop:
+ - 6,060,139,704 idq.dsb_uops
+
+This results in a 25% performance degradation for the non-aligned
+version.
+
+The fix is to just ensure the code layout is such that the loop is
+aligned. (Which was previously the case but was accidentally dropped
+in 84e7c46df).
+
+NB: The fix was actually 64-byte alignment. This is because 64-byte
+alignment generally produces more stable performance than 32-byte
+aligned code (cache line crosses can affect perf), so if we are going
+past 16-byte alignment, we might as well go to 64. 64-byte alignment
+also matches most other functions we over-align, so it creates a
+common point of optimization.
+
+Times are reported as the ratio of Time_With_Patch /
+Time_Without_Patch. Lower is better.
+
+The values reported are the geometric mean of the ratio across
+all tests in bench-strcmp and bench-strncmp.
+
+Note this patch is only attempting to improve the Skylake-Server
+strcmp for long strings. The rest of the numbers are only to test for
+regressions.
+
+Tigerlake Results Strings <= 512:
+ strcmp : 1.026
+ strncmp: 0.949
+
+Tigerlake Results Strings > 512:
+ strcmp : 0.994
+ strncmp: 0.998
+
+Skylake-Server Results Strings <= 512:
+ strcmp : 0.945
+ strncmp: 0.943
+
+Skylake-Server Results Strings > 512:
+ strcmp : 0.778
+ strncmp: 1.000
+
+The 2.6% regression on TGL-strcmp is due to slowdowns caused by
+changes in alignment of code handling small sizes (mostly in the
+page-cross logic). These should be safe to ignore because 1) we
+previously only 16-byte aligned the function, so this behavior is not
+new and was essentially up to chance before this patch, and 2) this
+type of alignment-related regression on small sizes really only comes
+up in tight micro-benchmark loops and is unlikely to have any effect
+on real-world performance. 
+ +Reviewed-by: H.J. Lu +(cherry picked from commit 483443d3211532903d7e790211af5a1d55fdb1f3) +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 26 +++++++++++++------------- + 1 file changed, 13 insertions(+), 13 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index ae39cdf217..6a7fec669e 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -209,7 +209,9 @@ + returned. */ + + .section SECTION(.text), "ax", @progbits +- .align 16 ++ /* Align 64 bytes here. This is to get the L(loop) block ideally ++ aligned for the DSB. */ ++ .align 64 + .type STRCMP, @function + .globl STRCMP + # ifdef USE_AS_STRCASECMP_L +@@ -509,9 +511,7 @@ L(ret4): + ret + # endif + +- /* 32 byte align here ensures the main loop is ideally aligned +- for DSB. */ +- .p2align 5 ++ .p2align 4,, 4 + L(more_3x_vec): + /* Safe to compare 4x vectors. */ + VMOVU (VEC_SIZE)(%rdi), %VMM(0) +@@ -1426,10 +1426,9 @@ L(less_32_till_page): + L(ret_zero_page_cross_slow_case0): + xorl %eax, %eax + ret +-# endif +- +- ++# else + .p2align 4,, 10 ++# endif + L(less_16_till_page): + cmpl $((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax + ja L(less_8_till_page) +@@ -1482,8 +1481,12 @@ L(less_16_till_page): + # endif + jmp L(prepare_loop_aligned) + +- +- ++# ifndef USE_AS_STRNCMP ++ /* Fits in aligning bytes. */ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++# endif + + .p2align 4,, 10 + L(less_8_till_page): +@@ -1554,6 +1557,7 @@ L(ret_less_8_wcs): + + # ifdef USE_AS_STRNCMP + .p2align 4,, 2 ++L(ret_zero_4_loop): + L(ret_zero_page_cross_slow_case1): + xorl %eax, %eax + ret +@@ -1586,10 +1590,6 @@ L(less_4_loop): + subq $-(CHAR_PER_VEC * 4), %rdx + # endif + jmp L(prepare_loop_aligned) +- +-L(ret_zero_4_loop): +- xorl %eax, %eax +- ret + L(ret_less_4_loop): + xorl %r8d, %eax + subl %r8d, %eax +-- +2.27.0 + diff --git a/x86_64-Add-expm1-with-FMA.patch b/x86_64-Add-expm1-with-FMA.patch new file mode 100644 index 0000000..eae04a5 --- /dev/null +++ b/x86_64-Add-expm1-with-FMA.patch @@ -0,0 +1,135 @@ +From b2a45f1eee39d67c1fff2d697d32857fb13c8c5d Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 11 Aug 2023 08:04:08 -0700 +Subject: [PATCH] x86_64: Add expm1 with FMA + +On Skylake, it improves expm1 bench performance by: + + Before After Improvement +max 70.204 68.054 3% +min 20.709 16.2 22% +mean 22.1221 16.7367 24% + +NB: Add + +extern long double __expm1l (long double); +extern long double __expm1f128 (long double); + +for __typeof (__expm1l) and __typeof (__expm1f128) when __expm1 is +defined since __expm1 may be expanded in their declarations which +causes the build failure. 
+ +(cherry picked from commit 1b214630ce6f7e0099b8b6f87246246739b079cf) +--- + sysdeps/ieee754/dbl-64/s_expm1.c | 7 +++++ + sysdeps/x86_64/fpu/multiarch/Makefile | 2 ++ + sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c | 10 ++++++ + sysdeps/x86_64/fpu/multiarch/s_expm1.c | 36 ++++++++++++++++++++++ + 4 files changed, 55 insertions(+) + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_expm1.c + +diff --git a/sysdeps/ieee754/dbl-64/s_expm1.c b/sysdeps/ieee754/dbl-64/s_expm1.c +index 8f1c95bd04..1cafeca9c0 100644 +--- a/sysdeps/ieee754/dbl-64/s_expm1.c ++++ b/sysdeps/ieee754/dbl-64/s_expm1.c +@@ -130,6 +130,11 @@ static const double + 4.00821782732936239552e-06, /* 3ED0CFCA 86E65239 */ + -2.01099218183624371326e-07 }; /* BE8AFDB7 6E09C32D */ + ++#ifndef SECTION ++# define SECTION ++#endif ++ ++SECTION + double + __expm1 (double x) + { +@@ -258,4 +263,6 @@ __expm1 (double x) + } + return y; + } ++#ifndef __expm1 + libm_alias_double (__expm1, expm1) ++#endif +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index f773255721..add339a876 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -37,6 +37,7 @@ libm-sysdep_routines += \ + e_log2-fma \ + e_pow-fma \ + s_atan-fma \ ++ s_expm1-fma \ + s_sin-fma \ + s_sincos-fma \ + s_tan-fma \ +@@ -49,6 +50,7 @@ CFLAGS-e_log-fma.c = -mfma -mavx2 + CFLAGS-e_log2-fma.c = -mfma -mavx2 + CFLAGS-e_pow-fma.c = -mfma -mavx2 + CFLAGS-s_atan-fma.c = -mfma -mavx2 ++CFLAGS-s_expm1-fma.c = -mfma -mavx2 + CFLAGS-s_sin-fma.c = -mfma -mavx2 + CFLAGS-s_tan-fma.c = -mfma -mavx2 + CFLAGS-s_sincos-fma.c = -mfma -mavx2 +diff --git a/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c b/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c +new file mode 100644 +index 0000000000..3ee2bd804e +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_expm1-fma.c +@@ -0,0 +1,10 @@ ++#define __expm1 __expm1_fma ++ ++/* NB: __expm1 may be expanded to __expm1_fma in the following ++ prototypes. */ ++extern long double __expm1l (long double); ++extern long double __expm1f128 (long double); ++ ++#define SECTION __attribute__ ((section (".text.fma"))) ++ ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/s_expm1.c b/sysdeps/x86_64/fpu/multiarch/s_expm1.c +new file mode 100644 +index 0000000000..2cae83fb7f +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_expm1.c +@@ -0,0 +1,36 @@ ++/* Multiple versions of expm1. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++extern double __redirect_expm1 (double); ++ ++#define SYMBOL_NAME expm1 ++#include "ifunc-fma.h" ++ ++libc_ifunc_redirected (__redirect_expm1, __expm1, IFUNC_SELECTOR ()); ++libm_alias_double (__expm1, expm1) ++ ++#define __expm1 __expm1_sse2 ++ ++/* NB: __expm1 may be expanded to __expm1_sse2 in the following ++ prototypes. */ ++extern long double __expm1l (long double); ++extern long double __expm1f128 (long double); ++ ++#include +-- +2.27.0 + diff --git a/x86_64-Add-log1p-with-FMA.patch b/x86_64-Add-log1p-with-FMA.patch new file mode 100644 index 0000000..64cb328 --- /dev/null +++ b/x86_64-Add-log1p-with-FMA.patch @@ -0,0 +1,140 @@ +From c92946d9b29956be78ca4487264848714fd5d505 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 17 Aug 2023 09:42:29 -0700 +Subject: [PATCH] x86_64: Add log1p with FMA + +On Skylake, it changes log1p bench performance by: + + Before After Improvement +max 63.349 58.347 8% +min 4.448 5.651 -30% +mean 12.0674 10.336 14% + +The minimum code path is + + if (hx < 0x3FDA827A) /* x < 0.41422 */ + { + if (__glibc_unlikely (ax >= 0x3ff00000)) /* x <= -1.0 */ + { + ... + } + if (__glibc_unlikely (ax < 0x3e200000)) /* |x| < 2**-29 */ + { + math_force_eval (two54 + x); /* raise inexact */ + if (ax < 0x3c900000) /* |x| < 2**-54 */ + { + ... + } + else + return x - x * x * 0.5; + +FMA and non-FMA code sequences look similar. Non-FMA version is slightly +faster. Since log1p is called by asinh and atanh, it improves asinh +performance by: + + Before After Improvement +max 75.645 63.135 16% +min 10.074 10.071 0% +mean 15.9483 14.9089 6% + +and improves atanh performance by: + + Before After Improvement +max 91.768 75.081 18% +min 15.548 13.883 10% +mean 18.3713 16.8011 8% + +(cherry picked from commit a8ecb126d4c26c52f4ad828c566afe4043a28155) +--- + sysdeps/ieee754/dbl-64/s_log1p.c | 5 ++++ + sysdeps/x86_64/fpu/multiarch/Makefile | 2 ++ + sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c | 4 +++ + sysdeps/x86_64/fpu/multiarch/s_log1p.c | 29 ++++++++++++++++++++++ + 4 files changed, 40 insertions(+) + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_log1p.c + +diff --git a/sysdeps/ieee754/dbl-64/s_log1p.c b/sysdeps/ieee754/dbl-64/s_log1p.c +index e6476a8260..eeb0af859f 100644 +--- a/sysdeps/ieee754/dbl-64/s_log1p.c ++++ b/sysdeps/ieee754/dbl-64/s_log1p.c +@@ -99,6 +99,11 @@ static const double + + static const double zero = 0.0; + ++#ifndef SECTION ++# define SECTION ++#endif ++ ++SECTION + double + __log1p (double x) + { +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index add339a876..ea81753b70 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -38,6 +38,7 @@ libm-sysdep_routines += \ + e_pow-fma \ + s_atan-fma \ + s_expm1-fma \ ++ s_log1p-fma \ + s_sin-fma \ + s_sincos-fma \ + s_tan-fma \ +@@ -51,6 +52,7 @@ CFLAGS-e_log2-fma.c = -mfma -mavx2 + CFLAGS-e_pow-fma.c = -mfma -mavx2 + CFLAGS-s_atan-fma.c = -mfma -mavx2 + CFLAGS-s_expm1-fma.c = -mfma -mavx2 ++CFLAGS-s_log1p-fma.c = -mfma -mavx2 + CFLAGS-s_sin-fma.c = -mfma -mavx2 + CFLAGS-s_tan-fma.c = -mfma -mavx2 + CFLAGS-s_sincos-fma.c = -mfma -mavx2 +diff --git a/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c b/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c +new file mode 100644 +index 0000000000..8952df8f9e +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_log1p-fma.c +@@ -0,0 +1,4 @@ ++#define __log1p __log1p_fma ++#define SECTION __attribute__ 
((section (".text.fma"))) ++ ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/s_log1p.c b/sysdeps/x86_64/fpu/multiarch/s_log1p.c +new file mode 100644 +index 0000000000..6ce5198d6d +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_log1p.c +@@ -0,0 +1,29 @@ ++/* Multiple versions of log1p. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++extern double __redirect_log1p (double); ++ ++#define SYMBOL_NAME log1p ++#include "ifunc-fma.h" ++ ++libc_ifunc_redirected (__redirect_log1p, __log1p, IFUNC_SELECTOR ()); ++ ++#define __log1p __log1p_sse2 ++#include +-- +2.27.0 + diff --git a/x86_64-Add-log2-with-FMA.patch b/x86_64-Add-log2-with-FMA.patch new file mode 100644 index 0000000..2439490 --- /dev/null +++ b/x86_64-Add-log2-with-FMA.patch @@ -0,0 +1,102 @@ +From 49016f2190693d5b2d4d6294d438ebae7a58d151 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 10 Aug 2023 11:24:30 -0700 +Subject: [PATCH] x86_64: Add log2 with FMA + +On Skylake, it improves log2 bench performance by: + + Before After Improvement +max 208.779 63.827 69% +min 9.977 6.55 34% +mean 10.366 6.8191 34% + +(cherry picked from commit f6b10ed8e9a00de49d0951e760cc2b5288862b47) +--- + sysdeps/x86_64/fpu/multiarch/Makefile | 2 ++ + sysdeps/x86_64/fpu/multiarch/e_log2-fma.c | 3 ++ + sysdeps/x86_64/fpu/multiarch/e_log2.c | 43 +++++++++++++++++++++++ + 3 files changed, 48 insertions(+) + create mode 100644 sysdeps/x86_64/fpu/multiarch/e_log2-fma.c + create mode 100644 sysdeps/x86_64/fpu/multiarch/e_log2.c + +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index e37e488c37..f773255721 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -34,6 +34,7 @@ libm-sysdep_routines += \ + e_atan2-fma \ + e_exp-fma \ + e_log-fma \ ++ e_log2-fma \ + e_pow-fma \ + s_atan-fma \ + s_sin-fma \ +@@ -45,6 +46,7 @@ CFLAGS-e_asin-fma.c = -mfma -mavx2 + CFLAGS-e_atan2-fma.c = -mfma -mavx2 + CFLAGS-e_exp-fma.c = -mfma -mavx2 + CFLAGS-e_log-fma.c = -mfma -mavx2 ++CFLAGS-e_log2-fma.c = -mfma -mavx2 + CFLAGS-e_pow-fma.c = -mfma -mavx2 + CFLAGS-s_atan-fma.c = -mfma -mavx2 + CFLAGS-s_sin-fma.c = -mfma -mavx2 +diff --git a/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c b/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c +new file mode 100644 +index 0000000000..9fbebc1b47 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/e_log2-fma.c +@@ -0,0 +1,3 @@ ++#define __log2 __log2_fma ++ ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/e_log2.c b/sysdeps/x86_64/fpu/multiarch/e_log2.c +new file mode 100644 +index 0000000000..c0320caf36 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/e_log2.c +@@ -0,0 +1,43 @@ ++/* Multiple versions of log2. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++ ++extern double __redirect_log2 (double); ++ ++#define SYMBOL_NAME log2 ++#include "ifunc-fma.h" ++ ++libc_ifunc_redirected (__redirect_log2, __log2, IFUNC_SELECTOR ()); ++ ++#ifdef SHARED ++__hidden_ver1 (__log2, __GI___log2, __redirect_log2) ++ __attribute__ ((visibility ("hidden"))); ++ ++versioned_symbol (libm, __ieee754_log2, log2, GLIBC_2_29); ++libm_alias_double_other (__log2, log2) ++#else ++libm_alias_double (__log2, log2) ++#endif ++ ++strong_alias (__log2, __ieee754_log2) ++libm_alias_finite (__log2, __log2) ++ ++#define __log2 __log2_sse2 ++#include +-- +2.27.0 + diff --git a/x86_64-Fix-missing-wcsncat-function-definition-witho.patch b/x86_64-Fix-missing-wcsncat-function-definition-witho.patch new file mode 100644 index 0000000..a960466 --- /dev/null +++ b/x86_64-Fix-missing-wcsncat-function-definition-witho.patch @@ -0,0 +1,44 @@ +From dc1762113dbe40be832bedd41b52d9822d62c50f Mon Sep 17 00:00:00 2001 +From: Gabi Falk +Date: Tue, 7 May 2024 18:25:00 +0000 +Subject: [PATCH] x86_64: Fix missing wcsncat function definition without + multiarch (x86-64-v4) + +This code expects the WCSCAT preprocessor macro to be predefined in case +the evex implementation of the function should be defined with a name +different from __wcsncat_evex. However, when glibc is built for +x86-64-v4 without multiarch support, sysdeps/x86_64/wcsncat.S defines +WCSNCAT variable instead of WCSCAT to build it as wcsncat. Rename the +variable to WCSNCAT, as it is actually a better naming choice for the +variable in this case. + +Reported-by: Kenton Groombridge +Link: https://bugs.gentoo.org/921945 +Fixes: 64b8b6516b ("x86: Add evex optimized functions for the wchar_t strcpy family") +Signed-off-by: Gabi Falk +Reviewed-by: Sunil K Pandey +(cherry picked from commit dd5f891c1ad9f1b43b9db93afe2a55cbb7a6194e) +--- + sysdeps/x86_64/multiarch/wcsncat-evex.S | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/wcsncat-evex.S b/sysdeps/x86_64/multiarch/wcsncat-evex.S +index 392215950a..10bfb0a531 100644 +--- a/sysdeps/x86_64/multiarch/wcsncat-evex.S ++++ b/sysdeps/x86_64/multiarch/wcsncat-evex.S +@@ -1,9 +1,9 @@ +-#ifndef WCSCAT +-# define WCSCAT __wcsncat_evex ++#ifndef WCSNCAT ++# define WCSNCAT __wcsncat_evex + #endif + + #define USE_AS_WCSCPY + #define USE_AS_STRCAT + +-#define STRNCAT WCSCAT ++#define STRNCAT WCSNCAT + #include "strncat-evex.S" +-- +2.27.0 + diff --git a/x86_64-Sort-fpu-multiarch-Makefile.patch b/x86_64-Sort-fpu-multiarch-Makefile.patch new file mode 100644 index 0000000..08ca62f --- /dev/null +++ b/x86_64-Sort-fpu-multiarch-Makefile.patch @@ -0,0 +1,144 @@ +From 5c9be512ee25ceab92a284adc75fe22bbd94b179 Mon Sep 17 00:00:00 2001 +From: "H.J. 
Lu" +Date: Wed, 9 Aug 2023 11:08:52 -0700 +Subject: [PATCH] x86_64: Sort fpu/multiarch/Makefile + +Sort Makefile variables using scripts/sort-makefile-lines.py. + +No code generation changes observed in libm. No regressions on x86_64. + +(cherry picked from commit 881546979d0219c18337e1b4f4d00cfacab13c40) +--- + sysdeps/x86_64/fpu/multiarch/Makefile | 94 +++++++++++++++++++++------ + 1 file changed, 74 insertions(+), 20 deletions(-) + +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index 248162525b..e37e488c37 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -1,17 +1,45 @@ + ifeq ($(subdir),math) +-libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \ +- s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \ +- s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c ++libm-sysdep_routines += \ ++ s_ceil-c \ ++ s_ceilf-c \ ++ s_floor-c \ ++ s_floorf-c \ ++ s_rint-c \ ++ s_rintf-c \ ++ s_nearbyint-c \ ++ s_nearbyintf-c \ ++ s_roundeven-c \ ++ s_roundevenf-c \ ++ s_trunc-c \ ++ s_truncf-c \ ++# libm-sysdep_routines + +-libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \ +- s_floorf-sse4_1 s_nearbyint-sse4_1 \ +- s_nearbyintf-sse4_1 s_roundeven-sse4_1 \ +- s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \ +- s_trunc-sse4_1 s_truncf-sse4_1 ++libm-sysdep_routines += \ ++ s_ceil-sse4_1 \ ++ s_ceilf-sse4_1 \ ++ s_floor-sse4_1 \ ++ s_floorf-sse4_1 \ ++ s_nearbyint-sse4_1 \ ++ s_nearbyintf-sse4_1 \ ++ s_roundeven-sse4_1 \ ++ s_roundevenf-sse4_1 \ ++ s_rint-sse4_1 \ ++ s_rintf-sse4_1 \ ++ s_trunc-sse4_1 \ ++ s_truncf-sse4_1 \ ++# libm-sysdep_routines + +-libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \ +- e_asin-fma e_atan2-fma s_sin-fma s_tan-fma \ +- s_sincos-fma ++libm-sysdep_routines += \ ++ e_asin-fma \ ++ e_atan2-fma \ ++ e_exp-fma \ ++ e_log-fma \ ++ e_pow-fma \ ++ s_atan-fma \ ++ s_sin-fma \ ++ s_sincos-fma \ ++ s_tan-fma \ ++# libm-sysdep_routines + + CFLAGS-e_asin-fma.c = -mfma -mavx2 + CFLAGS-e_atan2-fma.c = -mfma -mavx2 +@@ -23,10 +51,22 @@ CFLAGS-s_sin-fma.c = -mfma -mavx2 + CFLAGS-s_tan-fma.c = -mfma -mavx2 + CFLAGS-s_sincos-fma.c = -mfma -mavx2 + +-libm-sysdep_routines += s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2 ++libm-sysdep_routines += \ ++ s_cosf-sse2 \ ++ s_sincosf-sse2 \ ++ s_sinf-sse2 \ ++# libm-sysdep_routines + +-libm-sysdep_routines += e_exp2f-fma e_expf-fma e_log2f-fma e_logf-fma \ +- e_powf-fma s_sinf-fma s_cosf-fma s_sincosf-fma ++libm-sysdep_routines += \ ++ e_exp2f-fma \ ++ e_expf-fma \ ++ e_log2f-fma \ ++ e_logf-fma \ ++ e_powf-fma \ ++ s_cosf-fma \ ++ s_sincosf-fma \ ++ s_sinf-fma \ ++# libm-sysdep_routines + + CFLAGS-e_exp2f-fma.c = -mfma -mavx2 + CFLAGS-e_expf-fma.c = -mfma -mavx2 +@@ -37,9 +77,17 @@ CFLAGS-s_sinf-fma.c = -mfma -mavx2 + CFLAGS-s_cosf-fma.c = -mfma -mavx2 + CFLAGS-s_sincosf-fma.c = -mfma -mavx2 + +-libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \ +- e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \ +- s_sincos-fma4 ++libm-sysdep_routines += \ ++ e_exp-fma4 \ ++ e_log-fma4 \ ++ e_pow-fma4 \ ++ e_asin-fma4 \ ++ s_atan-fma4 \ ++ e_atan2-fma4 \ ++ s_sin-fma4 \ ++ s_sincos-fma4 \ ++ s_tan-fma4 \ ++# libm-sysdep_routines + + CFLAGS-e_asin-fma4.c = -mfma4 + CFLAGS-e_atan2-fma4.c = -mfma4 +@@ -51,9 +99,15 @@ CFLAGS-s_sin-fma4.c = -mfma4 + CFLAGS-s_tan-fma4.c = -mfma4 + CFLAGS-s_sincos-fma4.c = -mfma4 + +-libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \ +- e_atan2-avx s_sin-avx s_tan-avx \ +- 
s_sincos-avx ++libm-sysdep_routines += \ ++ e_exp-avx \ ++ e_log-avx \ ++ s_atan-avx \ ++ e_atan2-avx \ ++ s_sin-avx \ ++ s_sincos-avx \ ++ s_tan-avx \ ++# libm-sysdep_routines + + CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX + CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX +-- +2.27.0 + -- Gitee
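
A note on the CPUID leaf 2 change ("x86: Check the lower byte of EAX of
CPUID leaf 2"): the new check reduces to a few lines of C. The sketch
below is a standalone illustration using GCC's <cpuid.h>, not the
glibc-internal code; the puts () calls stand in for the real descriptor
decoding and for the leaf 4 fallback done via intel_check_word.

  /* cpuid2_check.c - hypothetical standalone demo; build with gcc.  */
  #include <cpuid.h>
  #include <stdio.h>

  int
  main (void)
  {
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid (2, &eax, &ebx, &ecx, &edx))
      return 1;		/* CPUID leaf 2 not supported.  */

    /* The low byte of EAX encodes a legacy iteration count and should
       always be 1.  If it is not, the descriptor bytes cannot be
       trusted, and the deterministic cache parameters from leaf 4 are
       the reliable source; that is what the patch falls back to.  */
    if ((eax & 0xff) != 1)
      puts ("fall back to CPUID leaf 4");
    else
      puts ("decode the leaf 2 descriptor bytes");
    return 0;
  }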
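The strategy behind "x86: Improve large memset perf with non-temporal
stores" is easier to see in intrinsics than in the assembly diff. Below
is a minimal C sketch, assuming AVX2, n >= 64, and compilation with
-mavx2; memset_nt is a hypothetical name, and the real code keeps
`rep stosb` for sizes below __x86_shared_non_temporal_threshold.

  #include <immintrin.h>
  #include <stddef.h>
  #include <stdint.h>

  void
  memset_nt (void *dst, int c, size_t n)	/* assumes n >= 64 */
  {
    __m256i v = _mm256_set1_epi8 ((char) c);
    char *p = (char *) dst;
    char *end = p + n;

    /* Unaligned head with a regular store, then round up to a 32-byte
       boundary, as the patch's 4x loop rounds up to 1x VEC_SIZE.  */
    _mm256_storeu_si256 ((__m256i *) p, v);
    p = (char *) (((uintptr_t) p + 32) & ~(uintptr_t) 31);

    /* Aligned body with streaming stores: no RFO read of the old cache
       line, roughly halving memory traffic for large sizes.  */
    while (p + 32 <= end)
      {
        _mm256_stream_si256 ((__m256i *) p, v);
        p += 32;
      }

    /* Non-temporal stores are weakly ordered; fence before the
       normal-store tail, as the patch does before its VMOVU tail.  */
    _mm_sfence ();
    _mm256_storeu_si256 ((__m256i *) (end - 32), v);
  }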
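The "x86/string: Fixup alignment of main loop in str{n}cmp-evex" change
is pure code layout; the .align/.p2align directives in the diff do all
the work. When compiling C rather than hand-written assembly, the
closest equivalents are GCC's alignment flags, for example (hot_loop.c
is a hypothetical file, not part of glibc):

  gcc -O2 -falign-loops=32 -falign-functions=64 -c hot_loop.c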
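The three FMA backports (expm1, log1p, log2) share one dispatch
pattern: an ifunc resolver selects the -mfma -mavx2 build of a routine
at load time when the CPU supports it, and otherwise the SSE2 build.
Inside glibc this is spelled with libc_ifunc_redirected and the
IFUNC_SELECTOR () from ifunc-fma.h; outside glibc, the same pattern can
be sketched with GCC's ifunc attribute. Everything below (my_expm1, the
resolver, the stand-in bodies that just call libm's expm1) is a
hypothetical illustration, not glibc code; link with -lm.

  #include <math.h>

  double my_expm1_fma (double) __attribute__ ((target ("fma,avx2")));
  double my_expm1_sse2 (double);

  /* Stand-in bodies; the real variants are separate compilations of
     sysdeps/ieee754/dbl-64/s_expm1.c with different CFLAGS.  */
  double my_expm1_fma (double x) { return expm1 (x); }
  double my_expm1_sse2 (double x) { return expm1 (x); }

  /* The resolver runs once, at relocation time, like the glibc
     selector; later calls go straight to the chosen variant.  */
  static double (*resolve_my_expm1 (void)) (double)
  {
    __builtin_cpu_init ();
    if (__builtin_cpu_supports ("fma") && __builtin_cpu_supports ("avx2"))
      return my_expm1_fma;
    return my_expm1_sse2;
  }

  double my_expm1 (double) __attribute__ ((ifunc ("resolve_my_expm1")));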