From 910c9e3e36557af913251a7ac2ae4e0a3f189750 Mon Sep 17 00:00:00 2001
From: Xie jiamei
Date: Wed, 18 Jun 2025 12:57:41 +0800
Subject: [PATCH] [Feature] Add support for Hygon processors

Signed-off-by: Xie jiamei
---
 ...chitecture-type-for-Hygon-processors.patch | 69 +++++
 ...formation-support-for-Hygon-processo.patch | 97 +++++++
 ...ix-Zen3-Zen4-ERMS-selection-BZ-30994.patch | 152 ++++++++++
 ...rate-non-temporal-tunable-for-memset.patch | 215 ++++++++++++++
 ...-non-temporal-memset-tunable-for-AMD.patch | 47 ++++
 ...r-x86_memset_non_temporal_threshold-.patch | 41 +++
 ...on-temporal-memset-on-Skylake-Server.patch | 262 ++++++++++++++++++
 ...n_Temporal_Memset-to-control-non-tem.patch | 95 +++++++
 ...OSB-tunable-to-allow-NT-memset-witho.patch | 221 +++++++++++++++
 ...temporal-memset-for-Hygon-processors.patch | 92 ++++++
 ...e-memset-perf-with-non-temporal-stor.patch | 248 +++++++++++++++++
 glibc.spec | 27 +-
 12 files changed, 1560 insertions(+), 6 deletions(-)
 create mode 100644 0001-x86-Add-new-architecture-type-for-Hygon-processors.patch
 create mode 100644 0002-x86-Add-cache-information-support-for-Hygon-processo.patch
 create mode 100644 0003-x86-Fix-Zen3-Zen4-ERMS-selection-BZ-30994.patch
 create mode 100644 0004-x86-Add-seperate-non-temporal-tunable-for-memset.patch
 create mode 100644 0005-x86-Enable-non-temporal-memset-tunable-for-AMD.patch
 create mode 100644 0006-x86-Fix-value-for-x86_memset_non_temporal_threshold-.patch
 create mode 100644 0007-x86-Disable-non-temporal-memset-on-Skylake-Server.patch
 create mode 100644 0008-x86-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch
 create mode 100644 0009-x86-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch
 create mode 100644 0010-x86-Enable-non-temporal-memset-for-Hygon-processors.patch
 create mode 100644 0011-x86-Improve-large-memset-perf-with-non-temporal-stor.patch

diff --git a/0001-x86-Add-new-architecture-type-for-Hygon-processors.patch b/0001-x86-Add-new-architecture-type-for-Hygon-processors.patch
new file mode 100644
index 0000000..45165a0
--- /dev/null
+++ b/0001-x86-Add-new-architecture-type-for-Hygon-processors.patch
@@ -0,0 +1,69 @@
+From 5e1c0ca3aacae059f1971162d5a9f586265e72d3 Mon Sep 17 00:00:00 2001
+From: Feifei Wang
+Date: Mon, 19 Aug 2024 14:57:53 +0800
+Subject: [PATCH 01/11] x86: Add new architecture type for Hygon processors
+
+Add a new architecture type arch_kind_hygon to split the Hygon branch
+from AMD. This makes it easier for Hygon processors to use settings
+suited to their own characteristics.
+
+Signed-off-by: Feifei Wang
+Reviewed-by: Jing Li
+Reviewed-by: H.J. Lu
+---
+ sysdeps/x86/cpu-features.c | 19 ++++++++++++++++---
+ sysdeps/x86/include/cpu-features.h | 1 +
+ 2 files changed, 17 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index f752ebd24d..c4dd85145e 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -851,9 +851,8 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
+ cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
+ |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
+ }
+- /* This spells out "AuthenticAMD" or "HygonGenuine". */
+- else if ((ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
+- || (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e))
++ /* This spells out "AuthenticAMD". */
++ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
+ {
+ unsigned int extended_model;
+
+@@ -963,6 +962,20 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
+ }
+ }
+ }
++ /* This spells out "HygonGenuine". */
++ else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)
++ {
++ unsigned int extended_model;
++
++ kind = arch_kind_hygon;
++
++ get_common_indices (cpu_features, &family, &model, &extended_model,
++ &stepping);
++
++ get_extended_indices (cpu_features);
++
++ update_active (cpu_features);
++ }
+ else
+ {
+ kind = arch_kind_other;
+diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
+index eb30d342a6..594feeb2f4 100644
+--- a/sysdeps/x86/include/cpu-features.h
++++ b/sysdeps/x86/include/cpu-features.h
+@@ -856,6 +856,7 @@ enum cpu_features_kind
+ arch_kind_intel,
+ arch_kind_amd,
+ arch_kind_zhaoxin,
++ arch_kind_hygon,
+ arch_kind_other
+ };
+
+--
+2.17.1
+
diff --git a/0002-x86-Add-cache-information-support-for-Hygon-processo.patch b/0002-x86-Add-cache-information-support-for-Hygon-processo.patch
new file mode 100644
index 0000000..69affcf
--- /dev/null
+++ b/0002-x86-Add-cache-information-support-for-Hygon-processo.patch
@@ -0,0 +1,97 @@
+From 8a14035ba9574c26b6d504fda99e630a8bcaf5c7 Mon Sep 17 00:00:00 2001
+From: Feifei Wang
+Date: Mon, 19 Aug 2024 14:57:54 +0800
+Subject: [PATCH 02/11] x86: Add cache information support for Hygon processors
+
+Add a Hygon branch to the dl_init_cacheinfo function to initialize
+cache size variables for Hygon processors. In addition, add a
+handle_hygon() function to get cache information.
+
+Signed-off-by: Feifei Wang
+Reviewed-by: Jing Li
+Reviewed-by: H.J. Lu
+---
+ sysdeps/x86/dl-cacheinfo.h | 60 ++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 60 insertions(+)
+
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index 2c5b6d6980..8f141e7634 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
++++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -406,6 +406,48 @@ handle_zhaoxin (int name)
+ return 0;
+ }
+
++static long int __attribute__ ((noinline))
++handle_hygon (int name)
++{
++ unsigned int eax;
++ unsigned int ebx;
++ unsigned int ecx;
++ unsigned int edx;
++ unsigned int count = 0x1;
++
++ if (name >= _SC_LEVEL3_CACHE_SIZE)
++ count = 0x3;
++ else if (name >= _SC_LEVEL2_CACHE_SIZE)
++ count = 0x2;
++ else if (name >= _SC_LEVEL1_DCACHE_SIZE)
++ count = 0x0;
++
++ /* Use __cpuid__ '0x8000_001D' to compute cache details. */
++ __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
++
++ switch (name)
++ {
++ case _SC_LEVEL1_ICACHE_ASSOC:
++ case _SC_LEVEL1_DCACHE_ASSOC:
++ case _SC_LEVEL2_CACHE_ASSOC:
++ case _SC_LEVEL3_CACHE_ASSOC:
++ return ((ebx >> 22) & 0x3ff) + 1;
++ case _SC_LEVEL1_ICACHE_LINESIZE:
++ case _SC_LEVEL1_DCACHE_LINESIZE:
++ case _SC_LEVEL2_CACHE_LINESIZE:
++ case _SC_LEVEL3_CACHE_LINESIZE:
++ return (ebx & 0xfff) + 1;
++ case _SC_LEVEL1_ICACHE_SIZE:
++ case _SC_LEVEL1_DCACHE_SIZE:
++ case _SC_LEVEL2_CACHE_SIZE:
++ case _SC_LEVEL3_CACHE_SIZE:
++ return (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1);
++ default:
++ __builtin_unreachable ();
++ }
++ return -1;
++}
++
+ static void
+ get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
+ long int core)
+@@ -724,6 +766,24 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ if (shared_per_thread <= 0)
+ shared_per_thread = shared;
+ }
++ else if (cpu_features->basic.kind == arch_kind_hygon)
++ {
++ data = handle_hygon (_SC_LEVEL1_DCACHE_SIZE);
++ shared = handle_hygon (_SC_LEVEL3_CACHE_SIZE);
++ shared_per_thread = shared;
++
++ level1_icache_size = handle_hygon (_SC_LEVEL1_ICACHE_SIZE);
++ level1_icache_linesize = handle_hygon (_SC_LEVEL1_ICACHE_LINESIZE);
++ level1_dcache_size = data;
++ level1_dcache_assoc = handle_hygon (_SC_LEVEL1_DCACHE_ASSOC);
++ level1_dcache_linesize = handle_hygon (_SC_LEVEL1_DCACHE_LINESIZE);
++ level2_cache_size = handle_hygon (_SC_LEVEL2_CACHE_SIZE);
++ level2_cache_assoc = handle_hygon (_SC_LEVEL2_CACHE_ASSOC);
++ level2_cache_linesize = handle_hygon (_SC_LEVEL2_CACHE_LINESIZE);
++ level3_cache_size = shared;
++ level3_cache_assoc = handle_hygon (_SC_LEVEL3_CACHE_ASSOC);
++ level3_cache_linesize = handle_hygon (_SC_LEVEL3_CACHE_LINESIZE);
++ }
+
+ cpu_features->level1_icache_size = level1_icache_size;
+ cpu_features->level1_icache_linesize = level1_icache_linesize;
+--
+2.17.1
+
diff --git a/0003-x86-Fix-Zen3-Zen4-ERMS-selection-BZ-30994.patch b/0003-x86-Fix-Zen3-Zen4-ERMS-selection-BZ-30994.patch
new file mode 100644
index 0000000..7ef1413
--- /dev/null
+++ b/0003-x86-Fix-Zen3-Zen4-ERMS-selection-BZ-30994.patch
@@ -0,0 +1,152 @@
+From 9aae95da1d78018c1961c60ee80c95192131020c Mon Sep 17 00:00:00 2001
+From: Adhemerval Zanella
+Date: Thu, 8 Feb 2024 10:08:38 -0300
+Subject: [PATCH 03/11] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
+
+The REP MOVSB usage on memcpy/memmove does not show much performance
+improvement on Zen3/Zen4 cores compared to the vectorized loops. Also,
+as reported in BZ 30994, if the source is aligned and the destination
+is not, the performance can be 20x slower.
+
+The performance difference is noticeable with small buffer sizes, closer
+to the lower bound limits where memcpy/memmove starts to use ERMS. The
+performance of REP MOVSB is similar to the vectorized loops at the
+size limit (the L2 cache). Also, there is no drawback to multiple cores
+sharing the cache.
+
+Checked on x86_64-linux-gnu on Zen3.
+Reviewed-by: H.J. Lu
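+
+As a rough sketch (illustrative pseudocode only, with made-up helper
+names; the real logic lives in sysdeps/x86/dl-cacheinfo.h and the
+memmove dispatch code), the size-based selection after this change
+behaves as:
+
+  if (size >= rep_movsb_threshold && size < rep_movsb_stop_threshold)
+    /* REP MOVSB window.  On AMD Zen3+ rep_movsb_threshold is now
+       raised to non_temporal_threshold, so this window closes and the
+       vectorized loops are used instead.  */
+    rep_movsb (dst, src, size);
+  else if (size >= rep_movsb_stop_threshold)
+    memmove_non_temporal (dst, src, size);
+  else
+    memmove_vectorized (dst, src, size);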
+---
+ sysdeps/x86/dl-cacheinfo.h | 45 +++++++++++++++++++++-----------------
+ 1 file changed, 25 insertions(+), 20 deletions(-)
+
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index 8f141e7634..a8ad6cbefa 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
++++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -673,7 +673,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ long int data = -1;
+ long int shared = -1;
+ long int shared_per_thread = -1;
+- long int core = -1;
+ unsigned int threads = 0;
+ unsigned long int level1_icache_size = -1;
+ unsigned long int level1_icache_linesize = -1;
+@@ -691,7 +690,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ if (cpu_features->basic.kind == arch_kind_intel)
+ {
+ data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
+- core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
+ shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
+ shared_per_thread = shared;
+
+@@ -704,7 +702,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
+ level1_dcache_linesize
+ = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
+- level2_cache_size = core;
++ level2_cache_size
++ = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
+ level2_cache_assoc
+ = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
+ level2_cache_linesize
+@@ -717,12 +716,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ level4_cache_size
+ = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
+
+- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
++ get_common_cache_info (&shared, &shared_per_thread, &threads,
++ level2_cache_size);
+ }
+ else if (cpu_features->basic.kind == arch_kind_zhaoxin)
+ {
+ data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
+- core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
+ shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
+ shared_per_thread = shared;
+
+@@ -731,19 +730,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ level1_dcache_size = data;
+ level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
+ level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
+- level2_cache_size = core;
++ level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
+ level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
+ level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
+ level3_cache_size = shared;
+ level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
+ level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
+
+- get_common_cache_info (&shared, &shared_per_thread, &threads, core);
++ get_common_cache_info (&shared, &shared_per_thread, &threads,
++ level2_cache_size);
+ }
+ else if (cpu_features->basic.kind == arch_kind_amd)
+ {
+ data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
+- core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
+ shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
+ shared_per_thread = shared;
+
+@@ -752,7 +751,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ level1_dcache_size = data;
+ level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
+ level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
+- level2_cache_size = core;
++ level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);
+ level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
+ level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
+ level3_cache_size = shared;
+@@ -760,8 +759,15 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);
+
+ if (shared <= 0)
+- /* No shared L3 cache. All we have is the L2 cache. */
+- shared = core;
++ {
++ /* No shared L3 cache. All we have is the L2 cache. */
++ shared = level2_cache_size;
++ }
++ else if (cpu_features->basic.family < 0x17)
++ {
++ /* Account for exclusive L2 and L3 caches. */
++ shared += level2_cache_size;
++ }
+
+ if (shared_per_thread <= 0)
+ shared_per_thread = shared;
+@@ -883,6 +889,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
+ rep_movsb_threshold = 2112;
+
++ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
++ cases slower than the vectorized path (and for some alignments,
++ it is really slow, check BZ #30994). */
++ if (cpu_features->basic.kind == arch_kind_amd)
++ rep_movsb_threshold = non_temporal_threshold;
++
+ /* The default threshold to use Enhanced REP STOSB. */
+ unsigned long int rep_stosb_threshold = 2048;
+
+@@ -924,16 +936,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ SIZE_MAX);
+
+ unsigned long int rep_movsb_stop_threshold;
+- /* ERMS feature is implemented from AMD Zen3 architecture and it is
+- performing poorly for data above L2 cache size. Henceforth, adding
+- an upper bound threshold parameter to limit the usage of Enhanced
+- REP MOVSB operations and setting its value to L2 cache size. */
+- if (cpu_features->basic.kind == arch_kind_amd)
+- rep_movsb_stop_threshold = core;
+ /* Setting the upper bound of ERMS to the computed value of
+- non-temporal threshold for architectures other than AMD. */
+- else
+- rep_movsb_stop_threshold = non_temporal_threshold;
++ non-temporal threshold for all architectures. */
++ rep_movsb_stop_threshold = non_temporal_threshold;
+
+ cpu_features->data_cache_size = data;
+ cpu_features->shared_cache_size = shared;
+--
+2.17.1
+
diff --git a/0004-x86-Add-seperate-non-temporal-tunable-for-memset.patch b/0004-x86-Add-seperate-non-temporal-tunable-for-memset.patch
new file mode 100644
index 0000000..b5ebe58
--- /dev/null
+++ b/0004-x86-Add-seperate-non-temporal-tunable-for-memset.patch
@@ -0,0 +1,215 @@
+From 57ce020adf1acf50d67f4693de5c3e786ce195ec Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Fri, 24 May 2024 12:38:51 -0500
+Subject: [PATCH 04/11] x86: Add seperate non-temporal tunable for memset
+
+The tuning for non-temporal stores for memset vs memcpy is not always
+the same. This includes both the exact value and whether non-temporal
+stores are profitable at all for a given arch.
+
+This patch adds `x86_memset_non_temporal_threshold`. Currently we
+disable non-temporal stores for non-Intel vendors as the only
+benchmarks showing its benefit have been on Intel hardware.
+Reviewed-by: H.J. Lu
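+
+Once in place, the new tunable can be set like any other glibc tunable
+through the GLIBC_TUNABLES environment variable. For example (the value
+and program name here are purely illustrative, not a recommendation):
+
+  GLIBC_TUNABLES=glibc.cpu.x86_memset_non_temporal_threshold=16777216 \
+    ./my-app
+
+sets the threshold to 16 MiB, so only memsets of at least that size
+take the non-temporal path.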
+---
+ manual/tunables.texi | 16 +++++++++++++++-
+ sysdeps/x86/cacheinfo.h | 8 +++++++-
+ sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++
+ sysdeps/x86/dl-diagnostics-cpu.c | 2 ++
+ sysdeps/x86/dl-tunables.list | 3 +++
+ sysdeps/x86/include/cpu-features.h | 4 +++-
+ .../x86_64/multiarch/memset-vec-unaligned-erms.S | 11 +++++++++--
+ 7 files changed, 55 insertions(+), 5 deletions(-)
+
+diff --git a/manual/tunables.texi b/manual/tunables.texi
+index bdd3bacb2a..eaef1604c7 100644
+--- a/manual/tunables.texi
++++ b/manual/tunables.texi
+@@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
+ glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
+ glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
+ glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
++glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
+ glibc.cpu.x86_shstk:
+ glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
+ glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
+@@ -486,7 +487,8 @@ thread stack originally backup by Huge Pages to default pages.
+ @cindex shared_cache_size tunables
+ @cindex tunables, shared_cache_size
+ @cindex non_temporal_threshold tunables
+-@cindex tunables, non_temporal_threshold
++@cindex memset_non_temporal_threshold tunables
++@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold
+
+ @deftp {Tunable namespace} glibc.cpu
+ Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
+@@ -562,6 +564,18 @@ like memmove and memcpy.
+ This tunable is specific to i386 and x86-64.
+ @end deftp
+
++@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold
++The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows
++the user to set the threshold in bytes for non temporal stores in
++memset. Non temporal stores give a hint to the hardware to move data
++directly to memory without displacing other data from the cache. This
++tunable is used by some platforms to determine when to use non
++temporal stores in memset.
++
++This tunable is specific to i386 and x86-64.
++@end deftp
++
++
+ @deftp Tunable glibc.cpu.x86_rep_movsb_threshold
+ The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
+ set threshold in bytes to start using "rep movsb". The value must be
+diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
+index ec1bc142c4..fd2b2ae66b 100644
+--- a/sysdeps/x86/cacheinfo.h
++++ b/sysdeps/x86/cacheinfo.h
+@@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024;
+ long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
+ long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
+
+-/* Threshold to use non temporal store. */
++/* Threshold to use non temporal store in memmove. */
+ long int __x86_shared_non_temporal_threshold attribute_hidden;
+
++/* Threshold to use non temporal store in memset. */
++long int __x86_memset_non_temporal_threshold attribute_hidden;
++
+ /* Threshold to use Enhanced REP MOVSB. 
*/ + long int __x86_rep_movsb_threshold attribute_hidden = 2048; + +@@ -77,6 +80,9 @@ init_cacheinfo (void) + __x86_shared_non_temporal_threshold + = cpu_features->non_temporal_threshold; + ++ __x86_memset_non_temporal_threshold ++ = cpu_features->memset_non_temporal_threshold; ++ + __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold; + __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold; + __x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold; +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index a8ad6cbefa..cbcc154e24 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -889,6 +889,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) + rep_movsb_threshold = 2112; + ++ /* Non-temporal stores in memset have only been tested on Intel hardware. ++ Until we benchmark data on other x86 processor, disable non-temporal ++ stores in memset. */ ++ unsigned long int memset_non_temporal_threshold = SIZE_MAX; ++ if (cpu_features->basic.kind == arch_kind_intel) ++ memset_non_temporal_threshold = non_temporal_threshold; ++ + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of + cases slower than the vectorized path (and for some alignments, + it is really slow, check BZ #30994). */ +@@ -915,6 +922,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + && tunable_size <= maximum_non_temporal_threshold) + non_temporal_threshold = tunable_size; + ++ tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); ++ if (tunable_size > minimum_non_temporal_threshold ++ && tunable_size <= maximum_non_temporal_threshold) ++ memset_non_temporal_threshold = tunable_size; ++ + tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL); + if (tunable_size > minimum_rep_movsb_threshold) + rep_movsb_threshold = tunable_size; +@@ -930,6 +942,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, + minimum_non_temporal_threshold, + maximum_non_temporal_threshold); ++ TUNABLE_SET_WITH_BOUNDS ( ++ x86_memset_non_temporal_threshold, memset_non_temporal_threshold, ++ minimum_non_temporal_threshold, maximum_non_temporal_threshold); + TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, + minimum_rep_movsb_threshold, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, +@@ -943,6 +958,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + cpu_features->data_cache_size = data; + cpu_features->shared_cache_size = shared; + cpu_features->non_temporal_threshold = non_temporal_threshold; ++ cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold; + cpu_features->rep_movsb_threshold = rep_movsb_threshold; + cpu_features->rep_stosb_threshold = rep_stosb_threshold; + cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold; +diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c +index 5aab63e532..05d54b5eba 100644 +--- a/sysdeps/x86/dl-diagnostics-cpu.c ++++ b/sysdeps/x86/dl-diagnostics-cpu.c +@@ -83,6 +83,8 @@ _dl_diagnostics_cpu (void) + cpu_features->shared_cache_size); + print_cpu_features_value ("non_temporal_threshold", + cpu_features->non_temporal_threshold); ++ print_cpu_features_value ("memset_non_temporal_threshold", ++ cpu_features->memset_non_temporal_threshold); + print_cpu_features_value ("rep_movsb_threshold", + 
cpu_features->rep_movsb_threshold);
+ print_cpu_features_value ("rep_movsb_stop_threshold",
+diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
+index feb7004036..f334a2ad6a 100644
+--- a/sysdeps/x86/dl-tunables.list
++++ b/sysdeps/x86/dl-tunables.list
+@@ -30,6 +30,9 @@ glibc {
+ x86_non_temporal_threshold {
+ type: SIZE_T
+ }
++ x86_memset_non_temporal_threshold {
++ type: SIZE_T
++ }
+ x86_rep_movsb_threshold {
+ type: SIZE_T
+ # Since there is overhead to set up REP MOVSB operation, REP
+diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
+index 594feeb2f4..e2d641dcd0 100644
+--- a/sysdeps/x86/include/cpu-features.h
++++ b/sysdeps/x86/include/cpu-features.h
+@@ -918,8 +918,10 @@ struct cpu_features
+ /* Shared cache size for use in memory and string routines, typically
+ L2 or L3 size. */
+ unsigned long int shared_cache_size;
+- /* Threshold to use non temporal store. */
++ /* Threshold to use non temporal store in memmove. */
+ unsigned long int non_temporal_threshold;
++ /* Threshold to use non temporal store in memset. */
++ unsigned long int memset_non_temporal_threshold;
+ /* Threshold to use "rep movsb". */
+ unsigned long int rep_movsb_threshold;
+ /* Threshold to stop using "rep movsb". */
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 3d9ad49cb9..eb9cbf0da9 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -21,8 +21,13 @@
+ 2. If size is less than VEC, use integer register stores.
+ 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
+ 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
+- 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
+- 4 VEC stores and store 4 * VEC at a time until done. */
++ 5. If size is more than 4 * VEC_SIZE, align to 1 * VEC_SIZE with
++ 4 VEC stores and store 4 * VEC at a time until done.
++ 6. On machines with the ERMS feature, if size is in range
++ [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
++ then REP STOSB will be used.
++ 7. If size >= __x86_memset_non_temporal_threshold, use
++ non-temporal stores. */
+
+ #include 
+
+@@ -322,6 +327,8 @@ L(return_vzeroupper):
+ /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
+ range for 2-byte jump encoding. */
+L(stosb_local):
++ cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP
++ jae L(nt_memset)
+ movzbl %sil, %eax
+ mov %RDX_LP, %RCX_LP
+ mov %RDI_LP, %RDX_LP
+--
+2.17.1
+
diff --git a/0005-x86-Enable-non-temporal-memset-tunable-for-AMD.patch b/0005-x86-Enable-non-temporal-memset-tunable-for-AMD.patch
new file mode 100644
index 0000000..6ab5ad9
--- /dev/null
+++ b/0005-x86-Enable-non-temporal-memset-tunable-for-AMD.patch
@@ -0,0 +1,47 @@
+From fb7889b5c34b85a9ac9b50c252b6cd6f81c8630b Mon Sep 17 00:00:00 2001
+From: Joe Damato
+Date: Fri, 7 Jun 2024 23:04:47 +0000
+Subject: [PATCH 05/11] x86: Enable non-temporal memset tunable for AMD
+
+In commit 46b5e98ef6f1 ("x86: Add seperate non-temporal tunable for
+memset") a tunable threshold for enabling non-temporal memset was added,
+but only for Intel hardware.
+
+Since that commit, new benchmark results suggest that non-temporal
+memset is beneficial on AMD as well, so allow this tunable to be set
+for AMD.
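+
+With this change in place, the effective threshold on a given machine
+can be inspected through the dynamic loader's diagnostics output, which
+the earlier patch in this series extended (the loader path below is the
+usual x86_64 one and may differ per system):
+
+  /lib64/ld-linux-x86-64.so.2 --list-diagnostics \
+    | grep memset_non_temporal_threshold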
+
+See:
+https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
+which has been updated to include data using different strategies for
+large memset on AMD Zen2, Zen3, and Zen4.
+
+Signed-off-by: Joe Damato
+Reviewed-by: Noah Goldstein
+---
+ sysdeps/x86/dl-cacheinfo.h | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index cbcc154e24..362e9bc24e 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
++++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -889,11 +889,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
+ rep_movsb_threshold = 2112;
+
+- /* Non-temporal stores in memset have only been tested on Intel hardware.
+- Until we benchmark data on other x86 processor, disable non-temporal
+- stores in memset. */
++ /* Non-temporal stores are more performant on Intel and AMD hardware above
++ non_temporal_threshold. Enable this for both Intel and AMD hardware. */
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+- if (cpu_features->basic.kind == arch_kind_intel)
++ if (cpu_features->basic.kind == arch_kind_intel
++ || cpu_features->basic.kind == arch_kind_amd)
+ memset_non_temporal_threshold = non_temporal_threshold;
+
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+--
+2.17.1
+
diff --git a/0006-x86-Fix-value-for-x86_memset_non_temporal_threshold-.patch b/0006-x86-Fix-value-for-x86_memset_non_temporal_threshold-.patch
new file mode 100644
index 0000000..aa9ce9b
--- /dev/null
+++ b/0006-x86-Fix-value-for-x86_memset_non_temporal_threshold-.patch
@@ -0,0 +1,41 @@
+From 00cbe2f60e51acb8230032e71e2910492f132ed5 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Fri, 14 Jun 2024 13:01:58 -0500
+Subject: [PATCH 06/11] x86: Fix value for `x86_memset_non_temporal_threshold`
+ when it is undesirable
+
+When we don't want to use non-temporal stores for memset, we set
+`x86_memset_non_temporal_threshold` to SIZE_MAX.
+
+The current code, however, was using `maximum_non_temporal_threshold`
+as the upper bound, which is `SIZE_MAX >> 4`, so we ended up with a
+value of `0`.
+
+The fix is to just use `SIZE_MAX` as the upper bound when setting the
+tunable.
+Tested-by: Borislav Petkov (AMD)
+Reviewed-by: H.J. Lu
+---
+ sysdeps/x86/dl-cacheinfo.h | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index 362e9bc24e..8e2fd8799f 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
++++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -942,9 +942,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
+ minimum_non_temporal_threshold,
+ maximum_non_temporal_threshold);
+- TUNABLE_SET_WITH_BOUNDS (
+- x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
+- minimum_non_temporal_threshold, maximum_non_temporal_threshold);
++ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold,
++ memset_non_temporal_threshold,
++ minimum_non_temporal_threshold, SIZE_MAX);
+ TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
+ minimum_rep_movsb_threshold, SIZE_MAX);
+ TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
+--
+2.17.1
+
diff --git a/0007-x86-Disable-non-temporal-memset-on-Skylake-Server.patch b/0007-x86-Disable-non-temporal-memset-on-Skylake-Server.patch
new file mode 100644
index 0000000..f3220f4
--- /dev/null
+++ b/0007-x86-Disable-non-temporal-memset-on-Skylake-Server.patch
@@ -0,0 +1,262 @@
+From 5431ab7ca77eb3b89e18ad6174230e83e15494fd Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Mon, 15 Jul 2024 16:19:17 +0800
+Subject: [PATCH 07/11] x86: Disable non-temporal memset on Skylake Server
+
+The original commit enabling non-temporal memset on Skylake Server had
+erroneous benchmarks (actually done on ICX).
+
+Further benchmarks indicate non-temporal stores may in fact be a
+regression on Skylake Server.
+
+This commit may be over-cautious in some cases, but should avoid any
+regressions for 2.40.
+
+Tested using qemu on all x86_64 cpu archs supported by both qemu and
+glibc.
+
+Reviewed-by: DJ Delorie
+Reviewed-by: H.J. Lu
+---
+ sysdeps/x86/cpu-features.c | 13 +-
+ sysdeps/x86/cpu-tunables.c | 5 +
+ sysdeps/x86/dl-cacheinfo.h | 15 +-
+ ...cpu-features-preferred_feature_index_1.def | 1 +
+ sysdeps/x86/tst-hwcap-tunables.c | 148 ++++++++++++++++++
+ 5 files changed, 172 insertions(+), 10 deletions(-)
+ create mode 100644 sysdeps/x86/tst-hwcap-tunables.c
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index c4dd85145e..b4030776a7 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -754,11 +754,18 @@ init_cpu_features (struct cpu_features *cpu_features)
+
+ /* Newer Bigcore microarch (larger non-temporal store
+ threshold). */
+- case INTEL_BIGCORE_SKYLAKE:
+- case INTEL_BIGCORE_KABYLAKE:
+- case INTEL_BIGCORE_COMETLAKE:
+ case INTEL_BIGCORE_SKYLAKE_AVX512:
+ case INTEL_BIGCORE_CANNONLAKE:
++ /* Benchmarks indicate non-temporal memset is not
++ necessarily profitable on SKX (and in some cases much
++ worse). This is likely unique to SKX due to its unique
++ mesh interconnect (not present on ICX or BWD). Disable
++ non-temporal on all Skylake servers. 
*/ ++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] ++ |= bit_arch_Avoid_Non_Temporal_Memset; ++ case INTEL_BIGCORE_COMETLAKE: ++ case INTEL_BIGCORE_SKYLAKE: ++ case INTEL_BIGCORE_KABYLAKE: + case INTEL_BIGCORE_ICELAKE: + case INTEL_BIGCORE_TIGERLAKE: + case INTEL_BIGCORE_ROCKETLAKE: +diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c +index 0d4f328585..4b62d697a2 100644 +--- a/sysdeps/x86/cpu-tunables.c ++++ b/sysdeps/x86/cpu-tunables.c +@@ -272,6 +272,11 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + disable, 24); + } + break; ++ case 25: ++ { ++ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, ++ Avoid_Non_Temporal_Memset, 25); ++ } + case 26: + { + CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index 8e2fd8799f..13923e4f1e 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -892,13 +892,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + /* Non-temporal stores are more performant on Intel and AMD hardware above + non_temporal_threshold. Enable this for both Intel and AMD hardware. */ + unsigned long int memset_non_temporal_threshold = SIZE_MAX; +- if (cpu_features->basic.kind == arch_kind_intel +- || cpu_features->basic.kind == arch_kind_amd) +- memset_non_temporal_threshold = non_temporal_threshold; +- +- /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of +- cases slower than the vectorized path (and for some alignments, +- it is really slow, check BZ #30994). */ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset) ++ && (cpu_features->basic.kind == arch_kind_intel ++ || cpu_features->basic.kind == arch_kind_amd)) ++ memset_non_temporal_threshold = non_temporal_threshold; ++ ++ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of ++ cases slower than the vectorized path (and for some alignments, ++ it is really slow, check BZ #30994). */ + if (cpu_features->basic.kind == arch_kind_amd) + rep_movsb_threshold = non_temporal_threshold; + +diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +index d20c5b3196..aae1c85551 100644 +--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def ++++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +@@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512) + BIT (MathVec_Prefer_No_AVX512) + BIT (Prefer_FSRM) + BIT (Avoid_Short_Distance_REP_MOVSB) ++BIT (Avoid_Non_Temporal_Memset) +diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c +new file mode 100644 +index 0000000000..94307283d7 +--- /dev/null ++++ b/sysdeps/x86/tst-hwcap-tunables.c +@@ -0,0 +1,148 @@ ++/* Tests for x86 GLIBC_TUNABLES=glibc.cpu.hwcaps filter. ++ Copyright (C) 2023-2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. 
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library; if not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++
++/* Nonzero if the program gets called via `exec'. */
++#define CMDLINE_OPTIONS \
++ { "restart", no_argument, &restart, 1 },
++static int restart;
++
++/* Disable everything. */
++static const char *test_1[] =
++{
++ "__memcpy_avx512_no_vzeroupper",
++ "__memcpy_avx512_unaligned",
++ "__memcpy_avx512_unaligned_erms",
++ "__memcpy_evex_unaligned",
++ "__memcpy_evex_unaligned_erms",
++ "__memcpy_avx_unaligned",
++ "__memcpy_avx_unaligned_erms",
++ "__memcpy_avx_unaligned_rtm",
++ "__memcpy_avx_unaligned_erms_rtm",
++ "__memcpy_ssse3",
++};
++
++static const struct test_t
++{
++ const char *env;
++ const char *const *funcs;
++ size_t nfuncs;
++} tests[] =
++{
++ {
++ /* Disable everything. */
++ "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
++ "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
++ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
++ test_1,
++ array_length (test_1)
++ },
++ {
++ /* Same as before, but with some empty suboptions. */
++ ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
++ "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
++ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
++ test_1,
++ array_length (test_1)
++ }
++};
++
++/* Called on process re-execution. */
++_Noreturn static void
++handle_restart (int ntest)
++{
++ struct libc_ifunc_impl impls[32];
++ int cnt = __libc_ifunc_impl_list ("memcpy", impls, array_length (impls));
++ if (cnt == 0)
++ _exit (EXIT_SUCCESS);
++ TEST_VERIFY_EXIT (cnt >= 1);
++ for (int i = 0; i < cnt; i++)
++ {
++ for (int f = 0; f < tests[ntest].nfuncs; f++)
++ {
++ if (strcmp (impls[i].name, tests[ntest].funcs[f]) == 0)
++ TEST_COMPARE (impls[i].usable, false);
++ }
++ }
++
++ _exit (EXIT_SUCCESS);
++}
++
++static int
++do_test (int argc, char *argv[])
++{
++ /* We must have either:
++ - One or four parameters left if called initially:
++ + path to ld.so optional
++ + "--library-path" optional
++ + the library path optional
++ + the application name
++ + the test to check */
++
++ TEST_VERIFY_EXIT (argc == 2 || argc == 5);
++
++ if (restart)
++ handle_restart (atoi (argv[1]));
++
++ char nteststr[INT_BUFSIZE_BOUND (int)];
++
++ char *spargv[10];
++ {
++ int i = 0;
++ for (; i < argc - 1; i++)
++ spargv[i] = argv[i + 1];
++ spargv[i++] = (char *) "--direct";
++ spargv[i++] = (char *) "--restart";
++ spargv[i++] = nteststr;
++ spargv[i] = NULL;
++ }
++
++ for (int i = 0; i < array_length (tests); i++)
++ {
++ snprintf (nteststr, sizeof nteststr, "%d", i);
++
++ printf ("[%d] Spawned test for %s\n", i, tests[i].env);
++ char *tunable = xasprintf ("glibc.cpu.hwcaps=%s", tests[i].env);
++ setenv ("GLIBC_TUNABLES", tunable, 1);
++
++ struct support_capture_subprocess result
++ = support_capture_subprogram (spargv[0], spargv, NULL);
++ support_capture_subprocess_check (&result, "tst-tunables", 0,
++ sc_allow_stderr);
++ support_capture_subprocess_free (&result);
++
++ free (tunable);
++ }
++
++ return 0;
++}
++
++#define TEST_FUNCTION_ARGV do_test
++#include 
+--
+2.17.1
+
diff --git a/0008-x86-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch b/0008-x86-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch
new file mode 100644
index 0000000..92cdb5c
--- /dev/null
+++ b/0008-x86-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch
@@ -0,0 +1,95 @@
+From c2a035a04585ec554490e6500ab04011df8c883d Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Wed, 14 Aug 2024 14:37:30 +0800
+Subject: [PATCH 08/11] x86: Use `Avoid_Non_Temporal_Memset` to control
+ non-temporal path
+
+This is just a refactor and there should be no behavioral change from
+this commit.
+
+The goal is to make `Avoid_Non_Temporal_Memset` a more universal knob
+for controlling whether we use non-temporal memset rather than having
+extra logic based on vendor.
+Reviewed-by: H.J. Lu
+---
+ sysdeps/x86/cpu-features.c | 16 ++++++++++++++++
+ sysdeps/x86/dl-cacheinfo.h | 15 +++++++--------
+ 2 files changed, 23 insertions(+), 8 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index b4030776a7..c9f2297524 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -640,6 +640,12 @@ init_cpu_features (struct cpu_features *cpu_features)
+ unsigned int stepping = 0;
+ enum cpu_features_kind kind;
+
++ /* Default is to avoid non-temporal memset for non Intel/AMD hardware. This is
++ because, as of writing this, we only have benchmarks indicating its
++ profitability on Intel/AMD. */
++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
++ |= bit_arch_Avoid_Non_Temporal_Memset;
++
+ cpu_features->cachesize_non_temporal_divisor = 4;
+ #if !HAS_CPUID
+ if (__get_cpuid_max (0, 0) == 0)
+@@ -665,6 +671,11 @@ init_cpu_features (struct cpu_features *cpu_features)
+
+ update_active (cpu_features);
+
++ /* Benchmarks indicate non-temporal memset can be profitable on Intel
++ hardware. */
++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
++ &= ~bit_arch_Avoid_Non_Temporal_Memset;
++
+ if (family == 0x06)
+ {
+ model += extended_model;
+@@ -874,6 +885,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
+
+ ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx;
+
++ /* Benchmarks indicate non-temporal memset can be profitable on AMD
++ hardware. */
++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
++ &= ~bit_arch_Avoid_Non_Temporal_Memset;
++
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
+ {
+ /* Since the FMA4 bit is in CPUID_INDEX_80000001 and
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index 13923e4f1e..d1ea14b2c7 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
++++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -889,14 +889,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
+ rep_movsb_threshold = 2112;
+
+- /* Non-temporal stores are more performant on Intel and AMD hardware above
+- non_temporal_threshold. Enable this for both Intel and AMD hardware. */
+- unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+- if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
+- && (cpu_features->basic.kind == arch_kind_intel
+- || cpu_features->basic.kind == arch_kind_amd))
+- memset_non_temporal_threshold = non_temporal_threshold;
+-
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+ cases slower than the vectorized path (and for some alignments,
+ it is really slow, check BZ #30994). */
+ if (cpu_features->basic.kind == arch_kind_amd)
+ rep_movsb_threshold = non_temporal_threshold;
+
+@@ -918,6 +910,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+ if (tunable_size != 0)
+ shared = tunable_size;
+
++ /* Non-temporal stores are more performant on some hardware above
++ non_temporal_threshold. Currently Prefer_Non_Temporal is set for both
++ Intel and AMD hardware. 
*/
++ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
++ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
++ memset_non_temporal_threshold = non_temporal_threshold;
++
+ tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
+ if (tunable_size > minimum_non_temporal_threshold
+ && tunable_size <= maximum_non_temporal_threshold)
+--
+2.17.1
+
diff --git a/0009-x86-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch b/0009-x86-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch
new file mode 100644
index 0000000..4bd4e39
--- /dev/null
+++ b/0009-x86-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch
@@ -0,0 +1,221 @@
+From f42d4cd6e73a9881b1323456dd8698caebec9a9a Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Wed, 14 Aug 2024 14:37:31 +0800
+Subject: [PATCH 09/11] x86: Add `Avoid_STOSB` tunable to allow NT memset
+ without ERMS
+
+The goal of this flag is to allow targets which don't prefer/have ERMS
+to still access the non-temporal memset implementation.
+
+There are 4 cases for tuning memset:
+ 1) `Avoid_STOSB && Avoid_Non_Temporal_Memset`
+ - Memset with temporal stores
+ 2) `Avoid_STOSB && !Avoid_Non_Temporal_Memset`
+ - Memset with temporal/non-temporal stores. Non-temporal path
+ goes through `rep stosb` path. We accomplish this by setting
+ `x86_rep_stosb_threshold` to
+ `x86_memset_non_temporal_threshold`.
+ 3) `!Avoid_STOSB && Avoid_Non_Temporal_Memset`
+ - Memset with temporal stores/`rep stosb`
+ 4) `!Avoid_STOSB && !Avoid_Non_Temporal_Memset`
+ - Memset with temporal stores/`rep stosb`/non-temporal stores.
+Reviewed-by: H.J. Lu
+---
+ sysdeps/x86/cpu-features.c | 4 +++
+ sysdeps/x86/cpu-tunables.c | 6 +++-
+ sysdeps/x86/dl-cacheinfo.h | 34 ++++++++++++++++---
+ ...cpu-features-preferred_feature_index_1.def | 1 +
+ sysdeps/x86/tst-hwcap-tunables.c | 6 ++--
+ sysdeps/x86_64/multiarch/ifunc-memset.h | 18 +++++++---
+ 6 files changed, 56 insertions(+), 13 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index c9f2297524..287edc5b08 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -1014,6 +1014,10 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
+ if (CPU_FEATURES_CPU_P (cpu_features, CMOV))
+ cpu_features->preferred[index_arch_I686] |= bit_arch_I686;
+
++ /* No ERMS, we want to avoid stosb for memset. 
*/ ++ if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ cpu_features->preferred[index_arch_Avoid_STOSB] |= bit_arch_Avoid_STOSB; ++ + #if !HAS_CPUID + no_cpuid: + #endif +diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c +index 4b62d697a2..e8bf8ea731 100644 +--- a/sysdeps/x86/cpu-tunables.c ++++ b/sysdeps/x86/cpu-tunables.c +@@ -214,6 +214,10 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, + Prefer_FSRM, + disable, 11); ++ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, ++ Avoid_STOSB, ++ disable, 11); ++ + CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH (n, cpu_features, + Slow_SSE4_2, + SSE4_2, +@@ -275,7 +279,7 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + case 25: + { + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, +- Avoid_Non_Temporal_Memset, 25); ++ Avoid_Non_Temporal_Memset, disable, 25); + } + case 26: + { +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index d1ea14b2c7..efe1bb7e4a 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -937,18 +937,42 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold, + long int, NULL); + ++ /* ++ For memset, the non-temporal implementation is only accessed through the ++ stosb code. ie: ++ ``` ++ if (size >= rep_stosb_thresh) ++ { ++ if (size >= non_temporal_thresh) ++ { ++ do_non_temporal (); ++ } ++ do_stosb (); ++ } ++ do_normal_vec_loop (); ++ ``` ++ So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh` ++ to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`, ++ `rep stosb` will never be used. ++ */ ++ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, ++ memset_non_temporal_threshold, ++ minimum_non_temporal_threshold, SIZE_MAX); ++ /* Do `rep_stosb_thresh = non_temporal_thresh` after setting/getting the ++ final value of `x86_memset_non_temporal_threshold`. In some cases this can ++ be a matter of correctness. 
*/ ++ if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_STOSB)) ++ rep_stosb_threshold ++ = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); ++ TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, ++ SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, + minimum_non_temporal_threshold, + maximum_non_temporal_threshold); +- TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, +- memset_non_temporal_threshold, +- minimum_non_temporal_threshold, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, + minimum_rep_movsb_threshold, SIZE_MAX); +- TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, +- SIZE_MAX); + + unsigned long int rep_movsb_stop_threshold; + /* Setting the upper bound of ERMS to the computed value of +diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +index aae1c85551..38a0c9226c 100644 +--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def ++++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +@@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512) + BIT (Prefer_FSRM) + BIT (Avoid_Short_Distance_REP_MOVSB) + BIT (Avoid_Non_Temporal_Memset) ++BIT (Avoid_STOSB) +diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c +index 94307283d7..1920f5057e 100644 +--- a/sysdeps/x86/tst-hwcap-tunables.c ++++ b/sysdeps/x86/tst-hwcap-tunables.c +@@ -60,7 +60,8 @@ static const struct test_t + /* Disable everything. */ + "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," + "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," +- "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", ++ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," ++ "-Avoid_STOSB", + test_1, + array_length (test_1) + }, +@@ -68,7 +69,8 @@ static const struct test_t + /* Same as before, but with some empty suboptions. 
*/ + ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," + "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," +- "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", ++ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," ++ "-Avoid_STOSB,-,", + test_1, + array_length (test_1) + } +diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h +index 5c5096ec5a..6b3b9a17a2 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h +@@ -46,6 +46,13 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) + attribute_hidden; + ++static inline int ++prefer_erms_nt_impl (const struct cpu_features *cpu_features) ++{ ++ return CPU_FEATURE_USABLE_P (cpu_features, ERMS) ++ || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset); ++} ++ + static inline void * + IFUNC_SELECTOR (void) + { +@@ -61,7 +68,7 @@ IFUNC_SELECTOR (void) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ if (prefer_erms_nt_impl (cpu_features)) + return OPTIMIZE (avx512_unaligned_erms); + + return OPTIMIZE (avx512_unaligned); +@@ -76,7 +83,7 @@ IFUNC_SELECTOR (void) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ if (prefer_erms_nt_impl (cpu_features)) + return OPTIMIZE (evex_unaligned_erms); + + return OPTIMIZE (evex_unaligned); +@@ -84,7 +91,7 @@ IFUNC_SELECTOR (void) + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ if (prefer_erms_nt_impl (cpu_features)) + return OPTIMIZE (avx2_unaligned_erms_rtm); + + return OPTIMIZE (avx2_unaligned_rtm); +@@ -93,14 +100,15 @@ IFUNC_SELECTOR (void) + if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, + Prefer_No_VZEROUPPER, !)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ if (prefer_erms_nt_impl (cpu_features)) + return OPTIMIZE (avx2_unaligned_erms); + + return OPTIMIZE (avx2_unaligned); + } + } + +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) ++ || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)) + return OPTIMIZE (sse2_unaligned_erms); + + return OPTIMIZE (sse2_unaligned); +-- +2.17.1 + diff --git a/0010-x86-Enable-non-temporal-memset-for-Hygon-processors.patch b/0010-x86-Enable-non-temporal-memset-for-Hygon-processors.patch new file mode 100644 index 0000000..3f39f06 --- /dev/null +++ b/0010-x86-Enable-non-temporal-memset-for-Hygon-processors.patch @@ -0,0 +1,92 @@ +From ddfc6cc086ce3fd4b76c5fc981eba96290d1501c Mon Sep 17 00:00:00 2001 +From: Feifei Wang +Date: Mon, 19 Aug 2024 14:57:55 +0800 +Subject: [PATCH 10/11] x86: Enable non-temporal memset for Hygon processors + +This patch uses 'Avoid_Non_Temporal_Memset' flag to access +the non-temporal memset implementation for hygon processors. 
+
+Test Results:
+
+hygon1 arch
+x86_memset_non_temporal_threshold = 8MB
+size new performance time / old performance time
+1MB 0.994
+4MB 0.996
+8MB 0.670
+16MB 0.343
+32MB 0.355
+
+hygon2 arch
+x86_memset_non_temporal_threshold = 8MB
+size new performance time / old performance time
+1MB 1
+4MB 1
+8MB 1.312
+16MB 0.822
+32MB 0.830
+
+hygon3 arch
+x86_memset_non_temporal_threshold = 8MB
+size new performance time / old performance time
+1MB 1
+4MB 0.990
+8MB 0.737
+16MB 0.390
+32MB 0.401
+
+For the Hygon arch with this patch, non-temporal stores can improve
+performance by 20% - 65%.
+
+Signed-off-by: Feifei Wang
+Reviewed-by: Jing Li
+Reviewed-by: H.J. Lu
+---
+ sysdeps/x86/cpu-features.c | 9 +++++++--
+ sysdeps/x86/dl-cacheinfo.h | 2 +-
+ 2 files changed, 8 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 287edc5b08..f5539aea6f 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -640,9 +640,9 @@ init_cpu_features (struct cpu_features *cpu_features)
+ unsigned int stepping = 0;
+ enum cpu_features_kind kind;
+
+- /* Default is to avoid non-temporal memset for non Intel/AMD hardware. This is
++ /* Default is to avoid non-temporal memset for non Intel/AMD/Hygon hardware. This is
+ because, as of writing this, we only have benchmarks indicating its
+- profitability on Intel/AMD. */
++ profitability on Intel/AMD/Hygon. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ |= bit_arch_Avoid_Non_Temporal_Memset;
+
+@@ -998,6 +998,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
+ get_extended_indices (cpu_features);
+
+ update_active (cpu_features);
++
++ /* Benchmarks indicate non-temporal memset can be profitable on Hygon
++ hardware. */
++ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
++ &= ~bit_arch_Avoid_Non_Temporal_Memset;
+ }
+ else
+ {
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index efe1bb7e4a..2a837eb24b 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
++++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -912,7 +912,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+
+ /* Non-temporal stores are more performant on some hardware above
+ non_temporal_threshold. Currently Prefer_Non_Temporal is set for both
+- Intel and AMD hardware. */
++ Intel, AMD and Hygon hardware. */
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
+ memset_non_temporal_threshold = non_temporal_threshold;
+--
+2.17.1
+
diff --git a/0011-x86-Improve-large-memset-perf-with-non-temporal-stor.patch b/0011-x86-Improve-large-memset-perf-with-non-temporal-stor.patch
new file mode 100644
index 0000000..091c73a
--- /dev/null
+++ b/0011-x86-Improve-large-memset-perf-with-non-temporal-stor.patch
@@ -0,0 +1,248 @@
+From 708e5218e65602e8a3495b908e86cfbb50f65126 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein
+Date: Fri, 24 May 2024 12:38:50 -0500
+Subject: [PATCH 11/11] x86: Improve large memset perf with non-temporal stores
+ [RHEL-29312]
+
+Previously we used `rep stosb` for all medium/large memsets. This is
+notably worse than non-temporal stores for large (above a
+few MBs) memsets.
+See:
+https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
+for data using different strategies for large memset on ICX and SKX.
+
+Using non-temporal stores can be up to 3x faster on ICX and 2x faster
+on SKX.
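+
+The non-temporal strategy itself is a streaming-store loop. As a
+minimal C sketch of the idea (SSE2 intrinsics standing in for the
+hand-written assembly this patch actually adds; head/tail alignment
+handling omitted):
+
+  #include <emmintrin.h>
+  #include <stddef.h>
+
+  static void
+  memset_nt_sketch (void *dst, int c, size_t n)
+  {
+    /* Assumes dst is 16-byte aligned and n is a multiple of 16; the
+       real code covers the unaligned head/tail with regular stores.  */
+    __m128i v = _mm_set1_epi8 ((char) c);
+    for (size_t i = 0; i < n; i += 16)
+      _mm_stream_si128 ((__m128i *) ((char *) dst + i), v);
+    _mm_sfence ();  /* Order the weakly-ordered streaming stores.  */
+  }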
+Historically, these numbers would not have been so good
+because of the zero-over-zero writeback optimization that `rep stosb`
+is able to do. But, the zero-over-zero writeback optimization has been
+removed as a potential side-channel attack, so there is no longer any
+good reason to only rely on `rep stosb` for large memsets. On the flip
+side, non-temporal writes can avoid the data fetch in their RFO
+requests, saving memory bandwidth.
+
+All of the other changes to the file are to re-organize the
+code-blocks to maintain "good" alignment given the new code added in
+the `L(stosb_local)` case.
+
+The results from running the GLIBC memset benchmarks on TGL-client for
+N=20 runs:
+
+Geometric Mean across the suite New / Old EVEX256: 0.979
+Geometric Mean across the suite New / Old EVEX512: 0.979
+Geometric Mean across the suite New / Old AVX2 : 0.986
+Geometric Mean across the suite New / Old SSE2 : 0.979
+
+Most of the cases are essentially unchanged; this is mostly to show
+that adding the non-temporal case didn't add any regressions to the
+other cases.
+
+The results on the memset-large benchmark suite on TGL-client for N=20
+runs:
+
+Geometric Mean across the suite New / Old EVEX256: 0.926
+Geometric Mean across the suite New / Old EVEX512: 0.925
+Geometric Mean across the suite New / Old AVX2 : 0.928
+Geometric Mean across the suite New / Old SSE2 : 0.924
+
+So roughly a 7.5% speedup. This is lower than what we see on servers
+(likely because clients typically have faster single-core bandwidth so
+saving bandwidth on RFOs is less impactful), but still advantageous.
+
+Full test-suite passes on x86_64 w/ and w/o multiarch.
+Reviewed-by: H.J. Lu
+---
+ .../multiarch/memset-vec-unaligned-erms.S | 142 +++++++++++-------
+ 1 file changed, 85 insertions(+), 57 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index eb9cbf0da9..f08c8646b1 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -24,10 +24,10 @@
+ 5. If size is more than 4 * VEC_SIZE, align to 1 * VEC_SIZE with
+ 4 VEC stores and store 4 * VEC at a time until done.
+ 6. On machines with the ERMS feature, if size is in range
+- [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
+- then REP STOSB will be used.
++ [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
++ then REP STOSB will be used.
+ 7. If size >= __x86_memset_non_temporal_threshold, use
+- non-temporal stores. */
++ non-temporal stores. */
+
+ #include 
+
+@@ -150,6 +150,41 @@ L(entry_from_wmemset):
+ VMOVU %VMM(0), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VMM(0), (%rdi)
+ VZEROUPPER_RETURN
++
++ /* If have AVX512 mask instructions put L(less_vec) close to
++ entry as it doesn't take much space and is likely a hot target. */
++#ifdef USE_LESS_VEC_MASK_STORE
++ /* Align to ensure the L(less_vec) logic all fits in 1x cache lines. */
++ .p2align 6,, 47
++ .p2align 4
++L(less_vec):
++L(less_vec_from_wmemset):
++ /* Less than 1 VEC. */
++# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
++# error Unsupported VEC_SIZE!
++# endif
++ /* Clear high bits from edi. Only keeping bits relevant to page
++ cross check. Note that we are using rax which is set in
++ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
++ andl $(PAGE_SIZE - 1), %edi
++ /* Check if VEC_SIZE store cross page. Mask stores suffer
++ serious performance degradation when it has to fault suppress. 
++ cmpl $(PAGE_SIZE - VEC_SIZE), %edi
++ /* This is generally considered a cold target. */
++ ja L(cross_page)
++# if VEC_SIZE > 32
++ movq $-1, %rcx
++ bzhiq %rdx, %rcx, %rcx
++ kmovq %rcx, %k1
++# else
++ movl $-1, %ecx
++ bzhil %edx, %ecx, %ecx
++ kmovd %ecx, %k1
++# endif
++ vmovdqu8 %VMM(0), (%rax){%k1}
++ VZEROUPPER_RETURN
++#endif
++
+ #if defined USE_MULTIARCH && IS_IN (libc)
+ END (MEMSET_SYMBOL (__memset, unaligned))
+
+@@ -188,54 +223,6 @@ L(last_2x_vec):
+ #endif
+ VZEROUPPER_RETURN
+
+- /* If have AVX512 mask instructions put L(less_vec) close to
+- entry as it doesn't take much space and is likely a hot target.
+- */
+-#ifdef USE_LESS_VEC_MASK_STORE
+- .p2align 4,, 10
+-L(less_vec):
+-L(less_vec_from_wmemset):
+- /* Less than 1 VEC. */
+-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+-# error Unsupported VEC_SIZE!
+-# endif
+- /* Clear high bits from edi. Only keeping bits relevant to page
+- cross check. Note that we are using rax which is set in
+- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
+- andl $(PAGE_SIZE - 1), %edi
+- /* Check if VEC_SIZE store cross page. Mask stores suffer
+- serious performance degradation when it has to fault suppress.
+- */
+- cmpl $(PAGE_SIZE - VEC_SIZE), %edi
+- /* This is generally considered a cold target. */
+- ja L(cross_page)
+-# if VEC_SIZE > 32
+- movq $-1, %rcx
+- bzhiq %rdx, %rcx, %rcx
+- kmovq %rcx, %k1
+-# else
+- movl $-1, %ecx
+- bzhil %edx, %ecx, %ecx
+- kmovd %ecx, %k1
+-# endif
+- vmovdqu8 %VMM(0), (%rax){%k1}
+- VZEROUPPER_RETURN
+-
+-# if defined USE_MULTIARCH && IS_IN (libc)
+- /* Include L(stosb_local) here if including L(less_vec) between
+- L(stosb_more_2x_vec) and ENTRY. This is to cache align the
+- L(stosb_more_2x_vec) target. */
+- .p2align 4,, 10
+-L(stosb_local):
+- movzbl %sil, %eax
+- mov %RDX_LP, %RCX_LP
+- mov %RDI_LP, %RDX_LP
+- rep stosb
+- mov %RDX_LP, %RAX_LP
+- VZEROUPPER_RETURN
+-# endif
+-#endif
+-
+ #if defined USE_MULTIARCH && IS_IN (libc)
+ .p2align 4
+ L(stosb_more_2x_vec):
+@@ -321,9 +308,13 @@ L(return_vzeroupper):
+ ret
+ #endif
+
+- .p2align 4,, 10
+-#ifndef USE_LESS_VEC_MASK_STORE
+-# if defined USE_MULTIARCH && IS_IN (libc)
++#ifdef USE_WITH_AVX2
++ .p2align 4
++#else
++ .p2align 4,, 4
++#endif
++
++#if defined USE_MULTIARCH && IS_IN (libc)
+ /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
+ range for 2-byte jump encoding. */
+ L(stosb_local):
+@@ -333,11 +324,17 @@ L(stosb_local):
+ mov %RDX_LP, %RCX_LP
+ mov %RDI_LP, %RDX_LP
+ rep stosb
++# if (defined USE_WITH_SSE2) || (defined USE_WITH_AVX512)
++ /* Use xchg to save 1-byte (this helps align targets below). */
++ xchg %RDX_LP, %RAX_LP
++# else
+ mov %RDX_LP, %RAX_LP
+- VZEROUPPER_RETURN
+ # endif
++ VZEROUPPER_RETURN
++#endif
++#ifndef USE_LESS_VEC_MASK_STORE
+ /* Define L(less_vec) only if not otherwise defined. */
+- .p2align 4
++ .p2align 4,, 12
+ L(less_vec):
+ /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+ xmm). This is only does anything for AVX2. */
+@@ -428,4 +425,35 @@ L(between_2_3):
+ movb %SET_REG8, -1(%LESS_VEC_REG, %rdx)
+ #endif
+ ret
+-END (MEMSET_SYMBOL (__memset, unaligned_erms))
++
++#if defined USE_MULTIARCH && IS_IN (libc)
++# ifdef USE_WITH_AVX512
++ /* Force align so the loop doesn't cross a cache-line. */
++ .p2align 4
++# endif
++ .p2align 4,, 7
++ /* Memset using non-temporal stores. */
++L(nt_memset):
++ VMOVU %VMM(0), (VEC_SIZE * 0)(%rdi)
++ leaq (VEC_SIZE * -4)(%rdi, %rdx), %rdx
++ /* Align DST. */
++ orq $(VEC_SIZE * 1 - 1), %rdi
++ incq %rdi
++ .p2align 4,, 7
++L(nt_loop):
++ VMOVNT %VMM(0), (VEC_SIZE * 0)(%rdi)
++ VMOVNT %VMM(0), (VEC_SIZE * 1)(%rdi)
++ VMOVNT %VMM(0), (VEC_SIZE * 2)(%rdi)
++ VMOVNT %VMM(0), (VEC_SIZE * 3)(%rdi)
++ subq $(VEC_SIZE * -4), %rdi
++ cmpq %rdx, %rdi
++ jb L(nt_loop)
++ sfence
++ VMOVU %VMM(0), (VEC_SIZE * 0)(%rdx)
++ VMOVU %VMM(0), (VEC_SIZE * 1)(%rdx)
++ VMOVU %VMM(0), (VEC_SIZE * 2)(%rdx)
++ VMOVU %VMM(0), (VEC_SIZE * 3)(%rdx)
++ VZEROUPPER_RETURN
++#endif
++
++END (MEMSET_SYMBOL (__memset, unaligned_erms))
+--
+2.17.1
+
diff --git a/glibc.spec b/glibc.spec
index 4c81d8e..cfe4699 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -1,4 +1,4 @@
-%define anolis_release 10
+%define anolis_release 11
 
 %bcond_without testsuite
 %bcond_without benchtests
@@ -165,6 +165,18 @@ Patch3049: 0003-x86-Set-default-non_temporal_threshold-for-Zhaoxin-p.patch
 
 Patch3050: LoongArch-Force-SHMLBA-the-same-as-kernel.patch
 
+Patch3051: 0001-x86-Add-new-architecture-type-for-Hygon-processors.patch
+Patch3052: 0002-x86-Add-cache-information-support-for-Hygon-processo.patch
+Patch3053: 0003-x86-Fix-Zen3-Zen4-ERMS-selection-BZ-30994.patch
+Patch3054: 0004-x86-Add-seperate-non-temporal-tunable-for-memset.patch
+Patch3055: 0005-x86-Enable-non-temporal-memset-tunable-for-AMD.patch
+Patch3056: 0006-x86-Fix-value-for-x86_memset_non_temporal_threshold-.patch
+Patch3057: 0007-x86-Disable-non-temporal-memset-on-Skylake-Server.patch
+Patch3058: 0008-x86-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch
+Patch3059: 0009-x86-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch
+Patch3060: 0010-x86-Enable-non-temporal-memset-for-Hygon-processors.patch
+Patch3061: 0011-x86-Improve-large-memset-perf-with-non-temporal-stor.patch
+
 BuildRequires: audit-libs-devel >= 1.1.3 libcap-devel systemtap-sdt-devel
 BuildRequires: procps-ng util-linux gawk sed >= 3.95 gettext
 BuildRequires: python3 python3-devel
@@ -1127,22 +1139,25 @@ update_gconv_modules_cache ()
 %{_libdir}/libpthread_nonshared.a
 
 %changelog
+* Wed Jun 18 2025 xiajiamei - 2.38-11
+- Add patches to support Hygon
+
 * Wed May 28 2025 mgb01105731 - 2.38-10
 - Add patch to fix CVE-2025-4802
 
 * Tue May 27 2025 mgb01105731 - 2.38-9
 - Add patch to fix CVE-2025-0395
 
-* Thu Jan 02 2025 Peng Fan - 2.38-8
-- LoongArch: Force SHMLBA the same as kernel
-
-* Sat Apr 26 2025 Yihao Yan - 2.38-7
+* Sat Apr 26 2025 Yihao Yan - 2.38-8
 - Add support for riscv64
 - Add lp64d into glibc
 
-* Wed Jan 15 2025 MayShao - 2.38-6
+* Wed Jan 15 2025 MayShao - 2.38-7
 - x86: Set preferred CPU features and default NT threshold for Zhaoxin processors
 
+* Thu Jan 02 2025 Peng Fan - 2.38-6
+- LoongArch: Force SHMLBA the same as kernel
+
 * Fri Dec 20 2024 Zhao Hang - 2.38-5
 - fix CVE-2024-33602
-- Gitee
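
Note: the L(nt_memset)/L(nt_loop) sequence in patch 0011 is easier to
follow in C. The sketch below is an illustrative approximation, not
glibc's shipped code: it assumes the AVX2 build (VEC_SIZE == 32, compile
with -mavx2), the name nt_memset_sketch is invented for this note, and
the caller is assumed to have already guaranteed len >= 4 * VEC_SIZE,
as the dispatch logic does before reaching L(nt_memset).

    #include <immintrin.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative approximation of L(nt_memset) for VEC_SIZE == 32.
       Assumes len >= 4 * 32; glibc handles smaller sizes on other
       paths.  */
    static void *
    nt_memset_sketch (void *dst, int c, size_t len)
    {
      const size_t vec = 32;
      __m256i v = _mm256_set1_epi8 ((char) c);
      char *p = (char *) dst;
      /* Mirrors "leaq (VEC_SIZE * -4)(%rdi, %rdx), %rdx": the last 4
         vectors are always written with plain unaligned stores.  */
      char *tail = p + len - 4 * vec;

      /* Head: one unaligned store, like the first VMOVU.  */
      _mm256_storeu_si256 ((__m256i *) p, v);
      /* Round up to the next 32-byte boundary (the orq/incq pair).  */
      p = (char *) (((uintptr_t) p | (vec - 1)) + 1);

      /* Body: 4 weakly-ordered streaming stores per iteration, which
         bypass the cache and avoid RFO traffic (the nt_loop).  */
      while (p < tail)
        {
          _mm256_stream_si256 ((__m256i *) (p + 0 * vec), v);
          _mm256_stream_si256 ((__m256i *) (p + 1 * vec), v);
          _mm256_stream_si256 ((__m256i *) (p + 2 * vec), v);
          _mm256_stream_si256 ((__m256i *) (p + 3 * vec), v);
          p += 4 * vec;
        }
      /* Fence the streaming stores before ordinary stores touch the
         same region (the sfence instruction).  */
      _mm_sfence ();

      /* Tail: 4 unaligned stores ending exactly at dst + len; they may
         overlap bytes the loop already wrote, which is harmless.  */
      _mm256_storeu_si256 ((__m256i *) (tail + 0 * vec), v);
      _mm256_storeu_si256 ((__m256i *) (tail + 1 * vec), v);
      _mm256_storeu_si256 ((__m256i *) (tail + 2 * vec), v);
      _mm256_storeu_si256 ((__m256i *) (tail + 3 * vec), v);
      return dst;
    }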
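
The 8MB threshold used for the Hygon measurements in patch 0010 can be
pinned at run time through the tunable that patch 0004 introduces, so
the ratios above are reproducible without rebuilding glibc; the
benchmark name below is a placeholder for any large-memset workload:

    GLIBC_TUNABLES=glibc.cpu.x86_memset_non_temporal_threshold=8388608 \
        ./your-memset-benchmark

As the dl-cacheinfo.h hunk in patch 0010 shows, the threshold stays at
SIZE_MAX (so the non-temporal path is never taken) while
Avoid_Non_Temporal_Memset is set; clearing that bit, as the
cpu-features.c hunk does for Hygon, is what lets non_temporal_threshold
take effect at all.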