From dfc9af6c3a7f4e6b0ddd924fc888292358100e7c Mon Sep 17 00:00:00 2001 From: Qingqing Li Date: Fri, 28 Mar 2025 14:44:31 +0800 Subject: [PATCH] sync from glibc upstream 2.38 branch. below is the patch list: - math: Improve layout of exp/exp10 data - AArch64: Use prefer_sve_ifuncs for SVE memset - AArch64: Add SVE memset - math: Improve layout of expf data - AArch64: Remove zva_128 from memset - AArch64: Optimize memset - AArch64: Improve generic strlen - assert: Add test for CVE-2025-0395 (cherry picked from commit a6a6276229d415c277b108ed8e6ef4f2fe517bae) --- AArch64-Add-SVE-memset.patch | 200 ++++++++++++ AArch64-Improve-generic-strlen.patch | 92 ++++++ AArch64-Optimize-memset.patch | 287 ++++++++++++++++++ AArch64-Remove-zva_128-from-memset.patch | 65 ++++ ...Use-prefer_sve_ifuncs-for-SVE-memset.patch | 29 ++ assert-Add-test-for-CVE-2025-0395.patch | 132 ++++++++ glibc.spec | 20 +- math-Improve-layout-of-exp-exp10-data.patch | 39 +++ math-Improve-layout-of-expf-data.patch | 34 +++ 9 files changed, 897 insertions(+), 1 deletion(-) create mode 100644 AArch64-Add-SVE-memset.patch create mode 100644 AArch64-Improve-generic-strlen.patch create mode 100644 AArch64-Optimize-memset.patch create mode 100644 AArch64-Remove-zva_128-from-memset.patch create mode 100644 AArch64-Use-prefer_sve_ifuncs-for-SVE-memset.patch create mode 100644 assert-Add-test-for-CVE-2025-0395.patch create mode 100644 math-Improve-layout-of-exp-exp10-data.patch create mode 100644 math-Improve-layout-of-expf-data.patch diff --git a/AArch64-Add-SVE-memset.patch b/AArch64-Add-SVE-memset.patch new file mode 100644 index 0000000..502acd4 --- /dev/null +++ b/AArch64-Add-SVE-memset.patch @@ -0,0 +1,200 @@ +From 52c2b1556f773d9a75d030160e0e273a5ea84502 Mon Sep 17 00:00:00 2001 +From: Wilco Dijkstra +Date: Tue, 24 Dec 2024 18:01:59 +0000 +Subject: [PATCH] AArch64: Add SVE memset + +Add SVE memset based on the generic memset with predicated load for sizes < 16. +Unaligned memsets of 128-1024 are improved by ~20% on average by using aligned +stores for the last 64 bytes. Performance of random memset benchmark improves +by ~2% on Neoverse V1. + +Reviewed-by: Yury Khrustalev +(cherry picked from commit 163b1bbb76caba4d9673c07940c5930a1afa7548) +--- + sysdeps/aarch64/multiarch/Makefile | 1 + + sysdeps/aarch64/multiarch/ifunc-impl-list.c | 3 +- + sysdeps/aarch64/multiarch/memset.c | 4 + + sysdeps/aarch64/multiarch/memset_sve_zva64.S | 123 +++++++++++++++++++ + 4 files changed, 130 insertions(+), 1 deletion(-) + create mode 100644 sysdeps/aarch64/multiarch/memset_sve_zva64.S + +diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile +index e4720b7468..214b6137b0 100644 +--- a/sysdeps/aarch64/multiarch/Makefile ++++ b/sysdeps/aarch64/multiarch/Makefile +@@ -14,6 +14,7 @@ sysdep_routines += \ + memset_generic \ + memset_kunpeng \ + memset_mops \ ++ memset_sve_zva64 \ + memset_zva64 \ + strlen_asimd \ + strlen_generic \ +diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c +index 73038ac810..2fa6baa319 100644 +--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c +@@ -56,7 +56,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng) + #if HAVE_AARCH64_SVE_ASM +- IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 256, __memset_a64fx) ++ IFUNC_IMPL_ADD (array, i, memset, sve && !bti && zva_size == 256, __memset_a64fx) ++ IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 64, __memset_sve_zva64) + #endif + IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic)) +diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c +index 6deb6865e5..89fde57f42 100644 +--- a/sysdeps/aarch64/multiarch/memset.c ++++ b/sysdeps/aarch64/multiarch/memset.c +@@ -34,6 +34,7 @@ extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden; + extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden; + extern __typeof (__redirect_memset) __memset_generic attribute_hidden; + extern __typeof (__redirect_memset) __memset_mops attribute_hidden; ++extern __typeof (__redirect_memset) __memset_sve_zva64 attribute_hidden; + + static inline __typeof (__redirect_memset) * + select_memset_ifunc (void) +@@ -47,6 +48,9 @@ select_memset_ifunc (void) + { + if (IS_A64FX (midr) && zva_size == 256) + return __memset_a64fx; ++ ++ if (zva_size == 64) ++ return __memset_sve_zva64; + } + + if (IS_KUNPENG920 (midr)) +diff --git a/sysdeps/aarch64/multiarch/memset_sve_zva64.S b/sysdeps/aarch64/multiarch/memset_sve_zva64.S +new file mode 100644 +index 0000000000..7fb40fdd9e +--- /dev/null ++++ b/sysdeps/aarch64/multiarch/memset_sve_zva64.S +@@ -0,0 +1,123 @@ ++/* Optimized memset for SVE. ++ Copyright (C) 2025 Free Software Foundation, Inc. ++ ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ . */ ++ ++#include ++ ++/* Assumptions: ++ * ++ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses. ++ * ZVA size is 64. ++ */ ++ ++#if HAVE_AARCH64_SVE_ASM ++ ++.arch armv8.2-a+sve ++ ++#define dstin x0 ++#define val x1 ++#define valw w1 ++#define count x2 ++#define dst x3 ++#define dstend x4 ++#define zva_val x5 ++#define vlen x5 ++#define off x3 ++#define dstend2 x5 ++ ++ENTRY (__memset_sve_zva64) ++ dup v0.16B, valw ++ cmp count, 16 ++ b.lo L(set_16) ++ ++ add dstend, dstin, count ++ cmp count, 64 ++ b.hs L(set_128) ++ ++ /* Set 16..63 bytes. */ ++ mov off, 16 ++ and off, off, count, lsr 1 ++ sub dstend2, dstend, off ++ str q0, [dstin] ++ str q0, [dstin, off] ++ str q0, [dstend2, -16] ++ str q0, [dstend, -16] ++ ret ++ ++ .p2align 4 ++L(set_16): ++ whilelo p0.b, xzr, count ++ st1b z0.b, p0, [dstin] ++ ret ++ ++ .p2align 4 ++L(set_128): ++ bic dst, dstin, 15 ++ cmp count, 128 ++ b.hi L(set_long) ++ stp q0, q0, [dstin] ++ stp q0, q0, [dstin, 32] ++ stp q0, q0, [dstend, -64] ++ stp q0, q0, [dstend, -32] ++ ret ++ ++ .p2align 4 ++L(set_long): ++ cmp count, 256 ++ b.lo L(no_zva) ++ tst valw, 255 ++ b.ne L(no_zva) ++ ++ str q0, [dstin] ++ str q0, [dst, 16] ++ bic dst, dstin, 31 ++ stp q0, q0, [dst, 32] ++ bic dst, dstin, 63 ++ sub count, dstend, dst /* Count is now 64 too large. */ ++ sub count, count, 128 /* Adjust count and bias for loop. */ ++ ++ sub x8, dstend, 1 /* Write last bytes before ZVA loop. */ ++ bic x8, x8, 15 ++ stp q0, q0, [x8, -48] ++ str q0, [x8, -16] ++ str q0, [dstend, -16] ++ ++ .p2align 4 ++L(zva64_loop): ++ add dst, dst, 64 ++ dc zva, dst ++ subs count, count, 64 ++ b.hi L(zva64_loop) ++ ret ++ ++L(no_zva): ++ str q0, [dstin] ++ sub count, dstend, dst /* Count is 16 too large. */ ++ sub count, count, 64 + 16 /* Adjust count and bias for loop. */ ++L(no_zva_loop): ++ stp q0, q0, [dst, 16] ++ stp q0, q0, [dst, 48] ++ add dst, dst, 64 ++ subs count, count, 64 ++ b.hi L(no_zva_loop) ++ stp q0, q0, [dstend, -64] ++ stp q0, q0, [dstend, -32] ++ ret ++ ++END (__memset_sve_zva64) ++#endif +-- +2.27.0 + diff --git a/AArch64-Improve-generic-strlen.patch b/AArch64-Improve-generic-strlen.patch new file mode 100644 index 0000000..7868ad4 --- /dev/null +++ b/AArch64-Improve-generic-strlen.patch @@ -0,0 +1,92 @@ +From 9ca74b8ad1968d935815bdc2f1f1c7e9f2e32f70 Mon Sep 17 00:00:00 2001 +From: Wilco Dijkstra +Date: Wed, 7 Aug 2024 14:43:47 +0100 +Subject: [PATCH] AArch64: Improve generic strlen + +Improve performance by handling another 16 bytes before entering the loop. +Use ADDHN in the loop to avoid SHRN+FMOV when it terminates. Change final +size computation to avoid increasing latency. On Neoverse V1 performance +of the random strlen benchmark improves by 4.6%. + +Reviewed-by: Adhemerval Zanella +(cherry picked from commit 3dc426b642dcafdbc11a99f2767e081d086f5fc7) +--- + sysdeps/aarch64/strlen.S | 39 +++++++++++++++++++++++++++------------ + 1 file changed, 27 insertions(+), 12 deletions(-) + +diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S +index 133ef93342..352fb40d3a 100644 +--- a/sysdeps/aarch64/strlen.S ++++ b/sysdeps/aarch64/strlen.S +@@ -1,4 +1,5 @@ +-/* Copyright (C) 2012-2023 Free Software Foundation, Inc. ++/* Generic optimized strlen using SIMD. ++ Copyright (C) 2012-2024 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + +@@ -56,36 +57,50 @@ ENTRY (STRLEN) + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + fmov synd, dend + lsr synd, synd, shift +- cbz synd, L(loop) ++ cbz synd, L(next16) + + rbit synd, synd + clz result, synd + lsr result, result, 2 + ret + ++L(next16): ++ ldr data, [src, 16] ++ cmeq vhas_nul.16b, vdata.16b, 0 ++ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ ++ fmov synd, dend ++ cbz synd, L(loop) ++ add src, src, 16 ++#ifndef __AARCH64EB__ ++ rbit synd, synd ++#endif ++ sub result, src, srcin ++ clz tmp, synd ++ add result, result, tmp, lsr 2 ++ ret ++ + .p2align 5 + L(loop): +- ldr data, [src, 16] ++ ldr data, [src, 32]! + cmeq vhas_nul.16b, vdata.16b, 0 +- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b ++ addhn vend.8b, vhas_nul.8h, vhas_nul.8h + fmov synd, dend + cbnz synd, L(loop_end) +- ldr data, [src, 32]! ++ ldr data, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 +- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b ++ addhn vend.8b, vhas_nul.8h, vhas_nul.8h + fmov synd, dend + cbz synd, L(loop) +- sub src, src, 16 ++ add src, src, 16 + L(loop_end): +- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ +- sub result, src, srcin +- fmov synd, dend ++ sub result, shift, src, lsl 2 /* (srcin - src) << 2. */ + #ifndef __AARCH64EB__ + rbit synd, synd ++ sub result, result, 3 + #endif +- add result, result, 16 + clz tmp, synd +- add result, result, tmp, lsr 2 ++ sub result, tmp, result ++ lsr result, result, 2 + ret + + END (STRLEN) +-- +2.27.0 + diff --git a/AArch64-Optimize-memset.patch b/AArch64-Optimize-memset.patch new file mode 100644 index 0000000..663f4ef --- /dev/null +++ b/AArch64-Optimize-memset.patch @@ -0,0 +1,287 @@ +From 95aa21432ccbf77225abd485d98df36ba760ff80 Mon Sep 17 00:00:00 2001 +From: Wilco Dijkstra +Date: Mon, 9 Sep 2024 15:26:47 +0100 +Subject: [PATCH] AArch64: Optimize memset + +Improve small memsets by avoiding branches and use overlapping stores. +Use DC ZVA for copies over 128 bytes. Remove unnecessary code for ZVA sizes +other than 64 and 128. Performance of random memset benchmark improves by 24% +on Neoverse N1. + +Reviewed-by: Adhemerval Zanella +(cherry picked from commit cec3aef32412779e207f825db0d057ebb4628ae8) +--- + sysdeps/aarch64/memset.S | 195 +++++++++++++++++---------------------- + 1 file changed, 84 insertions(+), 111 deletions(-) + +diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S +index bbfb7184c3..caafb019e2 100644 +--- a/sysdeps/aarch64/memset.S ++++ b/sysdeps/aarch64/memset.S +@@ -1,4 +1,5 @@ +-/* Copyright (C) 2012-2023 Free Software Foundation, Inc. ++/* Generic optimized memset using SIMD. ++ Copyright (C) 2012-2024 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + +@@ -17,7 +18,6 @@ + . */ + + #include +-#include "memset-reg.h" + + #ifndef MEMSET + # define MEMSET memset +@@ -25,130 +25,132 @@ + + /* Assumptions: + * +- * ARMv8-a, AArch64, unaligned accesses ++ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. + * + */ + +-ENTRY (MEMSET) ++#define dstin x0 ++#define val x1 ++#define valw w1 ++#define count x2 ++#define dst x3 ++#define dstend x4 ++#define zva_val x5 ++#define off x3 ++#define dstend2 x5 + ++ENTRY (MEMSET) + PTR_ARG (0) + SIZE_ARG (2) + + dup v0.16B, valw ++ cmp count, 16 ++ b.lo L(set_small) ++ + add dstend, dstin, count ++ cmp count, 64 ++ b.hs L(set_128) + +- cmp count, 96 +- b.hi L(set_long) +- cmp count, 16 +- b.hs L(set_medium) +- mov val, v0.D[0] ++ /* Set 16..63 bytes. */ ++ mov off, 16 ++ and off, off, count, lsr 1 ++ sub dstend2, dstend, off ++ str q0, [dstin] ++ str q0, [dstin, off] ++ str q0, [dstend2, -16] ++ str q0, [dstend, -16] ++ ret + ++ .p2align 4 + /* Set 0..15 bytes. */ +- tbz count, 3, 1f +- str val, [dstin] +- str val, [dstend, -8] +- ret +- nop +-1: tbz count, 2, 2f +- str valw, [dstin] +- str valw, [dstend, -4] ++L(set_small): ++ add dstend, dstin, count ++ cmp count, 4 ++ b.lo 2f ++ lsr off, count, 3 ++ sub dstend2, dstend, off, lsl 2 ++ str s0, [dstin] ++ str s0, [dstin, off, lsl 2] ++ str s0, [dstend2, -4] ++ str s0, [dstend, -4] + ret ++ ++ /* Set 0..3 bytes. */ + 2: cbz count, 3f ++ lsr off, count, 1 + strb valw, [dstin] +- tbz count, 1, 3f +- strh valw, [dstend, -2] ++ strb valw, [dstin, off] ++ strb valw, [dstend, -1] + 3: ret + +- /* Set 17..96 bytes. */ +-L(set_medium): +- str q0, [dstin] +- tbnz count, 6, L(set96) +- str q0, [dstend, -16] +- tbz count, 5, 1f +- str q0, [dstin, 16] +- str q0, [dstend, -32] +-1: ret +- + .p2align 4 +- /* Set 64..96 bytes. Write 64 bytes from the start and +- 32 bytes from the end. */ +-L(set96): +- str q0, [dstin, 16] ++L(set_128): ++ bic dst, dstin, 15 ++ cmp count, 128 ++ b.hi L(set_long) ++ stp q0, q0, [dstin] + stp q0, q0, [dstin, 32] ++ stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +- .p2align 3 +- nop ++ .p2align 4 + L(set_long): +- and valw, valw, 255 +- bic dst, dstin, 15 + str q0, [dstin] +- cmp count, 256 +- ccmp valw, 0, 0, cs +- b.eq L(try_zva) +-L(no_zva): +- sub count, dstend, dst /* Count is 16 too large. */ +- sub dst, dst, 16 /* Dst is biased by -32. */ +- sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +-1: stp q0, q0, [dst, 32] +- stp q0, q0, [dst, 64]! +-L(tail64): +- subs count, count, 64 +- b.hi 1b +-2: stp q0, q0, [dstend, -64] ++ str q0, [dst, 16] ++ tst valw, 255 ++ b.ne L(no_zva) ++#ifndef ZVA64_ONLY ++ mrs zva_val, dczid_el0 ++ and zva_val, zva_val, 31 ++ cmp zva_val, 4 /* ZVA size is 64 bytes. */ ++ b.ne L(zva_128) ++#endif ++ stp q0, q0, [dst, 32] ++ bic dst, dstin, 63 ++ sub count, dstend, dst /* Count is now 64 too large. */ ++ sub count, count, 64 + 64 /* Adjust count and bias for loop. */ ++ ++ /* Write last bytes before ZVA loop. */ ++ stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] ++ ++ .p2align 4 ++L(zva64_loop): ++ add dst, dst, 64 ++ dc zva, dst ++ subs count, count, 64 ++ b.hi L(zva64_loop) + ret + +-L(try_zva): +-#ifndef ZVA64_ONLY + .p2align 3 +- mrs tmp1, dczid_el0 +- tbnz tmp1w, 4, L(no_zva) +- and tmp1w, tmp1w, 15 +- cmp tmp1w, 4 /* ZVA size is 64 bytes. */ +- b.ne L(zva_128) +- nop +-#endif +- /* Write the first and last 64 byte aligned block using stp rather +- than using DC ZVA. This is faster on some cores. +- */ +- .p2align 4 +-L(zva_64): +- str q0, [dst, 16] ++L(no_zva): ++ sub count, dstend, dst /* Count is 32 too large. */ ++ sub count, count, 64 + 32 /* Adjust count and bias for loop. */ ++L(no_zva_loop): + stp q0, q0, [dst, 32] +- bic dst, dst, 63 + stp q0, q0, [dst, 64] +- stp q0, q0, [dst, 96] +- sub count, dstend, dst /* Count is now 128 too large. */ +- sub count, count, 128+64+64 /* Adjust count and bias for loop. */ +- add dst, dst, 128 +-1: dc zva, dst + add dst, dst, 64 + subs count, count, 64 +- b.hi 1b +- stp q0, q0, [dst, 0] +- stp q0, q0, [dst, 32] ++ b.hi L(no_zva_loop) + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + + #ifndef ZVA64_ONLY +- .p2align 3 ++ .p2align 4 + L(zva_128): +- cmp tmp1w, 5 /* ZVA size is 128 bytes. */ +- b.ne L(zva_other) ++ cmp zva_val, 5 /* ZVA size is 128 bytes. */ ++ b.ne L(no_zva) + +- str q0, [dst, 16] + stp q0, q0, [dst, 32] + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + bic dst, dst, 127 + sub count, dstend, dst /* Count is now 128 too large. */ +- sub count, count, 128+128 /* Adjust count and bias for loop. */ +- add dst, dst, 128 +-1: dc zva, dst +- add dst, dst, 128 ++ sub count, count, 128 + 128 /* Adjust count and bias for loop. */ ++1: add dst, dst, 128 ++ dc zva, dst + subs count, count, 128 + b.hi 1b + stp q0, q0, [dstend, -128] +@@ -156,35 +158,6 @@ L(zva_128): + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret +- +-L(zva_other): +- mov tmp2w, 4 +- lsl zva_lenw, tmp2w, tmp1w +- add tmp1, zva_len, 64 /* Max alignment bytes written. */ +- cmp count, tmp1 +- blo L(no_zva) +- +- sub tmp2, zva_len, 1 +- add tmp1, dst, zva_len +- add dst, dst, 16 +- subs count, tmp1, dst /* Actual alignment bytes to write. */ +- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ +- beq 2f +-1: stp q0, q0, [dst], 64 +- stp q0, q0, [dst, -32] +- subs count, count, 64 +- b.hi 1b +-2: mov dst, tmp1 +- sub count, dstend, tmp1 /* Remaining bytes to write. */ +- subs count, count, zva_len +- b.lo 4f +-3: dc zva, dst +- add dst, dst, zva_len +- subs count, count, zva_len +- b.hs 3b +-4: add count, count, zva_len +- sub dst, dst, 32 /* Bias dst for tail loop. */ +- b L(tail64) + #endif + + END (MEMSET) +-- +2.27.0 + diff --git a/AArch64-Remove-zva_128-from-memset.patch b/AArch64-Remove-zva_128-from-memset.patch new file mode 100644 index 0000000..aa8bc76 --- /dev/null +++ b/AArch64-Remove-zva_128-from-memset.patch @@ -0,0 +1,65 @@ +From 5fe151d86a19bc3dc791fd2d92efeb6c6e11cf64 Mon Sep 17 00:00:00 2001 +From: Wilco Dijkstra +Date: Mon, 25 Nov 2024 18:43:08 +0000 +Subject: [PATCH] AArch64: Remove zva_128 from memset + +Remove ZVA 128 support from memset - the new memset no longer +guarantees count >= 256, which can result in underflow and a +crash if ZVA size is 128 ([1]). Since only one CPU uses a ZVA +size of 128 and its memcpy implementation was removed in commit +e162ab2bf1b82c40f29e1925986582fa07568ce8, remove this special +case too. + +[1] https://sourceware.org/pipermail/libc-alpha/2024-November/161626.html + +Reviewed-by: Andrew Pinski +(cherry picked from commit a08d9a52f967531a77e1824c23b5368c6434a72d) +--- + sysdeps/aarch64/memset.S | 25 +------------------------ + 1 file changed, 1 insertion(+), 24 deletions(-) + +diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S +index caafb019e2..71814d0b2f 100644 +--- a/sysdeps/aarch64/memset.S ++++ b/sysdeps/aarch64/memset.S +@@ -104,7 +104,7 @@ L(set_long): + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ +- b.ne L(zva_128) ++ b.ne L(no_zva) + #endif + stp q0, q0, [dst, 32] + bic dst, dstin, 63 +@@ -137,28 +137,5 @@ L(no_zva_loop): + stp q0, q0, [dstend, -32] + ret + +-#ifndef ZVA64_ONLY +- .p2align 4 +-L(zva_128): +- cmp zva_val, 5 /* ZVA size is 128 bytes. */ +- b.ne L(no_zva) +- +- stp q0, q0, [dst, 32] +- stp q0, q0, [dst, 64] +- stp q0, q0, [dst, 96] +- bic dst, dst, 127 +- sub count, dstend, dst /* Count is now 128 too large. */ +- sub count, count, 128 + 128 /* Adjust count and bias for loop. */ +-1: add dst, dst, 128 +- dc zva, dst +- subs count, count, 128 +- b.hi 1b +- stp q0, q0, [dstend, -128] +- stp q0, q0, [dstend, -96] +- stp q0, q0, [dstend, -64] +- stp q0, q0, [dstend, -32] +- ret +-#endif +- + END (MEMSET) + libc_hidden_builtin_def (MEMSET) +-- +2.27.0 + diff --git a/AArch64-Use-prefer_sve_ifuncs-for-SVE-memset.patch b/AArch64-Use-prefer_sve_ifuncs-for-SVE-memset.patch new file mode 100644 index 0000000..b92230b --- /dev/null +++ b/AArch64-Use-prefer_sve_ifuncs-for-SVE-memset.patch @@ -0,0 +1,29 @@ +From 097299ffa904b327fce83770fa6a522e4393ddb3 Mon Sep 17 00:00:00 2001 +From: Wilco Dijkstra +Date: Thu, 27 Feb 2025 16:28:52 +0000 +Subject: [PATCH] AArch64: Use prefer_sve_ifuncs for SVE memset + +Use prefer_sve_ifuncs for SVE memset just like memcpy. + +Reviewed-by: Yury Khrustalev +(cherry picked from commit 0f044be1dae5169d0e57f8d487b427863aeadab4) +--- + sysdeps/aarch64/multiarch/memset.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c +index 89fde57f42..ce5d35a20e 100644 +--- a/sysdeps/aarch64/multiarch/memset.c ++++ b/sysdeps/aarch64/multiarch/memset.c +@@ -49,7 +49,7 @@ select_memset_ifunc (void) + if (IS_A64FX (midr) && zva_size == 256) + return __memset_a64fx; + +- if (zva_size == 64) ++ if (prefer_sve_ifuncs && zva_size == 64) + return __memset_sve_zva64; + } + +-- +2.27.0 + diff --git a/assert-Add-test-for-CVE-2025-0395.patch b/assert-Add-test-for-CVE-2025-0395.patch new file mode 100644 index 0000000..2670800 --- /dev/null +++ b/assert-Add-test-for-CVE-2025-0395.patch @@ -0,0 +1,132 @@ +From f984e2d7e8299726891a1a497a3c36cd5542a0bf Mon Sep 17 00:00:00 2001 +From: Siddhesh Poyarekar +Date: Fri, 31 Jan 2025 12:16:30 -0500 +Subject: [PATCH] assert: Add test for CVE-2025-0395 + +Use the __progname symbol to override the program name to induce the +failure that CVE-2025-0395 describes. + +This is related to BZ #32582 + +Signed-off-by: Siddhesh Poyarekar +Reviewed-by: Adhemerval Zanella +(cherry picked from commit cdb9ba84191ce72e86346fb8b1d906e7cd930ea2) +--- + assert/Makefile | 1 + + assert/tst-assert-sa-2025-0001.c | 92 ++++++++++++++++++++++++++++++++ + 2 files changed, 93 insertions(+) + create mode 100644 assert/tst-assert-sa-2025-0001.c + +diff --git a/assert/Makefile b/assert/Makefile +index 67f4e6a570..b0fc9fc4d2 100644 +--- a/assert/Makefile ++++ b/assert/Makefile +@@ -38,6 +38,7 @@ tests := \ + test-assert-perr \ + tst-assert-c++ \ + tst-assert-g++ \ ++ tst-assert-sa-2025-0001 \ + # tests + + ifeq ($(have-cxx-thread_local),yes) +diff --git a/assert/tst-assert-sa-2025-0001.c b/assert/tst-assert-sa-2025-0001.c +new file mode 100644 +index 0000000000..102cb0078d +--- /dev/null ++++ b/assert/tst-assert-sa-2025-0001.c +@@ -0,0 +1,92 @@ ++/* Test for CVE-2025-0395. ++ Copyright The GNU Toolchain Authors. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* Test that a large enough __progname does not result in a buffer overflow ++ when printing an assertion failure. This was CVE-2025-0395. */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++extern const char *__progname; ++ ++int ++do_test (int argc, char **argv) ++{ ++ ++ support_need_proc ("Reads /proc/self/maps to add guards to writable maps."); ++ ignore_stderr (); ++ ++ /* XXX assumes that the assert is on a 2 digit line number. */ ++ const char *prompt = ": %s:99: do_test: Assertion `argc < 1' failed.\n"; ++ ++ int ret = fprintf (stderr, prompt, __FILE__); ++ if (ret < 0) ++ FAIL_EXIT1 ("fprintf failed: %m\n"); ++ ++ size_t pagesize = getpagesize (); ++ size_t namesize = pagesize - 1 - ret; ++ ++ /* Alter the progname so that the assert message fills the entire page. */ ++ char progname[namesize]; ++ memset (progname, 'A', namesize - 1); ++ progname[namesize - 1] = '\0'; ++ __progname = progname; ++ ++ FILE *f = xfopen ("/proc/self/maps", "r"); ++ char *line = NULL; ++ size_t len = 0; ++ uintptr_t prev_to = 0; ++ ++ /* Pad the beginning of every writable mapping with a PROT_NONE map. This ++ ensures that the mmap in the assert_fail path never ends up below a ++ writable map and will terminate immediately in case of a buffer ++ overflow. */ ++ while (xgetline (&line, &len, f)) ++ { ++ uintptr_t from, to; ++ char perm[4]; ++ ++ sscanf (line, "%" SCNxPTR "-%" SCNxPTR " %c%c%c%c ", ++ &from, &to, ++ &perm[0], &perm[1], &perm[2], &perm[3]); ++ ++ bool writable = (memchr (perm, 'w', 4) != NULL); ++ ++ if (prev_to != 0 && from - prev_to > pagesize && writable) ++ xmmap ((void *) from - pagesize, pagesize, PROT_NONE, ++ MAP_ANONYMOUS | MAP_PRIVATE, 0); ++ ++ prev_to = to; ++ } ++ ++ xfclose (f); ++ ++ assert (argc < 1); ++ return 0; ++} ++ ++#define EXPECTED_SIGNAL SIGABRT ++#define TEST_FUNCTION_ARGV do_test ++#include +-- +2.27.0 + diff --git a/glibc.spec b/glibc.spec index 181eccd..da5b102 100644 --- a/glibc.spec +++ b/glibc.spec @@ -67,7 +67,7 @@ ############################################################################## Name: glibc Version: 2.38 -Release: 56 +Release: 57 Summary: The GNU libc libraries License: %{all_license} URL: http://www.gnu.org/software/glibc/ @@ -278,6 +278,14 @@ Patch188: backport-x86-Disable-non-temporal-memset-on-Skylake-Server.patch Patch189: backport-Use-Avoid_Non_Temporal_Memset-to-control-non-tem.patch Patch190: backport-Add-Avoid_STOSB-tunable-to-allow-NT-memset-witho.patch Patch191: backport-x86-Enable-non-temporal-memset-for-Hygon-processors.patch +Patch192: assert-Add-test-for-CVE-2025-0395.patch +Patch193: AArch64-Improve-generic-strlen.patch +Patch194: AArch64-Optimize-memset.patch +Patch195: AArch64-Remove-zva_128-from-memset.patch +Patch196: math-Improve-layout-of-expf-data.patch +Patch197: AArch64-Add-SVE-memset.patch +Patch198: AArch64-Use-prefer_sve_ifuncs-for-SVE-memset.patch +Patch199: math-Improve-layout-of-exp-exp10-data.patch #openEuler patch list Patch9000: turn-default-value-of-x86_rep_stosb_threshold_form_2K_to_1M.patch @@ -1502,6 +1510,16 @@ fi %endif %changelog +* Fri Mar 28 2025 Qingqing Li - 2.38-57 +- math: Improve layout of exp/exp10 data +- AArch64: Use prefer_sve_ifuncs for SVE memset +- AArch64: Add SVE memset +- math: Improve layout of expf data +- AArch64: Remove zva_128 from memset +- AArch64: Optimize memset +- AArch64: Improve generic strlen +- assert: Add test for CVE-2025-0395 + * Wed Mar 12 2025 xiajimei - 2.38-56 - x86: Enable non-temporal memset for Hygon processors - x86: Add `Avoid_STOSB` tunable to allow NT memset without ERMS diff --git a/math-Improve-layout-of-exp-exp10-data.patch b/math-Improve-layout-of-exp-exp10-data.patch new file mode 100644 index 0000000..a0da041 --- /dev/null +++ b/math-Improve-layout-of-exp-exp10-data.patch @@ -0,0 +1,39 @@ +From 5a08d049dc5037e89eb95bb1506652f0043fa39e Mon Sep 17 00:00:00 2001 +From: Wilco Dijkstra +Date: Fri, 13 Dec 2024 15:43:07 +0000 +Subject: [PATCH] math: Improve layout of exp/exp10 data + +GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch +changes the exp_data struct slightly so that the fields are better aligned +and without gaps. As a result on targets that support them, more load-pair +instructions are used in exp. + +The exp benchmark improves 2.5%, "144bits" by 7.2%, "768bits" by 12.7% on +Neoverse V2. + +Reviewed-by: Adhemerval Zanella +(cherry picked from commit 5afaf99edb326fd9f36eb306a828d129a3a1d7f7) +--- + sysdeps/ieee754/dbl-64/math_config.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h +index 19af33fd86..52b720ecd1 100644 +--- a/sysdeps/ieee754/dbl-64/math_config.h ++++ b/sysdeps/ieee754/dbl-64/math_config.h +@@ -195,10 +195,11 @@ check_uflow (double x) + extern const struct exp_data + { + double invln2N; +- double shift; + double negln2hiN; + double negln2loN; + double poly[4]; /* Last four coefficients. */ ++ double shift; ++ + double exp2_shift; + double exp2_poly[EXP2_POLY_ORDER]; + uint64_t tab[2*(1 << EXP_TABLE_BITS)]; +-- +2.27.0 + diff --git a/math-Improve-layout-of-expf-data.patch b/math-Improve-layout-of-expf-data.patch new file mode 100644 index 0000000..aa3ee5e --- /dev/null +++ b/math-Improve-layout-of-expf-data.patch @@ -0,0 +1,34 @@ +From 3de5112326a4274c97f154f3d335c11965ee960c Mon Sep 17 00:00:00 2001 +From: Wilco Dijkstra +Date: Wed, 24 Jul 2024 15:17:47 +0100 +Subject: [PATCH] math: Improve layout of expf data + +GCC aligns global data to 16 bytes if their size is >= 16 bytes. This patch +changes the exp2f_data struct slightly so that the fields are better aligned. +As a result on targets that support them, load-pair instructions accessing +poly_scaled and invln2_scaled are now 16-byte aligned. + +Reviewed-by: Adhemerval Zanella +(cherry picked from commit 44fa9c1080fe6a9539f0d2345b9d2ae37b8ee57a) +--- + sysdeps/ieee754/flt-32/math_config.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sysdeps/ieee754/flt-32/math_config.h b/sysdeps/ieee754/flt-32/math_config.h +index d1b06a1a90..5904eb9bac 100644 +--- a/sysdeps/ieee754/flt-32/math_config.h ++++ b/sysdeps/ieee754/flt-32/math_config.h +@@ -166,9 +166,9 @@ extern const struct exp2f_data + uint64_t tab[1 << EXP2F_TABLE_BITS]; + double shift_scaled; + double poly[EXP2F_POLY_ORDER]; +- double shift; + double invln2_scaled; + double poly_scaled[EXP2F_POLY_ORDER]; ++ double shift; + } __exp2f_data attribute_hidden; + + #define LOGF_TABLE_BITS 4 +-- +2.27.0 + -- Gitee