From cac59fc30a95bb2691ee4785a57ee7c613ebcda2 Mon Sep 17 00:00:00 2001
From: liqingqing_1229
Date: Sat, 12 Jul 2025 02:20:08 +0000
Subject: [PATCH] revert "aarch64: Use memcpy_simd as the default memcpy" and
 use __memmove_generic for kunpeng920_tsv120

1. Revert commit e6f3fe362f1aab78b1448d69ecdbd9e3872636d3 because memcpy_simd
   causes a performance regression on many Arm cores:
   https://sourceware.org/bugzilla/show_bug.cgi?id=27437
2. Use __memmove_generic on kunpeng920 with the tsv120 micro architecture
   for better performance.

Signed-off-by: liqingqing_1229
(cherry picked from commit 3b4088f747f5533c2066f5891eee2999eb5ebcf0)
---
 ...eneric-when-kunpeng920-with-tsv120-m.patch |  26 ++
 glibc.spec                                    |   8 +-
 ...se-memcpy_simd-as-the-default-memcpy.patch | 318 ++++++++++++++++++
 3 files changed, 351 insertions(+), 1 deletion(-)
 create mode 100644 Using-__memmove_generic-when-kunpeng920-with-tsv120-m.patch
 create mode 100644 revert-aarch64-Use-memcpy_simd-as-the-default-memcpy.patch

diff --git a/Using-__memmove_generic-when-kunpeng920-with-tsv120-m.patch b/Using-__memmove_generic-when-kunpeng920-with-tsv120-m.patch
new file mode 100644
index 0000000..a5d7724
--- /dev/null
+++ b/Using-__memmove_generic-when-kunpeng920-with-tsv120-m.patch
@@ -0,0 +1,26 @@
+From 5ca322e4bdce79687f88ba62c1c1e5fd024f8dc6 Mon Sep 17 00:00:00 2001
+From: liqingqing
+Date: Sat, 12 Jul 2025 06:10:24 +0800
+Subject: [PATCH] Using __memmove_generic when kunpeng920 with tsv120 micro
+ architecture
+
+---
+ sysdeps/aarch64/multiarch/memmove.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
+index 6b771668..d438daa4 100644
+--- a/sysdeps/aarch64/multiarch/memmove.c
++++ b/sysdeps/aarch64/multiarch/memmove.c
+@@ -47,6 +47,8 @@ select_memmove_ifunc (void)
+     {
+       if (IS_A64FX (midr))
+ 	return __memmove_a64fx;
++      if (IS_KUNPENG920_TSV120 (midr))
++	return __memmove_generic;
+       return prefer_sve_ifuncs ? __memmove_sve : __memmove_generic;
+     }
+ 
+-- 
+2.33.0
+
diff --git a/glibc.spec b/glibc.spec
index 1e4f584..cf3a182 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -67,7 +67,7 @@
 ##############################################################################
 Name: glibc
 Version: 2.38
-Release: 61
+Release: 62
 Summary: The GNU libc libraries
 License: %{all_license}
 URL: http://www.gnu.org/software/glibc/
@@ -337,6 +337,8 @@ Patch9034: 0002-x86_64-Optimize-large-size-copy-in-memmove-ssse3.patch
 Patch9035: 0003-x86-Set-default-non_temporal_threshold-for-Zhaoxin-p.patch
 Patch9036: fix-CVE-2019-1010023.patch
 Patch9037: Using-__memcpy_generic-when-kunpeng920-with-tsv120-m.patch
+Patch9038: Using-__memmove_generic-when-kunpeng920-with-tsv120-m.patch
+Patch9039: revert-aarch64-Use-memcpy_simd-as-the-default-memcpy.patch
 
 Provides: ldconfig rtld(GNU_HASH) bundled(gnulib)
 
@@ -1517,6 +1519,10 @@ fi
 %endif
 
 %changelog
+* Sat Jul 12 2025 Qingqing Li - 2.38-62
+- aarch64: Using __memmove_generic when kunpeng920 with tsv120 micro architecture
+- aarch64: revert "aarch64: Use memcpy_simd as the default memcpy"
+
 * Mon Jul 07 2025 Qingqing Li - 2.38-61
 - aarch64: Using __memcpy_generic when kunpeng920 with tsv120 micro architecture
 
diff --git a/revert-aarch64-Use-memcpy_simd-as-the-default-memcpy.patch b/revert-aarch64-Use-memcpy_simd-as-the-default-memcpy.patch
new file mode 100644
index 0000000..76455b3
--- /dev/null
+++ b/revert-aarch64-Use-memcpy_simd-as-the-default-memcpy.patch
@@ -0,0 +1,318 @@
+From 9e87dccaffb020117fee8fb7eeffff5a2387f16f Mon Sep 17 00:00:00 2001
+From: liqingqing
+Date: Sat, 12 Jul 2025 06:05:35 +0800
+Subject: [PATCH] revert "aarch64: Use memcpy_simd as the default memcpy"
+
+This reverts commit e6f3fe362f1aab78b1448d69ecdbd9e3872636d3 because
+memcpy_simd causes a performance regression on many Arm cores:
+https://sourceware.org/bugzilla/show_bug.cgi?id=27437
+---
+ sysdeps/aarch64/memcpy.S | 192 ++++++++++++++++++++++-----------------
+ 1 file changed, 111 insertions(+), 81 deletions(-)
+
+diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
+index 304e7eda..0adc5246 100644
+--- a/sysdeps/aarch64/memcpy.S
++++ b/sysdeps/aarch64/memcpy.S
+@@ -1,5 +1,4 @@
+-/* Generic optimized memcpy using SIMD.
+-   Copyright (C) 2012-2023 Free Software Foundation, Inc.
++/* Copyright (C) 2012-2021 Free Software Foundation, Inc.
+ 
+    This file is part of the GNU C Library.
+ 
+@@ -21,7 +20,7 @@
+ 
+ /* Assumptions:
+  *
+- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
++ * ARMv8-a, AArch64, unaligned accesses.
+  *
+  */
+ 
+@@ -37,18 +36,21 @@
+ #define B_l	x8
+ #define B_lw	w8
+ #define B_h	x9
++#define C_l	x10
+ #define C_lw	w10
++#define C_h	x11
++#define D_l	x12
++#define D_h	x13
++#define E_l	x14
++#define E_h	x15
++#define F_l	x16
++#define F_h	x17
++#define G_l	count
++#define G_h	dst
++#define H_l	src
++#define H_h	srcend
+ #define tmp1	x14
+ 
+-#define A_q	q0
+-#define B_q	q1
+-#define C_q	q2
+-#define D_q	q3
+-#define E_q	q4
+-#define F_q	q5
+-#define G_q	q6
+-#define H_q	q7
+-
+ #ifndef MEMMOVE
+ # define MEMMOVE memmove
+ #endif
+@@ -67,9 +69,10 @@
+    Large copies use a software pipelined loop processing 64 bytes per
+    iteration.  The destination pointer is 16-byte aligned to minimize
+    unaligned accesses.  The loop tail is handled by always copying 64 bytes
+-   from the end.  */
++   from the end.
++*/
+ 
+-ENTRY (MEMCPY)
++ENTRY_ALIGN (MEMCPY, 6)
+ 	PTR_ARG (0)
+ 	PTR_ARG (1)
+ 	SIZE_ARG (2)
+@@ -84,10 +87,10 @@ ENTRY (MEMCPY)
+ 	/* Small copies: 0..32 bytes.  */
+ 	cmp	count, 16
+ 	b.lo	L(copy16)
+-	ldr	A_q, [src]
+-	ldr	B_q, [srcend, -16]
+-	str	A_q, [dstin]
+-	str	B_q, [dstend, -16]
++	ldp	A_l, A_h, [src]
++	ldp	D_l, D_h, [srcend, -16]
++	stp	A_l, A_h, [dstin]
++	stp	D_l, D_h, [dstend, -16]
+ 	ret
+ 
+ 	/* Copy 8-15 bytes.  */
+@@ -99,6 +102,7 @@ L(copy16):
+ 	str	A_h, [dstend, -8]
+ 	ret
+ 
++	.p2align 3
+ 	/* Copy 4-7 bytes.  */
+ L(copy8):
+ 	tbz	count, 2, L(copy4)
+@@ -124,69 +128,87 @@ L(copy0):
+ 	.p2align 4
+ 	/* Medium copies: 33..128 bytes.  */
+ L(copy32_128):
+-	ldp	A_q, B_q, [src]
+-	ldp	C_q, D_q, [srcend, -32]
++	ldp	A_l, A_h, [src]
++	ldp	B_l, B_h, [src, 16]
++	ldp	C_l, C_h, [srcend, -32]
++	ldp	D_l, D_h, [srcend, -16]
+ 	cmp	count, 64
+ 	b.hi	L(copy128)
+-	stp	A_q, B_q, [dstin]
+-	stp	C_q, D_q, [dstend, -32]
++	stp	A_l, A_h, [dstin]
++	stp	B_l, B_h, [dstin, 16]
++	stp	C_l, C_h, [dstend, -32]
++	stp	D_l, D_h, [dstend, -16]
+ 	ret
+ 
+ 	.p2align 4
+ 	/* Copy 65..128 bytes.  */
+ L(copy128):
+-	ldp	E_q, F_q, [src, 32]
++	ldp	E_l, E_h, [src, 32]
++	ldp	F_l, F_h, [src, 48]
+ 	cmp	count, 96
+ 	b.ls	L(copy96)
+-	ldp	G_q, H_q, [srcend, -64]
+-	stp	G_q, H_q, [dstend, -64]
++	ldp	G_l, G_h, [srcend, -64]
++	ldp	H_l, H_h, [srcend, -48]
++	stp	G_l, G_h, [dstend, -64]
++	stp	H_l, H_h, [dstend, -48]
+ L(copy96):
+-	stp	A_q, B_q, [dstin]
+-	stp	E_q, F_q, [dstin, 32]
+-	stp	C_q, D_q, [dstend, -32]
++	stp	A_l, A_h, [dstin]
++	stp	B_l, B_h, [dstin, 16]
++	stp	E_l, E_h, [dstin, 32]
++	stp	F_l, F_h, [dstin, 48]
++	stp	C_l, C_h, [dstend, -32]
++	stp	D_l, D_h, [dstend, -16]
+ 	ret
+ 
+-	/* Align loop64 below to 16 bytes.  */
+-	nop
+-
++	.p2align 4
+ 	/* Copy more than 128 bytes.  */
+ L(copy_long):
+-	/* Copy 16 bytes and then align src to 16-byte alignment.  */
+-	ldr	D_q, [src]
+-	and	tmp1, src, 15
+-	bic	src, src, 15
+-	sub	dst, dstin, tmp1
++	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
++	ldp	D_l, D_h, [src]
++	and	tmp1, dstin, 15
++	bic	dst, dstin, 15
++	sub	src, src, tmp1
+ 	add	count, count, tmp1	/* Count is now 16 too large.  */
+-	ldp	A_q, B_q, [src, 16]
+-	str	D_q, [dstin]
+-	ldp	C_q, D_q, [src, 48]
++	ldp	A_l, A_h, [src, 16]
++	stp	D_l, D_h, [dstin]
++	ldp	B_l, B_h, [src, 32]
++	ldp	C_l, C_h, [src, 48]
++	ldp	D_l, D_h, [src, 64]!
+ 	subs	count, count, 128 + 16	/* Test and readjust count.  */
+ 	b.ls	L(copy64_from_end)
++
+ L(loop64):
+-	stp	A_q, B_q, [dst, 16]
+-	ldp	A_q, B_q, [src, 80]
+-	stp	C_q, D_q, [dst, 48]
+-	ldp	C_q, D_q, [src, 112]
+-	add	src, src, 64
+-	add	dst, dst, 64
++	stp	A_l, A_h, [dst, 16]
++	ldp	A_l, A_h, [src, 16]
++	stp	B_l, B_h, [dst, 32]
++	ldp	B_l, B_h, [src, 32]
++	stp	C_l, C_h, [dst, 48]
++	ldp	C_l, C_h, [src, 48]
++	stp	D_l, D_h, [dst, 64]!
++	ldp	D_l, D_h, [src, 64]!
+ 	subs	count, count, 64
+ 	b.hi	L(loop64)
+ 
+ 	/* Write the last iteration and copy 64 bytes from the end.  */
+ L(copy64_from_end):
+-	ldp	E_q, F_q, [srcend, -64]
+-	stp	A_q, B_q, [dst, 16]
+-	ldp	A_q, B_q, [srcend, -32]
+-	stp	C_q, D_q, [dst, 48]
+-	stp	E_q, F_q, [dstend, -64]
+-	stp	A_q, B_q, [dstend, -32]
++	ldp	E_l, E_h, [srcend, -64]
++	stp	A_l, A_h, [dst, 16]
++	ldp	A_l, A_h, [srcend, -48]
++	stp	B_l, B_h, [dst, 32]
++	ldp	B_l, B_h, [srcend, -32]
++	stp	C_l, C_h, [dst, 48]
++	ldp	C_l, C_h, [srcend, -16]
++	stp	D_l, D_h, [dst, 64]
++	stp	E_l, E_h, [dstend, -64]
++	stp	A_l, A_h, [dstend, -48]
++	stp	B_l, B_h, [dstend, -32]
++	stp	C_l, C_h, [dstend, -16]
+ 	ret
+ 
+ END (MEMCPY)
+ libc_hidden_builtin_def (MEMCPY)
+ 
+-
+-ENTRY (MEMMOVE)
++ENTRY_ALIGN (MEMMOVE, 4)
+ 	PTR_ARG (0)
+ 	PTR_ARG (1)
+ 	SIZE_ARG (2)
+@@ -198,56 +220,64 @@ ENTRY (MEMMOVE)
+ 	cmp	count, 32
+ 	b.hi	L(copy32_128)
+ 
+-	/* Small moves: 0..32 bytes.  */
++	/* Small copies: 0..32 bytes.  */
+ 	cmp	count, 16
+ 	b.lo	L(copy16)
+-	ldr	A_q, [src]
+-	ldr	B_q, [srcend, -16]
+-	str	A_q, [dstin]
+-	str	B_q, [dstend, -16]
++	ldp	A_l, A_h, [src]
++	ldp	D_l, D_h, [srcend, -16]
++	stp	A_l, A_h, [dstin]
++	stp	D_l, D_h, [dstend, -16]
+ 	ret
+ 
++	.p2align 4
+ L(move_long):
+ 	/* Only use backward copy if there is an overlap.  */
+ 	sub	tmp1, dstin, src
+-	cbz	tmp1, L(move0)
++	cbz	tmp1, L(copy0)
+ 	cmp	tmp1, count
+ 	b.hs	L(copy_long)
+ 
+ 	/* Large backwards copy for overlapping copies.
+-	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
+-L(copy_long_backwards):
+-	ldr	D_q, [srcend, -16]
+-	and	tmp1, srcend, 15
+-	bic	srcend, srcend, 15
++	   Copy 16 bytes and then align dst to 16-byte alignment.  */
++	ldp	D_l, D_h, [srcend, -16]
++	and	tmp1, dstend, 15
++	sub	srcend, srcend, tmp1
+ 	sub	count, count, tmp1
+-	ldp	A_q, B_q, [srcend, -32]
+-	str	D_q, [dstend, -16]
+-	ldp	C_q, D_q, [srcend, -64]
++	ldp	A_l, A_h, [srcend, -16]
++	stp	D_l, D_h, [dstend, -16]
++	ldp	B_l, B_h, [srcend, -32]
++	ldp	C_l, C_h, [srcend, -48]
++	ldp	D_l, D_h, [srcend, -64]!
+ 	sub	dstend, dstend, tmp1
+ 	subs	count, count, 128
+ 	b.ls	L(copy64_from_start)
+ 
+ L(loop64_backwards):
+-	str	B_q, [dstend, -16]
+-	str	A_q, [dstend, -32]
+-	ldp	A_q, B_q, [srcend, -96]
+-	str	D_q, [dstend, -48]
+-	str	C_q, [dstend, -64]!
+-	ldp	C_q, D_q, [srcend, -128]
+-	sub	srcend, srcend, 64
++	stp	A_l, A_h, [dstend, -16]
++	ldp	A_l, A_h, [srcend, -16]
++	stp	B_l, B_h, [dstend, -32]
++	ldp	B_l, B_h, [srcend, -32]
++	stp	C_l, C_h, [dstend, -48]
++	ldp	C_l, C_h, [srcend, -48]
++	stp	D_l, D_h, [dstend, -64]!
++	ldp	D_l, D_h, [srcend, -64]!
+ 	subs	count, count, 64
+ 	b.hi	L(loop64_backwards)
+ 
+ 	/* Write the last iteration and copy 64 bytes from the start.  */
+ L(copy64_from_start):
+-	ldp	E_q, F_q, [src, 32]
+-	stp	A_q, B_q, [dstend, -32]
+-	ldp	A_q, B_q, [src]
+-	stp	C_q, D_q, [dstend, -64]
+-	stp	E_q, F_q, [dstin, 32]
+-	stp	A_q, B_q, [dstin]
+-L(move0):
++	ldp	G_l, G_h, [src, 48]
++	stp	A_l, A_h, [dstend, -16]
++	ldp	A_l, A_h, [src, 32]
++	stp	B_l, B_h, [dstend, -32]
++	ldp	B_l, B_h, [src, 16]
++	stp	C_l, C_h, [dstend, -48]
++	ldp	C_l, C_h, [src]
++	stp	D_l, D_h, [dstend, -64]
++	stp	G_l, G_h, [dstin, 48]
++	stp	A_l, A_h, [dstin, 32]
++	stp	B_l, B_h, [dstin, 16]
++	stp	C_l, C_h, [dstin]
+ 	ret
+ 
+ END (MEMMOVE)
+-- 
+2.33.0
+
-- 
Gitee