From cac59fc30a95bb2691ee4785a57ee7c613ebcda2 Mon Sep 17 00:00:00 2001
From: liqingqing_1229
Date: Sat, 12 Jul 2025 02:20:08 +0000
Subject: [PATCH] revert "aarch64: Use memcpy_simd as the default memcpy" and
 use __memmove_generic for kunpeng920_tsv120

1. Revert commit e6f3fe362f1aab78b1448d69ecdbd9e3872636d3 because memcpy_simd
   causes a performance regression on many Arm cores:
   https://sourceware.org/bugzilla/show_bug.cgi?id=27437
2. Use __memmove_generic on kunpeng920 with the tsv120 micro architecture
   for better performance.

Signed-off-by: liqingqing_1229
(cherry picked from commit 3b4088f747f5533c2066f5891eee2999eb5ebcf0)
---
 ...eneric-when-kunpeng920-with-tsv120-m.patch |  26 ++
 glibc.spec                                    |   8 +-
 ...se-memcpy_simd-as-the-default-memcpy.patch | 318 ++++++++++++++++++
 3 files changed, 351 insertions(+), 1 deletion(-)
 create mode 100644 Using-__memmove_generic-when-kunpeng920-with-tsv120-m.patch
 create mode 100644 revert-aarch64-Use-memcpy_simd-as-the-default-memcpy.patch

diff --git a/Using-__memmove_generic-when-kunpeng920-with-tsv120-m.patch b/Using-__memmove_generic-when-kunpeng920-with-tsv120-m.patch
new file mode 100644
index 0000000..a5d7724
--- /dev/null
+++ b/Using-__memmove_generic-when-kunpeng920-with-tsv120-m.patch
@@ -0,0 +1,26 @@
+From 5ca322e4bdce79687f88ba62c1c1e5fd024f8dc6 Mon Sep 17 00:00:00 2001
+From: liqingqing
+Date: Sat, 12 Jul 2025 06:10:24 +0800
+Subject: [PATCH] Using __memmove_generic when kunpeng920 with tsv120 micro
+ architecture
+
+---
+ sysdeps/aarch64/multiarch/memmove.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
+index 6b771668..d438daa4 100644
+--- a/sysdeps/aarch64/multiarch/memmove.c
++++ b/sysdeps/aarch64/multiarch/memmove.c
+@@ -47,6 +47,8 @@ select_memmove_ifunc (void)
+     {
+       if (IS_A64FX (midr))
+ 	return __memmove_a64fx;
++      if (IS_KUNPENG920_TSV120 (midr))
++	return __memmove_generic;
+       return prefer_sve_ifuncs ? __memmove_sve : __memmove_generic;
+     }
+ 
+-- 
+2.33.0
+
diff --git a/glibc.spec b/glibc.spec
index 1e4f584..cf3a182 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -67,7 +67,7 @@
 ##############################################################################
 Name: glibc
 Version: 2.38
-Release: 61
+Release: 62
 Summary: The GNU libc libraries
 License: %{all_license}
 URL: http://www.gnu.org/software/glibc/
@@ -337,6 +337,8 @@ Patch9034: 0002-x86_64-Optimize-large-size-copy-in-memmove-ssse3.patch
 Patch9035: 0003-x86-Set-default-non_temporal_threshold-for-Zhaoxin-p.patch
 Patch9036: fix-CVE-2019-1010023.patch
 Patch9037: Using-__memcpy_generic-when-kunpeng920-with-tsv120-m.patch
+Patch9038: Using-__memmove_generic-when-kunpeng920-with-tsv120-m.patch
+Patch9039: revert-aarch64-Use-memcpy_simd-as-the-default-memcpy.patch
 
 Provides: ldconfig rtld(GNU_HASH) bundled(gnulib)
 
@@ -1517,6 +1519,10 @@ fi
 %endif
 
 %changelog
+* Sat Jul 12 2025 Qingqing Li - 2.38-62
+- aarch64: Using __memmove_generic when kunpeng920 with tsv120 micro architecture
+- aarch64: revert "aarch64: Use memcpy_simd as the default memcpy"
+
 * Mon Jul 07 2025 Qingqing Li - 2.38-61
 - aarch64: Using __memcpy_generic when kunpeng920 with tsv120 micro architecture
 
diff --git a/revert-aarch64-Use-memcpy_simd-as-the-default-memcpy.patch b/revert-aarch64-Use-memcpy_simd-as-the-default-memcpy.patch
new file mode 100644
index 0000000..76455b3
--- /dev/null
+++ b/revert-aarch64-Use-memcpy_simd-as-the-default-memcpy.patch
@@ -0,0 +1,318 @@
+From 9e87dccaffb020117fee8fb7eeffff5a2387f16f Mon Sep 17 00:00:00 2001
+From: liqingqing
+Date: Sat, 12 Jul 2025 06:05:35 +0800
+Subject: [PATCH] revert "aarch64: Use memcpy_simd as the default memcpy"
+
+This reverts commit e6f3fe362f1aab78b1448d69ecdbd9e3872636d3 because
+memcpy_simd causes a performance regression on many Arm cores:
+https://sourceware.org/bugzilla/show_bug.cgi?id=27437
+---
+ sysdeps/aarch64/memcpy.S | 192 ++++++++++++++++++++++-----------------
+ 1 file changed, 111 insertions(+), 81 deletions(-)
+
+diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
+index 304e7eda..0adc5246 100644
+--- a/sysdeps/aarch64/memcpy.S
++++ b/sysdeps/aarch64/memcpy.S
+@@ -1,5 +1,4 @@
+-/* Generic optimized memcpy using SIMD.
+-   Copyright (C) 2012-2023 Free Software Foundation, Inc.
++/* Copyright (C) 2012-2021 Free Software Foundation, Inc.
+ 
+    This file is part of the GNU C Library.
+ 
+@@ -21,7 +20,7 @@
+ 
+ /* Assumptions:
+  *
+- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
++ * ARMv8-a, AArch64, unaligned accesses.
+  *
+  */
+ 
+@@ -37,18 +36,21 @@
+ #define B_l	x8
+ #define B_lw	w8
+ #define B_h	x9
++#define C_l	x10
+ #define C_lw	w10
++#define C_h	x11
++#define D_l	x12
++#define D_h	x13
++#define E_l	x14
++#define E_h	x15
++#define F_l	x16
++#define F_h	x17
++#define G_l	count
++#define G_h	dst
++#define H_l	src
++#define H_h	srcend
+ #define tmp1	x14
+ 
+-#define A_q	q0
+-#define B_q	q1
+-#define C_q	q2
+-#define D_q	q3
+-#define E_q	q4
+-#define F_q	q5
+-#define G_q	q6
+-#define H_q	q7
+-
+ #ifndef MEMMOVE
+ # define MEMMOVE memmove
+ #endif
+@@ -67,9 +69,10 @@
+    Large copies use a software pipelined loop processing 64 bytes per
+    iteration.  The destination pointer is 16-byte aligned to minimize
+    unaligned accesses.  The loop tail is handled by always copying 64 bytes
+-   from the end.  */
++   from the end.
++*/
+ 
+-ENTRY (MEMCPY)
++ENTRY_ALIGN (MEMCPY, 6)
+ 	PTR_ARG (0)
+ 	PTR_ARG (1)
+ 	SIZE_ARG (2)
+@@ -84,10 +87,10 @@ ENTRY (MEMCPY)
+ 	/* Small copies: 0..32 bytes.  */
+ 	cmp	count, 16
+ 	b.lo	L(copy16)
+-	ldr	A_q, [src]
+-	ldr	B_q, [srcend, -16]
+-	str	A_q, [dstin]
+-	str	B_q, [dstend, -16]
++	ldp	A_l, A_h, [src]
++	ldp	D_l, D_h, [srcend, -16]
++	stp	A_l, A_h, [dstin]
++	stp	D_l, D_h, [dstend, -16]
+ 	ret
+ 
+ 	/* Copy 8-15 bytes.  */
+@@ -99,6 +102,7 @@ L(copy16):
+ 	str	A_h, [dstend, -8]
+ 	ret
+ 
++	.p2align 3
+ 	/* Copy 4-7 bytes.  */
+ L(copy8):
+ 	tbz	count, 2, L(copy4)
+@@ -124,69 +128,87 @@ L(copy0):
+ 	.p2align 4
+ 	/* Medium copies: 33..128 bytes.  */
+ L(copy32_128):
+-	ldp	A_q, B_q, [src]
+-	ldp	C_q, D_q, [srcend, -32]
++	ldp	A_l, A_h, [src]
++	ldp	B_l, B_h, [src, 16]
++	ldp	C_l, C_h, [srcend, -32]
++	ldp	D_l, D_h, [srcend, -16]
+ 	cmp	count, 64
+ 	b.hi	L(copy128)
+-	stp	A_q, B_q, [dstin]
+-	stp	C_q, D_q, [dstend, -32]
++	stp	A_l, A_h, [dstin]
++	stp	B_l, B_h, [dstin, 16]
++	stp	C_l, C_h, [dstend, -32]
++	stp	D_l, D_h, [dstend, -16]
+ 	ret
+ 
+ 	.p2align 4
+ 	/* Copy 65..128 bytes.  */
+ L(copy128):
+-	ldp	E_q, F_q, [src, 32]
++	ldp	E_l, E_h, [src, 32]
++	ldp	F_l, F_h, [src, 48]
+ 	cmp	count, 96
+ 	b.ls	L(copy96)
+-	ldp	G_q, H_q, [srcend, -64]
+-	stp	G_q, H_q, [dstend, -64]
++	ldp	G_l, G_h, [srcend, -64]
++	ldp	H_l, H_h, [srcend, -48]
++	stp	G_l, G_h, [dstend, -64]
++	stp	H_l, H_h, [dstend, -48]
+ L(copy96):
+-	stp	A_q, B_q, [dstin]
+-	stp	E_q, F_q, [dstin, 32]
+-	stp	C_q, D_q, [dstend, -32]
++	stp	A_l, A_h, [dstin]
++	stp	B_l, B_h, [dstin, 16]
++	stp	E_l, E_h, [dstin, 32]
++	stp	F_l, F_h, [dstin, 48]
++	stp	C_l, C_h, [dstend, -32]
++	stp	D_l, D_h, [dstend, -16]
+ 	ret
+ 
+-	/* Align loop64 below to 16 bytes.  */
+-	nop
+-
++	.p2align 4
+ 	/* Copy more than 128 bytes.  */
+ L(copy_long):
+-	/* Copy 16 bytes and then align src to 16-byte alignment.  */
+-	ldr	D_q, [src]
+-	and	tmp1, src, 15
+-	bic	src, src, 15
+-	sub	dst, dstin, tmp1
++	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
++	ldp	D_l, D_h, [src]
++	and	tmp1, dstin, 15
++	bic	dst, dstin, 15
++	sub	src, src, tmp1
+ 	add	count, count, tmp1	/* Count is now 16 too large.  */
+-	ldp	A_q, B_q, [src, 16]
+-	str	D_q, [dstin]
+-	ldp	C_q, D_q, [src, 48]
++	ldp	A_l, A_h, [src, 16]
++	stp	D_l, D_h, [dstin]
++	ldp	B_l, B_h, [src, 32]
++	ldp	C_l, C_h, [src, 48]
++	ldp	D_l, D_h, [src, 64]!
+ 	subs	count, count, 128 + 16	/* Test and readjust count.  */
+ 	b.ls	L(copy64_from_end)
++
+ L(loop64):
+-	stp	A_q, B_q, [dst, 16]
+-	ldp	A_q, B_q, [src, 80]
+-	stp	C_q, D_q, [dst, 48]
+-	ldp	C_q, D_q, [src, 112]
+-	add	src, src, 64
+-	add	dst, dst, 64
++	stp	A_l, A_h, [dst, 16]
++	ldp	A_l, A_h, [src, 16]
++	stp	B_l, B_h, [dst, 32]
++	ldp	B_l, B_h, [src, 32]
++	stp	C_l, C_h, [dst, 48]
++	ldp	C_l, C_h, [src, 48]
++	stp	D_l, D_h, [dst, 64]!
++	ldp	D_l, D_h, [src, 64]!
+ 	subs	count, count, 64
+ 	b.hi	L(loop64)
+ 
+ 	/* Write the last iteration and copy 64 bytes from the end.  */
+ L(copy64_from_end):
+-	ldp	E_q, F_q, [srcend, -64]
+-	stp	A_q, B_q, [dst, 16]
+-	ldp	A_q, B_q, [srcend, -32]
+-	stp	C_q, D_q, [dst, 48]
+-	stp	E_q, F_q, [dstend, -64]
+-	stp	A_q, B_q, [dstend, -32]
++	ldp	E_l, E_h, [srcend, -64]
++	stp	A_l, A_h, [dst, 16]
++	ldp	A_l, A_h, [srcend, -48]
++	stp	B_l, B_h, [dst, 32]
++	ldp	B_l, B_h, [srcend, -32]
++	stp	C_l, C_h, [dst, 48]
++	ldp	C_l, C_h, [srcend, -16]
++	stp	D_l, D_h, [dst, 64]
++	stp	E_l, E_h, [dstend, -64]
++	stp	A_l, A_h, [dstend, -48]
++	stp	B_l, B_h, [dstend, -32]
++	stp	C_l, C_h, [dstend, -16]
+ 	ret
+ 
+ END (MEMCPY)
+ libc_hidden_builtin_def (MEMCPY)
+ 
+-
+-ENTRY (MEMMOVE)
++ENTRY_ALIGN (MEMMOVE, 4)
+ 	PTR_ARG (0)
+ 	PTR_ARG (1)
+ 	SIZE_ARG (2)
+@@ -198,56 +220,64 @@ ENTRY (MEMMOVE)
+ 	cmp	count, 32
+ 	b.hi	L(copy32_128)
+ 
+-	/* Small moves: 0..32 bytes.  */
++	/* Small copies: 0..32 bytes.  */
+ 	cmp	count, 16
+ 	b.lo	L(copy16)
+-	ldr	A_q, [src]
+-	ldr	B_q, [srcend, -16]
+-	str	A_q, [dstin]
+-	str	B_q, [dstend, -16]
++	ldp	A_l, A_h, [src]
++	ldp	D_l, D_h, [srcend, -16]
++	stp	A_l, A_h, [dstin]
++	stp	D_l, D_h, [dstend, -16]
+ 	ret
+ 
++	.p2align 4
+ L(move_long):
+ 	/* Only use backward copy if there is an overlap.  */
+ 	sub	tmp1, dstin, src
+-	cbz	tmp1, L(move0)
++	cbz	tmp1, L(copy0)
+ 	cmp	tmp1, count
+ 	b.hs	L(copy_long)
+ 
+ 	/* Large backwards copy for overlapping copies.
+-	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
+-L(copy_long_backwards):
+-	ldr	D_q, [srcend, -16]
+-	and	tmp1, srcend, 15
+-	bic	srcend, srcend, 15
++	   Copy 16 bytes and then align dst to 16-byte alignment.  */
++	ldp	D_l, D_h, [srcend, -16]
++	and	tmp1, dstend, 15
++	sub	srcend, srcend, tmp1
+ 	sub	count, count, tmp1
+-	ldp	A_q, B_q, [srcend, -32]
+-	str	D_q, [dstend, -16]
+-	ldp	C_q, D_q, [srcend, -64]
++	ldp	A_l, A_h, [srcend, -16]
++	stp	D_l, D_h, [dstend, -16]
++	ldp	B_l, B_h, [srcend, -32]
++	ldp	C_l, C_h, [srcend, -48]
++	ldp	D_l, D_h, [srcend, -64]!
+ 	sub	dstend, dstend, tmp1
+ 	subs	count, count, 128
+ 	b.ls	L(copy64_from_start)
+ 
+ L(loop64_backwards):
+-	str	B_q, [dstend, -16]
+-	str	A_q, [dstend, -32]
+-	ldp	A_q, B_q, [srcend, -96]
+-	str	D_q, [dstend, -48]
+-	str	C_q, [dstend, -64]!
+-	ldp	C_q, D_q, [srcend, -128]
+-	sub	srcend, srcend, 64
++	stp	A_l, A_h, [dstend, -16]
++	ldp	A_l, A_h, [srcend, -16]
++	stp	B_l, B_h, [dstend, -32]
++	ldp	B_l, B_h, [srcend, -32]
++	stp	C_l, C_h, [dstend, -48]
++	ldp	C_l, C_h, [srcend, -48]
++	stp	D_l, D_h, [dstend, -64]!
++	ldp	D_l, D_h, [srcend, -64]!
+ 	subs	count, count, 64
+ 	b.hi	L(loop64_backwards)
+ 
+ 	/* Write the last iteration and copy 64 bytes from the start.  */
+ L(copy64_from_start):
+-	ldp	E_q, F_q, [src, 32]
+-	stp	A_q, B_q, [dstend, -32]
+-	ldp	A_q, B_q, [src]
+-	stp	C_q, D_q, [dstend, -64]
+-	stp	E_q, F_q, [dstin, 32]
+-	stp	A_q, B_q, [dstin]
+-L(move0):
++	ldp	G_l, G_h, [src, 48]
++	stp	A_l, A_h, [dstend, -16]
++	ldp	A_l, A_h, [src, 32]
++	stp	B_l, B_h, [dstend, -32]
++	ldp	B_l, B_h, [src, 16]
++	stp	C_l, C_h, [dstend, -48]
++	ldp	C_l, C_h, [src]
++	stp	D_l, D_h, [dstend, -64]
++	stp	G_l, G_h, [dstin, 48]
++	stp	A_l, A_h, [dstin, 32]
++	stp	B_l, B_h, [dstin, 16]
++	stp	C_l, C_h, [dstin]
+ 	ret
+ 
+ END (MEMMOVE)
+-- 
+2.33.0
+
-- 
Gitee