diff --git a/AArch64-Optimize-memcmp.patch b/AArch64-Optimize-memcmp.patch
new file mode 100644
index 0000000000000000000000000000000000000000..68183e60b322c16e3bff14e12b55e5c005b8c27c
--- /dev/null
+++ b/AArch64-Optimize-memcmp.patch
@@ -0,0 +1,308 @@
+From 82dee12d3b6b11714a14ffb46886f693bc745ec6 Mon Sep 17 00:00:00 2001
+From: Wilco Dijkstra
+Date: Thu, 2 Dec 2021 18:30:55 +0000
+Subject: [PATCH] AArch64: Optimize memcmp
+
+Rewrite memcmp to improve performance. On small and medium inputs performance
+is 10-20% better. Large inputs use a SIMD loop processing 64 bytes per
+iteration, which is 30-50% faster depending on the size.
+
+Reviewed-by: Szabolcs Nagy
+(cherry picked from commit b51eb35c572b015641f03e3682c303f7631279b7)
+---
+ sysdeps/aarch64/memcmp.S | 241 ++++++++++++++++++++++-----------------
+ 1 file changed, 134 insertions(+), 107 deletions(-)
+
+diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
+index c1937f6f5c..c7d56a8af0 100644
+--- a/sysdeps/aarch64/memcmp.S
++++ b/sysdeps/aarch64/memcmp.S
+@@ -22,105 +22,79 @@
+ 
+ /* Assumptions:
+  *
+- * ARMv8-a, AArch64, unaligned accesses.
++ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+  */
+ 
+-/* Parameters and result. */
+-#define src1	x0
+-#define src2	x1
+-#define limit	x2
+-#define result	w0
+-
+-/* Internal variables. */
+-#define data1	x3
+-#define data1w	w3
+-#define data1h	x4
+-#define data2	x5
+-#define data2w	w5
+-#define data2h	x6
+-#define tmp1	x7
+-#define tmp2	x8
+-
+-ENTRY_ALIGN (memcmp, 6)
++#define src1	x0
++#define src2	x1
++#define limit	x2
++#define result	w0
++
++#define data1	x3
++#define data1w	w3
++#define data2	x4
++#define data2w	w4
++#define data3	x5
++#define data3w	w5
++#define data4	x6
++#define data4w	w6
++#define tmp	x6
++#define src1end	x7
++#define src2end	x8
++
++
++ENTRY (memcmp)
+ 	PTR_ARG (0)
+ 	PTR_ARG (1)
+ 	SIZE_ARG (2)
+ 
+-	subs	limit, limit, 16
++	cmp	limit, 16
+ 	b.lo	L(less16)
+-
+-	ldp	data1, data1h, [src1], 16
+-	ldp	data2, data2h, [src2], 16
++	ldp	data1, data3, [src1]
++	ldp	data2, data4, [src2]
+ 	ccmp	data1, data2, 0, ne
+-	ccmp	data1h, data2h, 0, eq
+-	b.ne	L(return64)
++	ccmp	data3, data4, 0, eq
++	b.ne	L(return2)
+ 
+-	subs	limit, limit, 16
++	add	src1end, src1, limit
++	add	src2end, src2, limit
++	cmp	limit, 32
+ 	b.ls	L(last_bytes)
+-	cmp	limit, 112
+-	b.lo	L(loop16)
+-
+-	and	tmp1, src1, 15
+-	add	limit, limit, tmp1
+-	sub	src1, src1, tmp1
+-	sub	src2, src2, tmp1
+-	subs	limit, limit, 48
++	cmp	limit, 160
++	b.hs	L(loop_align)
++	sub	limit, limit, 32
+ 
+-	/* Compare 128 up bytes using aligned access. */
+ 	.p2align 4
+-L(loop64):
+-	ldp	data1, data1h, [src1]
+-	ldp	data2, data2h, [src2]
+-	cmp	data1, data2
+-	ccmp	data1h, data2h, 0, eq
+-	b.ne	L(return64)
+-
+-	ldp	data1, data1h, [src1, 16]
+-	ldp	data2, data2h, [src2, 16]
+-	cmp	data1, data2
+-	ccmp	data1h, data2h, 0, eq
+-	b.ne	L(return64)
+-
+-	ldp	data1, data1h, [src1, 32]
+-	ldp	data2, data2h, [src2, 32]
+-	cmp	data1, data2
+-	ccmp	data1h, data2h, 0, eq
+-	b.ne	L(return64)
+-
+-	ldp	data1, data1h, [src1, 48]
+-	ldp	data2, data2h, [src2, 48]
++L(loop32):
++	ldp	data1, data3, [src1, 16]
++	ldp	data2, data4, [src2, 16]
+ 	cmp	data1, data2
+-	ccmp	data1h, data2h, 0, eq
+-	b.ne	L(return64)
++	ccmp	data3, data4, 0, eq
++	b.ne	L(return2)
++	cmp	limit, 16
++	b.ls	L(last_bytes)
+ 
+-	subs	limit, limit, 64
+-	add	src1, src1, 64
+-	add	src2, src2, 64
+-	b.pl	L(loop64)
+-	adds	limit, limit, 48
+-	b.lo	L(last_bytes)
+-
+-L(loop16):
+-	ldp	data1, data1h, [src1], 16
+-	ldp	data2, data2h, [src2], 16
++	ldp	data1, data3, [src1, 32]
++	ldp	data2, data4, [src2, 32]
+ 	cmp	data1, data2
+-	ccmp	data1h, data2h, 0, eq
+-	b.ne	L(return64)
++	ccmp	data3, data4, 0, eq
++	b.ne	L(return2)
++	add	src1, src1, 32
++	add	src2, src2, 32
++L(last64):
++	subs	limit, limit, 32
++	b.hi	L(loop32)
+ 
+-	subs	limit, limit, 16
+-	b.hi	L(loop16)
+ 	/* Compare last 1-16 bytes using unaligned access. */
+ L(last_bytes):
+-	add	src1, src1, limit
+-	add	src2, src2, limit
+-	ldp	data1, data1h, [src1]
+-	ldp	data2, data2h, [src2]
++	ldp	data1, data3, [src1end, -16]
++	ldp	data2, data4, [src2end, -16]
++L(return2):
++	cmp	data1, data2
++	csel	data1, data1, data3, ne
++	csel	data2, data2, data4, ne
+ 
+ 	/* Compare data bytes and set return value to 0, -1 or 1. */
+-L(return64):
+-	cmp	data1, data2
+-	csel	data1, data1, data1h, ne
+-	csel	data2, data2, data2h, ne
+ L(return):
+ #ifndef __AARCH64EB__
+ 	rev	data1, data1
+@@ -133,45 +107,98 @@ L(return):
+ 
+ 	.p2align 4
+ L(less16):
+-	adds	limit, limit, 8
+-	b.lo	L(less8)	//lo:<
++	add	src1end, src1, limit
++	add	src2end, src2, limit
++	tbz	limit, 3, L(less8)
+ 	ldr	data1, [src1]
+ 	ldr	data2, [src2]
+-	/* equal 8 optimized */
+-	ccmp	data1, data2, 0, ne
+-	b.ne	L(return)
+-
+-	ldr	data1, [src1, limit]
+-	ldr	data2, [src2, limit]
+-	b	L(return)
++	ldr	data3, [src1end, -8]
++	ldr	data4, [src2end, -8]
++	b	L(return2)
+ 
+ 	.p2align 4
+ L(less8):
+-	adds	limit, limit, 4
+-	b.lo	L(less4)
++	tbz	limit, 2, L(less4)
+ 	ldr	data1w, [src1]
+ 	ldr	data2w, [src2]
+-	ccmp	data1w, data2w, 0, ne
+-	b.ne	L(return)
+-	ldr	data1w, [src1, limit]
+-	ldr	data2w, [src2, limit]
+-	b	L(return)
++	ldr	data3w, [src1end, -4]
++	ldr	data4w, [src2end, -4]
++	b	L(return2)
+ 
+-	.p2align 4
+ L(less4):
+-	adds	limit, limit, 4
+-	b.eq	L(ret_0)
+-
+-L(byte_loop):
+-	ldrb	data1w, [src1], 1
+-	ldrb	data2w, [src2], 1
+-	subs	limit, limit, 1
+-	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000. */
+-	b.eq	L(byte_loop)
++	tbz	limit, 1, L(less2)
++	ldrh	data1w, [src1]
++	ldrh	data2w, [src2]
++	cmp	data1w, data2w
++	b.ne	L(return)
++L(less2):
++	mov	result, 0
++	tbz	limit, 0, L(return_zero)
++	ldrb	data1w, [src1end, -1]
++	ldrb	data2w, [src2end, -1]
+ 	sub	result, data1w, data2w
++L(return_zero):
+ 	ret
+-L(ret_0):
+-	mov	result, 0
++
++L(loop_align):
++	ldp	data1, data3, [src1, 16]
++	ldp	data2, data4, [src2, 16]
++	cmp	data1, data2
++	ccmp	data3, data4, 0, eq
++	b.ne	L(return2)
++
++	/* Align src2 and adjust src1, src2 and limit. */
++	and	tmp, src2, 15
++	sub	tmp, tmp, 16
++	sub	src2, src2, tmp
++	add	limit, limit, tmp
++	sub	src1, src1, tmp
++	sub	limit, limit, 64 + 16
++
++	.p2align 4
++L(loop64):
++	ldr	q0, [src1, 16]
++	ldr	q1, [src2, 16]
++	subs	limit, limit, 64
++	ldr	q2, [src1, 32]
++	ldr	q3, [src2, 32]
++	eor	v0.16b, v0.16b, v1.16b
++	eor	v1.16b, v2.16b, v3.16b
++	ldr	q2, [src1, 48]
++	ldr	q3, [src2, 48]
++	umaxp	v0.16b, v0.16b, v1.16b
++	ldr	q4, [src1, 64]!
++	ldr	q5, [src2, 64]!
++	eor	v1.16b, v2.16b, v3.16b
++	eor	v2.16b, v4.16b, v5.16b
++	umaxp	v1.16b, v1.16b, v2.16b
++	umaxp	v0.16b, v0.16b, v1.16b
++	umaxp	v0.16b, v0.16b, v0.16b
++	fmov	tmp, d0
++	ccmp	tmp, 0, 0, hi
++	b.eq	L(loop64)
++
++	/* If equal, process last 1-64 bytes using scalar loop. */
++	add	limit, limit, 64 + 16
++	cbz	tmp, L(last64)
++
++	/* Determine the 8-byte aligned offset of the first difference. */
++#ifdef __AARCH64EB__
++	rev16	tmp, tmp
++#endif
++	rev	tmp, tmp
++	clz	tmp, tmp
++	bic	tmp, tmp, 7
++	sub	tmp, tmp, 48
++	ldr	data1, [src1, tmp]
++	ldr	data2, [src2, tmp]
++#ifndef __AARCH64EB__
++	rev	data1, data1
++	rev	data2, data2
++#endif
++	mov	result, 1
++	cmp	data1, data2
++	cneg	result, result, lo
+ 	ret
+ 
+ END (memcmp)
+-- 
+2.27.0
+
diff --git a/glibc.spec b/glibc.spec
index a90938ae87e5d83161f818230719bd13c3034a18..52d72498434be57b4a0b0c3fed51c0e00b6a16e4 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -71,7 +71,7 @@
 ##############################################################################
 Name: glibc
 Version: 2.34
-Release: 164
+Release: 165
 Summary: The GNU libc libraries
 License: %{all_license}
 URL: http://www.gnu.org/software/glibc/
@@ -316,6 +316,7 @@ Patch224: backport-elf-Move-__rtld_malloc_init_stubs-call-into-_dl_star.patch
 Patch225: backport-elf-Handle-static-PIE-with-non-zero-load-address-BZ-.patch
 Patch226: backport-elf-Introduce-_dl_relocate_object_no_relro.patch
 Patch227: backport-elf-Switch-to-main-malloc-after-final-ld.so-self-rel.patch
+Patch228: AArch64-Optimize-memcmp.patch
 
 Patch9000: turn-default-value-of-x86_rep_stosb_threshold_form_2K_to_1M.patch
 Patch9001: delete-no-hard-link-to-avoid-all_language-package-to.patch
@@ -1546,6 +1547,9 @@ fi
 %endif
 
 %changelog
+* Mon Feb 10 2025 mayuhang - 2.34-165
+- AArch64: Optimize memcmp
+
 * Wed Dec 11 2024 taoyuxiang - 2.34-164
 - Change Inner-Net to Inner-Net-2.0
 - Change GFDL to GFDL-1.3-only
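
Side note on the technique, for review context only (this text is not part of the patch or of the glibc sources): the new L(loop64) detects a mismatch in each 64-byte block by XOR-ing the four 16-byte vector pairs and folding the results with UMAXP until a single 64-bit value remains, which is zero only when all 64 bytes match. The rough C sketch below shows that reduction with Advanced SIMD intrinsics; the helper name chunk64_differs and its signature are invented for illustration, and the real assembly additionally derives the 8-byte-aligned offset of the first difference via REV/CLZ, which the sketch does not reproduce.

/* Illustrative sketch only, not glibc code.  Compile for AArch64.  */
#include <arm_neon.h>
#include <stdint.h>

/* Return nonzero iff any of the next 64 bytes of s1 and s2 differ.  */
static inline uint64_t
chunk64_differs (const uint8_t *s1, const uint8_t *s2)
{
  /* XOR the four 16-byte pairs: a lane is nonzero where the bytes differ.  */
  uint8x16_t d0 = veorq_u8 (vld1q_u8 (s1),      vld1q_u8 (s2));
  uint8x16_t d1 = veorq_u8 (vld1q_u8 (s1 + 16), vld1q_u8 (s2 + 16));
  uint8x16_t d2 = veorq_u8 (vld1q_u8 (s1 + 32), vld1q_u8 (s2 + 32));
  uint8x16_t d3 = veorq_u8 (vld1q_u8 (s1 + 48), vld1q_u8 (s2 + 48));

  /* Pairwise byte maxima (UMAXP) fold 64 bytes of differences down to 16,
     then to a final 8-byte lane that is zero only if every byte matched.  */
  uint8x16_t m = vpmaxq_u8 (vpmaxq_u8 (d0, d1), vpmaxq_u8 (d2, d3));
  m = vpmaxq_u8 (m, m);
  return vgetq_lane_u64 (vreinterpretq_u64_u8 (m), 0);
}

A caller would run this check every 64 bytes and drop to a scalar tail once it returns nonzero or fewer than 64 bytes remain, mirroring the b.eq L(loop64) / cbz tmp, L(last64) structure in the assembly; the point of the UMAXP folding is that one scalar test (fmov + compare) per iteration suffices to cover all 64 bytes.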