diff --git a/1-5-AArch64-Improve-A64FX-memset-for-small-sizes.patch b/1-5-AArch64-Improve-A64FX-memset-for-small-sizes.patch
new file mode 100644
index 0000000000000000000000000000000000000000..2db004518382383d0814b1acc240f24ba6fcd6f2
--- /dev/null
+++ b/1-5-AArch64-Improve-A64FX-memset-for-small-sizes.patch
@@ -0,0 +1,136 @@
+From 07b427296b8d59f439144029d9a948f6c1ce0a31 Mon Sep 17 00:00:00 2001
+From: Wilco Dijkstra
+Date: Tue, 10 Aug 2021 13:30:27 +0100
+Subject: [PATCH] [1/5] AArch64: Improve A64FX memset for small sizes
+
+Improve performance of small memsets by reducing instruction counts and
+improving code alignment. Bench-memset shows 35-45% performance gain for
+small sizes.
+
+Reviewed-by: Naohiro Tamura
+---
+ sysdeps/aarch64/multiarch/memset_a64fx.S | 96 ++++++++++++--------------------
+ 1 file changed, 36 insertions(+), 60 deletions(-)
+
+diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
+index ce54e54..cf3d402 100644
+--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
++++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
+@@ -51,78 +51,54 @@
+ 	.endm
+ 
+ 	.macro st1b_unroll first=0, last=7
+-	st1b	z0.b, p0, [dst, #\first, mul vl]
++	st1b	z0.b, p0, [dst, \first, mul vl]
+ 	.if \last-\first
+ 	st1b_unroll "(\first+1)", \last
+ 	.endif
+ 	.endm
+ 
+-	.macro shortcut_for_small_size exit
+-	// if rest <= vector_length * 2
+-	whilelo	p0.b, xzr, count
+-	whilelo	p1.b, vector_length, count
+-	b.last	1f
+-	st1b	z0.b, p0, [dstin, #0, mul vl]
+-	st1b	z0.b, p1, [dstin, #1, mul vl]
+-	ret
+-1:	// if rest > vector_length * 8
+-	cmp	count, vector_length, lsl 3	// vector_length * 8
+-	b.hi	\exit
+-	// if rest <= vector_length * 4
+-	lsl	tmp1, vector_length, 1	// vector_length * 2
+-	whilelo	p2.b, tmp1, count
+-	incb	tmp1
+-	whilelo	p3.b, tmp1, count
+-	b.last	1f
+-	st1b	z0.b, p0, [dstin, #0, mul vl]
+-	st1b	z0.b, p1, [dstin, #1, mul vl]
+-	st1b	z0.b, p2, [dstin, #2, mul vl]
+-	st1b	z0.b, p3, [dstin, #3, mul vl]
+-	ret
+-1:	// if rest <= vector_length * 8
+-	lsl	tmp1, vector_length, 2	// vector_length * 4
+-	whilelo	p4.b, tmp1, count
+-	incb	tmp1
+-	whilelo	p5.b, tmp1, count
+-	b.last	1f
+-	st1b	z0.b, p0, [dstin, #0, mul vl]
+-	st1b	z0.b, p1, [dstin, #1, mul vl]
+-	st1b	z0.b, p2, [dstin, #2, mul vl]
+-	st1b	z0.b, p3, [dstin, #3, mul vl]
+-	st1b	z0.b, p4, [dstin, #4, mul vl]
+-	st1b	z0.b, p5, [dstin, #5, mul vl]
+-	ret
+-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
+-	incb	tmp1	// vector_length * 5
+-	incb	tmp1	// vector_length * 6
+-	whilelo	p6.b, tmp1, count
+-	incb	tmp1
+-	whilelo	p7.b, tmp1, count
+-	st1b	z0.b, p0, [dstin, #0, mul vl]
+-	st1b	z0.b, p1, [dstin, #1, mul vl]
+-	st1b	z0.b, p2, [dstin, #2, mul vl]
+-	st1b	z0.b, p3, [dstin, #3, mul vl]
+-	st1b	z0.b, p4, [dstin, #4, mul vl]
+-	st1b	z0.b, p5, [dstin, #5, mul vl]
+-	st1b	z0.b, p6, [dstin, #6, mul vl]
+-	st1b	z0.b, p7, [dstin, #7, mul vl]
+-	ret
+-	.endm
+ 
+-ENTRY (MEMSET)
++#undef BTI_C
++#define BTI_C
+ 
++ENTRY (MEMSET)
+ 	PTR_ARG (0)
+ 	SIZE_ARG (2)
+ 
+-	cbnz	count, 1f
+-	ret
+-1:	dup	z0.b, valw
+ 	cntb	vector_length
+-	// shortcut for less than vector_length * 8
+-	// gives a free ptrue to p0.b for n >= vector_length
+-	shortcut_for_small_size L(vl_agnostic)
+-	// end of shortcut
++	dup	z0.b, valw
++	whilelo	p0.b, vector_length, count
++	b.last	1f
++	whilelo	p1.b, xzr, count
++	st1b	z0.b, p1, [dstin, 0, mul vl]
++	st1b	z0.b, p0, [dstin, 1, mul vl]
++	ret
++
++	// count >= vector_length * 2
++1:	cmp	count, vector_length, lsl 2
++	add	dstend, dstin, count
++	b.hi	1f
++	st1b	z0.b, p0, [dstin, 0, mul vl]
++	st1b	z0.b, p0, [dstin, 1, mul vl]
++	st1b	z0.b, p0, [dstend, -2, mul vl]
++	st1b	z0.b, p0, [dstend, -1, mul vl]
++	ret
++
++	// count > vector_length * 4
++1:	lsl	tmp1, vector_length, 3
++	cmp	count, tmp1
++	b.hi	L(vl_agnostic)
++	st1b	z0.b, p0, [dstin, 0, mul vl]
++	st1b	z0.b, p0, [dstin, 1, mul vl]
++	st1b	z0.b, p0, [dstin, 2, mul vl]
++	st1b	z0.b, p0, [dstin, 3, mul vl]
++	st1b	z0.b, p0, [dstend, -4, mul vl]
++	st1b	z0.b, p0, [dstend, -3, mul vl]
++	st1b	z0.b, p0, [dstend, -2, mul vl]
++	st1b	z0.b, p0, [dstend, -1, mul vl]
++	ret
+ 
++	.p2align 4
+ L(vl_agnostic): // VL Agnostic
+ 	mov	rest, count
+ 	mov	dst, dstin
+-- 
+1.8.3.1
+
diff --git a/2-5-AArch64-Improve-A64FX-memset-for-large-sizes.patch b/2-5-AArch64-Improve-A64FX-memset-for-large-sizes.patch
new file mode 100644
index 0000000000000000000000000000000000000000..81cdbe038153bf7f08099f86bacb538ab90c8550
--- /dev/null
+++ b/2-5-AArch64-Improve-A64FX-memset-for-large-sizes.patch
@@ -0,0 +1,131 @@
+From 9bc2ed8f46d80859a5596789cc9e8cc2de84b0e7 Mon Sep 17 00:00:00 2001
+From: Wilco Dijkstra
+Date: Tue, 10 Aug 2021 13:39:37 +0100
+Subject: [PATCH] [2/5] AArch64: Improve A64FX memset for large sizes
+
+Improve performance of large memsets. Simplify alignment code. For zero memset
+use DC ZVA, which almost doubles performance. For non-zero memsets use the
+unroll8 loop which is about 10% faster.
+
+Reviewed-by: Naohiro Tamura
+---
+ sysdeps/aarch64/multiarch/memset_a64fx.S | 85 ++++++++++----------------------
+ 1 file changed, 25 insertions(+), 60 deletions(-)
+
+diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
+index cf3d402..75cf43a 100644
+--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
++++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
+@@ -27,14 +27,11 @@
+  */
+ 
+ #define L1_SIZE		(64*1024)	// L1 64KB
+-#define L2_SIZE		(8*1024*1024)	// L2 8MB - 1MB
++#define L2_SIZE		(8*1024*1024)	// L2 8MB
+ #define CACHE_LINE_SIZE	256
+ #define PF_DIST_L1	(CACHE_LINE_SIZE * 16)	// Prefetch distance L1
+-#define ZF_DIST	(CACHE_LINE_SIZE * 21)	// Zerofill distance
+-#define rest		x8
++#define rest		x2
+ #define vector_length	x9
+-#define vl_remainder	x10	// vector_length remainder
+-#define cl_remainder	x11	// CACHE_LINE_SIZE remainder
+ 
+ #if HAVE_AARCH64_SVE_ASM
+ # if IS_IN (libc)
+@@ -42,14 +39,6 @@
+ 
+ 	.arch armv8.2-a+sve
+ 
+-	.macro dc_zva times
+-	dc	zva, tmp1
+-	add	tmp1, tmp1, CACHE_LINE_SIZE
+-	.if \times-1
+-	dc_zva "(\times-1)"
+-	.endif
+-	.endm
+-
+ 	.macro st1b_unroll first=0, last=7
+ 	st1b	z0.b, p0, [dst, \first, mul vl]
+ 	.if \last-\first
+@@ -188,54 +177,30 @@ L(L1_prefetch): // if rest >= L1_SIZE
+ 	cbnz	rest, L(unroll32)
+ 	ret
+ 
+-L(L2):
+-	// align dst address at vector_length byte boundary
+-	sub	tmp1, vector_length, 1
+-	ands	tmp2, dst, tmp1
+-	// if vl_remainder == 0
+-	b.eq	1f
+-	sub	vl_remainder, vector_length, tmp2
+-	// process remainder until the first vector_length boundary
+-	whilelt	p2.b, xzr, vl_remainder
+-	st1b	z0.b, p2, [dst]
+-	add	dst, dst, vl_remainder
+-	sub	rest, rest, vl_remainder
+-	// align dstin address at CACHE_LINE_SIZE byte boundary
+-1:	mov	tmp1, CACHE_LINE_SIZE
+-	ands	tmp2, dst, CACHE_LINE_SIZE - 1
+-	// if cl_remainder == 0
+-	b.eq	L(L2_dc_zva)
+-	sub	cl_remainder, tmp1, tmp2
+-	// process remainder until the first CACHE_LINE_SIZE boundary
+-	mov	tmp1, xzr	// index
+-2:	whilelt	p2.b, tmp1, cl_remainder
+-	st1b	z0.b, p2, [dst, tmp1]
+-	incb	tmp1
+-	cmp	tmp1, cl_remainder
+-	b.lo	2b
+-	add	dst, dst, cl_remainder
+-	sub	rest, rest, cl_remainder
+-
+-L(L2_dc_zva):
+-	// zero fill
+-	mov	tmp1, dst
+-	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
+-	mov	zva_len, ZF_DIST
+-	add	tmp1, zva_len, CACHE_LINE_SIZE * 2
+-	// unroll
++	// count >= L2_SIZE
+ 	.p2align 3
+-1:	st1b_unroll 0, 3
+-	add	tmp2, dst, zva_len
+-	dc	zva, tmp2
+-	st1b_unroll 4, 7
+-	add	tmp2, tmp2, CACHE_LINE_SIZE
+-	dc	zva, tmp2
+-	add	dst, dst, CACHE_LINE_SIZE * 2
+-	sub	rest, rest, CACHE_LINE_SIZE * 2
+-	cmp	rest, tmp1	// ZF_DIST + CACHE_LINE_SIZE * 2
+-	b.ge	1b
+-	cbnz	rest, L(unroll8)
+-	ret
++L(L2):
++	tst	valw, 255
++	b.ne	L(unroll8)
++	// align dst to CACHE_LINE_SIZE byte boundary
++	and	tmp2, dst, CACHE_LINE_SIZE - 1
++	st1b	z0.b, p0, [dst, 0, mul vl]
++	st1b	z0.b, p0, [dst, 1, mul vl]
++	st1b	z0.b, p0, [dst, 2, mul vl]
++	st1b	z0.b, p0, [dst, 3, mul vl]
++	sub	dst, dst, tmp2
++	add	count, count, tmp2
++
++	// clear cachelines using DC ZVA
++	sub	count, count, CACHE_LINE_SIZE * 2
++	.p2align 4
++1:	add	dst, dst, CACHE_LINE_SIZE
++	dc	zva, dst
++	subs	count, count, CACHE_LINE_SIZE
++	b.hi	1b
++	add	count, count, CACHE_LINE_SIZE
++	add	dst, dst, CACHE_LINE_SIZE
++	b	L(last)
+ 
+ END (MEMSET)
+ libc_hidden_builtin_def (MEMSET)
+-- 
+1.8.3.1
+
diff --git a/3-5-AArch64-Improve-A64FX-memset-for-remaining-bytes.patch b/3-5-AArch64-Improve-A64FX-memset-for-remaining-bytes.patch
new file mode 100644
index 0000000000000000000000000000000000000000..7ba35160157681add3051dc033c2782f84c22cb9
--- /dev/null
+++ b/3-5-AArch64-Improve-A64FX-memset-for-remaining-bytes.patch
@@ -0,0 +1,80 @@
+From 186092c6ba8825598ffdbf15dbf0823c771f560d Mon Sep 17 00:00:00 2001
+From: Wilco Dijkstra
+Date: Tue, 10 Aug 2021 13:42:07 +0100
+Subject: [PATCH] [3/5] AArch64: Improve A64FX memset for remaining bytes
+
+Simplify handling of remaining bytes. Avoid lots of taken branches and complex
+whilelo computations, instead unconditionally write vectors from the end.
+
+Reviewed-by: Naohiro Tamura
+---
+ sysdeps/aarch64/multiarch/memset_a64fx.S | 46 +++++++++----------------------
+ 1 file changed, 13 insertions(+), 33 deletions(-)
+
+diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
+index 75cf43a..337c86b 100644
+--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
++++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
+@@ -130,38 +130,19 @@ L(unroll8):
+ 	b	1b
+ 
+ L(last):
+-	whilelo	p0.b, xzr, rest
+-	whilelo	p1.b, vector_length, rest
+-	b.last	1f
+-	st1b	z0.b, p0, [dst, #0, mul vl]
+-	st1b	z0.b, p1, [dst, #1, mul vl]
+-	ret
+-1:	lsl	tmp1, vector_length, 1	// vector_length * 2
+-	whilelo	p2.b, tmp1, rest
+-	incb	tmp1
+-	whilelo	p3.b, tmp1, rest
+-	b.last	1f
+-	st1b	z0.b, p0, [dst, #0, mul vl]
+-	st1b	z0.b, p1, [dst, #1, mul vl]
+-	st1b	z0.b, p2, [dst, #2, mul vl]
+-	st1b	z0.b, p3, [dst, #3, mul vl]
+-	ret
+-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
+-	whilelo	p4.b, tmp1, rest
+-	incb	tmp1
+-	whilelo	p5.b, tmp1, rest
+-	incb	tmp1
+-	whilelo	p6.b, tmp1, rest
+-	incb	tmp1
+-	whilelo	p7.b, tmp1, rest
+-	st1b	z0.b, p0, [dst, #0, mul vl]
+-	st1b	z0.b, p1, [dst, #1, mul vl]
+-	st1b	z0.b, p2, [dst, #2, mul vl]
+-	st1b	z0.b, p3, [dst, #3, mul vl]
+-	st1b	z0.b, p4, [dst, #4, mul vl]
+-	st1b	z0.b, p5, [dst, #5, mul vl]
+-	st1b	z0.b, p6, [dst, #6, mul vl]
+-	st1b	z0.b, p7, [dst, #7, mul vl]
++	cmp	count, vector_length, lsl 1
++	b.ls	2f
++	add	tmp2, vector_length, vector_length, lsl 2
++	cmp	count, tmp2
++	b.ls	5f
++	st1b	z0.b, p0, [dstend, -8, mul vl]
++	st1b	z0.b, p0, [dstend, -7, mul vl]
++	st1b	z0.b, p0, [dstend, -6, mul vl]
++5:	st1b	z0.b, p0, [dstend, -5, mul vl]
++	st1b	z0.b, p0, [dstend, -4, mul vl]
++	st1b	z0.b, p0, [dstend, -3, mul vl]
++2:	st1b	z0.b, p0, [dstend, -2, mul vl]
++	st1b	z0.b, p0, [dstend, -1, mul vl]
+ 	ret
+ 
+ L(L1_prefetch): // if rest >= L1_SIZE
+@@ -199,7 +180,6 @@ L(L2):
+ 	subs	count, count, CACHE_LINE_SIZE
+ 	b.hi	1b
+ 	add	count, count, CACHE_LINE_SIZE
+-	add	dst, dst, CACHE_LINE_SIZE
+ 	b	L(last)
+ 
+ END (MEMSET)
+-- 
+1.8.3.1
+
diff --git a/4-5-AArch64-Improve-A64FX-memset-by-removing-unroll3.patch b/4-5-AArch64-Improve-A64FX-memset-by-removing-unroll3.patch
new file mode 100644
index 0000000000000000000000000000000000000000..fd176710033f0dd03a04542b998b246a0349a6c6
--- /dev/null
+++ b/4-5-AArch64-Improve-A64FX-memset-by-removing-unroll3.patch
@@ -0,0 +1,51 @@
+From e69d9981f858a38e19304e6ff5ebdf89f2cb0ba0 Mon Sep 17 00:00:00 2001
+From: Wilco Dijkstra
+Date: Tue, 10 Aug 2021 13:44:27 +0100
+Subject: [PATCH] [4/5] AArch64: Improve A64FX memset by removing unroll32
+
+Remove unroll32 code since it doesn't improve performance.
+
+Reviewed-by: Naohiro Tamura
+---
+ sysdeps/aarch64/multiarch/memset_a64fx.S | 18 +-----------------
+ 1 file changed, 1 insertion(+), 17 deletions(-)
+
+diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
+index 337c86b..ef03156 100644
+--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
++++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
+@@ -102,22 +102,6 @@ L(vl_agnostic): // VL Agnostic
+ 	ccmp	vector_length, tmp1, 0, cs
+ 	b.eq	L(L1_prefetch)
+ 
+-L(unroll32):
+-	lsl	tmp1, vector_length, 3	// vector_length * 8
+-	lsl	tmp2, vector_length, 5	// vector_length * 32
+-	.p2align 3
+-1:	cmp	rest, tmp2
+-	b.cc	L(unroll8)
+-	st1b_unroll
+-	add	dst, dst, tmp1
+-	st1b_unroll
+-	add	dst, dst, tmp1
+-	st1b_unroll
+-	add	dst, dst, tmp1
+-	st1b_unroll
+-	add	dst, dst, tmp1
+-	sub	rest, rest, tmp2
+-	b	1b
+ 
+ L(unroll8):
+ 	lsl	tmp1, vector_length, 3
+@@ -155,7 +139,7 @@ L(L1_prefetch): // if rest >= L1_SIZE
+ 	sub	rest, rest, CACHE_LINE_SIZE * 2
+ 	cmp	rest, L1_SIZE
+ 	b.ge	1b
+-	cbnz	rest, L(unroll32)
++	cbnz	rest, L(unroll8)
+ 	ret
+ 
+ 	// count >= L2_SIZE
+-- 
+1.8.3.1
+
diff --git a/5-5-AArch64-Improve-A64FX-memset-medium-loops.patch b/5-5-AArch64-Improve-A64FX-memset-medium-loops.patch
new file mode 100644
index 0000000000000000000000000000000000000000..f8bc03e6150c98185e227fea69034114e97cbfdb
--- /dev/null
+++ b/5-5-AArch64-Improve-A64FX-memset-medium-loops.patch
@@ -0,0 +1,96 @@
+From a5db6a5cae6a92d1675c013e5c8d972768721576 Mon Sep 17 00:00:00 2001
+From: Wilco Dijkstra
+Date: Tue, 10 Aug 2021 13:46:20 +0100
+Subject: [PATCH] [5/5] AArch64: Improve A64FX memset medium loops
+
+Simplify the code for memsets smaller than L1. Improve the unroll8 and
+L1_prefetch loops.
+
+Reviewed-by: Naohiro Tamura
+---
+ sysdeps/aarch64/multiarch/memset_a64fx.S | 45 ++++++++++++++------------------
+ 1 file changed, 19 insertions(+), 26 deletions(-)
+
+diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
+index ef03156..7bf759b 100644
+--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
++++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
+@@ -30,7 +30,6 @@
+ #define L2_SIZE		(8*1024*1024)	// L2 8MB
+ #define CACHE_LINE_SIZE	256
+ #define PF_DIST_L1	(CACHE_LINE_SIZE * 16)	// Prefetch distance L1
+-#define rest		x2
+ #define vector_length	x9
+ 
+ #if HAVE_AARCH64_SVE_ASM
+@@ -89,29 +88,19 @@ ENTRY (MEMSET)
+ 
+ 	.p2align 4
+ L(vl_agnostic): // VL Agnostic
+-	mov	rest, count
+ 	mov	dst, dstin
+-	add	dstend, dstin, count
+-	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
+-	mov	tmp1, 64
+-	cmp	rest, L2_SIZE
+-	ccmp	vector_length, tmp1, 0, cs
+-	b.eq	L(L2)
+-	// if rest >= L1_SIZE && vector_length == 64 then L(L1_prefetch)
+-	cmp	rest, L1_SIZE
+-	ccmp	vector_length, tmp1, 0, cs
+-	b.eq	L(L1_prefetch)
+-
++	cmp	count, L1_SIZE
++	b.hi	L(L1_prefetch)
+ 
++	// count >= 8 * vector_length
+ L(unroll8):
+-	lsl	tmp1, vector_length, 3
+-	.p2align 3
+-1:	cmp	rest, tmp1
+-	b.cc	L(last)
+-	st1b_unroll
++	sub	count, count, tmp1
++	.p2align 4
++1:	st1b_unroll 0, 7
+ 	add	dst, dst, tmp1
+-	sub	rest, rest, tmp1
+-	b	1b
++	subs	count, count, tmp1
++	b.hi	1b
++	add	count, count, tmp1
+ 
+ L(last):
+ 	cmp	count, vector_length, lsl 1
+@@ -129,18 +118,22 @@ L(last):
+ 	st1b	z0.b, p0, [dstend, -1, mul vl]
+ 	ret
+ 
+-L(L1_prefetch): // if rest >= L1_SIZE
++	// count >= L1_SIZE
+ 	.p2align 3
++L(L1_prefetch):
++	cmp	count, L2_SIZE
++	b.hs	L(L2)
++	cmp	vector_length, 64
++	b.ne	L(unroll8)
+ 1:	st1b_unroll 0, 3
+ 	prfm	pstl1keep, [dst, PF_DIST_L1]
+ 	st1b_unroll 4, 7
+ 	prfm	pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE]
+ 	add	dst, dst, CACHE_LINE_SIZE * 2
+-	sub	rest, rest, CACHE_LINE_SIZE * 2
+-	cmp	rest, L1_SIZE
+-	b.ge	1b
+-	cbnz	rest, L(unroll8)
+-	ret
++	sub	count, count, CACHE_LINE_SIZE * 2
++	cmp	count, PF_DIST_L1
++	b.hs	1b
++	b	L(unroll8)
+ 
+ 	// count >= L2_SIZE
+ 	.p2align 3
+-- 
+1.8.3.1
+
diff --git a/glibc.spec b/glibc.spec
index a4158e576bf4a637a2b23f2471689ca4ebc4b7a8..2e68546d62de2971a86de41d820cdffd655de359 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -63,7 +63,7 @@
 ##############################################################################
 Name: glibc
 Version: 2.34
-Release: 4
+Release: 5
 Summary: The GNU libc libraries
 License: %{all_license}
 URL: http://www.gnu.org/software/glibc/
@@ -90,6 +90,11 @@
 Patch9: ldconfig-avoid-leak-on-empty-paths-in-config-file.patch
 Patch10: Linux-Fix-fcntl-ioctl-prctl-redirects-for-_TIME_BITS.patch
 Patch11: nis-Fix-leak-on-realloc-failure-in-nis_getnames-BZ-2.patch
 Patch12: rt-Set-the-correct-message-queue-for-tst-mqueue10.patch
+Patch13: 1-5-AArch64-Improve-A64FX-memset-for-small-sizes.patch
+Patch14: 2-5-AArch64-Improve-A64FX-memset-for-large-sizes.patch
+Patch15: 3-5-AArch64-Improve-A64FX-memset-for-remaining-bytes.patch
+Patch16: 4-5-AArch64-Improve-A64FX-memset-by-removing-unroll3.patch
+Patch17: 5-5-AArch64-Improve-A64FX-memset-medium-loops.patch
 #Patch9000: turn-REP_STOSB_THRESHOLD-from-2k-to-1M.patch
 Patch9001: delete-no-hard-link-to-avoid-all_language-package-to.patch
@@ -1181,6 +1186,9 @@
 fi
 %doc hesiod/README.hesiod
 %changelog
+* Fri Sep 17 2021 Qingqing Li - 2.34-5
+- aarch64: optimize memset performance.
+
 * Fri Sep 17 2021 Qingqing Li - 2.34-4
 - backport upstream patches to fix some memory leak and double free bugs