From 152e3e7e6fbdf1eea20f49c7a99291207e36ac43 Mon Sep 17 00:00:00 2001
From: mashu555
Date: Thu, 18 Sep 2025 14:11:25 +0800
Subject: [PATCH] aarch64: Optimize memcpy_sve by using 32-byte alignment

---
 ...mentation_for_32-byte_aligned_access.patch | 119 ++++++++++++++++++
 glibc.spec                                    |   6 +-
 2 files changed, 124 insertions(+), 1 deletion(-)
 create mode 100755 AArch64-modify_the_SVE_memcpy_implementation_for_32-byte_aligned_access.patch

diff --git a/AArch64-modify_the_SVE_memcpy_implementation_for_32-byte_aligned_access.patch b/AArch64-modify_the_SVE_memcpy_implementation_for_32-byte_aligned_access.patch
new file mode 100755
index 0000000..3ec266a
--- /dev/null
+++ b/AArch64-modify_the_SVE_memcpy_implementation_for_32-byte_aligned_access.patch
@@ -0,0 +1,119 @@
+From 47d70aaf3adb11af5c488b434039bb5844c97bb6 Mon Sep 17 00:00:00 2001
+From: Long Wei
+Date: Wed, 24 Sep 2025 10:38:47 +0800
+Subject: [PATCH] aarch64: Modify the copy_long function in the SVE memcpy
+ implementation for 32-byte aligned access
+
+aarch64: Optimize memcpy_sve by using 32-byte alignment
+The current memcpy_sve implementation shifts the destination pointer
+forward to achieve only 16-byte alignment. This can lead to two
+performance issues:
+1. Cross-cache-line accesses: With 16-byte alignment, a 32-byte store
+   operation can still straddle two cache lines. This forces the CPU
+   to perform two separate cache line accesses, effectively doubling
+   the time for the store.
+2. Cache bank conflicts: On some ARM microarchitectures, the L1 cache is
+   organized into banks. 16-byte alignment can cause stores to frequently
+   hit the same bank, creating contention and reducing effective memory
+   bandwidth.
+Change the implementation of memcpy_sve from shifting forward to 16-byte
+alignment to shifting forward to 32-byte alignment, which is more
+cache-friendly:
+- All 32-byte SVE vector stores are fully contained within a
+  single 64-byte cache line, minimizing access latency.
+- Stores are distributed across different cache banks more
+  evenly, preventing conflicts and maximizing throughput.
+
+We tested the performance of `memcpy` on Kunpeng servers using the
+libmicro test suite.
+The results show that using 32-byte alignment reduces the latency
+of `memcpy`.
+The test results are in microseconds.
+
+16-byte alignment:
+memcpy_10    memcpy_32    memcpy_64    memcpy_128   memcpy_256
+0.0028       0.0028       0.0028       0.0035       0.0063
+memcpy_512   memcpy_1k    memcpy_2k    memcpy_4k    memcpy_8k
+0.0122       0.0165       0.0315       0.0605       0.1251
+memcpy_10k   memcpy_16k   memcpy_32k   memcpy_64k   memcpy_128k
+0.1597       0.2458       0.512        1.024        2.048
+memcpy_256k  memcpy_512k  memcpy_1m    memcpy_2m    memcpy_4m
+4.096        7.936        16.8         33.152       66.72
+memcpy_8m    memcpy_10m
+132.096      165.12
+
+32-byte alignment:
+memcpy_10    memcpy_32    memcpy_64    memcpy_128   memcpy_256
+0.0028       0.0028       0.0028       0.0035       0.0058
+memcpy_512   memcpy_1k    memcpy_2k    memcpy_4k    memcpy_8k
+0.0096       0.0165       0.0315       0.0614       0.121
+memcpy_10k   memcpy_16k   memcpy_32k   memcpy_64k   memcpy_128k
+0.1515       0.2355       0.512        1.024        2.048
+memcpy_256k  memcpy_512k  memcpy_1m    memcpy_2m    memcpy_4m
+3.84         7.168        15.072       29.952       60.032
+memcpy_8m    memcpy_10m
+119.04       147.968
+
+No functional change.
+
+sysdeps/aarch64/multiarch/memcpy_sve.S: Change alignment shifting from 16
+bytes to 32 bytes.
+---
+ sysdeps/aarch64/multiarch/memcpy_sve.S | 30 +++++++++++++++---------------
+ 1 file changed, 15 insertions(+), 15 deletions(-)
+
+diff --git a/sysdeps/aarch64/multiarch/memcpy_sve.S b/sysdeps/aarch64/multiarch/memcpy_sve.S
+index 2f14f913..17c8859a 100644
+--- a/sysdeps/aarch64/multiarch/memcpy_sve.S
++++ b/sysdeps/aarch64/multiarch/memcpy_sve.S
+@@ -109,22 +109,22 @@ L(copy_long):
+ 	add	srcend, src, count
+ 	add	dstend, dstin, count
+ 
+-	/* Copy 16 bytes and then align src to 16-byte alignment.  */
+-	ldr	D_q, [src]
+-	and	tmp1, src, 15
+-	bic	src, src, 15
++	/* Copy 32 bytes and then align src to 32-byte alignment.  */
++	ldp	A_q, B_q, [src]
++	and	tmp1, src, 31
++	bic	src, src, 31
+ 	sub	dst, dstin, tmp1
+-	add	count, count, tmp1	/* Count is now 16 too large.  */
+-	ldp	A_q, B_q, [src, 16]
+-	str	D_q, [dstin]
+-	ldp	C_q, D_q, [src, 48]
+-	subs	count, count, 128 + 16	/* Test and readjust count.  */
++	add	count, count, tmp1	/* Count is now 32 too large.  */
++	stp	A_q, B_q, [dstin]
++	ldp	A_q, B_q, [src, 32]
++	ldp	C_q, D_q, [src, 64]
++	subs	count, count, 128 + 32	/* Test and readjust count.  */
+ 	b.ls	L(copy64_from_end)
+ L(loop64):
+-	stp	A_q, B_q, [dst, 16]
+-	ldp	A_q, B_q, [src, 80]
+-	stp	C_q, D_q, [dst, 48]
+-	ldp	C_q, D_q, [src, 112]
++	stp	A_q, B_q, [dst, 32]
++	ldp	A_q, B_q, [src, 96]
++	stp	C_q, D_q, [dst, 64]
++	ldp	C_q, D_q, [src, 128]
+ 	add	src, src, 64
+ 	add	dst, dst, 64
+ 	subs	count, count, 64
+@@ -133,9 +133,9 @@ L(loop64):
+ 	/* Write the last iteration and copy 64 bytes from the end.  */
+ L(copy64_from_end):
+ 	ldp	E_q, F_q, [srcend, -64]
+-	stp	A_q, B_q, [dst, 16]
++	stp	A_q, B_q, [dst, 32]
+ 	ldp	A_q, B_q, [srcend, -32]
+-	stp	C_q, D_q, [dst, 48]
++	stp	C_q, D_q, [dst, 64]
+ 	stp	E_q, F_q, [dstend, -64]
+ 	stp	A_q, B_q, [dstend, -32]
+ 	ret
+-- 
+2.33.0
+
diff --git a/glibc.spec b/glibc.spec
index 583b11b..d359115 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -67,7 +67,7 @@
 ##############################################################################
 Name: glibc
 Version: 2.38
-Release: 67
+Release: 68
 Summary: The GNU libc libraries
 License: %{all_license}
 URL: http://www.gnu.org/software/glibc/
@@ -406,6 +406,7 @@ Patch9038: Using-__memmove_generic-when-kunpeng920-with-tsv120-m.patch
 Patch9039: revert-aarch64-Use-memcpy_simd-as-the-default-memcpy.patch
 Patch9040: backport-Fix-UB-on__dl_map_object_from_fd.patch
 Patch9041: backport-Fix-handling-of-symbol-versions-which-hash-to-zero.patch
+Patch9042: AArch64-modify_the_SVE_memcpy_implementation_for_32-byte_aligned_access.patch
 
 Provides: ldconfig rtld(GNU_HASH) bundled(gnulib)
 
@@ -1586,6 +1587,9 @@ fi
 %endif
 
 %changelog
+* Thu Sep 18 2025 Long Wei - 2.38-68
+- aarch64: Optimize memcpy_sve by using 32-byte alignment
+
 * Fri Aug 29 2025 Qingqing Li - 2.38-67
 - i386: Add GLIBC_ABI_GNU_TLS version [BZ #33221]
 - Also add GLIBC_ABI_GNU2_TLS version [BZ #33129]
-- 
Gitee
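
As background for reviewers, here is a minimal C model (not part of the patch,
and not the glibc implementation) of the alignment arithmetic the patched
L(copy_long) path performs. The function copy_long_model and the small test
harness are invented for this sketch; byte-wise memcpy calls stand in for the
32-byte LDP/STP pairs, the sketch assumes count > 128 and non-overlapping
buffers, and the variable names (src, dst, dstin, count, tmp1) mirror the
registers in memcpy_sve.S.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative model of the patched L(copy_long): copy the first 32 bytes
   from the unaligned source, round src down to a 32-byte boundary, bias dst
   and count by the same amount, then copy 64 bytes per iteration so that the
   source accesses at src + 32, src + 96, ... are 32-byte aligned and never
   straddle a 64-byte cache line.  */
static void
copy_long_model (unsigned char *dstin, const unsigned char *srcin, size_t count)
{
  const unsigned char *srcend = srcin + count;
  unsigned char *dstend = dstin + count;

  /* ldp A_q, B_q, [src]; stp A_q, B_q, [dstin]  */
  memcpy (dstin, srcin, 32);

  uintptr_t tmp1 = (uintptr_t) srcin & 31;   /* and tmp1, src, 31    */
  const unsigned char *src = srcin - tmp1;   /* bic src, src, 31     */
  unsigned char *dst = dstin - tmp1;         /* sub dst, dstin, tmp1 */
  count += tmp1;                             /* count is now 32 too large */

  if (count > 128 + 32)                      /* subs; b.ls L(copy64_from_end) */
    {
      size_t n = count - (128 + 32);
      for (;;)                               /* L(loop64) */
        {
          memcpy (dst + 32, src + 32, 64);
          src += 64;
          dst += 64;
          if (n <= 64)                       /* subs count, count, 64; b.hi */
            break;
          n -= 64;
        }
    }

  /* L(copy64_from_end): flush the pending 64 bytes, then copy the last
     64 bytes relative to the end; the two stores may overlap.  */
  memcpy (dst + 32, src + 32, 64);
  memcpy (dstend - 64, srcend - 64, 64);
}

int
main (void)
{
  enum { N = 4096 };
  /* 32 bytes of slack at the front: like the assembly, the model forms
     src - tmp1 and dst - tmp1, which can point up to 31 bytes before the
     pointers passed in.  */
  static unsigned char inbuf[32 + N], outbuf[32 + N], ref[N];
  unsigned char *in = inbuf + 32, *out = outbuf + 32;

  for (size_t i = 0; i < N; i++)
    in[i] = (unsigned char) (i * 131 + 7);

  /* Exercise every source misalignment and several odd sizes > 128.  */
  for (size_t off = 0; off < 32; off++)
    for (size_t count = 129; count + off < N; count += 97)
      {
        memcpy (ref, in + off, count);
        copy_long_model (out, in + off, count);
        if (memcmp (out, ref, count) != 0)
          {
            printf ("mismatch: off=%zu count=%zu\n", off, count);
            return 1;
          }
      }
  printf ("model matches memcpy for all tested offsets and sizes\n");
  return 0;
}

The model copies the same bytes as memcpy for every tested misalignment; the
point of the change above is only where those accesses land relative to
32-byte boundaries, which the rounded-down src makes explicit.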