From ae77348b9707fbbf7c4bacfb19d99b1a1dd45fad Mon Sep 17 00:00:00 2001 From: sunyining <2097653236@qq.com> Date: Wed, 15 Oct 2025 10:48:18 +0800 Subject: [PATCH] syn_string_test --- str_test/srrchr/asmdefs.h | 91 ++++++++++++++++ str_test/srrchr/srrchr | Bin 0 -> 71000 bytes str_test/srrchr/srrchr_neon.S | 136 +++++++++++++++++++++++ str_test/srrchr/strr_scalar.S | 70 ++++++++++++ str_test/srrchr/strr_sve.S | 72 +++++++++++++ str_test/srrchr/strrchr_scalar | Bin 0 -> 70984 bytes str_test/srrchr/test.cpp | 106 ++++++++++++++++++ str_test/strcmp/asmdefs.h | 91 ++++++++++++++++ str_test/strcmp/strcmp.S | 186 ++++++++++++++++++++++++++++++++ str_test/strcmp/strcmp_neon | Bin 0 -> 13584 bytes str_test/strcmp/strcmp_neon.S | 96 +++++++++++++++++ str_test/strcmp/strcmp_scalar | Bin 0 -> 13584 bytes str_test/strcmp/strcmp_scalar.S | 31 ++++++ str_test/strcmp/strcmp_sve.S | 25 +++++ str_test/strcmp/strcmp_test | Bin 0 -> 13584 bytes str_test/strcmp/strcmp_test.cpp | 84 +++++++++++++++ str_test/strcpy/asmdefs.h | 91 ++++++++++++++++ str_test/strcpy/strcpy.S | 142 ++++++++++++++++++++++++ str_test/strcpy/strcpy1 | Bin 0 -> 13752 bytes str_test/strcpy/strcpy_scalar | Bin 0 -> 13928 bytes str_test/strcpy/strcpy_scalar.S | 95 ++++++++++++++++ str_test/strcpy/strcpy_sve.S | 89 +++++++++++++++ str_test/strcpy/strcpy_test.cpp | 92 ++++++++++++++++ str_test/strlen/asmdefs.h | 91 ++++++++++++++++ str_test/strlen/strlen.S | 182 +++++++++++++++++++++++++++++++ str_test/strlen/strlen_neon | Bin 0 -> 13664 bytes str_test/strlen/strlen_neon.S | 164 ++++++++++++++++++++++++++++ str_test/strlen/strlen_scalar | Bin 0 -> 14120 bytes str_test/strlen/strlen_scalar.S | 160 +++++++++++++++++++++++++++ str_test/strlen/strlen_sve.S | 117 ++++++++++++++++++++ str_test/strlen/test.cpp | 97 +++++++++++++++++ 31 files changed, 2308 insertions(+) create mode 100644 str_test/srrchr/asmdefs.h create mode 100644 str_test/srrchr/srrchr create mode 100644 str_test/srrchr/srrchr_neon.S create mode 100644 
str_test/srrchr/strr_scalar.S create mode 100644 str_test/srrchr/strr_sve.S create mode 100644 str_test/srrchr/strrchr_scalar create mode 100644 str_test/srrchr/test.cpp create mode 100644 str_test/strcmp/asmdefs.h create mode 100644 str_test/strcmp/strcmp.S create mode 100644 str_test/strcmp/strcmp_neon create mode 100644 str_test/strcmp/strcmp_neon.S create mode 100644 str_test/strcmp/strcmp_scalar create mode 100644 str_test/strcmp/strcmp_scalar.S create mode 100644 str_test/strcmp/strcmp_sve.S create mode 100644 str_test/strcmp/strcmp_test create mode 100644 str_test/strcmp/strcmp_test.cpp create mode 100644 str_test/strcpy/asmdefs.h create mode 100644 str_test/strcpy/strcpy.S create mode 100644 str_test/strcpy/strcpy1 create mode 100644 str_test/strcpy/strcpy_scalar create mode 100644 str_test/strcpy/strcpy_scalar.S create mode 100644 str_test/strcpy/strcpy_sve.S create mode 100644 str_test/strcpy/strcpy_test.cpp create mode 100644 str_test/strlen/asmdefs.h create mode 100644 str_test/strlen/strlen.S create mode 100644 str_test/strlen/strlen_neon create mode 100644 str_test/strlen/strlen_neon.S create mode 100644 str_test/strlen/strlen_scalar create mode 100644 str_test/strlen/strlen_scalar.S create mode 100644 str_test/strlen/strlen_sve.S create mode 100644 str_test/strlen/test.cpp diff --git a/str_test/srrchr/asmdefs.h b/str_test/srrchr/asmdefs.h new file mode 100644 index 0000000..7a0a2ef --- /dev/null +++ b/str_test/srrchr/asmdefs.h @@ -0,0 +1,91 @@ +/* + * Macros for asm code. AArch64 version. + * + * Copyright (c) 2019-2025, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +/* Set the line separator for the assembler. */ +#if defined (__APPLE__) +# define SEP %% +# define PREF _ +#else +# define SEP ; +# define PREF +#endif + +/* Branch Target Identitication support. */ +#define BTI_C hint 34 +#define BTI_J hint 36 +/* Return address signing support (pac-ret). 
*/ +#define PACIASP hint 25 SEP .cfi_negate_ra_state +#define AUTIASP hint 29 SEP .cfi_negate_ra_state + +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ +#define FEATURE_1_AND 0xc0000000 +#define FEATURE_1_BTI 1 +#define FEATURE_1_PAC 2 + +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a" SEP \ + .p2align 3 SEP \ + .word 4 SEP \ + .word 16 SEP \ + .word 5 SEP \ + .asciz "GNU" SEP \ + .word type SEP \ + .word 4 SEP \ + .word value SEP \ + .word 0 SEP \ + .text + +/* If set then the GNU Property Note section will be added to + mark objects to support BTI and PAC-RET. */ +#ifndef WANT_GNU_PROPERTY +#define WANT_GNU_PROPERTY 1 +#endif + +#if WANT_GNU_PROPERTY +/* Add property note with supported features to all asm files. */ +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +#endif + +#define ENTRY_ALIGN(name, alignment) \ + .align alignment SEP \ + ENTRY_ALIAS(name) SEP \ + .cfi_startproc SEP \ + BTI_C + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#if defined (__APPLE__) +/* Darwin is an underscore platform, symbols need an extra _ prefix. 
*/ +# define ENTRY_ALIAS(name) \ + .global _ ## name SEP \ + _ ## name: + +# define END(name) .cfi_endproc +#elif defined (_WIN32) || defined (__CYGWIN__) +# define ENTRY_ALIAS(name) \ + .global name SEP \ + name: + +# define END(name) .cfi_endproc +#else +# define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: + +# define END(name) \ + .cfi_endproc; \ + .size name, .-name +#endif + +#define L(l) .L ## l + +#endif diff --git a/str_test/srrchr/srrchr b/str_test/srrchr/srrchr new file mode 100644 index 0000000000000000000000000000000000000000..8879324e37e7272b97e456a2d78f41f6ffc9acb7 GIT binary patch literal 71000 zcmeI0eQ*@lk-&R*Rv-|F6~8(n zUiZvvF*{oMsH@8T)lW6**ZsQtz4v=>-pL}T6n}H?AeHr6uoHtth%CSxLHx`#eYWysD9qvxVJMKUv%fIej0k}cwhL~hNE+6+%yKtM?O@@f4o(* z?bOEoDf0~p4a&gJ3*m^+U^;5NV0Obw6jL2OT%9|H{nuy@u#-d5IA=R_t|qZkEPoj7 zYOAuVC2t$UpI8;k^Os}To5!%fgZ2Pk?y?>Y#l~|#+KctOcnmwOWHJB$MtcCSQAO7y zeTLo{PZ^2$V=>V6HJNzI?9?OOJ&-gKiF5>QxY;6l)-*Dvo;2bq=*+}oV05H2CS=WM zER#Wz!7Z49SYONPS^pl)anFNWef zyJ=ZVUA?{_So3+{>cT~!x3n+UaS*Ytc-D+%+LzZS(y3Ux(UG7&x{~RX8oF+~7In0_ zc<~_Co-#tpm^CZSRAB#5xs<{x(W#r?Ss#*I+-^$!7a| zNBsHx)6_u{Dx)ZXkUHnsep&q#&zQ?3W#O=YCwR>|PuXoi|*%9dI!_H?G1@t@Y z3XPR#qrj*SEQ|ebx5Mt_u*YFnKOI)Uiw-;Q32NByusiSjSpn^> z;q|{99M)jq8GtRr>wmT9fTka_dj~j=ZRb%`{8LlT-|>ZkfOpn0is8#@`zf|PX}4MW zOQ`?Aq5li&2Oat=sGo7@2T}JvXyvxze}npLhyE(+bq@VsQSWf*M^Jy#p&vv22M+x> z>IWVANz~8SI?wY)>W`Xo`MB4o);+S?X3FQoLD2J!S zIU3Wi@bToAxX!q1+b)%ba>MHnS!?~{e10ph=Z(nQFxTyPy{Ez6(^@D;b9^83o0vh3%Zav?mPw&e4-So_NZn;--O_$a!m3gr@ja|dvIr!hy>e>CL` z^gXY6_5$v=V*XVChi6}X{AhU66Hn*IKXI<=YIrlnpTYRLt4+BME8aU2KfL~xk#)NX z`&?ZYx;Ny3d#tgYZOZkdKOUx=z!%D`!Z8ow6+lj_>)>9S)j^BZNgO{hqI(9syN}R) zvjTn3APZ#NHfJVGt@rtAtH4vS1RB;pfonqd)Hg=rUb4oK>+ubKj4{-Ij^_Dc6YiT( z?(_4~&C>J8*0tVn4e}Vc%lBXdlq{1AqPYbKa}< zM?*gE)1gv$I^cuDu)$N&|8P|;?bCO>o^z+K<=YQj3m<*{;l;I{vi`Q4@p|InKEdh9`7Ri*&6zwAK-W*A2j8l|Gt3;#(#wU(s&M`WVW;(#Ge=+NBfQ3D{dcs zuJ+d4{{A+fcTXlT$&XuK=~V#k(cMTzVPy=Ti3okul2JJmbOMNKYp|n`oCK;@7LQu 
z_h4n~7d`X-p$D&t8vKmL^0)^sH2S}}y{qEyw^JXKoATEDWOj=3QqG#~0V`+CD|SvR z&$awM8Viks#z6g3-;tN+v_AYmX=_Ufym6;z;;TKL3IECY-}4_h{z54X9PlkYoCml7 z{^z&Ty-efa-yHtEq30MX%VWuOW?jH~5H!qqIu)QNxmc8cLs#KL<9&g7kM-Ua2tU#l zXvYUTRD%mU1F5X#gBy#=hf1on_Dc%dU*{ihzvYGxKDMmxRXQF5cCj5iGqn5Z-HU$P z{U7J^o3Pn`C7(~A{#BIyC^w?qg7Of`Jt+NG^ZD0MR-!zGawSSiyYgB-e?X<@?mdsS zfv@j&&x}d4tmm~#^xuK~EJ6w4@wb!TU3k5Ywra9rAzKKq570)EN&qh!XC;n--UT3B z9@y#IrPXcrww^q3)Ou8+xIVldKp%eQnTh(h@Tx|e-DkOfr+1gN+iG?}(}?S>N7-tV(3g_VG&IqU3qiG221uJg@W0Z-SECKTBnXD|vf<*qs%6P3bSHiRP8g z<2tYOjjBDTbcKV{{@G=+cKP@lZ!^%Ipo?8@wl2 zJ-?>c(rW&8!Gd7*qC0J?E4-+xfd^`NK^z_$_SIh0u0dZ>yKiJ)JJNs#CBXZ+&^``$ z-xu1)1Mi1IdnxcfDzqakzHuB48cYP<*A6p|rwsW1F0@YqUjIV-WGJ>C8k7T{3x)nu zV4FHG3hjQVp5nCFbqy+DV^RB5;B&?qrNK?W=S!h|8Wh`S8r%$g9u)ddM|JcU zV&mcbrYnB{{k_;P+xdL7{YUQ;+dg{T=YL|qWf0(l6#b}ho=vWOvIyHgY5dLn17Kxg zK2u<~D-X}ty>FS@DVM)(uK>SaC061K|I2tbx_CCDKk;?qN$20oA`dt7$FN_s z@(}Gp4rQno%k!Br?DwHPz@0f{3)+iu=p?&$^!=M1BmV{~&-HVgpYt|b{*$3!oo}z< ze5mY0JFTOe^FNP~->35M?+BfzRDL@~o;Q@;{d;$U^Vl~O4B2VmjAd7fLwtyK;^5AM zKXpZU-ir3Ac>eHt&g1+X*}bEGcWRVBpASC82UbNgW=${+bEBZ^Mn_yXjV_qm2WXC? 
zIbtLd`lB(UNAFCLErm8SmNkQsHET%En!TN!!3acSnOIjmYsNCVnbaeRbSjp`gweF# zl}L9O2|a42Gg;l}#mwpCnncWuMT7S)Sh%>jlinFm#dRZ-G1lp^l$luvof#t;)1$q~ z;TC{di1WCX~vTU z4OtU2XPmBjENYkrSg2I~SSpIsOyZiQBL%MdnoK-pcIuJto`L_pJ9^`Z==^w8nZk7~^XZ1O+PaNwHw2^WQkc({X2x!LB$mm>)2RYQ z$F@u?VNigotx1>=q}u_4W~>kY(_MsZX=|edW8Lb$>5gJ7Yj%Wv1KAN=$HYc59>HGI zn9I(AyBLCa2Pg5abR1v*=PG^wprZ(nKdbP2BuoB2qM9vVu9KXD7kzW!cz)kxIhULq zg?}&Dv=}SScz&;78DK{x-SG`*{|LW1aR1v>g(ZK#;qNrA$9QPPc>2lW`2DKFl78Q* z(CaB4;r2%nl>8?36=mo6el)*f%eN*9hes5;^Y30b{XFrxb*iPSQ zSm*a^mMc{}jh}qn^M4F&G-Qq+c@J0bvG3` z2W{k{x*K_~=l6T=2VJb9@UjX|Va*xO@Bb`Mjd*JC(ramDld7Rv^nxXKe!lErqc zIQ!@C2`s0vqmu6N?^f~5pT9edNLM`bWAh$YJU`d7Y-dL$-TlAhis$bbEc?~^aT<63 z2e5%&Md{w4!rwb;hi%>M&pON3(2ZWbIG(?Ygl;gNp6h7ua6G^7G^=>t*u1~EU)~3& zF_xw{60hEWn$;JP{n+L#9MAH7Y;?w7zmsfm+z;pAIMzSF3fZ`Se*W}ZS+CzO>~Cjf z4d4P+$yR;cWYswSPM4kIos##S(>0{}A5zcH^j*9WFCIVNm(< literal 0 HcmV?d00001 diff --git a/str_test/srrchr/srrchr_neon.S b/str_test/srrchr/srrchr_neon.S new file mode 100644 index 0000000..6902895 --- /dev/null +++ b/str_test/srrchr/srrchr_neon.S @@ -0,0 +1,136 @@ + + +#include "asmdefs.h" + +/* Arguments and results. */ +#define srcin x0 +#define chrin w1 + +#define result x0 + +#define src x2 +#define tmp1 x3 +#define wtmp2 w4 +#define tmp3 x5 +#define src_match x6 +#define src_offset x7 +#define const_m1 x8 +#define tmp4 x9 +#define nul_match x10 +#define chr_match x11 + +#define vrepchr v0 +#define vdata1 v1 +#define vdata2 v2 +#define vhas_nul1 v3 +#define vhas_nul2 v4 +#define vhas_chr1 v5 +#define vhas_chr2 v6 +#define vrepmask_0 v7 +#define vrepmask_c v16 +#define vend1 v17 +#define vend2 v18 + +/* Core algorithm. + + For each 32-byte hunk we calculate a 64-bit syndrome value, with + two bits per byte (LSB is always in bits 0 and 1, for both big + and little-endian systems). 
For each tuple, bit 0 is set iff + the relevant byte matched the requested character; bit 1 is set + iff the relevant byte matched the NUL end of string (we trigger + off bit0 for the special case of looking for NUL). Since the bits + in the syndrome reflect exactly the order in which things occur + in the original string a count_trailing_zeros() operation will + identify exactly which byte is causing the termination, and why. */ + +ENTRY (__strrchr_aarch64) + /* Magic constant 0x40100401 to allow us to identify which lane + matches the requested byte. Magic constant 0x80200802 used + similarly for NUL termination. */ + mov wtmp2, #0x0401 + movk wtmp2, #0x4010, lsl #16 + dup vrepchr.16b, chrin + bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ + dup vrepmask_c.4s, wtmp2 + mov src_offset, #0 + ands tmp1, srcin, #31 + add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ + b.eq L(aligned) + + /* Input string is not 32-byte aligned. Rather than forcing + the padding bytes to a safe value, we calculate the syndrome + for all the bytes, but then mask off those bits of the + syndrome that are related to the padding. */ + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + neg tmp1, tmp1 + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_nul2.16b, vdata2.16b, #0 + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b + and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 + addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vend1.d[0] + lsl tmp1, tmp1, #1 + mov const_m1, #~0 + lsr tmp3, const_m1, tmp1 + mov chr_match, vend1.d[1] + + bic nul_match, nul_match, tmp3 // Mask padding bits. 
+ bic chr_match, chr_match, tmp3 // Mask padding bits. + cbnz nul_match, L(tail) + + .p2align 4 +L(loop): + cmp chr_match, #0 + csel src_match, src, src_match, ne + csel src_offset, chr_match, src_offset, ne +L(aligned): + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + uminp vend1.16b, vdata1.16b, vdata2.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + cmeq vend1.16b, vend1.16b, 0 + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 + addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vend1.d[0] + mov chr_match, vend1.d[1] + cbz nul_match, L(loop) + + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_nul2.16b, vdata2.16b, #0 + and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b + and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b + mov nul_match, vhas_nul1.d[0] + +L(tail): + /* Work out exactly where the string ends. */ + sub tmp4, nul_match, #1 + eor tmp4, tmp4, nul_match + ands chr_match, chr_match, tmp4 + /* And pick the values corresponding to the last match. */ + csel src_match, src, src_match, ne + csel src_offset, chr_match, src_offset, ne + + /* Count down from the top of the syndrome to find the last match. */ + clz tmp3, src_offset + /* Src_match points beyond the word containing the match, so we can + simply subtract half the bit-offset into the syndrome. Because + we are counting down, we need to go back one more character. */ + add tmp3, tmp3, #2 + sub result, src_match, tmp3, lsr #1 + /* But if the syndrome shows no match was found, then return NULL. 
*/
+	cmp	src_offset, #0
+	csel	result, result, xzr, ne
+
+	ret
+
+END (__strrchr_aarch64)
diff --git a/str_test/srrchr/strr_scalar.S b/str_test/srrchr/strr_scalar.S
new file mode 100644
index 0000000..76c0a50
--- /dev/null
+++ b/str_test/srrchr/strr_scalar.S
@@ -0,0 +1,70 @@
+/*
+ * strrchr - find last position of a character in a string.
+ * Scalar implementation for AArch64
+ *
+ * In:  x0 = NUL-terminated string, w1 = character (only its low byte is used).
+ * Out: x0 = address of the last matching byte, or NULL when there is none;
+ *      searching for NUL returns the address of the terminator.
+ */
+
+/* Arguments and results */
+#define srcin		x0
+#define chrin		w1
+#define result		x0
+
+#define src		x2
+#define last_match	x6
+#define current_char	w7	/* 32-bit register used for byte loads */
+#define char_byte	w8	/* 32-bit register holding the target byte */
+
+	.global	__strrchr_aarch64_scalar
+	.type	__strrchr_aarch64_scalar, %function
+__strrchr_aarch64_scalar:
+	mov	src, srcin		// Scan pointer must start at the string, not at whatever x2 held
+	mov	last_match, #0		// Initialize last match to NULL
+	and	char_byte, chrin, #0xff	// Compare as a byte: drop the upper bits of the int argument
+
+	// Handle empty string case
+	ldrb	current_char, [srcin]
+	cbz	current_char, .Lcheck_null_char
+
+.Lloop:
+	// Load current character and advance the scan pointer past it
+	ldrb	current_char, [src], #1
+
+	// Check for null terminator
+	cbz	current_char, .Lend_of_string
+
+	// Check if current character matches
+	cmp	current_char, char_byte
+	b.ne	.Lloop
+
+	// Found a match; src is already one past it, so step back by one
+	sub	last_match, src, #1
+	b	.Lloop
+
+.Lend_of_string:
+	// At end of string, check if we're looking for null terminator
+	cmp	char_byte, #0
+	b.ne	.Lreturn_result
+
+	// If looking for null terminator, return its address (src is one past it)
+	sub	last_match, src, #1
+
+.Lreturn_result:
+	// Return last match found (or NULL if none)
+	mov	result, last_match
+	ret
+
+.Lcheck_null_char:
+	// Special case: empty string
+	cmp	char_byte, #0
+	b.ne	.Lreturn_null
+	mov	result, srcin		// Return start for null char in empty string
+	ret
+
+.Lreturn_null:
+	mov	result, #0
+	ret
+
+	.size	__strrchr_aarch64_scalar, .-__strrchr_aarch64_scalar
diff --git 
a/str_test/srrchr/strr_sve.S b/str_test/srrchr/strr_sve.S new file mode 100644 index 0000000..9347e1b --- /dev/null +++ b/str_test/srrchr/strr_sve.S @@ -0,0 +1,72 @@ +#include "asmdefs.h" + +#define srcin x0 +#define chrin w1 +#define result x0 + +#define src x2 +#define src_match x3 +#define tmp1 w4 +#define tmp2 x5 +#define const_m1 x6 +#define last_offset x7 + +/* SVE registers */ +#define pg_ptr p0.b +#define pg_nul p1.b +#define pg_chr p2.b +#define pg_combined p3.b + +#define z_src z0 +#define z_repchr z1 +#define z_index z2 +#define z_zero z3 + +ENTRY (__strrchr_aarch64_sve) + mov src, srcin + mov src_match, #0 + dup z_repchr.b, chri + mov z_zero.b, #0 + index z_index.b, #0, #1 + + ptrue pg_ptr.b + + .p2align 3 +L(loop): + ld1b z_src.b, pg_ptr/z, [src] + + + cmpne pg_nul, pg_ptr/z, z_src.b, #0 + b.none L(loop_end) + + + cmpne pg_chr, pg_ptr/z, z_src.b, z_repchr.b + b.none L(loop_next) + + + lasta tmp1, pg_chr, z_index.b + add tmp2, src, tmp1 // Calculate absolute position + mov src_match, tmp2 + +L(loop_next): + + incb src + b L(loop) + +L(loop_end): + + cmpne pg_chr, pg_ptr/z, z_src.b, z_repchr.b + b.none L(check_result) + + and pg_combined.b, pg_ptr/z, pg_chr.b, pg_nul.b + cbz x4, L(check_result) + lasta tmp1, pg_combined, z_index.b + add tmp2, src, tmp1 + mov src_match, tmp2 + +L(check_result): + + mov result, src_match + ret + +END (__strrchr_aarch64_sve) diff --git a/str_test/srrchr/strrchr_scalar b/str_test/srrchr/strrchr_scalar new file mode 100644 index 0000000000000000000000000000000000000000..e1850b399179a8dae2a2eec5df4901f3869370b4 GIT binary patch literal 70984 zcmeI0eQ;FQb->T=TZBMJNEl;muvuZ!AsMsgL&hR$%C0_;u3@S?c-AZe530@sW^xP^NsIZfG0iSlaK$`YK} zxc_Oa_2X`P{kWTv<%k-`86`Q6REld=ajhzjC9uDeJU^NnT^-8qmrdA4g(dk?+4LJb zLG?{ljyg%UyJld?abL$c8t2Uxg$XvR{-)w`OpTubgiS8(HDbdoB)y*0^QEgRtSp-+rZT_WMuGdEQ?8%wG+>HMh6!K^ixWf%1_L74o0% z)Lc8YaevBuQ$m9ZsOE+6L})M%HC`~cVH1iu9v`kQnZo{0Xb-cKN76VKdvva*u~I63 z4DB`SuDpIfiBB5!RX9dHstJt 
zX;~;N+=3RE{Ye|T9<`D-<-(A7FQwAEjARPpn5|duHf=kZE^hAbGpqzqH?)UWL+PE} z(%#n4sIQEyxF5K>st)wFj`cbY!t74wY}4vk-irAgIpXTH#{&4b4<+q~FkbrxpT*lg1V6z#6^i~YN@}O_n$xbqFxJWD`UfZc z`TWz=K@zQ?D1fLs=h!}`eh)?qGzz>|*?)$kLOJ&9wX=Rq`3KP7dv3qwv3t+05s%$_ z3!m}WE8Q-g|K~k+b!0mBF^}E5AGa!d5XVF34VNB|T}{nt+UBu)$Jy_(^O;2f10K6V zj-F`s=9w z(4+qY>PJ2L&rm<-(T}1Y{39o~6aP!p7kl(yq2A!p{|)s{kA58WCp`Ko)PLyF-$VVV zM?a1FIalX--mDzoIqa~yTmkvZ$feK-Z*RTQk}rJcj<*94&HrF6=T`jP9iZeeujZ@wQKTDLHUQbXB* zGK8`b=UgM5|6l?dHoB(fs(%W6oN?R4DA@^}HE*C+4~fuit2J_^cMq(;Pp>yl0RHjJ4!7 zYzyJ_ZOr>3UXFdVB_GA(X>Xx$o3pxPXYf-KRxNro=cNTNh{%1?xK;Ka< za2RmEmGZ9zcw_Ok$4xqJ@u`LxYwO=*M@#N^3?Ht2Zk@f?zfJ=`_u=QuI8o_!J+f3V7wf<4}N23|Jac-7+i{D`vm(rR1UAL#*6j=jX(Ss@4gVc)_5Ws z3O*ezho{3Kcmtjc)C@dSTTlD+!(iaT+3SUlBiCamzW>nb`as3N=3DW4{Go9DEv>sY zpVbx~z8pAY5u*z1BN`j1n_w3lJrc`fjz z=@4D>QUK~&=0@vZ0;nrMc`x1L>q7AMRa*A|JdFFf<6|v&_?Kuqqm><|JMtKP3D9V3EIfilsgCt+5-~sD~K!KzRX{;t>82 z$FViGk7oX_}I zbe`9!@|&R~_s>$9;cCI1A9m-2URV07YN7?D^SDNpzDu>|m9B7b+Rt4sR}=lbWUJZ_ zH#?CigZ2fKgG;WZvGL2{r4M%YW$eE2n#g^T<;zy|IogW9T)8r`yzYyx)fZk;)gS=% zydWMA4g2aSY1g2?q&+mTuRUo%gK5C~x!7I?yzh(c(}DLxvArC49~IjX7T-9Y1`TEc z?`w}4$5R1(e;3H7Yp9JRCl-Uf0xE*!EfDZ{;5VCkykL4M%)=c)tF9%iM7578TcC150NY znc=Ibe%5$)`FOUXKk@Y2A5ivTpTFDhJp8dJ-fw@d{P}kqRj1_ef55Xp)z6yRZIQjX&Nsj;A@l3w&{@%$m+0NXKmE<}3K01kcK8t6&Zyl%N&;5p~dE~DA z&>3e9`0dnMs^9-{{4W;#=f)KZC~;oRK;Qwq-;b#03&#!rx%+MM`MC)F=dhDU-aCc8 z9qnO{A6IY3?Gbi*Dc&bN=wI#G5A6R4+849pk-O1eYCJ<|ukrYC^nW3L3>4}YxM2zx32+Yn222|i2EUdlf@g?$U!!=7<) zwG-{7_&i4T;N<)FnJMxgaPr(ZxA{5m701642GsfXCeD+}zoMPi*U$g>6#4H|dH8pP z&QmJqrpWW(%I^QYyTWG;r0iE5#!!@xhxo`19P3_OQo~tEVq`^y*z1vSrX_n>jlY-?okP zoZZ*e6^TQ_w9M{g&NeOGPV4bhHe=>6VIr${r?QA8XE2kTMm^qu?mNIQK5m~cx z)#}ntdRH=&)D6orw(Dlbwzfl;Wu#3#(U(qd#}JREqo-Zu8jshvt$EaJ#G&f?y3Mim z&3f~OCS3=;=}$Jq*0(jHxn;wHdULC4ZEe~NdQ1DphFH74aoxHt%^iA2tf9SG2eUtE zM_bz48XEPLkrnr2U@F-e*K@XE*?QVYW+oa})q&pDv0lFcqGNp{VdyYAQziy>=5i_y z2cpOO4Vt5oO71Wxa$pBYF5atmo3@=y7d7N;%xrmGbu(eu1~{lxelwH6X{K?_vhgBU z?Afy7J(g}5C|1|$Ib2x7!mw>tGGlk?@t)q|0Ltx-7oBjvI4bnfyNqPY>!|X2O}gHl 
z&SpGwy%F8rXIP11-$X8}_ZXQ3GBP@|Rx$0ijhH5p%;^^0?rNuh~&1XZKD&VZMm|CYlzsiwm~G5warL(rZ3XjmrNy=B@@aNYiL_W z_np($W8``ulGvWXe6F-Dx8)Jj$|bXzB1OkG%S;&*plaJvHbm&IfQW7ORmdCH?VDXupEr7`Xra zs=|`L*YI~4-(x#$z zBUrYxqmusqU-!lHcMF#EJnk+2{*PcIy(-bYL506>)Q`Km-=B4sZ=xH$qH;Wc2Z`Qf z{0PR<-r;zD?`c)>ys>$IalgC|&SET0aUx#52eqm%Aj8<^EgaACV{G)s-?)=paNG~) z;5gQQixskQ|NOjJ?PR@izp%fXl{J8iSS8!?8z!g5@n7`WIo>OI?|EILs{bSEd6~Y0 rH{-?Q=lhbHdF +#include +#include +#include +#include + +#define NUM_TESTS 1000000 + +extern "C" char*__strrchr_aarch64(const char* str, int c); + + + +// 清理缓存函数 +void flush_cache() { + volatile char* flush = (volatile char*)malloc(1024 * 1024); // 1MB + for (int i = 0; i < 1024 * 1024; i += 64) { + flush[i] = i; + } + free((void*)flush); +} + +void test_strrchr_performance(const size_t sizes[], size_t num_sizes) { + struct timespec start, end; + long long total_time_ns; + + for (size_t i = 0; i < num_sizes; ++i) { + size_t size = sizes[i]; + + // 分配内存并初始化字符串 + char* str = (char *)malloc(size + 1); // +1 for null terminator + if (str == NULL) { + fprintf(stderr, "Memory allocation failed\n"); + exit(EXIT_FAILURE); + } + + // 填充字符串内容 + for (size_t j = 0; j < size; j++) { + str[j] = (char)('a' + (j % 26)); // 使用字母填充 + } + str[size] = '\0'; // 确保以null结尾 + + // 在字符串的中间位置插入一个目标字符用于查找 + char target_char = 'X'; + if (size > 0) { + size_t target_pos = size / 2; + str[target_pos] = target_char; + } else { + // 对于空字符串的特殊处理 + target_char = 'a'; + } + + clock_gettime(CLOCK_MONOTONIC, &start); + for (int j = 0; j < NUM_TESTS; ++j) { + char* result = strrchr(str, target_char); + // 防止编译器优化掉未使用的结果 + asm volatile("" : "+r"(result) : : "memory"); + // flush_cache(); // 如果需要清理缓存 + } + clock_gettime(CLOCK_MONOTONIC, &end); + + total_time_ns = (end.tv_sec - start.tv_sec) * 1000000000LL + (end.tv_nsec - start.tv_nsec); + double avg_time = (double)total_time_ns / NUM_TESTS; + + // 验证结果的正确性 + char* expected = strrchr(str, target_char); + char* actual = strrchr(str, target_char); + + + if (expected != 
actual) { + fprintf(stderr, "Error: strrchr verification failed for size %zu\n", size); + fprintf(stderr, "Expected: %p, Actual: %p\n", (void*)expected, (void*)actual); + if (expected && actual) { + fprintf(stderr, "Expected char: '%c', Actual char: '%c'\n", *expected, *actual); + } + exit(EXIT_FAILURE); + } + + printf("Size: %zu, Time: %.3f ns\n", size, avg_time); + + + free(str); + } +} + +int main(int argc, char* argv[]) { + int start = 0; + int end = 200; + int size = end - start + 1; + size_t sizes[size + 10]; + + for (int i = 0; i < size; i++) { + sizes[i] = i + 1; + } + + size_t additional_sizes[] = {256, 500, 512, 1024, 2000, 4096, 16384, 30000, 65536, 1024*1024}; + int num_additional = sizeof(additional_sizes) / sizeof(additional_sizes[0]); + + for (int i = 0; i < num_additional; i++) { + sizes[size + i] = additional_sizes[i]; + } + + size_t total_sizes = size + num_additional; + test_strrchr_performance(sizes, total_sizes); + + return 0; +} diff --git a/str_test/strcmp/asmdefs.h b/str_test/strcmp/asmdefs.h new file mode 100644 index 0000000..7a0a2ef --- /dev/null +++ b/str_test/strcmp/asmdefs.h @@ -0,0 +1,91 @@ +/* + * Macros for asm code. AArch64 version. + * + * Copyright (c) 2019-2025, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +/* Set the line separator for the assembler. */ +#if defined (__APPLE__) +# define SEP %% +# define PREF _ +#else +# define SEP ; +# define PREF +#endif + +/* Branch Target Identitication support. */ +#define BTI_C hint 34 +#define BTI_J hint 36 +/* Return address signing support (pac-ret). */ +#define PACIASP hint 25 SEP .cfi_negate_ra_state +#define AUTIASP hint 29 SEP .cfi_negate_ra_state + +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ +#define FEATURE_1_AND 0xc0000000 +#define FEATURE_1_BTI 1 +#define FEATURE_1_PAC 2 + +/* Add a NT_GNU_PROPERTY_TYPE_0 note. 
*/ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a" SEP \ + .p2align 3 SEP \ + .word 4 SEP \ + .word 16 SEP \ + .word 5 SEP \ + .asciz "GNU" SEP \ + .word type SEP \ + .word 4 SEP \ + .word value SEP \ + .word 0 SEP \ + .text + +/* If set then the GNU Property Note section will be added to + mark objects to support BTI and PAC-RET. */ +#ifndef WANT_GNU_PROPERTY +#define WANT_GNU_PROPERTY 1 +#endif + +#if WANT_GNU_PROPERTY +/* Add property note with supported features to all asm files. */ +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +#endif + +#define ENTRY_ALIGN(name, alignment) \ + .align alignment SEP \ + ENTRY_ALIAS(name) SEP \ + .cfi_startproc SEP \ + BTI_C + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#if defined (__APPLE__) +/* Darwin is an underscore platform, symbols need an extra _ prefix. */ +# define ENTRY_ALIAS(name) \ + .global _ ## name SEP \ + _ ## name: + +# define END(name) .cfi_endproc +#elif defined (_WIN32) || defined (__CYGWIN__) +# define ENTRY_ALIAS(name) \ + .global name SEP \ + name: + +# define END(name) .cfi_endproc +#else +# define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: + +# define END(name) \ + .cfi_endproc; \ + .size name, .-name +#endif + +#define L(l) .L ## l + +#endif diff --git a/str_test/strcmp/strcmp.S b/str_test/strcmp/strcmp.S new file mode 100644 index 0000000..16a6600 --- /dev/null +++ b/str_test/strcmp/strcmp.S @@ -0,0 +1,186 @@ +/* + * strcmp - compare two strings + * + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + + +/* Assumptions: + * + * ARMv8-a, AArch64. + * MTE compatible. 
+ */ + +#include "asmdefs.h" + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +#define src1 x0 +#define src2 x1 +#define result x0 + +#define data1 x2 +#define data1w w2 +#define data2 x3 +#define data2w w3 +#define has_nul x4 +#define diff x5 +#define off1 x5 +#define syndrome x6 +#define tmp x6 +#define data3 x7 +#define zeroones x8 +#define shift x9 +#define off2 x10 + +/* On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. */ +#ifdef __AARCH64EB__ +# define LS_FW lsl +#else +# define LS_FW lsr +#endif + +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. + Since carry propagation makes 0x1 bytes before a NUL byte appear + NUL too in big-endian, byte-reverse the data before the NUL check. */ + + +ENTRY (__strcmp_aarch64) + sub off2, src2, src1 + mov zeroones, REP8_01 + and tmp, src1, 7 + tst off2, 7 + b.ne L(misaligned8) + cbnz tmp, L(mutual_align) + + .p2align 4 + +L(loop_aligned): + ldr data2, [src1, off2] + ldr data1, [src1], 8 +L(start_realigned): +#ifdef __AARCH64EB__ + rev tmp, data1 + sub has_nul, tmp, zeroones + orr tmp, tmp, REP8_7f +#else + sub has_nul, data1, zeroones + orr tmp, data1, REP8_7f +#endif + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ + ccmp data1, data2, 0, eq + b.eq L(loop_aligned) +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + eor diff, data1, data2 + orr syndrome, diff, has_nul +L(end): +#ifndef __AARCH64EB__ + rev syndrome, syndrome + rev data1, data1 + rev data2, data2 +#endif + clz shift, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. 
*/ + lsl data1, data1, shift + lsl data2, data2, shift + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, 56 + sub result, data1, data2, lsr 56 + ret + + .p2align 4 + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that precede the start point. */ + bic src1, src1, 7 + ldr data2, [src1, off2] + ldr data1, [src1], 8 + neg shift, src2, lsl 3 /* Bits to alignment -64. */ + mov tmp, -1 + LS_FW tmp, tmp, shift + orr data1, data1, tmp + orr data2, data2, tmp + b L(start_realigned) + +L(misaligned8): + /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always + checking to make sure that we don't access beyond the end of SRC2. */ + cbz tmp, L(src1_aligned) +L(do_misaligned): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + cmp data1w, 0 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.ne L(done) + tst src1, 7 + b.ne L(do_misaligned) + +L(src1_aligned): + neg shift, src2, lsl 3 + bic src2, src2, 7 + ldr data3, [src2], 8 +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + lsr tmp, zeroones, shift + orr data3, data3, tmp + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + bics has_nul, has_nul, tmp + b.ne L(tail) + + sub off1, src2, src1 + + .p2align 4 + +L(loop_unaligned): + ldr data3, [src1, off1] + ldr data2, [src1, off2] +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + ldr data1, [src1], 8 + bics has_nul, has_nul, tmp + ccmp data1, data2, 0, eq + b.eq L(loop_unaligned) + + lsl tmp, has_nul, shift +#ifdef __AARCH64EB__ + rev tmp, tmp +#endif + eor diff, data1, data2 + orr syndrome, diff, tmp + cbnz syndrome, L(end) +L(tail): + ldr data1, [src1] + neg shift, shift + lsr data2, data3, shift + lsr has_nul, has_nul, shift +#ifdef __AARCH64EB__ + rev data2, data2 + rev has_nul, has_nul +#endif + eor diff, 
data1, data2 + orr syndrome, diff, has_nul + b L(end) + +L(done): + sub result, data1, data2 + ret + +END (__strcmp_aarch64) diff --git a/str_test/strcmp/strcmp_neon b/str_test/strcmp/strcmp_neon new file mode 100644 index 0000000000000000000000000000000000000000..8c486491df61d741762e3ae6c6bec836cbd066e4 GIT binary patch literal 13584 zcmeHOe{fXSbw0aq`7MLM7-SIegrS3H%my$fKxp&y3yB&8k(JCep6O$?yOK6YyXx*L z7RITCqh{j3v{rBoiE9E5O%apM&<;(Q#7Pt;!+1*4V2H6w0s+T%)v-Gx40f~xdHbFB z?%BQjw8}7@_OIw_-ky8Dd(OG{o_F4T_uhB)bnVv7exHvd8T373&YI>TmMSO>stkxl z>uDN(%V{A^1|M)Rx7>0GO5HS~ObeZ=a08U}*85$+OBX6Rq0&O8lT=wTvZ~ZYg_6@7 z_m|shEpp4PMXp7m=T$of6%{>ZS#O8Z+oAM?lE^D6{o}sz)ueDQtw9+lp)AX3$J^-$ z=g-e{*fDgg3zJaM`xNxJop+m*o2XIMH=bVxRQrX>efU^=todW@k(KSScvttzP$(I0 zTemitN(5KC{ho`4@I9_;_%yw-^zr%MiI3Dg(tEj#tj6O_UwdT=x0~C*_E?4!%TIA~ zF3x3ApMvj-7&L>fi6KqHpm{`Qp~Yn=pQFi0_6#znFdAj;r5Z2Uc4Ug#X#zf5gfz zO19mZjK$L}cDQX9b%fg66JcNos<53(hmvW#BNU4RY0<$6-q{kYvE8`N5uoUZtIq+YI{v^^~XuF z53ePAOVc(R6-HZQsdO~iw5_^55sx;7n%lXK){aD6wcK_^iwY{Y_{oQ8&;R)_RtU!t z#&PeX7eF~4Ok4;1UcloWpzk2h$%lbwKd3y%Zn@!pHxIetaZHM3(@SQZ(EX0L>RmuBcUsd7ApVx4G+#b|${ZYN9;WOMSo!=o1FVXN3 z4cFtnP2qkthQ}32of@vUr(47IakNLnWo&0#U)FHt8mEfqG+epZ!4GM;-mfDXevc#R z{v6kE^(t@*PHOlpP5!ioOH6R_64Q8nbkC3bM-A$0CVGB!&rgmI81^Mm9Y)!6L?>te zOI@b)nLq6_{Yx&fj$BUJcTx7TTjucJ2Y*K6e+2$bjXwk4|7VUZNB%7Mr5gWl;HxzL z-@!L)`~~pOX#6GcFKhf|@MkpsD)={DUix*nRD81oz9H&q6#q#7?}jd-o(qd{y=~Bm z>xPwi^8xC62mbDXUumQ+W1uX6>z}gOgRkOp@Zq|Qg~xOto1M@11K4@8sBbT51?VS< z-W|a70sP5|x9Tz>$POF66NLLyOumBX+|oNwU#u+IyDvLs@4FRuDi5&!GtjTPQ^_$!gZbkR_2Rn^B^u0{@GbY8*a|9 zGNXIWIB|3p?Pnj2?)m3_*5x>(TXmV6kNNt2WMz&%K^Y(V{=wt^{@;a-{>-jG|L;?i z`iCB-(QGxnmi^*_*RqB`lc}b@>+l=uxWM+zZgT_!N#Bwc#*L%dEWbt(N*nu@8~67e z3;X*jAm=Z!`d|LLY5o3^O3>+`RiHCKt3gXZYe4Vm?99dx7m9eChQ2WPQ9Ziasu|zJB42XRm<&`Gr@0>+Ac$1O7f*GW32K>icZ!AKyPX z_+Rf&rPjmi_H>V2oJ@V!jC)U1L+=*uJ?AatZzBIqKivqFS+{pir=|~mbc%i8^y{fN zUi9CozGwyf`>d(7&kWEx>h;aWWO`1|=PK6k#Kmp=joty0`O5jv9Od$bU3z?e6e%a8b|NpxzVvoo7r z%lFBTe*e~(ccN-weILD7mw`WF(#ouW?sa3~3AQs0`EMVr>wgn+-#So_`$^AL^ladJ 
z)EK78=Xz<X zwr+ovJ@__aaR`+Cbhnqi%y`}Z7HywgVttj~d+Mtc_{pZT7QObiG5K5t^4I=z%66mt zNW-`BOr9P7!o{DKOl=6)ycMVNZ#7il3hZARpR}nWUa`rH_m=-dLvQ)fhTdb#;(UE+ z)6|CF5Z%Bt+{AX--Yv8-u)iWWY5&q71Bs9gDY`sdO^j(P?%@lP!s4M<^bS znk!PFj;NW2fEfv;LuM$&Qti?BgByvqMLQD7-KH}U3#DU;xXDwgXvBoNnZk_CT>fMi zUTC$AjoTYHnxBa#V=b}pACh#MgE^Yh4zaM&TpnE0V#ZUscO!y|(=A0+H*%AR^D;Ro zb>+x%>8dWpV_{3HK{H|VgJDXjG;!e;A_|u@>phKWMcUtlLZ1xCfC+JT> zyFmv*SACSt)+@78t?$W33Un{_Ei75$%(==S7l56$pj0_!G0Q!KYdx@4j3F^*;c5cL zl#_{z2eTev^`KNa(|jp#*r+<-Z@7BpqVuX}J#ObQ$OKphiDkLXhk)(pTU|MG*!R-n z!;7l!S@h_E^5++RZNUroA1pgG|LgOw`|Nh=d^>h4RnSS-k3;nq0apbWU^WA2W*wT z7vhsi;=K@`LJ|js_*6Qi;-e6sMiQrm_;iwZF2u+C?u@5i?z^U75FYv>_rsu>B;$c5 z4*$%eS7pG^3JmNt$$0?{hWuP^SE5BJz_uYxsN8~{is)P85dpov2o(!WAFRQKUnV!8kPYSa-77@fG1AY zqO4f^rN8Gr^3tz*c|kbV#Xf{R{S`YsxgA7T!Nq6OagU#C#%Yh#`(=;)?T}}Gdht&w z{Io~jW#^HMYi?_ZB1&G~*MjqT2_NO-Q=-?kGxoW7){!5JpH%i`ULf{eJ7b@_R~$QI zaaVpe)qDKr;`w%p{a+zd?EZ>B;ktq(_RDyqS4-pF7miG`JGf})=}zEgp3UXv z;yq**8ONT2e5n>EqVN^qONG(sG2nAFUa~&~uAf&B{ED()W+J1dUpV-@G2`bY;B)+A z$Bkc)V`qrvHM^3%Jq|txekd*Chb6#^+4+RyH?f+he-6A@e_I*%k9|(lo%pBYEK%)#^T5gt!DT^PsCWrcg+ckd`WGJeXN4@7nqzi4)Jn^`32+Q>7+?PE zC!!%7p^7sW2j+gev-yi*Fx=V6a4Ox^(h>|){v4E@?yzy-DjrS2ZX{v1wkMiH?RF%c zNT%#i7b-|}bhby+(MWK^nupgF7vghaF*}q@hIZT0csjWo$Gq^d(vEa>bnJ$T#@Uce z7l?+#;o7d>d(-Aw$pK!_X8yd#;qwxs(>>i~n5Y=aB#yYb7<2+9)9oexIa)Kk$Gfv#|_hWmK98j^?MP)96`S`)D4+CbP)5YJ}^ zp1H!|<$ulbeg$47n5(&Iy~+rc{R~y?$V#5+1GsofEBdld6uOLsv?SxdK)EARTHVb%pxUG76 z7L2Z+t3XwBUKd8*GacqbgoXO<#5uQKI_0U*^4R-^M_={YYdNwQ&hUrmAEZ}ihA|` E2d-4I0{{R3 literal 0 HcmV?d00001 diff --git a/str_test/strcmp/strcmp_neon.S b/str_test/strcmp/strcmp_neon.S new file mode 100644 index 0000000..cd74e6c --- /dev/null +++ b/str_test/strcmp/strcmp_neon.S @@ -0,0 +1,96 @@ +/* + + * strcmp - compare two strings (NEON vector version) + */ + +#include "asmdefs.h" + +#define src1 x0 +#define src2 x1 +#define result x0 + +#define vdata1 q0 +#define vdata2 q1 +#define vdata1_8b v0 +#define vdata2_8b v1 +#define vzero q2 +#define vmask q3 +#define vtmp1 
q4 +#define vtmp2 q5 + +#define data1 x2 +#define data2 x3 +#define syndrome x4 +#define shift x5 +#define tmp1 x6 +#define tmp2 x7 + +ENTRY (__strcmp_aarch64) + movi v2.16b, #0 + + + and tmp1, src1, #15 + bic src1, src1, #15 + ldr q0, [src1], #16 + sub tmp1, tmp1, #16 + neg shift, tmp1 + + ldr q1, [src2], #16 + + + movi v0.16b, #0xff + mov v3.16b, v0.16b + add tmp1, shift, #16 + lsl tmp1, tmp1, #3 + dup v4.2d, tmp1 + ushl v3.2d, v3.2d, v4.2d + + + orr v0.16b, v0.16b, v3.16b + orr v1.16b, v1.16b, v3.16b + +1: + + cmeq v4.16b, v0.16b, v2.16b + cmhs v5.16b, v0.16b, v1.16b + cmhs v1.16b, v1.16b, v0.16b + orr v4.16b, v4.16b, v5.16b + orr v4.16b, v4.16b, v1.16b + + umaxp v4.16b, v4.16b, v4.16b + umaxp v4.16b, v4.16b, v4.16b + umov tmp1, v4.d[0] + cmp tmp1, #0 + b.ne 2f + + + ldr q0, [src1], #16 + ldr q1, [src2], #16 + b 1b + +2: + + cmeq v4.8b, v0.8b, v1.8b + cmeq v5.8b, v0.8b, v2.8b + orr v4.8b, v4.8b, v5.8b + + movi v5.8b, #1 + and v4.8b, v4.8b, v5.8b + shl v4.8b, v4.8b, #7 + + + uminv b6, v4.8b + umov w6, v6.b[0] + + clz tmp1, tmp1 + lsr tmp1, tmp1, #3 + + add src1, src1, tmp1 + add src2, src2, tmp1 + + ldrb w2, [src1] + ldrb w3, [src2] + sub w0, w2, w3 + + ret +END (__strcmp_aarch64) diff --git a/str_test/strcmp/strcmp_scalar b/str_test/strcmp/strcmp_scalar new file mode 100644 index 0000000000000000000000000000000000000000..86f38421c6af8ccdd87b0b203eccb9224012a36b GIT binary patch literal 13584 zcmeHOeQ;D)6+gQ#e3Xwi(3GZp<%Lp*&Xg4>P^hE6e377hna0j&M;|Y{n`BF}8+SK0 zfZ9;2(-ueB0So;?T07b>I)k0sGHTn%I8I0WpwQCT77EldIMtZ|EwV_u{?2>%WbeLg z9LCWZ|H#d}J@@?XIp>~x?#FxYzMFgNR;>sI0wl?x7l=9QDi5(#L1{>3KrC8DSK)Ui z&7iU1OI*w=w>*MUFI`log^pLa0m^#If*#uQ%B zMpt<-2^GE1L67~s(p0Jh8>PMx;?k%57b^Ym@%C8r$J-cYGazw~YPn|)w=EW?TA$GJHV=Q62J z!B<2Knn)+akfveKG$OOm;#`zZ)np`l02xyljk5L}jhAdYGR5pP0iPm58s%^oljoQe z!|xgaKMvf)8({_gwivwxV#5%juDQZbZqvSLvl4(k%Bhh3MLeL0r zU`uw36Kkh1jBT)6qv>?4gKNqcw{|(n2yrbuRF_im0bIXoWp$0cD75f4lI*RE$zIvC z+D3)Z)>tYXO*XBrX-~wXO-^$=*U{RMh%3)+Pqe6@3QLdz814DL0M-iOc!cq|575J) 
zJRVG3J2MYsdY8~okmnS@!t)(ana5ta;g&P@oGTy1YVIXj=M1%ASQ9+FL9J(kUsUV< zK!xXr;8^h8G~O)4g?~lK2Vh@S;mRM%nzP*#HLr@7qDS8%!bKz-K2H0{gfuIK&$c8Dy>YyqtV{WQ`0eRzBT zzpwHy^;rk914dvkp?`|WR}vkZbNSvQRg<=C&5hgge&yw=?W}(v^s6t|XVLf24-V-M zZhpxf6Xz#Vc01}^Uu`Y70<;8v?4vXFSqT=^AoM?;N}DF zI64mhIYxt8}4hJ)p zkPA+-dY}5J0|Z=tgeY`)O{ z;9<7A^NIcEh?ZocXybJ2k;891cnWBcLSMU-WbeK#hG*EsPBuZAD`RV|JQTl zsde|#&D|G|jHS#8HfeJJWNmRxw~@NdR*+|JHf#FPkxkZy6)>ejX$`1>9TuqvHnTa zal$Cu+lcY{^Re`mEojGeuWf5QY0TWag!Ux4k7mFJta)Wzz9&FUqEiQ*^|{<)?x!Hy z{jM=>Z*|Yo4E?P>i+IAMm7NFOlg5m_Z09QEf3&l{_Y~xQxV-`WN!u!|+qjP!!!-8b z{WRuaxODns>#2D+mAB0v19|q1>*rY909rl{eYn;*yX8WziR@Tq!#0(4&%fc9NhdTcdW5K`^>(({=)C-6H7Ae_oa_= zJKw`tJOj#cy3)&@YrGyTvzFb6%O?7YRa&#D2QjtCILPyi?Z_G-YXioNd;ewH6iuaL z@m4dHPKG-=&CY1DC6VlK;^C+{FXePZ%`^neh?90qC&g0j(fG}`6RnPRB$69V_rb?W z#}aXqA6=pm6Y6FPPe|rXo4RlVscURp+jzTqXEYgWiG}~2Bzr@6CUX4{3%8p$g%-7# z@zl`02%X6314Z!|;U#s`J|s8eQiWR{UC^a?wA@RcKwyUPiB$mj-q?33mz&GQ|IFng zAPu11pxvPStg;_;4=7#E<$eP)9<(2{@1tCM{&2qQkdIi`5#*i4ZaQy`sQ%)u>UOY-scLOL@U2Q&5vfHTM9$a($*b(>E%6jbQ zV#ug45X-X9^VoNeQPtH0fhT6~o>e`0)@QfR+%e<((;vQJXZfz{ADQ;(b&r)je(jis zslkUP96vU;;jOp-{N@||uOB_~8ho$9&EAAMw{u4}m*aN8R_PPL-GSQU$A)h0?89YT zorAfY2|AiaBQP3)(Flx2U^D`w5g3iYXaq(hFdBi;2#iMH|0@Erk1qS>>NVDVcVth6 z=e>MRlRPr?eM*w z`&GH@3yf2a%HFj5o(enHDf>cXFMXZtH>-m2O216;vW;-P;`>#_f}c^2iQfZCPWIBJ z{@G$*QM``3?L&30Dz*;1_9}Zh#jEDgfZ{tsjsQI-JJLN=B{|U%Um8>5?U~SVVBD-{Kle1p#_U?_E4XCQP#ldm+$(zfPqaX z`5s<~8`Lcwrxh4fGBlnw7~gfqkc{I(d@RX$FT}@@jDtdaJnfQ+q7@kU=q2M+gHi7U zlJQ)KkM!LcvtIgLQ!ogVzDR!u^$ITQ zq9;FmoOt-~exLtu>zzmgvYwL)h0|u`}Sa z9`6~m9$uHozI#~?oy zjg@ih?%hMa#>GWDPuBxC^XyQr4EG_k$U62l$d_v4L=+wZK1UdhJ`H@T#!L1^;QDz5 z!H+5X`wQI>PNo-z5;Ql0 z(@IWr%uYM4G`E`o#{h@%CDOhp>fi`foUu4C?^~kHUob=A&Q69?>8_TRP?+-PpzL&q zjRRNlXbN^C3A?pD(d@L_k#r)NvYjqekm%@akEWxM(DFsME-fy^=fYyPlT11r?Pxrm z+=ydd_;_kZx;i>GLPg_jNTv%!!{JojYG+vr9^3Yc#;VnIcHNp<8^^F}zp$oi^~xH+ zIAUnmHK@{t+D5YLSFNqCT4k?YvEq)pCcCMsdQ}}XzPRXCZZ1x8@n=+9`<(k#Sn(ZC 
zv#$7bSaT|sKO9zY5N^l@4x1I6Thj}?gKj!vNBNU7xp9MIY(E;0pv~Sb@gqk9#s9(Pp_yz0{Nd3E1MyTw!$m^p2wQddcc@IVO_p6N1 zeH?X$^waH$6(Zem~c^dJ~rf2aag(IH(J`BERh!>{i>I1jxqnZ61wpZ=pheN#QR z3oY}>`^?cEr7!W9{Q{v9KQSZm6aE>WzU(UqT_u8w`s;t*r!V^tLiu@KPyYJ*P{^-5 zJU2MWK85_>hF@NIp}&U?zoLk~>}NFafqNv8^SC@Ocs~kcZnBKLr^x?*$iSBIC3-Rr zPC|#9DD}(!$hr!)#s}aEljsXQjbdGYXdlJ&^rRlKB6`CA4H?FyetDiOH9ZNxystax zEW{XF;E50YivG<*h?fz4oyyqLMJ?0go>B9G_pw*t5`Sb~acVWXX6Tp|tXeA+W#}7qTTy=I;hSly$TBF@n zc2}{0BF9OU0Nc?L zlf0zLFCv9x|P#%8HSp3I`QR zc59qp_SIbAl$#42i$YH*KZX?*J!V<2S?M(^J)tD>ic0&qZG5#V+)L|G#z`p4a%z4n z9p-%V4K_Q0ZgOA}Dte!S9{YK%Nrj18RDF}h<%sfMsPx0fx}t3#>k2RHipG0-mj#2# zQ0JP}{#3%h!fAIU9N|80-12Gq(l@?y&x_5AZv2Z+95~bT@pt6HJDQYvG}~hKdj1#!VXob=FG% zijR|ISFI*%W9w!M6-GLusdOaSy1A|^5s$P6+q$@pj#whDJhvRtqJk8V^M5|f z6~gfd<8kk!$3b~K7`XQJKaShGguaD5Cm$xBuY$^b?3C+vIb+Y;@)~AyC&@ggs|mxL z>EKOjJ`;RQ&HJO(jvsEJIxQwg zQT7PY(Cp_LGZhbhqTkS}PqU6(cG-7P_KZ_z^ZyF|sEhw0_%~eqQSjPd*|u!?W8fFL z_#cB0xcL79-{#^^f`8D(p9cSoi+>&bQ5Sy}{2LB0?YdSfv1vwZi27P2KGObcp$n+* z@wp#u_s^%1Us3JDc725-uAbZOoXsO^;-=H*_{Up&lN3eVVKUDMU#!L{hqq=X9&_BiGYlx07y86JW+Vb5GXQ%Ier{-$yUeWC;_@uY>_wg10_Hl?M|Z%FAsT7SfIg4w1eKVXFT&?xTn6H^ zEkNro&oVRPyN=r9=q&u_7>)1x&H(H3IHPwPGneo34fx2+Jbgc9d}#Z7_i6)w2v&Ihg){ND_ z<|jW&7lW|*z2S#WvfX`89lk{7y8Z~-IM4jr$=4n_1OBHczyGGM|KD!Y`l)*4QWfgE zC-pCv_6@&%sf;=ftl8B&cB+*6&*?V~)NuxY4z^&tzEDb^-HmqK^vWYG=k;3$@1TQ8?xXqe0drm@mml;|tLQX9 zXInPAn)^vZyWiC34hD{_@2B52W)M%9G&4(~dtRSE$aZER|IK}k18+d?8+)73pY%vg z-<{k?Eg>pBzK5nA4^_;2avQZBpsLP=(;&~jas3>NTR^L(qYt<07k9s(ZRL7-Y&?Cx znbA;J4QcOah)YnT))z5uF5gu<@ZP-vjFUQylX{Gk28?6JoB>@)jr`wPDV z2K{m7dNr`~XYi|Z$;(gt2714Se;1U0=h5d&&?c^PdS_&e>yvVBi?qRjt-pt#PCH5m zk7cqiaKF5*&p7@jeAo&S0$@WAt z7L11?#?n+U7BSKgFv7uf&pR0gH-1jNj^Ifgpd~Q+Y zvR^OXKly!<(>#Ze?Aza}cqxDHgKUDK{m$!n?ptZ6dXba8?Zc{E-h-zrM`dr?elLTa zo0NSavX|a0`^~DLOzBrDUS6ATR{XH4Snv_G`7VBsDmmFpm--iqeMSG6b=rsOY}F4G zzhBwQDqb~@Mit+!%JC{}$4B04mD&G$A`Sgtrn&FX*V+OJut#3L<22UQtv8l_x~(Uk z?lIQ+@9-~QwxY-8R{Y({mHy?cZ+B3SdQn!#=wB-x;gZm?$s*swU9kNHthcCqNp3tB 
z%1LcEM*{1oCdNXBU)K9giT7vhtBcg9^W{q9oG33q>y{?O@q zlKH?Tj`-X_O}QPCe2I>o9yu?NNBO(gY&z)vNTdpNd;ymErBMFH+`LkVSK|DG%w+iz z9kYYX19_C|or_JLV)8drMNwSt_PiC=E}@IrJmTb>aOvb7Cp@q6H&ww6r3g=uuFIWg zQLqa8YBT>#@WUQ?hn-6@uSwNmB`?ow!TG#|kMiSFMz3RM;(hUuEk6+-QubwCAod+Q6Ysn4 z+jb`6j{IzTsc1a&ev18HAoF4I=%*$mPh^qjQTLocyX?)m_P5@@)xt- z`DC0Gip*mb=m;O?fg@_YVT<9%87C9x)sH}a4jL=t*50dze4UMpcAjnnZsgfqt`g57 zqsTn=dB|6|#)&9=1^6OiT=ZGsb6mV+e*oNlUP16P%6^rBj4S=j#^+9$KTiXnqfMMQ zemRMq5tet^mF$&C@HvP>MUgmE121OhPCIU5HBbK*d!Kg~lpT5Q%aacuLhU$s<^KTO zuxl#(nT>@wk8`=N)m4aZ`S>h zAdXPQ8H)pRzMWb8#m*nCNIDYsuUollO>rSU7Z$aG$z*Vc6^W;lJ8;Yk9}BH;Pb{_rDlX1~WV%2! z6pF1_cV8$5k1cCMOYPSvSpD#ZLE1u(N<~5%VYfGi_hruvx*m zHFtq?(9Mll5&ncqcHH0?+mFP18Q)cQS!XcSN&fJTIP5tzopee*7fGh@fxCdQP?n5z1zADmy1UZk z=fO_?bfg!*e2<_kVfVa0(y8u>&Mouj9Xvc2ZS0T zsHj)J9{Bs%Bar&{tBg?DZ&BL`6JG1KK%e(eME|hL2p!_!An|qx@1@9Vp*#ofR=DWP zd#q5|r-7KAlsRGPGsy8Ci0I4vu26}e_%Cuow?m)zRD_rJVxi5dCib6Y#D4`7ucJAp zqA%~yLPwRL)Gzwd{yj=RpycE|Td3?au|DrjdG-GR7{^3yH|6%(?;V}B!Y^1>;JAtU-ln_@_OE#y!9VJA-{_9y}?QLDdhhXyz;^e z{RwpVl}YqvpR|b&+#`vc!{vFwhfyH+CdTi^X?G!}%Ri6)*J^O1&b>?-(8tyQqRUQjo4V JB=_sp{~urb?S}vW literal 0 HcmV?d00001 diff --git a/str_test/strcmp/strcmp_test.cpp b/str_test/strcmp/strcmp_test.cpp new file mode 100644 index 0000000..e7e52df --- /dev/null +++ b/str_test/strcmp/strcmp_test.cpp @@ -0,0 +1,84 @@ +#include +#include +#include +#include + +#define NUM_TESTS 100000 + +extern "C" int __strcmp_aarch64(const char* str1, const char* str2); + +void test_strcmp_performance() { + struct timespec start, end; + + + int start_size = 0; + int end_size = 200; + int basic_size = end_size - start_size + 1; + size_t sizes[basic_size + 10]; + + + for (int i = 0; i < basic_size; i++) { + sizes[i] = i + 1; + } + + + size_t additional_sizes[] = {256, 500, 512, 1024, 2000, 4096, 16384, 30000, 65536, 1024*1024}; + int num_additional = sizeof(additional_sizes) / sizeof(additional_sizes[0]); + for (int i = 0; i < num_additional; i++) { + sizes[basic_size + i] = additional_sizes[i]; + } + 
+ size_t total_sizes = basic_size + num_additional; + + printf("Testing strcmp performance (same test data as strlen):\n"); + + for (size_t i = 0; i < total_sizes; ++i) { + size_t size = sizes[i]; + + char* str1 = (char*)malloc(size + 1); + char* str2 = (char*)malloc(size + 1); + + if (str1 == NULL || str2 == NULL) { + fprintf(stderr, "Memory allocation failed for size %zu\n", size); + continue; // NOTE(review): if only one of the two allocations failed, the other buffer is not freed on this path + } + + + + for (size_t j = 0; j < size; j++) { + str1[j] = 'a' + (j % 26); + str2[j] = 'a' + (j % 26); + } + str1[size] = '\0'; + str2[size] = '\0'; + + // Warm-up + for (int j = 0; j < 1000; ++j) { + __strcmp_aarch64(str1, str2); + } + + clock_gettime(CLOCK_MONOTONIC, &start); + for (int j = 0; j < NUM_TESTS; ++j) { + int result = __strcmp_aarch64(str1, str2); + if (result != 0) { + fprintf(stderr, "ERROR: Verification failed for size %zu\n", size); + break; + } + } + clock_gettime(CLOCK_MONOTONIC, &end); + + long long ns = (end.tv_sec - start.tv_sec) * 1000000000LL + + (end.tv_nsec - start.tv_nsec); + double avg_ns = (double)ns / NUM_TESTS; + + printf("strcmp size %zu: %.2f ns\n", size, avg_ns); + + free(str1); + free(str2); + } +} + +int main() { + test_strcmp_performance(); + return 0; +} diff --git a/str_test/strcpy/asmdefs.h b/str_test/strcpy/asmdefs.h new file mode 100644 index 0000000..7a0a2ef --- /dev/null +++ b/str_test/strcpy/asmdefs.h @@ -0,0 +1,91 @@ +/* + * Macros for asm code. AArch64 version. + * + * Copyright (c) 2019-2025, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +/* Set the line separator for the assembler. */ +#if defined (__APPLE__) +# define SEP %% +# define PREF _ +#else +# define SEP ; +# define PREF +#endif + +/* Branch Target Identification support. */ +#define BTI_C hint 34 +#define BTI_J hint 36 +/* Return address signing support (pac-ret). 
*/ +#define PACIASP hint 25 SEP .cfi_negate_ra_state +#define AUTIASP hint 29 SEP .cfi_negate_ra_state + +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ +#define FEATURE_1_AND 0xc0000000 +#define FEATURE_1_BTI 1 +#define FEATURE_1_PAC 2 + +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a" SEP \ + .p2align 3 SEP \ + .word 4 SEP \ + .word 16 SEP \ + .word 5 SEP \ + .asciz "GNU" SEP \ + .word type SEP \ + .word 4 SEP \ + .word value SEP \ + .word 0 SEP \ + .text + +/* If set then the GNU Property Note section will be added to + mark objects to support BTI and PAC-RET. */ +#ifndef WANT_GNU_PROPERTY +#define WANT_GNU_PROPERTY 1 +#endif + +#if WANT_GNU_PROPERTY +/* Add property note with supported features to all asm files. */ +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +#endif + +#define ENTRY_ALIGN(name, alignment) \ + .align alignment SEP \ + ENTRY_ALIAS(name) SEP \ + .cfi_startproc SEP \ + BTI_C + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#if defined (__APPLE__) +/* Darwin is an underscore platform, symbols need an extra _ prefix. 
*/ +# define ENTRY_ALIAS(name) \ + .global _ ## name SEP \ + _ ## name: + +# define END(name) .cfi_endproc +#elif defined (_WIN32) || defined (__CYGWIN__) +# define ENTRY_ALIAS(name) \ + .global name SEP \ + name: + +# define END(name) .cfi_endproc +#else +# define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: + +# define END(name) \ + .cfi_endproc; \ + .size name, .-name +#endif + +#define L(l) .L ## l + +#endif diff --git a/str_test/strcpy/strcpy.S b/str_test/strcpy/strcpy.S new file mode 100644 index 0000000..f60ba89 --- /dev/null +++ b/str_test/strcpy/strcpy.S @@ -0,0 +1,142 @@ + +#include "asmdefs.h" + +#define dstin x0 +#define srcin x1 +#define result x0 + +#define src x2 +#define dst x3 +#define len x4 +#define synd x4 +#define tmp x5 +#define shift x5 +#define data1 x6 +#define dataw1 w6 +#define data2 x7 +#define dataw2 w7 + +#define dataq q0 +#define vdata v0 +#define vhas_nul v1 +#define vend v2 +#define dend d2 +#define dataq2 q1 + +#ifdef BUILD_STPCPY +# define STRCPY __stpcpy_aarch64 +# define IFSTPCPY(X,...) X,__VA_ARGS__ +#else +# define STRCPY __strcpy_aarch64 +# define IFSTPCPY(X,...) +#endif + +/* + Core algorithm: + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ + +ENTRY (STRCPY) + bic src, srcin, 15 + ld1 {vdata.16b}, [src] + cmeq vhas_nul.16b, vdata.16b, 0 + lsl shift, srcin, 2 + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + lsr synd, synd, shift + cbnz synd, L(tail) + + ldr dataq, [src, 16]! 
+ cmeq vhas_nul.16b, vdata.16b, 0 + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + cbz synd, L(start_loop) + +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + sub tmp, src, srcin + clz len, synd + add len, tmp, len, lsr 2 + tbz len, 4, L(less16) + sub tmp, len, 15 + ldr dataq, [srcin] + ldr dataq2, [srcin, tmp] + str dataq, [dstin] + str dataq2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret + +L(tail): + rbit synd, synd + clz len, synd + lsr len, len, 2 +L(less16): + tbz len, 3, L(less8) + sub tmp, len, 7 + ldr data1, [srcin] + ldr data2, [srcin, tmp] + str data1, [dstin] + str data2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret + + .p2align 4 +L(less8): + subs tmp, len, 3 + b.lo L(less4) + ldr dataw1, [srcin] + ldr dataw2, [srcin, tmp] + str dataw1, [dstin] + str dataw2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret + +L(less4): + cbz len, L(zerobyte) + ldrh dataw1, [srcin] + strh dataw1, [dstin] +L(zerobyte): + strb wzr, [dstin, len] + IFSTPCPY (add result, dstin, len) + ret + + .p2align 4 +L(start_loop): + sub tmp, srcin, dstin + ldr dataq2, [srcin] + sub dst, src, tmp + str dataq2, [dstin] +L(loop): + str dataq, [dst], 32 + ldr dataq, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loopend) + str dataq, [dst, -16] + ldr dataq, [src, 32]! 
+ cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop) + add dst, dst, 16 +L(loopend): + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + fmov synd, dend + sub dst, dst, 31 +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + clz len, synd + lsr len, len, 2 + add dst, dst, len + ldr dataq, [dst, tmp] + str dataq, [dst] + IFSTPCPY (add result, dst, 15) + ret + +END (STRCPY) diff --git a/str_test/strcpy/strcpy1 b/str_test/strcpy/strcpy1 new file mode 100644 index 0000000000000000000000000000000000000000..de7003bba785cb449417baa8ad2af5494ad07383 GIT binary patch literal 13752 zcmeHOeQ;FQbw9gr0RaLekTEu2D?Vb}nhoC-37B1dASvc6pu}x5&C_akg*Iq+mEBb$ zjGG98XYi1zQ6Yxd2IJUG5Xp3y433pdClPeoA;oF=uz}by1!B8oaA)|kt+f%`-+AvI z>F(3YasO!lsjlYjx#xG!IrqGCKi+-!U45r&)k?R^MUo79g_!FeWg~u7(B)GZ5I>dC zNc>Kr$uu0i$HMG#zfDkTrygZmXo12FP}VDR+klf!R&qkcL#D%2Sw1ppyp0Ma>u&5{ z_I1uwyL`@6+oI5`%8%=ciXOA9SElsJl%7x$c}3;^_}=)cQ@E2>qKuPJmgThWophM< zWknV{fY#YC2^GCBLXZ9Yv`Lw#^{T$X;&MUxFI3v$v(3?l&o+nWHb>*FZF7UcWT^7SM6(9L&ja`7NZJfroskFM3M`-fzXo2M zN1nS`K6#E+KD=cR{1$L8E@yfQK)!k(1)k6Tl0opwLGbm1;L$RjOr?X#v>6LV<3JiUu~a%4inS1v zmd(_dj6^8dkVvM1gd@o$McSfiYW!9*nr8bj6WYY)&BkD~nL?;xlesaHPDf+8#T#3L z$uM!%JPwvn{?SprsBB?Ekmtk}UI68C zPDibBgI-0RFMfDo4``M)7 zGWW4Ui-xPvSa#YpTs@*Kyj{aph%J1FhU@pcOT))nlJ?I&4IiiB2Q^&!1Q#FDaD84m zqT%|y(xu@OH2c%N#_PRXe%;w?P{%%^J-u6w9qKmBbJv0GB05t1qv}l2jO86(_w;it zE09_m;z$GGuAHp|PCUF8Q?ba?2# zju&6FWmZn{cmC*kcjw9}0Z;?99Ml6^0Xht{5_EVcw?#vtYiD`8*|SaTK14Tyup7L; z{me$RX9_+YT;b0I-Tn+_muu|PF)WXMjrJyLUw><~KhwMAgw^IRX0v;xeLoGI3R}}~ zy<^bf%Z5M0_xLXCwj&)2mVOu+tryMWUm{p!-W2qPrj7s$TE5f!(aX4f^GGsxu9i zzPnGqcgwHzb~{yH?qBM6(PH>!SaU4;J} zBOW{CK6r-5&m)k%f>t{}(@f99VSt+@SS5W0QwxUgVY<;ay?`6x6OXw$DcV?5P z^CP~`+tuhd{>G#s-~Xz+^Xdb`V4ogEJMWAKxr3FrJ-54F?MDi%$ChvbU`MRdV=V%kBir zZUOR_hS6dQ)K9x&Ohn9SS84r<4z@pO1a-Uzy-V=H&2sRAEIQ%vUEroM>002!Y~8I5 z0mQ1j^TSX@=f|lk#B62E>IZ0p%a8gqZnQ%wxv#kqk7F*X%WSIb?A>zGYX9$|&1M+w zXG&2P&vWt>0r 
zTc!VF(80!`Im>u`#(R(doxjDu%C*nCX7=rGpPjM2a9kxd9@swXpP!lKrkDMN$Ch!K z;CR6~x}Cb;s3OLO(de63Heoyyu%iotu<9r_;7rk6h&oA>A9=Gp9;V<3I z9t~lxkoyWfPN#KWL*&_6k9(e{G8WbvwFW7Nk$u^Azjehm`Hk4SQ2|@Kh`>m@Z+8BTi7W2SkPJNwxX(9`(+I! zr)2R_WXqA?rt?}AF9y_#&^6ikJimkT+GIy2n|%&;cYKu1HiJI}+73z|XR~`i{Gf+G zcYyNxaTn+%&=a6+w%~3ydqVjo)w>>BPoB1EuE~Ydtr{mmz65p`fl^?^G?se^R}HXv zj3F`R#}x*~l#>@1`?(8P4JZZ1diQ${80EX&Yc9TZ)>`_oUK_5XkdZap6!71}bqJVU z=h&kC?gOqzF1}^yV2wgQ#np}SIoyPSu}56{Cmy(`@&{v|8B@J)+}@(9=SDwUxN^^g z-6esii@!fUFsyduu$ocs+Skqo&b=PEa6a(+Hv_|Khu73zeCu~_jI|8xF4;5T`{SQ3 z-dnV99NT%e@VU`!6nfD9!r9keKX?AZo4+4kGrTs?Z2@qGKkZ1w{3 zFx~FjUvywxV9eCO?y=t(g&Rj$1H4Y;drw2Q>`pfOkZJ=~8A?MD7>dA91co9o6oH`# z3`Jll0z(n_{~Q6?&zAjbSt`Gvf+GA+6kn*Qe20=fwneRKKFDSOrTt8&>t z=l2m#QvUwkY=ZfH_UmKcp7;^<0wa6OU8-E(1E(lQWsjKOt2o_blfGZs(=L6a*8UK{UKe3=3SuThSO-`AC#>}gB=(-bapcr~={57k)^UJ)(6M{WLR6|ZiOt}Fgk zRgPCuOJ3gRl==U4a!}pC{}t`~4*eRswv?3*`t zeyhdJ|Mr3fzIlrtv{8q8UY5T@l}Q6>5(YLOOFei%vr&c%n5 zjDuW!1WEtM#S2LKRW3e~r2ppPqe%LBE8u-$?i(l{^$-tLhnRoOegC>y7H@SET&OgWu*I!~_){yz3ALZ|16S47< zPySvi%8Sb^&~JscJ#-`6kJvf8`wr|UJm0#s{^M1O@B|5QmI+Ydww+>n$q^qL9~ftz zuk-mM_HR4vdm-;ey@JbpY0D4nCpJE?-RFK{y`yQLOqh`4BzD>y{e<_hKFR+R@<6bx z*!y}!<7kJ&j@+-a-%8xBI^-qJlk|^+sH5^v_*3BUXD#G8o=*IW3NLfW+x1Q)nb#m@ zr7(Wkjy%T|OphVsqW=CVz1Oxga9liT$q&SjDEqQr5c{^Bf#dEUEjt5oTfUgO^7hYu zKgIr=$b3@!`t^^zkQMvRLmz{+i^!MgA9G>nU0g2AGfo_EUVm;|@;9>1`J|T>^2}pJ zXb2bPfo`=9vBdCW_rrnn>SrMDaM4m(G6=o_xL1s7^Z`7Fym{ua1mugfej+M=1AK-s z8r=&#pFc-{JBEa%bxPTv&Kv(eh@CfCUb8FN|FUqbQq{uM z+S5ki7{no;{A}PwdE&ER5cxG0K8B7UPKJ;>{ZeD`&F5d06ox!s-Z!siC_+4*w|?|<&_{L86l5IcBuzH{W~gYK3CVPhXehL3VO!1OrSRA^SFR7Nt}?6ERGK)6RryzI0;_8) z0OJUwSyiJ-Ybw{1S-om)d0>^fcICpt*`J;o(XG6 zrTUMA<(!G@vw;e7POIq!_E9$-F(ct%I!IQi;27JF#KX8v`&3}=lRtHNVoO=dN)3uJ>5=)lLTLa_m5?MMpRs)@i{UPzA{jn04e30mG3)`~09T zYTuHMm?jSTNoV2{fO(JgnO~Vdzp=SB)ntZ(p{B?;$?Bp+zSQPeI@kc3PTI6dayStg zNw$zLo=8W08{@6MhSn&KBSymt3Y6E*(VJ(s`kzC*--1UT)`E5_QyHPMAESybS;;eeR0U{sHzlr2c&>BUJW_h%Ah(wF$lzJ|~)hrYy5;6o05+5Zq)CxVJP>;Jh!U-n6a?r?NiXZ_vK=T|Bo8=Pc6 
zMgEV%DKEUxSE0kN%Azm(F7u|cwf_EMbN#&NUqFG3O_q`ODfvGY>Dbc0L{Iv`W$5rN zO8v5rQ#PHg;cSjg`hG=U=({M<_51dFytazeBL+lI`1g=uOzM|)X_42KaLPNHgKj{K zv3a(5->>LD*oW8|(buW;JzdoAwYdVdpTqm`D{+ZGGA?;JF236u_HT`vxRQ=SsaItA W-Nqqe75$Poe2_k}N#Czi|GxoX5ON0q literal 0 HcmV?d00001 diff --git a/str_test/strcpy/strcpy_scalar b/str_test/strcpy/strcpy_scalar new file mode 100644 index 0000000000000000000000000000000000000000..47986870f74100398b5e80ce389403c510dd00c1 GIT binary patch literal 13928 zcmeHOeQ;FQbwB%o5Z@AD3T!OJk{Uq_dTrQ1OuI7*&>!w3phbP_kNM z|FW;;ciQFUciI+(UQ&MaD=K=-vR;MKt5AAEN#qrk_Ho+6brVrBtpP2I74PC9ZaUpSTGs~(x{20(#cS) zm6)_{p~hq+LcxYaG7TggNhT@M9!*o@W65Zm?ZZrHGn+RXgV7cWp@z+7QzV^^#&V0B z+Jeb2an(EzmQ()OQM0D5vdUcQTk>Zl*=6^VSy#W-MCFmDXeu2^)~~HwmVDJYLC1J|C;XRv&D={WM7*uo2- zT+Zo;RqmnJk!Q@7cMr+SI_lwxjdW~+O&6>6Qt)<#_s?T5Fy}>noGo8$+An14cFJdUJcimf<6tOXjf_dUefSM8a}AuyrxOopl~<*;dM*WUud{$npLz(!*xHK zHC)y{R%q356&lMw9N!}WFLsD@9otF(US8J@RqZv92qO%HYMCE9&+>+wU!Jm#5xU^|JrOMg<6DVhI3 zr{SJ=hGpfl$_}Hf(k`?3e*oX0@&5?^Nsa#*_!l+)U%n_+Nr=(D)wkPip)b@Goloufe~n@#nx_uz4*+d9ydS9<^e2bpmB}Ax_yU zdUMl9HJR-9?|9Qi{>%$2DC44zo~*~6>8qm73k#^T41A!yY_QBroh!;cS6;|w`SE0D zrSU+gmu7Xo^pY*JYL36_CkNbJtL6kiJ)o7KUeGGgF`(6;MP1w%4HI2EDm%=ZFgLicFG@(CJ@afx%KCIzn$7t>A&CaqWj=SH(jd9fF8khioE{JdidOn%RrnqRnq-- ztX{QZbq@Y>jCk&l`(O{xpVg4PfLI{L8yC!>sa0NYMH#tDS5Wn%PoPhDZhd=5@1HF{ z`Y=wo?#yOy*C*W0Yc&`*{>=GD?D6yL1Um9@$kX*o@AjTO`;HEf|Barr+us|wo*L72 zV}ZN#ddS^L^DYg{M4b2Z{$b$igRZU*zv1q>bXQSq95wClL*LJ6Zy)?s5q0)?W*wdn zAFshj&d)>sL*x&*>56xz|C3GQsQwuj9pN!z{N>r_+&8LD`@QZR{$ko;c#8Kr$ujbu^!R_&Q6m)(fRaTnERHdlAu+#E=6gDN%1}nAJM=+Kh7NwIGX$MKpbqxq74}{A2lpZO8TZS70Oxvs3%}jg{Qq(% zK_9m880XKtq4Zw{Ei@0!g`T&6i@03Cn5ilD58(D)f6<^fFM7tFte)<#2v4Vq14NJZ zxq*A>N(;tc^)~sPbc3G4)j#mpH1_0cwD50UqvHR< z+^Y|7qrm};*-F^f?ef~yhx&O=$~@pXAomY`j^VU65=$hv7}f(~Fda?A4St%7gykN( zGMP*yzi8OcgvNIx$!KFVqzf92iKLOjBd77X$J@peevHrgmNkk#9+<3jO;J^?{W64- zQ*v`JvE|5b)5UFy7X#{v%Qf5cC4R@_dw=;S+3fSMTYf#8Z2{j3+5vhHbT?=p=poSZ 
z8`&&BpDhOM18oLnvxjbFvnP~aQoZZ(jpS{g>zX}ro>k*a$R9yH_kvPj>|B=n0-uH^)cmS;9@@)1FHq4z+_{ecfY4{mwVm0_fA`n7OdBfs~$4)Of(1l4{+52 zv+JB(vd_KWwffw9mJXf)=`Lr#(IO znbJKad#A9S=O#WsfsH~B+Fv>S*4t;!_P+COQEgFOVAwEM7U0TM73J(V;<;@wo9#s& zraN5wO7>3)Ou93$Yx1Mx(Qt${z|Uda_B3RB(HGp_(KH%?(Flx2U^D`w5g3iYXaq(h zFdBi;2>kz#fb3_>{*1t_RZxxr|ebVp~_|d zoZpu?N%@DjvI*w*+OM}md*a>d1xxmrkE(KcPo1M2l|5p9@8fivO@@ADPrE|)YgNex zr9WNq@>+Yl;wuy&c&&0o{O(tBvZpQe&sDg{;kD0dAF8t;yuw=ipxXS;Dqb~@`W64W zD$gih-Zz!`|8;UuHQ@h>4th#K({0B6zWaQO7cFVC zxFvtJbg6Igy`Q&Hhk9O?ze80>2Wb)>Y>vqHa1F+H6c4rMmG=&f^IZ8cB;z|5FCv)- zx%gO;@sW!clZ>led>qO6&Be!)jPqQ)(0AvFB;z4heiG&D4-ZWyStoMkr%<2%u|^eo zu*)Fl1#;y}sbBwyGbBG1=OARo8rJa8G?MjZ80GI<*u={#e;djAkSjj}8$3$w$BowJAU$Z^8!txFp}hEjwlNQkp6fC{(ml#-V>XtZSGBje2Lbv}Q@ z{xyev1M+UvE4Zwew*1I(V&fzGebJ}ZJAwAff(bcJVyD+JPI!;))BHanHw4Ryz3)gg zg)TYlNPC^*R^nFc7;h5ic1N76|A;@u4u94|p5y7nzpC&GhrC_y43c$CHH*SZUhd<9 zKaPxxhR3IjUfa&dd9mG+ABlG>`|`XX_H8>O=iR?rc1Gg1d@0Rz#LdQs{S^D}AoFSc z>v#0y`Sg;g4G3t7&+V1KALqKA zW&7@t>(!V7c1nTg^JjsD%?*j5;@xYxgKC^)5>zBC&un1 zxO?P%lO7Bu(@T5_S{TMbsbE9YOb452VLJhi0uJGepZQoMh|^nf#^S*2k9=l2l1lqR zt*s2F(rt~6z7P!`!7|e^6KB2RkreEP6J}FOq9NE~hSP~;$_%!lf<&yfC6bPWefKY2 zwmiQOpHqvP!DKSH#f-$$Nj_~Bj77|FTP(H(DjH`(GMytD3O)32Bv`cu9`hkKvn7#e zC7gO|QrToA-Ik0~ybUdEOti(raH(~R*{~%Yv1Ltj)yBZumFCKI)h5o3Rex<=U~OF$ zs>Ojzb7ie6t*zcjX3d)Qm4P+p`c0{2P)UBG_7-RemSIu^J*i`)eRN?(GzS9 zsnqZRwwx1uLpD%B&e=M>z&@C#BW5HVOb5vdFdSq1k$4!*w9f$Mh<<(PGVW_Qzz6=! 
z)<_cHa$~`GD6(O7EC%i2vw=B!CRbs*4WZm~fVqOovN{F};oK8@nyz(7FjqO8N|?>T zco;|htYe5drFH9}97gZAr6N4SwR4QR96Jze)e#Pdb+$1FR6()>W*v~U#+EwmsEgW7 z(h<|du|ye6d|ol{Aw=`*OO`aYw56KOP%zXS`7T)l8w*fsODr9108J-t+AKMo-;5+% z$rn$gBfh41o3EiQio=u9uz~`Wb&GgbTV>6`R5SU)TjH>1({$1p0^BJAp!C^) zmT|V9-%mKEqA&XrLha}F;j^)#FYVu^^eaI*$^L~<+1F$Lc`wkZ|F^(6CTcrx=zP8G zZ%RE7v68d@w*%GnWj{oybHF-YduX>qU*123PL#mg)stUQA>d=??34F+*FZ(q@>qSseXZ=5O=*zy2(5D;&)>;2C=<}->&katp|0DnR z;glC%=6}(DzZI>kqv@7`BSkBL+lI_z#d_OzM~C*%HH+aLPNngRVe~vBkFd z(68u!eh9HMqOVgKd%CFKu(=Mk|HR)7R^t+XWM1-nwD@jo*uTfr!ga_|DD{dgzY{q` QtfFS0E%Av6D(ck#A8J@-5&!@I literal 0 HcmV?d00001 diff --git a/str_test/strcpy/strcpy_scalar.S b/str_test/strcpy/strcpy_scalar.S new file mode 100644 index 0000000..bd114e5 --- /dev/null +++ b/str_test/strcpy/strcpy_scalar.S @@ -0,0 +1,95 @@ +#include "asmdefs.h" + +#define dstin x0 +#define srcin x1 +#define result x0 + +#define src x2 +#define dst x3 +#define data1 x4 +#define data2 x5 +#define has_nul x6 +#define tmp1 x7 +#define tmp2 x8 + +#ifdef BUILD_STPCPY +# define STRCPY __stpcpy_aarch64 +# define IFSTPCPY(X,...) X,__VA_ARGS__ +#else +# define STRCPY __strcpy_aarch64 +# define IFSTPCPY(X,...) 
+#endif + +/* + * Corrected scalar implementation of strcpy/stpcpy + */ + +ENTRY (STRCPY) + mov dst, dstin + mov src, srcin + + /* Process the first few bytes until src is aligned to an 8-byte boundary */ + ands tmp1, src, #7 + b.eq main_loop + + /* Number of bytes needed to reach alignment */ + sub tmp1, tmp1, #8 + neg tmp1, tmp1 + +align_loop: + ldrb w4, [src], #1 + strb w4, [dst], #1 + cmp w4, #0 + b.eq return + subs tmp1, tmp1, #1 + b.gt align_loop + +main_loop: + /* Read 8 bytes per iteration */ + ldr data1, [src], #8 + + /* Check whether any of the 8 bytes is NUL */ + /* Uses the classic NUL-detection trick: + * has_nul = (data1 - 0x0101010101010101) & ~data1 & 0x8080808080808080 + */ + mov tmp1, #0x0101 /* NOTE(review): both 64-bit constants are rebuilt on every pass through main_loop although they never change */ + movk tmp1, #0x0101, lsl #16 + movk tmp1, #0x0101, lsl #32 + movk tmp1, #0x0101, lsl #48 + + mov tmp2, #0x8080 + movk tmp2, #0x8080, lsl #16 + movk tmp2, #0x8080, lsl #32 + movk tmp2, #0x8080, lsl #48 + + sub has_nul, data1, tmp1 + bic has_nul, has_nul, data1 + ands has_nul, has_nul, tmp2 + b.ne null_found + + /* No NUL byte: store all 8 bytes and keep looping */ + str data1, [dst], #8 + b main_loop + +null_found: + /* Rewind src to the start of the 8-byte word that contains the NUL */ + sub src, src, #8 + + /* Copy byte by byte until the NUL has been copied */ +copy_byte_loop: + ldrb w4, [src], #1 + strb w4, [dst], #1 + cmp w4, #0 + b.ne copy_byte_loop + +return: +#ifdef BUILD_STPCPY + /* stpcpy returns a pointer to the NUL terminator */ + sub result, dst, #1 +#else + /* strcpy returns the start of the destination string */ + mov result, dstin +#endif + ret + +END (STRCPY) diff --git a/str_test/strcpy/strcpy_sve.S b/str_test/strcpy/strcpy_sve.S new file mode 100644 index 0000000..176d347 --- /dev/null +++ b/str_test/strcpy/strcpy_sve.S @@ -0,0 +1,89 @@ +#include "asmdefs.h" + +#define dstin x0 +#define srcin x1 +#define result x0 + +#define src x2 +#define dst x3 +#define len x4 +#define tmp x5 +#define vl x6 +#define vlw w6 + +#ifdef BUILD_STPCPY +# define STRCPY __stpcpy_aarch64 + +# define IFSTPCPY(X,...) X,__VA_ARGS__ +#else +# define STRCPY __strcpy_aarch64_sve +# define IFSTPCPY(X,...) 
+#endif + +/* + * SVE implementation of strcpy/stpcpy + * Uses SVE vector instructions to handle variable-length vectors + */ + +ENTRY (STRCPY) + mov src, srcin + mov dst, dstin + + // Set up an all-true predicate for the initial operations + ptrue p0.b + + // Get the SVE vector length in bytes + cntb vl + + // Alignment: copy byte by byte until the source address is aligned to the vector length + and tmp, src, vl-1 // NOTE(review): vl is #defined as x6, so this operand expands to x6-1 rather than an immediate mask - confirm this assembles + cbz tmp, 1f + + // Number of bytes needed to reach alignment + sub tmp, vl, tmp + +align_loop: + ld1b z0.b, p0/z, [src] + cmpeq p1.b, p0/z, z0.b, #0 + b.any 1f + st1b z0.b, p0, [dst] + incb src // NOTE(review): incb appears to advance by a whole vector while tmp below counts down by 1 - confirm the intended step + incb dst + subs tmp, tmp, #1 + b.gt align_loop + +1: + // Main loop: process one vector per iteration + ptrue p0.b + +main_loop: + ld1b z0.b, p0/z, [src, #0, mul vl] // NOTE(review): the vl macro (x6) also rewrites the "mul vl" keyword here and in the st1b forms below - confirm + + // Check whether the vector contains a NUL byte + cmpeq p1.b, p0/z, z0.b, #0 + b.first 2f // NOTE(review): b.first presumably tests only the first active element; b.any looks intended - confirm + + // No NUL byte: store the whole vector + st1b z0.b, p0, [dst, #0, mul vl] + add src, src, vl + add dst, dst, vl + b main_loop + +2: + // NUL byte found: handle the partial vector + // p1 marks the positions of the NUL bytes + + // Work out how many bytes actually need to be copied + brka p2.b, p0/z, p1.b + // p2 now covers every element from the start up to and including the first NUL byte + + + st1b z0.b, p2, [dst, #0, mul vl] + + + incp dst, p2.b + + IFSTPCPY(sub result, dst, #1) + ret + +END (STRCPY) diff --git a/str_test/strcpy/strcpy_test.cpp b/str_test/strcpy/strcpy_test.cpp new file mode 100644 index 0000000..6fb3b34 --- /dev/null +++ b/str_test/strcpy/strcpy_test.cpp @@ -0,0 +1,92 @@ +#include +#include +#include +#include +#include + +#define NUM_TESTS 100000 + +extern "C" char* __strcpy_aarch64(char* dst, const char* src); + +// Cache flush +void flush_cache() { + volatile char* flush = (volatile char*)malloc(1024 * 1024); // 1MB + for (int i = 0; i < 1024 * 1024; i += 64) { + flush[i] = i; + } + free((void*)flush); +} + +void test_strcpy_performance(const size_t sizes[], size_t num_sizes) { + struct timespec start, end; + long long total_time_ns; + + for (size_t i = 0; i < num_sizes; ++i) { + size_t size = sizes[i]; + + + char* src = (char *)malloc(size + 1); + char* dst = (char *)malloc(size + 1); + + if (src == NULL || dst == NULL) { + fprintf(stderr, "Memory allocation failed\n"); + exit(EXIT_FAILURE); + } + + + for (size_t j = 0; j < size; j++) { + src[j] = (char)('A' + (j % 26)); // use printable characters + } + src[size] = '\0'; + + + 
clock_gettime(CLOCK_MONOTONIC, &start); + for (int j = 0; j < NUM_TESTS; ++j) { + __strcpy_aarch64(dst, src); + // flush_cache(); + } + clock_gettime(CLOCK_MONOTONIC, &end); + total_time_ns = (end.tv_sec - start.tv_sec) * 1000000000LL + (end.tv_nsec - start.tv_nsec); + double avg_asm = (double)total_time_ns / NUM_TESTS; + + char* dst_verify = (char *)malloc(size + 1); // NOTE(review): unlike src/dst above, this allocation is not NULL-checked before use + strcpy(dst_verify, src); + + if (strcmp(dst, dst_verify) != 0) { + fprintf(stderr, "Error: strcpy verification failed for size %zu\n", size); + exit(EXIT_FAILURE); + } + + + printf("%zu %.3f\n", size, avg_asm); + + free(src); + free(dst); + free(dst_verify); + } +} + +int main(int argc, char* argv[]) { + int start = 0; + int end = 200; + int count = end - start + 1; + + size_t sizes[count + 10]; + + for (int i = 0; i < count; i++) { + sizes[i] = start + i + 1; + } + + size_t additional_sizes[] = {256, 500, 512, 1024, 2000, 4096, 16384, 30000, 65536, 1024*1024}; + int num_additional = sizeof(additional_sizes) / sizeof(additional_sizes[0]); + + for (int i = 0; i < num_additional; i++) { + sizes[count + i] = additional_sizes[i]; + } + + size_t total_sizes = count + num_additional; + test_strcpy_performance(sizes, total_sizes); + + return 0; +} + diff --git a/str_test/strlen/asmdefs.h b/str_test/strlen/asmdefs.h new file mode 100644 index 0000000..7a0a2ef --- /dev/null +++ b/str_test/strlen/asmdefs.h @@ -0,0 +1,91 @@ +/* + * Macros for asm code. AArch64 version. + * + * Copyright (c) 2019-2025, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +/* Set the line separator for the assembler. */ +#if defined (__APPLE__) +# define SEP %% +# define PREF _ +#else +# define SEP ; +# define PREF +#endif + +/* Branch Target Identification support. */ +#define BTI_C hint 34 +#define BTI_J hint 36 +/* Return address signing support (pac-ret). 
*/ +#define PACIASP hint 25 SEP .cfi_negate_ra_state +#define AUTIASP hint 29 SEP .cfi_negate_ra_state + +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ +#define FEATURE_1_AND 0xc0000000 +#define FEATURE_1_BTI 1 +#define FEATURE_1_PAC 2 + +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a" SEP \ + .p2align 3 SEP \ + .word 4 SEP \ + .word 16 SEP \ + .word 5 SEP \ + .asciz "GNU" SEP \ + .word type SEP \ + .word 4 SEP \ + .word value SEP \ + .word 0 SEP \ + .text + +/* If set then the GNU Property Note section will be added to + mark objects to support BTI and PAC-RET. */ +#ifndef WANT_GNU_PROPERTY +#define WANT_GNU_PROPERTY 1 +#endif + +#if WANT_GNU_PROPERTY +/* Add property note with supported features to all asm files. */ +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +#endif + +#define ENTRY_ALIGN(name, alignment) \ + .align alignment SEP \ + ENTRY_ALIAS(name) SEP \ + .cfi_startproc SEP \ + BTI_C + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#if defined (__APPLE__) +/* Darwin is an underscore platform, symbols need an extra _ prefix. 
*/ +# define ENTRY_ALIAS(name) \ + .global _ ## name SEP \ + _ ## name: + +# define END(name) .cfi_endproc +#elif defined (_WIN32) || defined (__CYGWIN__) +# define ENTRY_ALIAS(name) \ + .global name SEP \ + name: + +# define END(name) .cfi_endproc +#else +# define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: + +# define END(name) \ + .cfi_endproc; \ + .size name, .-name +#endif + +#define L(l) .L ## l + +#endif diff --git a/str_test/strlen/strlen.S b/str_test/strlen/strlen.S new file mode 100644 index 0000000..2c566a7 --- /dev/null +++ b/str_test/strlen/strlen.S @@ -0,0 +1,182 @@ + +#include "asmdefs.h" + +#define srcin x0 +#define len x0 + +#define src x1 +#define data1 x2 +#define data2 x3 +#define has_nul1 x4 +#define has_nul2 x5 +#define tmp1 x4 +#define tmp2 x5 +#define tmp3 x6 +#define tmp4 x7 +#define zeroones x8 + +#define maskv v0 +#define maskd d0 +#define dataq1 q1 +#define dataq2 q2 +#define datav1 v1 +#define datav2 v2 +#define tmp x2 +#define tmpw w2 +#define synd x3 +#define syndw w3 +#define shift x4 + +/* For the first 32 bytes, NUL detection works on the principle that + (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a + byte is zero, and can be done in parallel across the entire word. */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +/* To test the page crossing code path more thoroughly, compile with + -DTEST_PAGE_CROSS - this will force all calls through the slower + entry path. This option is not intended for production use. */ + +#ifdef TEST_PAGE_CROSS +# define MIN_PAGE_SIZE 32 +#else +# define MIN_PAGE_SIZE 4096 +#endif + +/* Core algorithm: + + Since strings are short on average, we check the first 32 bytes of the + string for a NUL character without aligning the string. In order to use + unaligned loads safely we must do a page cross check first. 
+ + If there is a NUL byte we calculate the length from the 2 8-byte words + using conditional select to reduce branch mispredictions (it is unlikely + strlen will be repeatedly called on strings with the same length). + + If the string is longer than 32 bytes, align src so we don't need further + page cross checks, and process 32 bytes per iteration using a fast SIMD + loop. + + If the page cross check fails, we read 32 bytes from an aligned address, + and ignore any characters before the string. If it contains a NUL + character, return the length, if not, continue in the main loop. */ + +ENTRY (__strlen_aarch64) + and tmp1, srcin, MIN_PAGE_SIZE - 1 + cmp tmp1, MIN_PAGE_SIZE - 32 + b.hi L(page_cross) + + /* Look for a NUL byte in the first 16 bytes. */ + ldp data1, data2, [srcin] + mov zeroones, REP8_01 + +#ifdef __AARCH64EB__ + /* For big-endian, carry propagation (if the final byte in the + string is 0x01) means we cannot use has_nul1/2 directly. + Since we expect strings to be small and early-exit, + byte-swap the data now so has_nul1/2 will be correct. */ + rev data1, data1 + rev data2, data2 +#endif + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + b.eq L(bytes16_31) + + /* Find the exact offset of the first NUL byte in the first 16 bytes + from the string start. Enter with C = has_nul1 == 0. */ + csel has_nul1, has_nul1, has_nul2, cc + mov len, 8 + rev has_nul1, has_nul1 + csel len, xzr, len, cc + clz tmp1, has_nul1 + add len, len, tmp1, lsr 3 + ret + + /* Look for a NUL byte at offset 16..31 in the string. 
*/ +L(bytes16_31): + ldp data1, data2, [srcin, 16] +#ifdef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + b.eq L(loop_entry) + + /* Find the exact offset of the first NUL byte at offset 16..31 from + the string start. Enter with C = has_nul1 == 0. */ + csel has_nul1, has_nul1, has_nul2, cc + mov len, 24 + rev has_nul1, has_nul1 + mov tmp3, 16 + clz tmp1, has_nul1 + csel len, tmp3, len, cc + add len, len, tmp1, lsr 3 + ret + + nop +L(loop_entry): /* Round src down to a 32-byte boundary; the pre-indexed ldp in the loop adds 32 to src before each load. */ + bic src, srcin, 31 + + .p2align 5 +L(loop): + ldp dataq1, dataq2, [src, 32]! + uminp maskv.16b, datav1.16b, datav2.16b + uminp maskv.16b, maskv.16b, maskv.16b + cmeq maskv.8b, maskv.8b, 0 + fmov synd, maskd + cbz synd, L(loop) + + /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ + cmeq maskv.16b, datav1.16b, 0 + sub len, src, srcin + cbnz syndw, 1f + cmeq maskv.16b, datav2.16b, 0 + add len, len, 16 +1: + /* Generate a bitmask and compute correct byte offset. 
*/ + shrn maskv.8b, maskv.8h, 4 + fmov synd, maskd +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + clz tmp, synd + add len, len, tmp, lsr 2 + ret + +L(page_cross): + bic src, srcin, 31 + mov tmpw, 0x0c03 + movk tmpw, 0xc030, lsl 16 + ld1 {datav1.16b, datav2.16b}, [src] + dup maskv.4s, tmpw + cmeq datav1.16b, datav1.16b, 0 + cmeq datav2.16b, datav2.16b, 0 + and datav1.16b, datav1.16b, maskv.16b + and datav2.16b, datav2.16b, maskv.16b + addp maskv.16b, datav1.16b, datav2.16b + addp maskv.16b, maskv.16b, maskv.16b + fmov synd, maskd + lsl shift, srcin, 1 + lsr synd, synd, shift + cbz synd, L(loop) + + rbit synd, synd + clz len, synd + lsr len, len, 1 + ret + +END (__strlen_aarch64) + + diff --git a/str_test/strlen/strlen_neon b/str_test/strlen/strlen_neon new file mode 100644 index 0000000000000000000000000000000000000000..d16b5c061112f898e191ea515ee6d6c75abd9e4f GIT binary patch literal 13664 zcmeHOeQ;FQbw9gr5dwseunL&ZCm;^#m^ELf*r~(rN+3b_s!%*rdzz=!?n>IQ+7-Ji zBTQX{oj>d$lcFFDwgV=vXJRog&>TEbT;i3!3VB||#Hm$5bmw!ib< zJ*&4*D~IV!{?paGJ@@?XIp>~x-o5wTci+{{RqIx}T`rQO(LWJ$zVQ~~Qw0rPl>zb5 z3L1ysnKXlnzbY~(iuulsCdY9j4CTc$|hT=P%>L% z{j#qsW?AJcW?2@6o>zWcQdIPqWxZOZSF7}dlE^D6?c=ubRj+U-twtFqp)AX3)5qyB z=U0@O>@Zqy!6a1l{t9~R=lv#Sp6aB&5#n+}`7c!N!y~b1<0G-~!dNuk-mx$kOom#P zFY~4n-o;kCE8qy<m8{qHEqZSM@9JM~-~6(6dFv_mn|)w=EW?TA$C^0{ z=Q62J!S_Wpnn2U!M%XkonnI-KTYM7bs>|3UiRO=h{~WlUCuuQkb#4JX1+0+$jljzb z$n$L|B+s!bgg1_We+*p5cipGPn z=(7gxBz|h%3Mk*alrj6EMG!CT6Cf1aUL@3ypNTw;34o8wn zigZNN)U+)bO|wNPhPJRlqbV4TQ3y7-7|oG%I@+3F+}s{ahKVcVp1hn2_wSl@YXViq z67S-_AjvLWM#h@@^#&@BG)GhENV0x?RV)#Y)CU`5Tt{SfT|5t?0Nh5?q*A zhW=cbON8T|#^b_82SK@C=(zTGy^P`Sp`ReniS_q_$~f;e%QbpWVV1l*Col7$rY02M z1hzutQ}dnRgX+0(ZI0!K;J$H|e6gO73x7h%yI|iwf1kAB_WA3)4Yv>FOE$d3s?z+u zV#6oe@If1H@81mycf%i^cOJc@W*ZZd*CnI_~YQ+ zUo&l+`u`4ou8sc%_<)W75AcmP{uKDA!N3FY|+ylyOn#sjTMC+^V9kfqB%WgV#IsK@8WfISaJw zKgee5F>3Zy9$wk?g8RX)H~ztrnP1`Sd2_$JXMTkrR09owdO)i{$ADIY7WHuK{Ux-! 
zveOt0irt6ldJuLSr+1!W+o-dyvj1ycZ)danFKHD=g202guPOL?c%?5Bbo(-xHLkL+ z$FOYq8SXoJaox?${+_!#PMG%|e(sn1Y^SrJGaJ{(8XXzXe3{E=<0sJXL_BD)CZnOu zgX=lyAHZee*J?68qTW5(>{RYoz)q5nx^{t9g03XG+7Ca#AFceTCKH70HO+N|(BBHl zR}#HH_s-7K{*qn0vtxH%t-Ryk%lgklKX9iegXbjl-8uccJKnSI+XTwszMX6c_#X4Q z=u!A_l&;idKznhWB#$q%89txK#W8FM&|^0z_%e5Q95-Wf5&m;5c&w27@F^ZI8zFlc zu|O>A=FOzZRUXfZN^+I2r0T7^a8GzFeKV)`uIWc+i>C*HI>PmT7SehKxvjTq(06>9 zfsIxlnl_$#KYiNW^UnE~PVGC`cZ2qvJazV^4{l8Ny>jaF)R>+-AG*3epXcuC|I(Ek zQxWfd=YMzOsnic|aNoT0<)YSc)O={^j%|YtMbz~We7uGF`irUeCobyc`_Apq|Mu)_ z?mJbdeIEC2Uoq|0J@h_#T;-kLs$5Zt{<_ocx;k(#ThD#y^=~a*(SSa58+LZO>FTj@ zRR2Kji**Cq%p*5lhm#zyV${QSSstsr?o<(N<^DdP%{ap4gz}I6zNUxE2X@!?RFdcY z0BP@67Ega|0W}^felR+_2714Reg7CNd;JZwFKbf}TZ|VSV-_|(Lwj!Dv--7JpEmXC zvB7LT_uWsQ4)lDUT7~wnZspi=Ob0glGH%4RlH6C_X!n4N>OTur^<=hGqp$zMyss}n zcCOZWsuJadxriMqJne*@*%@;%}BbBxae<37t{;UoC@Aub)dZrBLW z6J4-Zho@@hg0j&I%6?h04;*1Rf_%XDmpR%0RI)vw_HoB~W4?CQ^RVxGuXnDzeg*My z9rmr$59k}{O+Btr)1eQE-tpP^Di^)uy0YVYcotTea^EhQ#Bw3(ztC9Lf8o-Ez!4r_ zrcQw}!r0_;@q=wu79abO{`TBBy{*UTfL?0a%VvAozK(GjB6`%c2VHX5q~gGCw#RlF zJ9pK&i~5q4XoIiBv!N3D0bG?cL2>Cnox6d4s&8QXp23}`L-YUj)p>Waz3h7fe5X79 z)8d;vyX3jW>mg3-Bdv+#cHLZ92Gh|*T<0}mBrMP3CnL#dQ#52RdR&imv_(ScNLc^U zv+WP-&51N;ER9vkWFq;vo=PWUk+}ZMaBaOQk*tOT1QIxkNF zd6zcn@s#*iA4#R7@n)&8Es})&)?hpo(YHm@Eo?y#N1K|k5RIqJdQ)Dn7upywoNg(q z!ePCv0Y52=moiffUDTqB+7&Mb)QZwIL;EVfgYw#@IFrqG!*1~(ve_8;O`x5iouF7d zP(SEfpv9kOv%dr_1HA-V3(96+yqnGTE5D?A*Ryrx>6q=BQ8LG@aT?^iQO`0^@{gU( zau4G=0c;UtNR0V#eFTgtCmk32Sq2|!LCIgLAM_m30(;#XE`D&@TspB{2d-t1ku~2; z@ZZ6;0GL&0Y1u*dA=lS0eqieCLD}25d?;VUUCUp3*mbbt&;x;2N?x5%wZHVh#G02U z?JHaT{c*32ujww|JJ~;G&Db%u#m}8N?LT+c|C?X?i`Ep?uD$rde_!~o^QETx-pSqN z`^sLP#L8^twejB{$0nf#-LPfMa^Iz7kNS+-ynRrW%Rc-J#mnCJbj8bl zJiniClJZaQWfROFuwHX>d+eR+g+}(Q`&7BSPnIf2WzU)4vpD^kMRLEI%GJt#vEp^5 zU!iz;jhw3ZK~=Hf#flfd&G!_@${xAYKU?XG99~n*_Mtixx~KR)^)VrT*BBGrID70rEzzR#A(cN(X-e5a|Ys(M^s@Qudyc)DGG%=@T!(Za>; zCb#(SmMrluTJ{wSb?O)5`74%u*SD8zgoXA5YS+^YM|sJC~5$|9tt0ROo(aR7x^0+%m-*3>AKp(aeE5A|_=WXOpan8PLXMNzx#Z|4yodEg{@2O_z_cRs 
z@g%zDup{kt_S;|1t*kGpqFJHv5HzMT3B_Rk?d#r_#& zz9^pkvfxk(Vt?>H?~BNn;KPAE{T7T1CrX?+%M|kGe@*%8S?7Fmn-#EwE9ag}7C+-~ zBV3pV`qjEaaH}53nI4x;VwX9D6= z$j+C|xQW#vx*T|+_HJg}J$#%tj9`DOX=iADmi1ic2=cE0=lkW1=b;hgJtjW%yx@70 z(~n26^RB|3&$Z*qPN$kz&2I#-b6M0mR>?;yZYz#)7=Gqy#7 zI8zm8EDkJPK20m~lw^LIx*cvgy?X9ibpSWyu)xg8>k@f(3-u#I`L*lj7T_` z4w4xvIL7uP@i3Zc9jD6^wVk%h$JvDC7EI(HpvxD`KiOszH4oF}%Z5`4qa_#*<4Bx& zTraP5%|72!T-JK>inhqmCRMJrY*=+pJ;xRJ!Tw zt*wX#4iZ{t1oKVd=$`ZoKGRq5pq}y6;>As|_Ed`z3Wi!D&zM0nt5HW`$?Sea-qiNi zbg&ULowR6+%B*>eNbl{gq63P;0XYoc_)YxeWLsvLU!x}`E zhU8&5*cuI?)&y)>Hqg_^i&5N)ky&7B`hP*Z4}wP!)_zu!_er6$Poj!VS;;eLRG9Q@Pdx*ey4I!sOUQYMdza8|X}GfEQ&h;S z$k^+beFdQvBB-d-|HDdO;xGFTLN_@&jKojOzU9!D^}Wz~5meM!|GN%-+0PK#`A6#S zhd#f;^4Q=c`yTRtAWnaU7y3)+@GGt8%l@lx7F!$YFBUh%i~e~O$k=2VdGC?`Cy|aV z{Y&(u9}GZ;n<(|meoO5fwubY&7RmjJzR*ulV%N{D<8@0#>JbB?C;ab`VNB|mb!eGx zNjT*lw}Y-jjIl+Qc (X - 1) & ~(X | 0x7f)) is non-zero if a + byte is zero, and can be done in parallel across the entire word. */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +/* To test the page crossing code path more thoroughly, compile with + -DTEST_PAGE_CROSS - this will force all calls through the slower + entry path. This option is not intended for production use. */ + +#ifdef TEST_PAGE_CROSS +# define MIN_PAGE_SIZE 32 +#else +# define MIN_PAGE_SIZE 4096 +#endif + +/* Core algorithm: + + Since strings are short on average, we check the first 32 bytes of the + string for a NUL character without aligning the string. In order to use + unaligned loads safely we must do a page cross check first. + + If there is a NUL byte we calculate the length from the 2 8-byte words + using conditional select to reduce branch mispredictions (it is unlikely + strlen will be repeatedly called on strings with the same length). + + If the string is longer than 32 bytes, align src so we don't need further + page cross checks, and process 32 bytes per iteration using a fast SIMD + loop. 
+
+   If the page cross check fails, we read 32 bytes from an aligned address,
+   and ignore any characters before the string. If it contains a NUL
+   character, return the length, if not, continue in the main loop.  */
+
+/* NEON variant of strlen: the string pointer arrives in srcin and the
+   length is returned in len.
+   NOTE(review): the register #defines of this file are not visible in
+   this hunk; the code assumes srcin and len name the same register (x0),
+   as in strlen.S - confirm.
+
+   Fast path: one unaligned 32-byte load (guarded by the page-cross check),
+   then each 16-byte half is tested for a NUL byte.  Longer strings fall
+   through to the 32-bytes-per-iteration loop.  */
+ENTRY (__strlen_aarch64)
+	/* Take the slow entry when srcin is within the last 31 bytes of a
+	   MIN_PAGE_SIZE block, where a 32-byte load could cross into the
+	   next page.  */
+	and	tmp1, srcin, MIN_PAGE_SIZE - 1
+	cmp	tmp1, MIN_PAGE_SIZE - 32
+	b.hi	L(page_cross)
+
+	/* Look for a NUL byte in the first 32 bytes using NEON.  */
+	ld1	{datav1.16b, datav2.16b}, [srcin]
+
+	/* Check for NUL bytes in the first 16 bytes: cmeq sets every NUL
+	   lane to all-ones and umaxp folds the 16 lanes into the low
+	   64 bits so a single fmov can test them.  */
+	cmeq	maskv.16b, datav1.16b, #0
+	umaxp	maskv.16b, maskv.16b, maskv.16b
+	fmov	synd, maskd
+	cbnz	synd, L(found_in_first16)
+
+	/* Check for NUL bytes in the next 16 bytes.  */
+	cmeq	maskv.16b, datav2.16b, #0
+	umaxp	maskv.16b, maskv.16b, maskv.16b
+	fmov	synd, maskd
+	cbnz	synd, L(found_in_second16)
+
+	/* No NUL found in the first 32 bytes.  L(loop_entry) aligns src
+	   itself, so no bic is needed before the branch.  */
+	b	L(loop_entry)
+
+L(found_in_first16):
+	/* Generate a bitmask and compute the byte offset within the first
+	   16 bytes.  shrn narrows the compare result to 4 bits per byte,
+	   so (index of first set bit) / 4 is the index of the first NUL.  */
+	cmeq	maskv.16b, datav1.16b, #0
+	shrn	maskv.8b, maskv.8h, #4
+	fmov	synd, maskd
+#ifndef __AARCH64EB__
+	rbit	synd, synd
+#endif
+	clz	tmp, synd
+	/* MOV (register) takes no shifted operand in A64; use LSR.  */
+	lsr	len, tmp, #2
+	ret
+
+L(found_in_second16):
+	/* Same as above for the second 16 bytes; the result is biased by 16.  */
+	cmeq	maskv.16b, datav2.16b, #0
+	shrn	maskv.8b, maskv.8h, #4
+	fmov	synd, maskd
+#ifndef __AARCH64EB__
+	rbit	synd, synd
+#endif
+	clz	tmp, synd
+	lsr	len, tmp, #2
+	add	len, len, 16
+	ret
+
+L(loop_entry):
+	/* Round src down to a 32-byte boundary; the pre-indexed ldp below
+	   adds 32 to src before each load.  */
+	bic	src, srcin, 31
+
+	/* Align the loop head itself (as strlen.S does), not the bic.  */
+	.p2align 5
+L(loop):
+	ldp	dataq1, dataq2, [src, 32]!
+	uminp	maskv.16b, datav1.16b, datav2.16b
+	uminp	maskv.16b, maskv.16b, maskv.16b
+	cmeq	maskv.8b, maskv.8b, 0
+	fmov	synd, maskd
+	cbz	synd, L(loop)
+
+	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
+	cmeq	maskv.16b, datav1.16b, 0
+	sub	len, src, srcin
+	cbnz	syndw, 1f
+	cmeq	maskv.16b, datav2.16b, 0
+	add	len, len, 16
+1:
+	/* Generate a bitmask and compute correct byte offset.
*/ + shrn maskv.8b, maskv.8h, 4 + fmov synd, maskd +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + clz tmp, synd + add len, len, tmp, lsr 2 + ret + +L(page_cross): + bic src, srcin, 31 + mov tmpw, 0x0c03 + movk tmpw, 0xc030, lsl 16 + ld1 {datav1.16b, datav2.16b}, [src] + dup maskv.4s, tmpw + cmeq datav1.16b, datav1.16b, 0 + cmeq datav2.16b, datav2.16b, 0 + and datav1.16b, datav1.16b, maskv.16b + and datav2.16b, datav2.16b, maskv.16b + addp maskv.16b, datav1.16b, datav2.16b + addp maskv.16b, maskv.16b, maskv.16b + fmov synd, maskd + lsl shift, srcin, 1 + lsr synd, synd, shift + cbz synd, L(loop) + + rbit synd, synd + clz len, synd + lsr len, len, 1 + ret + +END (__strlen_aarch64) diff --git a/str_test/strlen/strlen_scalar b/str_test/strlen/strlen_scalar new file mode 100644 index 0000000000000000000000000000000000000000..741c3bdc8f46237de4c9b0b99d5ef29324eb8208 GIT binary patch literal 14120 zcmeHOeQ*@Vm4CY{ga82&)&b^o1USKESo0N(ojC030|}e22-#IQtjxXEV0N(q( znSQI;*%fhBD*vffv#;N;-+SHt=5^0Zck8Lz70bLH4@vUTKNE9-kv0-g1+(X=3`l^M z&T4;&FeW0wj#A^d?I$6mHl@>A`rpgMDo8>ktl&s#^ zzueX(Q|E38~ZZs0>=$va9iBRj} zMgC;mKi}?m1scNlxNg-WH06c2j$SCMxc2r7Rri_IPkiI<|NHJ}Za24q?Xe6emLG2A zY@ExaJ_Ua+;-gVCQEr4o!$;$Zv|NjiqI{e~MzXU8!G8f<6Cnp>t1}$DWRu7gvcDR* z3RMxcD2G`gdG=KyylD{p0&oqNJ8c0_sNUtk3)x>h2wpP?{`esHb>JGVyrk=)PD5{w z#EfWUlS#VXo`}R!&3dSHJ+&FpXgmZA!4}e!DI<~6+l)vINV7w%IboV)G{qAsN~XeQ zB0*+nBt^{|5|I>Jgkoqt8`PVPNR&dbxn6HEQ>jQ>ZgES8kq8r4#*w_33deWdise<+ z`U3y_`$)127m>caaixyR&6Y?qWhNR|R!8G8v(adZavd#g@tEp>Zi^Naw7uiSNbq1{ z8Tj*HE)k9+jmL$D_JDF+Xt;KDzl`BuL_a~E6YK8)m2rO1D)-Sl3bWsI&kMuKIgzo?JBL` ziw=CW1Mhd>&iG!Za4*`!^Nyre2d-RU6}3BXXFEF`xXfLw(B;6DYb-m@I&kGm3*YX* zmFq2hw*zej$(omxMJYxnfo zzALX~GL0BD+ba(*?S9&OXZP#>V9U&^2=u(M%iA-nA_(dOtpY6qtp*(iS_4|#!@due z(zePjz26YKbLffzyG@h2POxp%*-&}ztKC1&WOkkNRqQu_8@R7Yv~}OoK-%yI(wH?a zaa-TTYs=4Y-x0+P*VYDmZf!nl-Fviim)vJ3oeG_4xGwl;f1fXqK94^B0s39=2ldyb zeJCr!^&RN%#%1A`>(T+DgWEHi2^?3zj#GfTw}4iHE+x8j80`Rmpz_1Iv;o=6KF@wa 
zycLqKBswSkRU&q?Tev--C-zhmFGQIy7gJHDURTmX4sU%PNQmXmp7TgmaOHX9=-nQD2USHG$NA2hOKMnbM`nj*y z>kvDEbl++_h9-`rgC9Tc?fGf%b0=QfbLc8U%1w=vWo`j#ZXSdSx~>?Jc=8GOZeVzk~hYFlwaWx@qTqf(C;9h7~`_k{h=KJEp^G0S7&L$vev zxHRZ`VWUcWup9Oo@KjBoQ8o^Oa=UEVJNC02LB49;SF*DIsbqUVi{=Dg{9)J9D@TXp z)1oxc^T(glTQxNNSZy!8^$gwhzgv9N7zS@TcyDR2=XNr$LC5paw_l{9t`oC}wrz!7 zr+m<5lX=1z{TMO)OzGh>&y*cLb8c+Sejb;?lLvk)t=X1?c?)3hah{x8Ps3cKFv9)t zwys<9I?*EH`Lir=mB$qPR>5^iKYpDZU-mOho?rZ4#A&737Ef%{tQSlp6^X|*em6A3 z@=SizOhlR^A!pG8n%UWIhEir&yK7U&9IYju;*70Pn@GeH4`|6$B5KC8Z{=%i&GCem z#4E0?JeaAqka)lr3BroeVru-F3dp~(S&Jp5eT`-^6^XS-h3#en_S=kD$kaAOQmt%3 z3rCup@j@C)S@kCUem}G^a5!C8RJp_cXajy+USi5DF@&m3=XEGv45&9t&t%`1`CHU_ zM@wfi&$Hb>WinxqdeBbLPSEE;4}tCjr5l;dzk!s1_JRgL`x!@)YCW48sHk(AXL9Lu z3!8|tov^=%g@VJUvD_S7M}f^_42iJ-t`C7R<)qp>|vM%z=g*H^W}yXx!* zC#_c}*5h_Af{eWLO##nso(;^db4=MD?_STtXFsrXwxjIFxB@7j#~~FQv(K}qV(%SQ z-!J{asOnu~c8{)mdF)GN%U&Jv%E-E%uJ{5fbx547&#q|%J z{oudP{8#T7OMS<^A(dNhtiM-hCDFjfgukJ zd0@x`Lmn9Nz>o)qJa8Ki$ojafhs);_){?9wJg?DnDz!;=UrN@f-&c#bcdBw(hnIa% zS^J)(%4I#CzfU+x`NyAS;>_>1KX0>Z>|N@EM%JtksdD*_9HSa49|ZiJ#p!QslKqu6 z@_Jb>RwbI!uTZ>vK2A`4zp7a962(iqt?v}b${M-UKTYY296k}P{-HVx`b_bM)Mi3P z@v3`tS@GXf<@hAD^yT|bng6Skd8z|{Et=hj{+vCLeHyp8?9;% z_kYPhZ|?jKi<|%51q=N17Ts;5Zv8?$?_$ZmzO&p%-t4}3F7BgFxq*%X9~OEft{pJ; zZHAG=b1q&?68E|IaFTe)#Y;%yC>I|=60f=VNRqhD#RuDWE+x7Dx$>i_(Eaey7?OD* zSAHyAR`W$JUY0#4kc*eoc3#|YB_DQrWu9>s`RI=9eqSzLf%6YCqYae!FiXgMFo5!Y z+ITFy6q27nWd(5=^#fK|yNIr21`s=E&lCA^!t<*KHB0 zdU*g?R%AXNOZ_f8(qDJnX8q>ebc8MO^TcvxSdj-oZ`Yn@UH+_xJp0p)Kcw(OE_u7& z@g(z_s+PW?84aBEJuWcuPTx_-E^YI>KU)~SIzHKLe-2K$DlaJf-<#ecE zd=9iz?7xT1?fiK_UO1G(?<_i+4!G{`6@>>}c#-9h(X_^ePgHo=h2Nv_P8YsZ;bkuT zVTC{M!kZQTstez$@PjUVhr)m5!rxH%aTh*V9E$(@U2*%bu;alA8aK{vE9C!QSn^jg z?)j59WqHLq;JUvTl>A}!p2bp}>~WX>9Jv8`=eY)3nrIZBqlNIza(DyAIwP;qu zvL!XZ%N*lE3L1dV5XM2Hzzfy;cfeh)w6y*aINP7-s&|)#kI#Ev>}C7j{CVy~6fVz;%a(PbpIf+M z9OM*m_K(|sA8^efC)pdyzC6FA{iSf!I6N<8-kYPsdMe}I{O2#N4JA_Z{c)NV#&Ip9 
zDWa#07Mj&bfFpoI*eTREmb0wCbR4d$`P)^&mCLIE*R6U)udP?5^)(Ho*R5Dx6hZ;EtrpPaGe-`6K|q^k4ybx)_trY}N%~_87V>C=Ro-9M9oyY&9&T%cH*hG_ zKJS@p3db-dF8F+A!9$n&WAo=XM>~?OddLX1n%}aVWL2Y%HIp^8i2TWoZ7HJ(G?lPv ztK@KG)l9ULKNe4!{+3vWzo{dF)1#5Ff`V1c=kkPNm9-klR`Q28#$eB;sf1mE9seZG zBIGbS$`WSOUt~wt+|^KW3;l%sMjH{w0*xvv>-|XIoyRIi+#*Zpo^t)68g~~b}#H^&u2}@5P$7^JwFY6#ezn~ncL!y+G<0ed+%*O1}z}ldQi8)l`RBpVuzk`riY_B5L_D zyACV=Cm{7e#7fG&!qRidIrU{dN9eh%q@8iS*3(XxzPt|!eN{TH>;x1Q@&ja?^~?H@ z&YIuAPH=VFVT}Y z=z|V-QR +#include +#include +#include +#include + +#define NUM_TESTS 1000000 + +extern "C" size_t __strlen_aarch64(const char* str); + +// 清理缓存(可选,用于更严格的测试) +void flush_cache() { + volatile char* flush = (volatile char*)malloc(1024 * 1024); // 1MB + for (int i = 0; i < 1024 * 1024; i += 64) { + flush[i] = i; + } + free((void*)flush); +} + +void test_strlen_performance(const size_t sizes[], size_t num_sizes) { + struct timespec start, end; + long long total_time_ns; + + for (size_t i = 0; i < num_sizes; ++i) { + size_t size = sizes[i]; + + // 分配内存并创建测试字符串 + char* str = (char *)malloc(size + 1); // +1 for null terminator + if (str == NULL) { + fprintf(stderr, "Memory allocation failed\n"); + exit(EXIT_FAILURE); + } + + // 填充字符串数据,确保最后一个字符是null terminator + for (size_t j = 0; j < size; j++) { + str[j] = (char)('A' + (j % 26)); // 填充字母,避免出现意外的null字符 + } + str[size] = '\0'; // 确保字符串正确终止 + + // 预热:先调用一次确保代码加载 + size_t test_len = __strlen_aarch64(str); + if (test_len != size) { + fprintf(stderr, "Verification failed: expected %zu, got %zu\n", size, test_len); + exit(EXIT_FAILURE); + } + + clock_gettime(CLOCK_MONOTONIC, &start); + for (int j = 0; j < NUM_TESTS; ++j) { + size_t len = __strlen_aarch64(str); + // 防止编译器优化掉调用 + asm volatile("" : : "r"(len) : "memory"); + // flush_cache(); // 可选:如果需要清理缓存 + } + clock_gettime(CLOCK_MONOTONIC, &end); + + total_time_ns = (end.tv_sec - start.tv_sec) * 1000000000LL + (end.tv_nsec - start.tv_nsec); + double avg_time = (double)total_time_ns / NUM_TESTS; + + // 验证结果 + size_t 
result_len = __strlen_aarch64(str); + if (result_len != size) { + fprintf(stderr, "Error: strlen verification failed for size %zu, expected %zu, got %zu\n", + size, size, result_len); + exit(EXIT_FAILURE); + } + + printf("Size: %zu, Average time: %.3f ns\n", size, avg_time); + + free(str); + } +} + +int main(int argc, char* argv[]) { + int start = 0; + int end = 200; + int count = end - start + 1; + size_t sizes[count + 10]; + + // 填充1-200的大小 + for (int i = 0; i < count; i++) { + sizes[i] = i + 1; + } + + // 添加额外的大小 + size_t additional_sizes[] = {256, 500, 512, 1024, 2000, 4096, 16384, 30000, 65536, 1024*1024}; + int num_additional = sizeof(additional_sizes) / sizeof(additional_sizes[0]); + for (int i = 0; i < num_additional; i++) { + sizes[count + i] = additional_sizes[i]; + } + + size_t total_sizes = count + num_additional; + + printf("Testing strlen performance with %zu different sizes...\n", total_sizes); + test_strlen_performance(sizes, total_sizes); + + return 0; +} -- Gitee