diff --git a/LoongArch-Optimize-string-functions-including-memcpy.patch b/LoongArch-Optimize-string-functions-including-memcpy.patch
new file mode 100644
index 0000000000000000000000000000000000000000..98d12ceeb35c4f346feaa2eaa6d41d5b6558f040
--- /dev/null
+++ b/LoongArch-Optimize-string-functions-including-memcpy.patch
@@ -0,0 +1,1938 @@
+From 50f0fd88162ba3130f902d12305fe317a33ebaee Mon Sep 17 00:00:00 2001
+From: Xue Liu
+Date: Sat, 14 Jan 2023 16:14:24 +0800
+Subject: [PATCH] LoongArch: Optimize string functions including memcpy,
+ memmove, memset, strchr, strchrnul, strcmp, strncmp, strcpy, strlen,
+ strnlen.
+
+Change-Id: I2975aea74f44bf2c9e01a6dfb6ca2eaa57aa5f7c
+---
+ sysdeps/loongarch/lp64/memcpy.S    | 259 ++++++++++++++++++
+ sysdeps/loongarch/lp64/memmove.S   | 406 +++++++++++++++++++++++++++++
+ sysdeps/loongarch/lp64/memset.S    | 170 ++++++++++++
+ sysdeps/loongarch/lp64/strchr.S    | 107 ++++++++
+ sysdeps/loongarch/lp64/strchrnul.S | 115 ++++++++
+ sysdeps/loongarch/lp64/strcmp.S    | 161 ++++++++++++
+ sysdeps/loongarch/lp64/strcpy.S    | 175 +++++++++++++
+ sysdeps/loongarch/lp64/strlen.S    | 102 ++++++++
+ sysdeps/loongarch/lp64/strncmp.S   | 225 ++++++++++++++++
+ sysdeps/loongarch/lp64/strnlen.S   | 125 +++++++++
+ 10 files changed, 1845 insertions(+)
+ create mode 100644 sysdeps/loongarch/lp64/memcpy.S
+ create mode 100644 sysdeps/loongarch/lp64/memmove.S
+ create mode 100644 sysdeps/loongarch/lp64/memset.S
+ create mode 100644 sysdeps/loongarch/lp64/strchr.S
+ create mode 100644 sysdeps/loongarch/lp64/strchrnul.S
+ create mode 100644 sysdeps/loongarch/lp64/strcmp.S
+ create mode 100644 sysdeps/loongarch/lp64/strcpy.S
+ create mode 100644 sysdeps/loongarch/lp64/strlen.S
+ create mode 100644 sysdeps/loongarch/lp64/strncmp.S
+ create mode 100644 sysdeps/loongarch/lp64/strnlen.S
+
+diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S
+new file mode 100644
+index 00000000..5d850123
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/memcpy.S
+@@ -0,0 +1,259 @@
++/* Optimized memcpy implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++
++/* Allow the routine to be named something else if desired. */
++#ifndef MEMCPY_NAME
++#define MEMCPY_NAME memcpy
++#endif
++
++#define LD_64(reg, n) \
++ ld.d t0, reg, n; \
++ ld.d t1, reg, n+8; \
++ ld.d t2, reg, n+16; \
++ ld.d t3, reg, n+24; \
++ ld.d t4, reg, n+32; \
++ ld.d t5, reg, n+40; \
++ ld.d t6, reg, n+48; \
++ ld.d t7, reg, n+56;
++
++#define ST_64(reg, n) \
++ st.d t0, reg, n; \
++ st.d t1, reg, n+8; \
++ st.d t2, reg, n+16; \
++ st.d t3, reg, n+24; \
++ st.d t4, reg, n+32; \
++ st.d t5, reg, n+40; \
++ st.d t6, reg, n+48; \
++ st.d t7, reg, n+56;
++
++LEAF(MEMCPY_NAME)
++//1st var: dst ptr: void *a1 $r4 a0
++//2nd var: src ptr: void *a2 $r5 a1
++//3rd var: size_t len $r6 a2
++//t0~t9 registers as temp
++
++ add.d a4, a1, a2
++ add.d a3, a0, a2
++ li.w a6, 16
++ bge a6, a2, less_16bytes
++ li.w a6, 128
++ blt a6, a2, long_bytes
++ li.w a6, 64
++ blt a6, a2, more_64bytes
++ li.w a6, 32
++ blt a6, a2, more_32bytes
++
++ /* 17...32 */
++ ld.d t0, a1, 0
++ ld.d t1, a1, 8
++ ld.d t2, a4, -16
++ ld.d t3, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a0, 8
++ st.d t2, a3, -16
++ st.d t3, a3, -8
++ jr ra
++
++more_64bytes:
++ srli.d t8, a0, 3
++ slli.d t8, t8, 3
++ addi.d t8, t8, 0x8
++ sub.d a7, a0, t8
++ ld.d t0, a1, 0
++ sub.d a1, a1, a7
++ st.d t0, a0, 0
++
++ add.d a7, a7, a2
++ addi.d a7, a7, -0x20
++loop_32:
++ ld.d t0, a1, 0
++ ld.d t1, a1, 8
++ ld.d t2, a1, 16
++ ld.d t3, a1, 24
++ st.d t0, t8, 0
++ st.d t1, t8, 8
++ st.d t2, t8, 16
++ st.d t3, t8, 24
++
++ addi.d t8, t8, 0x20
++ addi.d a1, a1, 0x20
++ addi.d a7, a7, -0x20
++ blt zero, a7, loop_32
++
++ ld.d t4, a4, -32
++ ld.d t5, a4, -24
++ ld.d t6, a4, -16
++ ld.d t7, a4, -8
++ st.d t4, a3, -32
++ st.d t5, a3, -24
++ st.d t6, a3, -16
++ st.d t7, a3, -8
++
++ jr ra
++
++more_32bytes:
++ /* 33...64 */
++ ld.d t0, a1, 0
++ ld.d t1, a1, 8
++ ld.d t2, a1, 16
++ ld.d t3, a1, 24
++ ld.d t4, a4, -32
++ ld.d t5, a4, -24
++ ld.d t6, a4, -16
++ ld.d t7, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a0, 8
++ st.d t2, a0, 16
++ st.d t3, a0, 24
++ st.d t4, a3, -32
++ st.d t5, a3, -24
++ st.d t6, a3, -16
++ st.d t7, a3, -8
++ jr ra
++
++less_16bytes:
++ srai.d a6, a2, 3
++ beqz a6, less_8bytes
++
++ /* 8...16 */
++ ld.d t0, a1, 0
++ ld.d t1, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a3, -8
++
++ jr ra
++
++less_8bytes:
++ srai.d a6, a2, 2
++ beqz a6, less_4bytes
++
++ /* 4...7 */
++ ld.w t0, a1, 0
++ ld.w t1, a4, -4
++ st.w t0, a0, 0
++ st.w t1, a3, -4
++ jr ra
++
++less_4bytes:
++ srai.d a6, a2, 1
++ beqz a6, less_2bytes
++
++ /* 2...3 */
++ ld.h t0, a1, 0
++ ld.h t1, a4, -2
++ st.h t0, a0, 0
++ st.h t1, a3, -2
++ jr ra
++
++less_2bytes:
++ beqz a2, less_1bytes
++
++ ld.b t0, a1, 0
++ st.b t0, a0, 0
++ jr ra
++
++less_1bytes:
++ jr ra
++
++long_bytes:
++ srli.d t8, a0, 3
++ slli.d t8, t8, 3
++ beq a0, t8, start
++
++ ld.d t0, a1, 0
++ addi.d t8, t8, 0x8
++ st.d t0, a0, 0
++ sub.d a7, a0, t8
++ sub.d a1, a1, a7
++
++start:
++ addi.d a5, a3, -0x80
++ blt a5, t8, align_end_proc
++
++loop_128:
++ LD_64(a1, 0)
++ ST_64(t8, 0)
++ LD_64(a1, 64)
++ addi.d a1, a1, 0x80
++ ST_64(t8, 64)
++ addi.d t8, t8, 0x80
++ bge a5, t8, loop_128
++
++align_end_proc:
++ sub.d a2, a3, t8
++
++ pcaddi t1, 34
++ andi t2, a2, 0x78
++ sub.d t1, t1, t2
++ jirl zero, t1, 0
++
++end_120_128_unalign:
++ ld.d t0, a1, 112
++ st.d t0, t8, 112
++end_112_120_unalign:
++ ld.d t0, a1, 104
++ st.d t0, t8, 104
++end_104_112_unalign:
++ ld.d t0, a1, 96
++ st.d t0, t8, 96
++end_96_104_unalign:
++ ld.d t0, a1, 88
++ st.d t0, t8, 88
++end_88_96_unalign:
++ ld.d t0, a1, 80
++ st.d t0, t8, 80
++end_80_88_unalign:
++ ld.d t0, a1, 72
++ st.d t0, t8, 72
++end_72_80_unalign:
++ ld.d t0, a1, 64
++ st.d t0, t8, 64
++end_64_72_unalign:
++ ld.d t0, a1, 56
++ st.d t0, t8, 56
++end_56_64_unalign:
++ ld.d t0, a1, 48
++ st.d t0, t8, 48
++end_48_56_unalign:
++ ld.d t0, a1, 40
++ st.d t0, t8, 40
++end_40_48_unalign:
++ ld.d t0, a1, 32
++ st.d t0, t8, 32
++end_32_40_unalign:
++ ld.d t0, a1, 24
++ st.d t0, t8, 24
++end_24_32_unalign:
++ ld.d t0, a1, 16
++ st.d t0, t8, 16
++end_16_24_unalign:
++ ld.d t0, a1, 8
++ st.d t0, t8, 8
++end_8_16_unalign:
++ ld.d t0, a1, 0
++ st.d t0, t8, 0
++end_0_8_unalign:
++ ld.d t0, a4, -8
++ st.d t0, a3, -8
++
++ jr ra
++
++END(MEMCPY_NAME)
++libc_hidden_builtin_def (MEMCPY_NAME)
+diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S
+new file mode 100644
+index 00000000..edd9cf3d
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/memmove.S
+@@ -0,0 +1,406 @@
++/* Optimized memmove implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++
++/* Allow the routine to be named something else if desired. */
++#ifndef MEMMOVE_NAME
++#define MEMMOVE_NAME memmove
++#endif
++
++#define LD_64(reg, n) \
++ ld.d t0, reg, n; \
++ ld.d t1, reg, n+8; \
++ ld.d t2, reg, n+16; \
++ ld.d t3, reg, n+24; \
++ ld.d t4, reg, n+32; \
++ ld.d t5, reg, n+40; \
++ ld.d t6, reg, n+48; \
++ ld.d t7, reg, n+56;
++
++
++#define ST_64(reg, n) \
++ st.d t0, reg, n; \
++ st.d t1, reg, n+8; \
++ st.d t2, reg, n+16; \
++ st.d t3, reg, n+24; \
++ st.d t4, reg, n+32; \
++ st.d t5, reg, n+40; \
++ st.d t6, reg, n+48; \
++ st.d t7, reg, n+56;
++
++/* memmove (const void *dst, const void *src, size_t n) */
++LEAF(MEMMOVE_NAME)
++ add.d a4, a1, a2
++ add.d a3, a0, a2
++ beq a1, a0, less_1bytes
++ move t8, a0
++ srai.d a6, a2, 4 #num/16
++ beqz a6, less_16bytes #num<16
++ srai.d a6, a2, 6 #num/64
++ bnez a6, more_64bytes #num>64
++ srai.d a6, a2, 5
++ beqz a6, less_32bytes #num<32
++
++ ld.d t0, a1, 0 #32. */
++
++#include <sys/asm.h>
++
++#define ST_128(n) \
++ st.d a1, a0, n; \
++ st.d a1, a0, n+8 ; \
++ st.d a1, a0, n+16 ; \
++ st.d a1, a0, n+24 ; \
++ st.d a1, a0, n+32 ; \
++ st.d a1, a0, n+40 ; \
++ st.d a1, a0, n+48 ; \
++ st.d a1, a0, n+56 ; \
++ st.d a1, a0, n+64 ; \
++ st.d a1, a0, n+72 ; \
++ st.d a1, a0, n+80 ; \
++ st.d a1, a0, n+88 ; \
++ st.d a1, a0, n+96 ; \
++ st.d a1, a0, n+104; \
++ st.d a1, a0, n+112; \
++ st.d a1, a0, n+120; \
++
++/* void *memset(void *s, int c, size_t n); */
++LEAF(memset)
++ .align 6
++
++ bstrins.d a1, a1, 15, 8
++ add.d t7, a0, a2
++ bstrins.d a1, a1, 31, 16
++ move t0, a0
++ bstrins.d a1, a1, 63, 32
++ srai.d t8, a2, 4 #num/16
++ beqz t8, less_16bytes #num<16
++ srai.d t8, a2, 6 #num/64
++ bnez t8, more_64bytes #num>64
++ srai.d t8, a2, 5 #num/32
++ beqz t8, less_32bytes #num<32
++ st.d a1, a0, 0 #32. */
++
++#include <sys/asm.h>
++
++/* char * strchr (const char *s1, int c); */
++LEAF(strchr)
++ .align 6
++
++ li.w t4, 0x7
++ lu12i.w a2, 0x01010
++ bstrins.d a1, a1, 15, 8
++ andi t0, a0, 0x7
++
++ ori a2, a2, 0x101
++ andn t4, a0, t4
++ slli.w t1, t0, 3
++
++ ld.d t4, t4, 0
++
++ nor t8, zero, zero
++ bstrins.d a1, a1, 31, 16
++ srl.d t4, t4, t1
++
++ bstrins.d a1, a1, 63, 32
++ bstrins.d a2, a2, 63, 32
++ srl.d a7, t8, t1
++
++ li.w t1, 8
++ nor t8, a7, zero
++ slli.d a3, a2, 7
++ or t5, t8, t4
++ and t3, a7, a1
++
++ sub.w t1, t1, t0
++ nor a3, a3, zero
++ xor t2, t5, t3
++ sub.d a7, t5, a2
++ nor a6, t5, a3
++
++ sub.d a5, t2, a2
++ nor a4, t2, a3
++
++ and a6, a7, a6
++ and a5, a5, a4
++ or a7, a6, a5
++ bnez a7, L(_mc8_a)
++
++ add.d a0, a0, t1
++L(_aloop):
++ ld.d t4, a0, 0
++
++ xor t2, t4, a1
++ sub.d a7, t4, a2
++ nor a6, t4, a3
++ sub.d a5, t2, a2
++
++ nor a4, t2, a3
++ and a6, a7, a6
++ and a5, a5, a4
++ or a7, a6, a5
++ bnez a7, L(_mc8_a)
++
++ ld.d t4, a0, 8
++ addi.d a0, a0, 16
++ xor t2, t4, a1
++ sub.d a7, t4, a2
++ nor a6, t4, a3
++ sub.d a5, t2, a2
++
++ nor a4, t2, a3
++ and a6, a7, a6
++ and a5, a5, a4
++ or a7, a6, a5
++ beqz a7, L(_aloop)
++
++ addi.d a0, a0, -8
++L(_mc8_a):
++
++ ctz.d t0, a5
++ ctz.d t2, a6
++
++ srli.w t0, t0, 3
++ srli.w t2, t2, 3
++ sltu t1, t2, t0
++ add.d v0, a0, t0
++ masknez v0, v0, t1
++ jr ra
++END(strchr)
++
++libc_hidden_builtin_def (strchr)
++weak_alias (strchr, index)
+diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S
+new file mode 100644
+index 00000000..58b8b372
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/strchrnul.S
+@@ -0,0 +1,115 @@
++/* Optimized strchrnul implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++
++#define MOVZ(rd,rs,rt) \
++ masknez t6, rs, rt;\
++ maskeqz rd, rd, rt;\
++ or rd, rd, t6
++
++/* char *strchrnul(const char *s, int c); */
++LEAF(__strchrnul)
++ .align 6
++
++ li.w t4, 0x7
++ lu12i.w a2, 0x01010
++ bstrins.d a1, a1, 15, 8
++ andi t0, a0, 0x7
++
++ ori a2, a2, 0x101
++ andn t4, a0, t4
++ slli.w t1, t0, 3
++ ld.d t4, t4, 0
++
++ nor t8, zero, zero
++ bstrins.d a1, a1, 31, 16
++ srl.d t4, t4, t1
++
++ preld 0, a0, 32
++ bstrins.d a1, a1, 63, 32
++ bstrins.d a2, a2, 63, 32
++ srl.d a7, t8, t1
++
++ nor t8, a7, zero
++ slli.d a3, a2, 7
++ or t5, t8, t4
++ and t3, a7, a1
++
++ nor a3, a3, zero
++ xor t2, t5, t3
++ sub.d a7, t5, a2
++ nor a6, t5, a3
++
++ li.w t1, 8
++ sub.d a5, t2, a2
++ nor a4, t2, a3
++
++ and a6, a7, a6
++ and a5, a5, a4
++ or a7, a6, a5
++ bnez a7, L(_mc8_a)
++
++ sub.w t1, t1, t0
++ add.d a0, a0, t1
++L(_aloop):
++ ld.d t4, a0, 0
++
++ xor t2, t4, a1
++ sub.d a7, t4, a2
++ nor a6, t4, a3
++ sub.d a5, t2, a2
++
++ nor a4, t2, a3
++ and a6, a7, a6
++ and a5, a5, a4
++
++ or a7, a6, a5
++ bnez a7, L(_mc8_a)
++
++ ld.d t4, a0, 8
++ addi.d a0, a0, 16
++
++ xor t2, t4, a1
++ sub.d a7, t4, a2
++ nor a6, t4, a3
++ sub.d a5, t2, a2
++
++ nor a4, t2, a3
++ and a6, a7, a6
++ and a5, a5, a4
++
++ or a7, a6, a5
++ beqz a7, L(_aloop)
++
++ addi.d a0, a0, -8
++L(_mc8_a):
++ ctz.d t0, a5
++ ctz.d t2, a6
++
++ srli.w t0, t0, 3
++ srli.w t2, t2, 3
++ slt t1, t0, t2
++
++ MOVZ(t0,t2,t1)
++
++ add.d v0, a0, t0
++ jr ra
++END(__strchrnul)
++
++weak_alias(__strchrnul, strchrnul)
+diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S
+new file mode 100644
+index 00000000..0f7a6d55
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/strcmp.S
+@@ -0,0 +1,161 @@
++/* Optimized strcmp implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++
++/* Parameters and Results */
++#define src1 a0
++#define src2 a1
++#define result v0
++
++/* Internal variable */
++#define src1_off a2
++#define src2_off a3
++#define data1 t0
++#define data2 t1
++#define has_nul t2
++#define diff t3
++#define syndrome t4
++#define zeroones t5
++#define sevenf t6
++#define pos t7
++#define exchange t8
++#define tmp1 a4
++#define tmp2 a5
++#define tmp3 a6
++#define tmp4 a7
++
++/* rd <- if rc then ra else rb
++ tmp3 will be destroyed */
++#define CONDITIONSEL(rd, rc, ra, rb)\
++ masknez tmp3, rb, rc;\
++ maskeqz rd, ra, rc;\
++ or rd, rd, tmp3
++
++LEAF(strcmp)
++ .align 4
++
++ xor tmp1, src1, src2
++ lu12i.w zeroones, 0x01010
++ lu12i.w sevenf, 0x7f7f7
++ andi src1_off, src1, 0x7
++ ori zeroones, zeroones, 0x101
++ ori sevenf, sevenf, 0xf7f
++ andi tmp1, tmp1, 0x7
++ bstrins.d zeroones, zeroones, 63, 32
++ bstrins.d sevenf, sevenf, 63, 32
++ bnez tmp1, strcmp_misaligned8
++ bnez src1_off, strcmp_mutual_align
++strcmp_loop_aligned:
++ ld.d data1, src1, 0
++ addi.d src1, src1, 8
++ ld.d data2, src2, 0
++ addi.d src2, src2, 8
++strcmp_start_realigned:
++ sub.d tmp1, data1, zeroones
++ or tmp2, data1, sevenf
++ xor diff, data1, data2
++ andn has_nul, tmp1, tmp2
++ or syndrome, diff, has_nul
++ beqz syndrome, strcmp_loop_aligned
++
++strcmp_end:
++ ctz.d pos, syndrome
++ bstrins.d pos, zero, 2, 0
++ srl.d data1, data1, pos
++ srl.d data2, data2, pos
++ andi data1, data1, 0xff
++ andi data2, data2, 0xff
++ sub.d result, data1, data2
++ jr ra
++strcmp_mutual_align:
++ bstrins.d src1, zero, 2, 0
++ bstrins.d src2, zero, 2, 0
++ slli.d tmp1, src1_off, 0x3
++ ld.d data1, src1, 0
++ sub.d tmp1, zero, tmp1
++ ld.d data2, src2, 0
++ addi.d src1, src1, 8
++ addi.d src2, src2, 8
++ nor tmp2, zero, zero
++ srl.d tmp2, tmp2, tmp1
++ or data1, data1, tmp2
++ or data2, data2, tmp2
++ b strcmp_start_realigned
++
++strcmp_misaligned8:
++ /* check
++ if ((src1 != 0) && ((src2 == 0 ) || (src1 < src2)))
++ then exchange(src1,src2). */
++ andi src2_off, src2, 0x7
++ slt tmp2, src1_off, src2_off
++ CONDITIONSEL(tmp2, src2_off, tmp2, tmp1)
++ maskeqz exchange, tmp2, src1_off
++ xor tmp3, src1, src2
++ maskeqz tmp3, tmp3, exchange
++ xor src1, src1, tmp3
++ xor src2, src2, tmp3
++
++ andi src1_off, src1, 0x7
++ beqz src1_off, strcmp_loop_misaligned
++strcmp_do_misaligned:
++ ld.bu data1, src1, 0
++ ld.bu data2, src2, 0
++ xor tmp3, data1, data2
++ addi.d src1, src1, 1
++ masknez tmp3, data1, tmp3
++ addi.d src2, src2, 1
++ beqz tmp3, strcmp_done
++ andi src1_off, src1, 0x7
++ bnez src1_off, strcmp_do_misaligned
++
++strcmp_loop_misaligned:
++ andi tmp1, src2, 0xff8
++ xori tmp1, tmp1, 0xff8
++ beqz tmp1, strcmp_do_misaligned
++ ld.d data1, src1, 0
++ ld.d data2, src2, 0
++ addi.d src1, src1, 8
++ addi.d src2, src2, 8
++
++ sub.d tmp1, data1, zeroones
++ or tmp2, data1, sevenf
++ xor diff, data1, data2
++ andn has_nul, tmp1, tmp2
++ or syndrome, diff, has_nul
++ beqz syndrome, strcmp_loop_misaligned
++strcmp_misalign_end:
++ ctz.d pos, syndrome
++ bstrins.d pos, zero, 2, 0
++ srl.d data1, data1, pos
++ srl.d data2, data2, pos
++ andi data1, data1, 0xff
++ andi data2, data2, 0xff
++ sub.d tmp1, data1, data2
++ sub.d tmp2, data2, data1
++ CONDITIONSEL(result, exchange, tmp2, tmp1)
++ jr ra
++
++strcmp_done:
++ sub.d tmp1, data1, data2
++ sub.d tmp2, data2, data1
++ CONDITIONSEL(result, exchange, tmp2, tmp1)
++ jr ra
++END(strcmp)
++
++libc_hidden_builtin_def (strcmp)
+diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S
+new file mode 100644
+index 00000000..03d9d361
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/strcpy.S
+@@ -0,0 +1,175 @@
++/* Optimized strcpy implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++
++/* Parameters and Results */
++#define dest a0
++#define src a1
++#define result v0
++
++/* Internal variable */
++#define data t0
++#define data1 t1
++#define has_nul t2
++#define diff t3
++#define syndrome t4
++#define zeroones t5
++#define sevenf t6
++#define pos t7
++#define dest_backup t8
++#define tmp1 a4
++#define tmp2 a5
++#define tmp3 a6
++#define dest_off a2
++#define src_off a3
++#define tmp4 a7
++
++/* rd <- if rc then ra else rb
++ tmp3 will be destroyed. */
++#define CONDITIONSEL(rd, rc, ra, rb)\
++ masknez tmp3, rb, rc;\
++ maskeqz rd, ra, rc;\
++ or rd, rd, tmp3
++
++/* char * strcpy (const char *s1, const char *s2); */
++LEAF(strcpy)
++ .align 4
++
++ move dest_backup, dest
++ lu12i.w zeroones, 0x01010
++ lu12i.w sevenf, 0x7f7f7
++ ori zeroones, zeroones, 0x101
++ ori sevenf, sevenf, 0xf7f
++ bstrins.d zeroones, zeroones, 63, 32
++ bstrins.d sevenf, sevenf, 63, 32
++ andi src_off, src, 0x7
++ beqz src_off, strcpy_loop_aligned_1
++ b strcpy_mutual_align
++strcpy_loop_aligned:
++ st.d data, dest, 0
++ addi.d dest, dest, 8
++strcpy_loop_aligned_1:
++ ld.d data, src, 0
++ addi.d src, src, 8
++strcpy_start_realigned:
++ sub.d tmp1, data, zeroones
++ or tmp2, data, sevenf
++ andn has_nul, tmp1, tmp2
++ beqz has_nul, strcpy_loop_aligned
++
++strcpy_end:
++
++ /* 8 4 2 1 */
++ ctz.d pos, has_nul
++ srli.d pos, pos, 3
++ addi.d pos, pos, 1
++ /* Do 8/4/2/1 strcpy based on pos value.
++ pos value is the number of bytes to be copied;
++ the bytes include the final \0, so the max length is 8 and the min length is 1. */
++strcpy_end_8:
++ andi tmp1, pos, 0x8
++ beqz tmp1, strcpy_end_4
++ st.d data, dest, 0
++ move dest, dest_backup
++ jr ra
++strcpy_end_4:
++ andi tmp1, pos, 0x4
++ beqz tmp1, strcpy_end_2
++ st.w data, dest, 0
++ srli.d data, data, 32
++ addi.d dest, dest, 4
++strcpy_end_2:
++ andi tmp1, pos, 0x2
++ beqz tmp1, strcpy_end_1
++ st.h data, dest, 0
++ srli.d data, data, 16
++ addi.d dest, dest, 2
++strcpy_end_1:
++ andi tmp1, pos, 0x1
++ beqz tmp1, strcpy_end_ret
++ st.b data, dest, 0
++strcpy_end_ret:
++ move result, dest_backup
++ jr ra
++
++
++strcpy_mutual_align:
++ /* Check if around src page bound.
++ if not, go to page cross ok.
++ if it is, do further check.
++ use tmp2 to accelerate. */
++
++ li.w tmp2, 0xff8
++ andi tmp1, src, 0xff8
++ beq tmp1, tmp2, strcpy_page_cross
++
++strcpy_page_cross_ok:
++ /* Load a misaligned double word and check if has \0
++ If no, do a misaligned double word paste.
++ If yes, calculate the number of available bytes,
++ then jump to 4/2/1 end. */
++ ld.d data, src, 0
++ sub.d tmp1, data, zeroones
++ or tmp2, data, sevenf
++ andn has_nul, tmp1, tmp2
++ bnez has_nul, strcpy_end
++strcpy_mutual_align_finish:
++ /* Before jumping back to the align loop, make dest/src aligned.
++ This will cause a duplicated paste for several bytes between the first double word and the second double word,
++ but should not bring a problem. */
++ li.w tmp1, 8
++ st.d data, dest, 0
++ sub.d tmp1, tmp1, src_off
++ add.d src, src, tmp1
++ add.d dest, dest, tmp1
++
++ b strcpy_loop_aligned_1
++
++strcpy_page_cross:
++ /*
++ ld.d from aligned address(src & ~0x7).
++ check if high bytes have \0.
++ if not, go back to page cross ok,
++ since the string is supposed to cross the page bound in such situation.
++ if it is, do a srl for data to make it seem like a direct double word from src,
++ then go to 4/2/1 strcpy end.
++
++ tmp4 is 0xffff...ffff mask
++ tmp2 demonstrates the bytes to be masked
++ tmp2 = src_off << 3
++ data = data >> (src_off * 8) | -1 << (64 - src_off * 8)
++ and
++ -1 << (64 - src_off * 8) -> ~(-1 >> (src_off * 8)) */
++
++ li.w tmp1, 0x7
++ andn tmp3, src, tmp1
++ ld.d data, tmp3, 0
++ li.w tmp4, -1
++ slli.d tmp2, src_off, 3
++ srl.d tmp4, tmp4, tmp2
++ srl.d data, data, tmp2
++ nor tmp4, tmp4, zero
++ or data, data, tmp4
++ sub.d tmp1, data, zeroones
++ or tmp2, data, sevenf
++ andn has_nul, tmp1, tmp2
++ beqz has_nul, strcpy_page_cross_ok
++ b strcpy_end
++END(strcpy)
++libc_hidden_builtin_def (strcpy)
+diff --git a/sysdeps/loongarch/lp64/strlen.S b/sysdeps/loongarch/lp64/strlen.S
+new file mode 100644
+index 00000000..3569598c
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/strlen.S
+@@ -0,0 +1,102 @@
++/* Optimized strlen implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++#include <sys/regdef.h>
++
++/* size_t strlen (const char *s1); */
++LEAF(strlen)
++ .align 5
++
++ nor t4, zero, zero
++ lu12i.w a2, 0x01010
++ andi t5, a0, 0x7
++
++ li.w t7, 0x7
++ slli.d t6, t5, 0x3
++ andn t7, a0, t7
++ ld.d a1, t7, 0
++ sub.d t7, zero, t6
++ sll.d t4, t4, t7
++ maskeqz t4, t4, t6
++ srl.d a1, a1, t6
++ or a1, a1, t4
++
++
++ ori a2, a2, 0x101
++ nor t1, a1, zero
++ li.w a4, 8
++
++ bstrins.d a2, a2, 63, 32
++ sub.d a5, a4, t5
++ move t5, a0
++
++ sub.d t0, a1, a2
++ slli.d t4, a2, 7
++ nor a3, zero, t4
++ nor t1, a1, a3
++
++ and t0, t0, t1
++ bnez t0, strlen_count1
++ add.d a0, a0, a5
++strlen_loop:
++ ld.d a1, a0, 0
++ sub.d t0, a1, a2
++ and t1, t0, t4
++ bnez t1, strlen_count_pre
++ ld.d a1, a0, 8
++ sub.d t0, a1, a2
++ and t1, t0, t4
++ addi.d a0, a0, 16
++ beqz t1, strlen_loop
++strlen_count:
++ addi.d a0, a0, -8
++strlen_count_pre:
++ nor t1, a1, a3
++ and t0, t0, t1
++ beqz t0, strlen_noascii_start
++strlen_count1:
++ ctz.d t1, t0
++ sub.d v0, a0, t5
++ srli.w t1, t1, 3
++ add.d v0, v0, t1
++ jr ra
++strlen_noascii_start:
++ addi.d a0, a0, 8
++strlen_loop_noascii:
++ ld.d a1, a0, 0
++ sub.d t0, a1, a2
++ nor t1, a1, a3
++ and t0, t0, t1
++ bnez t0, strlen_count1
++ ld.d a1, a0, 8
++ sub.d t0, a1, a2
++ nor t1, a1, a3
++ and t0, t0, t1
++ addi.d a0, a0, 16
++ beqz t0, strlen_loop_noascii
++ addi.d a0, a0, -8
++ ctz.d t1, t0
++ sub.d v0, a0, t5
++ srli.w t1, t1, 3
++ add.d v0, v0, t1
++ jr ra
++END(strlen)
++
++libc_hidden_builtin_def (strlen)
++
+diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S
+new file mode 100644
+index 00000000..979ea40a
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/strncmp.S
+@@ -0,0 +1,225 @@
++/* Optimized strncmp implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++
++/* Parameters and Results */
++#define src1 a0
++#define src2 a1
++#define limit a2
++#define result v0
++
++
++/* Internal variable */
++#define data1 t0
++#define data2 t1
++#define has_nul t2
++#define diff t3
++#define syndrome t4
++#define zeroones t5
++#define sevenf t6
++#define pos t7
++#define exchange t8
++#define tmp1 a5
++#define tmp2 a6
++#define tmp3 a7
++#define src1_off a3
++#define limit_wd a4
++
++LEAF(strncmp)
++ .align 4
++ beqz limit, strncmp_ret0
++
++ xor tmp1, src1, src2
++ lu12i.w zeroones, 0x01010
++ lu12i.w sevenf, 0x7f7f7
++ andi src1_off, src1, 0x7
++ ori zeroones, zeroones, 0x101
++ andi tmp1, tmp1, 0x7
++ ori sevenf, sevenf, 0xf7f
++ bstrins.d zeroones, zeroones, 63, 32
++ bstrins.d sevenf, sevenf, 63, 32
++ bnez tmp1, strncmp_misaligned8
++ bnez src1_off, strncmp_mutual_align
++ addi.d limit_wd, limit, -1
++ srli.d limit_wd, limit_wd, 3
++
++strncmp_loop_aligned:
++ ld.d data1, src1, 0
++ addi.d src1, src1, 8
++ ld.d data2, src2, 0
++ addi.d src2, src2, 8
++strncmp_start_realigned:
++ addi.d limit_wd, limit_wd, -1
++ sub.d tmp1, data1, zeroones
++ or tmp2, data1, sevenf
++ xor diff, data1, data2
++ andn has_nul, tmp1, tmp2
++ srli.d tmp1, limit_wd, 63
++ or syndrome, diff, has_nul
++ or tmp2, syndrome, tmp1
++ beqz tmp2, strncmp_loop_aligned
++
++ /* if not reach limit */
++ bge limit_wd, zero, strncmp_not_limit
++ /* if reach limit */
++ andi limit, limit, 0x7
++ li.w tmp1, 0x8
++ sub.d limit, tmp1, limit
++ slli.d limit, limit, 0x3
++ li.d tmp1, -1
++ srl.d tmp1, tmp1, limit
++ and data1, data1, tmp1
++ and data2, data2, tmp1
++ orn syndrome, syndrome, tmp1
++
++
++strncmp_not_limit:
++ ctz.d pos, syndrome
++ bstrins.d pos, zero, 2, 0
++ srl.d data1, data1, pos
++ srl.d data2, data2, pos
++ andi data1, data1, 0xff
++ andi data2, data2, 0xff
++ sub.d result, data1, data2
++ jr ra
++
++
++
++strncmp_mutual_align:
++ bstrins.d src1, zero, 2, 0
++ bstrins.d src2, zero, 2, 0
++ slli.d tmp1, src1_off, 0x3
++ ld.d data1, src1, 0
++ ld.d data2, src2, 0
++ addi.d src2, src2, 8
++ addi.d src1, src1, 8
++
++ addi.d limit_wd, limit, -1
++ andi tmp3, limit_wd, 0x7
++ srli.d limit_wd, limit_wd, 3
++ add.d limit, limit, src1_off
++ add.d tmp3, tmp3, src1_off
++ srli.d tmp3, tmp3, 3
++ add.d limit_wd, limit_wd, tmp3
++
++ sub.d tmp1, zero, tmp1
++ nor tmp2, zero, zero
++ srl.d tmp2, tmp2, tmp1
++ or data1, data1, tmp2
++ or data2, data2, tmp2
++ b strncmp_start_realigned
++
++strncmp_misaligned8:
++
++ li.w tmp1, 0x10
++ bge limit, tmp1, strncmp_try_words
++strncmp_byte_loop:
++ ld.bu data1, src1, 0
++ ld.bu data2, src2, 0
++ addi.d limit, limit, -1
++ xor tmp1, data1, data2
++ masknez tmp1, data1, tmp1
++ maskeqz tmp1, limit, tmp1
++ beqz tmp1, strncmp_done
++
++ ld.bu data1, src1, 1
++ ld.bu data2, src2, 1
++ addi.d src1, src1, 2
++ addi.d src2, src2, 2
++ addi.d limit, limit, -1
++ xor tmp1, data1, data2
++ masknez tmp1, data1, tmp1
++ maskeqz tmp1, limit, tmp1
++ bnez tmp1, strncmp_byte_loop
++
++
++strncmp_done:
++ sub.d result, data1, data2
++ jr ra
++
++strncmp_try_words:
++ srli.d limit_wd, limit, 3
++ beqz src1_off, strncmp_do_misaligned
++
++ sub.d src1_off, zero, src1_off
++ andi src1_off, src1_off, 0x7
++ sub.d limit, limit, src1_off
++ srli.d limit_wd, limit, 0x3
++
++strncmp_page_end_loop:
++ ld.bu data1, src1, 0
++ ld.bu data2, src2, 0
++ addi.d src1, src1, 1
++ addi.d src2, src2, 1
++ xor tmp1, data1, data2
++ masknez tmp1, data1, tmp1
++ beqz tmp1, strncmp_done
++ andi tmp1, src1, 0x7
++ bnez tmp1, strncmp_page_end_loop
++strncmp_do_misaligned:
++ li.w src1_off, 0x8
++ addi.d limit_wd, limit_wd, -1
++ blt limit_wd, zero, strncmp_done_loop
++
++strncmp_loop_misaligned:
++ andi tmp2, src2, 0xff8
++ xori tmp2, tmp2, 0xff8
++ beqz tmp2, strncmp_page_end_loop
++
++ ld.d data1, src1, 0
++ ld.d data2, src2, 0
++ addi.d src1, src1, 8
++ addi.d src2, src2, 8
++ sub.d tmp1, data1, zeroones
++ or tmp2, data1, sevenf
++ xor diff, data1, data2
++ andn has_nul, tmp1, tmp2
++ or syndrome, diff, has_nul
++ bnez syndrome, strncmp_not_limit
++ addi.d limit_wd, limit_wd, -1
++ bge limit_wd, zero, strncmp_loop_misaligned
++
++strncmp_done_loop:
++ andi limit, limit, 0x7
++ beqz limit, strncmp_not_limit
++ /* Read the last double word;
++ check if the final part is about to exceed the page */
++ andi tmp1, src2, 0x7
++ andi tmp2, src2, 0xff8
++ add.d tmp1, tmp1, limit
++ xori tmp2, tmp2, 0xff8
++ andi tmp1, tmp1, 0x8
++ masknez tmp1, tmp1, tmp2
++ bnez tmp1, strncmp_byte_loop
++ addi.d src1, src1, -8
++ addi.d src2, src2, -8
++ ldx.d data1, src1, limit
++ ldx.d data2, src2, limit
++ sub.d tmp1, data1, zeroones
++ or tmp2, data1, sevenf
++ xor diff, data1, data2
++ andn has_nul, tmp1, tmp2
++ or syndrome, diff, has_nul
++ bnez syndrome, strncmp_not_limit
++
++strncmp_ret0:
++ move result, zero
++ jr ra
++END(strncmp)
++libc_hidden_builtin_def (strncmp)
+diff --git a/sysdeps/loongarch/lp64/strnlen.S b/sysdeps/loongarch/lp64/strnlen.S
+new file mode 100644
+index 00000000..8eaa60e2
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/strnlen.S
+@@ -0,0 +1,125 @@
++/* Optimized strnlen implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++
++/* rd <- if rc then ra else rb
++ a5 will be destroyed. */
++#define CONDITIONSEL(rd, ra, rb, rc)\
++ masknez a5, rb, rc;\
++ maskeqz rd, ra, rc;\
++ or rd, rd, a5
++
++/* Parameters and Results */
++#define srcin a0
++#define limit a1
++#define len v0
++
++/* Internal variable */
++#define data1 t0
++#define data2 t1
++#define has_nul1 t2
++#define has_nul2 t3
++#define src t4
++#define zeroones t5
++#define sevenf t6
++#define data2a t7
++#define tmp6 t7
++#define pos t8
++#define tmp1 a2
++#define tmp2 a3
++#define tmp3 a4
++#define tmp4 a5
++#define tmp5 a6
++#define limit_wd a7
++
++/* size_t strnlen (const char *s1, size_t maxlen); */
++LEAF(__strnlen)
++ .align 4
++ beqz limit, _hit_limit
++ lu12i.w zeroones, 0x01010
++ lu12i.w sevenf, 0x7f7f7
++ ori zeroones, zeroones, 0x101
++ ori sevenf, sevenf, 0xf7f
++ bstrins.d zeroones, zeroones, 63, 32
++ bstrins.d sevenf, sevenf, 63, 32
++ andi tmp1, srcin, 15
++ sub.d src, srcin, tmp1
++ bnez tmp1, misaligned
++ addi.d limit_wd, limit, -1
++ srli.d limit_wd, limit_wd, 4
++_loop:
++ ld.d data1, src, 0
++ ld.d data2, src, 8
++ addi.d src, src, 16
++_realigned:
++ sub.d tmp1, data1, zeroones
++ or tmp2, data1, sevenf
++ sub.d tmp3, data2, zeroones
++ or tmp4, data2, sevenf
++ andn has_nul1, tmp1, tmp2
++ andn has_nul2, tmp3, tmp4
++ addi.d limit_wd, limit_wd, -1
++ srli.d tmp1, limit_wd, 63
++ or tmp2, has_nul1, has_nul2
++ or tmp3, tmp1, tmp2
++ beqz tmp3, _loop
++ beqz tmp2, _hit_limit
++ sub.d len, src, srcin
++ beqz has_nul1, _nul_in_data2
++ move has_nul2, has_nul1
++ addi.d len, len, -8
++_nul_in_data2:
++ ctz.d pos, has_nul2
++ srli.d pos, pos, 3
++ addi.d len, len, -8
++ add.d len, len, pos
++ sltu tmp1, len, limit
++ CONDITIONSEL(len, len, limit, tmp1)
++ jr ra
++
++misaligned:
++ addi.d limit_wd, limit, -1
++ sub.d tmp4, zero, tmp1
++ andi tmp3, limit_wd, 15
++ srli.d limit_wd, limit_wd, 4
++ li.d tmp5, -1
++ ld.d data1, src, 0
++ ld.d data2, src, 8
++ addi.d src, src, 16
++ slli.d tmp4, tmp4, 3
++ add.d tmp3, tmp3, tmp1
++ srl.d tmp2, tmp5, tmp4
++ srli.d tmp3, tmp3, 4
++ add.d limit_wd, limit_wd, tmp3
++ or data1, data1, tmp2
++ or data2a, data2, tmp2
++ li.w tmp3, 9
++ sltu tmp1, tmp1, tmp3
++ CONDITIONSEL(data1, data1, tmp5, tmp1)
++ CONDITIONSEL(data2, data2, data2a, tmp1)
++ b _realigned
++
++_hit_limit:
++ move len, limit
++ jr ra
++END(__strnlen)
++
++weak_alias (__strnlen, strnlen)
++libc_hidden_def (strnlen)
++libc_hidden_def (__strnlen)
+--
+2.33.0
+
diff --git a/glibc.spec b/glibc.spec
index 959fe67e4d13f517ef2f8561a29c992ca3d67ad0..e60eb76da734bc5b9c2c0f4d793ed603cbc995a9 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -66,7 +66,7 @@
 ##############################################################################
 Name: glibc
 Version: 2.34
-Release: 105
+Release: 106
 Summary: The GNU libc libraries
 License: %{all_license}
 URL: http://www.gnu.org/software/glibc/
@@ -272,6 +272,7 @@ Patch9021: x86-use-total-l3cache-for-non_temporal_threshold.patch
 Patch9022: login-Add-back-libutil-as-an-empty-library.patch
 Patch9023: malloc-Fix-malloc-debug-for-2.35-onwards.patch
 Patch9024: LoongArch-Port.patch
+Patch9025: LoongArch-Optimize-string-functions-including-memcpy.patch
 %endif
 
 Provides: ldconfig rtld(GNU_HASH) bundled(gnulib)
@@ -1438,6 +1439,9 @@ fi
 %endif
 
 %changelog
+* Sat Jan 14 2023 Xue Liu - 2.34-106
+- LoongArch: Optimize some string functions
+
 * Wed Dec 21 2022 wanghongliang - 2.34-105
 - LoongArch Port
 - Add login-Add-back-libutil-as-an-empty-library.patch from upstream