From 065775f8bdf46124bddfe3a432031b98a66846cd Mon Sep 17 00:00:00 2001
From: Xue Liu
Date: Sat, 14 Jan 2023 16:48:19 +0800
Subject: [PATCH] LoongArch: Optimize string functions including memcpy,
 memmove, memset, strchr, strchrnul, strcmp, strncmp, strcpy, strlen, strnlen.

---
 ...ze-string-functions-including-memcpy.patch | 2264 +++++++++++++++++
 glibc.spec                                    |    6 +-
 2 files changed, 2269 insertions(+), 1 deletion(-)
 create mode 100644 LoongArch-Optimize-string-functions-including-memcpy.patch

diff --git a/LoongArch-Optimize-string-functions-including-memcpy.patch b/LoongArch-Optimize-string-functions-including-memcpy.patch
new file mode 100644
index 0000000..17130d0
--- /dev/null
+++ b/LoongArch-Optimize-string-functions-including-memcpy.patch
@@ -0,0 +1,2264 @@
+From da7a2b19fc87165caaa1d5de5b058680f09d155d Mon Sep 17 00:00:00 2001
+From: Xue Liu
+Date: Sat, 14 Jan 2023 16:14:24 +0800
+Subject: [PATCH] LoongArch: Optimize string functions including memcpy,
+ memmove, memset, strchr, strchrnul, strcmp, strncmp, strcpy, strlen, strnlen.
+
+Change-Id: I2975aea74f44bf2c9e01a6dfb6ca2eaa57aa5f7c
+---
+ sysdeps/loongarch/lp64/memcpy.S    | 258 ++++++++++++++++
+ sysdeps/loongarch/lp64/memmove.S   | 476 +++++++++++++++++++++++++++++
+ sysdeps/loongarch/lp64/memset.S    | 175 +++++++++++
+ sysdeps/loongarch/lp64/strchr.S    | 140 +++++++++
+ sysdeps/loongarch/lp64/strchrnul.S | 156 ++++++++++
+ sysdeps/loongarch/lp64/strcmp.S    | 197 ++++++++++++
+ sysdeps/loongarch/lp64/strcpy.S    | 210 +++++++++++++
+ sysdeps/loongarch/lp64/strlen.S    | 135 ++++++++
+ sysdeps/loongarch/lp64/strncmp.S   | 269 ++++++++++++++++
+ sysdeps/loongarch/lp64/strnlen.S   | 155 ++++++++++
+ 10 files changed, 2171 insertions(+)
+ create mode 100644 sysdeps/loongarch/lp64/memcpy.S
+ create mode 100644 sysdeps/loongarch/lp64/memmove.S
+ create mode 100644 sysdeps/loongarch/lp64/memset.S
+ create mode 100644 sysdeps/loongarch/lp64/strchr.S
+ create mode 100644 sysdeps/loongarch/lp64/strchrnul.S
+ create mode 100644 sysdeps/loongarch/lp64/strcmp.S
+ create mode 100644 sysdeps/loongarch/lp64/strcpy.S
+ create mode 100644 sysdeps/loongarch/lp64/strlen.S
+ create mode 100644 sysdeps/loongarch/lp64/strncmp.S
+ create mode 100644 sysdeps/loongarch/lp64/strnlen.S
+
+diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S
+new file mode 100644
+index 00000000..5e531523
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/memcpy.S
+@@ -0,0 +1,258 @@
++#ifdef _LIBC
++#include <sysdep.h>
++#include <sys/regdef.h>
++#include <sys/asm.h>
++#else
++#include <regdef.h>
++#include <sys/asm.h>
++#endif
++
++/* Allow the routine to be named something else if desired.
*/ ++#ifndef MEMCPY_NAME ++#define MEMCPY_NAME memcpy ++#endif ++ ++#define LD_64(reg, n) \ ++ ld.d t0, reg, n; \ ++ ld.d t1, reg, n+8; \ ++ ld.d t2, reg, n+16; \ ++ ld.d t3, reg, n+24; \ ++ ld.d t4, reg, n+32; \ ++ ld.d t5, reg, n+40; \ ++ ld.d t6, reg, n+48; \ ++ ld.d t7, reg, n+56; ++ ++#define ST_64(reg, n) \ ++ st.d t0, reg, n; \ ++ st.d t1, reg, n+8; \ ++ st.d t2, reg, n+16; \ ++ st.d t3, reg, n+24; \ ++ st.d t4, reg, n+32; \ ++ st.d t5, reg, n+40; \ ++ st.d t6, reg, n+48; \ ++ st.d t7, reg, n+56; ++ ++#ifdef ANDROID_CHANGES ++LEAF(MEMCPY_NAME, 0) ++#else ++LEAF(MEMCPY_NAME) ++#endif ++ ++//1st var: dst ptr: void *a1 $r4 a0 ++//2nd var: src ptr: void *a2 $r5 a1 ++//3rd var: size_t len $r6 a2 ++//t0~t9 registers as temp ++ ++ add.d a4, a1, a2 ++ add.d a3, a0, a2 ++ li.w a6, 16 ++ bge a6, a2, less_16bytes ++ li.w a6, 128 ++ blt a6, a2, long_bytes ++ li.w a6, 64 ++ blt a6, a2, more_64bytes ++ li.w a6, 32 ++ blt a6, a2, more_32bytes ++ ++ /* 17...32 */ ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a4, -16 ++ ld.d t3, a4, -8 ++ st.d t0, a0, 0 ++ st.d t1, a0, 8 ++ st.d t2, a3, -16 ++ st.d t3, a3, -8 ++ jr ra ++ ++more_64bytes: ++ srli.d t8, a0, 3 ++ slli.d t8, t8, 3 ++ addi.d t8, t8, 0x8 ++ sub.d a7, a0, t8 ++ ld.d t0, a1, 0 ++ sub.d a1, a1, a7 ++ st.d t0, a0, 0 ++ ++ add.d a7, a7, a2 ++ addi.d a7, a7, -0x20 ++loop_32: ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a1, 16 ++ ld.d t3, a1, 24 ++ st.d t0, t8, 0 ++ st.d t1, t8, 8 ++ st.d t2, t8, 16 ++ st.d t3, t8, 24 ++ ++ addi.d t8, t8, 0x20 ++ addi.d a1, a1, 0x20 ++ addi.d a7, a7, -0x20 ++ blt zero, a7, loop_32 ++ ++ ld.d t4, a4, -32 ++ ld.d t5, a4, -24 ++ ld.d t6, a4, -16 ++ ld.d t7, a4, -8 ++ st.d t4, a3, -32 ++ st.d t5, a3, -24 ++ st.d t6, a3, -16 ++ st.d t7, a3, -8 ++ ++ jr ra ++ ++more_32bytes: ++ /* 33...64 */ ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a1, 16 ++ ld.d t3, a1, 24 ++ ld.d t4, a4, -32 ++ ld.d t5, a4, -24 ++ ld.d t6, a4, -16 ++ ld.d t7, a4, -8 ++ st.d t0, a0, 0 ++ st.d t1, a0, 8 ++ st.d t2, a0, 16 ++ st.d t3, a0, 24 ++ st.d t4, a3, -32 ++ st.d t5, a3, -24 ++ st.d t6, a3, -16 ++ st.d t7, a3, -8 ++ jr ra ++ ++less_16bytes: ++ srai.d a6, a2, 3 ++ beqz a6, less_8bytes ++ ++ /* 8...16 */ ++ ld.d t0, a1, 0 ++ ld.d t1, a4, -8 ++ st.d t0, a0, 0 ++ st.d t1, a3, -8 ++ ++ jr ra ++ ++less_8bytes: ++ srai.d a6, a2, 2 ++ beqz a6, less_4bytes ++ ++ /* 4...7 */ ++ ld.w t0, a1, 0 ++ ld.w t1, a4, -4 ++ st.w t0, a0, 0 ++ st.w t1, a3, -4 ++ jr ra ++ ++less_4bytes: ++ srai.d a6, a2, 1 ++ beqz a6, less_2bytes ++ ++ /* 2...3 */ ++ ld.h t0, a1, 0 ++ ld.h t1, a4, -2 ++ st.h t0, a0, 0 ++ st.h t1, a3, -2 ++ jr ra ++ ++less_2bytes: ++ beqz a2, less_1bytes ++ ++ ld.b t0, a1, 0 ++ st.b t0, a0, 0 ++ jr ra ++ ++less_1bytes: ++ jr ra ++ ++long_bytes: ++ srli.d t8, a0, 3 ++ slli.d t8, t8, 3 ++ beq a0, t8, start ++ ++ ld.d t0, a1, 0 ++ addi.d t8, t8, 0x8 ++ st.d t0, a0, 0 ++ sub.d a7, a0, t8 ++ sub.d a1, a1, a7 ++ ++start: ++ addi.d a5, a3, -0x80 ++ blt a5, t8, align_end_proc ++ ++loop_128: ++ LD_64(a1, 0) ++ ST_64(t8, 0) ++ LD_64(a1, 64) ++ addi.d a1, a1, 0x80 ++ ST_64(t8, 64) ++ addi.d t8, t8, 0x80 ++ bge a5, t8, loop_128 ++ ++align_end_proc: ++ sub.d a2, a3, t8 ++ ++ pcaddi t1, 34 ++ andi t2, a2, 0x78 ++ sub.d t1, t1, t2 ++ jirl zero, t1, 0 ++ ++end_120_128_unalign: ++ ld.d t0, a1, 112 ++ st.d t0, t8, 112 ++end_112_120_unalign: ++ ld.d t0, a1, 104 ++ st.d t0, t8, 104 ++end_104_112_unalign: ++ ld.d t0, a1, 96 ++ st.d t0, t8, 96 ++end_96_104_unalign: ++ ld.d t0, a1, 88 ++ st.d t0, t8, 88 ++end_88_96_unalign: ++ ld.d t0, a1, 80 ++ st.d t0, t8, 80 
++end_80_88_unalign: ++ ld.d t0, a1, 72 ++ st.d t0, t8, 72 ++end_72_80_unalign: ++ ld.d t0, a1, 64 ++ st.d t0, t8, 64 ++end_64_72_unalign: ++ ld.d t0, a1, 56 ++ st.d t0, t8, 56 ++end_56_64_unalign: ++ ld.d t0, a1, 48 ++ st.d t0, t8, 48 ++end_48_56_unalign: ++ ld.d t0, a1, 40 ++ st.d t0, t8, 40 ++end_40_48_unalign: ++ ld.d t0, a1, 32 ++ st.d t0, t8, 32 ++end_32_40_unalign: ++ ld.d t0, a1, 24 ++ st.d t0, t8, 24 ++end_24_32_unalign: ++ ld.d t0, a1, 16 ++ st.d t0, t8, 16 ++end_16_24_unalign: ++ ld.d t0, a1, 8 ++ st.d t0, t8, 8 ++end_8_16_unalign: ++ ld.d t0, a1, 0 ++ st.d t0, t8, 0 ++end_0_8_unalign: ++ ld.d t0, a4, -8 ++ st.d t0, a3, -8 ++ ++ jr ra ++ ++END(MEMCPY_NAME) ++#ifndef ANDROID_CHANGES ++#ifdef _LIBC ++libc_hidden_builtin_def (MEMCPY_NAME) ++#endif ++#endif ++ +diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S +new file mode 100644 +index 00000000..f87d036b +--- /dev/null ++++ b/sysdeps/loongarch/lp64/memmove.S +@@ -0,0 +1,476 @@ ++#ifdef _LIBC ++#include ++#include ++#include ++#else ++#include ++#include ++#endif ++ ++/* Allow the routine to be named something else if desired. */ ++#ifndef MEMMOVE_NAME ++#define MEMMOVE_NAME memmove ++#endif ++ ++#define LD_64(reg, n) \ ++ ld.d t0, reg, n; \ ++ ld.d t1, reg, n+8; \ ++ ld.d t2, reg, n+16; \ ++ ld.d t3, reg, n+24; \ ++ ld.d t4, reg, n+32; \ ++ ld.d t5, reg, n+40; \ ++ ld.d t6, reg, n+48; \ ++ ld.d t7, reg, n+56; ++ ++ ++#define ST_64(reg, n) \ ++ st.d t0, reg, n; \ ++ st.d t1, reg, n+8; \ ++ st.d t2, reg, n+16; \ ++ st.d t3, reg, n+24; \ ++ st.d t4, reg, n+32; \ ++ st.d t5, reg, n+40; \ ++ st.d t6, reg, n+48; \ ++ st.d t7, reg, n+56; ++ ++#define LDST_1024 \ ++ LD_64(a1, 0); \ ++ ST_64(a0, 0); \ ++ LD_64(a1, 64); \ ++ ST_64(a0, 64); \ ++ LD_64(a1, 128); \ ++ ST_64(a0, 128); \ ++ LD_64(a1, 192); \ ++ ST_64(a0, 192); \ ++ LD_64(a1, 256); \ ++ ST_64(a0, 256); \ ++ LD_64(a1, 320); \ ++ ST_64(a0, 320); \ ++ LD_64(a1, 384); \ ++ ST_64(a0, 384); \ ++ LD_64(a1, 448); \ ++ ST_64(a0, 448); \ ++ LD_64(a1, 512); \ ++ ST_64(a0, 512); \ ++ LD_64(a1, 576); \ ++ ST_64(a0, 576); \ ++ LD_64(a1, 640); \ ++ ST_64(a0, 640); \ ++ LD_64(a1, 704); \ ++ ST_64(a0, 704); \ ++ LD_64(a1, 768); \ ++ ST_64(a0, 768); \ ++ LD_64(a1, 832); \ ++ ST_64(a0, 832); \ ++ LD_64(a1, 896); \ ++ ST_64(a0, 896); \ ++ LD_64(a1, 960); \ ++ ST_64(a0, 960); ++ ++#define LDST_1024_BACK \ ++ LD_64(a4, -64); \ ++ ST_64(a3, -64); \ ++ LD_64(a4, -128); \ ++ ST_64(a3, -128); \ ++ LD_64(a4, -192); \ ++ ST_64(a3, -192); \ ++ LD_64(a4, -256); \ ++ ST_64(a3, -256); \ ++ LD_64(a4, -320); \ ++ ST_64(a3, -320); \ ++ LD_64(a4, -384); \ ++ ST_64(a3, -384); \ ++ LD_64(a4, -448); \ ++ ST_64(a3, -448); \ ++ LD_64(a4, -512); \ ++ ST_64(a3, -512); \ ++ LD_64(a4, -576); \ ++ ST_64(a3, -576); \ ++ LD_64(a4, -640); \ ++ ST_64(a3, -640); \ ++ LD_64(a4, -704); \ ++ ST_64(a3, -704); \ ++ LD_64(a4, -768); \ ++ ST_64(a3, -768); \ ++ LD_64(a4, -832); \ ++ ST_64(a3, -832); \ ++ LD_64(a4, -896); \ ++ ST_64(a3, -896); \ ++ LD_64(a4, -960); \ ++ ST_64(a3, -960); \ ++ LD_64(a4, -1024); \ ++ ST_64(a3, -1024); ++ ++#ifdef ANDROID_CHANGES ++LEAF(MEMMOVE_NAME, 0) ++#else ++LEAF(MEMMOVE_NAME) ++#endif ++ ++//1st var: dest ptr: void *str1 $r4 a0 ++//2nd var: src ptr: void *str2 $r5 a1 ++//3rd var: size_t num ++//t0~t9 registers as temp ++ ++ add.d a4, a1, a2 ++ add.d a3, a0, a2 ++ beq a1, a0, less_1bytes ++ move t8, a0 ++ srai.d a6, a2, 4 #num/16 ++ beqz a6, less_16bytes #num<16 ++ srai.d a6, a2, 6 #num/64 ++ bnez a6, more_64bytes #num>64 ++ srai.d a6, a2, 5 ++ beqz a6, less_32bytes 
#num<32 ++ ++ ld.d t0, a1, 0 #32 ++#include ++#include ++#else ++#include ++#include ++#endif ++ ++#ifdef LOONGSON_TEST ++#define MEMSET _memset ++#else ++#define MEMSET memset ++#endif ++ ++#define ST_128(n) \ ++ st.d a1, a0, n; \ ++ st.d a1, a0, n+8 ; \ ++ st.d a1, a0, n+16 ; \ ++ st.d a1, a0, n+24 ; \ ++ st.d a1, a0, n+32 ; \ ++ st.d a1, a0, n+40 ; \ ++ st.d a1, a0, n+48 ; \ ++ st.d a1, a0, n+56 ; \ ++ st.d a1, a0, n+64 ; \ ++ st.d a1, a0, n+72 ; \ ++ st.d a1, a0, n+80 ; \ ++ st.d a1, a0, n+88 ; \ ++ st.d a1, a0, n+96 ; \ ++ st.d a1, a0, n+104; \ ++ st.d a1, a0, n+112; \ ++ st.d a1, a0, n+120; \ ++ ++//1st var: void *str $4 a0 ++//2nd var: int val $5 a1 ++//3rd var: size_t num $6 a2 ++ ++LEAF(MEMSET) ++ ++memset: ++ .align 6 ++ ++ bstrins.d a1, a1, 15, 8 ++ add.d t7, a0, a2 ++ bstrins.d a1, a1, 31, 16 ++ move t0, a0 ++ bstrins.d a1, a1, 63, 32 ++ srai.d t8, a2, 4 #num/16 ++ beqz t8, less_16bytes #num<16 ++ srai.d t8, a2, 6 #num/64 ++ bnez t8, more_64bytes #num>64 ++ srai.d t8, a2, 5 #num/32 ++ beqz t8, less_32bytes #num<32 ++ st.d a1, a0, 0 #32 ++#include ++ ++ ++ ++ ++ ++#define L_ADDIU addi.d ++#define L_ADDU add.d ++#define L_SUBU sub.d ++ ++#define STRCHR strchr ++#define MOVN(rd,rs,rt) \ ++ maskeqz t6, rs, rt;\ ++ masknez rd, rd, rt;\ ++ or rd, rd, t6 ++ ++#define MOVN2(rd,rt) \ ++ masknez rd, rd, rt;\ ++ or rd, rd, rt ++ ++ ++/* char * strchr (const char *s1, int c); */ ++ ++LEAF(STRCHR) ++ .align 6 ++ ++ li.w t4, 0x7 ++ lu12i.w a2, 0x01010 ++ bstrins.d a1, a1, 15, 8 ++ andi t0, a0, 0x7 ++ ++ ori a2, a2, 0x101 ++ andn t4, a0, t4 ++ slli.w t1, t0, 3 ++ ++ ld.d t4, t4, 0 ++ ++ ++ nor t8, zero, zero ++ bstrins.d a1, a1, 31, 16 ++ srl.d t4, t4, t1 ++ ++ bstrins.d a1, a1, 63, 32 ++ bstrins.d a2, a2, 63, 32 ++ srl.d a7, t8, t1 ++ ++ li.w t1, 8 ++ nor t8, a7, zero ++ slli.d a3, a2, 7 ++ or t5, t8, t4 ++ and t3, a7, a1 ++ ++ sub.w t1, t1, t0 ++ nor a3, a3, zero ++ xor t2, t5, t3 ++ sub.d a7, t5, a2 ++ nor a6, t5, a3 ++ ++ sub.d a5, t2, a2 ++ nor a4, t2, a3 ++ ++ and a6, a7, a6 ++ and a5, a5, a4 ++ or a7, a6, a5 ++ bnez a7, L(_mc8_a) ++ ++ L_ADDU a0, a0, t1 ++L(_aloop): ++ ld.d t4, a0, 0 ++ ++ xor t2, t4, a1 ++ sub.d a7, t4, a2 ++ nor a6, t4, a3 ++ sub.d a5, t2, a2 ++ ++ nor a4, t2, a3 ++ and a6, a7, a6 ++ and a5, a5, a4 ++ or a7, a6, a5 ++ bnez a7, L(_mc8_a) ++ ++ ld.d t4, a0, 8 ++ L_ADDIU a0, a0, 16 ++ xor t2, t4, a1 ++ sub.d a7, t4, a2 ++ nor a6, t4, a3 ++ sub.d a5, t2, a2 ++ ++ nor a4, t2, a3 ++ and a6, a7, a6 ++ and a5, a5, a4 ++ or a7, a6, a5 ++ beqz a7, L(_aloop) ++ ++ L_ADDIU a0, a0, -8 ++L(_mc8_a): ++ ++ ctz.d t0, a5 ++ ctz.d t2, a6 ++ ++ srli.w t0, t0, 3 ++ srli.w t2, t2, 3 ++ sltu t1, t2, t0 ++ L_ADDU v0, a0, t0 ++ masknez v0, v0, t1 ++ jr ra ++END(STRCHR) ++ ++#ifndef ANDROID_CHANGES ++#ifdef _LIBC ++libc_hidden_builtin_def (strchr) ++weak_alias (strchr, index) ++#endif ++#endif +diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S +new file mode 100644 +index 00000000..a57a5065 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/strchrnul.S +@@ -0,0 +1,156 @@ ++/* Copyright 2016 Loongson Technology Corporation Limited */ ++ ++/* Author: Songyuekun songyuekun@loongson.cn */ ++ ++/* ++ * ISA: MIPS64R2 ++ * ABI: N64 ++ */ ++ ++/* basic algorithm : ++ ++ +. use ld.d and mask for the first 8 bytes or less; ++ ++ +. build a1 with 8c with dins; ++ ++ +. use xor from a1 and v0 to check if is found; ++ ++ +. 
if (v0 - 0x0101010101010101) & (~(v0 | 0x7f7f7f7f7f7f7f7f)!= 0, v0 has ++ one byte is \0, else has no \0 ++ ++*/ ++ ++ ++ ++ ++#include ++#include ++ ++ ++ ++ ++ ++#define L_ADDIU addi.d ++#define L_ADDU add.d ++#define L_SUBU sub.d ++ ++#define STRCHRNUL __strchrnul ++ ++#define MOVN(rd,rs,rt) \ ++ maskeqz t6, rs, rt;\ ++ masknez rd, rd, rt;\ ++ or rd, rd, t6 ++ ++#define MOVZ(rd,rs,rt) \ ++ masknez t6, rs, rt;\ ++ maskeqz rd, rd, rt;\ ++ or rd, rd, t6 ++ ++ ++#define MOVN2(rd,rt) \ ++ masknez rd, rd, rt;\ ++ or rd, rd, rt ++ ++ ++/* char * strchrnul (const char *s1, int c); */ ++ ++LEAF(STRCHRNUL) ++ .align 6 ++ ++ li.w t4, 0x7 ++ lu12i.w a2, 0x01010 ++ bstrins.d a1, a1, 15, 8 ++ andi t0, a0, 0x7 ++ ++ ori a2, a2, 0x101 ++ andn t4, a0, t4 ++ slli.w t1, t0, 3 ++/* ++ ldr t4, 0(a0) ++*/ ++ ld.d t4, t4, 0 ++ ++ ++ nor t8, zero, zero ++ bstrins.d a1, a1, 31, 16 ++ srl.d t4, t4, t1 ++ ++ preld 0, a0, 32 ++ bstrins.d a1, a1, 63, 32 ++ bstrins.d a2, a2, 63, 32 ++ srl.d a7, t8, t1 ++ ++ nor t8, a7, zero ++ slli.d a3, a2, 7 ++ or t5, t8, t4 ++ and t3, a7, a1 ++ ++ nor a3, a3, zero ++ xor t2, t5, t3 ++ sub.d a7, t5, a2 ++ nor a6, t5, a3 ++ ++ li.w t1, 8 ++ sub.d a5, t2, a2 ++ nor a4, t2, a3 ++ ++ and a6, a7, a6 ++ and a5, a5, a4 ++ or a7, a6, a5 ++ bnez a7, L(_mc8_a) ++ ++ ++ sub.w t1, t1, t0 ++ L_ADDU a0, a0, t1 ++L(_aloop): ++ ld.d t4, a0, 0 ++ ++ xor t2, t4, a1 ++ sub.d a7, t4, a2 ++ nor a6, t4, a3 ++ sub.d a5, t2, a2 ++ ++ nor a4, t2, a3 ++ and a6, a7, a6 ++ and a5, a5, a4 ++ ++ or a7, a6, a5 ++ bnez a7, L(_mc8_a) ++ ++ ld.d t4, a0, 8 ++ L_ADDIU a0, a0, 16 ++ ++ xor t2, t4, a1 ++ sub.d a7, t4, a2 ++ nor a6, t4, a3 ++ sub.d a5, t2, a2 ++ ++ nor a4, t2, a3 ++ and a6, a7, a6 ++ and a5, a5, a4 ++ ++ or a7, a6, a5 ++ beqz a7, L(_aloop) ++ ++ L_ADDIU a0, a0, -8 ++L(_mc8_a): ++ ++ ctz.d t0, a5 ++ ctz.d t2, a6 ++ ++ srli.w t0, t0, 3 ++ srli.w t2, t2, 3 ++ slt t1, t0, t2 ++ ++ MOVZ(t0,t2,t1) ++ ++ L_ADDU v0, a0, t0 ++ jr ra ++END(STRCHRNUL) ++ ++#ifndef ANDROID_CHANGES ++#ifdef _LIBC ++weak_alias(__strchrnul, strchrnul) ++libc_hidden_builtin_def (__strchrnul) ++#endif ++#endif +diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S +new file mode 100644 +index 00000000..11474bf2 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/strcmp.S +@@ -0,0 +1,197 @@ ++/* Copyright 2016 Loongson Technology Corporation Limited */ ++ ++/* Author: songyuekun songyuekun@loongson.cn */ ++ ++/* ++ * ISA: MIPS64R2 ++ * ABI: N64 ++ */ ++ ++/* basic algorithm : ++ ++ +. let t0, t1 point to a0, a1, if a0 has smaller low 3 bit of a0 and a1, ++ set a4 to 1 and let t0 point to the larger of lower 3bit of a0 and a1 ++ ++ +. if low 3 bit of a0 equal low 3 bit of a0, use a ldr one time and more ld other times; ++ ++ +. if not, load partial t2 and t3, check if t2 has \0; ++ ++ +. then use use ld for t0, ldr for t1, ++ ++ +. if partial 8 byte from t1 has \0, compare partial 8 byte from t1 with 8 ++ byte from t0 with a mask in a7 ++ ++ +. if not, ldl other part of t1, compare 8 byte from t1 with 8 byte from t0 ++ ++ +. if (v0 - 0x0101010101010101) & (~v0) & 0x8080808080808080 != 0, v0 has ++ one byte is \0, else has no \0 ++ ++ +. 
for partial 8 byte from ldr t3, 0(a0), preload t3 with 0xffffffffffffffff ++ ++ ++*/ ++#include ++#include ++ ++ ++#define STRCMP strcmp ++ ++#define REP8_01 0x0101010101010101 ++#define REP8_7f 0x7f7f7f7f7f7f7f7f ++#define REP8_80 0x8080808080808080 ++ ++/* Parameters and Results */ ++#define src1 a0 ++#define src2 a1 ++#define result v0 ++// Note: v0 = a0 in N64 ABI ++ ++ ++/* Internal variable */ ++#define data1 t0 ++#define data2 t1 ++#define has_nul t2 ++#define diff t3 ++#define syndrome t4 ++#define zeroones t5 ++#define sevenf t6 ++#define pos t7 ++#define exchange t8 ++#define tmp1 a4 ++#define tmp2 a5 ++#define tmp3 a6 ++#define src1_off a2 ++#define src2_off a3 ++#define tmp4 a7 ++ ++/* rd <- if rc then ra else rb ++ will destroy tmp3 ++*/ ++#define CONDITIONSEL(rd,rc,ra,rb)\ ++ masknez tmp3, rb, rc;\ ++ maskeqz rd, ra, rc;\ ++ or rd, rd, tmp3 ++ ++ ++ ++/* int strcmp (const char *s1, const char *s2); */ ++ ++LEAF(STRCMP) ++ .align 4 ++ ++ xor tmp1, src1, src2 ++ lu12i.w zeroones, 0x01010 ++ lu12i.w sevenf, 0x7f7f7 ++ andi src1_off, src1, 0x7 ++ ori zeroones, zeroones, 0x101 ++ ori sevenf, sevenf, 0xf7f ++ andi tmp1, tmp1, 0x7 ++ bstrins.d zeroones, zeroones, 63, 32 ++ bstrins.d sevenf, sevenf, 63, 32 ++ bnez tmp1, strcmp_misaligned8 ++ bnez src1_off, strcmp_mutual_align ++strcmp_loop_aligned: ++ ld.d data1, src1, 0 ++ addi.d src1, src1, 8 ++ ld.d data2, src2, 0 ++ addi.d src2, src2, 8 ++strcmp_start_realigned: ++ sub.d tmp1, data1, zeroones ++ or tmp2, data1, sevenf ++ xor diff, data1, data2 ++ andn has_nul, tmp1, tmp2 ++ or syndrome, diff, has_nul ++ beqz syndrome, strcmp_loop_aligned ++ ++strcmp_end: ++ ctz.d pos, syndrome ++ bstrins.d pos, zero, 2, 0 ++ srl.d data1, data1, pos ++ srl.d data2, data2, pos ++ andi data1, data1, 0xff ++ andi data2, data2, 0xff ++ sub.d result, data1, data2 ++ jr ra ++strcmp_mutual_align: ++ bstrins.d src1, zero, 2, 0 ++ bstrins.d src2, zero, 2, 0 ++ slli.d tmp1, src1_off, 0x3 ++ ld.d data1, src1, 0 ++ sub.d tmp1, zero, tmp1 ++ ld.d data2, src2, 0 ++ addi.d src1, src1, 8 ++ addi.d src2, src2, 8 ++ nor tmp2, zero, zero ++ srl.d tmp2, tmp2, tmp1 ++ or data1, data1, tmp2 ++ or data2, data2, tmp2 ++ b strcmp_start_realigned ++ ++strcmp_misaligned8: ++ ++/* check ++ if ((src1 != 0) && ((src2 == 0 ) || (src1 < src2))) ++ then exchange(src1,src2) ++ ++*/ ++ andi src2_off, src2, 0x7 ++ slt tmp2, src1_off, src2_off ++ CONDITIONSEL(tmp2,src2_off,tmp2,tmp1) ++ maskeqz exchange, tmp2, src1_off ++ xor tmp3, src1, src2 ++ maskeqz tmp3, tmp3, exchange ++ xor src1, src1, tmp3 ++ xor src2, src2, tmp3 ++ ++ andi src1_off, src1, 0x7 ++ beqz src1_off, strcmp_loop_misaligned ++strcmp_do_misaligned: ++ ld.bu data1, src1, 0 ++ ld.bu data2, src2, 0 ++ xor tmp3, data1, data2 ++ addi.d src1, src1, 1 ++ masknez tmp3, data1, tmp3 ++ addi.d src2, src2, 1 ++ beqz tmp3, strcmp_done ++ andi src1_off, src1, 0x7 ++ bnez src1_off, strcmp_do_misaligned ++ ++strcmp_loop_misaligned: ++ andi tmp1, src2, 0xff8 ++ xori tmp1, tmp1, 0xff8 ++ beqz tmp1, strcmp_do_misaligned ++ ld.d data1, src1, 0 ++ ld.d data2, src2, 0 ++ addi.d src1, src1, 8 ++ addi.d src2, src2, 8 ++ ++ sub.d tmp1, data1, zeroones ++ or tmp2, data1, sevenf ++ xor diff, data1, data2 ++ andn has_nul, tmp1, tmp2 ++ or syndrome, diff, has_nul ++ beqz syndrome, strcmp_loop_misaligned ++// b strcmp_end ++strcmp_misalign_end: ++ ctz.d pos, syndrome ++ bstrins.d pos, zero, 2, 0 ++ srl.d data1, data1, pos ++ srl.d data2, data2, pos ++ andi data1, data1, 0xff ++ andi data2, data2, 0xff ++ sub.d tmp1, data1, data2 ++ sub.d tmp2, 
data2, data1 ++ CONDITIONSEL(result,exchange,tmp2,tmp1) ++ jr ra ++ ++strcmp_done: ++ sub.d tmp1, data1, data2 ++ sub.d tmp2, data2, data1 ++ CONDITIONSEL(result,exchange,tmp2,tmp1) ++ jr ra ++END(STRCMP) ++#ifndef ANDROID_CHANGES ++#ifdef _LIBC ++libc_hidden_builtin_def (strcmp) ++#endif ++#endif +diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S +new file mode 100644 +index 00000000..ce39e5a1 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/strcpy.S +@@ -0,0 +1,210 @@ ++/* Copyright 2016 Loongson Technology Corporation Limited */ ++ ++/* Author: Huang Pei huangpei@loongson.cn */ ++ ++/* ++ * ISA: MIPS64R2 ++ * ABI: N64 ++ */ ++ ++/* basic algorithm : ++ ++ +. if src aligned. just do the copy loop. if not, do the cross page check and copy one double word. ++ ++ Then move src to aligned. ++ ++ +. if (v0 - 0x0101010101010101) & (~v0) & 0x8080808080808080 != 0, v0 has ++ one byte is \0, else has no \0 ++ ++ ++*/ ++ ++ ++#include ++#include ++ ++ ++#define STRCPY strcpy ++ ++ ++#define REP8_01 0x0101010101010101 ++#define REP8_7f 0x7f7f7f7f7f7f7f7f ++#define REP8_80 0x8080808080808080 ++ ++/* Parameters and Results */ ++#define dest a0 ++#define src a1 ++#define result v0 ++// Note: v0 = a0 in N64 ABI ++ ++ ++/* Internal variable */ ++#define data t0 ++#define data1 t1 ++#define has_nul t2 ++#define diff t3 ++#define syndrome t4 ++#define zeroones t5 ++#define sevenf t6 ++#define pos t7 ++#define dest_backup t8 ++#define tmp1 a4 ++#define tmp2 a5 ++#define tmp3 a6 ++#define dest_off a2 ++#define src_off a3 ++#define tmp4 a7 ++ ++/* rd <- if rc then ra else rb ++ will destroy tmp3 ++*/ ++#define CONDITIONSEL(rd,rc,ra,rb)\ ++ masknez tmp3, rb, rc;\ ++ maskeqz rd, ra, rc;\ ++ or rd, rd, tmp3 ++ ++ ++ ++/* int strcpy (const char *s1, const char *s2); */ ++ ++LEAF(STRCPY) ++ .align 4 ++ ++ move dest_backup, dest ++ lu12i.w zeroones, 0x01010 ++ lu12i.w sevenf, 0x7f7f7 ++ ori zeroones, zeroones, 0x101 ++ ori sevenf, sevenf, 0xf7f ++ bstrins.d zeroones, zeroones, 63, 32 ++ bstrins.d sevenf, sevenf, 63, 32 ++ andi src_off, src, 0x7 ++ beqz src_off, strcpy_loop_aligned_1 ++ b strcpy_mutual_align ++strcpy_loop_aligned: ++ st.d data, dest, 0 ++ addi.d dest, dest, 8 ++strcpy_loop_aligned_1: ++ ld.d data, src, 0 ++ addi.d src, src, 8 ++strcpy_start_realigned: ++ sub.d tmp1, data, zeroones ++ or tmp2, data, sevenf ++ andn has_nul, tmp1, tmp2 ++ beqz has_nul, strcpy_loop_aligned ++ ++strcpy_end: ++ ++/* ++8 4 2 1 ++*/ ++ ctz.d pos, has_nul ++ srli.d pos, pos, 3 ++ addi.d pos, pos, 1 ++/* ++ Do 8/4/2/1 strcpy based on pos value. ++ pos value is the number of bytes to be copied ++ the bytes include the final \0 so the max length is 8 and the min length is 1 ++*/ ++ ++strcpy_end_8: ++ andi tmp1, pos, 0x8 ++ beqz tmp1, strcpy_end_4 ++ st.d data, dest, 0 ++ move dest, dest_backup ++ jr ra ++strcpy_end_4: ++ andi tmp1, pos, 0x4 ++ beqz tmp1, strcpy_end_2 ++ st.w data, dest, 0 ++ srli.d data, data, 32 ++ addi.d dest, dest, 4 ++strcpy_end_2: ++ andi tmp1, pos, 0x2 ++ beqz tmp1, strcpy_end_1 ++ st.h data, dest, 0 ++ srli.d data, data, 16 ++ addi.d dest, dest, 2 ++strcpy_end_1: ++ andi tmp1, pos, 0x1 ++ beqz tmp1, strcpy_end_ret ++ st.b data, dest, 0 ++strcpy_end_ret: ++ move result, dest_backup ++ jr ra ++ ++ ++strcpy_mutual_align: ++/* ++ Check if around src page bound. ++ if not go to page cross ok. ++ if it is, do further check. ++ use tmp2 to accelerate. 
++*/ ++ ++ li.w tmp2, 0xff8 ++ andi tmp1, src, 0xff8 ++ beq tmp1, tmp2, strcpy_page_cross ++ ++strcpy_page_cross_ok: ++/* ++ Load a misaligned double word and check if has \0 ++ If no, do a misaligned double word paste. ++ If yes, calculate the number of avaliable bytes, ++ then jump to 4/2/1 end. ++*/ ++ ld.d data, src, 0 ++ sub.d tmp1, data, zeroones ++ or tmp2, data, sevenf ++ andn has_nul, tmp1, tmp2 ++ bnez has_nul, strcpy_end ++strcpy_mutual_align_finish: ++/* ++ Before jump back to align loop, make dest/src aligned. ++ This will cause a duplicated paste for several bytes between the first double word and the second double word, ++ but should not bring a problem. ++*/ ++ li.w tmp1, 8 ++ st.d data, dest, 0 ++ sub.d tmp1, tmp1, src_off ++ add.d src, src, tmp1 ++ add.d dest, dest, tmp1 ++ ++ b strcpy_loop_aligned_1 ++ ++strcpy_page_cross: ++/* ++ ld.d from aligned address(src & ~0x7). ++ check if high bytes have \0. ++ it not, go back to page cross ok, ++ since the string is supposed to cross the page bound in such situation. ++ if it is, do a srl for data to make it seems like a direct double word from src, ++ then go to 4/2/1 strcpy end. ++ ++ tmp4 is 0xffff...ffff mask ++ tmp2 demonstrate the bytes to be masked ++ tmp2 = src_off << 3 ++ data = data >> (src_off * 8) | -1 << (64 - src_off * 8) ++ and ++ -1 << (64 - src_off * 8) -> ~(-1 >> (src_off * 8)) ++ ++*/ ++ li.w tmp1, 0x7 ++ andn tmp3, src, tmp1 ++ ld.d data, tmp3, 0 ++ li.w tmp4, -1 ++ slli.d tmp2, src_off, 3 ++ srl.d tmp4, tmp4, tmp2 ++ srl.d data, data, tmp2 ++ nor tmp4, tmp4, zero ++ or data, data, tmp4 ++ sub.d tmp1, data, zeroones ++ or tmp2, data, sevenf ++ andn has_nul, tmp1, tmp2 ++ beqz has_nul, strcpy_page_cross_ok ++ b strcpy_end ++END(STRCPY) ++#ifndef ANDROID_CHANGES ++#ifdef _LIBC ++libc_hidden_builtin_def (strcpy) ++#endif ++#endif +diff --git a/sysdeps/loongarch/lp64/strlen.S b/sysdeps/loongarch/lp64/strlen.S +new file mode 100644 +index 00000000..a34d8b69 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/strlen.S +@@ -0,0 +1,135 @@ ++/* Copyright 2016 Loongson Technology Corporation Limited */ ++ ++/* Author: Songyuekun songyuekun@loongson.cn */ ++ ++/* ++ * ISA: MIPS64R2 ++ * ABI: N64 ++ */ ++/* ++algorithm: ++ ++ #. use ld/ldr to access word/partial word in the string ++ ++ #. use (x - 0x0101010101010101) & (~(x | 0x7f7f7f7f7f7f7f7f) != 0 to ++ judge if x has zero byte ++ ++ #. use dctz((x - 0x0101010101010101) & (~(x | 0x7f7f7f7f7f7f7f7f) >> 3 ++ to get the index of first rightmost zero byte in dword x; ++ ++ #. use dctz(x) = 64 - dclz(~x & (x-1)); ++ ++ #. 
use pointer to the last non zero byte minus pointer to the start ++ of the string to get the length of string ++ ++*/ ++ ++ ++#include ++#include ++ ++ ++ ++#define L_ADDIU addi.d ++#define L_ADDU add.d ++#define L_SUBU sub.d ++ ++#define STRLEN strlen ++#define L(x) x ++ ++ ++/* size_t strlen (const char *s1); */ ++ ++ .text; ++ .globl strlen; ++ .align 5; ++ cfi_startproc ; ++ .type strlen, @function; ++strlen: ++ ++ //LEAF(strlen) ++ #preld 0, a0, 0 ++ ++ nor t4, zero, zero ++ lu12i.w a2, 0x01010 ++ andi t5, a0, 0x7 ++ ++ li.w t7, 0x7 ++ slli.d t6, t5, 0x3 ++ andn t7, a0, t7 ++ ld.d a1, t7, 0 ++ sub.d t7, zero, t6 ++ sll.d t4, t4, t7 ++ maskeqz t4, t4, t6 ++ srl.d a1, a1, t6 ++ or a1, a1, t4 ++ ++ ++ ori a2, a2, 0x101 ++ nor t1, a1, zero ++ li.w a4, 8 ++ ++ #preld 0, a0, 32 ++ bstrins.d a2, a2, 63, 32 ++ sub.d a5, a4, t5 ++ move t5, a0 ++ ++ sub.d t0, a1, a2 ++ slli.d t4, a2, 7 ++ nor a3, zero, t4 ++ nor t1, a1, a3 ++ ++ and t0, t0, t1 ++ #preld 0, a0, 64 ++ bnez t0, strlen_count1 /* instead of use bnel with daddu a0, a0, a5 in branch slot */ ++ L_ADDU a0, a0, a5 ++strlen_loop: ++ ld.d a1, a0, 0 ++ sub.d t0, a1, a2 ++ and t1, t0, t4 ++ bnez t1, strlen_count_pre ++ ld.d a1, a0, 8 ++ sub.d t0, a1, a2 ++ and t1, t0, t4 ++ L_ADDIU a0, a0, 16 ++ beqz t1, strlen_loop ++strlen_count: ++ addi.d a0, a0, -8 ++strlen_count_pre: ++ nor t1, a1, a3 ++ and t0, t0, t1 ++ beqz t0, strlen_noascii_start ++strlen_count1: ++ ctz.d t1, t0 ++ L_SUBU v0, a0, t5 ++ srli.w t1, t1, 3 ++ L_ADDU v0, v0, t1 ++ jr ra ++strlen_noascii_start: ++ addi.d a0, a0, 8 ++strlen_loop_noascii: ++ ld.d a1, a0, 0 ++ sub.d t0, a1, a2 ++ nor t1, a1, a3 ++ and t0, t0, t1 ++ bnez t0, strlen_count1 ++ ld.d a1, a0, 8 ++ sub.d t0, a1, a2 ++ nor t1, a1, a3 ++ and t0, t0, t1 ++ L_ADDIU a0, a0, 16 ++ beqz t0, strlen_loop_noascii ++ addi.d a0, a0, -8 ++ ctz.d t1, t0 ++ L_SUBU v0, a0, t5 ++ srli.w t1, t1, 3 ++ L_ADDU v0, v0, t1 ++ jr ra ++END(STRLEN) ++ ++#ifndef ANDROID_CHANGES ++#ifdef _LIBC ++libc_hidden_builtin_def (strlen) ++#endif ++#endif ++ +diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S +new file mode 100644 +index 00000000..29cc7b02 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/strncmp.S +@@ -0,0 +1,269 @@ ++/* Copyright 2016 Loongson Technology Corporation Limited */ ++ ++/* Author: songyuekun songyuekun@loongson.cn */ ++ ++/* ++ * ISA: MIPS64R2 ++ * ABI: N64 ++ */ ++ ++/* basic algorithm : ++ ++ +. let t0, t1 point to a0, a1, if a0 has smaller low 3 bit of a0 and a1, ++ set a4 to 1 and let t0 point to the larger of lower 3bit of a0 and a1 ++ ++ +. if low 3 bit of a0 equal low 3 bit of a0, use a ldr one time and more ld other times; ++ ++ +. if not, load partial t2 and t3, check if t2 has \0; ++ ++ +. then use use ld for t0, ldr for t1, ++ ++ +. if partial 8 byte from t1 has \0, compare partial 8 byte from t1 with 8 ++ byte from t0 with a mask in a7 ++ ++ +. if not, ldl other part of t1, compare 8 byte from t1 with 8 byte from t0 ++ ++ +. if (v0 - 0x0101010101010101) & (~v0) & 0x8080808080808080 != 0, v0 has ++ one byte is \0, else has no \0 ++ ++ +. 
for partial 8 byte from ldr t3, 0(a0), preload t3 with 0xffffffffffffffff ++ ++ ++*/ ++#include ++#include ++ ++ ++#define STRNCMP strncmp ++ ++#define REP8_01 0x0101010101010101 ++#define REP8_7f 0x7f7f7f7f7f7f7f7f ++#define REP8_80 0x8080808080808080 ++ ++/* Parameters and Results */ ++#define src1 a0 ++#define src2 a1 ++#define limit a2 ++#define result v0 ++// Note: v0 = a0 in N64 ABI ++ ++ ++/* Internal variable */ ++#define data1 t0 ++#define data2 t1 ++#define has_nul t2 ++#define diff t3 ++#define syndrome t4 ++#define zeroones t5 ++#define sevenf t6 ++#define pos t7 ++#define exchange t8 ++#define tmp1 a5 ++#define tmp2 a6 ++#define tmp3 a7 ++#define src1_off a3 ++#define limit_wd a4 ++ ++ ++/* int strncmp (const char *s1, const char *s2); */ ++ ++LEAF(STRNCMP) ++ .align 4 ++ beqz limit, strncmp_ret0 ++ ++ xor tmp1, src1, src2 ++ lu12i.w zeroones, 0x01010 ++ lu12i.w sevenf, 0x7f7f7 ++ andi src1_off, src1, 0x7 ++ ori zeroones, zeroones, 0x101 ++ andi tmp1, tmp1, 0x7 ++ ori sevenf, sevenf, 0xf7f ++ bstrins.d zeroones, zeroones, 63, 32 ++ bstrins.d sevenf, sevenf, 63, 32 ++ bnez tmp1, strncmp_misaligned8 ++ bnez src1_off, strncmp_mutual_align ++ /* */ ++ addi.d limit_wd, limit, -1 ++ srli.d limit_wd, limit_wd, 3 ++ ++strncmp_loop_aligned: ++ ld.d data1, src1, 0 ++ addi.d src1, src1, 8 ++ ld.d data2, src2, 0 ++ addi.d src2, src2, 8 ++strncmp_start_realigned: ++ addi.d limit_wd, limit_wd, -1 ++ sub.d tmp1, data1, zeroones ++ or tmp2, data1, sevenf ++ xor diff, data1, data2 ++ andn has_nul, tmp1, tmp2 ++ srli.d tmp1, limit_wd, 63 ++ or syndrome, diff, has_nul ++ or tmp2, syndrome, tmp1 ++ beqz tmp2, strncmp_loop_aligned ++ ++ /* if not reach limit */ ++ bge limit_wd, zero, strncmp_not_limit ++ /* if reach limit */ ++ andi limit, limit, 0x7 ++ li.w tmp1, 0x8 ++ sub.d limit, tmp1, limit ++ slli.d limit, limit, 0x3 ++ li.d tmp1, -1 ++ srl.d tmp1, tmp1, limit ++ and data1, data1, tmp1 ++ and data2, data2, tmp1 ++ orn syndrome, syndrome, tmp1 ++ ++ ++strncmp_not_limit: ++ ctz.d pos, syndrome ++ bstrins.d pos, zero, 2, 0 ++ srl.d data1, data1, pos ++ srl.d data2, data2, pos ++ andi data1, data1, 0xff ++ andi data2, data2, 0xff ++ sub.d result, data1, data2 ++ jr ra ++ ++ ++ ++strncmp_mutual_align: ++ bstrins.d src1, zero, 2, 0 ++ bstrins.d src2, zero, 2, 0 ++ slli.d tmp1, src1_off, 0x3 ++ ld.d data1, src1, 0 ++ ld.d data2, src2, 0 ++ addi.d src2, src2, 8 ++ addi.d src1, src1, 8 ++ ++ addi.d limit_wd, limit, -1 ++ andi tmp3, limit_wd, 0x7 ++ srli.d limit_wd, limit_wd, 3 ++ add.d limit, limit, src1_off ++ add.d tmp3, tmp3, src1_off ++ srli.d tmp3, tmp3, 3 ++ add.d limit_wd, limit_wd, tmp3 ++ ++ sub.d tmp1, zero, tmp1 ++ nor tmp2, zero, zero ++ srl.d tmp2, tmp2, tmp1 ++ or data1, data1, tmp2 ++ or data2, data2, tmp2 ++ b strncmp_start_realigned ++ ++strncmp_misaligned8: ++ ++ li.w tmp1, 0x10 ++ bge limit, tmp1, strncmp_try_words ++strncmp_byte_loop: ++ ld.bu data1, src1, 0 ++ ld.bu data2, src2, 0 ++ addi.d limit, limit, -1 ++ xor tmp1, data1, data2 ++ masknez tmp1, data1, tmp1 ++ maskeqz tmp1, limit, tmp1 ++ beqz tmp1, strncmp_done ++ ++ ld.bu data1, src1, 1 ++ ld.bu data2, src2, 1 ++ addi.d src1, src1, 2 ++ addi.d src2, src2, 2 ++ addi.d limit, limit, -1 ++ xor tmp1, data1, data2 ++ masknez tmp1, data1, tmp1 ++ maskeqz tmp1, limit, tmp1 ++ bnez tmp1, strncmp_byte_loop ++ ++ ++strncmp_done: ++ sub.d result, data1, data2 ++ jr ra ++ ++strncmp_try_words: ++ srli.d limit_wd, limit, 3 ++ beqz src1_off, strncmp_do_misaligned ++ ++ sub.d src1_off, zero, src1_off ++ andi src1_off, src1_off, 0x7 ++ 
sub.d limit, limit, src1_off ++ srli.d limit_wd, limit, 0x3 ++ ++ ++strncmp_page_end_loop: ++ ld.bu data1, src1, 0 ++ ld.bu data2, src2, 0 ++ addi.d src1, src1, 1 ++ addi.d src2, src2, 1 ++ xor tmp1, data1, data2 ++ masknez tmp1, data1, tmp1 ++ beqz tmp1, strncmp_done ++ andi tmp1, src1, 0x7 ++ bnez tmp1, strncmp_page_end_loop ++strncmp_do_misaligned: ++ li.w src1_off, 0x8 ++ addi.d limit_wd, limit_wd, -1 ++ blt limit_wd, zero, strncmp_done_loop ++ ++strncmp_loop_misaligned: ++ andi tmp2, src2, 0xff8 ++ xori tmp2, tmp2, 0xff8 ++ beqz tmp2, strncmp_page_end_loop ++ ++ ld.d data1, src1, 0 ++ ld.d data2, src2, 0 ++ addi.d src1, src1, 8 ++ addi.d src2, src2, 8 ++ sub.d tmp1, data1, zeroones ++ or tmp2, data1, sevenf ++ xor diff, data1, data2 ++ andn has_nul, tmp1, tmp2 ++ or syndrome, diff, has_nul ++ bnez syndrome, strncmp_not_limit ++ addi.d limit_wd, limit_wd, -1 ++ #blt zero, limit_wd, strncmp_loop_misaligned ++ bge limit_wd, zero, strncmp_loop_misaligned ++ ++strncmp_done_loop: ++ andi limit, limit, 0x7 ++ beqz limit, strncmp_not_limit ++ /* Read the last double word */ ++ /* check if the final part is about to exceed the page */ ++ andi tmp1, src2, 0x7 ++ andi tmp2, src2, 0xff8 ++ add.d tmp1, tmp1, limit ++ xori tmp2, tmp2, 0xff8 ++ andi tmp1, tmp1, 0x8 ++ masknez tmp1, tmp1, tmp2 ++ bnez tmp1, strncmp_byte_loop ++ addi.d src1, src1, -8 ++ addi.d src2, src2, -8 ++ ldx.d data1, src1, limit ++ ldx.d data2, src2, limit ++ sub.d tmp1, data1, zeroones ++ or tmp2, data1, sevenf ++ xor diff, data1, data2 ++ andn has_nul, tmp1, tmp2 ++ or syndrome, diff, has_nul ++ bnez syndrome, strncmp_not_limit ++ ++strncmp_ret0: ++ move result, zero ++ jr ra ++/* check ++ if ((src1 != 0) && ((src2 == 0 ) || (src1 < src2))) ++ then exchange(src1,src2) ++ ++*/ ++ ++ ++ ++ ++ ++ ++END(STRNCMP) ++#ifndef ANDROID_CHANGES ++#ifdef _LIBC ++libc_hidden_builtin_def (strncmp) ++#endif ++#endif +diff --git a/sysdeps/loongarch/lp64/strnlen.S b/sysdeps/loongarch/lp64/strnlen.S +new file mode 100644 +index 00000000..3a204686 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/strnlen.S +@@ -0,0 +1,155 @@ ++/* Copyright 2016 Loongson Technology Corporation Limited */ ++ ++/* Author: Songyuekun songyuekun@loongson.cn */ ++ ++/* ++ * ISA: MIPS64R2 ++ * ABI: N64 ++ */ ++/* ++algorithm: ++ ++ #. use ld/ldr to access word/partial word in the string ++ ++ #. use (x - 0x0101010101010101) & (~(x | 0x7f7f7f7f7f7f7f7f) != 0 to ++ judge if x has zero byte ++ ++ #. use dctz((x - 0x0101010101010101) & (~(x | 0x7f7f7f7f7f7f7f7f) >> 3 ++ to get the index of first rightmost zero byte in dword x; ++ ++ #. use dctz(x) = 64 - dclz(~x & (x-1)); ++ ++ #. 
use pointer to the last non zero byte minus pointer to the start ++ of the string to get the length of string ++ ++*/ ++ ++#include ++#include ++ ++ ++ ++#define L_ADDIU addi.d ++#define L_ADDU add.d ++#define L_SUBU sub.d ++ ++#define STRNLEN __strnlen ++#define L(x) x ++/* rd <- if rc then ra else rb ++ will destroy t6 ++*/ ++ ++#define CONDITIONSEL(rd,ra,rb,rc)\ ++ masknez a5, rb, rc;\ ++ maskeqz rd, ra, rc;\ ++ or rd, rd, a5 ++ ++ ++/* Parameters and Results */ ++#define srcin a0 ++#define limit a1 ++#define len v0 ++ ++ ++/* Internal variable */ ++#define data1 t0 ++#define data2 t1 ++#define has_nul1 t2 ++#define has_nul2 t3 ++#define src t4 ++#define zeroones t5 ++#define sevenf t6 ++#define data2a t7 ++#define tmp6 t7 ++#define pos t8 ++#define tmp1 a2 ++#define tmp2 a3 ++#define tmp3 a4 ++#define tmp4 a5 ++#define tmp5 a6 ++#define limit_wd a7 ++ ++ ++ ++/* size_t strnlen (const char *s1,size_t maxlen); */ ++ ++LEAF(STRNLEN) ++ ++ .align 4 ++ beqz limit, L(_hit_limit) ++ lu12i.w zeroones, 0x01010 ++ lu12i.w sevenf, 0x7f7f7 ++ ori zeroones, zeroones, 0x101 ++ ori sevenf, sevenf, 0xf7f ++ bstrins.d zeroones, zeroones, 63, 32 ++ bstrins.d sevenf, sevenf, 63, 32 ++ andi tmp1, srcin, 15 ++ sub.d src, srcin, tmp1 ++ bnez tmp1, L(misaligned) ++ addi.d limit_wd, limit, -1 ++ srli.d limit_wd, limit_wd, 4 ++L(_loop): ++ ld.d data1, src, 0 ++ ld.d data2, src, 8 ++ addi.d src, src, 16 ++L(_realigned): ++ sub.d tmp1, data1, zeroones ++ or tmp2, data1, sevenf ++ sub.d tmp3, data2, zeroones ++ or tmp4, data2, sevenf ++ andn has_nul1, tmp1, tmp2 ++ andn has_nul2, tmp3, tmp4 ++ addi.d limit_wd, limit_wd, -1 ++ srli.d tmp1, limit_wd, 63 ++ or tmp2, has_nul1, has_nul2 ++ or tmp3, tmp1, tmp2 ++ beqz tmp3, L(_loop) ++ beqz tmp2, L(_hit_limit) ++ sub.d len, src, srcin ++ beqz has_nul1, L(_nul_in_data2) ++ move has_nul2, has_nul1 ++ addi.d len, len, -8 ++L(_nul_in_data2): ++ ctz.d pos, has_nul2 ++ srli.d pos, pos, 3 ++ addi.d len, len, -8 ++ add.d len, len, pos ++ sltu tmp1, len, limit ++ CONDITIONSEL(len,len,limit,tmp1) ++ jr ra ++ ++ ++L(misaligned): ++ addi.d limit_wd, limit, -1 ++ sub.d tmp4, zero, tmp1 ++ andi tmp3, limit_wd, 15 ++ srli.d limit_wd, limit_wd, 4 ++ li.d tmp5, -1 ++ ld.d data1, src, 0 ++ ld.d data2, src, 8 ++ addi.d src, src, 16 ++ slli.d tmp4, tmp4, 3 ++ add.d tmp3, tmp3, tmp1 ++ srl.d tmp2, tmp5, tmp4 ++ srli.d tmp3, tmp3, 4 ++ add.d limit_wd, limit_wd, tmp3 ++ or data1, data1, tmp2 ++ or data2a, data2, tmp2 ++ li.w tmp3, 9 ++ sltu tmp1, tmp1, tmp3 ++ CONDITIONSEL(data1,data1,tmp5,tmp1) ++ CONDITIONSEL(data2,data2,data2a,tmp1) ++ b L(_realigned) ++ ++ ++L(_hit_limit): ++ move len, limit ++ jr ra ++END(STRNLEN) ++#ifndef ANDROID_CHANGES ++#ifdef _LIBC ++weak_alias (__strnlen, strnlen) ++libc_hidden_def (strnlen) ++libc_hidden_def (__strnlen) ++#endif ++#endif +-- +2.33.0 + diff --git a/glibc.spec b/glibc.spec index 959fe67..101c296 100644 --- a/glibc.spec +++ b/glibc.spec @@ -66,7 +66,7 @@ ############################################################################## Name: glibc Version: 2.34 -Release: 105 +Release: 106 Summary: The GNU libc libraries License: %{all_license} URL: http://www.gnu.org/software/glibc/ @@ -272,6 +272,7 @@ Patch9021: x86-use-total-l3cache-for-non_temporal_threshold.patch Patch9022: login-Add-back-libutil-as-an-empty-library.patch Patch9023: malloc-Fix-malloc-debug-for-2.35-onwards.patch Patch9024: LoongArch-Port.patch +Patch9025: LoongArch-Optimize-string-functions-including-memcpy.patch %endif Provides: ldconfig rtld(GNU_HASH) bundled(gnulib) @@ -1438,6 
+1439,9 @@ fi
 %endif
 
 %changelog
+* Sun Jan 14 2023 Xue Liu - 2.34-106
+- LoongArch: Optimize some string functions
+
 * Wed Dec 21 2022 wanghongliang - 2.34-105
 - LoongArch Port
 - Add login-Add-back-libutil-as-an-empty-library.patch from upstream
-- 
Gitee
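
A few notes follow on the techniques the patch relies on, each with a small C sketch. The sketches are illustrations only -- the patch itself is LoongArch assembly -- and every C function name in them is invented here, not taken from glibc.

First, the 17..32-byte path of memcpy. Instead of branching on the exact length, it copies 16 bytes anchored at the start of the buffer and 16 bytes anchored at its end; for lengths under 32 the two pairs overlap in the middle, which is harmless because memcpy's operands may not overlap each other, and because all four loads complete before any store is issued. Assuming 17 <= n <= 32:

#include <stdint.h>
#include <string.h>

void
copy_17_to_32 (void *dst, const void *src, size_t n)
{
  const char *s = src;
  char *d = dst;
  uint64_t t0, t1, t2, t3;

  memcpy (&t0, s, 8);           /* ld.d t0, a1, 0   */
  memcpy (&t1, s + 8, 8);       /* ld.d t1, a1, 8   */
  memcpy (&t2, s + n - 16, 8);  /* ld.d t2, a4, -16 */
  memcpy (&t3, s + n - 8, 8);   /* ld.d t3, a4, -8  */

  memcpy (d, &t0, 8);           /* st.d t0, a0, 0   */
  memcpy (d + 8, &t1, 8);       /* st.d t1, a0, 8   */
  memcpy (d + n - 16, &t2, 8);  /* st.d t2, a3, -16 */
  memcpy (d + n - 8, &t3, 8);   /* st.d t3, a3, -8  */
}

The same end-anchored trick handles the 8..16, 4..7 and 2..3 byte cases with one load/store pair from each end, so no length inside a size class ever needs its own branch.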
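
Next, the pcaddi/jirl sequence after memcpy's 128-byte loop. It is a computed jump into a ladder of fifteen two-instruction 8-byte copy stubs: bits 3..6 of the remaining count (mask 0x78) are subtracted from the address past the ladder's entry points, execution enters at the matching depth and falls through to the bottom, and one final end-anchored copy finishes the sub-8-byte remainder. A C switch with fall-through models the control flow. One caveat: the assembly anchors the last copy against the end of the whole buffer (a4/a3), while this sketch anchors it at d + remaining, so for remaining < 8 it rewrites a few already-copied bytes -- harmless here because this path only runs when the total size exceeds 128:

#include <string.h>

void
copy_tail (char *d, const char *s, size_t remaining)  /* 0 < remaining < 128 */
{
  switch (remaining & 0x78)
    {
    case 0x78: memcpy (d + 112, s + 112, 8); /* fall through */
    case 0x70: memcpy (d + 104, s + 104, 8); /* fall through */
    case 0x68: memcpy (d + 96,  s + 96,  8); /* fall through */
    case 0x60: memcpy (d + 88,  s + 88,  8); /* fall through */
    case 0x58: memcpy (d + 80,  s + 80,  8); /* fall through */
    case 0x50: memcpy (d + 72,  s + 72,  8); /* fall through */
    case 0x48: memcpy (d + 64,  s + 64,  8); /* fall through */
    case 0x40: memcpy (d + 56,  s + 56,  8); /* fall through */
    case 0x38: memcpy (d + 48,  s + 48,  8); /* fall through */
    case 0x30: memcpy (d + 40,  s + 40,  8); /* fall through */
    case 0x28: memcpy (d + 32,  s + 32,  8); /* fall through */
    case 0x20: memcpy (d + 24,  s + 24,  8); /* fall through */
    case 0x18: memcpy (d + 16,  s + 16,  8); /* fall through */
    case 0x10: memcpy (d + 8,   s + 8,   8); /* fall through */
    case 0x08: memcpy (d,       s,       8); /* fall through */
    case 0x00:
      /* end_0_8_unalign: the final 8 bytes, anchored at the end.  */
      memcpy (d + remaining - 8, s + remaining - 8, 8);
    }
}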
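
memset builds its fill word with three bstrins.d instructions, each doubling the populated width of the register. Equivalent C, with the matching instruction on each step:

#include <stdint.h>

uint64_t
splat_fill_byte (uint64_t c)
{
  c &= 0xff;      /* only the low byte of the int argument matters */
  c |= c << 8;    /* bstrins.d a1, a1, 15, 8  */
  c |= c << 16;   /* bstrins.d a1, a1, 31, 16 */
  c |= c << 32;   /* bstrins.d a1, a1, 63, 32 */
  return c;
}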
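
All of the byte-string routines (strchr, strchrnul, strcmp, strncmp, strcpy, strlen, strnlen) share the word-at-a-time NUL test that the file comments write as (x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f) != 0 -- the sub.d/or/andn triple in the assembly. A self-contained demonstration:

#include <stdint.h>
#include <stdio.h>

uint64_t
has_zero (uint64_t x)
{
  /* Subtracting 0x01 from every byte borrows into bit 7 exactly when
     the byte is 0x00; or-ing x with 0x7f first and inverting keeps
     that bit only for bytes below 0x80, killing false positives.
     This is sub.d tmp1, data, zeroones / or tmp2, data, sevenf /
     andn has_nul, tmp1, tmp2.  */
  return (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
}

int
main (void)
{
  uint64_t no_nul = 0x4142434445464748ULL; /* no zero byte */
  uint64_t a_nul  = 0x4142434400464748ULL; /* byte 3 (little-endian) is 0 */

  printf ("%d %d\n", has_zero (no_nul) != 0, has_zero (a_nul) != 0);
  /* Byte index of the first NUL, as the assembly computes with
     ctz.d followed by a shift right by 3.  Prints 3 here.  */
  printf ("%d\n", (int) (__builtin_ctzll (has_zero (a_nul)) >> 3));
  return 0;
}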
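
The MOVN/MOVZ/CONDITIONSEL macros exist because LoongArch has no conditional-move instruction; a select is assembled from maskeqz, masknez and or. What they compute, in C:

long
cond_select (long cond, long a, long b)  /* cond != 0 ? a : b */
{
  long t1 = cond ? a : 0;  /* maskeqz t1, a, cond */
  long t2 = cond ? 0 : b;  /* masknez t2, b, cond */
  return t1 | t2;          /* or      rd, t1, t2  */
}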
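
Finally, strcpy's page-cross guard. An unaligned 8-byte load at src is attempted only when it cannot run into the following page (the andi ..., 0xff8 check, which assumes 4 KiB pages and conservatively includes the exactly-fitting 0xff8 offset); otherwise the aligned word below src is loaded, src's bytes are shifted down, and the vacated high bytes are padded with 0xff so the padding can never read as a terminating NUL. A sketch of the same logic:

#include <stdint.h>
#include <string.h>

uint64_t
load_word_near_page_end (const char *src)
{
  uintptr_t p = (uintptr_t) src;
  uint64_t w;

  if ((p & 0xff8) != 0xff8)     /* whole load stays inside the page */
    {
      memcpy (&w, src, 8);      /* plain (possibly unaligned) ld.d  */
      return w;
    }

  unsigned off = p & 0x7;                    /* src_off              */
  memcpy (&w, (const char *) (p - off), 8);  /* aligned ld.d         */
  w >>= off * 8;                             /* drop bytes below src */
  w |= ~(~0ULL >> (off * 8));                /* pad high bytes, 0xff */
  return w;
}

If the padded word contains no NUL, the string really does continue onto the next page and the ordinary unaligned load is retried safely; otherwise the 8/4/2/1-byte tail store sequence finishes the copy.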