From 065775f8bdf46124bddfe3a432031b98a66846cd Mon Sep 17 00:00:00 2001
From: Xue Liu
Date: Sat, 14 Jan 2023 16:48:19 +0800
Subject: [PATCH] LoongArch: Optimize string functions including memcpy,
 memmove, memset, strchr, strchrnul, strcmp, strncmp, strcpy, strlen, strnlen.

---
 ...ze-string-functions-including-memcpy.patch | 2264 +++++++++++++++++
 glibc.spec                                    |    6 +-
 2 files changed, 2269 insertions(+), 1 deletion(-)
 create mode 100644 LoongArch-Optimize-string-functions-including-memcpy.patch

diff --git a/LoongArch-Optimize-string-functions-including-memcpy.patch b/LoongArch-Optimize-string-functions-including-memcpy.patch
new file mode 100644
index 0000000..17130d0
--- /dev/null
+++ b/LoongArch-Optimize-string-functions-including-memcpy.patch
@@ -0,0 +1,2264 @@
+From da7a2b19fc87165caaa1d5de5b058680f09d155d Mon Sep 17 00:00:00 2001
+From: Xue Liu
+Date: Sat, 14 Jan 2023 16:14:24 +0800
+Subject: [PATCH] LoongArch: Optimize string functions including memcpy,
+ memmove, memset, strchr, strchrnul, strcmp, strncmp, strcpy, strlen, strnlen.
+
+Change-Id: I2975aea74f44bf2c9e01a6dfb6ca2eaa57aa5f7c
+---
+ sysdeps/loongarch/lp64/memcpy.S    | 258 ++++++++++++++++
+ sysdeps/loongarch/lp64/memmove.S   | 476 +++++++++++++++++++++++++++++
+ sysdeps/loongarch/lp64/memset.S    | 175 +++++++++++
+ sysdeps/loongarch/lp64/strchr.S    | 140 +++++++++
+ sysdeps/loongarch/lp64/strchrnul.S | 156 ++++++++++
+ sysdeps/loongarch/lp64/strcmp.S    | 197 ++++++++++++
+ sysdeps/loongarch/lp64/strcpy.S    | 210 +++++++++++++
+ sysdeps/loongarch/lp64/strlen.S    | 135 ++++++++
+ sysdeps/loongarch/lp64/strncmp.S   | 269 ++++++++++++++++
+ sysdeps/loongarch/lp64/strnlen.S   | 155 ++++++++++
+ 10 files changed, 2171 insertions(+)
+ create mode 100644 sysdeps/loongarch/lp64/memcpy.S
+ create mode 100644 sysdeps/loongarch/lp64/memmove.S
+ create mode 100644 sysdeps/loongarch/lp64/memset.S
+ create mode 100644 sysdeps/loongarch/lp64/strchr.S
+ create mode 100644 sysdeps/loongarch/lp64/strchrnul.S
+ create mode 100644 sysdeps/loongarch/lp64/strcmp.S
+ create mode 100644 sysdeps/loongarch/lp64/strcpy.S
+ create mode 100644 sysdeps/loongarch/lp64/strlen.S
+ create mode 100644 sysdeps/loongarch/lp64/strncmp.S
+ create mode 100644 sysdeps/loongarch/lp64/strnlen.S
+
+diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S
+new file mode 100644
+index 00000000..5e531523
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/memcpy.S
+@@ -0,0 +1,258 @@
++#ifdef _LIBC
++#include <sysdep.h>
++#include <sys/regdef.h>
++#include <sys/asm.h>
++#else
++#include <regdef.h>
++#include <sys/asm.h>
++#endif
++
++/* Allow the routine to be named something else if desired.
*/ ++#ifndef MEMCPY_NAME ++#define MEMCPY_NAME memcpy ++#endif ++ ++#define LD_64(reg, n) \ ++ ld.d t0, reg, n; \ ++ ld.d t1, reg, n+8; \ ++ ld.d t2, reg, n+16; \ ++ ld.d t3, reg, n+24; \ ++ ld.d t4, reg, n+32; \ ++ ld.d t5, reg, n+40; \ ++ ld.d t6, reg, n+48; \ ++ ld.d t7, reg, n+56; ++ ++#define ST_64(reg, n) \ ++ st.d t0, reg, n; \ ++ st.d t1, reg, n+8; \ ++ st.d t2, reg, n+16; \ ++ st.d t3, reg, n+24; \ ++ st.d t4, reg, n+32; \ ++ st.d t5, reg, n+40; \ ++ st.d t6, reg, n+48; \ ++ st.d t7, reg, n+56; ++ ++#ifdef ANDROID_CHANGES ++LEAF(MEMCPY_NAME, 0) ++#else ++LEAF(MEMCPY_NAME) ++#endif ++ ++//1st var: dst ptr: void *a1 $r4 a0 ++//2nd var: src ptr: void *a2 $r5 a1 ++//3rd var: size_t len $r6 a2 ++//t0~t9 registers as temp ++ ++ add.d a4, a1, a2 ++ add.d a3, a0, a2 ++ li.w a6, 16 ++ bge a6, a2, less_16bytes ++ li.w a6, 128 ++ blt a6, a2, long_bytes ++ li.w a6, 64 ++ blt a6, a2, more_64bytes ++ li.w a6, 32 ++ blt a6, a2, more_32bytes ++ ++ /* 17...32 */ ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a4, -16 ++ ld.d t3, a4, -8 ++ st.d t0, a0, 0 ++ st.d t1, a0, 8 ++ st.d t2, a3, -16 ++ st.d t3, a3, -8 ++ jr ra ++ ++more_64bytes: ++ srli.d t8, a0, 3 ++ slli.d t8, t8, 3 ++ addi.d t8, t8, 0x8 ++ sub.d a7, a0, t8 ++ ld.d t0, a1, 0 ++ sub.d a1, a1, a7 ++ st.d t0, a0, 0 ++ ++ add.d a7, a7, a2 ++ addi.d a7, a7, -0x20 ++loop_32: ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a1, 16 ++ ld.d t3, a1, 24 ++ st.d t0, t8, 0 ++ st.d t1, t8, 8 ++ st.d t2, t8, 16 ++ st.d t3, t8, 24 ++ ++ addi.d t8, t8, 0x20 ++ addi.d a1, a1, 0x20 ++ addi.d a7, a7, -0x20 ++ blt zero, a7, loop_32 ++ ++ ld.d t4, a4, -32 ++ ld.d t5, a4, -24 ++ ld.d t6, a4, -16 ++ ld.d t7, a4, -8 ++ st.d t4, a3, -32 ++ st.d t5, a3, -24 ++ st.d t6, a3, -16 ++ st.d t7, a3, -8 ++ ++ jr ra ++ ++more_32bytes: ++ /* 33...64 */ ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a1, 16 ++ ld.d t3, a1, 24 ++ ld.d t4, a4, -32 ++ ld.d t5, a4, -24 ++ ld.d t6, a4, -16 ++ ld.d t7, a4, -8 ++ st.d t0, a0, 0 ++ st.d t1, a0, 8 ++ st.d t2, a0, 16 ++ st.d t3, a0, 24 ++ st.d t4, a3, -32 ++ st.d t5, a3, -24 ++ st.d t6, a3, -16 ++ st.d t7, a3, -8 ++ jr ra ++ ++less_16bytes: ++ srai.d a6, a2, 3 ++ beqz a6, less_8bytes ++ ++ /* 8...16 */ ++ ld.d t0, a1, 0 ++ ld.d t1, a4, -8 ++ st.d t0, a0, 0 ++ st.d t1, a3, -8 ++ ++ jr ra ++ ++less_8bytes: ++ srai.d a6, a2, 2 ++ beqz a6, less_4bytes ++ ++ /* 4...7 */ ++ ld.w t0, a1, 0 ++ ld.w t1, a4, -4 ++ st.w t0, a0, 0 ++ st.w t1, a3, -4 ++ jr ra ++ ++less_4bytes: ++ srai.d a6, a2, 1 ++ beqz a6, less_2bytes ++ ++ /* 2...3 */ ++ ld.h t0, a1, 0 ++ ld.h t1, a4, -2 ++ st.h t0, a0, 0 ++ st.h t1, a3, -2 ++ jr ra ++ ++less_2bytes: ++ beqz a2, less_1bytes ++ ++ ld.b t0, a1, 0 ++ st.b t0, a0, 0 ++ jr ra ++ ++less_1bytes: ++ jr ra ++ ++long_bytes: ++ srli.d t8, a0, 3 ++ slli.d t8, t8, 3 ++ beq a0, t8, start ++ ++ ld.d t0, a1, 0 ++ addi.d t8, t8, 0x8 ++ st.d t0, a0, 0 ++ sub.d a7, a0, t8 ++ sub.d a1, a1, a7 ++ ++start: ++ addi.d a5, a3, -0x80 ++ blt a5, t8, align_end_proc ++ ++loop_128: ++ LD_64(a1, 0) ++ ST_64(t8, 0) ++ LD_64(a1, 64) ++ addi.d a1, a1, 0x80 ++ ST_64(t8, 64) ++ addi.d t8, t8, 0x80 ++ bge a5, t8, loop_128 ++ ++align_end_proc: ++ sub.d a2, a3, t8 ++ ++ pcaddi t1, 34 ++ andi t2, a2, 0x78 ++ sub.d t1, t1, t2 ++ jirl zero, t1, 0 ++ ++end_120_128_unalign: ++ ld.d t0, a1, 112 ++ st.d t0, t8, 112 ++end_112_120_unalign: ++ ld.d t0, a1, 104 ++ st.d t0, t8, 104 ++end_104_112_unalign: ++ ld.d t0, a1, 96 ++ st.d t0, t8, 96 ++end_96_104_unalign: ++ ld.d t0, a1, 88 ++ st.d t0, t8, 88 ++end_88_96_unalign: ++ ld.d t0, a1, 80 ++ st.d t0, t8, 80 
++end_80_88_unalign: ++ ld.d t0, a1, 72 ++ st.d t0, t8, 72 ++end_72_80_unalign: ++ ld.d t0, a1, 64 ++ st.d t0, t8, 64 ++end_64_72_unalign: ++ ld.d t0, a1, 56 ++ st.d t0, t8, 56 ++end_56_64_unalign: ++ ld.d t0, a1, 48 ++ st.d t0, t8, 48 ++end_48_56_unalign: ++ ld.d t0, a1, 40 ++ st.d t0, t8, 40 ++end_40_48_unalign: ++ ld.d t0, a1, 32 ++ st.d t0, t8, 32 ++end_32_40_unalign: ++ ld.d t0, a1, 24 ++ st.d t0, t8, 24 ++end_24_32_unalign: ++ ld.d t0, a1, 16 ++ st.d t0, t8, 16 ++end_16_24_unalign: ++ ld.d t0, a1, 8 ++ st.d t0, t8, 8 ++end_8_16_unalign: ++ ld.d t0, a1, 0 ++ st.d t0, t8, 0 ++end_0_8_unalign: ++ ld.d t0, a4, -8 ++ st.d t0, a3, -8 ++ ++ jr ra ++ ++END(MEMCPY_NAME) ++#ifndef ANDROID_CHANGES ++#ifdef _LIBC ++libc_hidden_builtin_def (MEMCPY_NAME) ++#endif ++#endif ++ +diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S +new file mode 100644 +index 00000000..f87d036b +--- /dev/null ++++ b/sysdeps/loongarch/lp64/memmove.S +@@ -0,0 +1,476 @@ ++#ifdef _LIBC ++#include ++#include ++#include ++#else ++#include ++#include ++#endif ++ ++/* Allow the routine to be named something else if desired. */ ++#ifndef MEMMOVE_NAME ++#define MEMMOVE_NAME memmove ++#endif ++ ++#define LD_64(reg, n) \ ++ ld.d t0, reg, n; \ ++ ld.d t1, reg, n+8; \ ++ ld.d t2, reg, n+16; \ ++ ld.d t3, reg, n+24; \ ++ ld.d t4, reg, n+32; \ ++ ld.d t5, reg, n+40; \ ++ ld.d t6, reg, n+48; \ ++ ld.d t7, reg, n+56; ++ ++ ++#define ST_64(reg, n) \ ++ st.d t0, reg, n; \ ++ st.d t1, reg, n+8; \ ++ st.d t2, reg, n+16; \ ++ st.d t3, reg, n+24; \ ++ st.d t4, reg, n+32; \ ++ st.d t5, reg, n+40; \ ++ st.d t6, reg, n+48; \ ++ st.d t7, reg, n+56; ++ ++#define LDST_1024 \ ++ LD_64(a1, 0); \ ++ ST_64(a0, 0); \ ++ LD_64(a1, 64); \ ++ ST_64(a0, 64); \ ++ LD_64(a1, 128); \ ++ ST_64(a0, 128); \ ++ LD_64(a1, 192); \ ++ ST_64(a0, 192); \ ++ LD_64(a1, 256); \ ++ ST_64(a0, 256); \ ++ LD_64(a1, 320); \ ++ ST_64(a0, 320); \ ++ LD_64(a1, 384); \ ++ ST_64(a0, 384); \ ++ LD_64(a1, 448); \ ++ ST_64(a0, 448); \ ++ LD_64(a1, 512); \ ++ ST_64(a0, 512); \ ++ LD_64(a1, 576); \ ++ ST_64(a0, 576); \ ++ LD_64(a1, 640); \ ++ ST_64(a0, 640); \ ++ LD_64(a1, 704); \ ++ ST_64(a0, 704); \ ++ LD_64(a1, 768); \ ++ ST_64(a0, 768); \ ++ LD_64(a1, 832); \ ++ ST_64(a0, 832); \ ++ LD_64(a1, 896); \ ++ ST_64(a0, 896); \ ++ LD_64(a1, 960); \ ++ ST_64(a0, 960); ++ ++#define LDST_1024_BACK \ ++ LD_64(a4, -64); \ ++ ST_64(a3, -64); \ ++ LD_64(a4, -128); \ ++ ST_64(a3, -128); \ ++ LD_64(a4, -192); \ ++ ST_64(a3, -192); \ ++ LD_64(a4, -256); \ ++ ST_64(a3, -256); \ ++ LD_64(a4, -320); \ ++ ST_64(a3, -320); \ ++ LD_64(a4, -384); \ ++ ST_64(a3, -384); \ ++ LD_64(a4, -448); \ ++ ST_64(a3, -448); \ ++ LD_64(a4, -512); \ ++ ST_64(a3, -512); \ ++ LD_64(a4, -576); \ ++ ST_64(a3, -576); \ ++ LD_64(a4, -640); \ ++ ST_64(a3, -640); \ ++ LD_64(a4, -704); \ ++ ST_64(a3, -704); \ ++ LD_64(a4, -768); \ ++ ST_64(a3, -768); \ ++ LD_64(a4, -832); \ ++ ST_64(a3, -832); \ ++ LD_64(a4, -896); \ ++ ST_64(a3, -896); \ ++ LD_64(a4, -960); \ ++ ST_64(a3, -960); \ ++ LD_64(a4, -1024); \ ++ ST_64(a3, -1024); ++ ++#ifdef ANDROID_CHANGES ++LEAF(MEMMOVE_NAME, 0) ++#else ++LEAF(MEMMOVE_NAME) ++#endif ++ ++//1st var: dest ptr: void *str1 $r4 a0 ++//2nd var: src ptr: void *str2 $r5 a1 ++//3rd var: size_t num ++//t0~t9 registers as temp ++ ++ add.d a4, a1, a2 ++ add.d a3, a0, a2 ++ beq a1, a0, less_1bytes ++ move t8, a0 ++ srai.d a6, a2, 4 #num/16 ++ beqz a6, less_16bytes #num<16 ++ srai.d a6, a2, 6 #num/64 ++ bnez a6, more_64bytes #num>64 ++ srai.d a6, a2, 5 ++ beqz a6, less_32bytes 
#num<32 ++ ++ ld.d t0, a1, 0 #32 ++#include ++#include ++#else ++#include ++#include ++#endif ++ ++#ifdef LOONGSON_TEST ++#define MEMSET _memset ++#else ++#define MEMSET memset ++#endif ++ ++#define ST_128(n) \ ++ st.d a1, a0, n; \ ++ st.d a1, a0, n+8 ; \ ++ st.d a1, a0, n+16 ; \ ++ st.d a1, a0, n+24 ; \ ++ st.d a1, a0, n+32 ; \ ++ st.d a1, a0, n+40 ; \ ++ st.d a1, a0, n+48 ; \ ++ st.d a1, a0, n+56 ; \ ++ st.d a1, a0, n+64 ; \ ++ st.d a1, a0, n+72 ; \ ++ st.d a1, a0, n+80 ; \ ++ st.d a1, a0, n+88 ; \ ++ st.d a1, a0, n+96 ; \ ++ st.d a1, a0, n+104; \ ++ st.d a1, a0, n+112; \ ++ st.d a1, a0, n+120; \ ++ ++//1st var: void *str $4 a0 ++//2nd var: int val $5 a1 ++//3rd var: size_t num $6 a2 ++ ++LEAF(MEMSET) ++ ++memset: ++ .align 6 ++ ++ bstrins.d a1, a1, 15, 8 ++ add.d t7, a0, a2 ++ bstrins.d a1, a1, 31, 16 ++ move t0, a0 ++ bstrins.d a1, a1, 63, 32 ++ srai.d t8, a2, 4 #num/16 ++ beqz t8, less_16bytes #num<16 ++ srai.d t8, a2, 6 #num/64 ++ bnez t8, more_64bytes #num>64 ++ srai.d t8, a2, 5 #num/32 ++ beqz t8, less_32bytes #num<32 ++ st.d a1, a0, 0 #32 ++#include ++ ++ ++ ++ ++ ++#define L_ADDIU addi.d ++#define L_ADDU add.d ++#define L_SUBU sub.d ++ ++#define STRCHR strchr ++#define MOVN(rd,rs,rt) \ ++ maskeqz t6, rs, rt;\ ++ masknez rd, rd, rt;\ ++ or rd, rd, t6 ++ ++#define MOVN2(rd,rt) \ ++ masknez rd, rd, rt;\ ++ or rd, rd, rt ++ ++ ++/* char * strchr (const char *s1, int c); */ ++ ++LEAF(STRCHR) ++ .align 6 ++ ++ li.w t4, 0x7 ++ lu12i.w a2, 0x01010 ++ bstrins.d a1, a1, 15, 8 ++ andi t0, a0, 0x7 ++ ++ ori a2, a2, 0x101 ++ andn t4, a0, t4 ++ slli.w t1, t0, 3 ++ ++ ld.d t4, t4, 0 ++ ++ ++ nor t8, zero, zero ++ bstrins.d a1, a1, 31, 16 ++ srl.d t4, t4, t1 ++ ++ bstrins.d a1, a1, 63, 32 ++ bstrins.d a2, a2, 63, 32 ++ srl.d a7, t8, t1 ++ ++ li.w t1, 8 ++ nor t8, a7, zero ++ slli.d a3, a2, 7 ++ or t5, t8, t4 ++ and t3, a7, a1 ++ ++ sub.w t1, t1, t0 ++ nor a3, a3, zero ++ xor t2, t5, t3 ++ sub.d a7, t5, a2 ++ nor a6, t5, a3 ++ ++ sub.d a5, t2, a2 ++ nor a4, t2, a3 ++ ++ and a6, a7, a6 ++ and a5, a5, a4 ++ or a7, a6, a5 ++ bnez a7, L(_mc8_a) ++ ++ L_ADDU a0, a0, t1 ++L(_aloop): ++ ld.d t4, a0, 0 ++ ++ xor t2, t4, a1 ++ sub.d a7, t4, a2 ++ nor a6, t4, a3 ++ sub.d a5, t2, a2 ++ ++ nor a4, t2, a3 ++ and a6, a7, a6 ++ and a5, a5, a4 ++ or a7, a6, a5 ++ bnez a7, L(_mc8_a) ++ ++ ld.d t4, a0, 8 ++ L_ADDIU a0, a0, 16 ++ xor t2, t4, a1 ++ sub.d a7, t4, a2 ++ nor a6, t4, a3 ++ sub.d a5, t2, a2 ++ ++ nor a4, t2, a3 ++ and a6, a7, a6 ++ and a5, a5, a4 ++ or a7, a6, a5 ++ beqz a7, L(_aloop) ++ ++ L_ADDIU a0, a0, -8 ++L(_mc8_a): ++ ++ ctz.d t0, a5 ++ ctz.d t2, a6 ++ ++ srli.w t0, t0, 3 ++ srli.w t2, t2, 3 ++ sltu t1, t2, t0 ++ L_ADDU v0, a0, t0 ++ masknez v0, v0, t1 ++ jr ra ++END(STRCHR) ++ ++#ifndef ANDROID_CHANGES ++#ifdef _LIBC ++libc_hidden_builtin_def (strchr) ++weak_alias (strchr, index) ++#endif ++#endif +diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S +new file mode 100644 +index 00000000..a57a5065 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/strchrnul.S +@@ -0,0 +1,156 @@ ++/* Copyright 2016 Loongson Technology Corporation Limited */ ++ ++/* Author: Songyuekun songyuekun@loongson.cn */ ++ ++/* ++ * ISA: MIPS64R2 ++ * ABI: N64 ++ */ ++ ++/* basic algorithm : ++ ++ +. use ld.d and mask for the first 8 bytes or less; ++ ++ +. build a1 with 8c with dins; ++ ++ +. use xor from a1 and v0 to check if is found; ++ ++ +. 
if (v0 - 0x0101010101010101) & (~(v0 | 0x7f7f7f7f7f7f7f7f)!= 0, v0 has ++ one byte is \0, else has no \0 ++ ++*/ ++ ++ ++ ++ ++#include ++#include ++ ++ ++ ++ ++ ++#define L_ADDIU addi.d ++#define L_ADDU add.d ++#define L_SUBU sub.d ++ ++#define STRCHRNUL __strchrnul ++ ++#define MOVN(rd,rs,rt) \ ++ maskeqz t6, rs, rt;\ ++ masknez rd, rd, rt;\ ++ or rd, rd, t6 ++ ++#define MOVZ(rd,rs,rt) \ ++ masknez t6, rs, rt;\ ++ maskeqz rd, rd, rt;\ ++ or rd, rd, t6 ++ ++ ++#define MOVN2(rd,rt) \ ++ masknez rd, rd, rt;\ ++ or rd, rd, rt ++ ++ ++/* char * strchrnul (const char *s1, int c); */ ++ ++LEAF(STRCHRNUL) ++ .align 6 ++ ++ li.w t4, 0x7 ++ lu12i.w a2, 0x01010 ++ bstrins.d a1, a1, 15, 8 ++ andi t0, a0, 0x7 ++ ++ ori a2, a2, 0x101 ++ andn t4, a0, t4 ++ slli.w t1, t0, 3 ++/* ++ ldr t4, 0(a0) ++*/ ++ ld.d t4, t4, 0 ++ ++ ++ nor t8, zero, zero ++ bstrins.d a1, a1, 31, 16 ++ srl.d t4, t4, t1 ++ ++ preld 0, a0, 32 ++ bstrins.d a1, a1, 63, 32 ++ bstrins.d a2, a2, 63, 32 ++ srl.d a7, t8, t1 ++ ++ nor t8, a7, zero ++ slli.d a3, a2, 7 ++ or t5, t8, t4 ++ and t3, a7, a1 ++ ++ nor a3, a3, zero ++ xor t2, t5, t3 ++ sub.d a7, t5, a2 ++ nor a6, t5, a3 ++ ++ li.w t1, 8 ++ sub.d a5, t2, a2 ++ nor a4, t2, a3 ++ ++ and a6, a7, a6 ++ and a5, a5, a4 ++ or a7, a6, a5 ++ bnez a7, L(_mc8_a) ++ ++ ++ sub.w t1, t1, t0 ++ L_ADDU a0, a0, t1 ++L(_aloop): ++ ld.d t4, a0, 0 ++ ++ xor t2, t4, a1 ++ sub.d a7, t4, a2 ++ nor a6, t4, a3 ++ sub.d a5, t2, a2 ++ ++ nor a4, t2, a3 ++ and a6, a7, a6 ++ and a5, a5, a4 ++ ++ or a7, a6, a5 ++ bnez a7, L(_mc8_a) ++ ++ ld.d t4, a0, 8 ++ L_ADDIU a0, a0, 16 ++ ++ xor t2, t4, a1 ++ sub.d a7, t4, a2 ++ nor a6, t4, a3 ++ sub.d a5, t2, a2 ++ ++ nor a4, t2, a3 ++ and a6, a7, a6 ++ and a5, a5, a4 ++ ++ or a7, a6, a5 ++ beqz a7, L(_aloop) ++ ++ L_ADDIU a0, a0, -8 ++L(_mc8_a): ++ ++ ctz.d t0, a5 ++ ctz.d t2, a6 ++ ++ srli.w t0, t0, 3 ++ srli.w t2, t2, 3 ++ slt t1, t0, t2 ++ ++ MOVZ(t0,t2,t1) ++ ++ L_ADDU v0, a0, t0 ++ jr ra ++END(STRCHRNUL) ++ ++#ifndef ANDROID_CHANGES ++#ifdef _LIBC ++weak_alias(__strchrnul, strchrnul) ++libc_hidden_builtin_def (__strchrnul) ++#endif ++#endif +diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S +new file mode 100644 +index 00000000..11474bf2 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/strcmp.S +@@ -0,0 +1,197 @@ ++/* Copyright 2016 Loongson Technology Corporation Limited */ ++ ++/* Author: songyuekun songyuekun@loongson.cn */ ++ ++/* ++ * ISA: MIPS64R2 ++ * ABI: N64 ++ */ ++ ++/* basic algorithm : ++ ++ +. let t0, t1 point to a0, a1, if a0 has smaller low 3 bit of a0 and a1, ++ set a4 to 1 and let t0 point to the larger of lower 3bit of a0 and a1 ++ ++ +. if low 3 bit of a0 equal low 3 bit of a0, use a ldr one time and more ld other times; ++ ++ +. if not, load partial t2 and t3, check if t2 has \0; ++ ++ +. then use use ld for t0, ldr for t1, ++ ++ +. if partial 8 byte from t1 has \0, compare partial 8 byte from t1 with 8 ++ byte from t0 with a mask in a7 ++ ++ +. if not, ldl other part of t1, compare 8 byte from t1 with 8 byte from t0 ++ ++ +. if (v0 - 0x0101010101010101) & (~v0) & 0x8080808080808080 != 0, v0 has ++ one byte is \0, else has no \0 ++ ++ +. 
for partial 8 byte from ldr t3, 0(a0), preload t3 with 0xffffffffffffffff ++ ++ ++*/ ++#include ++#include ++ ++ ++#define STRCMP strcmp ++ ++#define REP8_01 0x0101010101010101 ++#define REP8_7f 0x7f7f7f7f7f7f7f7f ++#define REP8_80 0x8080808080808080 ++ ++/* Parameters and Results */ ++#define src1 a0 ++#define src2 a1 ++#define result v0 ++// Note: v0 = a0 in N64 ABI ++ ++ ++/* Internal variable */ ++#define data1 t0 ++#define data2 t1 ++#define has_nul t2 ++#define diff t3 ++#define syndrome t4 ++#define zeroones t5 ++#define sevenf t6 ++#define pos t7 ++#define exchange t8 ++#define tmp1 a4 ++#define tmp2 a5 ++#define tmp3 a6 ++#define src1_off a2 ++#define src2_off a3 ++#define tmp4 a7 ++ ++/* rd <- if rc then ra else rb ++ will destroy tmp3 ++*/ ++#define CONDITIONSEL(rd,rc,ra,rb)\ ++ masknez tmp3, rb, rc;\ ++ maskeqz rd, ra, rc;\ ++ or rd, rd, tmp3 ++ ++ ++ ++/* int strcmp (const char *s1, const char *s2); */ ++ ++LEAF(STRCMP) ++ .align 4 ++ ++ xor tmp1, src1, src2 ++ lu12i.w zeroones, 0x01010 ++ lu12i.w sevenf, 0x7f7f7 ++ andi src1_off, src1, 0x7 ++ ori zeroones, zeroones, 0x101 ++ ori sevenf, sevenf, 0xf7f ++ andi tmp1, tmp1, 0x7 ++ bstrins.d zeroones, zeroones, 63, 32 ++ bstrins.d sevenf, sevenf, 63, 32 ++ bnez tmp1, strcmp_misaligned8 ++ bnez src1_off, strcmp_mutual_align ++strcmp_loop_aligned: ++ ld.d data1, src1, 0 ++ addi.d src1, src1, 8 ++ ld.d data2, src2, 0 ++ addi.d src2, src2, 8 ++strcmp_start_realigned: ++ sub.d tmp1, data1, zeroones ++ or tmp2, data1, sevenf ++ xor diff, data1, data2 ++ andn has_nul, tmp1, tmp2 ++ or syndrome, diff, has_nul ++ beqz syndrome, strcmp_loop_aligned ++ ++strcmp_end: ++ ctz.d pos, syndrome ++ bstrins.d pos, zero, 2, 0 ++ srl.d data1, data1, pos ++ srl.d data2, data2, pos ++ andi data1, data1, 0xff ++ andi data2, data2, 0xff ++ sub.d result, data1, data2 ++ jr ra ++strcmp_mutual_align: ++ bstrins.d src1, zero, 2, 0 ++ bstrins.d src2, zero, 2, 0 ++ slli.d tmp1, src1_off, 0x3 ++ ld.d data1, src1, 0 ++ sub.d tmp1, zero, tmp1 ++ ld.d data2, src2, 0 ++ addi.d src1, src1, 8 ++ addi.d src2, src2, 8 ++ nor tmp2, zero, zero ++ srl.d tmp2, tmp2, tmp1 ++ or data1, data1, tmp2 ++ or data2, data2, tmp2 ++ b strcmp_start_realigned ++ ++strcmp_misaligned8: ++ ++/* check ++ if ((src1 != 0) && ((src2 == 0 ) || (src1 < src2))) ++ then exchange(src1,src2) ++ ++*/ ++ andi src2_off, src2, 0x7 ++ slt tmp2, src1_off, src2_off ++ CONDITIONSEL(tmp2,src2_off,tmp2,tmp1) ++ maskeqz exchange, tmp2, src1_off ++ xor tmp3, src1, src2 ++ maskeqz tmp3, tmp3, exchange ++ xor src1, src1, tmp3 ++ xor src2, src2, tmp3 ++ ++ andi src1_off, src1, 0x7 ++ beqz src1_off, strcmp_loop_misaligned ++strcmp_do_misaligned: ++ ld.bu data1, src1, 0 ++ ld.bu data2, src2, 0 ++ xor tmp3, data1, data2 ++ addi.d src1, src1, 1 ++ masknez tmp3, data1, tmp3 ++ addi.d src2, src2, 1 ++ beqz tmp3, strcmp_done ++ andi src1_off, src1, 0x7 ++ bnez src1_off, strcmp_do_misaligned ++ ++strcmp_loop_misaligned: ++ andi tmp1, src2, 0xff8 ++ xori tmp1, tmp1, 0xff8 ++ beqz tmp1, strcmp_do_misaligned ++ ld.d data1, src1, 0 ++ ld.d data2, src2, 0 ++ addi.d src1, src1, 8 ++ addi.d src2, src2, 8 ++ ++ sub.d tmp1, data1, zeroones ++ or tmp2, data1, sevenf ++ xor diff, data1, data2 ++ andn has_nul, tmp1, tmp2 ++ or syndrome, diff, has_nul ++ beqz syndrome, strcmp_loop_misaligned ++// b strcmp_end ++strcmp_misalign_end: ++ ctz.d pos, syndrome ++ bstrins.d pos, zero, 2, 0 ++ srl.d data1, data1, pos ++ srl.d data2, data2, pos ++ andi data1, data1, 0xff ++ andi data2, data2, 0xff ++ sub.d tmp1, data1, data2 ++ sub.d tmp2, 
data2, data1 ++ CONDITIONSEL(result,exchange,tmp2,tmp1) ++ jr ra ++ ++strcmp_done: ++ sub.d tmp1, data1, data2 ++ sub.d tmp2, data2, data1 ++ CONDITIONSEL(result,exchange,tmp2,tmp1) ++ jr ra ++END(STRCMP) ++#ifndef ANDROID_CHANGES ++#ifdef _LIBC ++libc_hidden_builtin_def (strcmp) ++#endif ++#endif +diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S +new file mode 100644 +index 00000000..ce39e5a1 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/strcpy.S +@@ -0,0 +1,210 @@ ++/* Copyright 2016 Loongson Technology Corporation Limited */ ++ ++/* Author: Huang Pei huangpei@loongson.cn */ ++ ++/* ++ * ISA: MIPS64R2 ++ * ABI: N64 ++ */ ++ ++/* basic algorithm : ++ ++ +. if src aligned. just do the copy loop. if not, do the cross page check and copy one double word. ++ ++ Then move src to aligned. ++ ++ +. if (v0 - 0x0101010101010101) & (~v0) & 0x8080808080808080 != 0, v0 has ++ one byte is \0, else has no \0 ++ ++ ++*/ ++ ++ ++#include ++#include ++ ++ ++#define STRCPY strcpy ++ ++ ++#define REP8_01 0x0101010101010101 ++#define REP8_7f 0x7f7f7f7f7f7f7f7f ++#define REP8_80 0x8080808080808080 ++ ++/* Parameters and Results */ ++#define dest a0 ++#define src a1 ++#define result v0 ++// Note: v0 = a0 in N64 ABI ++ ++ ++/* Internal variable */ ++#define data t0 ++#define data1 t1 ++#define has_nul t2 ++#define diff t3 ++#define syndrome t4 ++#define zeroones t5 ++#define sevenf t6 ++#define pos t7 ++#define dest_backup t8 ++#define tmp1 a4 ++#define tmp2 a5 ++#define tmp3 a6 ++#define dest_off a2 ++#define src_off a3 ++#define tmp4 a7 ++ ++/* rd <- if rc then ra else rb ++ will destroy tmp3 ++*/ ++#define CONDITIONSEL(rd,rc,ra,rb)\ ++ masknez tmp3, rb, rc;\ ++ maskeqz rd, ra, rc;\ ++ or rd, rd, tmp3 ++ ++ ++ ++/* int strcpy (const char *s1, const char *s2); */ ++ ++LEAF(STRCPY) ++ .align 4 ++ ++ move dest_backup, dest ++ lu12i.w zeroones, 0x01010 ++ lu12i.w sevenf, 0x7f7f7 ++ ori zeroones, zeroones, 0x101 ++ ori sevenf, sevenf, 0xf7f ++ bstrins.d zeroones, zeroones, 63, 32 ++ bstrins.d sevenf, sevenf, 63, 32 ++ andi src_off, src, 0x7 ++ beqz src_off, strcpy_loop_aligned_1 ++ b strcpy_mutual_align ++strcpy_loop_aligned: ++ st.d data, dest, 0 ++ addi.d dest, dest, 8 ++strcpy_loop_aligned_1: ++ ld.d data, src, 0 ++ addi.d src, src, 8 ++strcpy_start_realigned: ++ sub.d tmp1, data, zeroones ++ or tmp2, data, sevenf ++ andn has_nul, tmp1, tmp2 ++ beqz has_nul, strcpy_loop_aligned ++ ++strcpy_end: ++ ++/* ++8 4 2 1 ++*/ ++ ctz.d pos, has_nul ++ srli.d pos, pos, 3 ++ addi.d pos, pos, 1 ++/* ++ Do 8/4/2/1 strcpy based on pos value. ++ pos value is the number of bytes to be copied ++ the bytes include the final \0 so the max length is 8 and the min length is 1 ++*/ ++ ++strcpy_end_8: ++ andi tmp1, pos, 0x8 ++ beqz tmp1, strcpy_end_4 ++ st.d data, dest, 0 ++ move dest, dest_backup ++ jr ra ++strcpy_end_4: ++ andi tmp1, pos, 0x4 ++ beqz tmp1, strcpy_end_2 ++ st.w data, dest, 0 ++ srli.d data, data, 32 ++ addi.d dest, dest, 4 ++strcpy_end_2: ++ andi tmp1, pos, 0x2 ++ beqz tmp1, strcpy_end_1 ++ st.h data, dest, 0 ++ srli.d data, data, 16 ++ addi.d dest, dest, 2 ++strcpy_end_1: ++ andi tmp1, pos, 0x1 ++ beqz tmp1, strcpy_end_ret ++ st.b data, dest, 0 ++strcpy_end_ret: ++ move result, dest_backup ++ jr ra ++ ++ ++strcpy_mutual_align: ++/* ++ Check if around src page bound. ++ if not go to page cross ok. ++ if it is, do further check. ++ use tmp2 to accelerate. 
++*/ ++ ++ li.w tmp2, 0xff8 ++ andi tmp1, src, 0xff8 ++ beq tmp1, tmp2, strcpy_page_cross ++ ++strcpy_page_cross_ok: ++/* ++ Load a misaligned double word and check if has \0 ++ If no, do a misaligned double word paste. ++ If yes, calculate the number of avaliable bytes, ++ then jump to 4/2/1 end. ++*/ ++ ld.d data, src, 0 ++ sub.d tmp1, data, zeroones ++ or tmp2, data, sevenf ++ andn has_nul, tmp1, tmp2 ++ bnez has_nul, strcpy_end ++strcpy_mutual_align_finish: ++/* ++ Before jump back to align loop, make dest/src aligned. ++ This will cause a duplicated paste for several bytes between the first double word and the second double word, ++ but should not bring a problem. ++*/ ++ li.w tmp1, 8 ++ st.d data, dest, 0 ++ sub.d tmp1, tmp1, src_off ++ add.d src, src, tmp1 ++ add.d dest, dest, tmp1 ++ ++ b strcpy_loop_aligned_1 ++ ++strcpy_page_cross: ++/* ++ ld.d from aligned address(src & ~0x7). ++ check if high bytes have \0. ++ it not, go back to page cross ok, ++ since the string is supposed to cross the page bound in such situation. ++ if it is, do a srl for data to make it seems like a direct double word from src, ++ then go to 4/2/1 strcpy end. ++ ++ tmp4 is 0xffff...ffff mask ++ tmp2 demonstrate the bytes to be masked ++ tmp2 = src_off << 3 ++ data = data >> (src_off * 8) | -1 << (64 - src_off * 8) ++ and ++ -1 << (64 - src_off * 8) -> ~(-1 >> (src_off * 8)) ++ ++*/ ++ li.w tmp1, 0x7 ++ andn tmp3, src, tmp1 ++ ld.d data, tmp3, 0 ++ li.w tmp4, -1 ++ slli.d tmp2, src_off, 3 ++ srl.d tmp4, tmp4, tmp2 ++ srl.d data, data, tmp2 ++ nor tmp4, tmp4, zero ++ or data, data, tmp4 ++ sub.d tmp1, data, zeroones ++ or tmp2, data, sevenf ++ andn has_nul, tmp1, tmp2 ++ beqz has_nul, strcpy_page_cross_ok ++ b strcpy_end ++END(STRCPY) ++#ifndef ANDROID_CHANGES ++#ifdef _LIBC ++libc_hidden_builtin_def (strcpy) ++#endif ++#endif +diff --git a/sysdeps/loongarch/lp64/strlen.S b/sysdeps/loongarch/lp64/strlen.S +new file mode 100644 +index 00000000..a34d8b69 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/strlen.S +@@ -0,0 +1,135 @@ ++/* Copyright 2016 Loongson Technology Corporation Limited */ ++ ++/* Author: Songyuekun songyuekun@loongson.cn */ ++ ++/* ++ * ISA: MIPS64R2 ++ * ABI: N64 ++ */ ++/* ++algorithm: ++ ++ #. use ld/ldr to access word/partial word in the string ++ ++ #. use (x - 0x0101010101010101) & (~(x | 0x7f7f7f7f7f7f7f7f) != 0 to ++ judge if x has zero byte ++ ++ #. use dctz((x - 0x0101010101010101) & (~(x | 0x7f7f7f7f7f7f7f7f) >> 3 ++ to get the index of first rightmost zero byte in dword x; ++ ++ #. use dctz(x) = 64 - dclz(~x & (x-1)); ++ ++ #. 
use pointer to the last non zero byte minus pointer to the start ++ of the string to get the length of string ++ ++*/ ++ ++ ++#include ++#include ++ ++ ++ ++#define L_ADDIU addi.d ++#define L_ADDU add.d ++#define L_SUBU sub.d ++ ++#define STRLEN strlen ++#define L(x) x ++ ++ ++/* size_t strlen (const char *s1); */ ++ ++ .text; ++ .globl strlen; ++ .align 5; ++ cfi_startproc ; ++ .type strlen, @function; ++strlen: ++ ++ //LEAF(strlen) ++ #preld 0, a0, 0 ++ ++ nor t4, zero, zero ++ lu12i.w a2, 0x01010 ++ andi t5, a0, 0x7 ++ ++ li.w t7, 0x7 ++ slli.d t6, t5, 0x3 ++ andn t7, a0, t7 ++ ld.d a1, t7, 0 ++ sub.d t7, zero, t6 ++ sll.d t4, t4, t7 ++ maskeqz t4, t4, t6 ++ srl.d a1, a1, t6 ++ or a1, a1, t4 ++ ++ ++ ori a2, a2, 0x101 ++ nor t1, a1, zero ++ li.w a4, 8 ++ ++ #preld 0, a0, 32 ++ bstrins.d a2, a2, 63, 32 ++ sub.d a5, a4, t5 ++ move t5, a0 ++ ++ sub.d t0, a1, a2 ++ slli.d t4, a2, 7 ++ nor a3, zero, t4 ++ nor t1, a1, a3 ++ ++ and t0, t0, t1 ++ #preld 0, a0, 64 ++ bnez t0, strlen_count1 /* instead of use bnel with daddu a0, a0, a5 in branch slot */ ++ L_ADDU a0, a0, a5 ++strlen_loop: ++ ld.d a1, a0, 0 ++ sub.d t0, a1, a2 ++ and t1, t0, t4 ++ bnez t1, strlen_count_pre ++ ld.d a1, a0, 8 ++ sub.d t0, a1, a2 ++ and t1, t0, t4 ++ L_ADDIU a0, a0, 16 ++ beqz t1, strlen_loop ++strlen_count: ++ addi.d a0, a0, -8 ++strlen_count_pre: ++ nor t1, a1, a3 ++ and t0, t0, t1 ++ beqz t0, strlen_noascii_start ++strlen_count1: ++ ctz.d t1, t0 ++ L_SUBU v0, a0, t5 ++ srli.w t1, t1, 3 ++ L_ADDU v0, v0, t1 ++ jr ra ++strlen_noascii_start: ++ addi.d a0, a0, 8 ++strlen_loop_noascii: ++ ld.d a1, a0, 0 ++ sub.d t0, a1, a2 ++ nor t1, a1, a3 ++ and t0, t0, t1 ++ bnez t0, strlen_count1 ++ ld.d a1, a0, 8 ++ sub.d t0, a1, a2 ++ nor t1, a1, a3 ++ and t0, t0, t1 ++ L_ADDIU a0, a0, 16 ++ beqz t0, strlen_loop_noascii ++ addi.d a0, a0, -8 ++ ctz.d t1, t0 ++ L_SUBU v0, a0, t5 ++ srli.w t1, t1, 3 ++ L_ADDU v0, v0, t1 ++ jr ra ++END(STRLEN) ++ ++#ifndef ANDROID_CHANGES ++#ifdef _LIBC ++libc_hidden_builtin_def (strlen) ++#endif ++#endif ++ +diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S +new file mode 100644 +index 00000000..29cc7b02 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/strncmp.S +@@ -0,0 +1,269 @@ ++/* Copyright 2016 Loongson Technology Corporation Limited */ ++ ++/* Author: songyuekun songyuekun@loongson.cn */ ++ ++/* ++ * ISA: MIPS64R2 ++ * ABI: N64 ++ */ ++ ++/* basic algorithm : ++ ++ +. let t0, t1 point to a0, a1, if a0 has smaller low 3 bit of a0 and a1, ++ set a4 to 1 and let t0 point to the larger of lower 3bit of a0 and a1 ++ ++ +. if low 3 bit of a0 equal low 3 bit of a0, use a ldr one time and more ld other times; ++ ++ +. if not, load partial t2 and t3, check if t2 has \0; ++ ++ +. then use use ld for t0, ldr for t1, ++ ++ +. if partial 8 byte from t1 has \0, compare partial 8 byte from t1 with 8 ++ byte from t0 with a mask in a7 ++ ++ +. if not, ldl other part of t1, compare 8 byte from t1 with 8 byte from t0 ++ ++ +. if (v0 - 0x0101010101010101) & (~v0) & 0x8080808080808080 != 0, v0 has ++ one byte is \0, else has no \0 ++ ++ +. 
for partial 8 byte from ldr t3, 0(a0), preload t3 with 0xffffffffffffffff ++ ++ ++*/ ++#include ++#include ++ ++ ++#define STRNCMP strncmp ++ ++#define REP8_01 0x0101010101010101 ++#define REP8_7f 0x7f7f7f7f7f7f7f7f ++#define REP8_80 0x8080808080808080 ++ ++/* Parameters and Results */ ++#define src1 a0 ++#define src2 a1 ++#define limit a2 ++#define result v0 ++// Note: v0 = a0 in N64 ABI ++ ++ ++/* Internal variable */ ++#define data1 t0 ++#define data2 t1 ++#define has_nul t2 ++#define diff t3 ++#define syndrome t4 ++#define zeroones t5 ++#define sevenf t6 ++#define pos t7 ++#define exchange t8 ++#define tmp1 a5 ++#define tmp2 a6 ++#define tmp3 a7 ++#define src1_off a3 ++#define limit_wd a4 ++ ++ ++/* int strncmp (const char *s1, const char *s2); */ ++ ++LEAF(STRNCMP) ++ .align 4 ++ beqz limit, strncmp_ret0 ++ ++ xor tmp1, src1, src2 ++ lu12i.w zeroones, 0x01010 ++ lu12i.w sevenf, 0x7f7f7 ++ andi src1_off, src1, 0x7 ++ ori zeroones, zeroones, 0x101 ++ andi tmp1, tmp1, 0x7 ++ ori sevenf, sevenf, 0xf7f ++ bstrins.d zeroones, zeroones, 63, 32 ++ bstrins.d sevenf, sevenf, 63, 32 ++ bnez tmp1, strncmp_misaligned8 ++ bnez src1_off, strncmp_mutual_align ++ /* */ ++ addi.d limit_wd, limit, -1 ++ srli.d limit_wd, limit_wd, 3 ++ ++strncmp_loop_aligned: ++ ld.d data1, src1, 0 ++ addi.d src1, src1, 8 ++ ld.d data2, src2, 0 ++ addi.d src2, src2, 8 ++strncmp_start_realigned: ++ addi.d limit_wd, limit_wd, -1 ++ sub.d tmp1, data1, zeroones ++ or tmp2, data1, sevenf ++ xor diff, data1, data2 ++ andn has_nul, tmp1, tmp2 ++ srli.d tmp1, limit_wd, 63 ++ or syndrome, diff, has_nul ++ or tmp2, syndrome, tmp1 ++ beqz tmp2, strncmp_loop_aligned ++ ++ /* if not reach limit */ ++ bge limit_wd, zero, strncmp_not_limit ++ /* if reach limit */ ++ andi limit, limit, 0x7 ++ li.w tmp1, 0x8 ++ sub.d limit, tmp1, limit ++ slli.d limit, limit, 0x3 ++ li.d tmp1, -1 ++ srl.d tmp1, tmp1, limit ++ and data1, data1, tmp1 ++ and data2, data2, tmp1 ++ orn syndrome, syndrome, tmp1 ++ ++ ++strncmp_not_limit: ++ ctz.d pos, syndrome ++ bstrins.d pos, zero, 2, 0 ++ srl.d data1, data1, pos ++ srl.d data2, data2, pos ++ andi data1, data1, 0xff ++ andi data2, data2, 0xff ++ sub.d result, data1, data2 ++ jr ra ++ ++ ++ ++strncmp_mutual_align: ++ bstrins.d src1, zero, 2, 0 ++ bstrins.d src2, zero, 2, 0 ++ slli.d tmp1, src1_off, 0x3 ++ ld.d data1, src1, 0 ++ ld.d data2, src2, 0 ++ addi.d src2, src2, 8 ++ addi.d src1, src1, 8 ++ ++ addi.d limit_wd, limit, -1 ++ andi tmp3, limit_wd, 0x7 ++ srli.d limit_wd, limit_wd, 3 ++ add.d limit, limit, src1_off ++ add.d tmp3, tmp3, src1_off ++ srli.d tmp3, tmp3, 3 ++ add.d limit_wd, limit_wd, tmp3 ++ ++ sub.d tmp1, zero, tmp1 ++ nor tmp2, zero, zero ++ srl.d tmp2, tmp2, tmp1 ++ or data1, data1, tmp2 ++ or data2, data2, tmp2 ++ b strncmp_start_realigned ++ ++strncmp_misaligned8: ++ ++ li.w tmp1, 0x10 ++ bge limit, tmp1, strncmp_try_words ++strncmp_byte_loop: ++ ld.bu data1, src1, 0 ++ ld.bu data2, src2, 0 ++ addi.d limit, limit, -1 ++ xor tmp1, data1, data2 ++ masknez tmp1, data1, tmp1 ++ maskeqz tmp1, limit, tmp1 ++ beqz tmp1, strncmp_done ++ ++ ld.bu data1, src1, 1 ++ ld.bu data2, src2, 1 ++ addi.d src1, src1, 2 ++ addi.d src2, src2, 2 ++ addi.d limit, limit, -1 ++ xor tmp1, data1, data2 ++ masknez tmp1, data1, tmp1 ++ maskeqz tmp1, limit, tmp1 ++ bnez tmp1, strncmp_byte_loop ++ ++ ++strncmp_done: ++ sub.d result, data1, data2 ++ jr ra ++ ++strncmp_try_words: ++ srli.d limit_wd, limit, 3 ++ beqz src1_off, strncmp_do_misaligned ++ ++ sub.d src1_off, zero, src1_off ++ andi src1_off, src1_off, 0x7 ++ 
sub.d limit, limit, src1_off ++ srli.d limit_wd, limit, 0x3 ++ ++ ++strncmp_page_end_loop: ++ ld.bu data1, src1, 0 ++ ld.bu data2, src2, 0 ++ addi.d src1, src1, 1 ++ addi.d src2, src2, 1 ++ xor tmp1, data1, data2 ++ masknez tmp1, data1, tmp1 ++ beqz tmp1, strncmp_done ++ andi tmp1, src1, 0x7 ++ bnez tmp1, strncmp_page_end_loop ++strncmp_do_misaligned: ++ li.w src1_off, 0x8 ++ addi.d limit_wd, limit_wd, -1 ++ blt limit_wd, zero, strncmp_done_loop ++ ++strncmp_loop_misaligned: ++ andi tmp2, src2, 0xff8 ++ xori tmp2, tmp2, 0xff8 ++ beqz tmp2, strncmp_page_end_loop ++ ++ ld.d data1, src1, 0 ++ ld.d data2, src2, 0 ++ addi.d src1, src1, 8 ++ addi.d src2, src2, 8 ++ sub.d tmp1, data1, zeroones ++ or tmp2, data1, sevenf ++ xor diff, data1, data2 ++ andn has_nul, tmp1, tmp2 ++ or syndrome, diff, has_nul ++ bnez syndrome, strncmp_not_limit ++ addi.d limit_wd, limit_wd, -1 ++ #blt zero, limit_wd, strncmp_loop_misaligned ++ bge limit_wd, zero, strncmp_loop_misaligned ++ ++strncmp_done_loop: ++ andi limit, limit, 0x7 ++ beqz limit, strncmp_not_limit ++ /* Read the last double word */ ++ /* check if the final part is about to exceed the page */ ++ andi tmp1, src2, 0x7 ++ andi tmp2, src2, 0xff8 ++ add.d tmp1, tmp1, limit ++ xori tmp2, tmp2, 0xff8 ++ andi tmp1, tmp1, 0x8 ++ masknez tmp1, tmp1, tmp2 ++ bnez tmp1, strncmp_byte_loop ++ addi.d src1, src1, -8 ++ addi.d src2, src2, -8 ++ ldx.d data1, src1, limit ++ ldx.d data2, src2, limit ++ sub.d tmp1, data1, zeroones ++ or tmp2, data1, sevenf ++ xor diff, data1, data2 ++ andn has_nul, tmp1, tmp2 ++ or syndrome, diff, has_nul ++ bnez syndrome, strncmp_not_limit ++ ++strncmp_ret0: ++ move result, zero ++ jr ra ++/* check ++ if ((src1 != 0) && ((src2 == 0 ) || (src1 < src2))) ++ then exchange(src1,src2) ++ ++*/ ++ ++ ++ ++ ++ ++ ++END(STRNCMP) ++#ifndef ANDROID_CHANGES ++#ifdef _LIBC ++libc_hidden_builtin_def (strncmp) ++#endif ++#endif +diff --git a/sysdeps/loongarch/lp64/strnlen.S b/sysdeps/loongarch/lp64/strnlen.S +new file mode 100644 +index 00000000..3a204686 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/strnlen.S +@@ -0,0 +1,155 @@ ++/* Copyright 2016 Loongson Technology Corporation Limited */ ++ ++/* Author: Songyuekun songyuekun@loongson.cn */ ++ ++/* ++ * ISA: MIPS64R2 ++ * ABI: N64 ++ */ ++/* ++algorithm: ++ ++ #. use ld/ldr to access word/partial word in the string ++ ++ #. use (x - 0x0101010101010101) & (~(x | 0x7f7f7f7f7f7f7f7f) != 0 to ++ judge if x has zero byte ++ ++ #. use dctz((x - 0x0101010101010101) & (~(x | 0x7f7f7f7f7f7f7f7f) >> 3 ++ to get the index of first rightmost zero byte in dword x; ++ ++ #. use dctz(x) = 64 - dclz(~x & (x-1)); ++ ++ #. 
use pointer to the last non zero byte minus pointer to the start ++ of the string to get the length of string ++ ++*/ ++ ++#include ++#include ++ ++ ++ ++#define L_ADDIU addi.d ++#define L_ADDU add.d ++#define L_SUBU sub.d ++ ++#define STRNLEN __strnlen ++#define L(x) x ++/* rd <- if rc then ra else rb ++ will destroy t6 ++*/ ++ ++#define CONDITIONSEL(rd,ra,rb,rc)\ ++ masknez a5, rb, rc;\ ++ maskeqz rd, ra, rc;\ ++ or rd, rd, a5 ++ ++ ++/* Parameters and Results */ ++#define srcin a0 ++#define limit a1 ++#define len v0 ++ ++ ++/* Internal variable */ ++#define data1 t0 ++#define data2 t1 ++#define has_nul1 t2 ++#define has_nul2 t3 ++#define src t4 ++#define zeroones t5 ++#define sevenf t6 ++#define data2a t7 ++#define tmp6 t7 ++#define pos t8 ++#define tmp1 a2 ++#define tmp2 a3 ++#define tmp3 a4 ++#define tmp4 a5 ++#define tmp5 a6 ++#define limit_wd a7 ++ ++ ++ ++/* size_t strnlen (const char *s1,size_t maxlen); */ ++ ++LEAF(STRNLEN) ++ ++ .align 4 ++ beqz limit, L(_hit_limit) ++ lu12i.w zeroones, 0x01010 ++ lu12i.w sevenf, 0x7f7f7 ++ ori zeroones, zeroones, 0x101 ++ ori sevenf, sevenf, 0xf7f ++ bstrins.d zeroones, zeroones, 63, 32 ++ bstrins.d sevenf, sevenf, 63, 32 ++ andi tmp1, srcin, 15 ++ sub.d src, srcin, tmp1 ++ bnez tmp1, L(misaligned) ++ addi.d limit_wd, limit, -1 ++ srli.d limit_wd, limit_wd, 4 ++L(_loop): ++ ld.d data1, src, 0 ++ ld.d data2, src, 8 ++ addi.d src, src, 16 ++L(_realigned): ++ sub.d tmp1, data1, zeroones ++ or tmp2, data1, sevenf ++ sub.d tmp3, data2, zeroones ++ or tmp4, data2, sevenf ++ andn has_nul1, tmp1, tmp2 ++ andn has_nul2, tmp3, tmp4 ++ addi.d limit_wd, limit_wd, -1 ++ srli.d tmp1, limit_wd, 63 ++ or tmp2, has_nul1, has_nul2 ++ or tmp3, tmp1, tmp2 ++ beqz tmp3, L(_loop) ++ beqz tmp2, L(_hit_limit) ++ sub.d len, src, srcin ++ beqz has_nul1, L(_nul_in_data2) ++ move has_nul2, has_nul1 ++ addi.d len, len, -8 ++L(_nul_in_data2): ++ ctz.d pos, has_nul2 ++ srli.d pos, pos, 3 ++ addi.d len, len, -8 ++ add.d len, len, pos ++ sltu tmp1, len, limit ++ CONDITIONSEL(len,len,limit,tmp1) ++ jr ra ++ ++ ++L(misaligned): ++ addi.d limit_wd, limit, -1 ++ sub.d tmp4, zero, tmp1 ++ andi tmp3, limit_wd, 15 ++ srli.d limit_wd, limit_wd, 4 ++ li.d tmp5, -1 ++ ld.d data1, src, 0 ++ ld.d data2, src, 8 ++ addi.d src, src, 16 ++ slli.d tmp4, tmp4, 3 ++ add.d tmp3, tmp3, tmp1 ++ srl.d tmp2, tmp5, tmp4 ++ srli.d tmp3, tmp3, 4 ++ add.d limit_wd, limit_wd, tmp3 ++ or data1, data1, tmp2 ++ or data2a, data2, tmp2 ++ li.w tmp3, 9 ++ sltu tmp1, tmp1, tmp3 ++ CONDITIONSEL(data1,data1,tmp5,tmp1) ++ CONDITIONSEL(data2,data2,data2a,tmp1) ++ b L(_realigned) ++ ++ ++L(_hit_limit): ++ move len, limit ++ jr ra ++END(STRNLEN) ++#ifndef ANDROID_CHANGES ++#ifdef _LIBC ++weak_alias (__strnlen, strnlen) ++libc_hidden_def (strnlen) ++libc_hidden_def (__strnlen) ++#endif ++#endif +-- +2.33.0 + diff --git a/glibc.spec b/glibc.spec index 959fe67..101c296 100644 --- a/glibc.spec +++ b/glibc.spec @@ -66,7 +66,7 @@ ############################################################################## Name: glibc Version: 2.34 -Release: 105 +Release: 106 Summary: The GNU libc libraries License: %{all_license} URL: http://www.gnu.org/software/glibc/ @@ -272,6 +272,7 @@ Patch9021: x86-use-total-l3cache-for-non_temporal_threshold.patch Patch9022: login-Add-back-libutil-as-an-empty-library.patch Patch9023: malloc-Fix-malloc-debug-for-2.35-onwards.patch Patch9024: LoongArch-Port.patch +Patch9025: LoongArch-Optimize-string-functions-including-memcpy.patch %endif Provides: ldconfig rtld(GNU_HASH) bundled(gnulib) @@ -1438,6 
+1439,9 @@ fi
 %endif
 
 %changelog
+* Sun Jan 14 2023 Xue Liu - 2.34-106
+- LoongArch: Optimize some string functions
+
 * Wed Dec 21 2022 wanghongliang - 2.34-105
 - LoongArch Port
 - Add login-Add-back-libutil-as-an-empty-library.patch from upstream
-- 
Gitee
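
A few notes follow on the techniques the patch relies on, each with a small C sketch. The sketches are illustrations only -- the patch itself is LoongArch assembly -- and every C function name in them is invented here, not taken from glibc.

First, the 17..32-byte path of memcpy. Instead of branching on the exact length, it copies 16 bytes anchored at the start of the buffer and 16 bytes anchored at its end; for lengths under 32 the two pairs overlap in the middle, which is harmless because memcpy's operands may not overlap each other, and because all four loads complete before any store is issued. Assuming 17 <= n <= 32:

#include <stdint.h>
#include <string.h>

void
copy_17_to_32 (void *dst, const void *src, size_t n)
{
  const char *s = src;
  char *d = dst;
  uint64_t t0, t1, t2, t3;

  memcpy (&t0, s, 8);           /* ld.d t0, a1, 0   */
  memcpy (&t1, s + 8, 8);       /* ld.d t1, a1, 8   */
  memcpy (&t2, s + n - 16, 8);  /* ld.d t2, a4, -16 */
  memcpy (&t3, s + n - 8, 8);   /* ld.d t3, a4, -8  */

  memcpy (d, &t0, 8);           /* st.d t0, a0, 0   */
  memcpy (d + 8, &t1, 8);       /* st.d t1, a0, 8   */
  memcpy (d + n - 16, &t2, 8);  /* st.d t2, a3, -16 */
  memcpy (d + n - 8, &t3, 8);   /* st.d t3, a3, -8  */
}

The same end-anchored trick handles the 8..16, 4..7 and 2..3 byte cases with one load/store pair from each end, so no length inside a size class ever needs its own branch.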
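
Next, the pcaddi/jirl sequence after memcpy's 128-byte loop. It is a computed jump into a ladder of fifteen two-instruction 8-byte copy stubs: bits 3..6 of the remaining count (mask 0x78) are subtracted from the address past the ladder's entry points, execution enters at the matching depth and falls through to the bottom, and one final end-anchored copy finishes the sub-8-byte remainder. A C switch with fall-through models the control flow. One caveat: the assembly anchors the last copy against the end of the whole buffer (a4/a3), while this sketch anchors it at d + remaining, so for remaining < 8 it rewrites a few already-copied bytes -- harmless here because this path only runs when the total size exceeds 128:

#include <string.h>

void
copy_tail (char *d, const char *s, size_t remaining)  /* 0 < remaining < 128 */
{
  switch (remaining & 0x78)
    {
    case 0x78: memcpy (d + 112, s + 112, 8); /* fall through */
    case 0x70: memcpy (d + 104, s + 104, 8); /* fall through */
    case 0x68: memcpy (d + 96,  s + 96,  8); /* fall through */
    case 0x60: memcpy (d + 88,  s + 88,  8); /* fall through */
    case 0x58: memcpy (d + 80,  s + 80,  8); /* fall through */
    case 0x50: memcpy (d + 72,  s + 72,  8); /* fall through */
    case 0x48: memcpy (d + 64,  s + 64,  8); /* fall through */
    case 0x40: memcpy (d + 56,  s + 56,  8); /* fall through */
    case 0x38: memcpy (d + 48,  s + 48,  8); /* fall through */
    case 0x30: memcpy (d + 40,  s + 40,  8); /* fall through */
    case 0x28: memcpy (d + 32,  s + 32,  8); /* fall through */
    case 0x20: memcpy (d + 24,  s + 24,  8); /* fall through */
    case 0x18: memcpy (d + 16,  s + 16,  8); /* fall through */
    case 0x10: memcpy (d + 8,   s + 8,   8); /* fall through */
    case 0x08: memcpy (d,       s,       8); /* fall through */
    case 0x00:
      /* end_0_8_unalign: the final 8 bytes, anchored at the end.  */
      memcpy (d + remaining - 8, s + remaining - 8, 8);
    }
}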
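
memset builds its fill word with three bstrins.d instructions, each doubling the populated width of the register. Equivalent C, with the matching instruction on each step:

#include <stdint.h>

uint64_t
splat_fill_byte (uint64_t c)
{
  c &= 0xff;      /* only the low byte of the int argument matters */
  c |= c << 8;    /* bstrins.d a1, a1, 15, 8  */
  c |= c << 16;   /* bstrins.d a1, a1, 31, 16 */
  c |= c << 32;   /* bstrins.d a1, a1, 63, 32 */
  return c;
}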
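
All of the byte-string routines (strchr, strchrnul, strcmp, strncmp, strcpy, strlen, strnlen) share the word-at-a-time NUL test that the file comments write as (x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f) != 0 -- the sub.d/or/andn triple in the assembly. A self-contained demonstration:

#include <stdint.h>
#include <stdio.h>

uint64_t
has_zero (uint64_t x)
{
  /* Subtracting 0x01 from every byte borrows into bit 7 exactly when
     the byte is 0x00; or-ing x with 0x7f first and inverting keeps
     that bit only for bytes below 0x80, killing false positives.
     This is sub.d tmp1, data, zeroones / or tmp2, data, sevenf /
     andn has_nul, tmp1, tmp2.  */
  return (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
}

int
main (void)
{
  uint64_t no_nul = 0x4142434445464748ULL; /* no zero byte */
  uint64_t a_nul  = 0x4142434400464748ULL; /* byte 3 (little-endian) is 0 */

  printf ("%d %d\n", has_zero (no_nul) != 0, has_zero (a_nul) != 0);
  /* Byte index of the first NUL, as the assembly computes with
     ctz.d followed by a shift right by 3.  Prints 3 here.  */
  printf ("%d\n", (int) (__builtin_ctzll (has_zero (a_nul)) >> 3));
  return 0;
}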
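
The MOVN/MOVZ/CONDITIONSEL macros exist because LoongArch has no conditional-move instruction; a select is assembled from maskeqz, masknez and or. What they compute, in C:

long
cond_select (long cond, long a, long b)  /* cond != 0 ? a : b */
{
  long t1 = cond ? a : 0;  /* maskeqz t1, a, cond */
  long t2 = cond ? 0 : b;  /* masknez t2, b, cond */
  return t1 | t2;          /* or      rd, t1, t2  */
}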
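
Finally, strcpy's page-cross guard. An unaligned 8-byte load at src is attempted only when it cannot run into the following page (the andi ..., 0xff8 check, which assumes 4 KiB pages and conservatively includes the exactly-fitting 0xff8 offset); otherwise the aligned word below src is loaded, src's bytes are shifted down, and the vacated high bytes are padded with 0xff so the padding can never read as a terminating NUL. A sketch of the same logic:

#include <stdint.h>
#include <string.h>

uint64_t
load_word_near_page_end (const char *src)
{
  uintptr_t p = (uintptr_t) src;
  uint64_t w;

  if ((p & 0xff8) != 0xff8)     /* whole load stays inside the page */
    {
      memcpy (&w, src, 8);      /* plain (possibly unaligned) ld.d  */
      return w;
    }

  unsigned off = p & 0x7;                    /* src_off              */
  memcpy (&w, (const char *) (p - off), 8);  /* aligned ld.d         */
  w >>= off * 8;                             /* drop bytes below src */
  w |= ~(~0ULL >> (off * 8));                /* pad high bytes, 0xff */
  return w;
}

If the padded word contains no NUL, the string really does continue onto the next page and the ordinary unaligned load is retried safely; otherwise the 8/4/2/1-byte tail store sequence finishes the copy.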