diff --git a/LoongArch-Optimize-string-functions-including-memcpy.patch b/LoongArch-Optimize-string-functions-including-memcpy.patch
new file mode 100644
index 0000000000000000000000000000000000000000..98d12ceeb35c4f346feaa2eaa6d41d5b6558f040
--- /dev/null
+++ b/LoongArch-Optimize-string-functions-including-memcpy.patch
@@ -0,0 +1,1938 @@
+From 50f0fd88162ba3130f902d12305fe317a33ebaee Mon Sep 17 00:00:00 2001
+From: Xue Liu
+Date: Sat, 14 Jan 2023 16:14:24 +0800
+Subject: [PATCH] LoongArch: Optimize string functions including memcpy,
+ memmove, memset, strchr, strchrnul, strcmp, strncmp, strcpy, strlen,
+ strnlen.
+
+Change-Id: I2975aea74f44bf2c9e01a6dfb6ca2eaa57aa5f7c
+---
+ sysdeps/loongarch/lp64/memcpy.S    | 259 ++++++++++++++++++
+ sysdeps/loongarch/lp64/memmove.S   | 406 +++++++++++++++++++++++++++++
+ sysdeps/loongarch/lp64/memset.S    | 170 ++++++++++++
+ sysdeps/loongarch/lp64/strchr.S    | 107 ++++++++
+ sysdeps/loongarch/lp64/strchrnul.S | 115 ++++++++
+ sysdeps/loongarch/lp64/strcmp.S    | 161 ++++++++++++
+ sysdeps/loongarch/lp64/strcpy.S    | 175 +++++++++++++
+ sysdeps/loongarch/lp64/strlen.S    | 102 ++++++++
+ sysdeps/loongarch/lp64/strncmp.S   | 225 ++++++++++++++++
+ sysdeps/loongarch/lp64/strnlen.S   | 125 +++++++++
+ 10 files changed, 1845 insertions(+)
+ create mode 100644 sysdeps/loongarch/lp64/memcpy.S
+ create mode 100644 sysdeps/loongarch/lp64/memmove.S
+ create mode 100644 sysdeps/loongarch/lp64/memset.S
+ create mode 100644 sysdeps/loongarch/lp64/strchr.S
+ create mode 100644 sysdeps/loongarch/lp64/strchrnul.S
+ create mode 100644 sysdeps/loongarch/lp64/strcmp.S
+ create mode 100644 sysdeps/loongarch/lp64/strcpy.S
+ create mode 100644 sysdeps/loongarch/lp64/strlen.S
+ create mode 100644 sysdeps/loongarch/lp64/strncmp.S
+ create mode 100644 sysdeps/loongarch/lp64/strnlen.S
+
+diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S
+new file mode 100644
+index 00000000..5d850123
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/memcpy.S
+@@ -0,0 +1,259 @@
++/* Optimized memcpy implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++
++/* Allow the routine to be named something else if desired. */
++#ifndef MEMCPY_NAME
++#define MEMCPY_NAME memcpy
++#endif
++
++#define LD_64(reg, n) \
++ ld.d t0, reg, n; \
++ ld.d t1, reg, n+8; \
++ ld.d t2, reg, n+16; \
++ ld.d t3, reg, n+24; \
++ ld.d t4, reg, n+32; \
++ ld.d t5, reg, n+40; \
++ ld.d t6, reg, n+48; \
++ ld.d t7, reg, n+56;
++
++#define ST_64(reg, n) \
++ st.d t0, reg, n; \
++ st.d t1, reg, n+8; \
++ st.d t2, reg, n+16; \
++ st.d t3, reg, n+24; \
++ st.d t4, reg, n+32; \
++ st.d t5, reg, n+40; \
++ st.d t6, reg, n+48; \
++ st.d t7, reg, n+56;
++
++LEAF(MEMCPY_NAME)
++//1st var: dst ptr: void *a1 $r4 a0
++//2nd var: src ptr: void *a2 $r5 a1
++//3rd var: size_t len $r6 a2
++//t0~t9 registers as temp
++
++ add.d a4, a1, a2
++ add.d a3, a0, a2
++ li.w a6, 16
++ bge a6, a2, less_16bytes
++ li.w a6, 128
++ blt a6, a2, long_bytes
++ li.w a6, 64
++ blt a6, a2, more_64bytes
++ li.w a6, 32
++ blt a6, a2, more_32bytes
++
++ /* 17...32 */
++ ld.d t0, a1, 0
++ ld.d t1, a1, 8
++ ld.d t2, a4, -16
++ ld.d t3, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a0, 8
++ st.d t2, a3, -16
++ st.d t3, a3, -8
++ jr ra
++
++more_64bytes:
++ srli.d t8, a0, 3
++ slli.d t8, t8, 3
++ addi.d t8, t8, 0x8
++ sub.d a7, a0, t8
++ ld.d t0, a1, 0
++ sub.d a1, a1, a7
++ st.d t0, a0, 0
++
++ add.d a7, a7, a2
++ addi.d a7, a7, -0x20
++loop_32:
++ ld.d t0, a1, 0
++ ld.d t1, a1, 8
++ ld.d t2, a1, 16
++ ld.d t3, a1, 24
++ st.d t0, t8, 0
++ st.d t1, t8, 8
++ st.d t2, t8, 16
++ st.d t3, t8, 24
++
++ addi.d t8, t8, 0x20
++ addi.d a1, a1, 0x20
++ addi.d a7, a7, -0x20
++ blt zero, a7, loop_32
++
++ ld.d t4, a4, -32
++ ld.d t5, a4, -24
++ ld.d t6, a4, -16
++ ld.d t7, a4, -8
++ st.d t4, a3, -32
++ st.d t5, a3, -24
++ st.d t6, a3, -16
++ st.d t7, a3, -8
++
++ jr ra
++
++more_32bytes:
++ /* 33...64 */
++ ld.d t0, a1, 0
++ ld.d t1, a1, 8
++ ld.d t2, a1, 16
++ ld.d t3, a1, 24
++ ld.d t4, a4, -32
++ ld.d t5, a4, -24
++ ld.d t6, a4, -16
++ ld.d t7, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a0, 8
++ st.d t2, a0, 16
++ st.d t3, a0, 24
++ st.d t4, a3, -32
++ st.d t5, a3, -24
++ st.d t6, a3, -16
++ st.d t7, a3, -8
++ jr ra
++
++less_16bytes:
++ srai.d a6, a2, 3
++ beqz a6, less_8bytes
++
++ /* 8...16 */
++ ld.d t0, a1, 0
++ ld.d t1, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a3, -8
++
++ jr ra
++
++less_8bytes:
++ srai.d a6, a2, 2
++ beqz a6, less_4bytes
++
++ /* 4...7 */
++ ld.w t0, a1, 0
++ ld.w t1, a4, -4
++ st.w t0, a0, 0
++ st.w t1, a3, -4
++ jr ra
++
++less_4bytes:
++ srai.d a6, a2, 1
++ beqz a6, less_2bytes
++
++ /* 2...3 */
++ ld.h t0, a1, 0
++ ld.h t1, a4, -2
++ st.h t0, a0, 0
++ st.h t1, a3, -2
++ jr ra
++
++less_2bytes:
++ beqz a2, less_1bytes
++
++ ld.b t0, a1, 0
++ st.b t0, a0, 0
++ jr ra
++
++less_1bytes:
++ jr ra
++
++long_bytes:
++ srli.d t8, a0, 3
++ slli.d t8, t8, 3
++ beq a0, t8, start
++
++ ld.d t0, a1, 0
++ addi.d t8, t8, 0x8
++ st.d t0, a0, 0
++ sub.d a7, a0, t8
++ sub.d a1, a1, a7
++
++start:
++ addi.d a5, a3, -0x80
++ blt a5, t8, align_end_proc
++
++loop_128:
++ LD_64(a1, 0)
++ ST_64(t8, 0)
++ LD_64(a1, 64)
++ addi.d a1, a1, 0x80
++ ST_64(t8, 64)
++ addi.d t8, t8, 0x80
++ bge a5, t8, loop_128
++
++align_end_proc:
++ sub.d a2, a3, t8
++
++ pcaddi t1, 34
++ andi t2, a2, 0x78
++ sub.d t1, t1, t2
++ jirl zero, t1, 0
++
++end_120_128_unalign:
++ ld.d t0, a1, 112
++ st.d t0, t8, 112
++end_112_120_unalign:
++ ld.d t0, a1, 104
++ st.d t0, t8, 104
++end_104_112_unalign:
++ ld.d t0, a1, 96
++ st.d t0, t8, 96
++end_96_104_unalign:
++ ld.d t0, a1, 88
++ st.d t0, t8, 88
++end_88_96_unalign:
++ ld.d t0, a1, 80
++ st.d t0, t8, 80
++end_80_88_unalign:
++ ld.d t0, a1, 72
++ st.d t0, t8, 72
++end_72_80_unalign:
++ ld.d t0, a1, 64
++ st.d t0, t8, 64
++end_64_72_unalign:
++ ld.d t0, a1, 56
++ st.d t0, t8, 56
++end_56_64_unalign:
++ ld.d t0, a1, 48
++ st.d t0, t8, 48
++end_48_56_unalign:
++ ld.d t0, a1, 40
++ st.d t0, t8, 40
++end_40_48_unalign:
++ ld.d t0, a1, 32
++ st.d t0, t8, 32
++end_32_40_unalign:
++ ld.d t0, a1, 24
++ st.d t0, t8, 24
++end_24_32_unalign:
++ ld.d t0, a1, 16
++ st.d t0, t8, 16
++end_16_24_unalign:
++ ld.d t0, a1, 8
++ st.d t0, t8, 8
++end_8_16_unalign:
++ ld.d t0, a1, 0
++ st.d t0, t8, 0
++end_0_8_unalign:
++ ld.d t0, a4, -8
++ st.d t0, a3, -8
++
++ jr ra
++
++END(MEMCPY_NAME)
++libc_hidden_builtin_def (MEMCPY_NAME)
+diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S
+new file mode 100644
+index 00000000..edd9cf3d
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/memmove.S
+@@ -0,0 +1,406 @@
++/* Optimized memmove implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++
++/* Allow the routine to be named something else if desired. */
++#ifndef MEMMOVE_NAME
++#define MEMMOVE_NAME memmove
++#endif
++
++#define LD_64(reg, n) \
++ ld.d t0, reg, n; \
++ ld.d t1, reg, n+8; \
++ ld.d t2, reg, n+16; \
++ ld.d t3, reg, n+24; \
++ ld.d t4, reg, n+32; \
++ ld.d t5, reg, n+40; \
++ ld.d t6, reg, n+48; \
++ ld.d t7, reg, n+56;
++
++
++#define ST_64(reg, n) \
++ st.d t0, reg, n; \
++ st.d t1, reg, n+8; \
++ st.d t2, reg, n+16; \
++ st.d t3, reg, n+24; \
++ st.d t4, reg, n+32; \
++ st.d t5, reg, n+40; \
++ st.d t6, reg, n+48; \
++ st.d t7, reg, n+56;
++
++/* memmove (const void *dst, const void *src, size_t n) */
++LEAF(MEMMOVE_NAME)
++ add.d a4, a1, a2
++ add.d a3, a0, a2
++ beq a1, a0, less_1bytes
++ move t8, a0
++ srai.d a6, a2, 4 #num/16
++ beqz a6, less_16bytes #num<16
++ srai.d a6, a2, 6 #num/64
++ bnez a6, more_64bytes #num>64
++ srai.d a6, a2, 5
++ beqz a6, less_32bytes #num<32
++
++ ld.d t0, a1, 0 #32. */
++
++#include <sys/asm.h>
++
++#define ST_128(n) \
++ st.d a1, a0, n; \
++ st.d a1, a0, n+8 ; \
++ st.d a1, a0, n+16 ; \
++ st.d a1, a0, n+24 ; \
++ st.d a1, a0, n+32 ; \
++ st.d a1, a0, n+40 ; \
++ st.d a1, a0, n+48 ; \
++ st.d a1, a0, n+56 ; \
++ st.d a1, a0, n+64 ; \
++ st.d a1, a0, n+72 ; \
++ st.d a1, a0, n+80 ; \
++ st.d a1, a0, n+88 ; \
++ st.d a1, a0, n+96 ; \
++ st.d a1, a0, n+104; \
++ st.d a1, a0, n+112; \
++ st.d a1, a0, n+120; \
++
++/* void *memset(void *s, int c, size_t n); */
++LEAF(memset)
++ .align 6
++
++ bstrins.d a1, a1, 15, 8
++ add.d t7, a0, a2
++ bstrins.d a1, a1, 31, 16
++ move t0, a0
++ bstrins.d a1, a1, 63, 32
++ srai.d t8, a2, 4 #num/16
++ beqz t8, less_16bytes #num<16
++ srai.d t8, a2, 6 #num/64
++ bnez t8, more_64bytes #num>64
++ srai.d t8, a2, 5 #num/32
++ beqz t8, less_32bytes #num<32
++ st.d a1, a0, 0 #32. */
++
++#include <sys/asm.h>
++
++/* char * strchr (const char *s1, int c); */
++LEAF(strchr)
++ .align 6
++
++ li.w t4, 0x7
++ lu12i.w a2, 0x01010
++ bstrins.d a1, a1, 15, 8
++ andi t0, a0, 0x7
++
++ ori a2, a2, 0x101
++ andn t4, a0, t4
++ slli.w t1, t0, 3
++
++ ld.d t4, t4, 0
++
++ nor t8, zero, zero
++ bstrins.d a1, a1, 31, 16
++ srl.d t4, t4, t1
++
++ bstrins.d a1, a1, 63, 32
++ bstrins.d a2, a2, 63, 32
++ srl.d a7, t8, t1
++
++ li.w t1, 8
++ nor t8, a7, zero
++ slli.d a3, a2, 7
++ or t5, t8, t4
++ and t3, a7, a1
++
++ sub.w t1, t1, t0
++ nor a3, a3, zero
++ xor t2, t5, t3
++ sub.d a7, t5, a2
++ nor a6, t5, a3
++
++ sub.d a5, t2, a2
++ nor a4, t2, a3
++
++ and a6, a7, a6
++ and a5, a5, a4
++ or a7, a6, a5
++ bnez a7, L(_mc8_a)
++
++ add.d a0, a0, t1
++L(_aloop):
++ ld.d t4, a0, 0
++
++ xor t2, t4, a1
++ sub.d a7, t4, a2
++ nor a6, t4, a3
++ sub.d a5, t2, a2
++
++ nor a4, t2, a3
++ and a6, a7, a6
++ and a5, a5, a4
++ or a7, a6, a5
++ bnez a7, L(_mc8_a)
++
++ ld.d t4, a0, 8
++ addi.d a0, a0, 16
++ xor t2, t4, a1
++ sub.d a7, t4, a2
++ nor a6, t4, a3
++ sub.d a5, t2, a2
++
++ nor a4, t2, a3
++ and a6, a7, a6
++ and a5, a5, a4
++ or a7, a6, a5
++ beqz a7, L(_aloop)
++
++ addi.d a0, a0, -8
++L(_mc8_a):
++
++ ctz.d t0, a5
++ ctz.d t2, a6
++
++ srli.w t0, t0, 3
++ srli.w t2, t2, 3
++ sltu t1, t2, t0
++ add.d v0, a0, t0
++ masknez v0, v0, t1
++ jr ra
++END(strchr)
++
++libc_hidden_builtin_def (strchr)
++weak_alias (strchr, index)
+diff --git a/sysdeps/loongarch/lp64/strchrnul.S b/sysdeps/loongarch/lp64/strchrnul.S
+new file mode 100644
+index 00000000..58b8b372
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/strchrnul.S
+@@ -0,0 +1,115 @@
++/* Optimized strchrnul implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++
++#define MOVZ(rd,rs,rt) \
++ masknez t6, rs, rt;\
++ maskeqz rd, rd, rt;\
++ or rd, rd, t6
++
++/* char *strchrnul(const char *s, int c); */
++LEAF(__strchrnul)
++ .align 6
++
++ li.w t4, 0x7
++ lu12i.w a2, 0x01010
++ bstrins.d a1, a1, 15, 8
++ andi t0, a0, 0x7
++
++ ori a2, a2, 0x101
++ andn t4, a0, t4
++ slli.w t1, t0, 3
++ ld.d t4, t4, 0
++
++ nor t8, zero, zero
++ bstrins.d a1, a1, 31, 16
++ srl.d t4, t4, t1
++
++ preld 0, a0, 32
++ bstrins.d a1, a1, 63, 32
++ bstrins.d a2, a2, 63, 32
++ srl.d a7, t8, t1
++
++ nor t8, a7, zero
++ slli.d a3, a2, 7
++ or t5, t8, t4
++ and t3, a7, a1
++
++ nor a3, a3, zero
++ xor t2, t5, t3
++ sub.d a7, t5, a2
++ nor a6, t5, a3
++
++ li.w t1, 8
++ sub.d a5, t2, a2
++ nor a4, t2, a3
++
++ and a6, a7, a6
++ and a5, a5, a4
++ or a7, a6, a5
++ bnez a7, L(_mc8_a)
++
++ sub.w t1, t1, t0
++ add.d a0, a0, t1
++L(_aloop):
++ ld.d t4, a0, 0
++
++ xor t2, t4, a1
++ sub.d a7, t4, a2
++ nor a6, t4, a3
++ sub.d a5, t2, a2
++
++ nor a4, t2, a3
++ and a6, a7, a6
++ and a5, a5, a4
++
++ or a7, a6, a5
++ bnez a7, L(_mc8_a)
++
++ ld.d t4, a0, 8
++ addi.d a0, a0, 16
++
++ xor t2, t4, a1
++ sub.d a7, t4, a2
++ nor a6, t4, a3
++ sub.d a5, t2, a2
++
++ nor a4, t2, a3
++ and a6, a7, a6
++ and a5, a5, a4
++
++ or a7, a6, a5
++ beqz a7, L(_aloop)
++
++ addi.d a0, a0, -8
++L(_mc8_a):
++ ctz.d t0, a5
++ ctz.d t2, a6
++
++ srli.w t0, t0, 3
++ srli.w t2, t2, 3
++ slt t1, t0, t2
++
++ MOVZ(t0,t2,t1)
++
++ add.d v0, a0, t0
++ jr ra
++END(__strchrnul)
++
++weak_alias(__strchrnul, strchrnul)
+diff --git a/sysdeps/loongarch/lp64/strcmp.S b/sysdeps/loongarch/lp64/strcmp.S
+new file mode 100644
+index 00000000..0f7a6d55
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/strcmp.S
+@@ -0,0 +1,161 @@
++/* Optimized strcmp implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++
++/* Parameters and Results */
++#define src1 a0
++#define src2 a1
++#define result v0
++
++/* Internal variable */
++#define src1_off a2
++#define src2_off a3
++#define data1 t0
++#define data2 t1
++#define has_nul t2
++#define diff t3
++#define syndrome t4
++#define zeroones t5
++#define sevenf t6
++#define pos t7
++#define exchange t8
++#define tmp1 a4
++#define tmp2 a5
++#define tmp3 a6
++#define tmp4 a7
++
++/* rd <- if rc then ra else rb
++ tmp3 will be destroyed */
++#define CONDITIONSEL(rd, rc, ra, rb)\
++ masknez tmp3, rb, rc;\
++ maskeqz rd, ra, rc;\
++ or rd, rd, tmp3
++
++LEAF(strcmp)
++ .align 4
++
++ xor tmp1, src1, src2
++ lu12i.w zeroones, 0x01010
++ lu12i.w sevenf, 0x7f7f7
++ andi src1_off, src1, 0x7
++ ori zeroones, zeroones, 0x101
++ ori sevenf, sevenf, 0xf7f
++ andi tmp1, tmp1, 0x7
++ bstrins.d zeroones, zeroones, 63, 32
++ bstrins.d sevenf, sevenf, 63, 32
++ bnez tmp1, strcmp_misaligned8
++ bnez src1_off, strcmp_mutual_align
++strcmp_loop_aligned:
++ ld.d data1, src1, 0
++ addi.d src1, src1, 8
++ ld.d data2, src2, 0
++ addi.d src2, src2, 8
++strcmp_start_realigned:
++ sub.d tmp1, data1, zeroones
++ or tmp2, data1, sevenf
++ xor diff, data1, data2
++ andn has_nul, tmp1, tmp2
++ or syndrome, diff, has_nul
++ beqz syndrome, strcmp_loop_aligned
++
++strcmp_end:
++ ctz.d pos, syndrome
++ bstrins.d pos, zero, 2, 0
++ srl.d data1, data1, pos
++ srl.d data2, data2, pos
++ andi data1, data1, 0xff
++ andi data2, data2, 0xff
++ sub.d result, data1, data2
++ jr ra
++strcmp_mutual_align:
++ bstrins.d src1, zero, 2, 0
++ bstrins.d src2, zero, 2, 0
++ slli.d tmp1, src1_off, 0x3
++ ld.d data1, src1, 0
++ sub.d tmp1, zero, tmp1
++ ld.d data2, src2, 0
++ addi.d src1, src1, 8
++ addi.d src2, src2, 8
++ nor tmp2, zero, zero
++ srl.d tmp2, tmp2, tmp1
++ or data1, data1, tmp2
++ or data2, data2, tmp2
++ b strcmp_start_realigned
++
++strcmp_misaligned8:
++ /* check
++ if ((src1 != 0) && ((src2 == 0 ) || (src1 < src2)))
++ then exchange(src1,src2). */
++ andi src2_off, src2, 0x7
++ slt tmp2, src1_off, src2_off
++ CONDITIONSEL(tmp2, src2_off, tmp2, tmp1)
++ maskeqz exchange, tmp2, src1_off
++ xor tmp3, src1, src2
++ maskeqz tmp3, tmp3, exchange
++ xor src1, src1, tmp3
++ xor src2, src2, tmp3
++
++ andi src1_off, src1, 0x7
++ beqz src1_off, strcmp_loop_misaligned
++strcmp_do_misaligned:
++ ld.bu data1, src1, 0
++ ld.bu data2, src2, 0
++ xor tmp3, data1, data2
++ addi.d src1, src1, 1
++ masknez tmp3, data1, tmp3
++ addi.d src2, src2, 1
++ beqz tmp3, strcmp_done
++ andi src1_off, src1, 0x7
++ bnez src1_off, strcmp_do_misaligned
++
++strcmp_loop_misaligned:
++ andi tmp1, src2, 0xff8
++ xori tmp1, tmp1, 0xff8
++ beqz tmp1, strcmp_do_misaligned
++ ld.d data1, src1, 0
++ ld.d data2, src2, 0
++ addi.d src1, src1, 8
++ addi.d src2, src2, 8
++
++ sub.d tmp1, data1, zeroones
++ or tmp2, data1, sevenf
++ xor diff, data1, data2
++ andn has_nul, tmp1, tmp2
++ or syndrome, diff, has_nul
++ beqz syndrome, strcmp_loop_misaligned
++strcmp_misalign_end:
++ ctz.d pos, syndrome
++ bstrins.d pos, zero, 2, 0
++ srl.d data1, data1, pos
++ srl.d data2, data2, pos
++ andi data1, data1, 0xff
++ andi data2, data2, 0xff
++ sub.d tmp1, data1, data2
++ sub.d tmp2, data2, data1
++ CONDITIONSEL(result, exchange, tmp2, tmp1)
++ jr ra
++
++strcmp_done:
++ sub.d tmp1, data1, data2
++ sub.d tmp2, data2, data1
++ CONDITIONSEL(result, exchange, tmp2, tmp1)
++ jr ra
++END(strcmp)
++
++libc_hidden_builtin_def (strcmp)
+diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S
+new file mode 100644
+index 00000000..03d9d361
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/strcpy.S
+@@ -0,0 +1,175 @@
++/* Optimized strcpy implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++
++/* Parameters and Results */
++#define dest a0
++#define src a1
++#define result v0
++
++/* Internal variable */
++#define data t0
++#define data1 t1
++#define has_nul t2
++#define diff t3
++#define syndrome t4
++#define zeroones t5
++#define sevenf t6
++#define pos t7
++#define dest_backup t8
++#define tmp1 a4
++#define tmp2 a5
++#define tmp3 a6
++#define dest_off a2
++#define src_off a3
++#define tmp4 a7
++
++/* rd <- if rc then ra else rb
++ tmp3 will be destroyed. */
++#define CONDITIONSEL(rd, rc, ra, rb)\
++ masknez tmp3, rb, rc;\
++ maskeqz rd, ra, rc;\
++ or rd, rd, tmp3
++
++/* char * strcpy (const char *s1, const char *s2); */
++LEAF(strcpy)
++ .align 4
++
++ move dest_backup, dest
++ lu12i.w zeroones, 0x01010
++ lu12i.w sevenf, 0x7f7f7
++ ori zeroones, zeroones, 0x101
++ ori sevenf, sevenf, 0xf7f
++ bstrins.d zeroones, zeroones, 63, 32
++ bstrins.d sevenf, sevenf, 63, 32
++ andi src_off, src, 0x7
++ beqz src_off, strcpy_loop_aligned_1
++ b strcpy_mutual_align
++strcpy_loop_aligned:
++ st.d data, dest, 0
++ addi.d dest, dest, 8
++strcpy_loop_aligned_1:
++ ld.d data, src, 0
++ addi.d src, src, 8
++strcpy_start_realigned:
++ sub.d tmp1, data, zeroones
++ or tmp2, data, sevenf
++ andn has_nul, tmp1, tmp2
++ beqz has_nul, strcpy_loop_aligned
++
++strcpy_end:
++
++ /* 8 4 2 1 */
++ ctz.d pos, has_nul
++ srli.d pos, pos, 3
++ addi.d pos, pos, 1
++ /* Do 8/4/2/1 strcpy based on pos value.
++ pos value is the number of bytes to be copied;
++ the bytes include the final \0, so the max length is 8 and the min length is 1. */
++strcpy_end_8:
++ andi tmp1, pos, 0x8
++ beqz tmp1, strcpy_end_4
++ st.d data, dest, 0
++ move dest, dest_backup
++ jr ra
++strcpy_end_4:
++ andi tmp1, pos, 0x4
++ beqz tmp1, strcpy_end_2
++ st.w data, dest, 0
++ srli.d data, data, 32
++ addi.d dest, dest, 4
++strcpy_end_2:
++ andi tmp1, pos, 0x2
++ beqz tmp1, strcpy_end_1
++ st.h data, dest, 0
++ srli.d data, data, 16
++ addi.d dest, dest, 2
++strcpy_end_1:
++ andi tmp1, pos, 0x1
++ beqz tmp1, strcpy_end_ret
++ st.b data, dest, 0
++strcpy_end_ret:
++ move result, dest_backup
++ jr ra
++
++
++strcpy_mutual_align:
++ /* Check if around src page bound.
++ if not, go to page cross ok.
++ if it is, do further check.
++ use tmp2 to accelerate. */
++
++ li.w tmp2, 0xff8
++ andi tmp1, src, 0xff8
++ beq tmp1, tmp2, strcpy_page_cross
++
++strcpy_page_cross_ok:
++ /* Load a misaligned double word and check if has \0
++ If no, do a misaligned double word paste.
++ If yes, calculate the number of available bytes,
++ then jump to 4/2/1 end. */
++ ld.d data, src, 0
++ sub.d tmp1, data, zeroones
++ or tmp2, data, sevenf
++ andn has_nul, tmp1, tmp2
++ bnez has_nul, strcpy_end
++strcpy_mutual_align_finish:
++ /* Before jumping back to the align loop, make dest/src aligned.
++ This will cause a duplicated paste for several bytes between the first double word and the second double word,
++ but should not bring a problem. */
++ li.w tmp1, 8
++ st.d data, dest, 0
++ sub.d tmp1, tmp1, src_off
++ add.d src, src, tmp1
++ add.d dest, dest, tmp1
++
++ b strcpy_loop_aligned_1
++
++strcpy_page_cross:
++ /*
++ ld.d from aligned address(src & ~0x7).
++ check if high bytes have \0.
++ if not, go back to page cross ok,
++ since the string is supposed to cross the page bound in such situation.
++ if it is, do a srl for data to make it seem like a direct double word from src,
++ then go to 4/2/1 strcpy end.
++
++ tmp4 is 0xffff...ffff mask
++ tmp2 demonstrates the bytes to be masked
++ tmp2 = src_off << 3
++ data = data >> (src_off * 8) | -1 << (64 - src_off * 8)
++ and
++ -1 << (64 - src_off * 8) -> ~(-1 >> (src_off * 8)) */
++
++ li.w tmp1, 0x7
++ andn tmp3, src, tmp1
++ ld.d data, tmp3, 0
++ li.w tmp4, -1
++ slli.d tmp2, src_off, 3
++ srl.d tmp4, tmp4, tmp2
++ srl.d data, data, tmp2
++ nor tmp4, tmp4, zero
++ or data, data, tmp4
++ sub.d tmp1, data, zeroones
++ or tmp2, data, sevenf
++ andn has_nul, tmp1, tmp2
++ beqz has_nul, strcpy_page_cross_ok
++ b strcpy_end
++END(strcpy)
++libc_hidden_builtin_def (strcpy)
+diff --git a/sysdeps/loongarch/lp64/strlen.S b/sysdeps/loongarch/lp64/strlen.S
+new file mode 100644
+index 00000000..3569598c
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/strlen.S
+@@ -0,0 +1,102 @@
++/* Optimized strlen implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++#include <sys/regdef.h>
++
++/* size_t strlen (const char *s1); */
++LEAF(strlen)
++ .align 5
++
++ nor t4, zero, zero
++ lu12i.w a2, 0x01010
++ andi t5, a0, 0x7
++
++ li.w t7, 0x7
++ slli.d t6, t5, 0x3
++ andn t7, a0, t7
++ ld.d a1, t7, 0
++ sub.d t7, zero, t6
++ sll.d t4, t4, t7
++ maskeqz t4, t4, t6
++ srl.d a1, a1, t6
++ or a1, a1, t4
++
++
++ ori a2, a2, 0x101
++ nor t1, a1, zero
++ li.w a4, 8
++
++ bstrins.d a2, a2, 63, 32
++ sub.d a5, a4, t5
++ move t5, a0
++
++ sub.d t0, a1, a2
++ slli.d t4, a2, 7
++ nor a3, zero, t4
++ nor t1, a1, a3
++
++ and t0, t0, t1
++ bnez t0, strlen_count1
++ add.d a0, a0, a5
++strlen_loop:
++ ld.d a1, a0, 0
++ sub.d t0, a1, a2
++ and t1, t0, t4
++ bnez t1, strlen_count_pre
++ ld.d a1, a0, 8
++ sub.d t0, a1, a2
++ and t1, t0, t4
++ addi.d a0, a0, 16
++ beqz t1, strlen_loop
++strlen_count:
++ addi.d a0, a0, -8
++strlen_count_pre:
++ nor t1, a1, a3
++ and t0, t0, t1
++ beqz t0, strlen_noascii_start
++strlen_count1:
++ ctz.d t1, t0
++ sub.d v0, a0, t5
++ srli.w t1, t1, 3
++ add.d v0, v0, t1
++ jr ra
++strlen_noascii_start:
++ addi.d a0, a0, 8
++strlen_loop_noascii:
++ ld.d a1, a0, 0
++ sub.d t0, a1, a2
++ nor t1, a1, a3
++ and t0, t0, t1
++ bnez t0, strlen_count1
++ ld.d a1, a0, 8
++ sub.d t0, a1, a2
++ nor t1, a1, a3
++ and t0, t0, t1
++ addi.d a0, a0, 16
++ beqz t0, strlen_loop_noascii
++ addi.d a0, a0, -8
++ ctz.d t1, t0
++ sub.d v0, a0, t5
++ srli.w t1, t1, 3
++ add.d v0, v0, t1
++ jr ra
++END(strlen)
++
++libc_hidden_builtin_def (strlen)
++
+diff --git a/sysdeps/loongarch/lp64/strncmp.S b/sysdeps/loongarch/lp64/strncmp.S
+new file mode 100644
+index 00000000..979ea40a
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/strncmp.S
+@@ -0,0 +1,225 @@
++/* Optimized strncmp implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++
++/* Parameters and Results */
++#define src1 a0
++#define src2 a1
++#define limit a2
++#define result v0
++
++
++/* Internal variable */
++#define data1 t0
++#define data2 t1
++#define has_nul t2
++#define diff t3
++#define syndrome t4
++#define zeroones t5
++#define sevenf t6
++#define pos t7
++#define exchange t8
++#define tmp1 a5
++#define tmp2 a6
++#define tmp3 a7
++#define src1_off a3
++#define limit_wd a4
++
++LEAF(strncmp)
++ .align 4
++ beqz limit, strncmp_ret0
++
++ xor tmp1, src1, src2
++ lu12i.w zeroones, 0x01010
++ lu12i.w sevenf, 0x7f7f7
++ andi src1_off, src1, 0x7
++ ori zeroones, zeroones, 0x101
++ andi tmp1, tmp1, 0x7
++ ori sevenf, sevenf, 0xf7f
++ bstrins.d zeroones, zeroones, 63, 32
++ bstrins.d sevenf, sevenf, 63, 32
++ bnez tmp1, strncmp_misaligned8
++ bnez src1_off, strncmp_mutual_align
++ addi.d limit_wd, limit, -1
++ srli.d limit_wd, limit_wd, 3
++
++strncmp_loop_aligned:
++ ld.d data1, src1, 0
++ addi.d src1, src1, 8
++ ld.d data2, src2, 0
++ addi.d src2, src2, 8
++strncmp_start_realigned:
++ addi.d limit_wd, limit_wd, -1
++ sub.d tmp1, data1, zeroones
++ or tmp2, data1, sevenf
++ xor diff, data1, data2
++ andn has_nul, tmp1, tmp2
++ srli.d tmp1, limit_wd, 63
++ or syndrome, diff, has_nul
++ or tmp2, syndrome, tmp1
++ beqz tmp2, strncmp_loop_aligned
++
++ /* if not reach limit */
++ bge limit_wd, zero, strncmp_not_limit
++ /* if reach limit */
++ andi limit, limit, 0x7
++ li.w tmp1, 0x8
++ sub.d limit, tmp1, limit
++ slli.d limit, limit, 0x3
++ li.d tmp1, -1
++ srl.d tmp1, tmp1, limit
++ and data1, data1, tmp1
++ and data2, data2, tmp1
++ orn syndrome, syndrome, tmp1
++
++
++strncmp_not_limit:
++ ctz.d pos, syndrome
++ bstrins.d pos, zero, 2, 0
++ srl.d data1, data1, pos
++ srl.d data2, data2, pos
++ andi data1, data1, 0xff
++ andi data2, data2, 0xff
++ sub.d result, data1, data2
++ jr ra
++
++
++
++strncmp_mutual_align:
++ bstrins.d src1, zero, 2, 0
++ bstrins.d src2, zero, 2, 0
++ slli.d tmp1, src1_off, 0x3
++ ld.d data1, src1, 0
++ ld.d data2, src2, 0
++ addi.d src2, src2, 8
++ addi.d src1, src1, 8
++
++ addi.d limit_wd, limit, -1
++ andi tmp3, limit_wd, 0x7
++ srli.d limit_wd, limit_wd, 3
++ add.d limit, limit, src1_off
++ add.d tmp3, tmp3, src1_off
++ srli.d tmp3, tmp3, 3
++ add.d limit_wd, limit_wd, tmp3
++
++ sub.d tmp1, zero, tmp1
++ nor tmp2, zero, zero
++ srl.d tmp2, tmp2, tmp1
++ or data1, data1, tmp2
++ or data2, data2, tmp2
++ b strncmp_start_realigned
++
++strncmp_misaligned8:
++
++ li.w tmp1, 0x10
++ bge limit, tmp1, strncmp_try_words
++strncmp_byte_loop:
++ ld.bu data1, src1, 0
++ ld.bu data2, src2, 0
++ addi.d limit, limit, -1
++ xor tmp1, data1, data2
++ masknez tmp1, data1, tmp1
++ maskeqz tmp1, limit, tmp1
++ beqz tmp1, strncmp_done
++
++ ld.bu data1, src1, 1
++ ld.bu data2, src2, 1
++ addi.d src1, src1, 2
++ addi.d src2, src2, 2
++ addi.d limit, limit, -1
++ xor tmp1, data1, data2
++ masknez tmp1, data1, tmp1
++ maskeqz tmp1, limit, tmp1
++ bnez tmp1, strncmp_byte_loop
++
++
++strncmp_done:
++ sub.d result, data1, data2
++ jr ra
++
++strncmp_try_words:
++ srli.d limit_wd, limit, 3
++ beqz src1_off, strncmp_do_misaligned
++
++ sub.d src1_off, zero, src1_off
++ andi src1_off, src1_off, 0x7
++ sub.d limit, limit, src1_off
++ srli.d limit_wd, limit, 0x3
++
++strncmp_page_end_loop:
++ ld.bu data1, src1, 0
++ ld.bu data2, src2, 0
++ addi.d src1, src1, 1
++ addi.d src2, src2, 1
++ xor tmp1, data1, data2
++ masknez tmp1, data1, tmp1
++ beqz tmp1, strncmp_done
++ andi tmp1, src1, 0x7
++ bnez tmp1, strncmp_page_end_loop
++strncmp_do_misaligned:
++ li.w src1_off, 0x8
++ addi.d limit_wd, limit_wd, -1
++ blt limit_wd, zero, strncmp_done_loop
++
++strncmp_loop_misaligned:
++ andi tmp2, src2, 0xff8
++ xori tmp2, tmp2, 0xff8
++ beqz tmp2, strncmp_page_end_loop
++
++ ld.d data1, src1, 0
++ ld.d data2, src2, 0
++ addi.d src1, src1, 8
++ addi.d src2, src2, 8
++ sub.d tmp1, data1, zeroones
++ or tmp2, data1, sevenf
++ xor diff, data1, data2
++ andn has_nul, tmp1, tmp2
++ or syndrome, diff, has_nul
++ bnez syndrome, strncmp_not_limit
++ addi.d limit_wd, limit_wd, -1
++ bge limit_wd, zero, strncmp_loop_misaligned
++
++strncmp_done_loop:
++ andi limit, limit, 0x7
++ beqz limit, strncmp_not_limit
++ /* Read the last double word;
++ check if the final part is about to exceed the page */
++ andi tmp1, src2, 0x7
++ andi tmp2, src2, 0xff8
++ add.d tmp1, tmp1, limit
++ xori tmp2, tmp2, 0xff8
++ andi tmp1, tmp1, 0x8
++ masknez tmp1, tmp1, tmp2
++ bnez tmp1, strncmp_byte_loop
++ addi.d src1, src1, -8
++ addi.d src2, src2, -8
++ ldx.d data1, src1, limit
++ ldx.d data2, src2, limit
++ sub.d tmp1, data1, zeroones
++ or tmp2, data1, sevenf
++ xor diff, data1, data2
++ andn has_nul, tmp1, tmp2
++ or syndrome, diff, has_nul
++ bnez syndrome, strncmp_not_limit
++
++strncmp_ret0:
++ move result, zero
++ jr ra
++END(strncmp)
++libc_hidden_builtin_def (strncmp)
+diff --git a/sysdeps/loongarch/lp64/strnlen.S b/sysdeps/loongarch/lp64/strnlen.S
+new file mode 100644
+index 00000000..8eaa60e2
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/strnlen.S
+@@ -0,0 +1,125 @@
++/* Optimized strnlen implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++
++/* rd <- if rc then ra else rb
++ a5 will be destroyed. */
++#define CONDITIONSEL(rd, ra, rb, rc)\
++ masknez a5, rb, rc;\
++ maskeqz rd, ra, rc;\
++ or rd, rd, a5
++
++/* Parameters and Results */
++#define srcin a0
++#define limit a1
++#define len v0
++
++/* Internal variable */
++#define data1 t0
++#define data2 t1
++#define has_nul1 t2
++#define has_nul2 t3
++#define src t4
++#define zeroones t5
++#define sevenf t6
++#define data2a t7
++#define tmp6 t7
++#define pos t8
++#define tmp1 a2
++#define tmp2 a3
++#define tmp3 a4
++#define tmp4 a5
++#define tmp5 a6
++#define limit_wd a7
++
++/* size_t strnlen (const char *s1, size_t maxlen); */
++LEAF(__strnlen)
++ .align 4
++ beqz limit, _hit_limit
++ lu12i.w zeroones, 0x01010
++ lu12i.w sevenf, 0x7f7f7
++ ori zeroones, zeroones, 0x101
++ ori sevenf, sevenf, 0xf7f
++ bstrins.d zeroones, zeroones, 63, 32
++ bstrins.d sevenf, sevenf, 63, 32
++ andi tmp1, srcin, 15
++ sub.d src, srcin, tmp1
++ bnez tmp1, misaligned
++ addi.d limit_wd, limit, -1
++ srli.d limit_wd, limit_wd, 4
++_loop:
++ ld.d data1, src, 0
++ ld.d data2, src, 8
++ addi.d src, src, 16
++_realigned:
++ sub.d tmp1, data1, zeroones
++ or tmp2, data1, sevenf
++ sub.d tmp3, data2, zeroones
++ or tmp4, data2, sevenf
++ andn has_nul1, tmp1, tmp2
++ andn has_nul2, tmp3, tmp4
++ addi.d limit_wd, limit_wd, -1
++ srli.d tmp1, limit_wd, 63
++ or tmp2, has_nul1, has_nul2
++ or tmp3, tmp1, tmp2
++ beqz tmp3, _loop
++ beqz tmp2, _hit_limit
++ sub.d len, src, srcin
++ beqz has_nul1, _nul_in_data2
++ move has_nul2, has_nul1
++ addi.d len, len, -8
++_nul_in_data2:
++ ctz.d pos, has_nul2
++ srli.d pos, pos, 3
++ addi.d len, len, -8
++ add.d len, len, pos
++ sltu tmp1, len, limit
++ CONDITIONSEL(len, len, limit, tmp1)
++ jr ra
++
++misaligned:
++ addi.d limit_wd, limit, -1
++ sub.d tmp4, zero, tmp1
++ andi tmp3, limit_wd, 15
++ srli.d limit_wd, limit_wd, 4
++ li.d tmp5, -1
++ ld.d data1, src, 0
++ ld.d data2, src, 8
++ addi.d src, src, 16
++ slli.d tmp4, tmp4, 3
++ add.d tmp3, tmp3, tmp1
++ srl.d tmp2, tmp5, tmp4
++ srli.d tmp3, tmp3, 4
++ add.d limit_wd, limit_wd, tmp3
++ or data1, data1, tmp2
++ or data2a, data2, tmp2
++ li.w tmp3, 9
++ sltu tmp1, tmp1, tmp3
++ CONDITIONSEL(data1, data1, tmp5, tmp1)
++ CONDITIONSEL(data2, data2, data2a, tmp1)
++ b _realigned
++
++_hit_limit:
++ move len, limit
++ jr ra
++END(__strnlen)
++
++weak_alias (__strnlen, strnlen)
++libc_hidden_def (strnlen)
++libc_hidden_def (__strnlen)
+--
+2.33.0
+
diff --git a/glibc.spec b/glibc.spec
index 959fe67e4d13f517ef2f8561a29c992ca3d67ad0..e60eb76da734bc5b9c2c0f4d793ed603cbc995a9 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -66,7 +66,7 @@
 ##############################################################################
 Name: glibc
 Version: 2.34
-Release: 105
+Release: 106
 Summary: The GNU libc libraries
 License: %{all_license}
 URL: http://www.gnu.org/software/glibc/
@@ -272,6 +272,7 @@ Patch9021: x86-use-total-l3cache-for-non_temporal_threshold.patch
 Patch9022: login-Add-back-libutil-as-an-empty-library.patch
 Patch9023: malloc-Fix-malloc-debug-for-2.35-onwards.patch
 Patch9024: LoongArch-Port.patch
+Patch9025: LoongArch-Optimize-string-functions-including-memcpy.patch
 %endif
 
 Provides: ldconfig rtld(GNU_HASH) bundled(gnulib)
@@ -1438,6 +1439,9 @@ fi
 %endif
 
 %changelog
+* Sat Jan 14 2023 Xue Liu - 2.34-106
+- LoongArch: Optimize some string functions
+
 * Wed Dec 21 2022 wanghongliang - 2.34-105
 - LoongArch Port
 - Add login-Add-back-libutil-as-an-empty-library.patch from upstream