diff --git a/0001-cmd-link-internal-add-support-for-internal-linking-o.patch b/0001-cmd-link-internal-add-support-for-internal-linking-o.patch new file mode 100644 index 0000000000000000000000000000000000000000..8f49bc5378a4e86ad82b6008e3e5688252365a39 --- /dev/null +++ b/0001-cmd-link-internal-add-support-for-internal-linking-o.patch @@ -0,0 +1,457 @@ +From 2730907e506ac1fdcc25fbb263df89a03c12b309 Mon Sep 17 00:00:00 2001 +From: limeidan +Date: Mon, 9 Oct 2023 17:31:14 +0800 +Subject: [PATCH 01/44] cmd/link/internal: add support for internal linking on + loong64 + +Change-Id: Ic0d36f27481ac707d04aaf7001f26061e510dd8f +--- + src/cmd/link/internal/loadelf/ldelf.go | 24 ++ + src/cmd/link/internal/loong64/asm.go | 356 ++++++++++++++++++++++++- + 2 files changed, 375 insertions(+), 5 deletions(-) + +diff --git a/src/cmd/link/internal/loadelf/ldelf.go b/src/cmd/link/internal/loadelf/ldelf.go +index e0363b5535..be14cc3bb2 100644 +--- a/src/cmd/link/internal/loadelf/ldelf.go ++++ b/src/cmd/link/internal/loadelf/ldelf.go +@@ -602,6 +602,11 @@ func Load(l *loader.Loader, arch *sys.Arch, localSymVersion int, f *bio.Reader, + // See https://sourceware.org/bugzilla/show_bug.cgi?id=21809 + continue + } ++ ++ if arch.Family == sys.Loong64 && (strings.HasPrefix(elfsym.name, ".L") || elfsym.name == "L0\001") { ++ // Symbols generated by the relax feature of gcc and binutils on loong64. ++ continue ++ } + } + + if strings.HasPrefix(elfsym.name, ".Linfo_string") { +@@ -682,6 +687,12 @@ func Load(l *loader.Loader, arch *sys.Arch, localSymVersion int, f *bio.Reader, + l.SetAttrOnList(s, true) + textp = append(textp, s) + for ss := l.SubSym(s); ss != 0; ss = l.SubSym(ss) { ++ if arch.Family == sys.Loong64 && (strings.HasPrefix(l.SymName(ss), ".L") || l.SymName(ss) == "L0\001") { ++ // Symbols generated by the relax feature of gcc and binutils on loong64. ++ // We ignore them here because there are too many symbols of this type, ++ // resulting in insufficient space in findfunctable. ++ continue ++ } + if l.AttrOnList(ss) { + return errorf("symbol %s listed multiple times", + l.SymName(ss)) +@@ -1018,7 +1029,14 @@ func relSize(arch *sys.Arch, pn string, elftype uint32) (uint8, uint8, error) { + MIPS64 | uint32(elf.R_MIPS_PC32)<<16: + return 4, 4, nil + ++ // These are informational annotations to assist linker optimizations. 
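++	// They carry no payload to apply here, so a zero size is reported
++	// and the relocations are effectively skipped by the loader.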
++	case LOONG64 | uint32(elf.R_LARCH_ALIGN)<<16,
++		LOONG64 | uint32(elf.R_LARCH_RELAX)<<16:
++		return 0, 0, nil
++
+	case LOONG64 | uint32(elf.R_LARCH_ADD8)<<16,
++		LOONG64 | uint32(elf.R_LARCH_ADD6)<<16,
++		LOONG64 | uint32(elf.R_LARCH_SUB6)<<16,
+		LOONG64 | uint32(elf.R_LARCH_SUB8)<<16:
+		return 1, 1, nil
+
+@@ -1032,7 +1050,13 @@ func relSize(arch *sys.Arch, pn string, elftype uint32) (uint8, uint8, error) {
+		LOONG64 | uint32(elf.R_LARCH_ADD32)<<16,
+		LOONG64 | uint32(elf.R_LARCH_SUB24)<<16,
+		LOONG64 | uint32(elf.R_LARCH_SUB32)<<16,
++		LOONG64 | uint32(elf.R_LARCH_B16)<<16,
++		LOONG64 | uint32(elf.R_LARCH_B21)<<16,
+		LOONG64 | uint32(elf.R_LARCH_B26)<<16,
++		LOONG64 | uint32(elf.R_LARCH_PCALA_HI20)<<16,
++		LOONG64 | uint32(elf.R_LARCH_PCALA_LO12)<<16,
++		LOONG64 | uint32(elf.R_LARCH_GOT_PC_HI20)<<16,
++		LOONG64 | uint32(elf.R_LARCH_GOT_PC_LO12)<<16,
+		LOONG64 | uint32(elf.R_LARCH_32_PCREL)<<16:
+		return 4, 4, nil
+
+diff --git a/src/cmd/link/internal/loong64/asm.go b/src/cmd/link/internal/loong64/asm.go
+index 2e69594f92..3a83f1a5ad 100644
+--- a/src/cmd/link/internal/loong64/asm.go
++++ b/src/cmd/link/internal/loong64/asm.go
+@@ -58,10 +58,328 @@ func gentext(ctxt *ld.Link, ldr *loader.Loader) {
+ }
+
+ func adddynrel(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, s loader.Sym, r loader.Reloc, rIdx int) bool {
+-	log.Fatalf("adddynrel not implemented")
++	targ := r.Sym()
++	var targType sym.SymKind
++	if targ != 0 {
++		targType = ldr.SymType(targ)
++	}
++
++	switch r.Type() {
++	default:
++		if r.Type() >= objabi.ElfRelocOffset {
++			ldr.Errorf(s, "adddynrel: unexpected relocation type %d (%s)", r.Type(), sym.RelocName(target.Arch, r.Type()))
++			return false
++		}
++
++	case objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_64):
++		if targType == sym.SDYNIMPORT {
++			ldr.Errorf(s, "unexpected R_LARCH_64 relocation for dynamic symbol %s", ldr.SymName(targ))
++		}
++		su := ldr.MakeSymbolUpdater(s)
++		su.SetRelocType(rIdx, objabi.R_ADDR)
++		if target.IsPIE() && target.IsInternal() {
++			// For internal linking PIE, this R_ADDR relocation cannot
++			// be resolved statically. We need to generate a dynamic
++			// relocation. Let the code below handle it.
++			break
++		}
++		return true
++
++	case objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_B26):
++		if targType == sym.SDYNIMPORT {
++			addpltsym(target, ldr, syms, targ)
++			su := ldr.MakeSymbolUpdater(s)
++			su.SetRelocSym(rIdx, syms.PLT)
++			su.SetRelocAdd(rIdx, r.Add()+int64(ldr.SymPlt(targ)))
++		}
++		if targType == 0 || targType == sym.SXREF {
++			ldr.Errorf(s, "unknown symbol %s in callloong64", ldr.SymName(targ))
++		}
++		su := ldr.MakeSymbolUpdater(s)
++		su.SetRelocType(rIdx, objabi.R_CALLLOONG64)
++		return true
++
++	case objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_GOT_PC_HI20),
++		objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_GOT_PC_LO12):
++		if targType != sym.SDYNIMPORT {
++			// TODO: turn LDR of GOT entry into ADR of symbol itself
++		}
++
++		ld.AddGotSym(target, ldr, syms, targ, uint32(elf.R_LARCH_64))
++		su := ldr.MakeSymbolUpdater(s)
++		if r.Type() == objabi.ElfRelocOffset+objabi.RelocType(elf.R_LARCH_GOT_PC_HI20) {
++			su.SetRelocType(rIdx, objabi.R_LOONG64_ADDR_HI)
++		} else {
++			su.SetRelocType(rIdx, objabi.R_LOONG64_ADDR_LO)
++		}
++		su.SetRelocSym(rIdx, syms.GOT)
++		su.SetRelocAdd(rIdx, r.Add()+int64(ldr.SymGot(targ)))
++		return true
++
++	case objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_PCALA_HI20),
++		objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_PCALA_LO12):
++		if targType == sym.SDYNIMPORT {
++			ldr.Errorf(s, "unexpected relocation for dynamic symbol %s", ldr.SymName(targ))
++		}
++		if targType == 0 || targType == sym.SXREF {
++			ldr.Errorf(s, "unknown symbol %s", ldr.SymName(targ))
++		}
++
++		su := ldr.MakeSymbolUpdater(s)
++		if r.Type() == objabi.ElfRelocOffset+objabi.RelocType(elf.R_LARCH_PCALA_HI20) {
++			su.SetRelocType(rIdx, objabi.R_LOONG64_ADDR_HI)
++		} else {
++			su.SetRelocType(rIdx, objabi.R_LOONG64_ADDR_LO)
++		}
++		return true
++
++	case objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_ADD64),
++		objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_SUB64):
++		su := ldr.MakeSymbolUpdater(s)
++		if r.Type() == objabi.ElfRelocOffset+objabi.RelocType(elf.R_LARCH_ADD64) {
++			su.SetRelocType(rIdx, objabi.R_LOONG64_ADD64)
++		} else {
++			su.SetRelocType(rIdx, objabi.R_LOONG64_SUB64)
++		}
++		return true
++
++	case objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_B16),
++		objabi.ElfRelocOffset + objabi.RelocType(elf.R_LARCH_B21):
++		if targType == sym.SDYNIMPORT {
++			addpltsym(target, ldr, syms, targ)
++			su := ldr.MakeSymbolUpdater(s)
++			su.SetRelocSym(rIdx, syms.PLT)
++			su.SetRelocAdd(rIdx, r.Add()+int64(ldr.SymPlt(targ)))
++		}
++		if targType == 0 || targType == sym.SXREF {
++			ldr.Errorf(s, "unknown symbol %s in R_JMPxxLOONG64", ldr.SymName(targ))
++		}
++		su := ldr.MakeSymbolUpdater(s)
++		if r.Type() == objabi.ElfRelocOffset+objabi.RelocType(elf.R_LARCH_B16) {
++			su.SetRelocType(rIdx, objabi.R_JMP16LOONG64)
++		} else {
++			su.SetRelocType(rIdx, objabi.R_JMP21LOONG64)
++		}
++		return true
++	}
++
++	relocs := ldr.Relocs(s)
++	r = relocs.At(rIdx)
++
++	switch r.Type() {
++	case objabi.R_CALLLOONG64:
++		if targType != sym.SDYNIMPORT {
++			return true
++		}
++		if target.IsExternal() {
++			return true
++		}
++
++		// Internal linking.
++		if r.Add() != 0 {
++			ldr.Errorf(s, "PLT call with non-zero addend (%v)", r.Add())
++		}
++
++		// Build a PLT entry and change the relocation target to that entry.
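++		// (addpltsym below creates the entry on first use; ldr.SymPlt
++		// then yields the entry's offset within .plt.)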
++		addpltsym(target, ldr, syms, targ)
++		su := ldr.MakeSymbolUpdater(s)
++		su.SetRelocSym(rIdx, syms.PLT)
++		su.SetRelocAdd(rIdx, int64(ldr.SymPlt(targ)))
++		return true
++
++	case objabi.R_ADDR:
++		if ldr.SymType(s) == sym.STEXT && target.IsElf() {
++			// The code is asking for the address of an external
++			// function. We provide it with the address of the
++			// corresponding GOT symbol.
++			ld.AddGotSym(target, ldr, syms, targ, uint32(elf.R_LARCH_64))
++			su := ldr.MakeSymbolUpdater(s)
++			su.SetRelocSym(rIdx, syms.GOT)
++			su.SetRelocAdd(rIdx, r.Add()+int64(ldr.SymGot(targ)))
++			return true
++		}
++
++		// Process dynamic relocations for the data sections.
++		if target.IsPIE() && target.IsInternal() {
++			// When internally linking, generate dynamic relocations
++			// for all typical R_ADDR relocations. The exception
++			// are those R_ADDR that are created as part of generating
++			// the dynamic relocations and must be resolved statically.
++			//
++			// There are three phases relevant to understanding this:
++			//
++			//	dodata()  // we are here
++			//	address() // symbol address assignment
++			//	reloc()   // resolution of static R_ADDR relocs
++			//
++			// At this point symbol addresses have not been
++			// assigned yet (as the final size of the .rela section
++			// will affect the addresses), and so we cannot write
++			// the Elf64_Rela.r_offset now. Instead we delay it
++			// until after the 'address' phase of the linker is
++			// complete. We do this via Addaddrplus, which creates
++			// a new R_ADDR relocation which will be resolved in
++			// the 'reloc' phase.
++			//
++			// These synthetic static R_ADDR relocs must be skipped
++			// now, or else we will be caught in an infinite loop
++			// of generating synthetic relocs for our synthetic
++			// relocs.
++			//
++			// Furthermore, the rela sections contain dynamic
++			// relocations with R_ADDR relocations on
++			// Elf64_Rela.r_offset. This field should contain the
++			// symbol offset as determined by reloc(), not the
++			// final dynamically linked address as a dynamic
++			// relocation would provide.
++			switch ldr.SymName(s) {
++			case ".dynsym", ".rela", ".rela.plt", ".got.plt", ".dynamic":
++				return false
++			}
++		} else {
++			// Either internally linking a static executable,
++			// in which case we can resolve these relocations
++			// statically in the 'reloc' phase, or externally
++			// linking, in which case the relocation will be
++			// prepared in the 'reloc' phase and passed to the
++			// external linker in the 'asmb' phase.
++			if ldr.SymType(s) != sym.SDATA && ldr.SymType(s) != sym.SRODATA {
++				break
++			}
++		}
++
++		if target.IsElf() {
++			// Generate R_LARCH_RELATIVE relocations for best
++			// efficiency in the dynamic linker.
++			//
++			// As noted above, symbol addresses have not been
++			// assigned yet, so we can't generate the final reloc
++			// entry yet. We ultimately want:
++			//
++			//	r_offset = s + r.Off
++			//	r_info = R_LARCH_RELATIVE
++			//	r_addend = targ + r.Add
++			//
++			// The dynamic linker will set *offset = base address +
++			// addend.
++			//
++			// AddAddrPlus is used for r_offset and r_addend to
++			// generate new R_ADDR relocations that will update
++			// these fields in the 'reloc' phase.
++			rela := ldr.MakeSymbolUpdater(syms.Rela)
++			rela.AddAddrPlus(target.Arch, s, int64(r.Off()))
++			if r.Siz() == 8 {
++				rela.AddUint64(target.Arch, elf.R_INFO(0, uint32(elf.R_LARCH_RELATIVE)))
++			} else {
++				ldr.Errorf(s, "unexpected relocation for dynamic symbol %s", ldr.SymName(targ))
++			}
++			rela.AddAddrPlus(target.Arch, targ, int64(r.Add()))
++			return true
++		}
++
++	case objabi.R_LOONG64_GOT_HI,
++		objabi.R_LOONG64_GOT_LO:
++		ld.AddGotSym(target, ldr, syms, targ, uint32(elf.R_LARCH_64))
++		su := ldr.MakeSymbolUpdater(s)
++		if r.Type() == objabi.R_LOONG64_GOT_HI {
++			su.SetRelocType(rIdx, objabi.R_LOONG64_ADDR_HI)
++		} else {
++			su.SetRelocType(rIdx, objabi.R_LOONG64_ADDR_LO)
++		}
++		su.SetRelocSym(rIdx, syms.GOT)
++		su.SetRelocAdd(rIdx, r.Add()+int64(ldr.SymGot(targ)))
++		return true
++	}
+	return false
+ }
+
++func elfsetupplt(ctxt *ld.Link, ldr *loader.Loader, plt, gotplt *loader.SymbolBuilder, dynamic loader.Sym) {
++	if plt.Size() == 0 {
++		// pcalau12i $r14, imm
++		plt.AddSymRef(ctxt.Arch, gotplt.Sym(), 0, objabi.R_LOONG64_ADDR_HI, 4)
++		plt.SetUint32(ctxt.Arch, plt.Size()-4, 0x1a00000e)
++
++		// sub.d $r13, $r13, $r15
++		plt.AddUint32(ctxt.Arch, 0x0011bdad)
++
++		// ld.d $r15, $r14, imm
++		plt.AddSymRef(ctxt.Arch, gotplt.Sym(), 0, objabi.R_LOONG64_ADDR_LO, 4)
++		plt.SetUint32(ctxt.Arch, plt.Size()-4, 0x28c001cf)
++
++		// addi.d $r13, $r13, -40
++		plt.AddUint32(ctxt.Arch, 0x02ff61ad)
++
++		// addi.d $r12, $r14, imm
++		plt.AddSymRef(ctxt.Arch, gotplt.Sym(), 0, objabi.R_LOONG64_ADDR_LO, 4)
++		plt.SetUint32(ctxt.Arch, plt.Size()-4, 0x2c001cc)
++
++		// srli.d $r13, $r13, 1
++		plt.AddUint32(ctxt.Arch, 0x004505ad)
++
++		// ld.d $r12, $r12, 8
++		plt.AddUint32(ctxt.Arch, 0x28c0218c)
++
++		// jirl $r0, $r15, 0
++		plt.AddUint32(ctxt.Arch, 0x4c0001e0)
++
++		// check gotplt.size == 0
++		if gotplt.Size() != 0 {
++			ctxt.Errorf(gotplt.Sym(), "got.plt is not empty at the very beginning")
++		}
++
++		gotplt.AddUint64(ctxt.Arch, 0)
++		gotplt.AddUint64(ctxt.Arch, 0)
++	}
++}
++
++func addpltsym(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, s loader.Sym) {
++	if ldr.SymPlt(s) >= 0 {
++		return
++	}
++
++	ld.Adddynsym(ldr, target, syms, s)
++
++	if target.IsElf() {
++		plt := ldr.MakeSymbolUpdater(syms.PLT)
++		gotplt := ldr.MakeSymbolUpdater(syms.GOTPLT)
++		rela := ldr.MakeSymbolUpdater(syms.RelaPLT)
++		if plt.Size() == 0 {
++			panic("plt is not set up")
++		}
++
++		// pcalau12i $r15, imm
++		plt.AddAddrPlus4(target.Arch, gotplt.Sym(), gotplt.Size())
++		plt.SetUint32(target.Arch, plt.Size()-4, 0x1a00000f)
++		relocs := plt.Relocs()
++		plt.SetRelocType(relocs.Count()-1, objabi.R_LOONG64_ADDR_HI)
++
++		// ld.d $r15, $r15, imm
++		plt.AddAddrPlus4(target.Arch, gotplt.Sym(), gotplt.Size())
++		plt.SetUint32(target.Arch, plt.Size()-4, 0x28c001ef)
++		relocs = plt.Relocs()
++		plt.SetRelocType(relocs.Count()-1, objabi.R_LOONG64_ADDR_LO)
++
++		// pcaddu12i $r13, 0
++		plt.AddUint32(target.Arch, 0x1c00000d)
++
++		// jirl r0, r15, 0
++		plt.AddUint32(target.Arch, 0x4c0001e0)
++
++		// add to got.plt: pointer to plt[0]
++		gotplt.AddAddrPlus(target.Arch, plt.Sym(), 0)
++
++		// rela
++		rela.AddAddrPlus(target.Arch, gotplt.Sym(), gotplt.Size()-8)
++		sDynid := ldr.SymDynid(s)
++		rela.AddUint64(target.Arch, elf.R_INFO(uint32(sDynid), uint32(elf.R_LARCH_JUMP_SLOT)))
++		rela.AddUint64(target.Arch, 0)
++
++		ldr.SetPlt(s, int32(plt.Size()-16))
++	} else {
++		ldr.Errorf(s, "addpltsym: unsupported binary format")
++	}
++}
++
+ func elfreloc1(ctxt *ld.Link, out *ld.OutBuf, ldr *loader.Loader, s loader.Sym, r loader.ExtReloc, 
ri int, sectoff int64) bool { + // loong64 ELF relocation (endian neutral) + // offset uint64 +@@ -134,10 +452,6 @@ func elfreloc1(ctxt *ld.Link, out *ld.OutBuf, ldr *loader.Loader, s loader.Sym, + return true + } + +-func elfsetupplt(ctxt *ld.Link, ldr *loader.Loader, plt, gotplt *loader.SymbolBuilder, dynamic loader.Sym) { +- return +-} +- + func machoreloc1(*sys.Arch, *ld.OutBuf, *loader.Loader, loader.Sym, loader.ExtReloc, int64) bool { + return false + } +@@ -197,6 +511,38 @@ func archreloc(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, r loade + pc := ldr.SymValue(s) + int64(r.Off()) + t := ldr.SymAddr(rs) + r.Add() - pc + return int64(val&0xfc000000 | (((t >> 2) & 0xffff) << 10) | (((t >> 2) & 0x3ff0000) >> 16)), noExtReloc, isOk ++ ++ case objabi.R_JMP16LOONG64, ++ objabi.R_JMP21LOONG64: ++ pc := ldr.SymValue(s) + int64(r.Off()) ++ t := ldr.SymAddr(rs) + r.Add() - pc ++ if r.Type() == objabi.R_JMP16LOONG64 { ++ return int64(val&0xfc0003ff | (((t >> 2) & 0xffff) << 10)), noExtReloc, isOk ++ } ++ return int64(val&0xfc0003e0 | (((t >> 2) & 0xffff) << 10) | (((t >> 2) & 0x1f0000) >> 16)), noExtReloc, isOk ++ ++ case objabi.R_LOONG64_TLS_IE_HI, ++ objabi.R_LOONG64_TLS_IE_LO: ++ if target.IsPIE() && target.IsElf() { ++ if !target.IsLinux() { ++ ldr.Errorf(s, "TLS reloc on unsupported OS %v", target.HeadType) ++ } ++ t := ldr.SymAddr(rs) + r.Add() ++ if r.Type() == objabi.R_LOONG64_TLS_IE_HI { ++ // pcalau12i -> lu12i.w ++ return (0x14000000 | (val & 0x1f) | ((t >> 12) << 5)), noExtReloc, isOk ++ } ++ // ld.d -> ori ++ return (0x03800000 | (val & 0x3ff) | ((t & 0xfff) << 10)), noExtReloc, isOk ++ } else { ++ log.Fatalf("cannot handle R_LOONG64_TLS_IE_x (sym %s) when linking internally", ldr.SymName(rs)) ++ } ++ ++ case objabi.R_LOONG64_ADD64, objabi.R_LOONG64_SUB64: ++ if r.Type() == objabi.R_LOONG64_ADD64 { ++ return int64(val + ldr.SymAddr(rs) + r.Add()), noExtReloc, isOk ++ } ++ return int64(val - (ldr.SymAddr(rs) + r.Add())), noExtReloc, isOk + } + + return val, 0, false +-- +2.38.1 + diff --git a/0002-cmd-dist-internal-platform-enable-internal-linking-f.patch b/0002-cmd-dist-internal-platform-enable-internal-linking-f.patch new file mode 100644 index 0000000000000000000000000000000000000000..8f2e4c171a8fe6328bc2ecfa7c3888b65e22339b --- /dev/null +++ b/0002-cmd-dist-internal-platform-enable-internal-linking-f.patch @@ -0,0 +1,83 @@ +From d404dccc7f089ddbd81b95c3d97f19acc6cb0329 Mon Sep 17 00:00:00 2001 +From: limeidan +Date: Mon, 9 Oct 2023 17:32:03 +0800 +Subject: [PATCH 02/44] cmd/dist, internal/platform: enable internal linking + feature and test on loong64 + +Change-Id: Ifea676e9eb44281465832fc4050f6286e50f4543 +--- + src/cmd/dist/build.go | 4 +++- + src/cmd/dist/test.go | 4 ++-- + src/internal/platform/supported.go | 6 ++++-- + 3 files changed, 9 insertions(+), 5 deletions(-) + +diff --git a/src/cmd/dist/build.go b/src/cmd/dist/build.go +index 1f467647f5..b71d6c393e 100644 +--- a/src/cmd/dist/build.go ++++ b/src/cmd/dist/build.go +@@ -624,10 +624,12 @@ func setup() { + func mustLinkExternal(goos, goarch string, cgoEnabled bool) bool { + if cgoEnabled { + switch goarch { +- case "loong64", "mips", "mipsle", "mips64", "mips64le": ++ case "mips", "mipsle", "mips64", "mips64le": + // Internally linking cgo is incomplete on some architectures. + // https://golang.org/issue/14449 + return true ++ case "loong64": ++ return false + case "arm64": + if goos == "windows" { + // windows/arm64 internal linking is not implemented. 
+diff --git a/src/cmd/dist/test.go b/src/cmd/dist/test.go
+index 0c992118f4..9728ef29cb 100644
+--- a/src/cmd/dist/test.go
++++ b/src/cmd/dist/test.go
+@@ -1164,7 +1164,7 @@ func (t *tester) internalLink() bool {
+ 	// Internally linking cgo is incomplete on some architectures.
+ 	// https://golang.org/issue/10373
+ 	// https://golang.org/issue/14449
+-	if goarch == "loong64" || goarch == "mips64" || goarch == "mips64le" || goarch == "mips" || goarch == "mipsle" || goarch == "riscv64" {
++	if goarch == "mips64" || goarch == "mips64le" || goarch == "mips" || goarch == "mipsle" || goarch == "riscv64" {
+ 		return false
+ 	}
+ 	if goos == "aix" {
+@@ -1185,7 +1185,7 @@ func (t *tester) internalLinkPIE() bool {
+ 	}
+ 	switch goos + "-" + goarch {
+ 	case "darwin-amd64", "darwin-arm64",
+-		"linux-amd64", "linux-arm64", "linux-ppc64le",
++		"linux-amd64", "linux-arm64", "linux-loong64", "linux-ppc64le",
+ 		"android-arm64",
+ 		"windows-amd64", "windows-386", "windows-arm":
+ 		return true
+diff --git a/src/internal/platform/supported.go b/src/internal/platform/supported.go
+index e864c37d68..79ed6d4b1c 100644
+--- a/src/internal/platform/supported.go
++++ b/src/internal/platform/supported.go
+@@ -85,10 +85,12 @@ func FuzzInstrumented(goos, goarch string) bool {
+ func MustLinkExternal(goos, goarch string, withCgo bool) bool {
+ 	if withCgo {
+ 		switch goarch {
+-		case "loong64", "mips", "mipsle", "mips64", "mips64le":
++		case "mips", "mipsle", "mips64", "mips64le":
+ 			// Internally linking cgo is incomplete on some architectures.
+ 			// https://go.dev/issue/14449
+ 			return true
++		case "loong64":
++			return false
+ 		case "arm64":
+ 			if goos == "windows" {
+ 				// windows/arm64 internal linking is not implemented.
+@@ -225,7 +227,7 @@ func InternalLinkPIESupported(goos, goarch string) bool {
+ 	switch goos + "/" + goarch {
+ 	case "android/arm64",
+ 		"darwin/amd64", "darwin/arm64",
+-		"linux/amd64", "linux/arm64", "linux/ppc64le",
++		"linux/amd64", "linux/arm64", "linux/loong64", "linux/ppc64le",
+ 		"windows/386", "windows/amd64", "windows/arm", "windows/arm64":
+ 		return true
+ 	}
+--
+2.38.1
+
diff --git a/0003-cmd-runtime-enable-race-detector-on-loong64.patch b/0003-cmd-runtime-enable-race-detector-on-loong64.patch
new file mode 100644
index 0000000000000000000000000000000000000000..0d61dcc5fcb6091e4e881d6ebe0a579b2ba30792
--- /dev/null
+++ b/0003-cmd-runtime-enable-race-detector-on-loong64.patch
@@ -0,0 +1,626 @@
+From f84142ce620b086cc90f728861a76e5066c22ed9 Mon Sep 17 00:00:00 2001
+From: Guoqi Chen
+Date: Sat, 19 Aug 2023 09:22:34 +0800
+Subject: [PATCH 03/44] cmd,runtime: enable race detector on loong64
+
+The race detector depends on LLVM's TSan runtime. Support for building the
+TSan library on linux/loong64 was added in [1], which has been merged into
+the LLVM main branch and is expected to ship in the upcoming LLVM 18.
+ +[1]: https://github.com/llvm/llvm-project/pull/72819 + +Co-authored-by: Xiaolin Zhao +Change-Id: If389318215476890295ed771297c6c088cfc84b3 +--- + src/cmd/dist/test.go | 2 +- + src/internal/platform/supported.go | 2 +- + src/race.bash | 3 +- + src/runtime/asm_loong64.s | 1 + + src/runtime/race/README | 3 +- + src/runtime/race/race.go | 2 +- + src/runtime/race_loong64.s | 509 +++++++++++++++++++++++ + 8 files changed, 517 insertions(+), 5 deletions(-) + create mode 100644 src/runtime/race_loong64.s + +diff --git a/src/cmd/dist/test.go b/src/cmd/dist/test.go +index 9728ef29cb..044268ada0 100644 +--- a/src/cmd/dist/test.go ++++ b/src/cmd/dist/test.go +@@ -1674,7 +1674,7 @@ func (t *tester) makeGOROOTUnwritable() (undo func()) { + func raceDetectorSupported(goos, goarch string) bool { + switch goos { + case "linux": +- return goarch == "amd64" || goarch == "ppc64le" || goarch == "arm64" || goarch == "s390x" ++ return goarch == "amd64" || goarch == "ppc64le" || goarch == "arm64" || goarch == "s390x" || goarch == "loong64" + case "darwin": + return goarch == "amd64" || goarch == "arm64" + case "freebsd", "netbsd", "windows": +diff --git a/src/internal/platform/supported.go b/src/internal/platform/supported.go +index 79ed6d4b1c..52cad096cb 100644 +--- a/src/internal/platform/supported.go ++++ b/src/internal/platform/supported.go +@@ -23,7 +23,7 @@ func (p OSArch) String() string { + func RaceDetectorSupported(goos, goarch string) bool { + switch goos { + case "linux": +- return goarch == "amd64" || goarch == "ppc64le" || goarch == "arm64" || goarch == "s390x" ++ return goarch == "amd64" || goarch == "ppc64le" || goarch == "arm64" || goarch == "s390x" || goarch == "loong64" + case "darwin": + return goarch == "amd64" || goarch == "arm64" + case "freebsd", "netbsd", "windows": +diff --git a/src/race.bash b/src/race.bash +index f1a168bfbb..ae9f57ffd7 100755 +--- a/src/race.bash ++++ b/src/race.bash +@@ -9,7 +9,7 @@ + set -e + + function usage { +- echo 'race detector is only supported on linux/amd64, linux/ppc64le, linux/arm64, linux/s390x, freebsd/amd64, netbsd/amd64, openbsd/amd64, darwin/amd64, and darwin/arm64' 1>&2 ++ echo 'race detector is only supported on linux/amd64, linux/ppc64le, linux/arm64, linux/loong64, linux/s390x, freebsd/amd64, netbsd/amd64, openbsd/amd64, darwin/amd64, and darwin/arm64' 1>&2 + exit 1 + } + +@@ -19,6 +19,7 @@ case $(uname -s -m) in + "Linux x86_64") ;; + "Linux ppc64le") ;; + "Linux aarch64") ;; ++ "Linux loongarch64") ;; + "Linux s390x") ;; + "FreeBSD amd64") ;; + "NetBSD amd64") ;; +diff --git a/src/runtime/asm_loong64.s b/src/runtime/asm_loong64.s +index 1c5ced4512..1bd8276835 100644 +--- a/src/runtime/asm_loong64.s ++++ b/src/runtime/asm_loong64.s +@@ -37,6 +37,7 @@ TEXT runtime·rt0_go(SB),NOSPLIT|TOPFRAME,$0 + JAL (R25) + + nocgo: ++ JAL runtime·save_g(SB) + // update stackguard after _cgo_init + MOVV (g_stack+stack_lo)(g), R19 + ADDV $const_stackGuard, R19 +diff --git a/src/runtime/race/README b/src/runtime/race/README +index 47c51ca9c1..06865d2b34 100644 +--- a/src/runtime/race/README ++++ b/src/runtime/race/README +@@ -13,5 +13,6 @@ internal/amd64v1/race_windows.syso built with LLVM 51bfeff0e4b0757ff773da6882f4d + internal/amd64v3/race_linux.syso built with LLVM 51bfeff0e4b0757ff773da6882f4d538996c9b04 and Go e7d582b55dda36e76ce4d0ce770139ca0915b7c5. + race_darwin_arm64.syso built with LLVM 51bfeff0e4b0757ff773da6882f4d538996c9b04 and Go e7d582b55dda36e76ce4d0ce770139ca0915b7c5. 
+ race_linux_arm64.syso built with LLVM 51bfeff0e4b0757ff773da6882f4d538996c9b04 and Go e7d582b55dda36e76ce4d0ce770139ca0915b7c5. +-race_linux_ppc64le.syso built with LLVM 51bfeff0e4b0757ff773da6882f4d538996c9b04 and Go e7d582b55dda36e76ce4d0ce770139ca0915b7c5. ++race_linux_loong64.syso built with LLVM 9d3fbf97bef3f19da4e0a047f017b8142f59b3fd and Go 988b718f4130ab5b3ce5a5774e1a58e83c92a163. ++race_linux_ppc64le.syso built with LLVM 41cb504b7c4b18ac15830107431a0c1eec73a6b2 and Go 851ecea4cc99ab276109493477b2c7e30c253ea8. + race_linux_s390x.syso built with LLVM 51bfeff0e4b0757ff773da6882f4d538996c9b04 and Go e7d582b55dda36e76ce4d0ce770139ca0915b7c5. +diff --git a/src/runtime/race/race.go b/src/runtime/race/race.go +index 9c508ebc2b..9fd75424ca 100644 +--- a/src/runtime/race/race.go ++++ b/src/runtime/race/race.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build race && ((linux && (amd64 || arm64 || ppc64le || s390x)) || ((freebsd || netbsd || openbsd || windows) && amd64)) ++//go:build race && ((linux && (amd64 || arm64 || loong64 || ppc64le || s390x)) || ((freebsd || netbsd || openbsd || windows) && amd64)) + + package race + +diff --git a/src/runtime/race_loong64.s b/src/runtime/race_loong64.s +new file mode 100644 +index 0000000000..0512efc045 +--- /dev/null ++++ b/src/runtime/race_loong64.s +@@ -0,0 +1,509 @@ ++// Copyright 2023 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++//go:build race ++ ++#include "go_asm.h" ++#include "funcdata.h" ++#include "textflag.h" ++#include "cgo/abi_loong64.h" ++ ++// The following thunks allow calling the gcc-compiled race runtime directly ++// from Go code without going all the way through cgo. ++// First, it's much faster (up to 50% speedup for real Go programs). ++// Second, it eliminates race-related special cases from cgocall and scheduler. ++// Third, in long-term it will allow to remove cyclic runtime/race dependency on cmd/go. ++ ++// A brief recap of the loong64 calling convention. ++// Arguments are passed in R4...R11, the rest is on stack. ++// Callee-saved registers are: R23...R30. ++// Temporary registers are: R12...R20 ++// SP must be 16-byte aligned. ++ ++// When calling racecalladdr, R20 is the call target address. ++ ++// The race ctx, ThreadState *thr below, is passed in R4 and loaded in racecalladdr. ++ ++// Load g from TLS. (See tls_loong64.s) ++#define load_g \ ++ MOVV runtime·tls_g(SB), g ++ ++#define RARG0 R4 ++#define RARG1 R5 ++#define RARG2 R6 ++#define RARG3 R7 ++#define RCALL R20 ++ ++// func runtime·raceread(addr uintptr) ++// Called from instrumented code. ++// Defined as ABIInternal so as to avoid introducing a wrapper, ++// which would make caller's PC ineffective. ++TEXT runtime·raceread(SB), NOSPLIT, $0-8 ++#ifdef GOEXPERIMENT_regabiargs ++ MOVV R4, RARG1 ++#else ++ MOVV addr+0(FP), RARG1 ++#endif ++ MOVV R1, RARG2 ++ // void __tsan_read(ThreadState *thr, void *addr, void *pc); ++ MOVV $__tsan_read(SB), RCALL ++ JMP racecalladdr<>(SB) ++ ++// func runtime·RaceRead(addr uintptr) ++TEXT runtime·RaceRead(SB), NOSPLIT, $0-8 ++ // This needs to be a tail call, because raceread reads caller pc. 
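++	// (A JMP rather than JAL leaves R1, the caller's return address,
++	// untouched, so raceread still records the caller of RaceRead.)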
++ JMP runtime·raceread(SB) ++ ++// func runtime·racereadpc(void *addr, void *callpc, void *pc) ++TEXT runtime·racereadpc(SB), NOSPLIT, $0-24 ++ MOVV addr+0(FP), RARG1 ++ MOVV callpc+8(FP), RARG2 ++ MOVV pc+16(FP), RARG3 ++ // void __tsan_read_pc(ThreadState *thr, void *addr, void *callpc, void *pc); ++ MOVV $__tsan_read_pc(SB), RCALL ++ JMP racecalladdr<>(SB) ++ ++// func runtime·racewrite(addr uintptr) ++// Called from instrumented code. ++// Defined as ABIInternal so as to avoid introducing a wrapper, ++// which would make caller's PC ineffective. ++TEXT runtime·racewrite(SB), NOSPLIT, $0-8 ++#ifdef GOEXPERIMENT_regabiargs ++ MOVV R4, RARG1 ++#else ++ MOVV addr+0(FP), RARG1 ++#endif ++ MOVV R1, RARG2 ++ // void __tsan_write(ThreadState *thr, void *addr, void *pc); ++ MOVV $__tsan_write(SB), RCALL ++ JMP racecalladdr<>(SB) ++ ++// func runtime·RaceWrite(addr uintptr) ++TEXT runtime·RaceWrite(SB), NOSPLIT, $0-8 ++ // This needs to be a tail call, because racewrite reads caller pc. ++ JMP runtime·racewrite(SB) ++ ++// func runtime·racewritepc(void *addr, void *callpc, void *pc) ++TEXT runtime·racewritepc(SB), NOSPLIT, $0-24 ++ MOVV addr+0(FP), RARG1 ++ MOVV callpc+8(FP), RARG2 ++ MOVV pc+16(FP), RARG3 ++ // void __tsan_write_pc(ThreadState *thr, void *addr, void *callpc, void *pc); ++ MOVV $__tsan_write_pc(SB), RCALL ++ JMP racecalladdr<>(SB) ++ ++// func runtime·racereadrange(addr, size uintptr) ++// Called from instrumented code. ++// Defined as ABIInternal so as to avoid introducing a wrapper, ++// which would make caller's PC ineffective. ++TEXT runtime·racereadrange(SB), NOSPLIT, $0-16 ++#ifdef GOEXPERIMENT_regabiargs ++ MOVV R5, RARG2 ++ MOVV R4, RARG1 ++#else ++ MOVV addr+0(FP), RARG1 ++ MOVV size+8(FP), RARG2 ++#endif ++ MOVV R1, RARG3 ++ // void __tsan_read_range(ThreadState *thr, void *addr, uintptr size, void *pc); ++ MOVV $__tsan_read_range(SB), RCALL ++ JMP racecalladdr<>(SB) ++ ++// func runtime·RaceReadRange(addr, size uintptr) ++TEXT runtime·RaceReadRange(SB), NOSPLIT, $0-16 ++ // This needs to be a tail call, because racereadrange reads caller pc. ++ JMP runtime·racereadrange(SB) ++ ++// func runtime·racereadrangepc1(void *addr, uintptr sz, void *pc) ++TEXT runtime·racereadrangepc1(SB), NOSPLIT, $0-24 ++ MOVV addr+0(FP), RARG1 ++ MOVV size+8(FP), RARG2 ++ MOVV pc+16(FP), RARG3 ++ ADDV $4, RARG3 // pc is function start, tsan wants return address. ++ // void __tsan_read_range(ThreadState *thr, void *addr, uintptr size, void *pc); ++ MOVV $__tsan_read_range(SB), RCALL ++ JMP racecalladdr<>(SB) ++ ++// func runtime·racewriterange(addr, size uintptr) ++// Called from instrumented code. ++// Defined as ABIInternal so as to avoid introducing a wrapper, ++// which would make caller's PC ineffective. ++TEXT runtime·racewriterange(SB), NOSPLIT, $0-16 ++#ifdef GOEXPERIMENT_regabiargs ++ MOVV R5, RARG2 ++ MOVV R4, RARG1 ++#else ++ MOVV addr+0(FP), RARG1 ++ MOVV size+8(FP), RARG2 ++#endif ++ MOVV R1, RARG3 ++ // void __tsan_write_range(ThreadState *thr, void *addr, uintptr size, void *pc); ++ MOVV $__tsan_write_range(SB), RCALL ++ JMP racecalladdr<>(SB) ++ ++// func runtime·RaceWriteRange(addr, size uintptr) ++TEXT runtime·RaceWriteRange(SB), NOSPLIT, $0-16 ++ // This needs to be a tail call, because racewriterange reads caller pc. 
++ JMP runtime·racewriterange(SB) ++ ++// func runtime·racewriterangepc1(void *addr, uintptr sz, void *pc) ++TEXT runtime·racewriterangepc1(SB), NOSPLIT, $0-24 ++ MOVV addr+0(FP), RARG1 ++ MOVV size+8(FP), RARG2 ++ MOVV pc+16(FP), RARG3 ++ ADDV $4, RARG3 // pc is function start, tsan wants return address. ++ // void __tsan_write_range(ThreadState *thr, void *addr, uintptr size, void *pc); ++ MOVV $__tsan_write_range(SB), RCALL ++ JMP racecalladdr<>(SB) ++ ++// Call a __tsan function from Go code. ++// ++// RCALL = tsan function address ++// RARG0 = *ThreadState a.k.a. g_racectx from g ++// RARG1 = addr passed to __tsan function ++// ++// If addr (RARG1) is out of range, do nothing. Otherwise, setup goroutine ++// context and invoke racecall. Other arguments already set. ++TEXT racecalladdr<>(SB), NOSPLIT, $0-0 ++ // Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend). ++ MOVV runtime·racearenastart(SB), R12 ++ BLT RARG1, R12, data ++ MOVV runtime·racearenaend(SB), R12 ++ BLT RARG1, R12, call ++data: ++ MOVV runtime·racedatastart(SB), R12 ++ BLT RARG1, R12, ret ++ MOVV runtime·racedataend(SB), R12 ++ BGE RARG1, R12, ret ++call: ++ load_g ++ MOVV g_racectx(g), RARG0 ++ JMP racecall<>(SB) ++ret: ++ RET ++ ++// func runtime·racefuncenter(pc uintptr) ++// Called from instrumented code. ++TEXT runtime·racefuncenter(SB), NOSPLIT, $0-8 ++#ifdef GOEXPERIMENT_regabiargs ++ MOVV R4, RCALL ++#else ++ MOVV callpc+0(FP), RCALL ++#endif ++ JMP racefuncenter<>(SB) ++ ++// Common code for racefuncenter ++// RCALL = caller's return address ++TEXT racefuncenter<>(SB), NOSPLIT, $0-0 ++ load_g ++ MOVV g_racectx(g), RARG0 // goroutine racectx ++ MOVV RCALL, RARG1 ++ // void __tsan_func_enter(ThreadState *thr, void *pc); ++ MOVV $__tsan_func_enter(SB), RCALL ++ JAL racecall<>(SB) ++ RET ++ ++// func runtime·racefuncexit() ++// Called from instrumented code. ++TEXT runtime·racefuncexit(SB), NOSPLIT, $0-0 ++ load_g ++ MOVV g_racectx(g), RARG0 // race context ++ // void __tsan_func_exit(ThreadState *thr); ++ MOVV $__tsan_func_exit(SB), RCALL ++ JMP racecall<>(SB) ++ ++// Atomic operations for sync/atomic package. 
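++// Each entry point below loads the address of the matching
++// __tsan_go_atomicNN thunk into RCALL and delegates to racecallatomic,
++// which range-checks the target address before calling into TSan.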
++// R7 = addr of arguments passed to this function, it can
++// be fetched at 24(R3) in racecallatomic after two times JAL
++// RARG0, RARG1, RARG2 set in racecallatomic
++
++// Load
++TEXT sync∕atomic·LoadInt32(SB), NOSPLIT, $0-12
++	GO_ARGS
++	MOVV $__tsan_go_atomic32_load(SB), RCALL
++	JAL racecallatomic<>(SB)
++	RET
++
++TEXT sync∕atomic·LoadInt64(SB), NOSPLIT, $0-16
++	GO_ARGS
++	MOVV $__tsan_go_atomic64_load(SB), RCALL
++	JAL racecallatomic<>(SB)
++	RET
++
++TEXT sync∕atomic·LoadUint32(SB), NOSPLIT, $0-12
++	GO_ARGS
++	JMP sync∕atomic·LoadInt32(SB)
++
++TEXT sync∕atomic·LoadUint64(SB), NOSPLIT, $0-16
++	GO_ARGS
++	JMP sync∕atomic·LoadInt64(SB)
++
++TEXT sync∕atomic·LoadUintptr(SB), NOSPLIT, $0-16
++	GO_ARGS
++	JMP sync∕atomic·LoadInt64(SB)
++
++TEXT sync∕atomic·LoadPointer(SB), NOSPLIT, $0-16
++	GO_ARGS
++	JMP sync∕atomic·LoadInt64(SB)
++
++// Store
++TEXT sync∕atomic·StoreInt32(SB), NOSPLIT, $0-12
++	GO_ARGS
++	MOVV $__tsan_go_atomic32_store(SB), RCALL
++	JAL racecallatomic<>(SB)
++	RET
++
++TEXT sync∕atomic·StoreInt64(SB), NOSPLIT, $0-16
++	GO_ARGS
++	MOVV $__tsan_go_atomic64_store(SB), RCALL
++	JAL racecallatomic<>(SB)
++	RET
++
++TEXT sync∕atomic·StoreUint32(SB), NOSPLIT, $0-12
++	GO_ARGS
++	JMP sync∕atomic·StoreInt32(SB)
++
++TEXT sync∕atomic·StoreUint64(SB), NOSPLIT, $0-16
++	GO_ARGS
++	JMP sync∕atomic·StoreInt64(SB)
++
++TEXT sync∕atomic·StoreUintptr(SB), NOSPLIT, $0-16
++	GO_ARGS
++	JMP sync∕atomic·StoreInt64(SB)
++
++// Swap
++TEXT sync∕atomic·SwapInt32(SB), NOSPLIT, $0-20
++	GO_ARGS
++	MOVV $__tsan_go_atomic32_exchange(SB), RCALL
++	JAL racecallatomic<>(SB)
++	RET
++
++TEXT sync∕atomic·SwapInt64(SB), NOSPLIT, $0-24
++	GO_ARGS
++	MOVV $__tsan_go_atomic64_exchange(SB), RCALL
++	JAL racecallatomic<>(SB)
++	RET
++
++TEXT sync∕atomic·SwapUint32(SB), NOSPLIT, $0-20
++	GO_ARGS
++	JMP sync∕atomic·SwapInt32(SB)
++
++TEXT sync∕atomic·SwapUint64(SB), NOSPLIT, $0-24
++	GO_ARGS
++	JMP sync∕atomic·SwapInt64(SB)
++
++TEXT sync∕atomic·SwapUintptr(SB), NOSPLIT, $0-24
++	GO_ARGS
++	JMP sync∕atomic·SwapInt64(SB)
++
++// Add
++TEXT sync∕atomic·AddInt32(SB), NOSPLIT, $0-20
++	GO_ARGS
++	MOVV $__tsan_go_atomic32_fetch_add(SB), RCALL
++	JAL racecallatomic<>(SB)
++	MOVW add+8(FP), RARG0	// convert fetch_add to add_fetch
++	MOVW ret+16(FP), RARG1
++	ADD RARG0, RARG1, RARG0
++	MOVW RARG0, ret+16(FP)
++	RET
++
++TEXT sync∕atomic·AddInt64(SB), NOSPLIT, $0-24
++	GO_ARGS
++	MOVV $__tsan_go_atomic64_fetch_add(SB), RCALL
++	JAL racecallatomic<>(SB)
++	MOVV add+8(FP), RARG0	// convert fetch_add to add_fetch
++	MOVV ret+16(FP), RARG1
++	ADDV RARG0, RARG1, RARG0
++	MOVV RARG0, ret+16(FP)
++	RET
++
++TEXT sync∕atomic·AddUint32(SB), NOSPLIT, $0-20
++	GO_ARGS
++	JMP sync∕atomic·AddInt32(SB)
++
++TEXT sync∕atomic·AddUint64(SB), NOSPLIT, $0-24
++	GO_ARGS
++	JMP sync∕atomic·AddInt64(SB)
++
++TEXT sync∕atomic·AddUintptr(SB), NOSPLIT, $0-24
++	GO_ARGS
++	JMP sync∕atomic·AddInt64(SB)
++
++// CompareAndSwap
++TEXT sync∕atomic·CompareAndSwapInt32(SB), NOSPLIT, $0-17
++	GO_ARGS
++	MOVV $__tsan_go_atomic32_compare_exchange(SB), RCALL
++	JAL racecallatomic<>(SB)
++	RET
++
++TEXT sync∕atomic·CompareAndSwapInt64(SB), NOSPLIT, $0-25
++	GO_ARGS
++	MOVV $__tsan_go_atomic64_compare_exchange(SB), RCALL
++	JAL racecallatomic<>(SB)
++	RET
++
++TEXT sync∕atomic·CompareAndSwapUint32(SB), NOSPLIT, $0-17
++	GO_ARGS
++	JMP sync∕atomic·CompareAndSwapInt32(SB)
++
++TEXT sync∕atomic·CompareAndSwapUint64(SB), NOSPLIT, $0-25
++	GO_ARGS
++	JMP sync∕atomic·CompareAndSwapInt64(SB)
++
++TEXT sync∕atomic·CompareAndSwapUintptr(SB), NOSPLIT, 
$0-25 ++ GO_ARGS ++ JMP sync∕atomic·CompareAndSwapInt64(SB) ++ ++// Generic atomic operation implementation. ++// RCALL = addr of target function ++TEXT racecallatomic<>(SB), NOSPLIT, $0 ++ // Set up these registers ++ // RARG0 = *ThreadState ++ // RARG1 = caller pc ++ // RARG2 = pc ++ // RARG3 = addr of incoming arg list ++ ++ // Trigger SIGSEGV early. ++ MOVV 24(R3), RARG3 // 1st arg is addr. after two times JAL, get it at 24(R3) ++ MOVB (RARG3), R12 // segv here if addr is bad ++ ++ // Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend). ++ MOVV runtime·racearenastart(SB), R12 ++ BLT RARG3, R12, racecallatomic_data ++ MOVV runtime·racearenaend(SB), R12 ++ BLT RARG3, R12, racecallatomic_ok ++ ++racecallatomic_data: ++ MOVV runtime·racedatastart(SB), R12 ++ BLT RARG3, R12, racecallatomic_ignore ++ MOVV runtime·racedataend(SB), R12 ++ BGE RARG3, R12, racecallatomic_ignore ++ ++racecallatomic_ok: ++ // Addr is within the good range, call the atomic function. ++ load_g ++ MOVV g_racectx(g), RARG0 // goroutine context ++ MOVV 8(R3), RARG1 // caller pc ++ MOVV RCALL, RARG2 // pc ++ ADDV $24, R3, RARG3 ++ JAL racecall<>(SB) // does not return ++ RET ++ ++racecallatomic_ignore: ++ // Addr is outside the good range. ++ // Call __tsan_go_ignore_sync_begin to ignore synchronization during the atomic op. ++ // An attempt to synchronize on the address would cause crash. ++ MOVV RCALL, R25 // remember the original function ++ MOVV $__tsan_go_ignore_sync_begin(SB), RCALL ++ load_g ++ MOVV g_racectx(g), RARG0 // goroutine context ++ JAL racecall<>(SB) ++ MOVV R25, RCALL // restore the original function ++ ++ // Call the atomic function. ++ // racecall will call LLVM race code which might clobber R22 (g) ++ load_g ++ MOVV g_racectx(g), RARG0 // goroutine context ++ MOVV 8(R3), RARG1 // caller pc ++ MOVV RCALL, RARG2 // pc ++ ADDV $24, R3, RARG3 // arguments ++ JAL racecall<>(SB) ++ ++ // Call __tsan_go_ignore_sync_end. ++ MOVV $__tsan_go_ignore_sync_end(SB), RCALL ++ MOVV g_racectx(g), RARG0 // goroutine context ++ JAL racecall<>(SB) ++ RET ++ ++// func runtime·racecall(void(*f)(...), ...) ++// Calls C function f from race runtime and passes up to 4 arguments to it. ++// The arguments are never heap-object-preserving pointers, so we pretend there are no arguments. ++TEXT runtime·racecall(SB), NOSPLIT, $0-0 ++ MOVV fn+0(FP), RCALL ++ MOVV arg0+8(FP), RARG0 ++ MOVV arg1+16(FP), RARG1 ++ MOVV arg2+24(FP), RARG2 ++ MOVV arg3+32(FP), RARG3 ++ JMP racecall<>(SB) ++ ++// Switches SP to g0 stack and calls (RCALL). Arguments already set. ++TEXT racecall<>(SB), NOSPLIT|NOFRAME, $0-0 ++ MOVV g_m(g), R12 ++ // Switch to g0 stack. ++ MOVV R3, R23 // callee-saved, preserved across the CALL ++ MOVV R1, R24 // callee-saved, preserved across the CALL ++ MOVV m_g0(R12), R13 ++ BEQ R13, g, call // already on g0 ++ MOVV (g_sched+gobuf_sp)(R13), R3 ++call: ++ JAL (RCALL) ++ MOVV R23, R3 ++ JAL (R24) ++ RET ++ ++// C->Go callback thunk that allows to call runtime·racesymbolize from C code. ++// Direct Go->C race call has only switched SP, finish g->g0 switch by setting correct g. ++// The overall effect of Go->C->Go call chain is similar to that of mcall. ++// RARG0 contains command code. RARG1 contains command-specific context. ++// See racecallback for command codes. ++TEXT runtime·racecallbackthunk(SB), NOSPLIT|NOFRAME, $0 ++ // Handle command raceGetProcCmd (0) here. ++ // First, code below assumes that we are on curg, while raceGetProcCmd ++ // can be executed on g0. 
Second, it is called frequently, so will ++ // benefit from this fast path. ++ BNE RARG0, R0, rest ++ MOVV g, R15 ++ load_g ++ MOVV g_m(g), RARG0 ++ MOVV m_p(RARG0), RARG0 ++ MOVV p_raceprocctx(RARG0), RARG0 ++ MOVV RARG0, (RARG1) ++ MOVV R15, g ++ JMP (R1) ++rest: ++ // Save callee-saved registers (Go code won't respect that). ++ // 8(R3) and 16(R3) are for args passed through racecallback ++ ADDV $-176, R3 ++ MOVV R1, 0(R3) ++ ++ SAVE_R22_TO_R31(8*3) ++ SAVE_F24_TO_F31(8*13) ++ // Set g = g0. ++ load_g ++ MOVV g_m(g), R15 ++ MOVV m_g0(R15), R14 ++ BEQ R14, g, noswitch // branch if already on g0 ++ MOVV R14, g ++ ++#ifdef GOEXPERIMENT_regabiargs ++ JAL runtime·racecallback(SB) ++#else ++ MOVV RARG0, 8(R3) // func arg ++ MOVV RARG1, 16(R3) // func arg ++ JAL runtime·racecallback(SB) ++#endif ++ // All registers are smashed after Go code, reload. ++ MOVV g_m(g), R15 ++ MOVV m_curg(R15), g // g = m->curg ++ret: ++ // Restore callee-saved registers. ++ MOVV 0(R3), R1 ++ RESTORE_F24_TO_F31(8*13) ++ RESTORE_R22_TO_R31(8*3) ++ ADDV $176, R3 ++ JMP (R1) ++ ++noswitch: ++ // already on g0 ++#ifdef GOEXPERIMENT_regabiargs ++ JAL runtime·racecallback(SB) ++#else ++ MOVV RARG0, 8(R3) // func arg ++ MOVV RARG1, 16(R3) // func arg ++ JAL runtime·racecallback(SB) ++#endif ++ JMP ret ++ ++// tls_g, g value for each thread in TLS ++GLOBL runtime·tls_g+0(SB), TLSBSS+DUPOK, $8 +-- +2.38.1 + diff --git a/0004-runtime-delete-on-register-ABI-fallback-path-for-rac.patch b/0004-runtime-delete-on-register-ABI-fallback-path-for-rac.patch new file mode 100644 index 0000000000000000000000000000000000000000..54922d9071f7d3b9635639e6d87b4e67ab1d5546 --- /dev/null +++ b/0004-runtime-delete-on-register-ABI-fallback-path-for-rac.patch @@ -0,0 +1,111 @@ +From 5623cd585fd5891d1f6d6d93256e4252b95b9dae Mon Sep 17 00:00:00 2001 +From: Guoqi Chen +Date: Mon, 6 Nov 2023 17:13:43 +0800 +Subject: [PATCH 04/44] runtime: delete on-register ABI fallback path for race + of loong64 + +Co-authored-by: Xiaolin Zhao +Change-Id: Ie8c4a137205e29dd7dc63825f502b1f6b2f1c205 +--- + src/runtime/race_loong64.s | 34 ---------------------------------- + 1 file changed, 34 deletions(-) + +diff --git a/src/runtime/race_loong64.s b/src/runtime/race_loong64.s +index 0512efc045..04f264b21b 100644 +--- a/src/runtime/race_loong64.s ++++ b/src/runtime/race_loong64.s +@@ -40,11 +40,7 @@ + // Defined as ABIInternal so as to avoid introducing a wrapper, + // which would make caller's PC ineffective. + TEXT runtime·raceread(SB), NOSPLIT, $0-8 +-#ifdef GOEXPERIMENT_regabiargs + MOVV R4, RARG1 +-#else +- MOVV addr+0(FP), RARG1 +-#endif + MOVV R1, RARG2 + // void __tsan_read(ThreadState *thr, void *addr, void *pc); + MOVV $__tsan_read(SB), RCALL +@@ -69,11 +65,7 @@ TEXT runtime·racereadpc(SB), NOSPLIT, $0-24 + // Defined as ABIInternal so as to avoid introducing a wrapper, + // which would make caller's PC ineffective. + TEXT runtime·racewrite(SB), NOSPLIT, $0-8 +-#ifdef GOEXPERIMENT_regabiargs + MOVV R4, RARG1 +-#else +- MOVV addr+0(FP), RARG1 +-#endif + MOVV R1, RARG2 + // void __tsan_write(ThreadState *thr, void *addr, void *pc); + MOVV $__tsan_write(SB), RCALL +@@ -98,13 +90,8 @@ TEXT runtime·racewritepc(SB), NOSPLIT, $0-24 + // Defined as ABIInternal so as to avoid introducing a wrapper, + // which would make caller's PC ineffective. 
+ TEXT runtime·racereadrange(SB), NOSPLIT, $0-16 +-#ifdef GOEXPERIMENT_regabiargs + MOVV R5, RARG2 + MOVV R4, RARG1 +-#else +- MOVV addr+0(FP), RARG1 +- MOVV size+8(FP), RARG2 +-#endif + MOVV R1, RARG3 + // void __tsan_read_range(ThreadState *thr, void *addr, uintptr size, void *pc); + MOVV $__tsan_read_range(SB), RCALL +@@ -130,13 +117,8 @@ TEXT runtime·racereadrangepc1(SB), NOSPLIT, $0-24 + // Defined as ABIInternal so as to avoid introducing a wrapper, + // which would make caller's PC ineffective. + TEXT runtime·racewriterange(SB), NOSPLIT, $0-16 +-#ifdef GOEXPERIMENT_regabiargs + MOVV R5, RARG2 + MOVV R4, RARG1 +-#else +- MOVV addr+0(FP), RARG1 +- MOVV size+8(FP), RARG2 +-#endif + MOVV R1, RARG3 + // void __tsan_write_range(ThreadState *thr, void *addr, uintptr size, void *pc); + MOVV $__tsan_write_range(SB), RCALL +@@ -186,11 +168,7 @@ ret: + // func runtime·racefuncenter(pc uintptr) + // Called from instrumented code. + TEXT runtime·racefuncenter(SB), NOSPLIT, $0-8 +-#ifdef GOEXPERIMENT_regabiargs + MOVV R4, RCALL +-#else +- MOVV callpc+0(FP), RCALL +-#endif + JMP racefuncenter<>(SB) + + // Common code for racefuncenter +@@ -476,13 +454,7 @@ rest: + BEQ R14, g, noswitch // branch if already on g0 + MOVV R14, g + +-#ifdef GOEXPERIMENT_regabiargs + JAL runtime·racecallback(SB) +-#else +- MOVV RARG0, 8(R3) // func arg +- MOVV RARG1, 16(R3) // func arg +- JAL runtime·racecallback(SB) +-#endif + // All registers are smashed after Go code, reload. + MOVV g_m(g), R15 + MOVV m_curg(R15), g // g = m->curg +@@ -496,13 +468,7 @@ ret: + + noswitch: + // already on g0 +-#ifdef GOEXPERIMENT_regabiargs + JAL runtime·racecallback(SB) +-#else +- MOVV RARG0, 8(R3) // func arg +- MOVV RARG1, 16(R3) // func arg +- JAL runtime·racecallback(SB) +-#endif + JMP ret + + // tls_g, g value for each thread in TLS +-- +2.38.1 + diff --git a/0005-cmd-internal-obj-loong64-remove-unused-register-alia.patch b/0005-cmd-internal-obj-loong64-remove-unused-register-alia.patch new file mode 100644 index 0000000000000000000000000000000000000000..34a43a06ff29f3746fc70132c426bacdfed426e9 --- /dev/null +++ b/0005-cmd-internal-obj-loong64-remove-unused-register-alia.patch @@ -0,0 +1,27 @@ +From 2ecb3ca09093ce12b2e47d97cbff223a950de0bb Mon Sep 17 00:00:00 2001 +From: Guoqi Chen +Date: Thu, 16 Nov 2023 17:28:46 +0800 +Subject: [PATCH 05/44] cmd/internal/obj/loong64: remove unused register alias + definitions + +Change-Id: Ie788747372cd47cb3780e75b35750bb08bd166fc +--- + src/cmd/internal/obj/loong64/a.out.go | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index e6984dcba7..53b005af4d 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -225,8 +225,6 @@ const ( + REGZERO = REG_R0 // set to zero + REGLINK = REG_R1 + REGSP = REG_R3 +- REGRET = REG_R20 // not use +- REGARG = -1 // -1 disables passing the first argument in register + REGRT1 = REG_R20 // reserved for runtime, duffzero and duffcopy + REGRT2 = REG_R21 // reserved for runtime, duffcopy + REGCTXT = REG_R29 // context for closures +-- +2.38.1 + diff --git a/0006-internal-bytealg-optimize-IndexByte-and-IndexByteStr.patch b/0006-internal-bytealg-optimize-IndexByte-and-IndexByteStr.patch new file mode 100644 index 0000000000000000000000000000000000000000..b295cb6d9be8b060f1b9ce2adf084cc77b189724 --- /dev/null +++ b/0006-internal-bytealg-optimize-IndexByte-and-IndexByteStr.patch @@ -0,0 +1,160 @@ +From 0b580e45412ffc11f3a1c7ed7165f7a81e51adec 
Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Fri, 17 May 2024 17:10:59 +0800 +Subject: [PATCH 06/44] internal/bytealg: optimize IndexByte and + IndexByteString function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3C5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: bytes +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_indexbyte.log │ test/new_3c5000_indexbyte.log │ + │ sec/op │ sec/op vs base │ +IndexByte/10 19.72n ± 0% 13.72n ± 0% -30.44% (p=0.000 n=20) +IndexByte/32 58.27n ± 0% 21.54n ± 0% -63.04% (p=0.000 n=20) +IndexByte/4K 5.609µ ± 0% 2.349µ ± 0% -58.13% (p=0.000 n=20) +IndexByte/4M 3.844m ± 2% 2.408m ± 1% -37.36% (p=0.000 n=20) +IndexByte/64M 62.38m ± 0% 41.83m ± 2% -32.94% (p=0.000 n=20) +geomean 17.29µ 9.309µ -46.17% + +Change-Id: I9d60af0196a0078e829669ccd88f93b5f7a5db0a +--- + src/internal/bytealg/indexbyte_loong64.s | 105 ++++++++++++++++++----- + 1 file changed, 82 insertions(+), 23 deletions(-) + +diff --git a/src/internal/bytealg/indexbyte_loong64.s b/src/internal/bytealg/indexbyte_loong64.s +index c9591b3cda..7811741423 100644 +--- a/src/internal/bytealg/indexbyte_loong64.s ++++ b/src/internal/bytealg/indexbyte_loong64.s +@@ -10,41 +10,100 @@ TEXT ·IndexByte(SB),NOSPLIT,$0-40 + // R5 = b_len + // R6 = b_cap (unused) + // R7 = byte to find +- AND $0xff, R7 ++ ADDV R4, R5 // end + MOVV R4, R6 // store base for later ++ AND $0xff, R7 ++ JMP indexbytebody<>(SB) ++ ++TEXT ·IndexByteString(SB),NOSPLIT,$0-32 ++ // R4 = s_base ++ // R5 = s_len ++ // R6 = byte to find ++ AND $0xff, R6, R7 + ADDV R4, R5 // end +- ADDV $-1, R4 ++ MOVV R4, R6 // store base for later ++ JMP indexbytebody<>(SB) + +- PCALIGN $16 ++// input: ++// R4: b_base ++// R5: end ++// R6: store base for later ++// R7: byte to find ++TEXT indexbytebody<>(SB),NOSPLIT,$0 + loop: ++ ADDV $8, R4, R10 ++ BLT R5, R10, tail ++ MOVV (R4), R8 ++ ++ AND $0xff, R8, R9 ++ BEQ R7, R9, found ++ ++ WORD $0xcf2109 // bstrpick.w r9, r8, 15, 8 ++ BEQ R7, R9, byte_1th ++ ++ WORD $0xd74109 // bstrpick.w r9, r8, 23, 16 ++ BEQ R7, R9, byte_2th ++ ++ WORD $0xdf6109 // bstrpick.w r9, r8, 31, 24 ++ BEQ R7, R9, byte_3th ++ ++ WORD $0xe78109 // bstrpick.w r9, r8, 39, 32 ++ BEQ R7, R9, byte_4th ++ ++ WORD $0xefa109 // bstrpick.w r9, r8, 47, 40 ++ BEQ R7, R9, byte_5th ++ ++ WORD $0xf7c109 // bstrpick.w r9, r8, 55, 48 ++ BEQ R7, R9, byte_6th ++ ++ WORD $0xffe109 // bstrpick.w r9, r8, 63, 56 ++ BEQ R7, R9, byte_7th ++ ++ MOVV R10, R4 ++ JMP loop ++ ++tail: ++ BEQ R4, R5, notfound ++ MOVBU (R4), R8 ++ BEQ R7, R8, found + ADDV $1, R4 +- BEQ R4, R5, notfound +- MOVBU (R4), R8 +- BNE R7, R8, loop ++ JMP tail + +- SUBV R6, R4 // remove base ++byte_1th: ++ ADDV $1, R4 ++ SUBV R6, R4 + RET + +-notfound: +- MOVV $-1, R4 ++byte_2th: ++ ADDV $2, R4 ++ SUBV R6, R4 + RET + +-TEXT ·IndexByteString(SB),NOSPLIT,$0-32 +- // R4 = s_base +- // R5 = s_len +- // R6 = byte to find +- MOVV R4, R7 // store base for later +- ADDV R4, R5 // end +- ADDV $-1, R4 ++byte_3th: ++ ADDV $3, R4 ++ SUBV R6, R4 ++ RET + +- PCALIGN $16 +-loop: +- ADDV $1, R4 +- BEQ R4, R5, notfound +- MOVBU (R4), R8 +- BNE R6, R8, loop ++byte_4th: ++ ADDV $4, R4 ++ SUBV R6, R4 ++ RET ++ ++byte_5th: ++ ADDV $5, R4 ++ SUBV R6, R4 ++ RET + +- SUBV R7, R4 // remove base ++byte_6th: ++ ADDV $6, R4 ++ SUBV R6, R4 ++ RET ++ ++byte_7th: ++ ADDV $7, R4 ++ ++found: ++ SUBV R6, R4 // remove base + RET + + notfound: +-- +2.38.1 + diff --git 
a/0007-internal-bytealg-optimize-memequal-and-memequal_varl.patch b/0007-internal-bytealg-optimize-memequal-and-memequal_varl.patch new file mode 100644 index 0000000000000000000000000000000000000000..7e97b4ae39bffab8dbc8df201de06e31b71fee74 --- /dev/null +++ b/0007-internal-bytealg-optimize-memequal-and-memequal_varl.patch @@ -0,0 +1,142 @@ +From 83f497423050707a8cd27152256699ccd7819456 Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Sat, 18 May 2024 11:00:57 +0800 +Subject: [PATCH 07/44] internal/bytealg: optimize memequal and memequal_varlen + function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +goos: linux +goarch: loong64 +pkg: bytes +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_equal.log │ test/new_3c5000_equal.log │ + │ sec/op │ sec/op vs base │ +Equal/0 0.6824n ± 0% 0.6837n ± 0% +0.20% (p=0.000 n=20) +Equal/1 10.46n ± 0% 12.71n ± 0% +21.46% (p=0.000 n=20) +Equal/6 17.29n ± 0% 19.57n ± 0% +13.22% (p=0.000 n=20) +Equal/9 21.38n ± 0% 13.19n ± 0% -38.31% (p=0.000 n=20) +Equal/15 29.57n ± 0% 21.39n ± 0% -27.68% (p=0.000 n=20) +Equal/16 30.94n ± 0% 10.46n ± 0% -66.19% (p=0.000 n=20) +Equal/20 36.40n ± 0% 16.83n ± 0% -53.76% (p=0.000 n=20) +Equal/32 52.78n ± 0% 12.28n ± 0% -76.73% (p=0.000 n=20) +Equal/4K 5606.0n ± 0% 385.8n ± 0% -93.12% (p=0.000 n=20) +Equal/4M 5728.9µ ± 0% 746.4µ ± 0% -86.97% (p=0.000 n=20) +Equal/64M 92.02m ± 0% 14.13m ± 5% -84.65% (p=0.000 n=20) +EqualBothUnaligned/64_0 98.73n ± 0% 10.04n ± 0% -89.83% (p=0.000 n=20) +EqualBothUnaligned/64_1 98.73n ± 0% 10.29n ± 0% -89.58% (p=0.000 n=20) +EqualBothUnaligned/64_4 98.73n ± 0% 10.29n ± 0% -89.58% (p=0.000 n=20) +EqualBothUnaligned/64_7 98.73n ± 0% 10.28n ± 0% -89.59% (p=0.000 n=20) +EqualBothUnaligned/4096_0 5602.0n ± 0% 365.8n ± 0% -93.47% (p=0.000 n=20) +EqualBothUnaligned/4096_1 5602.0n ± 0% 437.2n ± 0% -92.19% (p=0.000 n=20) +EqualBothUnaligned/4096_4 5602.0n ± 0% 436.4n ± 0% -92.21% (p=0.000 n=20) +EqualBothUnaligned/4096_7 5602.0n ± 0% 439.2n ± 0% -92.16% (p=0.000 n=20) +EqualBothUnaligned/4194304_0 5729.0µ ± 0% 732.4µ ± 0% -87.22% (p=0.000 n=20) +EqualBothUnaligned/4194304_1 5729.2µ ± 0% 781.8µ ± 1% -86.35% (p=0.000 n=20) +EqualBothUnaligned/4194304_4 5729.3µ ± 0% 773.9µ ± 0% -86.49% (p=0.000 n=20) +EqualBothUnaligned/4194304_7 5729.3µ ± 0% 773.9µ ± 5% -86.49% (p=0.000 n=20) +EqualBothUnaligned/67108864_0 92.38m ± 0% 34.61m ± 38% -62.53% (p=0.000 n=20) +EqualBothUnaligned/67108864_1 92.38m ± 0% 33.07m ± 23% -64.20% (p=0.000 n=20) +EqualBothUnaligned/67108864_4 92.38m ± 0% 82.09m ± 32% -11.14% (p=0.000 n=20) +EqualBothUnaligned/67108864_7 92.39m ± 0% 61.47m ± 16% -33.46% (p=0.000 n=20) +geomean 11.86µ 2.654µ -77.62% + +Change-Id: Ib181f532238e6f6d82a3e9e6987abe121688b6eb +--- + src/internal/bytealg/equal_loong64.s | 72 +++++++++++++++++++--------- + 1 file changed, 49 insertions(+), 23 deletions(-) + +diff --git a/src/internal/bytealg/equal_loong64.s b/src/internal/bytealg/equal_loong64.s +index 830b09bd2c..4cc31d5e46 100644 +--- a/src/internal/bytealg/equal_loong64.s ++++ b/src/internal/bytealg/equal_loong64.s +@@ -9,36 +9,62 @@ + + // memequal(a, b unsafe.Pointer, size uintptr) bool + TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25 +- BEQ R4, R5, eq +- ADDV R4, R6, R7 +- PCALIGN $16 +-loop: +- BNE R4, R7, test +- MOVV $1, R4 ++ // R4 = a_base ++ // R5 = b_base ++ // R6 = size ++ JMP equalbody<>(SB) ++ ++// memequal_varlen(a, b unsafe.Pointer) bool ++TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17 ++ // R4 = a_base ++ // R5 = b_base ++ MOVV 
8(REGCTXT), R6 // compiler stores size at offset 8 in the closure ++ JMP equalbody<>(SB) ++ ++TEXT equalbody<>(SB),NOSPLIT|NOFRAME,$0 ++ BEQ R4, R5, eq ++ ADDV R4, R6, R6 // end ++ ++loop_16byte: ++ ADDV $16, R4, R9 ++ BLT R6, R9, load8byte ++ MOVV (R4), R7 ++ MOVV (R5), R8 ++ MOVV 8(R4), R10 ++ MOVV 8(R5), R11 ++ MOVV R9, R4 ++ XOR R7, R8, R7 ++ XOR R10, R11, R10 ++ OR R10, R7, R7 ++ ADDV $16, R5 ++ BEQ R7, loop_16byte ++ ++ MOVB R0, R4 + RET +-test: +- MOVBU (R4), R9 ++ ++load8byte: ++ ADDV $8, R4, R9 ++ BLT R6, R9, tail ++ MOVV (R4), R7 ++ MOVV (R5), R8 ++ MOVV R9, R4 ++ ADDV $8, R5 ++ BEQ R7, R8, tail ++ ++ MOVB R0, R4 ++ RET ++ ++tail: ++ BEQ R4, R6, eq ++ MOVBU (R4), R7 ++ MOVBU (R5), R8 + ADDV $1, R4 +- MOVBU (R5), R10 + ADDV $1, R5 +- BEQ R9, R10, loop ++ BEQ R7, R8, tail + + MOVB R0, R4 + RET +-eq: +- MOVV $1, R4 +- RET + +-// memequal_varlen(a, b unsafe.Pointer) bool +-TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17 +- BEQ R4, R5, eq +- MOVV 8(REGCTXT), R6 // compiler stores size at offset 8 in the closure +- MOVV R4, 8(R3) +- MOVV R5, 16(R3) +- MOVV R6, 24(R3) +- JAL runtime·memequal(SB) +- MOVBU 32(R3), R4 +- RET + eq: + MOVV $1, R4 + RET +-- +2.38.1 + diff --git a/0008-internal-bytealg-optimize-Index-and-IndexString-func.patch b/0008-internal-bytealg-optimize-Index-and-IndexString-func.patch new file mode 100644 index 0000000000000000000000000000000000000000..4fb2113d1dc87b193fc7af7c42f2ab6d9a4a6047 --- /dev/null +++ b/0008-internal-bytealg-optimize-Index-and-IndexString-func.patch @@ -0,0 +1,299 @@ +From 89d740fe5889c558dbb69b6ac3a80ec38cd5765c Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Thu, 23 May 2024 16:25:06 +0800 +Subject: [PATCH 08/44] internal/bytealg: optimize Index and IndexString + function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +goos: linux +goarch: loong64 +pkg: bytes +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_index.log │ test/new_3c5000_index.log │ + │ sec/op │ sec/op vs base │ +Index/10 66.42n ± 0% 20.47n ± 0% -69.18% (p=0.000 n=20) +Index/32 196.1n ± 0% 105.7n ± 0% -46.12% (p=0.000 n=20) +Index/4K 13.622µ ± 0% 5.673µ ± 0% -58.35% (p=0.000 n=20) +Index/4M 14.005m ± 0% 5.734m ± 0% -59.06% (p=0.000 n=20) +Index/64M 224.50m ± 0% 91.94m ± 0% -59.05% (p=0.000 n=20) +IndexEasy/10 21.30n ± 0% 18.66n ± 0% -12.41% (p=0.000 n=20) +IndexEasy/32 41.40n ± 0% 33.91n ± 1% -18.09% (p=0.000 n=20) +IndexEasy/4K 4.141µ ± 4% 2.373µ ± 1% -42.70% (p=0.000 n=20) +IndexEasy/4M 3.830m ± 0% 2.392m ± 0% -37.55% (p=0.000 n=20) +IndexEasy/64M 62.54m ± 1% 39.86m ± 0% -36.26% (p=0.000 n=20) +geomean 29.43µ 15.73µ -46.57% + +goos: linux +goarch: loong64 +pkg: strings +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_indexstring.log │ test/new_3c5000_indexstring.log │ + │ sec/op │ sec/op vs base │ +Index 30.54n ± 0% 16.91n ± 0% -44.64% (p=0.000 n=20) + +Change-Id: I92739ada1637356c6d42761a8a596b0bffec405d +--- + src/internal/bytealg/index_generic.go | 2 +- + src/internal/bytealg/index_loong64.go | 23 ++++ + src/internal/bytealg/index_loong64.s | 190 ++++++++++++++++++++++++++ + src/internal/bytealg/index_native.go | 2 +- + 4 files changed, 215 insertions(+), 2 deletions(-) + create mode 100644 src/internal/bytealg/index_loong64.go + create mode 100644 src/internal/bytealg/index_loong64.s + +diff --git a/src/internal/bytealg/index_generic.go b/src/internal/bytealg/index_generic.go +index a59e32938e..2d89c41825 100644 +--- a/src/internal/bytealg/index_generic.go ++++ b/src/internal/bytealg/index_generic.go 
+@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64 ++//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64 && !loong64 + + package bytealg + +diff --git a/src/internal/bytealg/index_loong64.go b/src/internal/bytealg/index_loong64.go +new file mode 100644 +index 0000000000..d6f43eb32c +--- /dev/null ++++ b/src/internal/bytealg/index_loong64.go +@@ -0,0 +1,23 @@ ++// Copyright 2018 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++package bytealg ++ ++// Empirical data shows that using Index can get better ++// performance when len(s) <= 16. ++const MaxBruteForce = 16 ++ ++func init() { ++ // Optimize cases where the length of the substring is less than 32 bytes ++ MaxLen = 32 ++} ++ ++// Cutover reports the number of failures of IndexByte we should tolerate ++// before switching over to Index. ++// n is the number of bytes processed so far. ++// See the bytes.Index implementation for details. ++func Cutover(n int) int { ++ // 1 error per 8 characters, plus a few slop to start. ++ return (n + 16) / 8 ++} +diff --git a/src/internal/bytealg/index_loong64.s b/src/internal/bytealg/index_loong64.s +new file mode 100644 +index 0000000000..221d0332a4 +--- /dev/null ++++ b/src/internal/bytealg/index_loong64.s +@@ -0,0 +1,190 @@ ++// Copyright 2018 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++#include "go_asm.h" ++#include "textflag.h" ++ ++TEXT ·Index(SB),NOSPLIT,$0-56 ++ MOVV R7, R6 // R6 = separator pointer ++ MOVV R8, R7 // R7 = separator length ++ JMP indexbody<>(SB) ++ ++TEXT ·IndexString(SB),NOSPLIT,$0-40 ++ JMP indexbody<>(SB) ++ ++// input: ++// R4 = string ++// R5 = length ++// R6 = separator pointer ++// R7 = separator length (2 <= len <= 32) ++TEXT indexbody<>(SB),NOSPLIT,$0 ++ // main idea is to load 'sep' into separate register(s) ++ // to avoid repeatedly re-load it again and again ++ // for sebsequent substring comparisons ++ SUBV R7, R5, R8 ++ ADDV $1, R4, R9 // store base for later ++ MOVV $8, R5 ++ ADDV R4, R8 // end ++ BLT R5, R7, len_gt_8 ++ ++len_le_8: ++ AND $0x8, R7, R5 ++ BNE R5, len_8 ++ AND $0x4, R7, R5 ++ BNE R5, len_4_7 ++ ++len_2_3: ++ AND $0x1, R7, R5 ++ BNE R5, len_3 ++ ++len_2: ++ MOVHU (R6), R10 ++loop_2: ++ BLT R8, R4, not_found ++ MOVHU (R4), R11 ++ ADDV $1, R4 ++ BNE R10, R11, loop_2 ++ JMP found ++ ++len_3: ++ MOVHU (R6), R10 ++ MOVBU 2(R6), R11 ++loop_3: ++ BLT R8, R4, not_found ++ MOVHU (R4), R12 ++ ADDV $1, R4 ++ BNE R10, R12, loop_3 ++ MOVBU 1(R4), R12 ++ BNE R11, R12, loop_3 ++ JMP found ++ ++len_4_7: ++ AND $0x2, R7, R5 ++ BNE R5, len_6_7 ++ AND $0x1, R7, R5 ++ BNE R5, len_5 ++ ++len_4: ++ MOVWU (R6), R10 ++loop_4: ++ BLT R8, R4, not_found ++ MOVWU (R4), R11 ++ ADDV $1, R4 ++ BNE R10, R11, loop_4 ++ JMP found ++len_5: ++ MOVWU (R6), R10 ++ MOVBU 4(R6), R11 ++loop_5: ++ BLT R8, R4, not_found ++ MOVWU (R4), R12 ++ ADDV $1, R4 ++ BNE R10, R12, loop_5 ++ MOVBU 3(R4), R12 ++ BNE R11, R12, loop_5 ++ JMP found ++ ++len_6_7: ++ AND $0x1, R7, R5 ++ BNE R5, len_7 ++ ++len_6: ++ MOVWU (R6), R10 ++ MOVHU 4(R6), R11 ++loop_6: ++ BLT R8, R4, not_found ++ MOVWU (R4), R12 ++ ADDV $1, R4 ++ BNE R10, R12, loop_6 ++ MOVHU 3(R4), R12 ++ BNE R11, R12, loop_6 ++ JMP found ++ ++len_7: ++ MOVWU (R6), R10 ++ MOVWU 3(R6), R11 ++loop_7: ++ 
BLT R8, R4, not_found ++ MOVWU (R4), R12 ++ ADDV $1, R4 ++ BNE R10, R12, loop_7 ++ MOVWU 2(R4), R12 ++ BNE R11, R12, loop_7 ++ JMP found ++ ++len_8: ++ MOVV (R6), R10 ++loop_8: ++ BLT R8, R4, not_found ++ MOVV (R4), R11 ++ ADDV $1, R4 ++ BNE R10, R11, loop_8 ++ JMP found ++ ++len_gt_8: ++ MOVV $16, R5 ++ BLT R5, R7, len_gt_16 ++ ++len_9_16: ++ MOVV (R6), R10 ++ SUBV $8, R7 ++ MOVV (R6)(R7), R11 ++ SUBV $1, R7 ++loop_9_16: ++ BLT R8, R4, not_found ++ MOVV (R4), R12 ++ ADDV $1, R4 ++ BNE R10, R12, loop_9_16 ++ MOVV (R4)(R7), R12 ++ BNE R11, R12, loop_9_16 ++ JMP found ++ ++len_gt_16: ++ MOVV $24, R5 ++ BLT R5, R7, len_25_32 ++ ++len_17_24: ++ MOVV (R6), R10 ++ SUBV $8, R7 ++ MOVV 8(R6), R11 ++ MOVV (R6)(R7), R12 ++ SUBV $1, R7 ++loop_17_24: ++ BLT R8, R4, not_found ++ MOVV (R4), R13 ++ ADDV $1, R4 ++ BNE R10, R13, loop_17_24 ++ MOVV 7(R4), R13 ++ BNE R11, R13, loop_17_24 ++ MOVV (R4)(R7), R13 ++ BNE R12, R13, loop_17_24 ++ JMP found ++ ++len_25_32: ++ MOVV (R6), R10 ++ SUBV $8, R7 ++ MOVV 8(R6), R11 ++ MOVV 16(R6), R12 ++ MOVV (R6)(R7), R13 ++ SUBV $1, R7 ++loop_25_32: ++ BLT R8, R4, not_found ++ MOVV (R4), R14 ++ ADDV $1, R4 ++ BNE R10, R14, loop_25_32 ++ MOVV 7(R4), R14 ++ BNE R11, R14, loop_25_32 ++ MOVV 15(R4), R14 ++ BNE R12, R14, loop_25_32 ++ MOVV (R4)(R7), R14 ++ BNE R13, R14, loop_25_32 ++ JMP found ++ ++found: ++ SUBV R9, R4 ++ RET ++ ++not_found: ++ MOVV $-1, R4 ++ RET +diff --git a/src/internal/bytealg/index_native.go b/src/internal/bytealg/index_native.go +index 59c93f9d12..7aadaabe4e 100644 +--- a/src/internal/bytealg/index_native.go ++++ b/src/internal/bytealg/index_native.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build amd64 || arm64 || s390x || ppc64le || ppc64 ++//go:build amd64 || arm64 || s390x || ppc64le || ppc64 || loong64 + + package bytealg + +-- +2.38.1 + diff --git a/0009-internal-bytealg-optimize-Count-and-CountString-func.patch b/0009-internal-bytealg-optimize-Count-and-CountString-func.patch new file mode 100644 index 0000000000000000000000000000000000000000..9b99d37cca346311046f3085e4133441b5f321ae --- /dev/null +++ b/0009-internal-bytealg-optimize-Count-and-CountString-func.patch @@ -0,0 +1,153 @@ +From 37c73e45ea537b7e8662b968b630a2566b25ae59 Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Wed, 29 May 2024 10:49:41 +0800 +Subject: [PATCH 09/44] internal/bytealg: optimize Count and CountString + function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3C5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: bytes +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_count.log │ test/new_3c5000_count.log │ + │ sec/op │ sec/op vs base │ +CountSingle/10 16.26n ± 0% 16.26n ± 0% ~ (p=0.653 n=20) +CountSingle/32 41.48n ± 0% 27.48n ± 0% -33.75% (p=0.000 n=20) +CountSingle/4K 4.998µ ± 0% 2.961µ ± 0% -40.76% (p=0.000 n=20) +CountSingle/4M 5.076m ± 0% 3.510m ± 8% -30.84% (p=0.000 n=20) +CountSingle/64M 88.70m ± 0% 58.15m ± 1% -34.45% (p=0.000 n=20) +geomean 17.23µ 12.20µ -29.19% + +Change-Id: Ic60d49fea83c9cf4f9b02bae3ce69b81206c7017 +--- + src/internal/bytealg/count_generic.go | 2 +- + src/internal/bytealg/count_loong64.s | 86 +++++++++++++++++++++++++++ + src/internal/bytealg/count_native.go | 2 +- + 3 files changed, 88 insertions(+), 2 deletions(-) + create mode 100644 src/internal/bytealg/count_loong64.s + +diff --git a/src/internal/bytealg/count_generic.go 
b/src/internal/bytealg/count_generic.go +index 932a7c584c..16f974539c 100644 +--- a/src/internal/bytealg/count_generic.go ++++ b/src/internal/bytealg/count_generic.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !amd64 && !arm && !arm64 && !ppc64le && !ppc64 && !riscv64 && !s390x ++//go:build !amd64 && !arm && !arm64 && !loong64 && !ppc64le && !ppc64 && !riscv64 && !s390x + + package bytealg + +diff --git a/src/internal/bytealg/count_loong64.s b/src/internal/bytealg/count_loong64.s +new file mode 100644 +index 0000000000..ca19c5f343 +--- /dev/null ++++ b/src/internal/bytealg/count_loong64.s +@@ -0,0 +1,86 @@ ++// Copyright 2020 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++#include "go_asm.h" ++#include "textflag.h" ++ ++TEXT ·Count(SB),NOSPLIT,$0-40 ++ // R4 = b_base ++ // R5 = b_len ++ // R6 = b_cap (unused) ++ // R7 = byte to count (want in R6) ++ AND $0xff, R7, R6 ++ JMP countbody<>(SB) ++ ++TEXT ·CountString(SB),NOSPLIT,$0-32 ++ // R4 = s_base ++ // R5 = s_len ++ // R6 = byte to count ++ AND $0xff, R6 ++ JMP countbody<>(SB) ++ ++// input: ++// R4 = s_base ++// R5 = s_len ++// R6 = byte to count ++TEXT countbody<>(SB),NOSPLIT,$0 ++ MOVV R0, R7 // count ++ ADDV R4, R5 // end ++ MOVV $1, R17 ++ ++loop: ++ ADDV $8, R4, R9 ++ BLT R5, R9, tail ++ MOVV (R4), R8 ++ ++ AND $0xff, R8, R10 ++ WORD $0xcf210b // bstrpick.w r11, r8, 15, 8 ++ XOR R6, R10, R10 ++ XOR R6, R11, R11 ++ MASKNEZ R10, R17, R12 ++ MASKNEZ R11, R17, R13 ++ ADDV R7, R12, R7 ++ ADDV R7, R13, R7 ++ ++ WORD $0xd7410a // bstrpick.w r10, r8, 23, 16 ++ WORD $0xdf610b // bstrpick.w r11, r8, 31, 24 ++ XOR R6, R10, R10 ++ XOR R6, R11, R11 ++ MASKNEZ R10, R17, R12 ++ MASKNEZ R11, R17, R13 ++ ADDV R7, R12, R7 ++ ADDV R7, R13, R7 ++ ++ WORD $0xe7810a // bstrpick.w r10, r8, 39, 32 ++ WORD $0xefa10b // bstrpick.w r11, r8, 47, 40 ++ XOR R6, R10, R10 ++ XOR R6, R11, R11 ++ MASKNEZ R10, R17, R12 ++ MASKNEZ R11, R17, R13 ++ ADDV R7, R12, R7 ++ ADDV R7, R13, R7 ++ ++ WORD $0xf7c10a // bstrpick.w r10, r8, 55, 48 ++ WORD $0xffe10b // bstrpick.w r11, r8, 63, 56 ++ XOR R6, R10, R10 ++ XOR R6, R11, R11 ++ MASKNEZ R10, R17, R12 ++ MASKNEZ R11, R17, R13 ++ ADDV R7, R12, R7 ++ ADDV R7, R13, R7 ++ ++ MOVV R9, R4 ++ JMP loop ++ ++tail: ++ BEQ R4, R5, done ++ MOVBU (R4), R8 ++ ADDV $1, R4 ++ BNE R6, R8, tail ++ ADDV $1, R7 ++ JMP tail ++ ++done: ++ MOVV R7, R4 ++ RET +diff --git a/src/internal/bytealg/count_native.go b/src/internal/bytealg/count_native.go +index 90189c9fe0..eab64e8950 100644 +--- a/src/internal/bytealg/count_native.go ++++ b/src/internal/bytealg/count_native.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. 
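The countbody loop just shown stays branch-free in its main loop: each 8-byte word is split into byte lanes with bstrpick.w (emitted as WORD constants, apparently because the assembler did not yet know the mnemonic), each lane is XORed with the target byte so the result is zero only on a match, and MASKNEZ turns that zero test into a 0 or 1 added to the running count. A portable Go sketch of the same scheme (not the actual runtime code):

    package main

    import (
        "encoding/binary"
        "fmt"
    )

    func count(s []byte, c byte) int {
        n := 0
        i := 0
        for ; i+8 <= len(s); i += 8 {
            w := binary.LittleEndian.Uint64(s[i:])
            for lane := 0; lane < 8; lane++ {
                b := byte(w >> (8 * lane))      // what bstrpick.w extracts on loong64
                x := b ^ c                      // zero iff this byte matches
                n += int((uint32(x) - 1) >> 31) // 1 when x == 0, else 0; what MASKNEZ computes
            }
        }
        for ; i < len(s); i++ { // byte-at-a-time tail, as in the assembly
            if s[i] == c {
                n++
            }
        }
        return n
    }

    func main() { fmt.Println(count([]byte("abracadabra"), 'a')) } // prints 5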
+ +-//go:build amd64 || arm || arm64 || ppc64le || ppc64 || riscv64 || s390x ++//go:build amd64 || arm || arm64 || loong64 || ppc64le || ppc64 || riscv64 || s390x + + package bytealg + +-- +2.38.1 + diff --git a/0010-internal-bytealg-adjust-the-format-of-assembly-files.patch b/0010-internal-bytealg-adjust-the-format-of-assembly-files.patch new file mode 100644 index 0000000000000000000000000000000000000000..85a21d39280a660722421eccf9ed74d018375699 --- /dev/null +++ b/0010-internal-bytealg-adjust-the-format-of-assembly-files.patch @@ -0,0 +1,583 @@ +From 14ffec301d84da6bcd5ef5757d6cd6445351225e Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Mon, 3 Jun 2024 15:43:32 +0800 +Subject: [PATCH 10/44] internal/bytealg: adjust the format of assembly files + {count, equal, index, indexbyte}_loong64.s + +Change-Id: I19e6650e6595148e449da7a82be6e735c6f01ab6 +--- + src/internal/bytealg/count_loong64.s | 92 +++++++------- + src/internal/bytealg/equal_loong64.s | 42 ++++--- + src/internal/bytealg/index_loong64.s | 148 +++++++++++------------ + src/internal/bytealg/indexbyte_loong64.s | 52 ++++---- + 4 files changed, 169 insertions(+), 165 deletions(-) + +diff --git a/src/internal/bytealg/count_loong64.s b/src/internal/bytealg/count_loong64.s +index ca19c5f343..db8ba2cb24 100644 +--- a/src/internal/bytealg/count_loong64.s ++++ b/src/internal/bytealg/count_loong64.s +@@ -1,4 +1,4 @@ +-// Copyright 2020 The Go Authors. All rights reserved. ++// Copyright 2024 The Go Authors. All rights reserved. + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +@@ -9,77 +9,77 @@ TEXT ·Count(SB),NOSPLIT,$0-40 + // R4 = b_base + // R5 = b_len + // R6 = b_cap (unused) +- // R7 = byte to count (want in R6) +- AND $0xff, R7, R6 +- JMP countbody<>(SB) ++ // R7 = byte to count ++ AND $0xff, R7, R6 ++ JMP countbody<>(SB) + + TEXT ·CountString(SB),NOSPLIT,$0-32 + // R4 = s_base + // R5 = s_len + // R6 = byte to count +- AND $0xff, R6 +- JMP countbody<>(SB) ++ AND $0xff, R6 ++ JMP countbody<>(SB) + + // input: + // R4 = s_base + // R5 = s_len + // R6 = byte to count + TEXT countbody<>(SB),NOSPLIT,$0 +- MOVV R0, R7 // count +- ADDV R4, R5 // end +- MOVV $1, R17 ++ MOVV R0, R7 // count ++ ADDV R4, R5 // end ++ MOVV $1, R17 + + loop: + ADDV $8, R4, R9 +- BLT R5, R9, tail ++ BLT R5, R9, tail + MOVV (R4), R8 + +- AND $0xff, R8, R10 +- WORD $0xcf210b // bstrpick.w r11, r8, 15, 8 +- XOR R6, R10, R10 +- XOR R6, R11, R11 +- MASKNEZ R10, R17, R12 +- MASKNEZ R11, R17, R13 +- ADDV R7, R12, R7 +- ADDV R7, R13, R7 ++ AND $0xff, R8, R10 ++ WORD $0xcf210b // bstrpick.w r11, r8, 15, 8 ++ XOR R6, R10, R10 ++ XOR R6, R11, R11 ++ MASKNEZ R10, R17, R12 ++ MASKNEZ R11, R17, R13 ++ ADDV R7, R12, R7 ++ ADDV R7, R13, R7 + +- WORD $0xd7410a // bstrpick.w r10, r8, 23, 16 +- WORD $0xdf610b // bstrpick.w r11, r8, 31, 24 +- XOR R6, R10, R10 +- XOR R6, R11, R11 +- MASKNEZ R10, R17, R12 +- MASKNEZ R11, R17, R13 +- ADDV R7, R12, R7 +- ADDV R7, R13, R7 ++ WORD $0xd7410a // bstrpick.w r10, r8, 23, 16 ++ WORD $0xdf610b // bstrpick.w r11, r8, 31, 24 ++ XOR R6, R10, R10 ++ XOR R6, R11, R11 ++ MASKNEZ R10, R17, R12 ++ MASKNEZ R11, R17, R13 ++ ADDV R7, R12, R7 ++ ADDV R7, R13, R7 + +- WORD $0xe7810a // bstrpick.w r10, r8, 39, 32 +- WORD $0xefa10b // bstrpick.w r11, r8, 47, 40 +- XOR R6, R10, R10 +- XOR R6, R11, R11 +- MASKNEZ R10, R17, R12 +- MASKNEZ R11, R17, R13 +- ADDV R7, R12, R7 +- ADDV R7, R13, R7 ++ WORD $0xe7810a // bstrpick.w r10, r8, 39, 32 ++ WORD $0xefa10b // bstrpick.w r11, r8, 47, 40 ++ XOR R6, 
R10, R10 ++ XOR R6, R11, R11 ++ MASKNEZ R10, R17, R12 ++ MASKNEZ R11, R17, R13 ++ ADDV R7, R12, R7 ++ ADDV R7, R13, R7 + +- WORD $0xf7c10a // bstrpick.w r10, r8, 55, 48 +- WORD $0xffe10b // bstrpick.w r11, r8, 63, 56 +- XOR R6, R10, R10 +- XOR R6, R11, R11 +- MASKNEZ R10, R17, R12 +- MASKNEZ R11, R17, R13 +- ADDV R7, R12, R7 +- ADDV R7, R13, R7 ++ WORD $0xf7c10a // bstrpick.w r10, r8, 55, 48 ++ WORD $0xffe10b // bstrpick.w r11, r8, 63, 56 ++ XOR R6, R10, R10 ++ XOR R6, R11, R11 ++ MASKNEZ R10, R17, R12 ++ MASKNEZ R11, R17, R13 ++ ADDV R7, R12, R7 ++ ADDV R7, R13, R7 + + MOVV R9, R4 +- JMP loop ++ JMP loop + + tail: +- BEQ R4, R5, done ++ BEQ R4, R5, done + MOVBU (R4), R8 + ADDV $1, R4 +- BNE R6, R8, tail ++ BNE R6, R8, tail + ADDV $1, R7 +- JMP tail ++ JMP tail + + done: + MOVV R7, R4 +diff --git a/src/internal/bytealg/equal_loong64.s b/src/internal/bytealg/equal_loong64.s +index 4cc31d5e46..5d5d591a2c 100644 +--- a/src/internal/bytealg/equal_loong64.s ++++ b/src/internal/bytealg/equal_loong64.s +@@ -12,57 +12,61 @@ TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25 + // R4 = a_base + // R5 = b_base + // R6 = size +- JMP equalbody<>(SB) ++ JMP equalbody<>(SB) + + // memequal_varlen(a, b unsafe.Pointer) bool + TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17 + // R4 = a_base + // R5 = b_base +- MOVV 8(REGCTXT), R6 // compiler stores size at offset 8 in the closure +- JMP equalbody<>(SB) ++ MOVV 8(REGCTXT), R6 // compiler stores size at offset 8 in the closure ++ JMP equalbody<>(SB) + ++// input: ++// R4 = a_base ++// R5 = b_base ++// R6 = size + TEXT equalbody<>(SB),NOSPLIT|NOFRAME,$0 +- BEQ R4, R5, eq +- ADDV R4, R6, R6 // end ++ BEQ R4, R5, eq ++ ADDV R4, R6, R6 // end + + loop_16byte: +- ADDV $16, R4, R9 +- BLT R6, R9, load8byte ++ ADDV $16, R4, R9 ++ BLT R6, R9, load8byte + MOVV (R4), R7 + MOVV (R5), R8 + MOVV 8(R4), R10 + MOVV 8(R5), R11 + MOVV R9, R4 +- XOR R7, R8, R7 +- XOR R10, R11, R10 +- OR R10, R7, R7 ++ XOR R7, R8, R7 ++ XOR R10, R11, R10 ++ OR R10, R7, R7 + ADDV $16, R5 +- BEQ R7, loop_16byte ++ BEQ R7, loop_16byte + +- MOVB R0, R4 ++ MOVB R0, R4 + RET + + load8byte: +- ADDV $8, R4, R9 +- BLT R6, R9, tail ++ ADDV $8, R4, R9 ++ BLT R6, R9, tail + MOVV (R4), R7 + MOVV (R5), R8 + MOVV R9, R4 + ADDV $8, R5 +- BEQ R7, R8, tail ++ BEQ R7, R8, tail + +- MOVB R0, R4 ++ MOVB R0, R4 + RET + + tail: +- BEQ R4, R6, eq ++ BEQ R4, R6, eq + MOVBU (R4), R7 + MOVBU (R5), R8 + ADDV $1, R4 + ADDV $1, R5 +- BEQ R7, R8, tail ++ BEQ R7, R8, tail + +- MOVB R0, R4 ++ MOVB R0, R4 + RET + + eq: +diff --git a/src/internal/bytealg/index_loong64.s b/src/internal/bytealg/index_loong64.s +index 221d0332a4..7f7190b3be 100644 +--- a/src/internal/bytealg/index_loong64.s ++++ b/src/internal/bytealg/index_loong64.s +@@ -1,4 +1,4 @@ +-// Copyright 2018 The Go Authors. All rights reserved. ++// Copyright 2024 The Go Authors. All rights reserved. + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. 
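The reformatted equalbody above also shows its core trick clearly: the main loop XORs two pairs of 8-byte words (each XOR is zero only when the pair is equal), ORs the two results, and decides all 16 bytes with a single zero test. A portable Go sketch of the same comparison (not the actual runtime memequal):

    package main

    import (
        "encoding/binary"
        "fmt"
    )

    func memequal(a, b []byte) bool {
        if len(a) != len(b) {
            return false
        }
        i := 0
        for ; i+16 <= len(a); i += 16 {
            x := binary.LittleEndian.Uint64(a[i:]) ^ binary.LittleEndian.Uint64(b[i:])
            y := binary.LittleEndian.Uint64(a[i+8:]) ^ binary.LittleEndian.Uint64(b[i+8:])
            if x|y != 0 { // one branch covers all 16 bytes
                return false
            }
        }
        for ; i < len(a); i++ { // the assembly also has an 8-byte step; bytewise here
            if a[i] != b[i] {
                return false
            }
        }
        return true
    }

    func main() { fmt.Println(memequal([]byte("hello, loong64!!"), []byte("hello, loong64!!"))) }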
+ +@@ -6,12 +6,12 @@ + #include "textflag.h" + + TEXT ·Index(SB),NOSPLIT,$0-56 +- MOVV R7, R6 // R6 = separator pointer +- MOVV R8, R7 // R7 = separator length +- JMP indexbody<>(SB) ++ MOVV R7, R6 // R6 = separator pointer ++ MOVV R8, R7 // R7 = separator length ++ JMP indexbody<>(SB) + + TEXT ·IndexString(SB),NOSPLIT,$0-40 +- JMP indexbody<>(SB) ++ JMP indexbody<>(SB) + + // input: + // R4 = string +@@ -23,108 +23,108 @@ TEXT indexbody<>(SB),NOSPLIT,$0 + // to avoid repeatedly re-load it again and again + // for sebsequent substring comparisons + SUBV R7, R5, R8 +- ADDV $1, R4, R9 // store base for later ++ ADDV $1, R4, R9 // store base for later + MOVV $8, R5 +- ADDV R4, R8 // end +- BLT R5, R7, len_gt_8 ++ ADDV R4, R8 // end ++ BLT R5, R7, len_gt_8 + + len_le_8: +- AND $0x8, R7, R5 +- BNE R5, len_8 +- AND $0x4, R7, R5 +- BNE R5, len_4_7 ++ AND $0x8, R7, R5 ++ BNE R5, len_8 ++ AND $0x4, R7, R5 ++ BNE R5, len_4_7 + + len_2_3: +- AND $0x1, R7, R5 +- BNE R5, len_3 ++ AND $0x1, R7, R5 ++ BNE R5, len_3 + + len_2: +- MOVHU (R6), R10 ++ MOVHU (R6), R10 + loop_2: +- BLT R8, R4, not_found +- MOVHU (R4), R11 ++ BLT R8, R4, not_found ++ MOVHU (R4), R11 + ADDV $1, R4 +- BNE R10, R11, loop_2 +- JMP found ++ BNE R10, R11, loop_2 ++ JMP found + + len_3: + MOVHU (R6), R10 + MOVBU 2(R6), R11 + loop_3: +- BLT R8, R4, not_found +- MOVHU (R4), R12 ++ BLT R8, R4, not_found ++ MOVHU (R4), R12 + ADDV $1, R4 +- BNE R10, R12, loop_3 +- MOVBU 1(R4), R12 +- BNE R11, R12, loop_3 +- JMP found ++ BNE R10, R12, loop_3 ++ MOVBU 1(R4), R12 ++ BNE R11, R12, loop_3 ++ JMP found + + len_4_7: +- AND $0x2, R7, R5 +- BNE R5, len_6_7 +- AND $0x1, R7, R5 +- BNE R5, len_5 ++ AND $0x2, R7, R5 ++ BNE R5, len_6_7 ++ AND $0x1, R7, R5 ++ BNE R5, len_5 + + len_4: +- MOVWU (R6), R10 ++ MOVWU (R6), R10 + loop_4: +- BLT R8, R4, not_found +- MOVWU (R4), R11 ++ BLT R8, R4, not_found ++ MOVWU (R4), R11 + ADDV $1, R4 +- BNE R10, R11, loop_4 +- JMP found ++ BNE R10, R11, loop_4 ++ JMP found + len_5: + MOVWU (R6), R10 + MOVBU 4(R6), R11 + loop_5: +- BLT R8, R4, not_found +- MOVWU (R4), R12 ++ BLT R8, R4, not_found ++ MOVWU (R4), R12 + ADDV $1, R4 +- BNE R10, R12, loop_5 +- MOVBU 3(R4), R12 +- BNE R11, R12, loop_5 +- JMP found ++ BNE R10, R12, loop_5 ++ MOVBU 3(R4), R12 ++ BNE R11, R12, loop_5 ++ JMP found + + len_6_7: +- AND $0x1, R7, R5 +- BNE R5, len_7 ++ AND $0x1, R7, R5 ++ BNE R5, len_7 + + len_6: + MOVWU (R6), R10 + MOVHU 4(R6), R11 + loop_6: +- BLT R8, R4, not_found +- MOVWU (R4), R12 ++ BLT R8, R4, not_found ++ MOVWU (R4), R12 + ADDV $1, R4 +- BNE R10, R12, loop_6 +- MOVHU 3(R4), R12 +- BNE R11, R12, loop_6 +- JMP found ++ BNE R10, R12, loop_6 ++ MOVHU 3(R4), R12 ++ BNE R11, R12, loop_6 ++ JMP found + + len_7: + MOVWU (R6), R10 + MOVWU 3(R6), R11 + loop_7: +- BLT R8, R4, not_found +- MOVWU (R4), R12 ++ BLT R8, R4, not_found ++ MOVWU (R4), R12 + ADDV $1, R4 +- BNE R10, R12, loop_7 +- MOVWU 2(R4), R12 +- BNE R11, R12, loop_7 +- JMP found ++ BNE R10, R12, loop_7 ++ MOVWU 2(R4), R12 ++ BNE R11, R12, loop_7 ++ JMP found + + len_8: + MOVV (R6), R10 + loop_8: +- BLT R8, R4, not_found ++ BLT R8, R4, not_found + MOVV (R4), R11 + ADDV $1, R4 +- BNE R10, R11, loop_8 +- JMP found ++ BNE R10, R11, loop_8 ++ JMP found + + len_gt_8: + MOVV $16, R5 +- BLT R5, R7, len_gt_16 ++ BLT R5, R7, len_gt_16 + + len_9_16: + MOVV (R6), R10 +@@ -132,17 +132,17 @@ len_9_16: + MOVV (R6)(R7), R11 + SUBV $1, R7 + loop_9_16: +- BLT R8, R4, not_found ++ BLT R8, R4, not_found + MOVV (R4), R12 + ADDV $1, R4 +- BNE R10, R12, loop_9_16 ++ BNE R10, R12, loop_9_16 + MOVV 
(R4)(R7), R12 +- BNE R11, R12, loop_9_16 +- JMP found ++ BNE R11, R12, loop_9_16 ++ JMP found + + len_gt_16: + MOVV $24, R5 +- BLT R5, R7, len_25_32 ++ BLT R5, R7, len_25_32 + + len_17_24: + MOVV (R6), R10 +@@ -151,15 +151,15 @@ len_17_24: + MOVV (R6)(R7), R12 + SUBV $1, R7 + loop_17_24: +- BLT R8, R4, not_found ++ BLT R8, R4, not_found + MOVV (R4), R13 + ADDV $1, R4 +- BNE R10, R13, loop_17_24 ++ BNE R10, R13, loop_17_24 + MOVV 7(R4), R13 +- BNE R11, R13, loop_17_24 ++ BNE R11, R13, loop_17_24 + MOVV (R4)(R7), R13 +- BNE R12, R13, loop_17_24 +- JMP found ++ BNE R12, R13, loop_17_24 ++ JMP found + + len_25_32: + MOVV (R6), R10 +@@ -169,17 +169,17 @@ len_25_32: + MOVV (R6)(R7), R13 + SUBV $1, R7 + loop_25_32: +- BLT R8, R4, not_found ++ BLT R8, R4, not_found + MOVV (R4), R14 + ADDV $1, R4 +- BNE R10, R14, loop_25_32 ++ BNE R10, R14, loop_25_32 + MOVV 7(R4), R14 +- BNE R11, R14, loop_25_32 ++ BNE R11, R14, loop_25_32 + MOVV 15(R4), R14 +- BNE R12, R14, loop_25_32 ++ BNE R12, R14, loop_25_32 + MOVV (R4)(R7), R14 +- BNE R13, R14, loop_25_32 +- JMP found ++ BNE R13, R14, loop_25_32 ++ JMP found + + found: + SUBV R9, R4 +diff --git a/src/internal/bytealg/indexbyte_loong64.s b/src/internal/bytealg/indexbyte_loong64.s +index 7811741423..b5f8f9cdbc 100644 +--- a/src/internal/bytealg/indexbyte_loong64.s ++++ b/src/internal/bytealg/indexbyte_loong64.s +@@ -12,17 +12,17 @@ TEXT ·IndexByte(SB),NOSPLIT,$0-40 + // R7 = byte to find + ADDV R4, R5 // end + MOVV R4, R6 // store base for later +- AND $0xff, R7 +- JMP indexbytebody<>(SB) ++ AND $0xff, R7 ++ JMP indexbytebody<>(SB) + + TEXT ·IndexByteString(SB),NOSPLIT,$0-32 + // R4 = s_base + // R5 = s_len + // R6 = byte to find +- AND $0xff, R6, R7 ++ AND $0xff, R6, R7 + ADDV R4, R5 // end + MOVV R4, R6 // store base for later +- JMP indexbytebody<>(SB) ++ JMP indexbytebody<>(SB) + + // input: + // R4: b_base +@@ -32,42 +32,42 @@ TEXT ·IndexByteString(SB),NOSPLIT,$0-32 + TEXT indexbytebody<>(SB),NOSPLIT,$0 + loop: + ADDV $8, R4, R10 +- BLT R5, R10, tail ++ BLT R5, R10, tail + MOVV (R4), R8 + +- AND $0xff, R8, R9 +- BEQ R7, R9, found ++ AND $0xff, R8, R9 ++ BEQ R7, R9, found + +- WORD $0xcf2109 // bstrpick.w r9, r8, 15, 8 +- BEQ R7, R9, byte_1th ++ WORD $0xcf2109 // bstrpick.w r9, r8, 15, 8 ++ BEQ R7, R9, byte_1th + +- WORD $0xd74109 // bstrpick.w r9, r8, 23, 16 +- BEQ R7, R9, byte_2th ++ WORD $0xd74109 // bstrpick.w r9, r8, 23, 16 ++ BEQ R7, R9, byte_2th + +- WORD $0xdf6109 // bstrpick.w r9, r8, 31, 24 +- BEQ R7, R9, byte_3th ++ WORD $0xdf6109 // bstrpick.w r9, r8, 31, 24 ++ BEQ R7, R9, byte_3th + +- WORD $0xe78109 // bstrpick.w r9, r8, 39, 32 +- BEQ R7, R9, byte_4th ++ WORD $0xe78109 // bstrpick.w r9, r8, 39, 32 ++ BEQ R7, R9, byte_4th + +- WORD $0xefa109 // bstrpick.w r9, r8, 47, 40 +- BEQ R7, R9, byte_5th ++ WORD $0xefa109 // bstrpick.w r9, r8, 47, 40 ++ BEQ R7, R9, byte_5th + +- WORD $0xf7c109 // bstrpick.w r9, r8, 55, 48 +- BEQ R7, R9, byte_6th ++ WORD $0xf7c109 // bstrpick.w r9, r8, 55, 48 ++ BEQ R7, R9, byte_6th + +- WORD $0xffe109 // bstrpick.w r9, r8, 63, 56 +- BEQ R7, R9, byte_7th ++ WORD $0xffe109 // bstrpick.w r9, r8, 63, 56 ++ BEQ R7, R9, byte_7th + + MOVV R10, R4 +- JMP loop ++ JMP loop + + tail: +- BEQ R4, R5, notfound +- MOVBU (R4), R8 +- BEQ R7, R8, found ++ BEQ R4, R5, notfound ++ MOVBU (R4), R8 ++ BEQ R7, R8, found + ADDV $1, R4 +- JMP tail ++ JMP tail + + byte_1th: + ADDV $1, R4 +-- +2.38.1 + diff --git a/0011-cmd-internal-obj-loong64-optimize-immediate-loading.patch b/0011-cmd-internal-obj-loong64-optimize-immediate-loading.patch new 
file mode 100644 index 0000000000000000000000000000000000000000..6136b63a2a511b369295c0585254d7237caab0fc --- /dev/null +++ b/0011-cmd-internal-obj-loong64-optimize-immediate-loading.patch @@ -0,0 +1,776 @@ +From a08a479c526bcc63bf24e69ff7fa1d37a1179e1f Mon Sep 17 00:00:00 2001 +From: limeidan +Date: Thu, 11 Jul 2024 21:03:45 +0800 +Subject: [PATCH 11/44] cmd/internal/obj/loong64: optimize immediate loading +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + + | old | new | + | sec/op | sec/op vs base | +BinaryTree17 11.08 ± 2% 11.16 ± 1% ~ (p=0.529 n=10) +Fannkuch11 2.716 ± 0% 2.737 ± 0% +0.79% (p=0.000 n=10) +FmtFprintfEmpty 67.37n ± 0% 66.42n ± 0% -1.41% (p=0.000 n=10) +FmtFprintfString 95.28n ± 0% 90.85n ± 0% -4.64% (p=0.000 n=10) +FmtFprintfInt 97.69n ± 0% 98.06n ± 0% +0.38% (p=0.000 n=10) +FmtFprintfIntInt 149.1n ± 0% 147.4n ± 0% -1.14% (p=0.000 n=10) +FmtFprintfPrefixedInt 223.6n ± 0% 196.5n ± 0% -12.10% (p=0.000 n=10) +FmtFprintfFloat 290.9n ± 0% 281.6n ± 1% -3.21% (p=0.000 n=10) +FmtManyArgs 670.6n ± 0% 642.6n ± 0% -4.18% (p=0.000 n=10) +GobDecode 10.26m ± 1% 10.23m ± 1% ~ (p=0.105 n=10) +GobEncode 12.09m ± 1% 11.94m ± 1% -1.24% (p=0.000 n=10) +Gzip 316.9m ± 0% 315.9m ± 0% -0.32% (p=0.001 n=10) +Gunzip 65.48m ± 0% 59.77m ± 0% -8.72% (p=0.000 n=10) +HTTPClientServer 70.36µ ± 0% 68.72µ ± 0% -2.34% (p=0.000 n=10) +JSONEncode 13.61m ± 1% 13.19m ± 1% -3.13% (p=0.000 n=10) +JSONDecode 57.52m ± 1% 54.15m ± 1% -5.86% (p=0.000 n=10) +Mandelbrot200 4.577m ± 0% 4.572m ± 0% -0.10% (p=0.002 n=10) +GoParse 6.466m ± 0% 6.363m ± 0% -1.58% (p=0.000 n=10) +RegexpMatchEasy0_32 89.20n ± 0% 87.72n ± 0% -1.65% (p=0.000 n=10) +RegexpMatchEasy0_1K 748.6n ± 0% 907.6n ± 0% +21.22% (p=0.000 n=10) +RegexpMatchEasy1_32 94.14n ± 0% 93.81n ± 0% -0.35% (p=0.000 n=10) +RegexpMatchEasy1_1K 832.1n ± 0% 953.6n ± 0% +14.59% (p=0.000 n=10) +RegexpMatchMedium_32 982.7n ± 0% 1018.0n ± 0% +3.59% (p=0.000 n=10) +RegexpMatchMedium_1K 30.51µ ± 0% 30.00µ ± 0% -1.65% (p=0.000 n=10) +RegexpMatchHard_32 1.721µ ± 0% 1.664µ ± 0% -3.34% (p=0.000 n=10) +RegexpMatchHard_1K 50.76µ ± 0% 50.92µ ± 0% +0.32% (p=0.000 n=10) +Revcomp 870.5m ± 0% 710.5m ± 0% -18.38% (p=0.000 n=10) +Template 93.18m ± 1% 93.67m ± 1% ~ (p=0.123 n=10) +TimeParse 309.2n ± 0% 307.8n ± 0% -0.45% (p=0.000 n=10) +TimeFormat 401.5n ± 0% 394.2n ± 0% -1.82% (p=0.000 n=10) +geomean 72.73µ 71.70µ -1.41% + +Change-Id: Id8d342ef3bb82a420434b2b841674683efef67be +--- + src/cmd/asm/internal/asm/endtoend_test.go | 2 + + .../asm/internal/asm/testdata/loong64enc1.s | 24 ++ + .../asm/internal/asm/testdata/loong64enc2.s | 46 +++ + .../asm/internal/asm/testdata/loong64enc3.s | 65 ++++ + .../asm/internal/asm/testdata/loong64enc4.s | 42 +++ + .../asm/internal/asm/testdata/loong64enc5.s | 17 + + src/cmd/internal/obj/loong64/a.out.go | 54 ++- + src/cmd/internal/obj/loong64/asm.go | 321 +++++++++++++++++- + src/cmd/internal/obj/loong64/cnames.go | 14 + + 9 files changed, 579 insertions(+), 6 deletions(-) + create mode 100644 src/cmd/asm/internal/asm/testdata/loong64enc4.s + create mode 100644 src/cmd/asm/internal/asm/testdata/loong64enc5.s + +diff --git a/src/cmd/asm/internal/asm/endtoend_test.go b/src/cmd/asm/internal/asm/endtoend_test.go +index 6e1aa1cd95..3760b77625 100644 +--- a/src/cmd/asm/internal/asm/endtoend_test.go ++++ b/src/cmd/asm/internal/asm/endtoend_test.go +@@ -465,6 +465,8 @@ func TestLOONG64Encoder(t *testing.T) { + testEndToEnd(t, "loong64", "loong64enc1") + testEndToEnd(t, "loong64", "loong64enc2") + testEndToEnd(t, 
"loong64", "loong64enc3") ++ testEndToEnd(t, "loong64", "loong64enc4") ++ testEndToEnd(t, "loong64", "loong64enc5") + testEndToEnd(t, "loong64", "loong64") + } + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index 4a88aca031..3a3eb10a74 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -516,3 +516,27 @@ lable2: + XVPCNTH X3, X2 // 62249c76 + XVPCNTW X3, X2 // 62289c76 + XVPCNTV X3, X2 // 622c9c76 ++ ++ // MOVV C_DCON12_0, r ++ MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 ++ MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 ++ ++ // MOVV C_UCON, r ++ MOVV $0x54321000, R4 // MOVV $1412567040, R4 // 2464a814 ++ MOVV $0xffffffff8432f000, R4 // MOVV $-2077036544, R4 // e4650815 ++ ++ // MOVV C_ADDCON, r ++ MOVV $0xfffffffffffff821, R4 // MOVV $-2015, R4 // 0484e002 ++ ++ // MOVV C_ANDCON, r ++ MOVV $0x821, R4 // MOVV $2081, R4 // 0484a003 ++ ++ // ADDV C_SCON, [r1], r2 ++ ADDV $0x321, R4 // ADDV $801, R4 // 8484cc02 ++ ADDV $0x321, R5, R4 // ADDV $801, R5, R4 // a484cc02 ++ ADDV $0xfffffffffffffc21, R4 // ADDV $-991, R4 // 8484f002 ++ ADDV $0xfffffffffffffc21, R5, R4 // ADDV $-991, R5, R4 // a484f002 ++ ++ // AND C_SCON, [r1], r2 ++ AND $0x321, R4 // AND $801, R4 // 84844c03 ++ AND $0x321, R5, R4 // AND $801, R5, R4 // a4844c03 +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc2.s b/src/cmd/asm/internal/asm/testdata/loong64enc2.s +index e497b83627..ee3bad74b1 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc2.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc2.s +@@ -77,3 +77,49 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0 + MOVH name(SB), R4 // 1e00001ac4034028 + MOVHU R4, name(SB) // 1e00001ac4034029 + MOVHU name(SB), R4 // 1e00001ac403402a ++ ++ // MOVV C_DCON12_20S, r ++ MOVV $0x273fffff80000000, R4 // MOVV $2828260563841187840, R4 // 0400001584cc0903 ++ MOVV $0xf73fffff80000000, R4 // MOVV $-630503949979353088, R4 // 0400001584cc3d03 ++ ++ // MOVV C_DCON20S_20, r ++ MOVV $0xfff800000f000000, R4 // MOVV $-2251799562027008, R4 // 04001e1404000017 ++ ++ // MOVV C_DCON12_12S, r ++ MOVV $0x273ffffffffff800, R4 // MOVV $2828260565988669440, R4 // 0400e00284cc0903 ++ MOVV $0xf73ffffffffff800, R4 // MOVV $-630503947831871488, R4 // 0400e00284cc3d03 ++ ++ // MOVV C_DCON20S_12S, r ++ MOVV $0xfff80000fffff800, R4 // MOVV $-2251795518720000, R4 // 0400a00204000017 ++ MOVV $0xfff8000000000000, R4 // MOVV $-2251799813685248, R4 // 0400800204000017 ++ ++ // MOVV C_DCON12_12U, r ++ MOVV $0x2730000000000800, R4 // MOVV $2823756966361303040, R4 // 0400a00384cc0903 ++ MOVV $0xf730000000000800, R4 // MOVV $-635007547459237888, R4 // 0400a00384cc3d03 ++ ++ // MOVV C_DCON20S_12U, r ++ MOVV $0xfff8000000000800, R4 // MOVV $-2251799813683200, R4 // 0400a00304000017 ++ ++ // ADDV/AND C_DCON12_0, [r1], r2 ++ ADDV $0x3210000000000000, R4 // ADDV $3607383301523767296, R4 // 1e840c0384f81000 ++ ADDV $0x3210000000000000, R5, R4 // ADDV $3607383301523767296, R5, R4 // 1e840c03a4f81000 ++ ADDV $0xc210000000000000, R4 // ADDV $-4463067230724161536, R4 // 1e84300384f81000 ++ ADDV $0xc210000000000000, R5, R4 // ADDV $-4463067230724161536, R5, R4 // 1e843003a4f81000 ++ AND $0x3210000000000000, R4 // AND $3607383301523767296, R4 // 1e840c0384f81400 ++ AND $0x3210000000000000, R5, R4 // AND $3607383301523767296, R5, R4 // 1e840c03a4f81400 ++ AND $0xc210000000000000, R4 // AND $-4463067230724161536, R4 // 1e84300384f81400 ++ 
AND $0xc210000000000000, R5, R4 // AND $-4463067230724161536, R5, R4 // 1e843003a4f81400 ++ ++ // ADDV/AND C_UCON, [r1], r2 ++ ADDV $0x43210000, R4 // ADDV $1126236160, R4 // 1e42861484f81000 ++ ADDV $0x43210000, R5, R4 // ADDV $1126236160, R5, R4 // 1e428614a4f81000 ++ ADDV $0xffffffffc3210000, R4 // ADDV $-1021247488, R4 // 1e42861584f81000 ++ ADDV $0xffffffffc3210000, R5, R4 // ADDV $-1021247488, R5, R4 // 1e428615a4f81000 ++ AND $0x43210000, R4 // AND $1126236160, R4 // 1e42861484f81400 ++ AND $0x43210000, R5, R4 // AND $1126236160, R5, R4 // 1e428614a4f81400 ++ AND $0xffffffffc3210000, R4 // AND $-1021247488, R4 // 1e42861584f81400 ++ AND $0xffffffffc3210000, R5, R4 // AND $-1021247488, R5, R4 // 1e428615a4f81400 ++ ++ // AND C_ADDCON, [r1], r2 ++ AND $0xfffffffffffffc21, R4 // AND $-991, R4 // 1e84b00284f81400 ++ AND $0xfffffffffffffc21, R5, R4 // AND $-991, R5, R4 // 1e84b002a4f81400 +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc3.s b/src/cmd/asm/internal/asm/testdata/loong64enc3.s +index 2600884309..2d83bd719a 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc3.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc3.s +@@ -121,3 +121,68 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0 + XOR $74565, R4, R5 // 5e020014de178d0385f81500 + XOR $4097, R4 // 3e000014de07800384f81500 + XOR $4097, R4, R5 // 3e000014de07800385f81500 ++ ++ // MOVV C_DCON32_12S, r ++ MOVV $0x27312345fffff800, R4 // MOVV $2824077224892692480, R4 // 0400a002a468241684cc0903 ++ MOVV $0xf7312345fffff800, R4 // MOVV $-634687288927848448, R4 // 0400a002a468241684cc3d03 ++ ++ // MOVV C_DCON32_0, r ++ MOVV $0x2731234500000000, R4 // MOVV $2824077220597727232, R4 // 04008002a468241684cc0903 ++ MOVV $0xf731234500000000, R4 // MOVV $-634687293222813696, R4 // 04008002a468241684cc3d03 ++ ++ // MOVV C_DCON32_20, r ++ MOVV $0x2731234512345000, R4 // MOVV $2824077220903145472, R4 // a4682414a468241684cc0903 ++ MOVV $0xf731234512345000, R4 // MOVV $-634687292917395456, R4 // a4682414a468241684cc3d03 ++ ++ // MOVV C_DCON12_32S, r ++ MOVV $0x273fffff80000800, R4 // MOVV $2828260563841189888, R4 // 040000158400a00384cc0903 ++ MOVV $0xf73fffff80000800, R4 // MOVV $-630503949979351040, R4 // 040000158400a00384cc3d03 ++ ++ // MOVV C_DCON20S_32, r ++ MOVV $0xfff8000080000800, R4 // MOVV $-2251797666199552, R4 // 040000158400a00304000017 ++ ++ // MOVV C_DCON32_12U, r ++ MOVV $0x2731234500000800, R4 // MOVV $2824077220597729280, R4 // 0400a003a468241684cc0903 ++ MOVV $0xf731234500000800, R4 // MOVV $-634687293222811648, R4 // 0400a003a468241684cc3d03 ++ ++ // ADDV/AND C_DCON12_20S, [r1], r2 ++ ADDV $0x273fffff80000000, R4 // ADDV $2828260563841187840, R4 // 1e000015decf090384f81000 ++ ADDV $0x273fffff80000000, R4, R5 // ADDV $2828260563841187840, R4, R5 // 1e000015decf090385f81000 ++ AND $0x273fffff80000000, R4 // AND $2828260563841187840, R4 // 1e000015decf090384f81400 ++ AND $0x273fffff80000000, R4, R5 // AND $2828260563841187840, R4, R5 // 1e000015decf090385f81400 ++ ++ // ADDV/AND C_DCON20S_20, [r1], r2 ++ ADDV $0xfff800000f000000, R4 // ADDV $-2251799562027008, R4 // 1e001e141e00001784f81000 ++ ADDV $0xfff800000f000000, R4, R5 // ADDV $-2251799562027008, R4, R5 // 1e001e141e00001785f81000 ++ AND $0xfff800000f000000, R4 // AND $-2251799562027008, R4 // 1e001e141e00001784f81400 ++ AND $0xfff800000f000000, R4, R5 // AND $-2251799562027008, R4, R5 // 1e001e141e00001785f81400 ++ ++ // ADDV/AND C_DCON12_12S, [r1], r2 ++ ADDV $0x273ffffffffff800, R4 // ADDV $2828260565988669440, R4 // 1e00e002decf090384f81000 ++ ADDV 
$0x273ffffffffff800, R4, R5 // ADDV $2828260565988669440, R4, R5 // 1e00e002decf090385f81000 ++ AND $0x273ffffffffff800, R4 // AND $2828260565988669440, R4 // 1e00e002decf090384f81400 ++ AND $0x273ffffffffff800, R4, R5 // AND $2828260565988669440, R4, R5 // 1e00e002decf090385f81400 ++ ++ // ADDV/AND C_DCON20S_12S, [r1], r2 ++ ADDV $0xfff80000fffff800, R4 // ADDV $-2251795518720000, R4 // 1e00a0021e00001784f81000 ++ ADDV $0xfff80000fffff800, R4, R5 // ADDV $-2251795518720000, R4, R5 // 1e00a0021e00001785f81000 ++ AND $0xfff80000fffff800, R4 // AND $-2251795518720000, R4 // 1e00a0021e00001784f81400 ++ AND $0xfff80000fffff800, R4, R5 // AND $-2251795518720000, R4, R5 // 1e00a0021e00001785f81400 ++ ++ // ADDV/AND C_DCON20S_0, [r1], r2 ++ ADDV $0xfff8000000000000, R4 // ADDV $-2251799813685248, R4 // 1e0080021e00001784f81000 ++ ADDV $0xfff8000000000000, R4, R5 // ADDV $-2251799813685248, R4, R5 // 1e0080021e00001785f81000 ++ AND $0xfff8000000000000, R4 // AND $-2251799813685248, R4 // 1e0080021e00001784f81400 ++ AND $0xfff8000000000000, R4, R5 // AND $-2251799813685248, R4, R5 // 1e0080021e00001785f81400 ++ ++ // ADDV/AND C_DCON12_12U, [r1], r2 ++ ADDV $0x2730000000000800, R4 // ADDV $2823756966361303040, R4 // 1e00a003decf090384f81000 ++ ADDV $0x2730000000000800, R4, R5 // ADDV $2823756966361303040, R4, R5 // 1e00a003decf090385f81000 ++ AND $0x2730000000000800, R4 // AND $2823756966361303040, R4 // 1e00a003decf090384f81400 ++ AND $0x2730000000000800, R4, R5 // AND $2823756966361303040, R4, R5 // 1e00a003decf090385f81400 ++ ++ // ADDV/AND C_DCON20S_12U, [r1], r2 ++ ADDV $0xfff8000000000800, R4 // ADDV $-2251799813683200, R4 // 1e00a0031e00001784f81000 ++ ADDV $0xfff8000000000800, R4, R5 // ADDV $-2251799813683200, R4, R5 // 1e00a0031e00001785f81000 ++ AND $0xfff8000000000800, R4 // AND $-2251799813683200, R4 // 1e00a0031e00001784f81400 ++ AND $0xfff8000000000800, R4, R5 // AND $-2251799813683200, R4, R5 // 1e00a0031e00001785f81400 +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc4.s b/src/cmd/asm/internal/asm/testdata/loong64enc4.s +new file mode 100644 +index 0000000000..16c06a3501 +--- /dev/null ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc4.s +@@ -0,0 +1,42 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
++ ++#include "../../../../../runtime/textflag.h" ++ ++TEXT asmtest(SB),DUPOK|NOSPLIT,$0 ++ // ADDV/AND C_DCON32_12S, [r1], r2 ++ ADDV $0x27312345fffff800, R4 // ADDV $2824077224892692480, R4 // 1e00a002be682416decf090384f81000 ++ ADDV $0x27312345fffff800, R4, R5 // ADDV $2824077224892692480, R4, R5 // 1e00a002be682416decf090385f81000 ++ AND $0x27312345fffff800, R4 // AND $2824077224892692480, R4 // 1e00a002be682416decf090384f81400 ++ AND $0x27312345fffff800, R4, R5 // AND $2824077224892692480, R4, R5 // 1e00a002be682416decf090385f81400 ++ ++ // ADDV/AND C_DCON32_0, [r1], r2 ++ ADDV $0x2731234500000000, R4 // ADDV $2824077220597727232, R4 // 1e008002be682416decf090384f81000 ++ ADDV $0x2731234500000000, R4, R5 // ADDV $2824077220597727232, R4, R5 // 1e008002be682416decf090385f81000 ++ AND $0x2731234500000000, R4 // AND $2824077220597727232, R4 // 1e008002be682416decf090384f81400 ++ AND $0x2731234500000000, R4, R5 // AND $2824077220597727232, R4, R5 // 1e008002be682416decf090385f81400 ++ ++ // ADDV/AND C_DCON32_20, [r1], r2 ++ ADDV $0x2731234512345000, R4 // ADDV $2824077220903145472, R4 // be682414be682416decf090384f81000 ++ ADDV $0x2731234512345000, R4, R5 // ADDV $2824077220903145472, R4, R5 // be682414be682416decf090385f81000 ++ AND $0x2731234512345000, R4 // AND $2824077220903145472, R4 // be682414be682416decf090384f81400 ++ AND $0x2731234512345000, R4, R5 // AND $2824077220903145472, R4, R5 // be682414be682416decf090385f81400 ++ ++ // ADDV/AND C_DCON12_32S, [r1], r2 ++ ADDV $0x273fffff80000800, R4 // ADDV $2828260563841189888, R4 // 1e000015de03a003decf090384f81000 ++ ADDV $0x273fffff80000800, R4, R5 // ADDV $2828260563841189888, R4, R5 // 1e000015de03a003decf090385f81000 ++ AND $0x273fffff80000800, R4 // AND $2828260563841189888, R4 // 1e000015de03a003decf090384f81400 ++ AND $0x273fffff80000800, R4, R5 // AND $2828260563841189888, R4, R5 // 1e000015de03a003decf090385f81400 ++ ++ // ADDV/AND C_DCON20S_32, [r1], r2 ++ ADDV $0xfff8000080000800, R4 // ADDV $-2251797666199552, R4 // 1e000015de03a0031e00001784f81000 ++ ADDV $0xfff8000080000800, R4, R5 // ADDV $-2251797666199552, R4, R5 // 1e000015de03a0031e00001785f81000 ++ AND $0xfff8000080000800, R4 // AND $-2251797666199552, R4 // 1e000015de03a0031e00001784f81400 ++ AND $0xfff8000080000800, R4, R5 // AND $-2251797666199552, R4, R5 // 1e000015de03a0031e00001785f81400 ++ ++ // ADDV/AND C_DCON32_12U, [r1], r2 ++ ADDV $0x2731234500000800, R4 // ADDV $2824077220597729280, R4 // 1e00a003be682416decf090384f81000 ++ ADDV $0x2731234500000800, R4, R5 // ADDV $2824077220597729280, R4, R5 // 1e00a003be682416decf090385f81000 ++ AND $0x2731234500000800, R4 // AND $2824077220597729280, R4 // 1e00a003be682416decf090384f81400 ++ AND $0x2731234500000800, R4, R5 // AND $2824077220597729280, R4, R5 // 1e00a003be682416decf090385f81400 +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc5.s b/src/cmd/asm/internal/asm/testdata/loong64enc5.s +new file mode 100644 +index 0000000000..423e5c3b01 +--- /dev/null ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc5.s +@@ -0,0 +1,17 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
++ ++#include "../../../../../runtime/textflag.h" ++ ++TEXT asmtest(SB),DUPOK|NOSPLIT,$0 ++ // ADDV/AND C_DCON, [r1], r2 ++ ADDV $0xfedcba9876543210, R4 // ADDV $-81985529216486896, R4 // 7ea8ec14de4388031e539717deb73f0384f81000 ++ ADDV $0xfedcba9876543210, R5, R4 // ADDV $-81985529216486896, R5, R4 // 7ea8ec14de4388031e539717deb73f03a4f81000 ++ ADDV $0x4edcba9876543210, R4 // ADDV $5682621993817747984, R4 // 7ea8ec14de4388031e539717deb7130384f81000 ++ ADDV $0x4edcba9876543210, R5, R4 // ADDV $5682621993817747984, R5, R4 // 7ea8ec14de4388031e539717deb71303a4f81000 ++ AND $0x4edcba9876543210, R4 // AND $5682621993817747984, R4 // 7ea8ec14de4388031e539717deb7130384f81400 ++ AND $0x4edcba9876543210, R5, R4 // AND $5682621993817747984, R5, R4 // 7ea8ec14de4388031e539717deb71303a4f81400 ++ AND $0xfedcba9876543210, R4 // AND $-81985529216486896, R4 // 7ea8ec14de4388031e539717deb73f0384f81400 ++ AND $0xfedcba9876543210, R5, R4 // AND $-81985529216486896, R5, R4 // 7ea8ec14de4388031e539717deb73f03a4f81400 ++ +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index 53b005af4d..b2207c2523 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -328,12 +328,58 @@ const ( + C_ZCON + C_SCON // 12 bit signed + C_UCON // 32 bit signed, low 12 bits 0 ++ ++ // When the immediate value is SCON, it can choose either the ADDCON implementation ++ // or the ANDCON implementation, using ADD0CON/AND0CON to distinguish them, so that ++ // the program can choose the implementation with fewer instructions. + C_ADD0CON + C_AND0CON +- C_ADDCON // -0x800 <= v < 0 +- C_ANDCON // 0 < v <= 0xFFF +- C_LCON // other 32 +- C_DCON // other 64 (could subdivide further) ++ ++ C_ADDCON // -0x800 <= v < 0 ++ C_ANDCON // 0 < v <= 0xFFF ++ C_LCON // other 32 ++ ++ // 64 bit signed, lo32 bits 0, hi20 bits are not 0, hi12 bits can ++ // be obtained by sign extension of the hi20 bits. ++ C_DCON20S_0 ++ // 64 bit signed, lo52 bits 0, hi12 bits are not 0. ++ C_DCON12_0 ++ // 64 bit signed, lo32 bits 0, hi32 bits are not 0. ++ C_DCON32_0 ++ // 64 bit signed, lo12 bits 0, lo20 bits are not 0, hi20 bits can be ++ // obtained by sign extension of the lo20 bits, other bits are not 0. ++ C_DCON12_20S ++ // 64 bit signed, lo12 bits 0, hi20 bits are not 0, hi12 bits can be ++ // obtained by sign extension of the hi20 bits, other bits are not 0. ++ C_DCON20S_20 ++ // 64 bit signed, lo12 bits 0, other bits are not 0. ++ C_DCON32_20 ++ // 64 bit signed, lo12 bits are not 0, 12~51 bits can be obtained ++ // by sign extension of the lo12 bits, other bits are not 0. ++ C_DCON12_12S ++ // 64 bit signed, hi20 bits and lo12 bits are not 0, hi12 bits can ++ // be obtained by sign extension of the hi20 bits, lo20 bits can ++ // be obtained by sign extension of the lo12 bits. ++ C_DCON20S_12S ++ // 64 bit signed, lo12 bits are not 0, lo20 bits can be obtained by sign ++ // extension of the lo12 bits, other bits are not 0. ++ C_DCON32_12S ++ // 64 bit signed, lo20 and lo12 bits are not 0, hi20 bits can be obtained by sign ++ // extension of the lo20 bits. other bits are not 0. ++ C_DCON12_32S ++ // 64 bit signed, hi20 bits are not 0, hi12 bits can be obtained by sign ++ // extension of the hi20 bits, lo32 bits are not 0. ++ C_DCON20S_32 ++ // 64 bit signed, 12~51 bits 0, other bits are not 0. ++ C_DCON12_12U ++ // 64 bit signed, lo20 bits 0, hi20 bits are not 0, hi12 bits can be ++ // obtained by sign extension of the hi20 bits, lo12 bits are not 0. 
++ C_DCON20S_12U ++ // 64 bit signed, lo20 bits 0, other bits are not 0. ++ C_DCON32_12U ++ // other 64 ++ C_DCON ++ + C_SACON // $n(REG) where n <= int12 + C_LACON // $n(REG) where int12 < n <= int32 + C_DACON // $n(REG) where int32 < n +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 9024c5e53e..5757c3c452 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -9,6 +9,7 @@ import ( + "cmd/internal/objabi" + "fmt" + "log" ++ "math/bits" + "slices" + ) + +@@ -192,6 +193,9 @@ var optab = []Optab{ + {AMOVV, C_UCON, C_NONE, C_NONE, C_REG, C_NONE, 24, 4, 0, 0}, + {AMOVW, C_LCON, C_NONE, C_NONE, C_REG, C_NONE, 19, 8, 0, NOTUSETMP}, + {AMOVV, C_LCON, C_NONE, C_NONE, C_REG, C_NONE, 19, 8, 0, NOTUSETMP}, ++ {AMOVV, C_DCON12_0, C_NONE, C_NONE, C_REG, C_NONE, 67, 4, 0, NOTUSETMP}, ++ {AMOVV, C_DCON12_20S, C_NONE, C_NONE, C_REG, C_NONE, 68, 8, 0, NOTUSETMP}, ++ {AMOVV, C_DCON32_12S, C_NONE, C_NONE, C_REG, C_NONE, 69, 12, 0, NOTUSETMP}, + {AMOVV, C_DCON, C_NONE, C_NONE, C_REG, C_NONE, 59, 16, 0, NOTUSETMP}, + + {AADD, C_ADD0CON, C_REG, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, +@@ -225,6 +229,20 @@ var optab = []Optab{ + + {AADDV, C_DCON, C_NONE, C_NONE, C_REG, C_NONE, 60, 20, 0, 0}, + {AADDV, C_DCON, C_REG, C_NONE, C_REG, C_NONE, 60, 20, 0, 0}, ++ {AAND, C_DCON, C_NONE, C_NONE, C_REG, C_NONE, 60, 20, 0, 0}, ++ {AAND, C_DCON, C_REG, C_NONE, C_REG, C_NONE, 60, 20, 0, 0}, ++ {AADDV, C_DCON12_0, C_NONE, C_NONE, C_REG, C_NONE, 70, 8, 0, 0}, ++ {AADDV, C_DCON12_0, C_REG, C_NONE, C_REG, C_NONE, 70, 8, 0, 0}, ++ {AAND, C_DCON12_0, C_NONE, C_NONE, C_REG, C_NONE, 70, 8, 0, 0}, ++ {AAND, C_DCON12_0, C_REG, C_NONE, C_REG, C_NONE, 70, 8, 0, 0}, ++ {AADDV, C_DCON12_20S, C_NONE, C_NONE, C_REG, C_NONE, 71, 12, 0, 0}, ++ {AADDV, C_DCON12_20S, C_REG, C_NONE, C_REG, C_NONE, 71, 12, 0, 0}, ++ {AAND, C_DCON12_20S, C_NONE, C_NONE, C_REG, C_NONE, 71, 12, 0, 0}, ++ {AAND, C_DCON12_20S, C_REG, C_NONE, C_REG, C_NONE, 71, 12, 0, 0}, ++ {AADDV, C_DCON32_12S, C_NONE, C_NONE, C_REG, C_NONE, 72, 16, 0, 0}, ++ {AADDV, C_DCON32_12S, C_REG, C_NONE, C_REG, C_NONE, 72, 16, 0, 0}, ++ {AAND, C_DCON32_12S, C_NONE, C_NONE, C_REG, C_NONE, 72, 16, 0, 0}, ++ {AAND, C_DCON32_12S, C_REG, C_NONE, C_REG, C_NONE, 72, 16, 0, 0}, + + {ASLL, C_SCON, C_REG, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, + {ASLL, C_SCON, C_NONE, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, +@@ -790,7 +808,7 @@ func (c *ctxt0) aclass(a *obj.Addr) int { + } + + if c.instoffset != int64(int32(c.instoffset)) { +- return C_DCON ++ return dconClass(c.instoffset) + } + + if c.instoffset >= 0 { +@@ -830,6 +848,159 @@ func (c *ctxt0) aclass(a *obj.Addr) int { + return C_GOK + } + ++// The constants here define the data characteristics within the bit field range. ++// ++// ALL1: The data in the bit field is all 1 ++// ALL0: The data in the bit field is all 0 ++// ST1: The data in the bit field starts with 1, but not all 1 ++// ST0: The data in the bit field starts with 0, but not all 0 ++const ( ++ ALL1 = iota ++ ALL0 ++ ST1 ++ ST0 ++) ++ ++// mask returns the mask of the specified bit field, which is used to help determine ++// the data characteristics of the immediate value at the specified bit. 
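Before the helpers below, it may help to see concretely the four instruction fields that this classification reasons about: ori covers bits 0-11, lu12i.w bits 12-31, lu32i.d bits 32-51, and lu52i.d bits 52-63, and each class records which fields sign or zero extension already fills for free. A standalone sketch (hypothetical code, not part of the patch):

    package main

    import "fmt"

    // fields splits a 64-bit constant into the ori / lu12i.w / lu32i.d / lu52i.d
    // immediates used when materializing it on loong64.
    func fields(v uint64) (lo12, lo20, hi20, hi12 uint64) {
        lo12 = v & 0xfff
        lo20 = (v >> 12) & 0xfffff
        hi20 = (v >> 32) & 0xfffff
        hi12 = v >> 52
        return
    }

    func main() {
        // $0x7a90000000000000 from the loong64enc1.s tests: only the top 12-bit
        // field is non-zero, so a single lu52i.d suffices (class C_DCON12_0).
        lo12, lo20, hi20, hi12 := fields(0x7a90000000000000)
        fmt.Printf("ori=%#x lu12i.w=%#x lu32i.d=%#x lu52i.d=%#x\n", lo12, lo20, hi20, hi12)
    }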
++func mask(suf int8, len int8) (uint64, uint64) { ++ if len == 12 { ++ if suf == 0 { ++ return 0xfff, 0x800 ++ } else { // suf == 52 ++ return 0xfff0000000000000, 0x8000000000000000 ++ } ++ } else { // len == 20 ++ if suf == 12 { ++ return 0xfffff000, 0x80000000 ++ } else { // suf == 32 ++ return 0xfffff00000000, 0x8000000000000 ++ } ++ } ++} ++ ++// bitField return a number represent status of val in bit field ++// ++// suf: The starting bit of the bit field ++// len: The length of the bit field ++func bitField(val int64, suf int8, len int8) int8 { ++ mask1, mask2 := mask(suf, len) ++ if uint64(val)&mask1 == mask1 { ++ return ALL1 ++ } else if uint64(val)&mask1 == 0x0 { ++ return ALL0 ++ } else if uint64(val)&mask2 == mask2 { ++ return ST1 ++ } else { ++ return ST0 ++ } ++} ++ ++// Loading an immediate value larger than 32 bits requires four instructions ++// on loong64 (lu12i.w + ori + lu32i.d + lu52i.d), but in some special cases, ++// we can use the sign extension and zero extension features of the instruction ++// to fill in the high-order data (all 0 or all 1), which can save one to ++// three instructions. ++// ++// | 63 ~ 52 | 51 ~ 32 | 31 ~ 12 | 11 ~ 0 | ++// | lu52i.d | lu32i.d | lu12i.w | ori | ++func dconClass(offset int64) int { ++ tzb := bits.TrailingZeros64(uint64(offset)) ++ hi12 := bitField(offset, 52, 12) ++ hi20 := bitField(offset, 32, 20) ++ lo20 := bitField(offset, 12, 20) ++ lo12 := bitField(offset, 0, 12) ++ if tzb >= 52 { ++ return C_DCON12_0 // lu52i.d ++ } ++ if tzb >= 32 { ++ if ((hi20 == ALL1 || hi20 == ST1) && hi12 == ALL1) || ((hi20 == ALL0 || hi20 == ST0) && hi12 == ALL0) { ++ return C_DCON20S_0 // addi.w + lu32i.d ++ } ++ return C_DCON32_0 // addi.w + lu32i.d + lu52i.d ++ } ++ if tzb >= 12 { ++ if lo20 == ST1 || lo20 == ALL1 { ++ if hi20 == ALL1 { ++ return C_DCON12_20S // lu12i.w + lu52i.d ++ } ++ if (hi20 == ST1 && hi12 == ALL1) || ((hi20 == ST0 || hi20 == ALL0) && hi12 == ALL0) { ++ return C_DCON20S_20 // lu12i.w + lu32i.d ++ } ++ return C_DCON32_20 // lu12i.w + lu32i.d + lu52i.d ++ } ++ if hi20 == ALL0 { ++ return C_DCON12_20S // lu12i.w + lu52i.d ++ } ++ if (hi20 == ST0 && hi12 == ALL0) || ((hi20 == ST1 || hi20 == ALL1) && hi12 == ALL1) { ++ return C_DCON20S_20 // lu12i.w + lu32i.d ++ } ++ return C_DCON32_20 // lu12i.w + lu32i.d + lu52i.d ++ } ++ if lo12 == ST1 || lo12 == ALL1 { ++ if lo20 == ALL1 { ++ if hi20 == ALL1 { ++ return C_DCON12_12S // addi.d + lu52i.d ++ } ++ if (hi20 == ST1 && hi12 == ALL1) || ((hi20 == ST0 || hi20 == ALL0) && hi12 == ALL0) { ++ return C_DCON20S_12S // addi.w + lu32i.d ++ } ++ return C_DCON32_12S // addi.w + lu32i.d + lu52i.d ++ } ++ if lo20 == ST1 { ++ if hi20 == ALL1 { ++ ++ return C_DCON12_32S // lu12i.w + ori + lu52i.d ++ } ++ if (hi20 == ST1 && hi12 == ALL1) || ((hi20 == ST0 || hi20 == ALL0) && hi12 == ALL0) { ++ return C_DCON20S_32 // lu12i.w + ori + lu32i.d ++ } ++ return C_DCON // lu12i.w + ori + lu32i.d + lu52i.d ++ } ++ if lo20 == ALL0 { ++ if hi20 == ALL0 { ++ return C_DCON12_12U // ori + lu52i.d ++ } ++ if ((hi20 == ST1 || hi20 == ALL1) && hi12 == ALL1) || (hi20 == ST0 && hi12 == ALL0) { ++ return C_DCON20S_12U // ori + lu32i.d ++ } ++ return C_DCON32_12U // ori + lu32i.d + lu52i.d ++ } ++ if hi20 == ALL0 { ++ return C_DCON12_32S // lu12i.w + ori + lu52i.d ++ } ++ if ((hi20 == ST1 || hi20 == ALL1) && hi12 == ALL1) || (hi20 == ST0 && hi12 == ALL0) { ++ return C_DCON20S_32 // lu12i.w + ori + lu32i.d ++ } ++ return C_DCON // lu12i.w + ori + lu32i.d + lu52i.d ++ } ++ if lo20 == ALL0 { ++ if hi20 == ALL0 { ++ return 
C_DCON12_12U // ori + lu52i.d ++ } ++ if ((hi20 == ST1 || hi20 == ALL1) && hi12 == ALL1) || (hi20 == ST0 && hi12 == ALL0) { ++ return C_DCON20S_12U // ori + lu32i.d ++ } ++ return C_DCON32_12U // ori + lu32i.d + lu52i.d ++ } ++ if lo20 == ST1 || lo20 == ALL1 { ++ if hi20 == ALL1 { ++ return C_DCON12_32S // lu12i.w + ori + lu52i.d ++ } ++ if (hi20 == ST1 && hi12 == ALL1) || ((hi20 == ST0 || hi20 == ALL0) && hi12 == ALL0) { ++ return C_DCON20S_32 // lu12i.w + ori + lu32i.d ++ } ++ return C_DCON ++ } ++ if hi20 == ALL0 { ++ return C_DCON12_32S // lu12i.w + ori + lu52i.d ++ } ++ if ((hi20 == ST1 || hi20 == ALL1) && hi12 == ALL1) || (hi20 == ST0 && hi12 == ALL0) { ++ return C_DCON20S_32 // lu12i.w + ori + lu32i.d ++ } ++ return C_DCON ++} ++ + // In Loong64,there are 8 CFRs, denoted as fcc0-fcc7. + // There are 4 FCSRs, denoted as fcsr0-fcsr3. + func (c *ctxt0) rclass(r int16) int { +@@ -935,7 +1106,14 @@ func cmp(a int, b int) bool { + } + switch a { + case C_DCON: +- if b == C_LCON { ++ if b == C_LCON || b == C_DCON32_0 || ++ b == C_DCON12_0 || b == C_DCON20S_0 || ++ b == C_DCON12_20S || b == C_DCON12_12S || ++ b == C_DCON20S_20 || b == C_DCON32_20 || ++ b == C_DCON20S_12S || b == C_DCON32_12S || ++ b == C_DCON12_32S || b == C_DCON20S_32 || ++ b == C_DCON12_12U || b == C_DCON20S_12U || ++ b == C_DCON32_12U { + return true + } + fallthrough +@@ -944,6 +1122,22 @@ func cmp(a int, b int) bool { + return true + } + ++ case C_DCON12_0: ++ ++ case C_DCON12_20S: ++ if b == C_DCON20S_20 || b == C_DCON12_12S || ++ b == C_DCON20S_12S || b == C_DCON12_12U || ++ b == C_DCON20S_12U || b == C_DCON20S_0 { ++ return true ++ } ++ ++ case C_DCON32_12S: ++ if b == C_DCON32_20 || b == C_DCON12_32S || ++ b == C_DCON20S_32 || b == C_DCON32_12U || ++ b == C_DCON32_0 { ++ return true ++ } ++ + case C_ADD0CON: + if b == C_ADDCON { + return true +@@ -2015,6 +2209,129 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + c.ctxt.Diag("illegal register combination: %v\n", p) + } + o1 = OP_RRR(atomicInst[p.As], uint32(rk), uint32(rj), uint32(rd)) ++ ++ case 67: // mov $dcon12_0, r ++ v := c.vregoff(&p.From) ++ o1 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(0), uint32(p.To.Reg)) ++ ++ case 68: // mov $dcon12_20S, r ++ v := c.vregoff(&p.From) ++ contype := c.aclass(&p.From) ++ switch contype { ++ default: // C_DCON12_20S ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(p.To.Reg)) ++ o2 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(p.To.Reg), uint32(p.To.Reg)) ++ case C_DCON20S_20: ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(p.To.Reg)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(p.To.Reg)) ++ case C_DCON12_12S: ++ o1 = OP_12IRR(c.opirr(AADDV), uint32(v), uint32(0), uint32(p.To.Reg)) ++ o2 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(p.To.Reg), uint32(p.To.Reg)) ++ case C_DCON20S_12S, C_DCON20S_0: ++ o1 = OP_12IRR(c.opirr(AADD), uint32(v), uint32(0), uint32(p.To.Reg)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(p.To.Reg)) ++ case C_DCON12_12U: ++ o1 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(0), uint32(p.To.Reg)) ++ o2 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(p.To.Reg), uint32(p.To.Reg)) ++ case C_DCON20S_12U: ++ o1 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(0), uint32(p.To.Reg)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(p.To.Reg)) ++ } ++ ++ case 69: // mov $dcon32_12S, r ++ v := c.vregoff(&p.From) ++ contype := c.aclass(&p.From) ++ switch contype { ++ default: // C_DCON32_12S, C_DCON32_0 ++ o1 = OP_12IRR(c.opirr(AADD), uint32(v), uint32(0), 
uint32(p.To.Reg)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(p.To.Reg)) ++ o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(p.To.Reg), uint32(p.To.Reg)) ++ case C_DCON32_20: ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(p.To.Reg)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(p.To.Reg)) ++ o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(p.To.Reg), uint32(p.To.Reg)) ++ case C_DCON12_32S: ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(p.To.Reg)) ++ o2 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(p.To.Reg), uint32(p.To.Reg)) ++ o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(p.To.Reg), uint32(p.To.Reg)) ++ case C_DCON20S_32: ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(p.To.Reg)) ++ o2 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(p.To.Reg), uint32(p.To.Reg)) ++ o3 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(p.To.Reg)) ++ case C_DCON32_12U: ++ o1 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(0), uint32(p.To.Reg)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(p.To.Reg)) ++ o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(p.To.Reg), uint32(p.To.Reg)) ++ } ++ ++ case 70: // add $dcon12_0,[r1],r2 ++ v := c.vregoff(&p.From) ++ r := int(p.Reg) ++ if r == 0 { ++ r = int(p.To.Reg) ++ } ++ o1 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(0), uint32(REGTMP)) ++ o2 = OP_RRR(c.oprrr(p.As), uint32(REGTMP), uint32(r), uint32(p.To.Reg)) ++ ++ case 71: // add $dcon12_20S,[r1],r2 ++ v := c.vregoff(&p.From) ++ r := int(p.Reg) ++ if r == 0 { ++ r = int(p.To.Reg) ++ } ++ contype := c.aclass(&p.From) ++ switch contype { ++ default: // C_DCON12_20S ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(REGTMP)) ++ o2 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(REGTMP), uint32(REGTMP)) ++ case C_DCON20S_20: ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(REGTMP)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(REGTMP)) ++ case C_DCON12_12S: ++ o1 = OP_12IRR(c.opirr(AADDV), uint32(v), uint32(0), uint32(REGTMP)) ++ o2 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(REGTMP), uint32(REGTMP)) ++ case C_DCON20S_12S, C_DCON20S_0: ++ o1 = OP_12IRR(c.opirr(AADD), uint32(v), uint32(0), uint32(REGTMP)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(REGTMP)) ++ case C_DCON12_12U: ++ o1 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(0), uint32(REGTMP)) ++ o2 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(REGTMP), uint32(REGTMP)) ++ case C_DCON20S_12U: ++ o1 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(0), uint32(REGTMP)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(REGTMP)) ++ } ++ o3 = OP_RRR(c.oprrr(p.As), uint32(REGTMP), uint32(r), uint32(p.To.Reg)) ++ ++ case 72: // add $dcon32_12S,[r1],r2 ++ v := c.vregoff(&p.From) ++ r := int(p.Reg) ++ if r == 0 { ++ r = int(p.To.Reg) ++ } ++ contype := c.aclass(&p.From) ++ switch contype { ++ default: // C_DCON32_12S, C_DCON32_0 ++ o1 = OP_12IRR(c.opirr(AADD), uint32(v), uint32(0), uint32(REGTMP)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(REGTMP)) ++ o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(REGTMP), uint32(REGTMP)) ++ case C_DCON32_20: ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(REGTMP)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(REGTMP)) ++ o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(REGTMP), uint32(REGTMP)) ++ case C_DCON12_32S: ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(REGTMP)) ++ o2 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(REGTMP), uint32(REGTMP)) ++ o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(REGTMP), 
uint32(REGTMP)) ++ case C_DCON20S_32: ++ o1 = OP_IR(c.opir(ALU12IW), uint32(v>>12), uint32(REGTMP)) ++ o2 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(REGTMP), uint32(REGTMP)) ++ o3 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(REGTMP)) ++ case C_DCON32_12U: ++ o1 = OP_12IRR(c.opirr(AOR), uint32(v), uint32(0), uint32(REGTMP)) ++ o2 = OP_IR(c.opir(ALU32ID), uint32(v>>32), uint32(REGTMP)) ++ o3 = OP_12IRR(c.opirr(ALU52ID), uint32(v>>52), uint32(REGTMP), uint32(REGTMP)) ++ } ++ o4 = OP_RRR(c.oprrr(p.As), uint32(REGTMP), uint32(r), uint32(p.To.Reg)) + } + + out[0] = o1 +diff --git a/src/cmd/internal/obj/loong64/cnames.go b/src/cmd/internal/obj/loong64/cnames.go +index ce76109d2a..a2f04a22ee 100644 +--- a/src/cmd/internal/obj/loong64/cnames.go ++++ b/src/cmd/internal/obj/loong64/cnames.go +@@ -21,6 +21,20 @@ var cnames0 = []string{ + "ADDCON", + "ANDCON", + "LCON", ++ "DCON20S_0", ++ "DCON12_0", ++ "DCON32_0", ++ "DCON12_20S", ++ "DCON20S_20", ++ "DCON32_20", ++ "DCON12_12S", ++ "DCON20S_12S", ++ "DCON32_12S", ++ "DCON12_32S", ++ "DCON20S_32", ++ "DCON12_12U", ++ "DCON20S_12U", ++ "DCON32_12U", + "DCON", + "SACON", + "LACON", +-- +2.38.1 + diff --git a/0012-math-big-optimize-addVV-function-for-loong64.patch b/0012-math-big-optimize-addVV-function-for-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..8d91ab457176ba0293acd53bab799992c13f77ce --- /dev/null +++ b/0012-math-big-optimize-addVV-function-for-loong64.patch @@ -0,0 +1,85 @@ +From a7a4eb8120aaf7d5f8d2146f190c64118c7e1235 Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Thu, 6 Jun 2024 15:30:20 +0800 +Subject: [PATCH 12/44] math/big: optimize addVV function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3C5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: math/big +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_addvv.log │ test/new_3c5000_addvv.log │ + │ sec/op │ sec/op vs base │ +AddVV/1 10.920n ± 0% 7.671n ± 0% -29.75% (p=0.000 n=20) +AddVV/2 14.100n ± 0% 8.849n ± 0% -37.24% (p=0.000 n=20) +AddVV/3 16.38n ± 0% 11.07n ± 0% -32.42% (p=0.000 n=20) +AddVV/4 18.65n ± 0% 12.86n ± 0% -31.05% (p=0.000 n=20) +AddVV/5 20.93n ± 0% 15.01n ± 0% -28.28% (p=0.000 n=20) +AddVV/10 31.84n ± 0% 22.75n ± 0% -28.53% (p=0.000 n=20) +AddVV/100 242.4n ± 0% 149.7n ± 0% -38.24% (p=0.000 n=20) +AddVV/1000 2.290µ ± 0% 1.378µ ± 0% -39.83% (p=0.000 n=20) +AddVV/10000 32.73µ ± 0% 19.36µ ± 0% -40.84% (p=0.000 n=20) +AddVV/100000 340.9µ ± 0% 238.5µ ± 0% -30.03% (p=0.000 n=20) +geomean 213.2n 141.2n -33.79% + +Change-Id: I7983a93d9b97d4e9ebe96a49107ec6db9194b013 +--- + src/math/big/arith_loong64.s | 31 +++++++++++++++++++++++++++++-- + 1 file changed, 29 insertions(+), 2 deletions(-) + +diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s +index 847e3127fb..bd7204cf06 100644 +--- a/src/math/big/arith_loong64.s ++++ b/src/math/big/arith_loong64.s +@@ -2,15 +2,42 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !math_big_pure_go && loong64 ++//go:build !math_big_pure_go + + #include "textflag.h" + + // This file provides fast assembly versions for the elementary + // arithmetic operations on vectors implemented in arith.go. 
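++//
++// Loong64 has no flags register and no add-with-carry instruction, so
++// the routines below materialize each carry with SGTU (set on unsigned
++// greater-than). A rough sketch of one addVV step in Go, assuming
++// 64-bit Words (names are illustrative, not the actual code):
++//
++//	s := x + y          // may wrap around
++//	c1 := Word(0)
++//	if s < x {          // unsigned compare, what SGTU computes
++//		c1 = 1
++//	}
++//	z := s + cin        // add the carry-in from the previous word
++//	c2 := Word(0)
++//	if z < s {
++//		c2 = 1
++//	}
++//	cout := c1 | c2     // at most one of c1, c2 can be set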
+ ++// func addVV(z, x, y []Word) (c Word) + TEXT ·addVV(SB),NOSPLIT,$0 +- JMP ·addVV_g(SB) ++ // input: ++ // R4: z ++ // R5: z_len ++ // R7: x ++ // R10: y ++ MOVV z+0(FP), R4 ++ MOVV z_len+8(FP), R5 ++ MOVV x+24(FP), R7 ++ MOVV y+48(FP), R10 ++ MOVV $0, R6 ++ SLLV $3, R5 ++ MOVV $0, R8 ++loop: ++ BEQ R5, R6, done ++ MOVV (R6)(R7), R9 ++ MOVV (R6)(R10), R11 ++ ADDV R9, R11, R11 // x1 + y1 = z1', if z1' < x1 then z1' overflow ++ ADDV R8, R11, R12 // z1' + c0 = z1, if z1 < z1' then z1 overflow ++ SGTU R9, R11, R9 ++ SGTU R11, R12, R11 ++ MOVV R12, (R6)(R4) ++ OR R9, R11, R8 ++ ADDV $8, R6 ++ JMP loop ++done: ++ MOVV R8, c+72(FP) ++ RET + + TEXT ·subVV(SB),NOSPLIT,$0 + JMP ·subVV_g(SB) +-- +2.38.1 + diff --git a/0013-math-big-optimize-addVW-function-for-loong64.patch b/0013-math-big-optimize-addVW-function-for-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..853a0d43914daf3d1d20664bb8ea8f31f137c34e --- /dev/null +++ b/0013-math-big-optimize-addVW-function-for-loong64.patch @@ -0,0 +1,82 @@ +From 94a6bdcacffb17b8adf57ce0919a3d31ac70b646 Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Tue, 11 Jun 2024 16:09:10 +0800 +Subject: [PATCH 13/44] math/big: optimize addVW function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3C5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: math/big +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_addvw.log │ test/new_3c5000_addvw.log │ + │ sec/op │ sec/op vs base │ +AddVW/1 9.555n ± 0% 5.915n ± 0% -38.09% (p=0.000 n=20) +AddVW/2 11.370n ± 0% 6.825n ± 0% -39.97% (p=0.000 n=20) +AddVW/3 12.485n ± 0% 7.970n ± 0% -36.16% (p=0.000 n=20) +AddVW/4 14.980n ± 0% 9.718n ± 0% -35.13% (p=0.000 n=20) +AddVW/5 16.73n ± 0% 10.63n ± 0% -36.46% (p=0.000 n=20) +AddVW/10 24.57n ± 0% 15.18n ± 0% -38.23% (p=0.000 n=20) +AddVW/100 184.9n ± 0% 102.4n ± 0% -44.62% (p=0.000 n=20) +AddVW/1000 1721.0n ± 0% 921.4n ± 0% -46.46% (p=0.000 n=20) +AddVW/10000 16.83µ ± 0% 11.68µ ± 0% -30.58% (p=0.000 n=20) +AddVW/100000 184.7µ ± 0% 131.3µ ± 0% -28.93% (p=0.000 n=20) +AddVWext/1 9.554n ± 0% 5.915n ± 0% -38.09% (p=0.000 n=20) +AddVWext/2 11.370n ± 0% 6.825n ± 0% -39.97% (p=0.000 n=20) +AddVWext/3 12.505n ± 0% 7.969n ± 0% -36.27% (p=0.000 n=20) +AddVWext/4 14.980n ± 0% 9.718n ± 0% -35.13% (p=0.000 n=20) +AddVWext/5 16.70n ± 0% 10.63n ± 0% -36.33% (p=0.000 n=20) +AddVWext/10 24.54n ± 0% 15.18n ± 0% -38.13% (p=0.000 n=20) +AddVWext/100 185.0n ± 0% 102.4n ± 0% -44.65% (p=0.000 n=20) +AddVWext/1000 1721.0n ± 0% 921.4n ± 0% -46.46% (p=0.000 n=20) +AddVWext/10000 16.83µ ± 0% 11.68µ ± 0% -30.60% (p=0.000 n=20) +AddVWext/100000 184.9µ ± 0% 130.4µ ± 0% -29.51% (p=0.000 n=20) +geomean 155.5n 96.87n -37.70% + +Change-Id: I824a90cb365e09d7d0d4a2c53ff4b30cf057a75e +--- + src/math/big/arith_loong64.s | 24 +++++++++++++++++++++++- + 1 file changed, 23 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s +index bd7204cf06..bd6fec1b8d 100644 +--- a/src/math/big/arith_loong64.s ++++ b/src/math/big/arith_loong64.s +@@ -42,8 +42,30 @@ done: + TEXT ·subVV(SB),NOSPLIT,$0 + JMP ·subVV_g(SB) + ++// func addVW(z, x []Word, y Word) (c Word) + TEXT ·addVW(SB),NOSPLIT,$0 +- JMP ·addVW_g(SB) ++ // input: ++ // R4: z ++ // R5: z_len ++ // R7: x ++ // R10: y ++ MOVV z+0(FP), R4 ++ MOVV z_len+8(FP), R5 ++ MOVV x+24(FP), R7 ++ MOVV y+48(FP), R10 ++ MOVV $0, R6 ++ SLLV $3, R5 ++loop: ++ BEQ R5, R6, done ++ MOVV (R6)(R7), 
R8 ++ ADDV R8, R10, R9 // x1 + c = z1, if z1 < x1 then z1 overflow ++ SGTU R8, R9, R10 ++ MOVV R9, (R6)(R4) ++ ADDV $8, R6 ++ JMP loop ++done: ++ MOVV R10, c+56(FP) ++ RET + + TEXT ·subVW(SB),NOSPLIT,$0 + JMP ·subVW_g(SB) +-- +2.38.1 + diff --git a/0014-math-big-optimize-subVV-function-for-loong64.patch b/0014-math-big-optimize-subVV-function-for-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..1a9c5dd8394467656a88cc5f1288e506562532a8 --- /dev/null +++ b/0014-math-big-optimize-subVV-function-for-loong64.patch @@ -0,0 +1,77 @@ +From 7939ebdcaa1156ef4e9d8f896f4877df88d7636c Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Tue, 11 Jun 2024 19:06:29 +0800 +Subject: [PATCH 14/44] math/big: optimize subVV function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3C5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: math/big +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_subvv.log │ test/new_3c5000_subvv.log │ + │ sec/op │ sec/op vs base │ +SubVV/1 10.920n ± 0% 7.657n ± 0% -29.88% (p=0.000 n=20) +SubVV/2 14.100n ± 0% 8.841n ± 0% -37.30% (p=0.000 n=20) +SubVV/3 16.38n ± 0% 11.06n ± 0% -32.48% (p=0.000 n=20) +SubVV/4 18.65n ± 0% 12.85n ± 0% -31.10% (p=0.000 n=20) +SubVV/5 20.93n ± 0% 14.79n ± 0% -29.34% (p=0.000 n=20) +SubVV/10 32.30n ± 0% 22.29n ± 0% -30.99% (p=0.000 n=20) +SubVV/100 244.3n ± 0% 149.2n ± 0% -38.93% (p=0.000 n=20) +SubVV/1000 2.292µ ± 0% 1.378µ ± 0% -39.88% (p=0.000 n=20) +SubVV/10000 26.26µ ± 0% 25.64µ ± 0% -2.33% (p=0.000 n=20) +SubVV/100000 341.3µ ± 0% 238.0µ ± 0% -30.26% (p=0.000 n=20) +geomean 209.1n 144.5n -30.86% + +Change-Id: I3863c2c6728f1b0f8fecbf77de13254299c5b1cb +--- + src/math/big/arith_loong64.s | 29 ++++++++++++++++++++++++++++- + 1 file changed, 28 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s +index bd6fec1b8d..8016c25207 100644 +--- a/src/math/big/arith_loong64.s ++++ b/src/math/big/arith_loong64.s +@@ -39,8 +39,35 @@ done: + MOVV R8, c+72(FP) + RET + ++// func subVV(z, x, y []Word) (c Word) + TEXT ·subVV(SB),NOSPLIT,$0 +- JMP ·subVV_g(SB) ++ // input: ++ // R4: z ++ // R5: z_len ++ // R7: x ++ // R10: y ++ MOVV z+0(FP), R4 ++ MOVV z_len+8(FP), R5 ++ MOVV x+24(FP), R7 ++ MOVV y+48(FP), R10 ++ MOVV $0, R6 ++ SLLV $3, R5 ++ MOVV $0, R8 ++loop: ++ BEQ R5, R6, done ++ MOVV (R6)(R7), R9 ++ MOVV (R6)(R10), R11 ++ SUBV R11, R9, R11 // x1 - y1 = z1', if z1' > x1 then overflow ++ SUBV R8, R11, R12 // z1' - c0 = z1, if z1 > z1' then overflow ++ SGTU R11, R9, R9 ++ SGTU R12, R11, R11 ++ MOVV R12, (R6)(R4) ++ OR R9, R11, R8 ++ ADDV $8, R6 ++ JMP loop ++done: ++ MOVV R8, c+72(FP) ++ RET + + // func addVW(z, x []Word, y Word) (c Word) + TEXT ·addVW(SB),NOSPLIT,$0 +-- +2.38.1 + diff --git a/0015-math-big-optimize-subVW-function-for-loong64.patch b/0015-math-big-optimize-subVW-function-for-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..82c8a1c43ae0fc4c6ba1edd1e62c907c855baf55 --- /dev/null +++ b/0015-math-big-optimize-subVW-function-for-loong64.patch @@ -0,0 +1,82 @@ +From b8516483f552400ef8708645b8a10bed5f666dba Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Tue, 11 Jun 2024 20:33:50 +0800 +Subject: [PATCH 15/44] math/big: optimize subVW function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3C5000 (which is an LA464 implementation): + 
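+The per-word borrow is computed without a flags register: after
+z = x - c, the borrow out is exactly z > x as an unsigned compare,
+which a single SGTU instruction produces. A rough sketch of one loop
+step in Go (names are illustrative, not the actual implementation):
+
+	z := x - c          // may wrap below zero
+	if z > x {          // unsigned compare, what SGTU computes
+		c = 1       // borrow into the next word
+	} else {
+		c = 0
+	}
+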
+goos: linux +goarch: loong64 +pkg: math/big +cpu: Loongson-3C5000 @ 2200.00MHz + │ test/old_3c5000_subvw.log │ test/new_3c5000_subvw.log │ + │ sec/op │ sec/op vs base │ +SubVW/1 8.564n ± 0% 5.915n ± 0% -30.93% (p=0.000 n=20) +SubVW/2 11.675n ± 0% 6.825n ± 0% -41.54% (p=0.000 n=20) +SubVW/3 13.410n ± 0% 7.969n ± 0% -40.57% (p=0.000 n=20) +SubVW/4 15.300n ± 0% 9.740n ± 0% -36.34% (p=0.000 n=20) +SubVW/5 17.34n ± 1% 10.66n ± 0% -38.55% (p=0.000 n=20) +SubVW/10 26.55n ± 0% 15.21n ± 0% -42.70% (p=0.000 n=20) +SubVW/100 199.2n ± 0% 102.5n ± 0% -48.52% (p=0.000 n=20) +SubVW/1000 1866.5n ± 1% 924.6n ± 0% -50.46% (p=0.000 n=20) +SubVW/10000 17.67µ ± 2% 12.04µ ± 2% -31.83% (p=0.000 n=20) +SubVW/100000 186.4µ ± 0% 132.0µ ± 0% -29.17% (p=0.000 n=20) +SubVWext/1 8.616n ± 0% 5.949n ± 0% -30.95% (p=0.000 n=20) +SubVWext/2 11.410n ± 0% 7.008n ± 1% -38.58% (p=0.000 n=20) +SubVWext/3 13.255n ± 1% 8.073n ± 0% -39.09% (p=0.000 n=20) +SubVWext/4 15.095n ± 0% 9.893n ± 0% -34.47% (p=0.000 n=20) +SubVWext/5 16.87n ± 0% 10.86n ± 0% -35.63% (p=0.000 n=20) +SubVWext/10 26.00n ± 0% 15.54n ± 0% -40.22% (p=0.000 n=20) +SubVWext/100 196.0n ± 0% 104.3n ± 1% -46.76% (p=0.000 n=20) +SubVWext/1000 1847.0n ± 0% 923.7n ± 0% -49.99% (p=0.000 n=20) +SubVWext/10000 17.30µ ± 1% 11.71µ ± 1% -32.31% (p=0.000 n=20) +SubVWext/100000 187.5µ ± 0% 131.6µ ± 0% -29.82% (p=0.000 n=20) +geomean 159.7n 97.79n -38.79% + +Change-Id: I21a6903e79b02cb22282e80c9bfe2ae9f1a87589 +--- + src/math/big/arith_loong64.s | 24 +++++++++++++++++++++++- + 1 file changed, 23 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s +index 8016c25207..02d8262129 100644 +--- a/src/math/big/arith_loong64.s ++++ b/src/math/big/arith_loong64.s +@@ -94,8 +94,30 @@ done: + MOVV R10, c+56(FP) + RET + ++// func subVW(z, x []Word, y Word) (c Word) + TEXT ·subVW(SB),NOSPLIT,$0 +- JMP ·subVW_g(SB) ++ // input: ++ // R4: z ++ // R5: z_len ++ // R7: x ++ // R10: y ++ MOVV z+0(FP), R4 ++ MOVV z_len+8(FP), R5 ++ MOVV x+24(FP), R7 ++ MOVV y+48(FP), R10 ++ MOVV $0, R6 ++ SLLV $3, R5 ++loop: ++ BEQ R5, R6, done ++ MOVV (R6)(R7), R8 ++ SUBV R10, R8, R11 // x1 - c = z1, if z1 > x1 then overflow ++ SGTU R11, R8, R10 ++ MOVV R11, (R6)(R4) ++ ADDV $8, R6 ++ JMP loop ++done: ++ MOVV R10, c+56(FP) ++ RET + + TEXT ·shlVU(SB),NOSPLIT,$0 + JMP ·shlVU_g(SB) +-- +2.38.1 + diff --git a/0016-math-big-optimize-shlVU-function-for-loong64.patch b/0016-math-big-optimize-shlVU-function-for-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..a7fb046e70251547d0934928b6f9000eb7ac700a --- /dev/null +++ b/0016-math-big-optimize-shlVU-function-for-loong64.patch @@ -0,0 +1,92 @@ +From 3d520765bbff022132512b918379fe1a5e788f2e Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Thu, 13 Jun 2024 11:36:30 +0800 +Subject: [PATCH 16/44] math/big: optimize shlVU function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3A5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: math/big +cpu: Loongson-3A5000-HV @ 2500.00MHz + │ old_3a5000_shlvu.log │ new_3a5000_shlvu_1st.log │ + │ sec/op │ sec/op vs base │ +NonZeroShifts/1/shlVU 7.606n ± 0% 5.304n ± 0% -30.27% (p=0.000 n=20) +NonZeroShifts/2/shlVU 9.608n ± 0% 6.164n ± 0% -35.85% (p=0.000 n=20) +NonZeroShifts/3/shlVU 11.610n ± 0% 6.984n ± 0% -39.84% (p=0.000 n=20) +NonZeroShifts/4/shlVU 12.210n ± 0% 8.869n ± 0% -27.36% (p=0.000 n=20) +NonZeroShifts/5/shlVU 14.11n ± 0% 10.41n ± 
0% -26.22% (p=0.000 n=20) +NonZeroShifts/10/shlVU 22.02n ± 0% 14.77n ± 0% -32.92% (p=0.000 n=20) +NonZeroShifts/100/shlVU 161.30n ± 0% 91.15n ± 0% -43.49% (p=0.000 n=20) +NonZeroShifts/1000/shlVU 1514.0n ± 0% 811.7n ± 0% -46.39% (p=0.000 n=20) +NonZeroShifts/10000/shlVU 21.53µ ± 0% 10.54µ ± 0% -51.04% (p=0.000 n=20) +NonZeroShifts/100000/shlVU 208.1µ ± 0% 113.0µ ± 0% -45.69% (p=0.000 n=20) +geomean 142.8n 87.87n -38.46% + +Change-Id: I8e13eb0af27ac3d6846e559cdb61d2b544b05353 +--- + src/math/big/arith_loong64.s | 44 +++++++++++++++++++++++++++++++++++- + 1 file changed, 43 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s +index 02d8262129..1820988d3f 100644 +--- a/src/math/big/arith_loong64.s ++++ b/src/math/big/arith_loong64.s +@@ -119,8 +119,50 @@ done: + MOVV R10, c+56(FP) + RET + ++// func shlVU(z, x []Word, s uint) (c Word) + TEXT ·shlVU(SB),NOSPLIT,$0 +- JMP ·shlVU_g(SB) ++ // input: ++ // R4: z ++ // R5: z_len ++ // R7: x ++ // R10: s ++ MOVV z_len+8(FP), R5 ++ MOVV s+48(FP), R10 ++ MOVV z+0(FP), R4 ++ MOVV x+24(FP), R7 ++ BEQ R5, len0 ++ SLLV $3, R5 ++ BEQ R10, copy ++ MOVV $64, R9 ++ ADDV $-8, R7 // &x[-1] ++ SUB R10, R9 // ŝ = 64 - s ++ MOVV (R5)(R7), R6 ++ SRLV R9, R6, R8 // c = x[len(z)-1] >> ŝ ++loop: ++ ADDV $-8, R5 ++ BEQ R5, done ++ SLLV R10, R6, R12 ++ MOVV (R5)(R7), R6 ++ SRLV R9, R6, R11 ++ OR R11, R12 ++ MOVV R12, (R5)(R4) // z[i] = x[i]<>ŝ ++ JMP loop ++done: ++ SLLV R10, R6 ++ MOVV R8, c+56(FP) ++ MOVV R6, 0(R4) // z[0] = x[0] << s ++ RET ++copy: ++ BEQ R7, R4, len0 ++copyloop: ++ ADDV $-8, R5 ++ BLT R5, R0, len0 ++ MOVV (R5)(R7), R9 ++ MOVV R9, (R5)(R4) ++ JMP copyloop ++len0: ++ MOVV R0, c+56(FP) ++ RET + + TEXT ·shrVU(SB),NOSPLIT,$0 + JMP ·shrVU_g(SB) +-- +2.38.1 + diff --git a/0017-math-big-optimize-shrVU-function-for-loong64.patch b/0017-math-big-optimize-shrVU-function-for-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..504501900a094f24b89d68fa1e33f51664a5575e --- /dev/null +++ b/0017-math-big-optimize-shrVU-function-for-loong64.patch @@ -0,0 +1,92 @@ +From 14d44d92f1d59c42e85bd89797a3730f48699dc6 Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Tue, 18 Jun 2024 02:00:38 +0000 +Subject: [PATCH 17/44] math/big: optimize shrVU function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3A5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: math/big +cpu: Loongson-3A5000-HV @ 2500.00MHz + │ test/old_3a5000_shrvu.log │ test/new_3a5000_shrvu.log │ + │ sec/op │ sec/op vs base │ +NonZeroShifts/1/shrVU 7.968n ± 0% 5.210n ± 0% -34.62% (p=0.000 n=20) +NonZeroShifts/2/shrVU 9.608n ± 0% 6.178n ± 0% -35.70% (p=0.000 n=20) +NonZeroShifts/3/shrVU 11.400n ± 0% 7.419n ± 0% -34.92% (p=0.000 n=20) +NonZeroShifts/4/shrVU 13.350n ± 0% 9.159n ± 0% -31.39% (p=0.000 n=20) +NonZeroShifts/5/shrVU 15.93n ± 0% 10.58n ± 0% -33.58% (p=0.000 n=20) +NonZeroShifts/10/shrVU 24.42n ± 0% 15.70n ± 0% -35.71% (p=0.000 n=20) +NonZeroShifts/100/shrVU 190.60n ± 0% 90.87n ± 0% -52.32% (p=0.000 n=20) +NonZeroShifts/1000/shrVU 1782.0n ± 0% 811.5n ± 0% -54.46% (p=0.000 n=20) +NonZeroShifts/10000/shrVU 21.54µ ± 0% 12.55µ ± 0% -41.76% (p=0.000 n=20) +NonZeroShifts/100000/shrVU 224.1µ ± 0% 126.2µ ± 0% -43.71% (p=0.000 n=20) +geomean 153.9n 91.78n -40.35% + +Change-Id: I86f1f3ac44d60ad8dc2e77bdb9b541f55eb18e74 +--- + src/math/big/arith_loong64.s | 45 +++++++++++++++++++++++++++++++++++- + 1 file changed, 44 
insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s +index 1820988d3f..bdaaf14821 100644 +--- a/src/math/big/arith_loong64.s ++++ b/src/math/big/arith_loong64.s +@@ -165,7 +165,50 @@ len0: + RET + + TEXT ·shrVU(SB),NOSPLIT,$0 +- JMP ·shrVU_g(SB) ++ // input: ++ // R4: z ++ // R5: z_len ++ // R7: x ++ // R10: s ++ MOVV z_len+8(FP), R5 ++ MOVV s+48(FP), R10 ++ MOVV z+0(FP), R4 ++ MOVV x+24(FP), R7 ++ BEQ R5, len0 ++ SLLV $3, R5 ++ BEQ R10, copy ++ MOVV 0(R7), R6 ++ MOVV $64, R9 ++ MOVV $8, R8 ++ SUB R10, R9 // ŝ = 64 - s ++ ADDV $-8, R4 // &z[-1] ++ SLLV R9, R6, R13 // c = x[0] << ŝ ++loop: ++ BEQ R5, R8, done ++ SRLV R10, R6, R12 ++ MOVV (R8)(R7), R6 ++ SLLV R9, R6, R11 ++ OR R11, R12 ++ MOVV R12, (R8)(R4) // z[i-1] = x[i-1]>>s | x[i]<<ŝ ++ ADDV $8, R8 ++ JMP loop ++done: ++ SRLV R10, R6 ++ MOVV R13, c+56(FP) ++ MOVV R6, (R8)(R4) // z[len(z)-1] = x[len(z)-1] >> s ++ RET ++copy: ++ MOVV $0, R8 ++ BEQ R7, R4, len0 ++copyloop: ++ BEQ R5, R8, len0 ++ MOVV (R8)(R7), R9 ++ MOVV R9, (R8)(R4) ++ ADDV $8, R8 ++ JMP copyloop ++len0: ++ MOVV R0, c+56(FP) ++ RET + + TEXT ·mulAddVWW(SB),NOSPLIT,$0 + JMP ·mulAddVWW_g(SB) +-- +2.38.1 + diff --git a/0018-math-big-optimize-mulAddVWW-function-for-loong64.patch b/0018-math-big-optimize-mulAddVWW-function-for-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..0ad375fa32d09ecea23d0d692da2b3adc95add16 --- /dev/null +++ b/0018-math-big-optimize-mulAddVWW-function-for-loong64.patch @@ -0,0 +1,77 @@ +From b956f69c885cd7fdf5305fd4047fd939000c9745 Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Wed, 19 Jun 2024 06:31:00 +0000 +Subject: [PATCH 18/44] math/big: optimize mulAddVWW function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3A5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: math/big +cpu: Loongson-3A5000-HV @ 2500.00MHz + │ test/old_3a5000_muladdvww.log │ test/new_3a5000_muladdvww.log │ + │ sec/op │ sec/op vs base │ +MulAddVWW/1 7.606n ± 0% 6.987n ± 0% -8.14% (p=0.000 n=20) +MulAddVWW/2 9.207n ± 0% 8.567n ± 0% -6.95% (p=0.000 n=20) +MulAddVWW/3 10.810n ± 0% 9.223n ± 0% -14.68% (p=0.000 n=20) +MulAddVWW/4 13.01n ± 0% 12.41n ± 0% -4.61% (p=0.000 n=20) +MulAddVWW/5 15.79n ± 0% 12.99n ± 0% -17.73% (p=0.000 n=20) +MulAddVWW/10 25.62n ± 0% 20.02n ± 0% -21.86% (p=0.000 n=20) +MulAddVWW/100 217.0n ± 0% 170.9n ± 0% -21.24% (p=0.000 n=20) +MulAddVWW/1000 2.064µ ± 0% 1.612µ ± 0% -21.90% (p=0.000 n=20) +MulAddVWW/10000 24.50µ ± 0% 16.74µ ± 0% -31.66% (p=0.000 n=20) +MulAddVWW/100000 239.1µ ± 0% 171.1µ ± 0% -28.45% (p=0.000 n=20) +geomean 159.2n 130.3n -18.18% + +Change-Id: I063434bc382f4f1234f879172ab671a3d6f2eb80 +--- + src/math/big/arith_loong64.s | 29 ++++++++++++++++++++++++++++- + 1 file changed, 28 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s +index bdaaf14821..fe7c971120 100644 +--- a/src/math/big/arith_loong64.s ++++ b/src/math/big/arith_loong64.s +@@ -210,8 +210,35 @@ len0: + MOVV R0, c+56(FP) + RET + ++// func mulAddVWW(z, x []Word, y, r Word) (c Word) + TEXT ·mulAddVWW(SB),NOSPLIT,$0 +- JMP ·mulAddVWW_g(SB) ++ // input: ++ // R4: z ++ // R5: z_len ++ // R7: x ++ // R10: y ++ // R11: r ++ MOVV z+0(FP), R4 ++ MOVV z_len+8(FP), R5 ++ MOVV x+24(FP), R7 ++ MOVV y+48(FP), R10 ++ MOVV r+56(FP), R11 ++ SLLV $3, R5 ++ MOVV $0, R6 ++loop: ++ BEQ R5, R6, done ++ MOVV (R6)(R7), R8 ++ MULV R8, R10, R9 ++ 
MULHVU R8, R10, R12 ++ ADDV R9, R11, R8 ++ SGTU R9, R8, R11 // if (c' = lo + c) < lo then overflow ++ MOVV R8, (R6)(R4) ++ ADDV R12, R11 ++ ADDV $8, R6 ++ JMP loop ++done: ++ MOVV R11, c+64(FP) ++ RET + + TEXT ·addMulVVW(SB),NOSPLIT,$0 + JMP ·addMulVVW_g(SB) +-- +2.38.1 + diff --git a/0019-math-big-optimize-addMulVVW-function-for-loong64.patch b/0019-math-big-optimize-addMulVVW-function-for-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..965a89c81ebc046ba70e60c87a94fc970c1005c4 --- /dev/null +++ b/0019-math-big-optimize-addMulVVW-function-for-loong64.patch @@ -0,0 +1,77 @@ +From e7a6135d5c0fc4685ad18a82e770acf9f226b08e Mon Sep 17 00:00:00 2001 +From: Huang Qiqi +Date: Wed, 19 Jun 2024 08:05:24 +0000 +Subject: [PATCH 19/44] math/big: optimize addMulVVW function for loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark results on Loongson 3A5000 (which is an LA464 implementation): + +goos: linux +goarch: loong64 +pkg: math/big +cpu: Loongson-3A5000-HV @ 2500.00MHz + │ test/old_3a5000_addmulvvw.log │ test/new_3a5000_addmulvvw.log │ + │ sec/op │ sec/op vs base │ +AddMulVVW/1 9.208n ± 0% 5.777n ± 0% -37.26% (p=0.000 n=20) +AddMulVVW/2 11.950n ± 0% 7.763n ± 0% -35.04% (p=0.000 n=20) +AddMulVVW/3 14.01n ± 0% 10.41n ± 0% -25.70% (p=0.000 n=20) +AddMulVVW/4 16.01n ± 0% 13.21n ± 0% -17.49% (p=0.000 n=20) +AddMulVVW/5 18.01n ± 0% 14.12n ± 0% -21.57% (p=0.000 n=20) +AddMulVVW/10 29.60n ± 0% 23.35n ± 0% -21.11% (p=0.000 n=20) +AddMulVVW/100 273.4n ± 0% 173.8n ± 0% -36.43% (p=0.000 n=20) +AddMulVVW/1000 2.516µ ± 0% 1.615µ ± 0% -35.81% (p=0.000 n=20) +AddMulVVW/10000 30.31µ ± 0% 21.54µ ± 0% -28.93% (p=0.000 n=20) +AddMulVVW/100000 322.5µ ± 0% 234.1µ ± 0% -27.41% (p=0.000 n=20) +geomean 197.1n 139.9n -29.00% + +Change-Id: Ib7e95b50f7af893abee72ec26948a65115455692 +--- + src/math/big/arith_loong64.s | 32 +++++++++++++++++++++++++++++++- + 1 file changed, 31 insertions(+), 1 deletion(-) + +diff --git a/src/math/big/arith_loong64.s b/src/math/big/arith_loong64.s +index fe7c971120..012af94f5c 100644 +--- a/src/math/big/arith_loong64.s ++++ b/src/math/big/arith_loong64.s +@@ -240,5 +240,35 @@ done: + MOVV R11, c+64(FP) + RET + ++// func addMulVVW(z, x []Word, y Word) (c Word) + TEXT ·addMulVVW(SB),NOSPLIT,$0 +- JMP ·addMulVVW_g(SB) ++ // input: ++ // R4: z ++ // R5: z_len ++ // R7: x ++ // R10: y ++ MOVV z_len+8(FP), R5 ++ MOVV x+24(FP), R7 ++ MOVV z+0(FP), R4 ++ MOVV y+48(FP), R10 ++ MOVV $0, R6 ++ SLLV $3, R5 ++ MOVV $0, R11 ++loop: ++ BEQ R5, R6, done ++ MOVV (R6)(R7), R8 ++ MOVV (R6)(R4), R9 ++ MULV R8, R10, R12 ++ MULHVU R8, R10, R13 ++ ADDV R12, R9, R8 ++ SGTU R12, R8, R9 ++ ADDV R13, R9 ++ ADDV R8, R11, R12 ++ SGTU R8, R12, R11 ++ MOVV R12, (R6)(R4) ++ ADDV $8, R6 ++ ADDV R9, R11 ++ JMP loop ++done: ++ MOVV R11, c+56(FP) ++ RET +-- +2.38.1 + diff --git a/0020-cmd-compile-fold-constant-shift-with-extension-on-lo.patch b/0020-cmd-compile-fold-constant-shift-with-extension-on-lo.patch new file mode 100644 index 0000000000000000000000000000000000000000..48553defe786efdc60d7eafbccac34ccc458148f --- /dev/null +++ b/0020-cmd-compile-fold-constant-shift-with-extension-on-lo.patch @@ -0,0 +1,376 @@ +From f10d1a3db9650a738d0254a58aadb62ec89eaca9 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Tue, 24 Sep 2024 16:59:06 +0800 +Subject: [PATCH 20/44] cmd/compile: fold constant shift with extension on + loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +goos: linux 
+goarch: loong64 +pkg: test/bench/go1 +cpu: Loongson-3A6000 @ 2500.00MHz + │ bench.old │ bench.new │ + │ sec/op │ sec/op vs base │ +BinaryTree17 7.775 ± 1% 7.747 ± 1% ~ (p=0.713 n=15) +Fannkuch11 2.645 ± 0% 2.646 ± 0% +0.05% (p=0.002 n=15) +FmtFprintfEmpty 35.87n ± 0% 35.85n ± 0% -0.06% (p=0.000 n=15) +FmtFprintfString 59.50n ± 0% 59.17n ± 0% -0.55% (p=0.000 n=15) +FmtFprintfInt 62.03n ± 0% 62.38n ± 0% +0.56% (p=0.000 n=15) +FmtFprintfIntInt 97.73n ± 0% 96.51n ± 0% -1.25% (p=0.000 n=15) +FmtFprintfPrefixedInt 116.6n ± 0% 118.8n ± 0% +1.89% (p=0.000 n=15) +FmtFprintfFloat 204.1n ± 0% 200.3n ± 0% -1.86% (p=0.000 n=15) +FmtManyArgs 455.1n ± 0% 464.8n ± 0% +2.13% (p=0.000 n=15) +GobDecode 7.127m ± 1% 7.063m ± 1% -0.89% (p=0.033 n=15) +GobEncode 8.061m ± 1% 8.069m ± 5% ~ (p=0.870 n=15) +Gzip 279.8m ± 0% 271.4m ± 0% -3.00% (p=0.000 n=15) +Gunzip 32.63m ± 0% 31.68m ± 0% -2.93% (p=0.000 n=15) +HTTPClientServer 53.39µ ± 0% 53.12µ ± 0% -0.51% (p=0.000 n=15) +JSONEncode 9.323m ± 0% 8.990m ± 1% -3.57% (p=0.000 n=15) +JSONDecode 46.65m ± 1% 46.58m ± 0% ~ (p=0.050 n=15) +Mandelbrot200 4.600m ± 0% 4.603m ± 0% +0.06% (p=0.000 n=15) +GoParse 4.651m ± 0% 4.765m ± 1% +2.45% (p=0.000 n=15) +RegexpMatchEasy0_32 59.64n ± 0% 58.26n ± 0% -2.31% (p=0.000 n=15) +RegexpMatchEasy0_1K 457.3n ± 0% 458.0n ± 0% +0.15% (p=0.002 n=15) +RegexpMatchEasy1_32 59.24n ± 0% 60.12n ± 0% +1.49% (p=0.000 n=15) +RegexpMatchEasy1_1K 556.6n ± 0% 556.9n ± 0% +0.05% (p=0.002 n=15) +RegexpMatchMedium_32 801.5n ± 0% 799.5n ± 0% -0.25% (p=0.000 n=15) +RegexpMatchMedium_1K 27.25µ ± 0% 27.21µ ± 0% -0.15% (p=0.001 n=15) +RegexpMatchHard_32 1.382µ ± 0% 1.412µ ± 0% +2.17% (p=0.000 n=15) +RegexpMatchHard_1K 40.84µ ± 0% 40.91µ ± 0% +0.18% (p=0.000 n=15) +Revcomp 474.5m ± 0% 473.9m ± 0% ~ (p=0.081 n=15) +Template 76.85m ± 1% 74.71m ± 1% -2.79% (p=0.000 n=15) +TimeParse 271.1n ± 0% 269.1n ± 0% -0.74% (p=0.000 n=15) +TimeFormat 289.5n ± 0% 287.5n ± 0% -0.69% (p=0.000 n=15) +geomean 51.59µ 51.40µ -0.38% + +Change-Id: I721e930c30b3d1cb88a79306ec51990505d850f1 +--- + .../internal/ssa/_gen/LOONG64latelower.rules | 19 ++ + src/cmd/compile/internal/ssa/config.go | 2 + + .../internal/ssa/rewriteLOONG64latelower.go | 246 ++++++++++++++++++ + test/codegen/shift.go | 3 + + 4 files changed, 270 insertions(+) + create mode 100644 src/cmd/compile/internal/ssa/_gen/LOONG64latelower.rules + create mode 100644 src/cmd/compile/internal/ssa/rewriteLOONG64latelower.go + +diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64latelower.rules b/src/cmd/compile/internal/ssa/_gen/LOONG64latelower.rules +new file mode 100644 +index 0000000000..1158f84422 +--- /dev/null ++++ b/src/cmd/compile/internal/ssa/_gen/LOONG64latelower.rules +@@ -0,0 +1,19 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++// Fold constant shift with extension. 
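++//
++// The extension can be dropped if the narrow value is first shifted to
++// the top of the 64-bit register. Roughly, for a byte in the low 8
++// bits of x and a constant c < 8 (a sketch of the identity, not rule
++// syntax):
++//
++//	signExt8(x) >> c == (x << 56) >> (56+c) // arithmetic shift right
++//	zeroExt8(x) >> c == (x << 56) >> (56+c) // logical shift right
++//
++// since the left shift by 56 moves the byte's top bit to bit 63, the
++// right shift re-extends it while discarding the low c bits.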
++(SRAVconst (MOVBreg x) [c]) && c < 8 => (SRAVconst (SLLVconst x [56]) [56+c]) ++(SRAVconst (MOVHreg x) [c]) && c < 16 => (SRAVconst (SLLVconst x [48]) [48+c]) ++(SRAVconst (MOVWreg x) [c]) && c < 32 => (SRAVconst (SLLVconst x [32]) [32+c]) ++(SRLVconst (MOVBUreg x) [c]) && c < 8 => (SRLVconst (SLLVconst x [56]) [56+c]) ++(SRLVconst (MOVHUreg x) [c]) && c < 16 => (SRLVconst (SLLVconst x [48]) [48+c]) ++(SRLVconst (MOVWUreg x) [c]) && c < 32 => (SRLVconst (SLLVconst x [32]) [32+c]) ++(SLLVconst (MOVBUreg x) [c]) && c <= 56 => (SRLVconst (SLLVconst x [56]) [56-c]) ++(SLLVconst (MOVHUreg x) [c]) && c <= 48 => (SRLVconst (SLLVconst x [48]) [48-c]) ++(SLLVconst (MOVWUreg x) [c]) && c <= 32 => (SRLVconst (SLLVconst x [32]) [32-c]) ++ ++// Shift by zero. ++(SRAVconst x [0]) => x ++(SRLVconst x [0]) => x ++(SLLVconst x [0]) => x +diff --git a/src/cmd/compile/internal/ssa/config.go b/src/cmd/compile/internal/ssa/config.go +index d674cca009..9c4f60f613 100644 +--- a/src/cmd/compile/internal/ssa/config.go ++++ b/src/cmd/compile/internal/ssa/config.go +@@ -280,6 +280,8 @@ func NewConfig(arch string, types Types, ctxt *obj.Link, optimize, softfloat boo + c.RegSize = 8 + c.lowerBlock = rewriteBlockLOONG64 + c.lowerValue = rewriteValueLOONG64 ++ c.lateLowerBlock = rewriteBlockLOONG64latelower ++ c.lateLowerValue = rewriteValueLOONG64latelower + c.registers = registersLOONG64[:] + c.gpRegMask = gpRegMaskLOONG64 + c.fpRegMask = fpRegMaskLOONG64 +diff --git a/src/cmd/compile/internal/ssa/rewriteLOONG64latelower.go b/src/cmd/compile/internal/ssa/rewriteLOONG64latelower.go +new file mode 100644 +index 0000000000..f092b0a1ef +--- /dev/null ++++ b/src/cmd/compile/internal/ssa/rewriteLOONG64latelower.go +@@ -0,0 +1,246 @@ ++// Code generated from _gen/LOONG64latelower.rules using 'go generate'; DO NOT EDIT. 
++ ++package ssa ++ ++func rewriteValueLOONG64latelower(v *Value) bool { ++ switch v.Op { ++ case OpLOONG64SLLVconst: ++ return rewriteValueLOONG64latelower_OpLOONG64SLLVconst(v) ++ case OpLOONG64SRAVconst: ++ return rewriteValueLOONG64latelower_OpLOONG64SRAVconst(v) ++ case OpLOONG64SRLVconst: ++ return rewriteValueLOONG64latelower_OpLOONG64SRLVconst(v) ++ } ++ return false ++} ++func rewriteValueLOONG64latelower_OpLOONG64SLLVconst(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (SLLVconst (MOVBUreg x) [c]) ++ // cond: c <= 56 ++ // result: (SRLVconst (SLLVconst x [56]) [56-c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVBUreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c <= 56) { ++ break ++ } ++ v.reset(OpLOONG64SRLVconst) ++ v.AuxInt = int64ToAuxInt(56 - c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.UInt64) ++ v0.AuxInt = int64ToAuxInt(56) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SLLVconst (MOVHUreg x) [c]) ++ // cond: c <= 48 ++ // result: (SRLVconst (SLLVconst x [48]) [48-c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVHUreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c <= 48) { ++ break ++ } ++ v.reset(OpLOONG64SRLVconst) ++ v.AuxInt = int64ToAuxInt(48 - c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.UInt64) ++ v0.AuxInt = int64ToAuxInt(48) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SLLVconst (MOVWUreg x) [c]) ++ // cond: c <= 32 ++ // result: (SRLVconst (SLLVconst x [32]) [32-c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVWUreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c <= 32) { ++ break ++ } ++ v.reset(OpLOONG64SRLVconst) ++ v.AuxInt = int64ToAuxInt(32 - c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.UInt64) ++ v0.AuxInt = int64ToAuxInt(32) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SLLVconst x [0]) ++ // result: x ++ for { ++ if auxIntToInt64(v.AuxInt) != 0 { ++ break ++ } ++ x := v_0 ++ v.copyOf(x) ++ return true ++ } ++ return false ++} ++func rewriteValueLOONG64latelower_OpLOONG64SRAVconst(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (SRAVconst (MOVBreg x) [c]) ++ // cond: c < 8 ++ // result: (SRAVconst (SLLVconst x [56]) [56+c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVBreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c < 8) { ++ break ++ } ++ v.reset(OpLOONG64SRAVconst) ++ v.AuxInt = int64ToAuxInt(56 + c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.Int64) ++ v0.AuxInt = int64ToAuxInt(56) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRAVconst (MOVHreg x) [c]) ++ // cond: c < 16 ++ // result: (SRAVconst (SLLVconst x [48]) [48+c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVHreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c < 16) { ++ break ++ } ++ v.reset(OpLOONG64SRAVconst) ++ v.AuxInt = int64ToAuxInt(48 + c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.Int64) ++ v0.AuxInt = int64ToAuxInt(48) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRAVconst (MOVWreg x) [c]) ++ // cond: c < 32 ++ // result: (SRAVconst (SLLVconst x [32]) [32+c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVWreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c < 32) { ++ break ++ } ++ v.reset(OpLOONG64SRAVconst) ++ v.AuxInt = int64ToAuxInt(32 + c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.Int64) ++ 
v0.AuxInt = int64ToAuxInt(32) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRAVconst x [0]) ++ // result: x ++ for { ++ if auxIntToInt64(v.AuxInt) != 0 { ++ break ++ } ++ x := v_0 ++ v.copyOf(x) ++ return true ++ } ++ return false ++} ++func rewriteValueLOONG64latelower_OpLOONG64SRLVconst(v *Value) bool { ++ v_0 := v.Args[0] ++ b := v.Block ++ typ := &b.Func.Config.Types ++ // match: (SRLVconst (MOVBUreg x) [c]) ++ // cond: c < 8 ++ // result: (SRLVconst (SLLVconst x [56]) [56+c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVBUreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c < 8) { ++ break ++ } ++ v.reset(OpLOONG64SRLVconst) ++ v.AuxInt = int64ToAuxInt(56 + c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.UInt64) ++ v0.AuxInt = int64ToAuxInt(56) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRLVconst (MOVHUreg x) [c]) ++ // cond: c < 16 ++ // result: (SRLVconst (SLLVconst x [48]) [48+c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVHUreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c < 16) { ++ break ++ } ++ v.reset(OpLOONG64SRLVconst) ++ v.AuxInt = int64ToAuxInt(48 + c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.UInt64) ++ v0.AuxInt = int64ToAuxInt(48) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRLVconst (MOVWUreg x) [c]) ++ // cond: c < 32 ++ // result: (SRLVconst (SLLVconst x [32]) [32+c]) ++ for { ++ c := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVWUreg { ++ break ++ } ++ x := v_0.Args[0] ++ if !(c < 32) { ++ break ++ } ++ v.reset(OpLOONG64SRLVconst) ++ v.AuxInt = int64ToAuxInt(32 + c) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, typ.UInt64) ++ v0.AuxInt = int64ToAuxInt(32) ++ v0.AddArg(x) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRLVconst x [0]) ++ // result: x ++ for { ++ if auxIntToInt64(v.AuxInt) != 0 { ++ break ++ } ++ x := v_0 ++ v.copyOf(x) ++ return true ++ } ++ return false ++} ++func rewriteBlockLOONG64latelower(b *Block) bool { ++ return false ++} +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index 2d8cf86857..ad69d69aa5 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -61,18 +61,21 @@ func rshConst64x64Overflow8(v int8) int64 { + func lshConst32x64(v int32) int32 { + // ppc64x:"SLW" + // riscv64:"SLLI",-"AND",-"SLTIU", -"MOVW" ++ // loong64:"SLLV" + return v << uint64(29) + } + + func rshConst32Ux64(v uint32) uint32 { + // ppc64x:"SRW" + // riscv64:"SRLIW",-"AND",-"SLTIU", -"MOVW" ++ // loong64:"SLLV","SRLV",-"MOVWU" + return v >> uint64(29) + } + + func rshConst32x64(v int32) int32 { + // ppc64x:"SRAW" + // riscv64:"SRAIW",-"OR",-"SLTIU", -"MOVW" ++ // loong64:"SLLV","SRAV",-"MOVW" + return v >> uint64(29) + } + +-- +2.38.1 + diff --git a/0021-test-codegen-fix-the-matching-instructions-inside-pl.patch b/0021-test-codegen-fix-the-matching-instructions-inside-pl.patch new file mode 100644 index 0000000000000000000000000000000000000000..f9e0b7be338df01040c2c024600d5efd4d90c443 --- /dev/null +++ b/0021-test-codegen-fix-the-matching-instructions-inside-pl.patch @@ -0,0 +1,31 @@ +From 53fc992fd2ba2f64eb436c5cf210e31e70282fc0 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Tue, 8 Oct 2024 16:23:56 +0800 +Subject: [PATCH 21/44] test/codegen: fix the matching instructions inside + plain comments for func rshConst32Ux64 on loong64 + +after add rules for (x << lc) >> rc in commit "cmd/compile: add patterns +for bitfield opcodes on loong64", the generated assembly from func +rshConst32Ux64 matches 
BSTRPICKV, not SLLV and SRLV. + +Change-Id: I4348716156abc3410134495edb977a88727139f8 +--- + test/codegen/shift.go | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index ad69d69aa5..6112a989b9 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -68,7 +68,7 @@ func lshConst32x64(v int32) int32 { + func rshConst32Ux64(v uint32) uint32 { + // ppc64x:"SRW" + // riscv64:"SRLIW",-"AND",-"SLTIU", -"MOVW" +- // loong64:"SLLV","SRLV",-"MOVWU" ++ // loong64:"BSTRPICKV",-"SLLV",-"SRLV",-"MOVWU" + return v >> uint64(29) + } + +-- +2.38.1 + diff --git a/0022-cmd-compile-optimize-shifts-of-int32-and-uint32-on-l.patch b/0022-cmd-compile-optimize-shifts-of-int32-and-uint32-on-l.patch new file mode 100644 index 0000000000000000000000000000000000000000..c119f2e94194934fc073159898e1795f4b99b70a --- /dev/null +++ b/0022-cmd-compile-optimize-shifts-of-int32-and-uint32-on-l.patch @@ -0,0 +1,1064 @@ +From 2ab1123adf4a080d91ef549b76572bf4b22f907f Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Thu, 24 Oct 2024 17:41:01 +0800 +Subject: [PATCH 22/44] cmd/compile: optimize shifts of int32 and uint32 on + loong64 + +Change-Id: I6b8d110cfed8d55e2b753259a45f55e09b8f759d +--- + src/cmd/compile/internal/loong64/ssa.go | 6 + + .../compile/internal/ssa/_gen/LOONG64.rules | 39 +- + .../compile/internal/ssa/_gen/LOONG64Ops.go | 6 + + src/cmd/compile/internal/ssa/opGen.go | 90 ++++ + .../compile/internal/ssa/rewriteLOONG64.go | 431 +++++++++++++----- + test/codegen/shift.go | 20 +- + 6 files changed, 462 insertions(+), 130 deletions(-) + +diff --git a/src/cmd/compile/internal/loong64/ssa.go b/src/cmd/compile/internal/loong64/ssa.go +index 0ba9efa1d3..bd761c407e 100644 +--- a/src/cmd/compile/internal/loong64/ssa.go ++++ b/src/cmd/compile/internal/loong64/ssa.go +@@ -165,8 +165,11 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpLOONG64OR, + ssa.OpLOONG64XOR, + ssa.OpLOONG64NOR, ++ ssa.OpLOONG64SLL, + ssa.OpLOONG64SLLV, ++ ssa.OpLOONG64SRL, + ssa.OpLOONG64SRLV, ++ ssa.OpLOONG64SRA, + ssa.OpLOONG64SRAV, + ssa.OpLOONG64ROTR, + ssa.OpLOONG64ROTRV, +@@ -274,8 +277,11 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { + ssa.OpLOONG64ORconst, + ssa.OpLOONG64XORconst, + ssa.OpLOONG64NORconst, ++ ssa.OpLOONG64SLLconst, + ssa.OpLOONG64SLLVconst, ++ ssa.OpLOONG64SRLconst, + ssa.OpLOONG64SRLVconst, ++ ssa.OpLOONG64SRAconst, + ssa.OpLOONG64SRAVconst, + ssa.OpLOONG64ROTRconst, + ssa.OpLOONG64ROTRVconst, +diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules +index 00a0a84f33..014cd6fb05 100644 +--- a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules +@@ -62,10 +62,10 @@ + (Lsh64x16 x y) => (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) + (Lsh64x8 x y) => (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) + +-(Lsh32x64 x y) => (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) +-(Lsh32x32 x y) => (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +-(Lsh32x16 x y) => (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) +-(Lsh32x8 x y) => (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++(Lsh32x64 x y) => (MASKEQZ (SLL x y) (SGTU (MOVVconst [32]) y)) ++(Lsh32x32 x y) => (MASKEQZ (SLL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) ++(Lsh32x16 x y) => (MASKEQZ (SLL x (ZeroExt16to64 
y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) ++(Lsh32x8 x y) => (MASKEQZ (SLL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) + + (Lsh16x64 x y) => (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) + (Lsh16x32 x y) => (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +@@ -82,10 +82,10 @@ + (Rsh64Ux16 x y) => (MASKEQZ (SRLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) + (Rsh64Ux8 x y) => (MASKEQZ (SRLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) + +-(Rsh32Ux64 x y) => (MASKEQZ (SRLV (ZeroExt32to64 x) y) (SGTU (MOVVconst [64]) y)) +-(Rsh32Ux32 x y) => (MASKEQZ (SRLV (ZeroExt32to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +-(Rsh32Ux16 x y) => (MASKEQZ (SRLV (ZeroExt32to64 x) (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) +-(Rsh32Ux8 x y) => (MASKEQZ (SRLV (ZeroExt32to64 x) (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++(Rsh32Ux64 x y) => (MASKEQZ (SRL x y) (SGTU (MOVVconst [32]) y)) ++(Rsh32Ux32 x y) => (MASKEQZ (SRL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) ++(Rsh32Ux16 x y) => (MASKEQZ (SRL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) ++(Rsh32Ux8 x y) => (MASKEQZ (SRL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) + + (Rsh16Ux64 x y) => (MASKEQZ (SRLV (ZeroExt16to64 x) y) (SGTU (MOVVconst [64]) y)) + (Rsh16Ux32 x y) => (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +@@ -102,10 +102,10 @@ + (Rsh64x16 x y) => (SRAV x (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) + (Rsh64x8 x y) => (SRAV x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) + +-(Rsh32x64 x y) => (SRAV (SignExt32to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y)) +-(Rsh32x32 x y) => (SRAV (SignExt32to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) +-(Rsh32x16 x y) => (SRAV (SignExt32to64 x) (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) +-(Rsh32x8 x y) => (SRAV (SignExt32to64 x) (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) ++(Rsh32x64 x y) => (SRA x (OR (NEGV (SGTU y (MOVVconst [31]))) y)) ++(Rsh32x32 x y) => (SRA x (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [31]))) (ZeroExt32to64 y))) ++(Rsh32x16 x y) => (SRA x (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [31]))) (ZeroExt16to64 y))) ++(Rsh32x8 x y) => (SRA x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [31]))) (ZeroExt8to64 y))) + + (Rsh16x64 x y) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y)) + (Rsh16x32 x y) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) +@@ -683,15 +683,30 @@ + (XOR x (MOVVconst [c])) && is32Bit(c) => (XORconst [c] x) + (NOR x (MOVVconst [c])) && is32Bit(c) => (NORconst [c] x) + ++(SLL _ (MOVVconst [c])) && uint64(c)>=32 => (MOVVconst [0]) + (SLLV _ (MOVVconst [c])) && uint64(c)>=64 => (MOVVconst [0]) ++(SRL _ (MOVVconst [c])) && uint64(c)>=32 => (MOVVconst [0]) + (SRLV _ (MOVVconst [c])) && uint64(c)>=64 => (MOVVconst [0]) ++(SRA x (MOVVconst [c])) && uint64(c)>=32 => (SRAconst x [31]) + (SRAV x (MOVVconst [c])) && uint64(c)>=64 => (SRAVconst x [63]) ++(SLL x (MOVVconst [c])) && uint64(c) >=0 && uint64(c) <= 31 => (SLLconst x [c]) + (SLLV x (MOVVconst [c])) => (SLLVconst x [c]) ++(SRL x (MOVVconst [c])) && uint64(c) >=0 && uint64(c) <= 31 => (SRLconst x [c]) + (SRLV x (MOVVconst [c])) => (SRLVconst x [c]) ++(SRA x (MOVVconst 
[c])) && uint64(c) >=0 && uint64(c) <= 31 => (SRAconst x [c]) + (SRAV x (MOVVconst [c])) => (SRAVconst x [c]) + (ROTR x (MOVVconst [c])) => (ROTRconst x [c&31]) + (ROTRV x (MOVVconst [c])) => (ROTRVconst x [c&63]) + ++// Avoid unnecessary zero and sign extension when right shifting. ++(SRLVconst [rc] (MOVWUreg y)) && rc >= 0 && rc <= 31 => (SRLconst [int64(rc)] y) ++(SRAVconst [rc] (MOVWreg y)) && rc >= 0 && rc <= 31 => (SRAconst [int64(rc)] y) ++ ++// Replace right shifts that exceed size of signed type. ++(SRAVconst [rc] (MOVBreg y)) && rc >= 8 => (SRAVconst [63] (SLLVconst [56] y)) ++(SRAVconst [rc] (MOVHreg y)) && rc >= 16 => (SRAVconst [63] (SLLVconst [48] y)) ++(SRAVconst [rc] (MOVWreg y)) && rc >= 32 => (SRAconst [31] y) ++ + // If the shift amount is larger than the datasize(32, 16, 8), we can optimize to constant 0. + (MOVWUreg (SLLVconst [lc] x)) && lc >= 32 => (MOVVconst [0]) + (MOVHUreg (SLLVconst [lc] x)) && lc >= 16 => (MOVVconst [0]) +diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go +index 8f17158b64..4b3f1fd689 100644 +--- a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go ++++ b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go +@@ -240,11 +240,17 @@ func init() { + {name: "FCOPYSGD", argLength: 2, reg: fp21, asm: "FCOPYSGD"}, // float64 + + // shifts ++ {name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << arg1, shift amount is mod 32 + {name: "SLLV", argLength: 2, reg: gp21, asm: "SLLV"}, // arg0 << arg1, shift amount is mod 64 ++ {name: "SLLconst", argLength: 1, reg: gp11, asm: "SLL", aux: "Int64"}, // arg0 << auxInt + {name: "SLLVconst", argLength: 1, reg: gp11, asm: "SLLV", aux: "Int64"}, // arg0 << auxInt ++ {name: "SRL", argLength: 2, reg: gp21, asm: "SRL"}, // arg0 >> arg1, unsigned, shift amount is mod 32 + {name: "SRLV", argLength: 2, reg: gp21, asm: "SRLV"}, // arg0 >> arg1, unsigned, shift amount is mod 64 ++ {name: "SRLconst", argLength: 1, reg: gp11, asm: "SRL", aux: "Int64"}, // arg0 >> auxInt, unsigned + {name: "SRLVconst", argLength: 1, reg: gp11, asm: "SRLV", aux: "Int64"}, // arg0 >> auxInt, unsigned ++ {name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> arg1, signed, shift amount is mod 32 + {name: "SRAV", argLength: 2, reg: gp21, asm: "SRAV"}, // arg0 >> arg1, signed, shift amount is mod 64 ++ {name: "SRAconst", argLength: 1, reg: gp11, asm: "SRA", aux: "Int64"}, // arg0 >> auxInt, signed + {name: "SRAVconst", argLength: 1, reg: gp11, asm: "SRAV", aux: "Int64"}, // arg0 >> auxInt, signed + {name: "ROTR", argLength: 2, reg: gp21, asm: "ROTR"}, // arg0 right rotate by (arg1 mod 32) bits + {name: "ROTRV", argLength: 2, reg: gp21, asm: "ROTRV"}, // arg0 right rotate by (arg1 mod 64) bits +diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go +index df1ddfa69e..643d012ca1 100644 +--- a/src/cmd/compile/internal/ssa/opGen.go ++++ b/src/cmd/compile/internal/ssa/opGen.go +@@ -1824,11 +1824,17 @@ const ( + OpLOONG64MASKEQZ + OpLOONG64MASKNEZ + OpLOONG64FCOPYSGD ++ OpLOONG64SLL + OpLOONG64SLLV ++ OpLOONG64SLLconst + OpLOONG64SLLVconst ++ OpLOONG64SRL + OpLOONG64SRLV ++ OpLOONG64SRLconst + OpLOONG64SRLVconst ++ OpLOONG64SRA + OpLOONG64SRAV ++ OpLOONG64SRAconst + OpLOONG64SRAVconst + OpLOONG64ROTR + OpLOONG64ROTRV +@@ -24541,6 +24547,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SLL", ++ argLen: 2, ++ asm: loong64.ASLL, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 
R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ {1, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ outputs: []outputInfo{ ++ {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ }, ++ }, + { + name: "SLLV", + argLen: 2, +@@ -24555,6 +24575,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SLLconst", ++ auxType: auxInt64, ++ argLen: 1, ++ asm: loong64.ASLL, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ outputs: []outputInfo{ ++ {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ }, ++ }, + { + name: "SLLVconst", + auxType: auxInt64, +@@ -24569,6 +24603,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SRL", ++ argLen: 2, ++ asm: loong64.ASRL, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ {1, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ outputs: []outputInfo{ ++ {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ }, ++ }, + { + name: "SRLV", + argLen: 2, +@@ -24583,6 +24631,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SRLconst", ++ auxType: auxInt64, ++ argLen: 1, ++ asm: loong64.ASRL, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ outputs: []outputInfo{ ++ {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ }, ++ }, + { + name: "SRLVconst", + auxType: auxInt64, +@@ -24597,6 +24659,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SRA", ++ argLen: 2, ++ asm: loong64.ASRA, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ {1, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ outputs: []outputInfo{ ++ {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ }, ++ }, + { + name: "SRAV", + argLen: 2, +@@ -24611,6 +24687,20 @@ var opcodeTable = [...]opInfo{ + }, + }, + }, ++ { ++ name: "SRAconst", ++ auxType: auxInt64, ++ argLen: 1, ++ asm: loong64.ASRA, ++ reg: regInfo{ ++ inputs: []inputInfo{ ++ {0, 1073741816}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ outputs: []outputInfo{ ++ {0, 1071644664}, // R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31 ++ }, ++ }, ++ }, + { + name: "SRAVconst", + auxType: auxInt64, +diff --git a/src/cmd/compile/internal/ssa/rewriteLOONG64.go b/src/cmd/compile/internal/ssa/rewriteLOONG64.go +index ab39040de1..93bf95eb51 100644 +--- a/src/cmd/compile/internal/ssa/rewriteLOONG64.go ++++ b/src/cmd/compile/internal/ssa/rewriteLOONG64.go +@@ -440,14 +440,20 @@ func rewriteValueLOONG64(v *Value) bool { + 
return rewriteValueLOONG64_OpLOONG64SGTUconst(v) + case OpLOONG64SGTconst: + return rewriteValueLOONG64_OpLOONG64SGTconst(v) ++ case OpLOONG64SLL: ++ return rewriteValueLOONG64_OpLOONG64SLL(v) + case OpLOONG64SLLV: + return rewriteValueLOONG64_OpLOONG64SLLV(v) + case OpLOONG64SLLVconst: + return rewriteValueLOONG64_OpLOONG64SLLVconst(v) ++ case OpLOONG64SRA: ++ return rewriteValueLOONG64_OpLOONG64SRA(v) + case OpLOONG64SRAV: + return rewriteValueLOONG64_OpLOONG64SRAV(v) + case OpLOONG64SRAVconst: + return rewriteValueLOONG64_OpLOONG64SRAVconst(v) ++ case OpLOONG64SRL: ++ return rewriteValueLOONG64_OpLOONG64SRL(v) + case OpLOONG64SRLV: + return rewriteValueLOONG64_OpLOONG64SRLV(v) + case OpLOONG64SRLVconst: +@@ -5953,6 +5959,43 @@ func rewriteValueLOONG64_OpLOONG64SGTconst(v *Value) bool { + } + return false + } ++func rewriteValueLOONG64_OpLOONG64SLL(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (SLL _ (MOVVconst [c])) ++ // cond: uint64(c)>=32 ++ // result: (MOVVconst [0]) ++ for { ++ if v_1.Op != OpLOONG64MOVVconst { ++ break ++ } ++ c := auxIntToInt64(v_1.AuxInt) ++ if !(uint64(c) >= 32) { ++ break ++ } ++ v.reset(OpLOONG64MOVVconst) ++ v.AuxInt = int64ToAuxInt(0) ++ return true ++ } ++ // match: (SLL x (MOVVconst [c])) ++ // cond: uint64(c) >=0 && uint64(c) <= 31 ++ // result: (SLLconst x [c]) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64MOVVconst { ++ break ++ } ++ c := auxIntToInt64(v_1.AuxInt) ++ if !(uint64(c) >= 0 && uint64(c) <= 31) { ++ break ++ } ++ v.reset(OpLOONG64SLLconst) ++ v.AuxInt = int64ToAuxInt(c) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} + func rewriteValueLOONG64_OpLOONG64SLLV(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +@@ -6002,6 +6045,45 @@ func rewriteValueLOONG64_OpLOONG64SLLVconst(v *Value) bool { + } + return false + } ++func rewriteValueLOONG64_OpLOONG64SRA(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (SRA x (MOVVconst [c])) ++ // cond: uint64(c)>=32 ++ // result: (SRAconst x [31]) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64MOVVconst { ++ break ++ } ++ c := auxIntToInt64(v_1.AuxInt) ++ if !(uint64(c) >= 32) { ++ break ++ } ++ v.reset(OpLOONG64SRAconst) ++ v.AuxInt = int64ToAuxInt(31) ++ v.AddArg(x) ++ return true ++ } ++ // match: (SRA x (MOVVconst [c])) ++ // cond: uint64(c) >=0 && uint64(c) <= 31 ++ // result: (SRAconst x [c]) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64MOVVconst { ++ break ++ } ++ c := auxIntToInt64(v_1.AuxInt) ++ if !(uint64(c) >= 0 && uint64(c) <= 31) { ++ break ++ } ++ v.reset(OpLOONG64SRAconst) ++ v.AuxInt = int64ToAuxInt(c) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} + func rewriteValueLOONG64_OpLOONG64SRAV(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +@@ -6039,6 +6121,85 @@ func rewriteValueLOONG64_OpLOONG64SRAV(v *Value) bool { + } + func rewriteValueLOONG64_OpLOONG64SRAVconst(v *Value) bool { + v_0 := v.Args[0] ++ b := v.Block ++ // match: (SRAVconst [rc] (MOVWreg y)) ++ // cond: rc >= 0 && rc <= 31 ++ // result: (SRAconst [int64(rc)] y) ++ for { ++ t := v.Type ++ rc := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVWreg { ++ break ++ } ++ y := v_0.Args[0] ++ if !(rc >= 0 && rc <= 31) { ++ break ++ } ++ v.reset(OpLOONG64SRAconst) ++ v.Type = t ++ v.AuxInt = int64ToAuxInt(int64(rc)) ++ v.AddArg(y) ++ return true ++ } ++ // match: (SRAVconst [rc] (MOVBreg y)) ++ // cond: rc >= 8 ++ // result: (SRAVconst [63] (SLLVconst [56] y)) ++ for { ++ t := v.Type ++ rc := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVBreg { ++ break ++ 
} ++ y := v_0.Args[0] ++ if !(rc >= 8) { ++ break ++ } ++ v.reset(OpLOONG64SRAVconst) ++ v.AuxInt = int64ToAuxInt(63) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, t) ++ v0.AuxInt = int64ToAuxInt(56) ++ v0.AddArg(y) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRAVconst [rc] (MOVHreg y)) ++ // cond: rc >= 16 ++ // result: (SRAVconst [63] (SLLVconst [48] y)) ++ for { ++ t := v.Type ++ rc := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVHreg { ++ break ++ } ++ y := v_0.Args[0] ++ if !(rc >= 16) { ++ break ++ } ++ v.reset(OpLOONG64SRAVconst) ++ v.AuxInt = int64ToAuxInt(63) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLLVconst, t) ++ v0.AuxInt = int64ToAuxInt(48) ++ v0.AddArg(y) ++ v.AddArg(v0) ++ return true ++ } ++ // match: (SRAVconst [rc] (MOVWreg y)) ++ // cond: rc >= 32 ++ // result: (SRAconst [31] y) ++ for { ++ rc := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVWreg { ++ break ++ } ++ y := v_0.Args[0] ++ if !(rc >= 32) { ++ break ++ } ++ v.reset(OpLOONG64SRAconst) ++ v.AuxInt = int64ToAuxInt(31) ++ v.AddArg(y) ++ return true ++ } + // match: (SRAVconst [c] (MOVVconst [d])) + // result: (MOVVconst [d>>uint64(c)]) + for { +@@ -6053,6 +6214,43 @@ func rewriteValueLOONG64_OpLOONG64SRAVconst(v *Value) bool { + } + return false + } ++func rewriteValueLOONG64_OpLOONG64SRL(v *Value) bool { ++ v_1 := v.Args[1] ++ v_0 := v.Args[0] ++ // match: (SRL _ (MOVVconst [c])) ++ // cond: uint64(c)>=32 ++ // result: (MOVVconst [0]) ++ for { ++ if v_1.Op != OpLOONG64MOVVconst { ++ break ++ } ++ c := auxIntToInt64(v_1.AuxInt) ++ if !(uint64(c) >= 32) { ++ break ++ } ++ v.reset(OpLOONG64MOVVconst) ++ v.AuxInt = int64ToAuxInt(0) ++ return true ++ } ++ // match: (SRL x (MOVVconst [c])) ++ // cond: uint64(c) >=0 && uint64(c) <= 31 ++ // result: (SRLconst x [c]) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64MOVVconst { ++ break ++ } ++ c := auxIntToInt64(v_1.AuxInt) ++ if !(uint64(c) >= 0 && uint64(c) <= 31) { ++ break ++ } ++ v.reset(OpLOONG64SRLconst) ++ v.AuxInt = int64ToAuxInt(c) ++ v.AddArg(x) ++ return true ++ } ++ return false ++} + func rewriteValueLOONG64_OpLOONG64SRLV(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] +@@ -6157,6 +6355,25 @@ func rewriteValueLOONG64_OpLOONG64SRLVconst(v *Value) bool { + v.AddArg(x) + return true + } ++ // match: (SRLVconst [rc] (MOVWUreg y)) ++ // cond: rc >= 0 && rc <= 31 ++ // result: (SRLconst [int64(rc)] y) ++ for { ++ t := v.Type ++ rc := auxIntToInt64(v.AuxInt) ++ if v_0.Op != OpLOONG64MOVWUreg { ++ break ++ } ++ y := v_0.Args[0] ++ if !(rc >= 0 && rc <= 31) { ++ break ++ } ++ v.reset(OpLOONG64SRLconst) ++ v.Type = t ++ v.AuxInt = int64ToAuxInt(int64(rc)) ++ v.AddArg(y) ++ return true ++ } + // match: (SRLVconst [rc] (MOVWUreg x)) + // cond: rc >= 32 + // result: (MOVVconst [0]) +@@ -7262,19 +7479,19 @@ func rewriteValueLOONG64_OpLsh32x16(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Lsh32x16 x y) +- // result: (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) ++ // result: (MASKEQZ (SLL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 + v.reset(OpLOONG64MASKEQZ) +- v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) + v1.AddArg(y) + v0.AddArg2(x, v1) + v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) + v3 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v2.AddArg2(v3, 
v1) + v.AddArg2(v0, v2) + return true +@@ -7286,19 +7503,19 @@ func rewriteValueLOONG64_OpLsh32x32(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Lsh32x32 x y) +- // result: (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) ++ // result: (MASKEQZ (SLL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 + v.reset(OpLOONG64MASKEQZ) +- v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) + v1.AddArg(y) + v0.AddArg2(x, v1) + v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) + v3 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v2.AddArg2(v3, v1) + v.AddArg2(v0, v2) + return true +@@ -7310,17 +7527,17 @@ func rewriteValueLOONG64_OpLsh32x64(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Lsh32x64 x y) +- // result: (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) ++ // result: (MASKEQZ (SLL x y) (SGTU (MOVVconst [32]) y)) + for { + t := v.Type + x := v_0 + y := v_1 + v.reset(OpLOONG64MASKEQZ) +- v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLL, t) + v0.AddArg2(x, y) + v1 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) + v2 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v2.AuxInt = int64ToAuxInt(64) ++ v2.AuxInt = int64ToAuxInt(32) + v1.AddArg2(v2, y) + v.AddArg2(v0, v1) + return true +@@ -7332,19 +7549,19 @@ func rewriteValueLOONG64_OpLsh32x8(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Lsh32x8 x y) +- // result: (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++ // result: (MASKEQZ (SLL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 + v.reset(OpLOONG64MASKEQZ) +- v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SLL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) + v1.AddArg(y) + v0.AddArg2(x, v1) + v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) + v3 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v3.AuxInt = int64ToAuxInt(64) ++ v3.AuxInt = int64ToAuxInt(32) + v2.AddArg2(v3, v1) + v.AddArg2(v0, v2) + return true +@@ -8694,23 +8911,21 @@ func rewriteValueLOONG64_OpRsh32Ux16(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Rsh32Ux16 x y) +- // result: (MASKEQZ (SRLV (ZeroExt32to64 x) (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) ++ // result: (MASKEQZ (SRL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 + v.reset(OpLOONG64MASKEQZ) +- v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v2 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +- v2.AddArg(y) +- v0.AddArg2(v1, v2) +- v3 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) +- v4 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v4.AuxInt = int64ToAuxInt(64) +- v3.AddArg2(v4, v2) +- v.AddArg2(v0, v3) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SRL, t) ++ v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v1.AddArg(y) ++ v0.AddArg2(x, v1) ++ v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) ++ v3 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) ++ v3.AuxInt = int64ToAuxInt(32) ++ v2.AddArg2(v3, v1) ++ v.AddArg2(v0, v2) + return true + } + } +@@ -8720,23 
+8935,21 @@ func rewriteValueLOONG64_OpRsh32Ux32(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Rsh32Ux32 x y) +- // result: (MASKEQZ (SRLV (ZeroExt32to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) ++ // result: (MASKEQZ (SRL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 + v.reset(OpLOONG64MASKEQZ) +- v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SRL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v2 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v2.AddArg(y) +- v0.AddArg2(v1, v2) +- v3 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) +- v4 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v4.AuxInt = int64ToAuxInt(64) +- v3.AddArg2(v4, v2) +- v.AddArg2(v0, v3) ++ v1.AddArg(y) ++ v0.AddArg2(x, v1) ++ v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) ++ v3 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) ++ v3.AuxInt = int64ToAuxInt(32) ++ v2.AddArg2(v3, v1) ++ v.AddArg2(v0, v2) + return true + } + } +@@ -8746,21 +8959,19 @@ func rewriteValueLOONG64_OpRsh32Ux64(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Rsh32Ux64 x y) +- // result: (MASKEQZ (SRLV (ZeroExt32to64 x) y) (SGTU (MOVVconst [64]) y)) ++ // result: (MASKEQZ (SRL x y) (SGTU (MOVVconst [32]) y)) + for { + t := v.Type + x := v_0 + y := v_1 + v.reset(OpLOONG64MASKEQZ) +- v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v0.AddArg2(v1, y) +- v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) +- v3 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v3.AuxInt = int64ToAuxInt(64) +- v2.AddArg2(v3, y) +- v.AddArg2(v0, v2) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SRL, t) ++ v0.AddArg2(x, y) ++ v1 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) ++ v2 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) ++ v2.AuxInt = int64ToAuxInt(32) ++ v1.AddArg2(v2, y) ++ v.AddArg2(v0, v1) + return true + } + } +@@ -8770,23 +8981,21 @@ func rewriteValueLOONG64_OpRsh32Ux8(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Rsh32Ux8 x y) +- // result: (MASKEQZ (SRLV (ZeroExt32to64 x) (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++ // result: (MASKEQZ (SRL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 + v.reset(OpLOONG64MASKEQZ) +- v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) +- v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v1.AddArg(x) +- v2 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +- v2.AddArg(y) +- v0.AddArg2(v1, v2) +- v3 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) +- v4 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v4.AuxInt = int64ToAuxInt(64) +- v3.AddArg2(v4, v2) +- v.AddArg2(v0, v3) ++ v0 := b.NewValue0(v.Pos, OpLOONG64SRL, t) ++ v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v1.AddArg(y) ++ v0.AddArg2(x, v1) ++ v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) ++ v3 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) ++ v3.AuxInt = int64ToAuxInt(32) ++ v2.AddArg2(v3, v1) ++ v.AddArg2(v0, v2) + return true + } + } +@@ -8796,25 +9005,23 @@ func rewriteValueLOONG64_OpRsh32x16(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Rsh32x16 x y) +- // result: (SRAV (SignExt32to64 x) (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) ++ // result: (SRA x (OR (NEGV (SGTU (ZeroExt16to64 
y) (MOVVconst [31]))) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 +- v.reset(OpLOONG64SRAV) +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v1 := b.NewValue0(v.Pos, OpLOONG64OR, t) +- v2 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +- v3 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) +- v4 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +- v4.AddArg(y) +- v5 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v5.AuxInt = int64ToAuxInt(63) +- v3.AddArg2(v4, v5) +- v2.AddArg(v3) +- v1.AddArg2(v2, v4) +- v.AddArg2(v0, v1) ++ v.reset(OpLOONG64SRA) ++ v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) ++ v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) ++ v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) ++ v3 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v3.AddArg(y) ++ v4 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) ++ v4.AuxInt = int64ToAuxInt(31) ++ v2.AddArg2(v3, v4) ++ v1.AddArg(v2) ++ v0.AddArg2(v1, v3) ++ v.AddArg2(x, v0) + return true + } + } +@@ -8824,25 +9031,23 @@ func rewriteValueLOONG64_OpRsh32x32(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Rsh32x32 x y) +- // result: (SRAV (SignExt32to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) ++ // result: (SRA x (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [31]))) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 +- v.reset(OpLOONG64SRAV) +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v1 := b.NewValue0(v.Pos, OpLOONG64OR, t) +- v2 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +- v3 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) +- v4 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +- v4.AddArg(y) +- v5 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v5.AuxInt = int64ToAuxInt(63) +- v3.AddArg2(v4, v5) +- v2.AddArg(v3) +- v1.AddArg2(v2, v4) +- v.AddArg2(v0, v1) ++ v.reset(OpLOONG64SRA) ++ v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) ++ v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) ++ v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) ++ v3 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) ++ v3.AddArg(y) ++ v4 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) ++ v4.AuxInt = int64ToAuxInt(31) ++ v2.AddArg2(v3, v4) ++ v1.AddArg(v2) ++ v0.AddArg2(v1, v3) ++ v.AddArg2(x, v0) + return true + } + } +@@ -8852,23 +9057,21 @@ func rewriteValueLOONG64_OpRsh32x64(v *Value) bool { + b := v.Block + typ := &b.Func.Config.Types + // match: (Rsh32x64 x y) +- // result: (SRAV (SignExt32to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y)) ++ // result: (SRA x (OR (NEGV (SGTU y (MOVVconst [31]))) y)) + for { + t := v.Type + x := v_0 + y := v_1 +- v.reset(OpLOONG64SRAV) +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v1 := b.NewValue0(v.Pos, OpLOONG64OR, t) +- v2 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +- v3 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) +- v4 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v4.AuxInt = int64ToAuxInt(63) +- v3.AddArg2(y, v4) +- v2.AddArg(v3) +- v1.AddArg2(v2, y) +- v.AddArg2(v0, v1) ++ v.reset(OpLOONG64SRA) ++ v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) ++ v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) ++ v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) ++ v3 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) ++ v3.AuxInt = int64ToAuxInt(31) ++ v2.AddArg2(y, v3) ++ v1.AddArg(v2) ++ v0.AddArg2(v1, y) ++ v.AddArg2(x, v0) + return true + } + } +@@ -8878,25 +9081,23 @@ func rewriteValueLOONG64_OpRsh32x8(v *Value) bool { + b := v.Block + typ := 
&b.Func.Config.Types + // match: (Rsh32x8 x y) +- // result: (SRAV (SignExt32to64 x) (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) ++ // result: (SRA x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [31]))) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 +- v.reset(OpLOONG64SRAV) +- v0 := b.NewValue0(v.Pos, OpSignExt32to64, typ.Int64) +- v0.AddArg(x) +- v1 := b.NewValue0(v.Pos, OpLOONG64OR, t) +- v2 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +- v3 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) +- v4 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +- v4.AddArg(y) +- v5 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) +- v5.AuxInt = int64ToAuxInt(63) +- v3.AddArg2(v4, v5) +- v2.AddArg(v3) +- v1.AddArg2(v2, v4) +- v.AddArg2(v0, v1) ++ v.reset(OpLOONG64SRA) ++ v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) ++ v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) ++ v2 := b.NewValue0(v.Pos, OpLOONG64SGTU, typ.Bool) ++ v3 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v3.AddArg(y) ++ v4 := b.NewValue0(v.Pos, OpLOONG64MOVVconst, typ.UInt64) ++ v4.AuxInt = int64ToAuxInt(31) ++ v2.AddArg2(v3, v4) ++ v1.AddArg(v2) ++ v0.AddArg2(v1, v3) ++ v.AddArg2(x, v0) + return true + } + } +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index 6112a989b9..3c669edcb2 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -11,87 +11,99 @@ package codegen + // ------------------ // + + func lshConst64x64(v int64) int64 { ++ // loong64:"SLLV" + // ppc64x:"SLD" + // riscv64:"SLLI",-"AND",-"SLTIU" + return v << uint64(33) + } + + func rshConst64Ux64(v uint64) uint64 { ++ // loong64:"SRLV" + // ppc64x:"SRD" + // riscv64:"SRLI\t",-"AND",-"SLTIU" + return v >> uint64(33) + } + + func rshConst64Ux64Overflow32(v uint32) uint64 { ++ // loong64:"MOVV\t\\$0,",-"SRL\t" + // riscv64:"MOV\t\\$0,",-"SRL" + return uint64(v) >> 32 + } + + func rshConst64Ux64Overflow16(v uint16) uint64 { ++ // loong64:"MOVV\t\\$0,",-"SRLV" + // riscv64:"MOV\t\\$0,",-"SRL" + return uint64(v) >> 16 + } + + func rshConst64Ux64Overflow8(v uint8) uint64 { ++ // loong64:"MOVV\t\\$0,",-"SRLV" + // riscv64:"MOV\t\\$0,",-"SRL" + return uint64(v) >> 8 + } + + func rshConst64x64(v int64) int64 { ++ // loong64:"SRAV" + // ppc64x:"SRAD" + // riscv64:"SRAI\t",-"OR",-"SLTIU" + return v >> uint64(33) + } + + func rshConst64x64Overflow32(v int32) int64 { ++ // loong64:"SRA\t\\$31" + // riscv64:"SRAIW",-"SLLI",-"SRAI\t" + return int64(v) >> 32 + } + + func rshConst64x64Overflow16(v int16) int64 { ++ // loong64:"SLLV\t\\$48","SRAV\t\\$63" + // riscv64:"SLLI","SRAI",-"SRAIW" + return int64(v) >> 16 + } + + func rshConst64x64Overflow8(v int8) int64 { ++ // loong64:"SLLV\t\\$56","SRAV\t\\$63" + // riscv64:"SLLI","SRAI",-"SRAIW" + return int64(v) >> 8 + } + + func lshConst32x64(v int32) int32 { ++ // loong64:"SLL\t" + // ppc64x:"SLW" + // riscv64:"SLLI",-"AND",-"SLTIU", -"MOVW" +- // loong64:"SLLV" + return v << uint64(29) + } + + func rshConst32Ux64(v uint32) uint32 { ++ // loong64:"SRL\t" + // ppc64x:"SRW" + // riscv64:"SRLIW",-"AND",-"SLTIU", -"MOVW" +- // loong64:"BSTRPICKV",-"SLLV",-"SRLV",-"MOVWU" + return v >> uint64(29) + } + + func rshConst32x64(v int32) int32 { ++ // loong64:"SRA\t" + // ppc64x:"SRAW" + // riscv64:"SRAIW",-"OR",-"SLTIU", -"MOVW" +- // loong64:"SLLV","SRAV",-"MOVW" + return v >> uint64(29) + } + + func lshConst64x32(v int64) int64 { ++ // loong64:"SLLV" + // ppc64x:"SLD" + // riscv64:"SLLI",-"AND",-"SLTIU" + return v << uint32(33) + } + + func rshConst64Ux32(v uint64) uint64 { ++ // 
loong64:"SRLV" + // ppc64x:"SRD" + // riscv64:"SRLI\t",-"AND",-"SLTIU" + return v >> uint32(33) + } + + func rshConst64x32(v int64) int64 { ++ // loong64:"SRAV" + // ppc64x:"SRAD" + // riscv64:"SRAI\t",-"OR",-"SLTIU" + return v >> uint32(33) +@@ -253,6 +265,7 @@ func rshGuarded64U(v uint64, s uint) uint64 { + // s390x:-"RISBGZ",-"AND",-"LOCGR" + // wasm:-"Select",-".*LtU" + // arm64:"LSR",-"CSEL" ++ // loong64:"SRLV" + return v >> s + } + panic("shift too large") +@@ -264,6 +277,7 @@ func rshGuarded64(v int64, s uint) int64 { + // s390x:-"RISBGZ",-"AND",-"LOCGR" + // wasm:-"Select",-".*LtU" + // arm64:"ASR",-"CSEL" ++ // loong64:"SRAV" + return v >> s + } + panic("shift too large") +-- +2.38.1 + diff --git a/0023-cmd-compile-simplify-bounded-shift-on-loong64.patch b/0023-cmd-compile-simplify-bounded-shift-on-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..c4ba508b2422f8ec4217d0e36810d86d60d1d763 --- /dev/null +++ b/0023-cmd-compile-simplify-bounded-shift-on-loong64.patch @@ -0,0 +1,2206 @@ +From 03f91ceb084274b0840d7c2cf7a7cb83a7fb2ed0 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Fri, 15 Nov 2024 17:28:07 +0800 +Subject: [PATCH 23/44] cmd/compile: simplify bounded shift on loong64 + +Use the shiftIsBounded function to generate more efficient shift instructions. +Also optimize shift ops when the shift value is v&63 and v&31. + +Change-Id: I12548101a7cea6bca7f5fef2b12c4b8af8a20bb3 +--- + .../compile/internal/ssa/_gen/LOONG64.rules | 146 +-- + .../compile/internal/ssa/rewriteLOONG64.go | 968 ++++++++++++++++++ + test/codegen/shift.go | 16 + + 3 files changed, 1071 insertions(+), 59 deletions(-) + +diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules +index 014cd6fb05..9d0435f434 100644 +--- a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules ++++ b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules +@@ -57,65 +57,84 @@ + // shifts + // hardware instruction uses only the low 6 bits of the shift + // we compare to 64 to ensure Go semantics for large shifts +-(Lsh64x64 x y) => (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) +-(Lsh64x32 x y) => (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +-(Lsh64x16 x y) => (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) +-(Lsh64x8 x y) => (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) +- +-(Lsh32x64 x y) => (MASKEQZ (SLL x y) (SGTU (MOVVconst [32]) y)) +-(Lsh32x32 x y) => (MASKEQZ (SLL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) +-(Lsh32x16 x y) => (MASKEQZ (SLL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) +-(Lsh32x8 x y) => (MASKEQZ (SLL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) +- +-(Lsh16x64 x y) => (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) +-(Lsh16x32 x y) => (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +-(Lsh16x16 x y) => (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) +-(Lsh16x8 x y) => (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) +- +-(Lsh8x64 x y) => (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) +-(Lsh8x32 x y) => (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +-(Lsh8x16 x y) => (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) +-(Lsh8x8 x y) => (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) +- +-(Rsh64Ux64 x y) => 
(MASKEQZ (SRLV x y) (SGTU (MOVVconst [64]) y)) +-(Rsh64Ux32 x y) => (MASKEQZ (SRLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +-(Rsh64Ux16 x y) => (MASKEQZ (SRLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) +-(Rsh64Ux8 x y) => (MASKEQZ (SRLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) +- +-(Rsh32Ux64 x y) => (MASKEQZ (SRL x y) (SGTU (MOVVconst [32]) y)) +-(Rsh32Ux32 x y) => (MASKEQZ (SRL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) +-(Rsh32Ux16 x y) => (MASKEQZ (SRL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) +-(Rsh32Ux8 x y) => (MASKEQZ (SRL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) +- +-(Rsh16Ux64 x y) => (MASKEQZ (SRLV (ZeroExt16to64 x) y) (SGTU (MOVVconst [64]) y)) +-(Rsh16Ux32 x y) => (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +-(Rsh16Ux16 x y) => (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) +-(Rsh16Ux8 x y) => (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) +- +-(Rsh8Ux64 x y) => (MASKEQZ (SRLV (ZeroExt8to64 x) y) (SGTU (MOVVconst [64]) y)) +-(Rsh8Ux32 x y) => (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) +-(Rsh8Ux16 x y) => (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) +-(Rsh8Ux8 x y) => (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) +- +-(Rsh64x64 x y) => (SRAV x (OR (NEGV (SGTU y (MOVVconst [63]))) y)) +-(Rsh64x32 x y) => (SRAV x (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) +-(Rsh64x16 x y) => (SRAV x (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) +-(Rsh64x8 x y) => (SRAV x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) +- +-(Rsh32x64 x y) => (SRA x (OR (NEGV (SGTU y (MOVVconst [31]))) y)) +-(Rsh32x32 x y) => (SRA x (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [31]))) (ZeroExt32to64 y))) +-(Rsh32x16 x y) => (SRA x (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [31]))) (ZeroExt16to64 y))) +-(Rsh32x8 x y) => (SRA x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [31]))) (ZeroExt8to64 y))) +- +-(Rsh16x64 x y) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y)) +-(Rsh16x32 x y) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) +-(Rsh16x16 x y) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) +-(Rsh16x8 x y) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) +- +-(Rsh8x64 x y) => (SRAV (SignExt8to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y)) +-(Rsh8x32 x y) => (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) +-(Rsh8x16 x y) => (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) +-(Rsh8x8 x y) => (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) ++ ++// left shift ++(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SLLV x y) ++(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SLL x y) ++(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SLLV x y) ++(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SLLV x y) ++ ++(Lsh64x64 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) ++(Lsh64x32 x y) && 
!shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) ++(Lsh64x16 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) ++(Lsh64x8 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++ ++(Lsh32x64 x y) && !shiftIsBounded(v) => (MASKEQZ (SLL x y) (SGTU (MOVVconst [32]) y)) ++(Lsh32x32 x y) && !shiftIsBounded(v) => (MASKEQZ (SLL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) ++(Lsh32x16 x y) && !shiftIsBounded(v) => (MASKEQZ (SLL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) ++(Lsh32x8 x y) && !shiftIsBounded(v) => (MASKEQZ (SLL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) ++ ++(Lsh16x64 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) ++(Lsh16x32 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) ++(Lsh16x16 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) ++(Lsh16x8 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++ ++(Lsh8x64 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) ++(Lsh8x32 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) ++(Lsh8x16 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) ++(Lsh8x8 x y) && !shiftIsBounded(v) => (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++ ++// unsigned right shift ++(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRLV x y) ++(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRL x y) ++(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRLV (ZeroExt16to64 x) y) ++(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRLV (ZeroExt8to64 x) y) ++ ++(Rsh64Ux64 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV x y) (SGTU (MOVVconst [64]) y)) ++(Rsh64Ux32 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) ++(Rsh64Ux16 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) ++(Rsh64Ux8 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++ ++(Rsh32Ux64 x y) && !shiftIsBounded(v) => (MASKEQZ (SRL x y) (SGTU (MOVVconst [32]) y)) ++(Rsh32Ux32 x y) && !shiftIsBounded(v) => (MASKEQZ (SRL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) ++(Rsh32Ux16 x y) && !shiftIsBounded(v) => (MASKEQZ (SRL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) ++(Rsh32Ux8 x y) && !shiftIsBounded(v) => (MASKEQZ (SRL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) ++ ++(Rsh16Ux64 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV (ZeroExt16to64 x) y) (SGTU (MOVVconst [64]) y)) ++(Rsh16Ux32 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) ++(Rsh16Ux16 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) ++(Rsh16Ux8 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) ++ ++(Rsh8Ux64 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV (ZeroExt8to64 x) y) (SGTU (MOVVconst [64]) y)) ++(Rsh8Ux32 x y) && 
!shiftIsBounded(v) => (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y)))
++(Rsh8Ux16 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y)))
++(Rsh8Ux8 x y) && !shiftIsBounded(v) => (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y)))
++
++// signed right shift
++(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAV x y)
++(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRA x y)
++(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAV (SignExt16to64 x) y)
++(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAV (SignExt8to64 x) y)
++
++(Rsh64x64 x y) && !shiftIsBounded(v) => (SRAV x (OR (NEGV (SGTU y (MOVVconst [63]))) y))
++(Rsh64x32 x y) && !shiftIsBounded(v) => (SRAV x (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y)))
++(Rsh64x16 x y) && !shiftIsBounded(v) => (SRAV x (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y)))
++(Rsh64x8 x y) && !shiftIsBounded(v) => (SRAV x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y)))
++
++(Rsh32x64 x y) && !shiftIsBounded(v) => (SRA x (OR (NEGV (SGTU y (MOVVconst [31]))) y))
++(Rsh32x32 x y) && !shiftIsBounded(v) => (SRA x (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [31]))) (ZeroExt32to64 y)))
++(Rsh32x16 x y) && !shiftIsBounded(v) => (SRA x (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [31]))) (ZeroExt16to64 y)))
++(Rsh32x8 x y) && !shiftIsBounded(v) => (SRA x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [31]))) (ZeroExt8to64 y)))
++
++(Rsh16x64 x y) && !shiftIsBounded(v) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y))
++(Rsh16x32 x y) && !shiftIsBounded(v) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y)))
++(Rsh16x16 x y) && !shiftIsBounded(v) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y)))
++(Rsh16x8 x y) && !shiftIsBounded(v) => (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y)))
++
++(Rsh8x64 x y) && !shiftIsBounded(v) => (SRAV (SignExt8to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y))
++(Rsh8x32 x y) && !shiftIsBounded(v) => (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y)))
++(Rsh8x16 x y) && !shiftIsBounded(v) => (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y)))
++(Rsh8x8 x y) && !shiftIsBounded(v) => (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y)))
+ 
+ // bitfield ops
+ 
+@@ -698,6 +717,15 @@
+ (ROTR x (MOVVconst [c])) => (ROTRconst x [c&31])
+ (ROTRV x (MOVVconst [c])) => (ROTRVconst x [c&63])
+ 
++// SLLV/SRLV/SRAV only consider the bottom 6 bits of y; similarly, SLL/SRL/SRA only consider the
++// bottom 5 bits of y.
++(SLL x (ANDconst [31] y)) => (SLL x y)
++(SRL x (ANDconst [31] y)) => (SRL x y)
++(SRA x (ANDconst [31] y)) => (SRA x y)
++(SLLV x (ANDconst [63] y)) => (SLLV x y)
++(SRLV x (ANDconst [63] y)) => (SRLV x y)
++(SRAV x (ANDconst [63] y)) => (SRAV x y)
++
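As a concrete illustration of the bounded-shift and ANDconst rules above — a minimal Go sketch, with invented function names, describing the lowering these rules are intended to produce:

	func shrMasked(x uint64, s uint) uint64 {
		// The mask matches (SRLV x (ANDconst [63] y)) => (SRLV x y): the
		// explicit AND is folded away because SRLV itself uses only the
		// low 6 bits of the shift amount.
		return x >> (s & 63)
	}

	func shrGuarded(x uint64, s uint) uint64 {
		if s < 64 {
			// shiftIsBounded(v) holds here, so the bounded rule emits a
			// bare SRLV with no MASKEQZ/SGTU range check.
			return x >> s
		}
		return 0
	}

Without the mask or the guard, x >> s keeps the (MASKEQZ (SRLV x y) (SGTU (MOVVconst [64]) y)) form, so that shifts of 64 or more still yield 0 as Go semantics require.
+ // Avoid unnecessary zero and sign extension when right shifting.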
+ (SRLVconst [rc] (MOVWUreg y)) && rc >= 0 && rc <= 31 => (SRLconst [int64(rc)] y) + (SRAVconst [rc] (MOVWreg y)) && rc >= 0 && rc <= 31 => (SRAconst [int64(rc)] y) +diff --git a/src/cmd/compile/internal/ssa/rewriteLOONG64.go b/src/cmd/compile/internal/ssa/rewriteLOONG64.go +index 93bf95eb51..9efdca9c9c 100644 +--- a/src/cmd/compile/internal/ssa/rewriteLOONG64.go ++++ b/src/cmd/compile/internal/ssa/rewriteLOONG64.go +@@ -5994,6 +5994,18 @@ func rewriteValueLOONG64_OpLOONG64SLL(v *Value) bool { + v.AddArg(x) + return true + } ++ // match: (SLL x (ANDconst [31] y)) ++ // result: (SLL x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64ANDconst || auxIntToInt64(v_1.AuxInt) != 31 { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpLOONG64SLL) ++ v.AddArg2(x, y) ++ return true ++ } + return false + } + func rewriteValueLOONG64_OpLOONG64SLLV(v *Value) bool { +@@ -6027,6 +6039,18 @@ func rewriteValueLOONG64_OpLOONG64SLLV(v *Value) bool { + v.AddArg(x) + return true + } ++ // match: (SLLV x (ANDconst [63] y)) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64ANDconst || auxIntToInt64(v_1.AuxInt) != 63 { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + return false + } + func rewriteValueLOONG64_OpLOONG64SLLVconst(v *Value) bool { +@@ -6082,6 +6106,18 @@ func rewriteValueLOONG64_OpLOONG64SRA(v *Value) bool { + v.AddArg(x) + return true + } ++ // match: (SRA x (ANDconst [31] y)) ++ // result: (SRA x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64ANDconst || auxIntToInt64(v_1.AuxInt) != 31 { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpLOONG64SRA) ++ v.AddArg2(x, y) ++ return true ++ } + return false + } + func rewriteValueLOONG64_OpLOONG64SRAV(v *Value) bool { +@@ -6117,6 +6153,18 @@ func rewriteValueLOONG64_OpLOONG64SRAV(v *Value) bool { + v.AddArg(x) + return true + } ++ // match: (SRAV x (ANDconst [63] y)) ++ // result: (SRAV x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64ANDconst || auxIntToInt64(v_1.AuxInt) != 63 { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpLOONG64SRAV) ++ v.AddArg2(x, y) ++ return true ++ } + return false + } + func rewriteValueLOONG64_OpLOONG64SRAVconst(v *Value) bool { +@@ -6249,6 +6297,18 @@ func rewriteValueLOONG64_OpLOONG64SRL(v *Value) bool { + v.AddArg(x) + return true + } ++ // match: (SRL x (ANDconst [31] y)) ++ // result: (SRL x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64ANDconst || auxIntToInt64(v_1.AuxInt) != 31 { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpLOONG64SRL) ++ v.AddArg2(x, y) ++ return true ++ } + return false + } + func rewriteValueLOONG64_OpLOONG64SRLV(v *Value) bool { +@@ -6282,6 +6342,18 @@ func rewriteValueLOONG64_OpLOONG64SRLV(v *Value) bool { + v.AddArg(x) + return true + } ++ // match: (SRLV x (ANDconst [63] y)) ++ // result: (SRLV x y) ++ for { ++ x := v_0 ++ if v_1.Op != OpLOONG64ANDconst || auxIntToInt64(v_1.AuxInt) != 63 { ++ break ++ } ++ y := v_1.Args[0] ++ v.reset(OpLOONG64SRLV) ++ v.AddArg2(x, y) ++ return true ++ } + return false + } + func rewriteValueLOONG64_OpLOONG64SRLVconst(v *Value) bool { +@@ -7384,12 +7456,29 @@ func rewriteValueLOONG64_OpLsh16x16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh16x16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh16x16 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x 
(ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -7402,18 +7491,36 @@ func rewriteValueLOONG64_OpLsh16x16(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh16x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh16x32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh16x32 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +@@ -7426,18 +7533,36 @@ func rewriteValueLOONG64_OpLsh16x32(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh16x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh16x64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh16x64 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v0.AddArg2(x, y) +@@ -7448,18 +7573,36 @@ func rewriteValueLOONG64_OpLsh16x64(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh16x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh16x8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh16x8 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -7472,18 +7615,36 @@ func rewriteValueLOONG64_OpLsh16x8(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh32x16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLL x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLL) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh32x16 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + 
v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -7496,18 +7657,36 @@ func rewriteValueLOONG64_OpLsh32x16(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh32x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh32x32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLL x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLL) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh32x32 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +@@ -7520,18 +7699,36 @@ func rewriteValueLOONG64_OpLsh32x32(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh32x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh32x64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLL x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLL) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh32x64 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLL x y) (SGTU (MOVVconst [32]) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLL, t) + v0.AddArg2(x, y) +@@ -7542,18 +7739,36 @@ func rewriteValueLOONG64_OpLsh32x64(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh32x8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLL x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLL) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh32x8 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -7566,18 +7781,36 @@ func rewriteValueLOONG64_OpLsh32x8(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh64x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh64x16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh64x16 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -7590,18 +7823,36 @@ func 
rewriteValueLOONG64_OpLsh64x16(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh64x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh64x32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh64x32 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +@@ -7614,18 +7865,36 @@ func rewriteValueLOONG64_OpLsh64x32(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh64x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh64x64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh64x64 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v0.AddArg2(x, y) +@@ -7636,18 +7905,36 @@ func rewriteValueLOONG64_OpLsh64x64(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh64x8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh64x8 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -7660,18 +7947,36 @@ func rewriteValueLOONG64_OpLsh64x8(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh8x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh8x16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh8x16 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -7684,18 +7989,36 @@ func rewriteValueLOONG64_OpLsh8x16(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh8x32(v *Value) bool { + v_1 := 
v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh8x32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh8x32 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +@@ -7708,18 +8031,36 @@ func rewriteValueLOONG64_OpLsh8x32(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh8x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh8x64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh8x64 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x y) (SGTU (MOVVconst [64]) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v0.AddArg2(x, y) +@@ -7730,18 +8071,36 @@ func rewriteValueLOONG64_OpLsh8x64(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpLsh8x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Lsh8x8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SLLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SLLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Lsh8x8 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SLLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SLLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -7754,6 +8113,7 @@ func rewriteValueLOONG64_OpLsh8x8(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpMod16(v *Value) bool { + v_1 := v.Args[1] +@@ -8698,12 +9058,31 @@ func rewriteValueLOONG64_OpRsh16Ux16(v *Value) bool { + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh16Ux16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV (ZeroExt16to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh16Ux16 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -8718,18 +9097,38 @@ func rewriteValueLOONG64_OpRsh16Ux16(v *Value) bool { + v.AddArg2(v0, v3) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh16Ux32(v *Value) bool { + v_1 
:= v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh16Ux32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV (ZeroExt16to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh16Ux32 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -8744,18 +9143,38 @@ func rewriteValueLOONG64_OpRsh16Ux32(v *Value) bool { + v.AddArg2(v0, v3) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh16Ux64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh16Ux64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV (ZeroExt16to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh16Ux64 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV (ZeroExt16to64 x) y) (SGTU (MOVVconst [64]) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -8768,18 +9187,38 @@ func rewriteValueLOONG64_OpRsh16Ux64(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh16Ux8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh16Ux8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV (ZeroExt16to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v0 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh16Ux8 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV (ZeroExt16to64 x) (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -8794,18 +9233,38 @@ func rewriteValueLOONG64_OpRsh16Ux8(v *Value) bool { + v.AddArg2(v0, v3) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh16x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh16x16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV (SignExt16to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh16x16 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + 
v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64) + v0.AddArg(x) +@@ -8822,18 +9281,38 @@ func rewriteValueLOONG64_OpRsh16x16(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh16x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh16x32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV (SignExt16to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh16x32 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64) + v0.AddArg(x) +@@ -8850,18 +9329,38 @@ func rewriteValueLOONG64_OpRsh16x32(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh16x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh16x64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV (SignExt16to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh16x64 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV (SignExt16to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64) + v0.AddArg(x) +@@ -8876,18 +9375,38 @@ func rewriteValueLOONG64_OpRsh16x64(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh16x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh16x8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV (SignExt16to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh16x8 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV (SignExt16to64 x) (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpSignExt16to64, typ.Int64) + v0.AddArg(x) +@@ -8904,18 +9423,36 @@ func rewriteValueLOONG64_OpRsh16x8(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh32Ux16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh32Ux16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRL x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRL) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh32Ux16 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRL x (ZeroExt16to64 y)) (SGTU (MOVVconst [32]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y 
:= v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -8928,18 +9465,36 @@ func rewriteValueLOONG64_OpRsh32Ux16(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh32Ux32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh32Ux32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRL x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRL) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh32Ux32 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRL x (ZeroExt32to64 y)) (SGTU (MOVVconst [32]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +@@ -8952,18 +9507,36 @@ func rewriteValueLOONG64_OpRsh32Ux32(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh32Ux64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh32Ux64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRL x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRL) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh32Ux64 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRL x y) (SGTU (MOVVconst [32]) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRL, t) + v0.AddArg2(x, y) +@@ -8974,18 +9547,36 @@ func rewriteValueLOONG64_OpRsh32Ux64(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh32Ux8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh32Ux8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRL x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRL) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh32Ux8 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRL x (ZeroExt8to64 y)) (SGTU (MOVVconst [32]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRL, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -8998,18 +9589,36 @@ func rewriteValueLOONG64_OpRsh32Ux8(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh32x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh32x16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRA x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRA) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh32x16 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRA x (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [31]))) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRA) + v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) + v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +@@ 
-9024,18 +9633,36 @@ func rewriteValueLOONG64_OpRsh32x16(v *Value) bool { + v.AddArg2(x, v0) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh32x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh32x32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRA x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRA) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh32x32 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRA x (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [31]))) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRA) + v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) + v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +@@ -9050,18 +9677,36 @@ func rewriteValueLOONG64_OpRsh32x32(v *Value) bool { + v.AddArg2(x, v0) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh32x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh32x64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRA x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRA) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh32x64 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRA x (OR (NEGV (SGTU y (MOVVconst [31]))) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRA) + v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) + v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +@@ -9074,18 +9719,36 @@ func rewriteValueLOONG64_OpRsh32x64(v *Value) bool { + v.AddArg2(x, v0) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh32x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh32x8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRA x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRA) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh32x8 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRA x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [31]))) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRA) + v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) + v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +@@ -9100,18 +9763,36 @@ func rewriteValueLOONG64_OpRsh32x8(v *Value) bool { + v.AddArg2(x, v0) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh64Ux16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh64Ux16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh64Ux16 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV x (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt16to64, typ.UInt64) +@@ -9124,18 +9805,36 @@ func rewriteValueLOONG64_OpRsh64Ux16(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh64Ux32(v *Value) bool 
{ + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh64Ux32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh64Ux32 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV x (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt32to64, typ.UInt64) +@@ -9148,18 +9847,36 @@ func rewriteValueLOONG64_OpRsh64Ux32(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh64Ux64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh64Ux64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh64Ux64 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV x y) (SGTU (MOVVconst [64]) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v0.AddArg2(x, y) +@@ -9170,18 +9887,36 @@ func rewriteValueLOONG64_OpRsh64Ux64(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh64Ux8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh64Ux8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh64Ux8 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV x (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -9194,18 +9929,36 @@ func rewriteValueLOONG64_OpRsh64Ux8(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh64x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh64x16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh64x16 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV x (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) + v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +@@ -9220,18 +9973,36 @@ func rewriteValueLOONG64_OpRsh64x16(v *Value) bool { + v.AddArg2(x, v0) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh64x32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh64x32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV x y) ++ for 
{ ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh64x32 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV x (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) + v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +@@ -9246,18 +10017,36 @@ func rewriteValueLOONG64_OpRsh64x32(v *Value) bool { + v.AddArg2(x, v0) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh64x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh64x64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh64x64 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV x (OR (NEGV (SGTU y (MOVVconst [63]))) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) + v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +@@ -9270,18 +10059,36 @@ func rewriteValueLOONG64_OpRsh64x64(v *Value) bool { + v.AddArg2(x, v0) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh64x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh64x8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV x y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v.AddArg2(x, y) ++ return true ++ } + // match: (Rsh64x8 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV x (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpLOONG64OR, t) + v1 := b.NewValue0(v.Pos, OpLOONG64NEGV, t) +@@ -9296,18 +10103,38 @@ func rewriteValueLOONG64_OpRsh64x8(v *Value) bool { + v.AddArg2(x, v0) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh8Ux16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh8Ux16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV (ZeroExt8to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh8Ux16 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt16to64 y)) (SGTU (MOVVconst [64]) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -9322,18 +10149,38 @@ func rewriteValueLOONG64_OpRsh8Ux16(v *Value) bool { + v.AddArg2(v0, v3) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh8Ux32(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh8Ux32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV (ZeroExt8to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { 
++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh8Ux32 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt32to64 y)) (SGTU (MOVVconst [64]) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -9348,18 +10195,38 @@ func rewriteValueLOONG64_OpRsh8Ux32(v *Value) bool { + v.AddArg2(v0, v3) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh8Ux64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh8Ux64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV (ZeroExt8to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh8Ux64 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV (ZeroExt8to64 x) y) (SGTU (MOVVconst [64]) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -9372,18 +10239,38 @@ func rewriteValueLOONG64_OpRsh8Ux64(v *Value) bool { + v.AddArg2(v0, v2) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh8Ux8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh8Ux8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRLV (ZeroExt8to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRLV) ++ v0 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh8Ux8 x y) ++ // cond: !shiftIsBounded(v) + // result: (MASKEQZ (SRLV (ZeroExt8to64 x) (ZeroExt8to64 y)) (SGTU (MOVVconst [64]) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64MASKEQZ) + v0 := b.NewValue0(v.Pos, OpLOONG64SRLV, t) + v1 := b.NewValue0(v.Pos, OpZeroExt8to64, typ.UInt64) +@@ -9398,18 +10285,38 @@ func rewriteValueLOONG64_OpRsh8Ux8(v *Value) bool { + v.AddArg2(v0, v3) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh8x16(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh8x16 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV (SignExt8to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh8x16 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt16to64 y) (MOVVconst [63]))) (ZeroExt16to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64) + v0.AddArg(x) +@@ -9426,18 +10333,38 @@ func rewriteValueLOONG64_OpRsh8x16(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh8x32(v *Value) bool { + 
v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh8x32 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV (SignExt8to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh8x32 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt32to64 y) (MOVVconst [63]))) (ZeroExt32to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64) + v0.AddArg(x) +@@ -9454,18 +10381,38 @@ func rewriteValueLOONG64_OpRsh8x32(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh8x64(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh8x64 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV (SignExt8to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh8x64 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV (SignExt8to64 x) (OR (NEGV (SGTU y (MOVVconst [63]))) y)) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64) + v0.AddArg(x) +@@ -9480,18 +10427,38 @@ func rewriteValueLOONG64_OpRsh8x64(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpRsh8x8(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + b := v.Block + typ := &b.Func.Config.Types ++ // match: (Rsh8x8 x y) ++ // cond: shiftIsBounded(v) ++ // result: (SRAV (SignExt8to64 x) y) ++ for { ++ x := v_0 ++ y := v_1 ++ if !(shiftIsBounded(v)) { ++ break ++ } ++ v.reset(OpLOONG64SRAV) ++ v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64) ++ v0.AddArg(x) ++ v.AddArg2(v0, y) ++ return true ++ } + // match: (Rsh8x8 x y) ++ // cond: !shiftIsBounded(v) + // result: (SRAV (SignExt8to64 x) (OR (NEGV (SGTU (ZeroExt8to64 y) (MOVVconst [63]))) (ZeroExt8to64 y))) + for { + t := v.Type + x := v_0 + y := v_1 ++ if !(!shiftIsBounded(v)) { ++ break ++ } + v.reset(OpLOONG64SRAV) + v0 := b.NewValue0(v.Pos, OpSignExt8to64, typ.Int64) + v0.AddArg(x) +@@ -9508,6 +10475,7 @@ func rewriteValueLOONG64_OpRsh8x8(v *Value) bool { + v.AddArg2(v0, v1) + return true + } ++ return false + } + func rewriteValueLOONG64_OpSelect0(v *Value) bool { + v_0 := v.Args[0] +diff --git a/test/codegen/shift.go b/test/codegen/shift.go +index 3c669edcb2..db4e6409a8 100644 +--- a/test/codegen/shift.go ++++ b/test/codegen/shift.go +@@ -115,6 +115,7 @@ func rshConst64x32(v int64) int64 { + + func lshMask64x64(v int64, s uint64) int64 { + // arm64:"LSL",-"AND" ++ // loong64:"SLLV",-"AND" + // ppc64x:"RLDICL",-"ORN",-"ISEL" + // riscv64:"SLL",-"AND\t",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" +@@ -123,6 +124,7 @@ func lshMask64x64(v int64, s uint64) int64 { + + func rshMask64Ux64(v uint64, s uint64) uint64 { + // arm64:"LSR",-"AND",-"CSEL" ++ // loong64:"SRLV",-"AND" + // ppc64x:"RLDICL",-"ORN",-"ISEL" + // riscv64:"SRL\t",-"AND\t",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" +@@ -131,6 +133,7 @@ func rshMask64Ux64(v uint64, s uint64) uint64 { + 
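For illustration (a minimal sketch, not part of the patch): the loong64 annotations in this test exercise the shiftIsBounded split added by the rewrite rules above. When the shift amount is pre-masked below the type width, the bounded rule fires and a bare shift is emitted; otherwise the clamp sequence survives. Hypothetical function names:

func boundedShift(v int64, s uint64) int64 {
	// s&63 < 64 always holds, so shiftIsBounded proves the shift is in
	// range and loong64 emits a single SRAV, with no AND or clamp.
	return v >> (s & 63)
}

func unboundedShift(v int64, s uint64) int64 {
	// s may be >= 64; Go defines the result as the sign fill (v >> 63),
	// so the OR/NEGV/SGTU clamp from the rewrite rules is kept.
	return v >> s
}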
+ func rshMask64x64(v int64, s uint64) int64 { + // arm64:"ASR",-"AND",-"CSEL" ++ // loong64:"SRAV",-"AND" + // ppc64x:"RLDICL",-"ORN",-"ISEL" + // riscv64:"SRA\t",-"OR",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" +@@ -139,14 +142,21 @@ func rshMask64x64(v int64, s uint64) int64 { + + func lshMask32x64(v int32, s uint64) int32 { + // arm64:"LSL",-"AND" ++ // loong64:"SLL\t","AND","SGTU","MASKEQZ" + // ppc64x:"ISEL",-"ORN" + // riscv64:"SLL",-"AND\t",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" + return v << (s & 63) + } + ++func lsh5Mask32x64(v int32, s uint64) int32 { ++ // loong64:"SLL\t",-"AND" ++ return v << (s & 31) ++} ++ + func rshMask32Ux64(v uint32, s uint64) uint32 { + // arm64:"LSR",-"AND" ++ // loong64:"SRL\t","AND","SGTU","MASKEQZ" + // ppc64x:"ISEL",-"ORN" + // riscv64:"SRLW","SLTIU","NEG","AND\t",-"SRL\t" + // s390x:-"RISBGZ",-"AND",-"LOCGR" +@@ -154,12 +164,14 @@ func rshMask32Ux64(v uint32, s uint64) uint32 { + } + + func rsh5Mask32Ux64(v uint32, s uint64) uint32 { ++ // loong64:"SRL\t",-"AND" + // riscv64:"SRLW",-"AND\t",-"SLTIU",-"SRL\t" + return v >> (s & 31) + } + + func rshMask32x64(v int32, s uint64) int32 { + // arm64:"ASR",-"AND" ++ // loong64:"SRA\t","AND","SGTU","SUBVU","OR" + // ppc64x:"ISEL",-"ORN" + // riscv64:"SRAW","OR","SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" +@@ -167,12 +179,14 @@ func rshMask32x64(v int32, s uint64) int32 { + } + + func rsh5Mask32x64(v int32, s uint64) int32 { ++ // loong64:"SRA\t",-"AND" + // riscv64:"SRAW",-"OR",-"SLTIU" + return v >> (s & 31) + } + + func lshMask64x32(v int64, s uint32) int64 { + // arm64:"LSL",-"AND" ++ // loong64:"SLLV",-"AND" + // ppc64x:"RLDICL",-"ORN" + // riscv64:"SLL",-"AND\t",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" +@@ -181,6 +195,7 @@ func lshMask64x32(v int64, s uint32) int64 { + + func rshMask64Ux32(v uint64, s uint32) uint64 { + // arm64:"LSR",-"AND",-"CSEL" ++ // loong64:"SRLV",-"AND" + // ppc64x:"RLDICL",-"ORN" + // riscv64:"SRL\t",-"AND\t",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" +@@ -189,6 +204,7 @@ func rshMask64Ux32(v uint64, s uint32) uint64 { + + func rshMask64x32(v int64, s uint32) int64 { + // arm64:"ASR",-"AND",-"CSEL" ++ // loong64:"SRAV",-"AND" + // ppc64x:"RLDICL",-"ORN",-"ISEL" + // riscv64:"SRA\t",-"OR",-"SLTIU" + // s390x:-"RISBGZ",-"AND",-"LOCGR" +-- +2.38.1 + diff --git a/0024-runtime-use-ABIInternal-on-syscall-and-other-sys.stu.patch b/0024-runtime-use-ABIInternal-on-syscall-and-other-sys.stu.patch new file mode 100644 index 0000000000000000000000000000000000000000..bc81e40b22cf501e51b804355e9f185b57a9dd79 --- /dev/null +++ b/0024-runtime-use-ABIInternal-on-syscall-and-other-sys.stu.patch @@ -0,0 +1,505 @@ +From 7e54d3bbc1af00ca94819f9c1bbb61f822d37439 Mon Sep 17 00:00:00 2001 +From: Guoqi Chen +Date: Tue, 26 Nov 2024 15:44:28 +0800 +Subject: [PATCH 24/44] runtime: use ABIInternal on syscall and other sys.stuff + for loong64 + +Change-Id: Ieeb3f2af02c55a9ad62a19d0085b0e082a182db4 +--- + src/runtime/sys_linux_loong64.s | 227 +++++++++++--------------------- + 1 file changed, 79 insertions(+), 148 deletions(-) + +diff --git a/src/runtime/sys_linux_loong64.s b/src/runtime/sys_linux_loong64.s +index 57cee99da7..b4e9930755 100644 +--- a/src/runtime/sys_linux_loong64.s ++++ b/src/runtime/sys_linux_loong64.s +@@ -47,8 +47,7 @@ + #define SYS_timer_delete 111 + + // func exit(code int32) +-TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0-4 +- MOVW code+0(FP), R4 ++TEXT runtime·exit(SB),NOSPLIT,$0 + MOVV $SYS_exit_group, R11 + SYSCALL + RET +@@ -67,48 +66,49 @@ TEXT 
runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-8 + JMP 0(PC) + + // func open(name *byte, mode, perm int32) int32 +-TEXT runtime·open(SB),NOSPLIT|NOFRAME,$0-20 ++TEXT runtime·open(SB),NOSPLIT,$0 ++ // before: ++ // R4: name ++ // R5: mode ++ // R6: perm ++ ++ // after: ++ // R4: AT_FDCWD ++ // R5: name ++ // R6: mode ++ // R7: perm ++ ++ MOVW R6, R7 ++ MOVW R5, R6 ++ MOVV R4, R5 + MOVW $AT_FDCWD, R4 // AT_FDCWD, so this acts like open +- MOVV name+0(FP), R5 +- MOVW mode+8(FP), R6 +- MOVW perm+12(FP), R7 ++ + MOVV $SYS_openat, R11 + SYSCALL + MOVW $-4096, R5 + BGEU R5, R4, 2(PC) + MOVW $-1, R4 +- MOVW R4, ret+16(FP) + RET + + // func closefd(fd int32) int32 +-TEXT runtime·closefd(SB),NOSPLIT|NOFRAME,$0-12 +- MOVW fd+0(FP), R4 ++TEXT runtime·closefd(SB),NOSPLIT,$0 + MOVV $SYS_close, R11 + SYSCALL + MOVW $-4096, R5 + BGEU R5, R4, 2(PC) + MOVW $-1, R4 +- MOVW R4, ret+8(FP) + RET + + // func write1(fd uintptr, p unsafe.Pointer, n int32) int32 +-TEXT runtime·write1(SB),NOSPLIT|NOFRAME,$0-28 +- MOVV fd+0(FP), R4 +- MOVV p+8(FP), R5 +- MOVW n+16(FP), R6 ++TEXT runtime·write1(SB),NOSPLIT,$0 + MOVV $SYS_write, R11 + SYSCALL +- MOVW R4, ret+24(FP) + RET + + // func read(fd int32, p unsafe.Pointer, n int32) int32 +-TEXT runtime·read(SB),NOSPLIT|NOFRAME,$0-28 +- MOVW fd+0(FP), R4 +- MOVV p+8(FP), R5 +- MOVW n+16(FP), R6 ++TEXT runtime·read(SB),NOSPLIT,$0 + MOVV $SYS_read, R11 + SYSCALL +- MOVW R4, ret+24(FP) + RET + + // func pipe2(flags int32) (r, w int32, errno int32) +@@ -121,16 +121,15 @@ TEXT runtime·pipe2(SB),NOSPLIT|NOFRAME,$0-20 + RET + + // func usleep(usec uint32) +-TEXT runtime·usleep(SB),NOSPLIT,$16-4 +- MOVWU usec+0(FP), R7 ++TEXT runtime·usleep(SB),NOSPLIT,$16 + MOVV $1000, R6 +- MULVU R6, R7, R7 ++ MULVU R6, R4, R4 + MOVV $1000000000, R6 + +- DIVVU R6, R7, R5 // ts->tv_sec +- REMVU R6, R7, R4 // ts->tv_nsec ++ DIVVU R6, R4, R5 // ts->tv_sec ++ REMVU R6, R4, R8 // ts->tv_nsec + MOVV R5, 8(R3) +- MOVV R4, 16(R3) ++ MOVV R8, 16(R3) + + // nanosleep(&ts, 0) + ADDV $8, R3, R4 +@@ -140,14 +139,14 @@ TEXT runtime·usleep(SB),NOSPLIT,$16-4 + RET + + // func gettid() uint32 +-TEXT runtime·gettid(SB),NOSPLIT,$0-4 ++TEXT runtime·gettid(SB),NOSPLIT,$0 + MOVV $SYS_gettid, R11 + SYSCALL +- MOVW R4, ret+0(FP) + RET + + // func raise(sig uint32) +-TEXT runtime·raise(SB),NOSPLIT|NOFRAME,$0 ++TEXT runtime·raise(SB),NOSPLIT,$0 ++ MOVW R4, R24 // backup sig + MOVV $SYS_getpid, R11 + SYSCALL + MOVW R4, R23 +@@ -155,87 +154,66 @@ TEXT runtime·raise(SB),NOSPLIT|NOFRAME,$0 + SYSCALL + MOVW R4, R5 // arg 2 tid + MOVW R23, R4 // arg 1 pid +- MOVW sig+0(FP), R6 // arg 3 ++ MOVW R24, R6 // arg 3 + MOVV $SYS_tgkill, R11 + SYSCALL + RET + + // func raiseproc(sig uint32) +-TEXT runtime·raiseproc(SB),NOSPLIT|NOFRAME,$0 ++TEXT runtime·raiseproc(SB),NOSPLIT,$0 ++ MOVW R4, R24 // backup sig + MOVV $SYS_getpid, R11 + SYSCALL + //MOVW R4, R4 // arg 1 pid +- MOVW sig+0(FP), R5 // arg 2 ++ MOVW R24, R5 // arg 2 + MOVV $SYS_kill, R11 + SYSCALL + RET + + // func getpid() int +-TEXT ·getpid(SB),NOSPLIT|NOFRAME,$0-8 ++TEXT ·getpid(SB),NOSPLIT,$0 + MOVV $SYS_getpid, R11 + SYSCALL +- MOVV R4, ret+0(FP) + RET + + // func tgkill(tgid, tid, sig int) +-TEXT ·tgkill(SB),NOSPLIT|NOFRAME,$0-24 +- MOVV tgid+0(FP), R4 +- MOVV tid+8(FP), R5 +- MOVV sig+16(FP), R6 ++TEXT ·tgkill(SB),NOSPLIT,$0 + MOVV $SYS_tgkill, R11 + SYSCALL + RET + + // func setitimer(mode int32, new, old *itimerval) +-TEXT runtime·setitimer(SB),NOSPLIT|NOFRAME,$0-24 +- MOVW mode+0(FP), R4 +- MOVV new+8(FP), R5 +- MOVV old+16(FP), R6 ++TEXT runtime·setitimer(SB),NOSPLIT,$0 + 
MOVV $SYS_setitimer, R11 + SYSCALL + RET + + // func timer_create(clockid int32, sevp *sigevent, timerid *int32) int32 +-TEXT runtime·timer_create(SB),NOSPLIT,$0-28 +- MOVW clockid+0(FP), R4 +- MOVV sevp+8(FP), R5 +- MOVV timerid+16(FP), R6 ++TEXT runtime·timer_create(SB),NOSPLIT,$0 + MOVV $SYS_timer_create, R11 + SYSCALL +- MOVW R4, ret+24(FP) + RET + + // func timer_settime(timerid int32, flags int32, new, old *itimerspec) int32 +-TEXT runtime·timer_settime(SB),NOSPLIT,$0-28 +- MOVW timerid+0(FP), R4 +- MOVW flags+4(FP), R5 +- MOVV new+8(FP), R6 +- MOVV old+16(FP), R7 ++TEXT runtime·timer_settime(SB),NOSPLIT,$0 + MOVV $SYS_timer_settime, R11 + SYSCALL +- MOVW R4, ret+24(FP) + RET + + // func timer_delete(timerid int32) int32 +-TEXT runtime·timer_delete(SB),NOSPLIT,$0-12 +- MOVW timerid+0(FP), R4 ++TEXT runtime·timer_delete(SB),NOSPLIT,$0 + MOVV $SYS_timer_delete, R11 + SYSCALL +- MOVW R4, ret+8(FP) + RET + + // func mincore(addr unsafe.Pointer, n uintptr, dst *byte) int32 +-TEXT runtime·mincore(SB),NOSPLIT|NOFRAME,$0-28 +- MOVV addr+0(FP), R4 +- MOVV n+8(FP), R5 +- MOVV dst+16(FP), R6 ++TEXT runtime·mincore(SB),NOSPLIT,$0 + MOVV $SYS_mincore, R11 + SYSCALL +- MOVW R4, ret+24(FP) + RET + + // func walltime() (sec int64, nsec int32) +-TEXT runtime·walltime(SB),NOSPLIT,$24-12 ++TEXT runtime·walltime(SB),NOSPLIT,$24 + MOVV R3, R23 // R23 is unchanged by C code + MOVV R3, R25 + +@@ -291,7 +269,7 @@ nosaveg: + JAL (R20) + + finish: +- MOVV 0(R3), R7 // sec ++ MOVV 0(R3), R4 // sec + MOVV 8(R3), R5 // nsec + + MOVV R23, R3 // restore SP +@@ -304,9 +282,6 @@ finish: + MOVV R25, m_vdsoSP(R24) + MOVV 8(R3), R25 + MOVV R25, m_vdsoPC(R24) +- +- MOVV R7, sec+0(FP) +- MOVW R5, nsec+8(FP) + RET + + fallback: +@@ -315,7 +290,7 @@ fallback: + JMP finish + + // func nanotime1() int64 +-TEXT runtime·nanotime1(SB),NOSPLIT,$16-8 ++TEXT runtime·nanotime1(SB),NOSPLIT,$24 + MOVV R3, R23 // R23 is unchanged by C code + MOVV R3, R25 + +@@ -389,8 +364,7 @@ finish: + // return nsec in R7 + MOVV $1000000000, R4 + MULVU R4, R7, R7 +- ADDVU R5, R7 +- MOVV R7, ret+0(FP) ++ ADDVU R5, R7, R4 + RET + + fallback: +@@ -399,11 +373,7 @@ fallback: + JMP finish + + // func rtsigprocmask(how int32, new, old *sigset, size int32) +-TEXT runtime·rtsigprocmask(SB),NOSPLIT|NOFRAME,$0-28 +- MOVW how+0(FP), R4 +- MOVV new+8(FP), R5 +- MOVV old+16(FP), R6 +- MOVW size+24(FP), R7 ++TEXT runtime·rtsigprocmask(SB),NOSPLIT,$0 + MOVV $SYS_rt_sigprocmask, R11 + SYSCALL + MOVW $-4096, R5 +@@ -412,22 +382,21 @@ TEXT runtime·rtsigprocmask(SB),NOSPLIT|NOFRAME,$0-28 + RET + + // func rt_sigaction(sig uintptr, new, old *sigactiont, size uintptr) int32 +-TEXT runtime·rt_sigaction(SB),NOSPLIT|NOFRAME,$0-36 +- MOVV sig+0(FP), R4 +- MOVV new+8(FP), R5 +- MOVV old+16(FP), R6 +- MOVV size+24(FP), R7 ++TEXT runtime·rt_sigaction(SB),NOSPLIT,$0 + MOVV $SYS_rt_sigaction, R11 + SYSCALL +- MOVW R4, ret+32(FP) + RET + + // func sigfwd(fn uintptr, sig uint32, info *siginfo, ctx unsafe.Pointer) +-TEXT runtime·sigfwd(SB),NOSPLIT,$0-32 +- MOVW sig+8(FP), R4 +- MOVV info+16(FP), R5 +- MOVV ctx+24(FP), R6 +- MOVV fn+0(FP), R20 ++TEXT runtime·sigfwd(SB),NOSPLIT,$0 ++ // before: ++ // R4: fn, R5: sig, R6: info, R7: ctx ++ // after: ++ // R20: fn, R4: sig, R5: info, R6: ctx ++ MOVV R4, R20 ++ MOVV R5, R4 ++ MOVV R6, R5 ++ MOVV R7, R6 + JAL (R20) + RET + +@@ -460,48 +429,31 @@ TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0 + JMP runtime·sigtramp(SB) + + // func sysMmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (p unsafe.Pointer, err int) +-TEXT 
runtime·sysMmap(SB),NOSPLIT|NOFRAME,$0 +- MOVV addr+0(FP), R4 +- MOVV n+8(FP), R5 +- MOVW prot+16(FP), R6 +- MOVW flags+20(FP), R7 +- MOVW fd+24(FP), R8 +- MOVW off+28(FP), R9 +- ++TEXT runtime·sysMmap(SB),NOSPLIT,$0 + MOVV $SYS_mmap, R11 + SYSCALL + MOVW $-4096, R5 + BGEU R5, R4, ok +- MOVV $0, p+32(FP) +- SUBVU R4, R0, R4 +- MOVV R4, err+40(FP) ++ SUBVU R4, R0, R5 ++ MOVV $0, R4 + RET + ok: +- MOVV R4, p+32(FP) +- MOVV $0, err+40(FP) ++ MOVV $0, R5 + RET + + // Call the function stored in _cgo_mmap using the GCC calling convention. + // This must be called on the system stack. + // func callCgoMmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) uintptr +-TEXT runtime·callCgoMmap(SB),NOSPLIT,$0 +- MOVV addr+0(FP), R4 +- MOVV n+8(FP), R5 +- MOVW prot+16(FP), R6 +- MOVW flags+20(FP), R7 +- MOVW fd+24(FP), R8 +- MOVW off+28(FP), R9 ++TEXT runtime·callCgoMmap(SB),NOSPLIT,$0 + MOVV _cgo_mmap(SB), R13 + SUBV $16, R3 // reserve 16 bytes for sp-8 where fp may be saved. + JAL (R13) + ADDV $16, R3 +- MOVV R4, ret+32(FP) ++ MOVV R4, R4 + RET + + // func sysMunmap(addr unsafe.Pointer, n uintptr) +-TEXT runtime·sysMunmap(SB),NOSPLIT|NOFRAME,$0 +- MOVV addr+0(FP), R4 +- MOVV n+8(FP), R5 ++TEXT runtime·sysMunmap(SB),NOSPLIT,$0 + MOVV $SYS_munmap, R11 + SYSCALL + MOVW $-4096, R5 +@@ -512,9 +464,7 @@ TEXT runtime·sysMunmap(SB),NOSPLIT|NOFRAME,$0 + // Call the function stored in _cgo_munmap using the GCC calling convention. + // This must be called on the system stack. + // func callCgoMunmap(addr unsafe.Pointer, n uintptr) +-TEXT runtime·callCgoMunmap(SB),NOSPLIT,$0 +- MOVV addr+0(FP), R4 +- MOVV n+8(FP), R5 ++TEXT runtime·callCgoMunmap(SB),NOSPLIT,$0 + MOVV _cgo_munmap(SB), R13 + SUBV $16, R3 // reserve 16 bytes for sp-8 where fp may be saved. + JAL (R13) +@@ -522,38 +472,24 @@ TEXT runtime·callCgoMunmap(SB),NOSPLIT,$0 + RET + + // func madvise(addr unsafe.Pointer, n uintptr, flags int32) +-TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0 +- MOVV addr+0(FP), R4 +- MOVV n+8(FP), R5 +- MOVW flags+16(FP), R6 ++TEXT runtime·madvise(SB),NOSPLIT,$0 + MOVV $SYS_madvise, R11 + SYSCALL +- MOVW R4, ret+24(FP) + RET + + // func futex(addr unsafe.Pointer, op int32, val uint32, ts, addr2 unsafe.Pointer, val3 uint32) int32 +-TEXT runtime·futex(SB),NOSPLIT|NOFRAME,$0 +- MOVV addr+0(FP), R4 +- MOVW op+8(FP), R5 +- MOVW val+12(FP), R6 +- MOVV ts+16(FP), R7 +- MOVV addr2+24(FP), R8 +- MOVW val3+32(FP), R9 ++TEXT runtime·futex(SB),NOSPLIT,$0 + MOVV $SYS_futex, R11 + SYSCALL +- MOVW R4, ret+40(FP) + RET + + // int64 clone(int32 flags, void *stk, M *mp, G *gp, void (*fn)(void)); +-TEXT runtime·clone(SB),NOSPLIT|NOFRAME,$0 +- MOVW flags+0(FP), R4 +- MOVV stk+8(FP), R5 +- ++TEXT runtime·clone(SB),NOSPLIT,$0 + // Copy mp, gp, fn off parent stack for use by child. + // Careful: Linux system call clobbers ???. +- MOVV mp+16(FP), R23 +- MOVV gp+24(FP), R24 +- MOVV fn+32(FP), R25 ++ MOVV R6, R23 ++ MOVV R7, R24 ++ MOVV R8, R25 + + MOVV R23, -8(R5) + MOVV R24, -16(R5) +@@ -565,8 +501,7 @@ TEXT runtime·clone(SB),NOSPLIT|NOFRAME,$0 + SYSCALL + + // In parent, return. +- BEQ R4, 3(PC) +- MOVW R4, ret+40(FP) ++ BEQ R4, 2(PC) + RET + + // In child, on new stack. 
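For illustration (a sketch, not part of this patch): the rewrite pattern through this file is uniform. Under the internal ABI on loong64, integer arguments arrive in R4-R11 and the integer result is returned in R4, which lines up with the Linux syscall convention, so the old FP loads and stores become dead and most wrappers shrink to loading the syscall number. A hypothetical one-argument wrapper, assuming the SYS_close constant defined at the top of the file (error normalization elided):

// func closefdLike(fd int32) int32
TEXT ·closefdLike(SB),NOSPLIT,$0
	// fd is already in R4, the first ABIInternal argument register.
	MOVV	$SYS_close, R11
	SYSCALL
	// The kernel leaves its return value in R4, which is also the
	// first ABIInternal result register, so nothing is stored back.
	RET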
+@@ -606,9 +541,7 @@ nog: + JMP -3(PC) // keep exiting + + // func sigaltstack(new, old *stackt) +-TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0 +- MOVV new+0(FP), R4 +- MOVV old+8(FP), R5 ++TEXT runtime·sigaltstack(SB),NOSPLIT,$0 + MOVV $SYS_sigaltstack, R11 + SYSCALL + MOVW $-4096, R5 +@@ -617,42 +550,40 @@ TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0 + RET + + // func osyield() +-TEXT runtime·osyield(SB),NOSPLIT|NOFRAME,$0 ++TEXT runtime·osyield(SB),NOSPLIT,$0 + MOVV $SYS_sched_yield, R11 + SYSCALL + RET + + // func sched_getaffinity(pid, len uintptr, buf *uintptr) int32 +-TEXT runtime·sched_getaffinity(SB),NOSPLIT|NOFRAME,$0 +- MOVV pid+0(FP), R4 +- MOVV len+8(FP), R5 +- MOVV buf+16(FP), R6 ++TEXT runtime·sched_getaffinity(SB),NOSPLIT,$0 + MOVV $SYS_sched_getaffinity, R11 + SYSCALL +- MOVW R4, ret+24(FP) + RET + + // func sbrk0() uintptr +-TEXT runtime·sbrk0(SB),NOSPLIT|NOFRAME,$0-8 ++TEXT runtime·sbrk0(SB),NOSPLIT,$0 + // Implemented as brk(NULL). + MOVV $0, R4 + MOVV $SYS_brk, R11 + SYSCALL +- MOVV R4, ret+0(FP) + RET + ++// unimplemented, only needed for android; declared in stubs_linux.go + TEXT runtime·access(SB),$0-20 +- MOVV R0, 2(R0) // unimplemented, only needed for android; declared in stubs_linux.go ++ MOVV R0, 2(R0) + MOVW R0, ret+16(FP) // for vet + RET + ++// unimplemented, only needed for android; declared in stubs_linux.go + TEXT runtime·connect(SB),$0-28 +- MOVV R0, 2(R0) // unimplemented, only needed for android; declared in stubs_linux.go ++ MOVV R0, 2(R0) + MOVW R0, ret+24(FP) // for vet + RET + ++// unimplemented, only needed for android; declared in stubs_linux.go + TEXT runtime·socket(SB),$0-20 +- MOVV R0, 2(R0) // unimplemented, only needed for android; declared in stubs_linux.go ++ MOVV R0, 2(R0) + MOVW R0, ret+16(FP) // for vet + RET + +-- +2.38.1 + diff --git a/0025-runtime-use-correct-memory-barrier-in-exitThread-fun.patch b/0025-runtime-use-correct-memory-barrier-in-exitThread-fun.patch new file mode 100644 index 0000000000000000000000000000000000000000..0b81cb190f22ff450cd050c9bcc0f3cfc1827f41 --- /dev/null +++ b/0025-runtime-use-correct-memory-barrier-in-exitThread-fun.patch @@ -0,0 +1,34 @@ +From 5bb6b8ebb22faf46a01ff292c45a7dc72f2b5022 Mon Sep 17 00:00:00 2001 +From: Guoqi Chen +Date: Tue, 26 Nov 2024 17:10:32 +0800 +Subject: [PATCH 25/44] runtime: use correct memory barrier in exitThread + function on loong64 + +In the runtime.exitThread function, a storeRelease barrier +is required instead of a full barrier. + +Change-Id: I614c6f74e8c9fd56c3badf3bf450b3314e3f377c +--- + src/runtime/sys_linux_loong64.s | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +diff --git a/src/runtime/sys_linux_loong64.s b/src/runtime/sys_linux_loong64.s +index b4e9930755..830eb9d099 100644 +--- a/src/runtime/sys_linux_loong64.s ++++ b/src/runtime/sys_linux_loong64.s +@@ -56,10 +56,8 @@ TEXT runtime·exit(SB),NOSPLIT,$0 + TEXT runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-8 + MOVV wait+0(FP), R19 + // We're done using the stack. 
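// Illustration, not part of the patch: the write below hands the stack
// back to the parent, so every prior write to that stack must become
// visible first. That is a release store, which a single DBAR with the
// StoreRelease hint provides; the old code paid for two full barriers.
// A rough Go-level analogue, assuming sync/atomic:
//
//	var stackInUse atomic.Uint32
//	stackInUse.Store(0) // publish: prior writes ordered before the flag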
+- MOVW $0, R11
+- DBAR
+- MOVW R11, (R19)
+- DBAR
++ DBAR $0x12 // StoreRelease barrier
++ MOVW R0, (R19)
+ MOVW $0, R4 // exit code
+ MOVV $SYS_exit, R11
+ SYSCALL
+--
+2.38.1
+
diff --git a/0026-cmd-internal-obj-loong64-add-V-XV-SEQI-V-XV-.-AND-OR.patch b/0026-cmd-internal-obj-loong64-add-V-XV-SEQI-V-XV-.-AND-OR.patch
new file mode 100644
index 0000000000000000000000000000000000000000..d73cc6515c410c16cd99184bc242d0799d477852
--- /dev/null
+++ b/0026-cmd-internal-obj-loong64-add-V-XV-SEQI-V-XV-.-AND-OR.patch
@@ -0,0 +1,410 @@
+From 38ab8bc5eb69cb2746b32fd4a6ca7931adb7722b Mon Sep 17 00:00:00 2001
+From: Guoqi Chen
+Date: Fri, 29 Nov 2024 15:41:33 +0800
+Subject: [PATCH 26/44] cmd/internal/obj/loong64: add {V,XV}SEQI,
+ {V,XV}.{AND,OR,XOR,NOR} instructions support
+
+Go asm syntax:
+ VSEQB $1, V2, V3
+ XVSEQB $2, X2, X3
+ V{AND,OR,XOR,NOR}B $1, V2, V3
+ XV{AND,OR,XOR,NOR}B $1, V2, V3
+ V{AND,OR,XOR,NOR,ANDN,ORN}V V1, V2, V3
+ XV{AND,OR,XOR,NOR,ANDN,ORN}V V1, V2, V3
+
+Equivalent platform assembler syntax:
+ vseqi.b v3, v2, $1
+ xvseqi.b x3, x2, $2
+ v{and,or,xor,nor}i.b v3, v2, $1
+ xv{and,or,xor,nor}i.b x3, x2, $1
+ v{and,or,xor,nor,andn,orn}.v v3, v2, v1
+ xv{and,or,xor,nor,andn,orn}.v x3, x2, x1
+
+Change-Id: I56ae0db72c7f473755cbdc7f7171c1058a9def97
+---
+ .../asm/internal/asm/testdata/loong64enc1.s | 38 ++++
+ src/cmd/internal/obj/loong64/a.out.go | 21 +++
+ src/cmd/internal/obj/loong64/anames.go | 20 ++
+ src/cmd/internal/obj/loong64/asm.go | 173 ++++++++++++++++--
+ 4 files changed, 238 insertions(+), 14 deletions(-)
+
+diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s
+index 3a3eb10a74..2418412a3a 100644
+--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s
++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s
+@@ -506,6 +506,16 @@ lable2:
+ XVSEQH X3, X2, X4 // 448c0074
+ XVSEQW X3, X2, X4 // 440c0174
+ XVSEQV X3, X2, X4 // 448c0174
++ VSEQB $0, V2, V3 // 43008072
++ VSEQH $1, V2, V3 // 43848072
++ VSEQW $8, V2, V3 // 43208172
++ VSEQV $15, V2, V3 // 43bc8172
++ VSEQV $-15, V2, V3 // 43c48172
++ XVSEQB $0, X2, X4 // 44008076
++ XVSEQH $3, X2, X4 // 448c8076
++ XVSEQW $12, X2, X4 // 44308176
++ XVSEQV $15, X2, X4 // 44bc8176
++ XVSEQV $-15, X2, X4 // 44c48176
+
+ // VPCNT{B,H,W,V}, XVPCNT{B,H,W,V} instruction
+ VPCNTB V1, V2 // 22209c72
+ VPCNTH V1, V2 // 22249c72
+ VPCNTW V1, V2 // 22289c72
+ VPCNTV V1, V2 // 222c9c72
+ XVPCNTB X3, X2 // 62209c76
+ XVPCNTH X3, X2 // 62249c76
+ XVPCNTW X3, X2 // 62289c76
+ XVPCNTV X3, X2 // 622c9c76
+
++ // VANDV,VORV,VXORV,VNORV,VANDNV,VORNV
++ VANDV V1, V2, V3 // 43042671
++ VORV V1, V2, V3 // 43842671
++ VXORV V1, V2, V3 // 43042771
++ VNORV V1, V2, V3 // 43842771
++ VANDNV V1, V2, V3 // 43042871
++ VORNV V1, V2, V3 // 43842871
++
++ // VANDB,VORB,VXORB,VNORB
++ VANDB $0, V2, V3 // 4300d073
++ VORB $64, V2, V3 // 4300d573
++ VXORB $128, V2, V3 // 4300da73
++ VNORB $255, V2, V3 // 43fcdf73
++
++ // XVANDV,XVORV,XVXORV,XVNORV,XVANDNV,XVORNV
++ XVANDV X1, X2, X3 // 43042675
++ XVORV X1, X2, X3 // 43842675
++ XVXORV X1, X2, X3 // 43042775
++ XVNORV X1, X2, X3 // 43842775
++ XVANDNV X1, X2, X3 // 43042875
++ XVORNV X1, X2, X3 // 43842875
++
++ // XVANDB,XVORB,XVXORB,XVNORB
++ XVANDB $0, X2, X3 // 4300d077
++ XVORB $1, X2, X3 // 4304d477
++ XVXORB $127, X2, X3 // 43fcd977
++ XVNORB $255, X2, X3 // 43fcdf77
++
+ // MOVV C_DCON12_0, r
+ MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03
+ MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03
+diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go
+index 
b2207c2523..bd3ce61826 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -726,6 +726,27 @@ const ( + AXVMOVQ + + // LSX and LASX Bit-manipulation Instructions ++ AVANDB ++ AVORB ++ AVXORB ++ AVNORB ++ AXVANDB ++ AXVORB ++ AXVXORB ++ AXVNORB ++ AVANDV ++ AVORV ++ AVXORV ++ AVNORV ++ AVANDNV ++ AVORNV ++ AXVANDV ++ AXVORV ++ AXVXORV ++ AXVNORV ++ AXVANDNV ++ AXVORNV ++ + AVPCNTB + AVPCNTH + AVPCNTW +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index 3d2f329917..6c1537d123 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -257,6 +257,26 @@ var Anames = []string{ + "FTINTRNEVD", + "VMOVQ", + "XVMOVQ", ++ "VANDB", ++ "VORB", ++ "VXORB", ++ "VNORB", ++ "XVANDB", ++ "XVORB", ++ "XVXORB", ++ "XVNORB", ++ "VANDV", ++ "VORV", ++ "VXORV", ++ "VNORV", ++ "VANDNV", ++ "VORNV", ++ "XVANDV", ++ "XVORV", ++ "XVXORV", ++ "XVNORV", ++ "XVANDNV", ++ "XVORNV", + "VPCNTB", + "VPCNTH", + "VPCNTW", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 5757c3c452..7247193c95 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -51,6 +51,8 @@ const ( + // branchLoopHead marks loop entry. + // Used to insert padding for under-aligned loops. + branchLoopHead ++ immFiledSi5 // The encoding of the immediate field in the instruction is 5-bits ++ immFiledUi8 // The encoding of the immediate field in the instruction is 8-bits + ) + + var optab = []Optab{ +@@ -88,6 +90,17 @@ var optab = []Optab{ + {ACMPEQF, C_FREG, C_FREG, C_NONE, C_FCCREG, C_NONE, 2, 4, 0, 0}, + {AVSEQB, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, + {AXVSEQB, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, ++ {AVSEQB, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 13, 4, 0, immFiledSi5}, ++ {AXVSEQB, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 13, 4, 0, immFiledSi5}, ++ {AVSEQB, C_ADDCON, C_VREG, C_NONE, C_VREG, C_NONE, 13, 4, 0, immFiledSi5}, ++ {AXVSEQB, C_ADDCON, C_XREG, C_NONE, C_XREG, C_NONE, 13, 4, 0, immFiledSi5}, ++ ++ {AVANDV, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, ++ {AXVANDV, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, ++ {AVANDB, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 14, 4, 0, immFiledUi8}, ++ {AXVANDB, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 14, 4, 0, immFiledUi8}, ++ {AVANDB, C_ADDCON, C_VREG, C_NONE, C_VREG, C_NONE, 14, 4, 0, immFiledUi8}, ++ {AXVANDB, C_ADDCON, C_XREG, C_NONE, C_XREG, C_NONE, 14, 4, 0, immFiledUi8}, + + {ACLOW, C_REG, C_NONE, C_NONE, C_REG, C_NONE, 9, 4, 0, 0}, + {AABSF, C_FREG, C_NONE, C_NONE, C_FREG, C_NONE, 9, 4, 0, 0}, +@@ -1499,6 +1512,7 @@ func buildop(ctxt *obj.Link) { + } + opset(i, r0) + } ++ + case AVSEQB: + opset(AVSEQH, r0) + opset(AVSEQW, r0) +@@ -1509,6 +1523,30 @@ func buildop(ctxt *obj.Link) { + opset(AXVSEQW, r0) + opset(AXVSEQV, r0) + ++ case AVANDB: ++ opset(AVORB, r0) ++ opset(AVXORB, r0) ++ opset(AVNORB, r0) ++ ++ case AXVANDB: ++ opset(AXVORB, r0) ++ opset(AXVXORB, r0) ++ opset(AXVNORB, r0) ++ ++ case AVANDV: ++ opset(AVORV, r0) ++ opset(AVXORV, r0) ++ opset(AVNORV, r0) ++ opset(AVANDNV, r0) ++ opset(AVORNV, r0) ++ ++ case AXVANDV: ++ opset(AXVORV, r0) ++ opset(AXVXORV, r0) ++ opset(AXVNORV, r0) ++ opset(AXVANDNV, r0) ++ opset(AXVORNV, r0) ++ + case AVPCNTB: + opset(AVPCNTH, r0) + opset(AVPCNTW, r0) +@@ -1551,6 +1589,14 @@ func OP_12IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 { + return op | (i&0xFFF)<<10 | (r2&0x1F)<<5 | 
(r3&0x1F)<<0
+ }
+
++func OP_8IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
++ return op | (i&0xFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
++}
++
++func OP_5IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 {
++ return op | (i&0x1F)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0
++}
++
+ func OP_IR(op uint32, i uint32, r2 uint32) uint32 {
+ return op | (i&0xFFFFF)<<5 | (r2&0x1F)<<0 // ui20, rd5
+ }
+@@ -1623,12 +1669,10 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
+
+ case 4: // add $scon,[r1],r2
+ v := c.regoff(&p.From)
+-
+ r := int(p.Reg)
+ if r == 0 {
+ r = int(p.To.Reg)
+ }
+-
+ o1 = OP_12IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg))
+
+ case 5: // syscall
+@@ -1738,6 +1782,36 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
+ c.ctxt.Diag("unexpected encoding\n%v", p)
+ }
+
++ case 13: // add $si5,[r1],r2
++ v := c.regoff(&p.From)
++ r := int(p.Reg)
++ if r == 0 {
++ r = int(p.To.Reg)
++ }
++
++ switch o.flag {
++ case immFiledSi5:
++ c.checkimmFiled(p, v, 5, true)
++ o1 = OP_5IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg))
++ default:
++ c.ctxt.Diag("Invalid immediate value type\n%v", p)
++ }
++
++ case 14: // add $ui8,[r1],r2
++ v := c.regoff(&p.From)
++ r := int(p.Reg)
++ if r == 0 {
++ r = int(p.To.Reg)
++ }
++
++ switch o.flag {
++ case immFiledUi8:
++ c.checkimmFiled(p, v, 8, false)
++ o1 = OP_8IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg))
++ default:
++ c.ctxt.Diag("Invalid immediate value type\n%v", p)
++ }
++
+ case 15: // teq $c r,r
+ v := c.regoff(&p.From)
+ r := int(p.Reg)
+@@ -1760,18 +1834,18 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) {
+ o2 = OP_15I(c.opi(ABREAK), uint32(v))
+
+ case 16: // sll $c,[r1],r2
+- v := c.regoff(&p.From)
+- r := int(p.Reg)
+- if r == 0 {
+- r = int(p.To.Reg)
+- }
+-
+- // instruction ending with V:6-digit immediate, others:5-digit immediate
+- if v >= 32 && vshift(p.As) {
+- o1 = OP_16IRR(c.opirr(p.As), uint32(v)&0x3f, uint32(r), uint32(p.To.Reg))
+- } else {
+- o1 = OP_16IRR(c.opirr(p.As), uint32(v)&0x1f, uint32(r), uint32(p.To.Reg))
+- }
++ v := c.regoff(&p.From)
++ r := int(p.Reg)
++ if r == 0 {
++ r = int(p.To.Reg)
++ }
++
++ // instruction ending with V:6-digit immediate, others:5-digit immediate
++ if v >= 32 && vshift(p.As) {
++ o1 = OP_16IRR(c.opirr(p.As), uint32(v)&0x3f, uint32(r), uint32(p.To.Reg))
++ } else {
++ o1 = OP_16IRR(c.opirr(p.As), uint32(v)&0x1f, uint32(r), uint32(p.To.Reg))
++ }
+
+ case 17: // bstrpickw $msbw, r1, $lsbw, r2
+ rd, rj := p.To.Reg, p.Reg
+@@ -2348,6 +2422,21 @@ func (c *ctxt0) checkindex(p *obj.Prog, index uint32, mask uint32) {
+ }
+ }
+
++// checkimmFiled checks whether the immediate value exceeds the valid encoding range
++func (c *ctxt0) checkimmFiled(p *obj.Prog, imm int32, bits uint8, isSigned bool) {
++ if isSigned {
++ bound := int32(1 << (bits - 1))
++ if imm < -bound || imm >= bound {
++ c.ctxt.Diag("signed immediate %v exceeds the %d-bit range: %v", imm, bits, p)
++ }
++ } else {
++ mask := uint32(0xffffffff) << bits
++ if uint32(imm) != (uint32(imm) & ^mask) {
++ c.ctxt.Diag("unsigned immediate %v exceeds the %d-bit range: %v", imm, bits, p)
++ }
++ }
++}
++
+ func (c *ctxt0) vregoff(a *obj.Addr) int64 {
+ c.instoffset = 0
+ c.aclass(a)
+@@ -2588,6 +2677,30 @@ func (c *ctxt0) oprrr(a obj.As) uint32 {
+ return 0x0e003 << 15 // vseq.d
+ case AXVSEQV:
+ return 0x0e803 << 15 // xvseq.d
++ case AVANDV:
++ return 0x0E24C << 15 // vand.v
++ case AVORV:
++ return 0x0E24D << 15 // vor.v
++ case AVXORV:
++ return 0x0E24E << 15 // vxor.v
++ case AVNORV:
++ return 0x0E24F << 15 // vnor.v
++ case AVANDNV:
++ return 0x0E250 << 15 // vandn.v
++ case AVORNV:
++ return 0x0E251 << 15 // vorn.v
++ case AXVANDV:
++ return 0x0EA4C << 15 // xvand.v
++ case AXVORV:
++ return 0x0EA4D << 15 // xvor.v
++ case AXVXORV:
++ return 0x0EA4E << 15 // xvxor.v
++ case AXVNORV:
++ return 0x0EA4F << 15 // xvnor.v
++ case AXVANDNV:
++ return 0x0EA50 << 15 // xvandn.v
++ case AXVORNV:
++ return 0x0EA51 << 15 // xvorn.v
+ }
+
+ if a < 0 {
+@@ -2915,6 +3028,38 @@ func (c *ctxt0) opirr(a obj.As) uint32 {
+ return 0x021 << 24
+ case ASCV:
+ return 0x023 << 24
++ case AVANDB:
++ return 0x1CF4 << 18 // vandi.b
++ case AVORB:
++ return 0x1CF5 << 18 // vori.b
++ case AVXORB:
++ return 0x1CF6 << 18 // vxori.b
++ case AVNORB:
++ return 0x1CF7 << 18 // vnori.b
++ case AXVANDB:
++ return 0x1DF4 << 18 // xvandi.b
++ case AXVORB:
++ return 0x1DF5 << 18 // xvori.b
++ case AXVXORB:
++ return 0x1DF6 << 18 // xvxori.b
++ case AXVNORB:
++ return 0x1DF7 << 18 // xvnori.b
++ case AVSEQB:
++ return 0x0E500 << 15 // vseqi.b
++ case AVSEQH:
++ return 0x0E501 << 15 // vseqi.h
++ case AVSEQW:
++ return 0x0E502 << 15 // vseqi.w
++ case AVSEQV:
++ return 0x0E503 << 15 // vseqi.d
++ case AXVSEQB:
++ return 0x0ED00 << 15 // xvseqi.b
++ case AXVSEQH:
++ return 0x0ED01 << 15 // xvseqi.h
++ case AXVSEQW:
++ return 0x0ED02 << 15 // xvseqi.w
++ case AXVSEQV:
++ return 0x0ED03 << 15 // xvseqi.d
+ }
+
+ if a < 0 {
+--
+2.38.1
+
diff --git a/0027-cmd-internal-obj-loong64-add-V-XV-ADD-SUB-.-B-H-W-D-.patch b/0027-cmd-internal-obj-loong64-add-V-XV-ADD-SUB-.-B-H-W-D-.patch
new file mode 100644
index 0000000000000000000000000000000000000000..2192272d7bb79100579ca52591c8b18158fec4a7
--- /dev/null
+++ b/0027-cmd-internal-obj-loong64-add-V-XV-ADD-SUB-.-B-H-W-D-.patch
@@ -0,0 +1,207 @@
+From f5bbb15710944ebcc7d2c808fe9087892a690bc4 Mon Sep 17 00:00:00 2001
+From: Xiaolin Zhao
+Date: Wed, 11 Dec 2024 09:26:38 +0800
+Subject: [PATCH 27/44] cmd/internal/obj/loong64: add
+ {V,XV}{ADD/SUB}.{B,H,W,D,Q} instructions support
+
+Go asm syntax:
+ V{ADD/SUB}{B,H,W,V,Q} VK, VJ, VD
+ XV{ADD/SUB}{B,H,W,V,Q} XK, XJ, XD
+
+Equivalent platform assembler syntax:
+ v{add/sub}.{b,h,w,d,q} vd, vj, vk
+ xv{add/sub}.{b,h,w,d,q} xd, xj, xk
+
+Change-Id: Iadc28100c93d6d6c69e9641bfea78fa85d75bddf
+---
+ .../asm/internal/asm/testdata/loong64enc1.s | 22 +++++++
+ src/cmd/internal/obj/loong64/a.out.go | 22 +++++++
+ src/cmd/internal/obj/loong64/anames.go | 20 +++++++
+ src/cmd/internal/obj/loong64/asm.go | 60 +++++++++++++++++++
+ 4 files changed, 124 insertions(+)
+
+diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s
+index 2418412a3a..76faf2d3cb 100644
+--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s
++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s
+@@ -555,6 +555,28 @@ lable2:
+ XVXORB $127, X2, X3 // 43fcd977
+ XVNORB $255, X2, X3 // 43fcdf77
+
++ // [X]VADD{B,H,W,V,Q}, [X]VSUB{B,H,W,V,Q} instructions
++ VADDB V1, V2, V3 // 43040a70
++ VADDH V1, V2, V3 // 43840a70
++ VADDW V1, V2, V3 // 43040b70
++ VADDV V1, V2, V3 // 43840b70
++ VADDQ V1, V2, V3 // 43042d71
++ VSUBB V1, V2, V3 // 43040c70
++ VSUBH V1, V2, V3 // 43840c70
++ VSUBW V1, V2, V3 // 43040d70
++ VSUBV V1, V2, V3 // 43840d70
++ VSUBQ V1, V2, V3 // 43842d71
++ XVADDB X3, X2, X1 // 410c0a74
++ XVADDH X3, X2, X1 // 418c0a74
++ XVADDW X3, X2, X1 // 410c0b74
++ XVADDV X3, X2, X1 // 418c0b74
++ XVADDQ X3, X2, X1 // 410c2d75
++ XVSUBB X3, X2, X1 // 410c0c74
++ XVSUBH X3, X2, X1 
// 418c0c74 ++ XVSUBW X3, X2, X1 // 410c0d74 ++ XVSUBV X3, X2, X1 // 418c0d74 ++ XVSUBQ X3, X2, X1 // 418c2d75 ++ + // MOVV C_DCON12_0, r + MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 + MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index bd3ce61826..3bef0da869 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -725,6 +725,28 @@ const ( + AVMOVQ + AXVMOVQ + ++ // LSX and LASX arithmetic instructions ++ AVADDB ++ AVADDH ++ AVADDW ++ AVADDV ++ AVADDQ ++ AXVADDB ++ AXVADDH ++ AXVADDW ++ AXVADDV ++ AXVADDQ ++ AVSUBB ++ AVSUBH ++ AVSUBW ++ AVSUBV ++ AVSUBQ ++ AXVSUBB ++ AXVSUBH ++ AXVSUBW ++ AXVSUBV ++ AXVSUBQ ++ + // LSX and LASX Bit-manipulation Instructions + AVANDB + AVORB +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index 6c1537d123..194021219e 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -257,6 +257,26 @@ var Anames = []string{ + "FTINTRNEVD", + "VMOVQ", + "XVMOVQ", ++ "VADDB", ++ "VADDH", ++ "VADDW", ++ "VADDV", ++ "VADDQ", ++ "XVADDB", ++ "XVADDH", ++ "XVADDW", ++ "XVADDV", ++ "XVADDQ", ++ "VSUBB", ++ "VSUBH", ++ "VSUBW", ++ "VSUBV", ++ "VSUBQ", ++ "XVSUBB", ++ "XVSUBH", ++ "XVSUBW", ++ "XVSUBV", ++ "XVSUBQ", + "VANDB", + "VORB", + "VXORB", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 7247193c95..7489b4dbf6 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -1539,6 +1539,16 @@ func buildop(ctxt *obj.Link) { + opset(AVNORV, r0) + opset(AVANDNV, r0) + opset(AVORNV, r0) ++ opset(AVADDB, r0) ++ opset(AVADDH, r0) ++ opset(AVADDW, r0) ++ opset(AVADDV, r0) ++ opset(AVADDQ, r0) ++ opset(AVSUBB, r0) ++ opset(AVSUBH, r0) ++ opset(AVSUBW, r0) ++ opset(AVSUBV, r0) ++ opset(AVSUBQ, r0) + + case AXVANDV: + opset(AXVORV, r0) +@@ -1546,6 +1556,16 @@ func buildop(ctxt *obj.Link) { + opset(AXVNORV, r0) + opset(AXVANDNV, r0) + opset(AXVORNV, r0) ++ opset(AXVADDB, r0) ++ opset(AXVADDH, r0) ++ opset(AXVADDW, r0) ++ opset(AXVADDV, r0) ++ opset(AXVADDQ, r0) ++ opset(AXVSUBB, r0) ++ opset(AXVSUBH, r0) ++ opset(AXVSUBW, r0) ++ opset(AXVSUBV, r0) ++ opset(AXVSUBQ, r0) + + case AVPCNTB: + opset(AVPCNTH, r0) +@@ -2701,6 +2721,46 @@ func (c *ctxt0) oprrr(a obj.As) uint32 { + return 0x0EA50 << 15 // xvandn.v + case AXVORNV: + return 0x0EA51 << 15 // xvorn.v ++ case AVADDB: ++ return 0xE014 << 15 // vadd.b ++ case AVADDH: ++ return 0xE015 << 15 // vadd.h ++ case AVADDW: ++ return 0xE016 << 15 // vadd.w ++ case AVADDV: ++ return 0xE017 << 15 // vadd.d ++ case AVADDQ: ++ return 0xE25A << 15 // vadd.q ++ case AVSUBB: ++ return 0xE018 << 15 // vsub.b ++ case AVSUBH: ++ return 0xE019 << 15 // vsub.h ++ case AVSUBW: ++ return 0xE01A << 15 // vsub.w ++ case AVSUBV: ++ return 0xE01B << 15 // vsub.d ++ case AVSUBQ: ++ return 0xE25B << 15 // vsub.q ++ case AXVADDB: ++ return 0xE814 << 15 // xvadd.b ++ case AXVADDH: ++ return 0xE815 << 15 // xvadd.h ++ case AXVADDW: ++ return 0xE816 << 15 // xvadd.w ++ case AXVADDV: ++ return 0xE817 << 15 // xvadd.d ++ case AXVADDQ: ++ return 0xEA5A << 15 // xvadd.q ++ case AXVSUBB: ++ return 0xE818 << 15 // xvsub.b ++ case AXVSUBH: ++ return 0xE819 << 15 // xvsub.h ++ case AXVSUBW: ++ return 0xE81A << 15 // xvsub.w ++ case AXVSUBV: ++ return 0xE81B << 15 // xvsub.d ++ case AXVSUBQ: ++ return 
0xEA5B << 15 // xvsub.q + } + + if a < 0 { +-- +2.38.1 + diff --git a/0028-cmd-internal-obj-loong64-add-V-XV-ILV-L-H-.-B-H-W-D-.patch b/0028-cmd-internal-obj-loong64-add-V-XV-ILV-L-H-.-B-H-W-D-.patch new file mode 100644 index 0000000000000000000000000000000000000000..316746628a9aa2ec11178b65fa685d12a159372e --- /dev/null +++ b/0028-cmd-internal-obj-loong64-add-V-XV-ILV-L-H-.-B-H-W-D-.patch @@ -0,0 +1,181 @@ +From db7ccba69b0c246434a610f3be2ab31c8406b163 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Wed, 11 Dec 2024 10:24:13 +0800 +Subject: [PATCH 28/44] cmd/internal/obj/loong64: add {V,XV}ILV{L/H}.{B/H/W/D} + instructions support + +Go asm syntax: + VILV{L/H}{B/H/W/V} VK, VJ, VD + XVILV{L/H}{B/H/W/V} XK, XJ, XD +Equivalent platform assembler syntax: + vilv{l/h}.{b/h/w/d} vd, vj, vk + xvilv{l/h}.{b/h/w/d} xd, xj, xk + +Change-Id: If1f146fd5e049281494026bf4c24d302bcad1373 +--- + .../asm/internal/asm/testdata/loong64enc1.s | 18 +++++++ + src/cmd/internal/obj/loong64/a.out.go | 18 +++++++ + src/cmd/internal/obj/loong64/anames.go | 16 +++++++ + src/cmd/internal/obj/loong64/asm.go | 48 +++++++++++++++++++ + 4 files changed, 100 insertions(+) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index 76faf2d3cb..419f257c4a 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -577,6 +577,24 @@ lable2: + XVSUBV X3, X2, X1 // 418c0d74 + XVSUBQ X3, X2, X1 // 418c2d75 + ++ // [X]VILV{L/H}{B,H,W,V} instructions ++ VILVLB V1, V2, V3 // 43041a71 ++ VILVLH V1, V2, V3 // 43841a71 ++ VILVLW V1, V2, V3 // 43041b71 ++ VILVLV V1, V2, V3 // 43841b71 ++ VILVHB V1, V2, V3 // 43041c71 ++ VILVHH V1, V2, V3 // 43841c71 ++ VILVHW V1, V2, V3 // 43041d71 ++ VILVHV V1, V2, V3 // 43841d71 ++ XVILVLB X3, X2, X1 // 410c1a75 ++ XVILVLH X3, X2, X1 // 418c1a75 ++ XVILVLW X3, X2, X1 // 410c1b75 ++ XVILVLV X3, X2, X1 // 418c1b75 ++ XVILVHB X3, X2, X1 // 410c1c75 ++ XVILVHH X3, X2, X1 // 418c1c75 ++ XVILVHW X3, X2, X1 // 410c1d75 ++ XVILVHV X3, X2, X1 // 418c1d75 ++ + // MOVV C_DCON12_0, r + MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 + MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index 3bef0da869..c7f4769395 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -788,6 +788,24 @@ const ( + AVSEQV + AXVSEQV + ++ // LSX and LASX move and shuffle instructions ++ AVILVLB ++ AVILVLH ++ AVILVLW ++ AVILVLV ++ AVILVHB ++ AVILVHH ++ AVILVHW ++ AVILVHV ++ AXVILVLB ++ AXVILVLH ++ AXVILVLW ++ AXVILVLV ++ AXVILVHB ++ AXVILVHH ++ AXVILVHW ++ AXVILVHV ++ + ALAST + + // aliases +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index 194021219e..485940e19c 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -313,5 +313,21 @@ var Anames = []string{ + "XVSEQW", + "VSEQV", + "XVSEQV", ++ "VILVLB", ++ "VILVLH", ++ "VILVLW", ++ "VILVLV", ++ "VILVHB", ++ "VILVHH", ++ "VILVHW", ++ "VILVHV", ++ "XVILVLB", ++ "XVILVLH", ++ "XVILVLW", ++ "XVILVLV", ++ "XVILVHB", ++ "XVILVHH", ++ "XVILVHW", ++ "XVILVHV", + "LAST", + } +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 7489b4dbf6..9ef414a132 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ 
-1549,6 +1549,14 @@ func buildop(ctxt *obj.Link) { + opset(AVSUBW, r0) + opset(AVSUBV, r0) + opset(AVSUBQ, r0) ++ opset(AVILVLB, r0) ++ opset(AVILVLH, r0) ++ opset(AVILVLW, r0) ++ opset(AVILVLV, r0) ++ opset(AVILVHB, r0) ++ opset(AVILVHH, r0) ++ opset(AVILVHW, r0) ++ opset(AVILVHV, r0) + + case AXVANDV: + opset(AXVORV, r0) +@@ -1566,6 +1574,14 @@ func buildop(ctxt *obj.Link) { + opset(AXVSUBW, r0) + opset(AXVSUBV, r0) + opset(AXVSUBQ, r0) ++ opset(AXVILVLB, r0) ++ opset(AXVILVLH, r0) ++ opset(AXVILVLW, r0) ++ opset(AXVILVLV, r0) ++ opset(AXVILVHB, r0) ++ opset(AXVILVHH, r0) ++ opset(AXVILVHW, r0) ++ opset(AXVILVHV, r0) + + case AVPCNTB: + opset(AVPCNTH, r0) +@@ -2761,6 +2777,38 @@ func (c *ctxt0) oprrr(a obj.As) uint32 { + return 0xE81B << 15 // xvsub.d + case AXVSUBQ: + return 0xEA5B << 15 // xvsub.q ++ case AVILVLB: ++ return 0xE234 << 15 // vilvl.b ++ case AVILVLH: ++ return 0xE235 << 15 // vilvl.h ++ case AVILVLW: ++ return 0xE236 << 15 // vilvl.w ++ case AVILVLV: ++ return 0xE237 << 15 // vilvl.d ++ case AVILVHB: ++ return 0xE238 << 15 // vilvh.b ++ case AVILVHH: ++ return 0xE239 << 15 // vilvh.h ++ case AVILVHW: ++ return 0xE23A << 15 // vilvh.w ++ case AVILVHV: ++ return 0xE23B << 15 // vilvh.d ++ case AXVILVLB: ++ return 0xEA34 << 15 // xvilvl.b ++ case AXVILVLH: ++ return 0xEA35 << 15 // xvilvl.h ++ case AXVILVLW: ++ return 0xEA36 << 15 // xvilvl.w ++ case AXVILVLV: ++ return 0xEA37 << 15 // xvilvl.d ++ case AXVILVHB: ++ return 0xEA38 << 15 // xvilvh.b ++ case AXVILVHH: ++ return 0xEA39 << 15 // xvilvh.h ++ case AXVILVHW: ++ return 0xEA3A << 15 // xvilvh.w ++ case AXVILVHV: ++ return 0xEA3B << 15 // xvilvh.d + } + + if a < 0 { +-- +2.38.1 + diff --git a/0029-cmd-internal-obj-loong64-add-V-XV-SLL-SRL-SRA-ROTR-I.patch b/0029-cmd-internal-obj-loong64-add-V-XV-SLL-SRL-SRA-ROTR-I.patch new file mode 100644 index 0000000000000000000000000000000000000000..71ebec387f8fd8b8c24ca42ab3e6d936640ccb99 --- /dev/null +++ b/0029-cmd-internal-obj-loong64-add-V-XV-SLL-SRL-SRA-ROTR-I.patch @@ -0,0 +1,599 @@ +From d765027e47dec10f8869d04b0bf52661ac63f302 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Wed, 11 Dec 2024 14:19:04 +0800 +Subject: [PATCH 29/44] cmd/internal/obj/loong64: add + {V,XV}{SLL/SRL/SRA/ROTR}[I].{B/H/W/D} instructions support + +Go asm syntax: + V{SLL/SRL/SRA/ROTR}{B/H/W/V} $1, V2, V3 + XV{SLL/SRL/SRA/ROTR}{B/H/W/V} $1, X2, X3 + V{SLL/SRL/SRA/ROTR}{B/H/W/V} VK, VJ, VD + XV{SLL/SRL/SRA/ROTR}{B/H/W/V} XK, XJ, XD + +Equivalent platform assembler syntax: + v{sll/srl/sra/rotr}i.{b/h/w/d} v3, v2, $1 + xv{sll/srl/sra/rotr}i.{b/h/w/d} x3, x2, $1 + v{sll/srl/sra/rotr}.{b/h/w/d} vd, vj, vk + xv{sll/srl/sra/rotr}.{b/h/w/d} xd, xj, xk + +Change-Id: I8693e15f3778057e5a1e636d618c6f46acc5042b +--- + .../asm/internal/asm/testdata/loong64enc1.s | 130 +++++++++ + src/cmd/internal/obj/loong64/a.out.go | 33 +++ + src/cmd/internal/obj/loong64/anames.go | 32 ++ + src/cmd/internal/obj/loong64/asm.go | 274 +++++++++++++++++- + 4 files changed, 468 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index 419f257c4a..79012784dc 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -595,6 +595,136 @@ lable2: + XVILVHW X3, X2, X1 // 410c1d75 + XVILVHV X3, X2, X1 // 418c1d75 + ++ // [X]{VSLL/VSRL/VSRA/VROTR}{B,H,W,V} instructions ++ VSLLB V1, V2, V3 // 4304e870 ++ VSLLH V1, V2, V3 // 4384e870 ++ VSLLW V1, V2, V3 // 4304e970 ++ VSLLV V1, V2, V3 // 
4384e970 ++ VSRLB V1, V2, V3 // 4304ea70 ++ VSRLH V1, V2, V3 // 4384ea70 ++ VSRLW V1, V2, V3 // 4304eb70 ++ VSRLV V1, V2, V3 // 4384eb70 ++ VSRAB V1, V2, V3 // 4304ec70 ++ VSRAH V1, V2, V3 // 4384ec70 ++ VSRAW V1, V2, V3 // 4304ed70 ++ VSRAV V1, V2, V3 // 4384ed70 ++ VROTRB V1, V2, V3 // 4304ee70 ++ VROTRH V1, V2, V3 // 4384ee70 ++ VROTRW V1, V2, V3 // 4304ef70 ++ VROTRV V1, V2, V3 // 4384ef70 ++ XVSLLB X3, X2, X1 // 410ce874 ++ XVSLLH X3, X2, X1 // 418ce874 ++ XVSLLW X3, X2, X1 // 410ce974 ++ XVSLLV X3, X2, X1 // 418ce974 ++ XVSRLB X3, X2, X1 // 410cea74 ++ XVSRLH X3, X2, X1 // 418cea74 ++ XVSRLW X3, X2, X1 // 410ceb74 ++ XVSRLV X3, X2, X1 // 418ceb74 ++ XVSRAB X3, X2, X1 // 410cec74 ++ XVSRAH X3, X2, X1 // 418cec74 ++ XVSRAW X3, X2, X1 // 410ced74 ++ XVSRAV X3, X2, X1 // 418ced74 ++ XVROTRB X3, X2, X1 // 410cee74 ++ XVROTRH X3, X2, X1 // 418cee74 ++ XVROTRW X3, X2, X1 // 410cef74 ++ XVROTRV X3, X2, X1 // 418cef74 ++ VSLLB $0, V1, V2 // 22202c73 ++ VSLLB $7, V1, V2 // 223c2c73 ++ VSLLB $5, V1 // 21342c73 ++ VSLLH $0, V1, V2 // 22402c73 ++ VSLLH $15, V1, V2 // 227c2c73 ++ VSLLH $10, V1 // 21682c73 ++ VSLLW $0, V1, V2 // 22802c73 ++ VSLLW $31, V1, V2 // 22fc2c73 ++ VSLLW $11, V1 // 21ac2c73 ++ VSLLV $0, V1, V2 // 22002d73 ++ VSLLV $63, V1, V2 // 22fc2d73 ++ VSLLV $30, V1 // 21782d73 ++ VSRLB $0, V1, V2 // 22203073 ++ VSRLB $7, V1, V2 // 223c3073 ++ VSRLB $4, V1 // 21303073 ++ VSRLH $0, V1, V2 // 22403073 ++ VSRLH $15, V1, V2 // 227c3073 ++ VSRLH $9, V1 // 21643073 ++ VSRLW $0, V1, V2 // 22803073 ++ VSRLW $31, V1, V2 // 22fc3073 ++ VSRLW $16, V1 // 21c03073 ++ VSRLV $0, V1, V2 // 22003173 ++ VSRLV $63, V1, V2 // 22fc3173 ++ VSRLV $40, V1 // 21a03173 ++ VSRAB $0, V1, V2 // 22203473 ++ VSRAB $7, V1, V2 // 223c3473 ++ VSRAB $6, V1 // 21383473 ++ VSRAH $0, V1, V2 // 22403473 ++ VSRAH $15, V1, V2 // 227c3473 ++ VSRAH $8, V1 // 21603473 ++ VSRAW $0, V1, V2 // 22803473 ++ VSRAW $31, V1, V2 // 22fc3473 ++ VSRAW $12, V1 // 21b03473 ++ VSRAV $0, V1, V2 // 22003573 ++ VSRAV $63, V1, V2 // 22fc3573 ++ VSRAV $50, V1 // 21c83573 ++ VROTRB $0, V1, V2 // 2220a072 ++ VROTRB $7, V1, V2 // 223ca072 ++ VROTRB $3, V1 // 212ca072 ++ VROTRH $0, V1, V2 // 2240a072 ++ VROTRH $15, V1, V2 // 227ca072 ++ VROTRH $5, V1 // 2154a072 ++ VROTRW $0, V1, V2 // 2280a072 ++ VROTRW $31, V1, V2 // 22fca072 ++ VROTRW $18, V1 // 21c8a072 ++ VROTRV $0, V1, V2 // 2200a172 ++ VROTRV $63, V1, V2 // 22fca172 ++ VROTRV $52, V1 // 21d0a172 ++ XVSLLB $0, X2, X1 // 41202c77 ++ XVSLLB $7, X2, X1 // 413c2c77 ++ XVSLLB $4, X2 // 42302c77 ++ XVSLLH $0, X2, X1 // 41402c77 ++ XVSLLH $15, X2, X1 // 417c2c77 ++ XVSLLH $8, X2 // 42602c77 ++ XVSLLW $0, X2, X1 // 41802c77 ++ XVSLLW $31, X2, X1 // 41fc2c77 ++ XVSLLW $13, X2 // 42b42c77 ++ XVSLLV $0, X2, X1 // 41002d77 ++ XVSLLV $63, X2, X1 // 41fc2d77 ++ XVSLLV $36, X2 // 42902d77 ++ XVSRLB $0, X2, X1 // 41203077 ++ XVSRLB $7, X2, X1 // 413c3077 ++ XVSRLB $5, X2 // 42343077 ++ XVSRLH $0, X2, X1 // 41403077 ++ XVSRLH $15, X2, X1 // 417c3077 ++ XVSRLH $9, X2 // 42643077 ++ XVSRLW $0, X2, X1 // 41803077 ++ XVSRLW $31, X2, X1 // 41fc3077 ++ XVSRLW $14, X2 // 42b83077 ++ XVSRLV $0, X2, X1 // 41003177 ++ XVSRLV $63, X2, X1 // 41fc3177 ++ XVSRLV $45, X2 // 42b43177 ++ XVSRAB $0, X2, X1 // 41203477 ++ XVSRAB $7, X2, X1 // 413c3477 ++ XVSRAB $6, X2 // 42383477 ++ XVSRAH $0, X2, X1 // 41403477 ++ XVSRAH $15, X2, X1 // 417c3477 ++ XVSRAH $10, X2 // 42683477 ++ XVSRAW $0, X2, X1 // 41803477 ++ XVSRAW $31, X2, X1 // 41fc3477 ++ XVSRAW $16, X2 // 42c03477 ++ XVSRAV $0, X2, X1 // 41003577 ++ XVSRAV $63, X2, X1 // 
41fc3577 ++ XVSRAV $48, X2 // 42c03577 ++ XVROTRB $0, X2, X1 // 4120a076 ++ XVROTRB $7, X2, X1 // 413ca076 ++ XVROTRB $3, X2 // 422ca076 ++ XVROTRH $0, X2, X1 // 4140a076 ++ XVROTRH $15, X2, X1 // 417ca076 ++ XVROTRH $13, X2 // 4274a076 ++ XVROTRW $0, X2, X1 // 4180a076 ++ XVROTRW $31, X2, X1 // 41fca076 ++ XVROTRW $24, X2 // 42e0a076 ++ XVROTRV $0, X2, X1 // 4100a176 ++ XVROTRV $63, X2, X1 // 41fca176 ++ XVROTRV $52, X2 // 42d0a176 ++ + // MOVV C_DCON12_0, r + MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 + MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index c7f4769395..3257d376b4 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -769,6 +769,39 @@ const ( + AXVANDNV + AXVORNV + ++ AVSLLB ++ AVSLLH ++ AVSLLW ++ AVSLLV ++ AVSRLB ++ AVSRLH ++ AVSRLW ++ AVSRLV ++ AVSRAB ++ AVSRAH ++ AVSRAW ++ AVSRAV ++ AVROTRB ++ AVROTRH ++ AVROTRW ++ AVROTRV ++ AXVSLLB ++ AXVSLLH ++ AXVSLLW ++ AXVSLLV ++ AXVSRLB ++ AXVSRLH ++ AXVSRLW ++ AXVSRLV ++ AXVSRAB ++ AXVSRAH ++ AXVSRAW ++ AXVSRAV ++ AXVROTRB ++ AXVROTRH ++ AXVROTRW ++ AXVROTRV ++ + AVPCNTB + AVPCNTH + AVPCNTW +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index 485940e19c..776e272a0b 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -297,6 +297,38 @@ var Anames = []string{ + "XVNORV", + "XVANDNV", + "XVORNV", ++ "VSLLB", ++ "VSLLH", ++ "VSLLW", ++ "VSLLV", ++ "VSRLB", ++ "VSRLH", ++ "VSRLW", ++ "VSRLV", ++ "VSRAB", ++ "VSRAH", ++ "VSRAW", ++ "VSRAV", ++ "VROTRB", ++ "VROTRH", ++ "VROTRW", ++ "VROTRV", ++ "XVSLLB", ++ "XVSLLH", ++ "XVSLLW", ++ "XVSLLV", ++ "XVSRLB", ++ "XVSRLH", ++ "XVSRLW", ++ "XVSRLV", ++ "XVSRAB", ++ "XVSRAH", ++ "XVSRAW", ++ "XVSRAV", ++ "XVROTRB", ++ "XVROTRH", ++ "XVROTRW", ++ "XVROTRV", + "VPCNTB", + "VPCNTH", + "VPCNTW", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 9ef414a132..25a40d736e 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -52,6 +52,10 @@ const ( + // Used to insert padding for under-aligned loops. 
+ branchLoopHead + immFiledSi5 // The encoding of the immediate field in the instruction is 5-bits ++ immFiledUi3 // The encoding of the immediate field in the instruction is 3-bits ++ immFiledUi4 // The encoding of the immediate field in the instruction is 4-bits ++ immFiledUi5 // The encoding of the immediate field in the instruction is 5-bits ++ immFiledUi6 // The encoding of the immediate field in the instruction is 6-bits + immFiledUi8 // The encoding of the immediate field in the instruction is 8-bits + ) + +@@ -102,6 +106,34 @@ var optab = []Optab{ + {AVANDB, C_ADDCON, C_VREG, C_NONE, C_VREG, C_NONE, 14, 4, 0, immFiledUi8}, + {AXVANDB, C_ADDCON, C_XREG, C_NONE, C_XREG, C_NONE, 14, 4, 0, immFiledUi8}, + ++ {AVSLLB, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, ++ {AXVSLLB, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, ++ {AVSLLB, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 29, 4, 0, immFiledUi3}, ++ {AXVSLLB, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 29, 4, 0, immFiledUi3}, ++ {AVSLLB, C_SCON, C_NONE, C_NONE, C_VREG, C_NONE, 29, 4, 0, immFiledUi3}, ++ {AXVSLLB, C_SCON, C_NONE, C_NONE, C_XREG, C_NONE, 29, 4, 0, immFiledUi3}, ++ ++ {AVSLLH, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, ++ {AXVSLLH, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, ++ {AVSLLH, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 31, 4, 0, immFiledUi4}, ++ {AXVSLLH, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 31, 4, 0, immFiledUi4}, ++ {AVSLLH, C_SCON, C_NONE, C_NONE, C_VREG, C_NONE, 31, 4, 0, immFiledUi4}, ++ {AXVSLLH, C_SCON, C_NONE, C_NONE, C_XREG, C_NONE, 31, 4, 0, immFiledUi4}, ++ ++ {AVSLLW, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, ++ {AXVSLLW, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, ++ {AVSLLW, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 32, 4, 0, immFiledUi5}, ++ {AXVSLLW, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 32, 4, 0, immFiledUi5}, ++ {AVSLLW, C_SCON, C_NONE, C_NONE, C_VREG, C_NONE, 32, 4, 0, immFiledUi5}, ++ {AXVSLLW, C_SCON, C_NONE, C_NONE, C_XREG, C_NONE, 32, 4, 0, immFiledUi5}, ++ ++ {AVSLLV, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, ++ {AXVSLLV, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, ++ {AVSLLV, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 33, 4, 0, immFiledUi6}, ++ {AXVSLLV, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 33, 4, 0, immFiledUi6}, ++ {AVSLLV, C_SCON, C_NONE, C_NONE, C_VREG, C_NONE, 33, 4, 0, immFiledUi6}, ++ {AXVSLLV, C_SCON, C_NONE, C_NONE, C_XREG, C_NONE, 33, 4, 0, immFiledUi6}, ++ + {ACLOW, C_REG, C_NONE, C_NONE, C_REG, C_NONE, 9, 4, 0, 0}, + {AABSF, C_FREG, C_NONE, C_NONE, C_FREG, C_NONE, 9, 4, 0, 0}, + {AMOVVF, C_FREG, C_NONE, C_NONE, C_FREG, C_NONE, 9, 4, 0, 0}, +@@ -1521,7 +1553,7 @@ func buildop(ctxt *obj.Link) { + case AXVSEQB: + opset(AXVSEQH, r0) + opset(AXVSEQW, r0) +- opset(AXVSEQV, r0) ++ opset(AXVSEQV, r0) + + case AVANDB: + opset(AVORB, r0) +@@ -1583,6 +1615,46 @@ func buildop(ctxt *obj.Link) { + opset(AXVILVHW, r0) + opset(AXVILVHV, r0) + ++ case AVSLLB: ++ opset(AVSRLB, r0) ++ opset(AVSRAB, r0) ++ opset(AVROTRB, r0) ++ ++ case AXVSLLB: ++ opset(AXVSRLB, r0) ++ opset(AXVSRAB, r0) ++ opset(AXVROTRB, r0) ++ ++ case AVSLLH: ++ opset(AVSRLH, r0) ++ opset(AVSRAH, r0) ++ opset(AVROTRH, r0) ++ ++ case AXVSLLH: ++ opset(AXVSRLH, r0) ++ opset(AXVSRAH, r0) ++ opset(AXVROTRH, r0) ++ ++ case AVSLLW: ++ opset(AVSRLW, r0) ++ opset(AVSRAW, r0) ++ opset(AVROTRW, r0) ++ ++ case AXVSLLW: ++ opset(AXVSRLW, r0) ++ opset(AXVSRAW, r0) ++ opset(AXVROTRW, r0) ++ ++ case AVSLLV: ++ opset(AVSRLV, r0) ++ opset(AVSRAV, r0) ++ opset(AVROTRV, r0) ++ 
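As a usage sketch for the mnemonics registered above (each AVSLL*/AXVSLL* case also covers its SRL/SRA/ROTR variants via opset), Go assembly writes them with the operand order given in this patch's commit message; the TEXT symbol and register choices below are hypothetical, for illustration only:

	#include "textflag.h"

	// shiftDemo: a minimal sketch of LSX/LASX per-element shifts (illustrative).
	TEXT ·shiftDemo(SB), NOSPLIT, $0
		VSLLW   $3, V1, V2   // immediate form: each 32-bit lane of V2 = V1 << 3
		VSRAV   V1, V2, V3   // register form: each 64-bit lane of V3 = V2 >> V1
		XVROTRH $5, X2, X1   // LASX form: each 16-bit lane of X1 = X2 rotated right by 5
		RET
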
++ case AXVSLLV: ++ opset(AXVSRLV, r0) ++ opset(AXVSRAV, r0) ++ opset(AXVROTRV, r0) ++ + case AVPCNTB: + opset(AVPCNTH, r0) + opset(AVPCNTW, r0) +@@ -1629,10 +1701,22 @@ func OP_8IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 { + return op | (i&0xFF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0 + } + ++func OP_6IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 { ++ return op | (i&0x3F)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0 ++} ++ + func OP_5IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 { + return op | (i&0x1F)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0 + } + ++func OP_4IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 { ++ return op | (i&0xF)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0 ++} ++ ++func OP_3IRR(op uint32, i uint32, r2 uint32, r3 uint32) uint32 { ++ return op | (i&0x7)<<10 | (r2&0x1F)<<5 | (r3&0x1F)<<0 ++} ++ + func OP_IR(op uint32, i uint32, r2 uint32) uint32 { + return op | (i&0xFFFFF)<<5 | (r2&0x1F)<<0 // ui20, rd5 + } +@@ -1994,10 +2078,70 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + o1 = OP_12IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.From.Reg)) + } + ++ case 29: // add $ui3,[r1],r2 ++ v := c.regoff(&p.From) ++ r := int(p.Reg) ++ if r == 0 { ++ r = int(p.To.Reg) ++ } ++ ++ switch o.flag { ++ case immFiledUi3: ++ c.checkimmFiled(p, v, 3, false) ++ o1 = OP_3IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) ++ default: ++ c.ctxt.Diag("Invalid immediate value type\n%v", p) ++ } ++ + case 30: // mov gr/fr/fcc/fcsr, fr/fcc/fcsr/gr + a := c.specialFpMovInst(p.As, oclass(&p.From), oclass(&p.To)) + o1 = OP_RR(a, uint32(p.From.Reg), uint32(p.To.Reg)) + ++ case 31: // add $ui4,[r1],r2 ++ v := c.regoff(&p.From) ++ r := int(p.Reg) ++ if r == 0 { ++ r = int(p.To.Reg) ++ } ++ ++ switch o.flag { ++ case immFiledUi4: ++ c.checkimmFiled(p, v, 4, false) ++ o1 = OP_4IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) ++ default: ++ c.ctxt.Diag("Invalid immediate value type\n%v", p) ++ } ++ ++ case 32: // add $ui5,[r1],r2 ++ v := c.regoff(&p.From) ++ r := int(p.Reg) ++ if r == 0 { ++ r = int(p.To.Reg) ++ } ++ ++ switch o.flag { ++ case immFiledUi5: ++ c.checkimmFiled(p, v, 5, false) ++ o1 = OP_5IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) ++ default: ++ c.ctxt.Diag("Invalid immediate value type\n%v", p) ++ } ++ ++ case 33: // add $ui6,[r1],r2 ++ v := c.regoff(&p.From) ++ r := int(p.Reg) ++ if r == 0 { ++ r = int(p.To.Reg) ++ } ++ ++ switch o.flag { ++ case immFiledUi6: ++ c.checkimmFiled(p, v, 6, false) ++ o1 = OP_6IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) ++ default: ++ c.ctxt.Diag("Invalid immediate value type\n%v", p) ++ } ++ + case 34: // mov $con,fr + v := c.regoff(&p.From) + a := AADDU +@@ -2809,6 +2953,70 @@ func (c *ctxt0) oprrr(a obj.As) uint32 { + return 0xEA3A << 15 // xvilvh.w + case AXVILVHV: + return 0xEA3B << 15 // xvilvh.d ++ case AVSLLB: ++ return 0xE1D0 << 15 // vsll.b ++ case AVSLLH: ++ return 0xE1D1 << 15 // vsll.h ++ case AVSLLW: ++ return 0xE1D2 << 15 // vsll.w ++ case AVSLLV: ++ return 0xE1D3 << 15 // vsll.d ++ case AVSRLB: ++ return 0xE1D4 << 15 // vsrl.b ++ case AVSRLH: ++ return 0xE1D5 << 15 // vsrl.h ++ case AVSRLW: ++ return 0xE1D6 << 15 // vsrl.w ++ case AVSRLV: ++ return 0xE1D7 << 15 // vsrl.d ++ case AVSRAB: ++ return 0xE1D8 << 15 // vsra.b ++ case AVSRAH: ++ return 0xE1D9 << 15 // vsra.h ++ case AVSRAW: ++ return 0xE1DA << 15 // vsra.w ++ case AVSRAV: ++ return 0xE1DB << 15 // vsra.d ++ case AVROTRB: ++ return 0xE1DC << 15 // vrotr.b ++ case AVROTRH: ++ return 0xE1DD << 15 // vrotr.h ++ case 
AVROTRW: ++ return 0xE1DE << 15 // vrotr.w ++ case AVROTRV: ++ return 0xE1DF << 15 // vrotr.d ++ case AXVSLLB: ++ return 0xE9D0 << 15 // xvsll.b ++ case AXVSLLH: ++ return 0xE9D1 << 15 // xvsll.h ++ case AXVSLLW: ++ return 0xE9D2 << 15 // xvsll.w ++ case AXVSLLV: ++ return 0xE9D3 << 15 // xvsll.d ++ case AXVSRLB: ++ return 0xE9D4 << 15 // xvsrl.b ++ case AXVSRLH: ++ return 0xE9D5 << 15 // xvsrl.h ++ case AXVSRLW: ++ return 0xE9D6 << 15 // xvsrl.w ++ case AXVSRLV: ++ return 0xE9D7 << 15 // xvsrl.d ++ case AXVSRAB: ++ return 0xE9D8 << 15 // xvsra.b ++ case AXVSRAH: ++ return 0xE9D9 << 15 // xvsra.h ++ case AXVSRAW: ++ return 0xE9DA << 15 // xvsra.w ++ case AXVSRAV: ++ return 0xE9DB << 15 // xvsra.d ++ case AXVROTRB: ++ return 0xE9DC << 15 // xvrotr.b ++ case AXVROTRH: ++ return 0xE9DD << 15 // xvrotr.h ++ case AXVROTRW: ++ return 0xE9DE << 15 // xvrotr.w ++ case AXVROTRV: ++ return 0xE9DF << 15 // xvrotr.d + } + + if a < 0 { +@@ -3168,6 +3376,70 @@ func (c *ctxt0) opirr(a obj.As) uint32 { + return 0x0ED02 << 15 // xvseqi.w + case AXVSEQV: + return 0x0ED03 << 15 // xvseqi.d ++ case AVROTRB: ++ return 0x1CA8<<18 | 0x1<<13 // vrotri.b ++ case AVROTRH: ++ return 0x1CA8<<18 | 0x1<<14 // vrotri.h ++ case AVROTRW: ++ return 0x1CA8<<18 | 0x1<<15 // vrotri.w ++ case AVROTRV: ++ return 0x1CA8<<18 | 0x1<<16 // vrotri.d ++ case AXVROTRB: ++ return 0x1DA8<<18 | 0x1<<13 // xvrotri.b ++ case AXVROTRH: ++ return 0x1DA8<<18 | 0x1<<14 // xvrotri.h ++ case AXVROTRW: ++ return 0x1DA8<<18 | 0x1<<15 // xvrotri.w ++ case AXVROTRV: ++ return 0x1DA8<<18 | 0x1<<16 // xvrotri.d ++ case AVSLLB: ++ return 0x1CCB<<18 | 0x1<<13 // vslli.b ++ case AVSLLH: ++ return 0x1CCB<<18 | 0x1<<14 // vslli.h ++ case AVSLLW: ++ return 0x1CCB<<18 | 0x1<<15 // vslli.w ++ case AVSLLV: ++ return 0x1CCB<<18 | 0x1<<16 // vslli.d ++ case AVSRLB: ++ return 0x1CCC<<18 | 0x1<<13 // vsrli.b ++ case AVSRLH: ++ return 0x1CCC<<18 | 0x1<<14 // vsrli.h ++ case AVSRLW: ++ return 0x1CCC<<18 | 0x1<<15 // vsrli.w ++ case AVSRLV: ++ return 0x1CCC<<18 | 0x1<<16 // vsrli.d ++ case AVSRAB: ++ return 0x1CCD<<18 | 0x1<<13 // vsrai.b ++ case AVSRAH: ++ return 0x1CCD<<18 | 0x1<<14 // vsrai.h ++ case AVSRAW: ++ return 0x1CCD<<18 | 0x1<<15 // vsrai.w ++ case AVSRAV: ++ return 0x1CCD<<18 | 0x1<<16 // vsrai.d ++ case AXVSLLB: ++ return 0x1DCB<<18 | 0x1<<13 // xvslli.b ++ case AXVSLLH: ++ return 0x1DCB<<18 | 0x1<<14 // xvslli.h ++ case AXVSLLW: ++ return 0x1DCB<<18 | 0x1<<15 // xvslli.w ++ case AXVSLLV: ++ return 0x1DCB<<18 | 0x1<<16 // xvslli.d ++ case AXVSRLB: ++ return 0x1DCC<<18 | 0x1<<13 // xvsrli.b ++ case AXVSRLH: ++ return 0x1DCC<<18 | 0x1<<14 // xvsrli.h ++ case AXVSRLW: ++ return 0x1DCC<<18 | 0x1<<15 // xvsrli.w ++ case AXVSRLV: ++ return 0x1DCC<<18 | 0x1<<16 // xvsrli.d ++ case AXVSRAB: ++ return 0x1DCD<<18 | 0x1<<13 // xvsrai.b ++ case AXVSRAH: ++ return 0x1DCD<<18 | 0x1<<14 // xvsrai.h ++ case AXVSRAW: ++ return 0x1DCD<<18 | 0x1<<15 // xvsrai.w ++ case AXVSRAV: ++ return 0x1DCD<<18 | 0x1<<16 // xvsrai.d + } + + if a < 0 { +-- +2.38.1 + diff --git a/0030-cmd-internal-obj-loong64-add-V-XV-FSQRT-FRECIP-FRSQR.patch b/0030-cmd-internal-obj-loong64-add-V-XV-FSQRT-FRECIP-FRSQR.patch new file mode 100644 index 0000000000000000000000000000000000000000..ba201937d9220018abcfc3a9c8eb7ef92b211b0b --- /dev/null +++ b/0030-cmd-internal-obj-loong64-add-V-XV-FSQRT-FRECIP-FRSQR.patch @@ -0,0 +1,166 @@ +From 344852ff0ccb2b948dc77e0934f246cc5ddf9506 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Wed, 11 Dec 2024 16:49:08 +0800 +Subject: [PATCH 30/44] 
cmd/internal/obj/loong64: add + {V,XV}{FSQRT/FRECIP/FRSQRT}.{S/D} instructions support + +Go asm syntax: + V{FSQRT/FRECIP/FRSQRT}{F/D} VJ, VD + XV{FSQRT/FRECIP/FRSQRT}{F/D} XJ, XD + +Equivalent platform assembler syntax: + v{fsqrt/frecip/frsqrt}.{s/d} vd, vj + xv{fsqrt/frecip/frsqrt}.{s/d} xd, xj + +Change-Id: Ied0b959e703d2199939c9ac0608eb3408ea249fa +--- + .../asm/internal/asm/testdata/loong64enc1.s | 14 +++++++ + src/cmd/internal/obj/loong64/a.out.go | 14 +++++++ + src/cmd/internal/obj/loong64/anames.go | 12 ++++++ + src/cmd/internal/obj/loong64/asm.go | 38 ++++++++++++++++++- + 4 files changed, 77 insertions(+), 1 deletion(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index 79012784dc..e2e8a6de6c 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -725,6 +725,20 @@ lable2: + XVROTRV $63, X2, X1 // 41fca176 + XVROTRV $52, X2 // 42d0a176 + ++ // [X]VF{SQRT/RECIP/RSQRT}{F/D} instructions ++ VFSQRTF V1, V2 // 22e49c72 ++ VFSQRTD V1, V2 // 22e89c72 ++ VFRECIPF V1, V2 // 22f49c72 ++ VFRECIPD V1, V2 // 22f89c72 ++ VFRSQRTF V1, V2 // 22049d72 ++ VFRSQRTD V1, V2 // 22089d72 ++ XVFSQRTF X2, X1 // 41e49c76 ++ XVFSQRTD X2, X1 // 41e89c76 ++ XVFRECIPF X2, X1 // 41f49c76 ++ XVFRECIPD X2, X1 // 41f89c76 ++ XVFRSQRTF X2, X1 // 41049d76 ++ XVFRSQRTD X2, X1 // 41089d76 ++ + // MOVV C_DCON12_0, r + MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 + MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index 3257d376b4..bd2b1e8300 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -811,6 +811,20 @@ const ( + AXVPCNTW + AXVPCNTV + ++ // LSX and LASX floating point instructions ++ AVFSQRTF ++ AVFSQRTD ++ AVFRECIPF ++ AVFRECIPD ++ AVFRSQRTF ++ AVFRSQRTD ++ AXVFSQRTF ++ AXVFSQRTD ++ AXVFRECIPF ++ AXVFRECIPD ++ AXVFRSQRTF ++ AXVFRSQRTD ++ + // LSX and LASX integer comparison instruction + AVSEQB + AXVSEQB +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index 776e272a0b..7dbe9b92e6 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -337,6 +337,18 @@ var Anames = []string{ + "XVPCNTH", + "XVPCNTW", + "XVPCNTV", ++ "VFSQRTF", ++ "VFSQRTD", ++ "VFRECIPF", ++ "VFRECIPD", ++ "VFRSQRTF", ++ "VFRSQRTD", ++ "XVFSQRTF", ++ "XVFSQRTD", ++ "XVFRECIPF", ++ "XVFRECIPD", ++ "XVFRSQRTF", ++ "XVFRSQRTD", + "VSEQB", + "XVSEQB", + "VSEQH", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 25a40d736e..af38bef3aa 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -1553,7 +1553,7 @@ func buildop(ctxt *obj.Link) { + case AXVSEQB: + opset(AXVSEQH, r0) + opset(AXVSEQW, r0) +- opset(AXVSEQV, r0) ++ opset(AXVSEQV, r0) + + case AVANDB: + opset(AVORB, r0) +@@ -1659,11 +1659,23 @@ func buildop(ctxt *obj.Link) { + opset(AVPCNTH, r0) + opset(AVPCNTW, r0) + opset(AVPCNTV, r0) ++ opset(AVFSQRTF, r0) ++ opset(AVFSQRTD, r0) ++ opset(AVFRECIPF, r0) ++ opset(AVFRECIPD, r0) ++ opset(AVFRSQRTF, r0) ++ opset(AVFRSQRTD, r0) + + case AXVPCNTB: + opset(AXVPCNTH, r0) + opset(AXVPCNTW, r0) + opset(AXVPCNTV, r0) ++ opset(AXVFSQRTF, r0) ++ opset(AXVFSQRTD, r0) ++ opset(AXVFRECIPF, r0) ++ opset(AXVFRECIPD, r0) ++ opset(AXVFRSQRTF, r0) ++ 
opset(AXVFRSQRTD, r0) + } + } + } +@@ -3193,6 +3205,30 @@ func (c *ctxt0) oprr(a obj.As) uint32 { + return 0x1da70a << 10 // xvpcnt.w + case AXVPCNTV: + return 0x1da70b << 10 // xvpcnt.v ++ case AVFSQRTF: ++ return 0x1ca739 << 10 // vfsqrt.s ++ case AVFSQRTD: ++ return 0x1ca73a << 10 // vfsqrt.d ++ case AVFRECIPF: ++ return 0x1ca73d << 10 // vfrecip.s ++ case AVFRECIPD: ++ return 0x1ca73e << 10 // vfrecip.d ++ case AVFRSQRTF: ++ return 0x1ca741 << 10 // vfrsqrt.s ++ case AVFRSQRTD: ++ return 0x1ca742 << 10 // vfrsqrt.d ++ case AXVFSQRTF: ++ return 0x1da739 << 10 // xvfsqrt.s ++ case AXVFSQRTD: ++ return 0x1da73a << 10 // xvfsqrt.d ++ case AXVFRECIPF: ++ return 0x1da73d << 10 // xvfrecip.s ++ case AXVFRECIPD: ++ return 0x1da73e << 10 // xvfrecip.d ++ case AXVFRSQRTF: ++ return 0x1da741 << 10 // xvfrsqrt.s ++ case AXVFRSQRTD: ++ return 0x1da742 << 10 // xvfrsqrt.d + } + + c.ctxt.Diag("bad rr opcode %v", a) +-- +2.38.1 + diff --git a/0031-cmd-internal-obj-loong64-add-V-XV-NEG-B-H-W-V-instru.patch b/0031-cmd-internal-obj-loong64-add-V-XV-NEG-B-H-W-V-instru.patch new file mode 100644 index 0000000000000000000000000000000000000000..40e749c6f39030fd62a0f682dc3cc8486c1577e6 --- /dev/null +++ b/0031-cmd-internal-obj-loong64-add-V-XV-NEG-B-H-W-V-instru.patch @@ -0,0 +1,135 @@ +From 6849aaa3deb1fec44bb7625a70ecc2a19f86a389 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Wed, 11 Dec 2024 17:19:04 +0800 +Subject: [PATCH 31/44] cmd/internal/obj/loong64: add {V,XV}NEG{B/H/W/V} + instructions support + +Go asm syntax: + VNEG{B/H/W/V} VJ, VD + XVNEG{B/H/W/V} XJ, XD + +Equivalent platform assembler syntax: + vneg.{b/h/w/d} vd, vj + xvneg.{b/h/w/d} xd, xj + +Change-Id: Ib2df46b5386149efb44fe12e2485c01826339a5d +--- + .../asm/internal/asm/testdata/loong64enc1.s | 10 ++++++++ + src/cmd/internal/obj/loong64/a.out.go | 10 ++++++++ + src/cmd/internal/obj/loong64/anames.go | 8 +++++++ + src/cmd/internal/obj/loong64/asm.go | 24 +++++++++++++++++++ + 4 files changed, 52 insertions(+) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index e2e8a6de6c..9deb3cbafd 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -739,6 +739,16 @@ lable2: + XVFRSQRTF X2, X1 // 41049d76 + XVFRSQRTD X2, X1 // 41089d76 + ++ // [X]VNEG{B/H/W/V} instructions ++ VNEGB V1, V2 // 22309c72 ++ VNEGH V1, V2 // 22349c72 ++ VNEGW V1, V2 // 22389c72 ++ VNEGV V1, V2 // 223c9c72 ++ XVNEGB X2, X1 // 41309c76 ++ XVNEGH X2, X1 // 41349c76 ++ XVNEGW X2, X1 // 41389c76 ++ XVNEGV X2, X1 // 413c9c76 ++ + // MOVV C_DCON12_0, r + MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 + MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index bd2b1e8300..486dc9fa89 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -747,6 +747,16 @@ const ( + AXVSUBV + AXVSUBQ + ++ // LSX and LASX integer neg instructions ++ AVNEGB ++ AVNEGH ++ AVNEGW ++ AVNEGV ++ AXVNEGB ++ AXVNEGH ++ AXVNEGW ++ AXVNEGV ++ + // LSX and LASX Bit-manipulation Instructions + AVANDB + AVORB +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index 7dbe9b92e6..d697b73e71 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -277,6 +277,14 @@ var Anames = []string{ + "XVSUBW", + "XVSUBV", 
+ "XVSUBQ", ++ "VNEGB", ++ "VNEGH", ++ "VNEGW", ++ "VNEGV", ++ "XVNEGB", ++ "XVNEGH", ++ "XVNEGW", ++ "XVNEGV", + "VANDB", + "VORB", + "VXORB", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index af38bef3aa..e2c7afd82d 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -1665,6 +1665,10 @@ func buildop(ctxt *obj.Link) { + opset(AVFRECIPD, r0) + opset(AVFRSQRTF, r0) + opset(AVFRSQRTD, r0) ++ opset(AVNEGB, r0) ++ opset(AVNEGH, r0) ++ opset(AVNEGW, r0) ++ opset(AVNEGV, r0) + + case AXVPCNTB: + opset(AXVPCNTH, r0) +@@ -1676,6 +1680,10 @@ func buildop(ctxt *obj.Link) { + opset(AXVFRECIPD, r0) + opset(AXVFRSQRTF, r0) + opset(AXVFRSQRTD, r0) ++ opset(AXVNEGB, r0) ++ opset(AXVNEGH, r0) ++ opset(AXVNEGW, r0) ++ opset(AXVNEGV, r0) + } + } + } +@@ -3229,6 +3237,22 @@ func (c *ctxt0) oprr(a obj.As) uint32 { + return 0x1da741 << 10 // xvfrsqrt.s + case AXVFRSQRTD: + return 0x1da742 << 10 // xvfrsqrt.d ++ case AVNEGB: ++ return 0x1ca70c << 10 // vneg.b ++ case AVNEGH: ++ return 0x1ca70d << 10 // vneg.h ++ case AVNEGW: ++ return 0x1ca70e << 10 // vneg.w ++ case AVNEGV: ++ return 0x1ca70f << 10 // vneg.d ++ case AXVNEGB: ++ return 0x1da70c << 10 // xvneg.b ++ case AXVNEGH: ++ return 0x1da70d << 10 // xvneg.h ++ case AXVNEGW: ++ return 0x1da70e << 10 // xvneg.w ++ case AXVNEGV: ++ return 0x1da70f << 10 // xvneg.d + } + + c.ctxt.Diag("bad rr opcode %v", a) +-- +2.38.1 + diff --git a/0032-cmd-internal-obj-loong64-add-V-XV-MUL-B-H-W-V-and-V-.patch b/0032-cmd-internal-obj-loong64-add-V-XV-MUL-B-H-W-V-and-V-.patch new file mode 100644 index 0000000000000000000000000000000000000000..67724ac532b420e34fd99dc4c3ea21173f284099 --- /dev/null +++ b/0032-cmd-internal-obj-loong64-add-V-XV-MUL-B-H-W-V-and-V-.patch @@ -0,0 +1,235 @@ +From 984f12cbb1763c855882b3c8e89727ad560b38c1 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Wed, 11 Dec 2024 17:46:09 +0800 +Subject: [PATCH 32/44] cmd/internal/obj/loong64: add {V,XV}MUL{B/H/W/V} and + {V,XV}MUH{B/H/W/V}[U] instructions support + +Go asm syntax: + VMUL{B/H/W/V} VK, VJ, VD + VMUH{B/H/W/V}[U] VK, VJ, VD + XVMUL{B/H/W/V} XK, XJ, XD + XVMUH{B/H/W/V}[U] XK, XJ, XD + +Equivalent platform assembler syntax: + vmul.{b/h/w/d} vd, vj, vk + vmuh.{b/h/w/d}[u] vd, vj, vk + xvmul.{b/h/w/d} xd, xj, xk + xvmuh.{b/h/w/d}[u] xd, xj, xk + +Change-Id: I8890f8a41100e4681a833c27067f0f76b593f731 +--- + .../asm/internal/asm/testdata/loong64enc1.s | 26 +++++++ + src/cmd/internal/obj/loong64/a.out.go | 26 +++++++ + src/cmd/internal/obj/loong64/anames.go | 24 +++++++ + src/cmd/internal/obj/loong64/asm.go | 72 +++++++++++++++++++ + 4 files changed, 148 insertions(+) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index 9deb3cbafd..c8b490234f 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -749,6 +749,32 @@ lable2: + XVNEGW X2, X1 // 41389c76 + XVNEGV X2, X1 // 413c9c76 + ++ // [X]VMUL{B/H/W/V} and [X]VMUH{B/H/W/V}[U] instructions ++ VMULB V1, V2, V3 // 43048470 ++ VMULH V1, V2, V3 // 43848470 ++ VMULW V1, V2, V3 // 43048570 ++ VMULV V1, V2, V3 // 43848570 ++ VMUHB V1, V2, V3 // 43048670 ++ VMUHH V1, V2, V3 // 43848670 ++ VMUHW V1, V2, V3 // 43048770 ++ VMUHV V1, V2, V3 // 43848770 ++ VMUHBU V1, V2, V3 // 43048870 ++ VMUHHU V1, V2, V3 // 43848870 ++ VMUHWU V1, V2, V3 // 43048970 ++ VMUHVU V1, V2, V3 // 43848970 ++ XVMULB X3, X2, X1 // 410c8474 ++ XVMULH X3, X2, X1 // 
418c8474 ++ XVMULW X3, X2, X1 // 410c8574 ++ XVMULV X3, X2, X1 // 418c8574 ++ XVMUHB X3, X2, X1 // 410c8674 ++ XVMUHH X3, X2, X1 // 418c8674 ++ XVMUHW X3, X2, X1 // 410c8774 ++ XVMUHV X3, X2, X1 // 418c8774 ++ XVMUHBU X3, X2, X1 // 410c8874 ++ XVMUHHU X3, X2, X1 // 418c8874 ++ XVMUHWU X3, X2, X1 // 410c8974 ++ XVMUHVU X3, X2, X1 // 418c8974 ++ + // MOVV C_DCON12_0, r + MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 + MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index 486dc9fa89..95744e77a1 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -757,6 +757,32 @@ const ( + AXVNEGW + AXVNEGV + ++ // LSX and LASX integer mul instructions ++ AVMULB ++ AVMULH ++ AVMULW ++ AVMULV ++ AVMUHB ++ AVMUHH ++ AVMUHW ++ AVMUHV ++ AVMUHBU ++ AVMUHHU ++ AVMUHWU ++ AVMUHVU ++ AXVMULB ++ AXVMULH ++ AXVMULW ++ AXVMULV ++ AXVMUHB ++ AXVMUHH ++ AXVMUHW ++ AXVMUHV ++ AXVMUHBU ++ AXVMUHHU ++ AXVMUHWU ++ AXVMUHVU ++ + // LSX and LASX Bit-manipulation Instructions + AVANDB + AVORB +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index d697b73e71..d0cd3a26fa 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -285,6 +285,30 @@ var Anames = []string{ + "XVNEGH", + "XVNEGW", + "XVNEGV", ++ "VMULB", ++ "VMULH", ++ "VMULW", ++ "VMULV", ++ "VMUHB", ++ "VMUHH", ++ "VMUHW", ++ "VMUHV", ++ "VMUHBU", ++ "VMUHHU", ++ "VMUHWU", ++ "VMUHVU", ++ "XVMULB", ++ "XVMULH", ++ "XVMULW", ++ "XVMULV", ++ "XVMUHB", ++ "XVMUHH", ++ "XVMUHW", ++ "XVMUHV", ++ "XVMUHBU", ++ "XVMUHHU", ++ "XVMUHWU", ++ "XVMUHVU", + "VANDB", + "VORB", + "VXORB", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index e2c7afd82d..7fb99f66e6 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -1589,6 +1589,18 @@ func buildop(ctxt *obj.Link) { + opset(AVILVHH, r0) + opset(AVILVHW, r0) + opset(AVILVHV, r0) ++ opset(AVMULB, r0) ++ opset(AVMULH, r0) ++ opset(AVMULW, r0) ++ opset(AVMULV, r0) ++ opset(AVMUHB, r0) ++ opset(AVMUHH, r0) ++ opset(AVMUHW, r0) ++ opset(AVMUHV, r0) ++ opset(AVMUHBU, r0) ++ opset(AVMUHHU, r0) ++ opset(AVMUHWU, r0) ++ opset(AVMUHVU, r0) + + case AXVANDV: + opset(AXVORV, r0) +@@ -1614,6 +1626,18 @@ func buildop(ctxt *obj.Link) { + opset(AXVILVHH, r0) + opset(AXVILVHW, r0) + opset(AXVILVHV, r0) ++ opset(AXVMULB, r0) ++ opset(AXVMULH, r0) ++ opset(AXVMULW, r0) ++ opset(AXVMULV, r0) ++ opset(AXVMUHB, r0) ++ opset(AXVMUHH, r0) ++ opset(AXVMUHW, r0) ++ opset(AXVMUHV, r0) ++ opset(AXVMUHBU, r0) ++ opset(AXVMUHHU, r0) ++ opset(AXVMUHWU, r0) ++ opset(AXVMUHVU, r0) + + case AVSLLB: + opset(AVSRLB, r0) +@@ -3037,6 +3061,54 @@ func (c *ctxt0) oprrr(a obj.As) uint32 { + return 0xE9DE << 15 // xvrotr.w + case AXVROTRV: + return 0xE9DF << 15 // xvrotr.d ++ case AVMULB: ++ return 0xe108 << 15 // vmul.b ++ case AVMULH: ++ return 0xe109 << 15 // vmul.h ++ case AVMULW: ++ return 0xe10a << 15 // vmul.w ++ case AVMULV: ++ return 0xe10b << 15 // vmul.d ++ case AVMUHB: ++ return 0xe10c << 15 // vmuh.b ++ case AVMUHH: ++ return 0xe10d << 15 // vmuh.h ++ case AVMUHW: ++ return 0xe10e << 15 // vmuh.w ++ case AVMUHV: ++ return 0xe10f << 15 // vmuh.d ++ case AVMUHBU: ++ return 0xe110 << 15 // vmuh.bu ++ case AVMUHHU: ++ return 0xe111 << 15 // vmuh.hu ++ case AVMUHWU: ++ return 0xe112 << 15 // vmuh.wu 
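The 17-bit constants in these oprrr cases occupy bits 31:15 of the instruction word; the assembler then ORs the vk, vj and vd register numbers into bits 14:10, 9:5 and 4:0, the same field positions the OP_*IRR helpers in this file use. A standalone sanity check of that layout against the testdata line `VMULW V1, V2, V3 // 43048570` (illustrative; not part of the patch):

	package main

	import "fmt"

	func main() {
		op := uint32(0xe10a) << 15 // vmul.w, as returned by oprrr
		vk, vj, vd := uint32(1), uint32(2), uint32(3)
		insn := op | vk<<10 | vj<<5 | vd
		fmt.Printf("%08x\n", insn) // 70850443, i.e. little-endian bytes 43 04 85 70
	}

Note that the testdata comments list the encoding in memory (little-endian) byte order, which is why 0x70850443 appears there as 43048570.
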
++ case AVMUHVU: ++ return 0xe113 << 15 // vmuh.du ++ case AXVMULB: ++ return 0xe908 << 15 // xvmul.b ++ case AXVMULH: ++ return 0xe909 << 15 // xvmul.h ++ case AXVMULW: ++ return 0xe90a << 15 // xvmul.w ++ case AXVMULV: ++ return 0xe90b << 15 // xvmul.d ++ case AXVMUHB: ++ return 0xe90c << 15 // xvmuh.b ++ case AXVMUHH: ++ return 0xe90d << 15 // xvmuh.h ++ case AXVMUHW: ++ return 0xe90e << 15 // xvmuh.w ++ case AXVMUHV: ++ return 0xe90f << 15 // xvmuh.d ++ case AXVMUHBU: ++ return 0xe910 << 15 // xvmuh.bu ++ case AXVMUHHU: ++ return 0xe911 << 15 // xvmuh.hu ++ case AXVMUHWU: ++ return 0xe912 << 15 // xvmuh.wu ++ case AXVMUHVU: ++ return 0xe913 << 15 // xvmuh.du + } + + if a < 0 { +-- +2.38.1 + diff --git a/0033-cmd-internal-obj-loong64-add-V-XV-DIV-B-H-W-V-U-and-.patch b/0033-cmd-internal-obj-loong64-add-V-XV-DIV-B-H-W-V-U-and-.patch new file mode 100644 index 0000000000000000000000000000000000000000..967f56d10b28d942f5868f28bd56209e5a7fbeb9 --- /dev/null +++ b/0033-cmd-internal-obj-loong64-add-V-XV-DIV-B-H-W-V-U-and-.patch @@ -0,0 +1,283 @@ +From 116a2261b3a110e6ff4f9608f447e6f07156d55f Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Wed, 11 Dec 2024 18:08:16 +0800 +Subject: [PATCH 33/44] cmd/internal/obj/loong64: add {V,XV}DIV{B/H/W/V}[U] and + {V,XV}MOD{B/H/W/V}[U] instructions support + +Go asm syntax: + VDIV{B/H/W/V}[U] VK, VJ, VD + XVDIV{B/H/W/V}[U] XK, XJ, XD + VMOD{B/H/W/V}[U] VK, VJ, VD + XVMOD{B/H/W/V}[U] XK, XJ, XD + +Equivalent platform assembler syntax: + vdiv.{b/h/w/d}[u] vd, vj, vk + xvdiv.{b/h/w/d}[u] xd, xj, xk + vmod.{b/h/w/d}[u] vd, vj, vk + xvmod.{b/h/w/d}[u] xd, xj, xk + +Change-Id: I27e9bc8999e6525a27f0bf12b21cc896c5a2a69c +--- + .../asm/internal/asm/testdata/loong64enc1.s | 34 +++++++ + src/cmd/internal/obj/loong64/a.out.go | 34 +++++++ + src/cmd/internal/obj/loong64/anames.go | 32 +++++++ + src/cmd/internal/obj/loong64/asm.go | 96 +++++++++++++++++++ + 4 files changed, 196 insertions(+) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index c8b490234f..bbac6036cf 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -775,6 +775,40 @@ lable2: + XVMUHWU X3, X2, X1 // 410c8974 + XVMUHVU X3, X2, X1 // 418c8974 + ++ // [X]VDIV{B/H/W/V}[U] and [X]VMOD{B/H/W/V}[U] instructions ++ VDIVB V1, V2, V3 // 4304e070 ++ VDIVH V1, V2, V3 // 4384e070 ++ VDIVW V1, V2, V3 // 4304e170 ++ VDIVV V1, V2, V3 // 4384e170 ++ VDIVBU V1, V2, V3 // 4304e470 ++ VDIVHU V1, V2, V3 // 4384e470 ++ VDIVWU V1, V2, V3 // 4304e570 ++ VDIVVU V1, V2, V3 // 4384e570 ++ VMODB V1, V2, V3 // 4304e270 ++ VMODH V1, V2, V3 // 4384e270 ++ VMODW V1, V2, V3 // 4304e370 ++ VMODV V1, V2, V3 // 4384e370 ++ VMODBU V1, V2, V3 // 4304e670 ++ VMODHU V1, V2, V3 // 4384e670 ++ VMODWU V1, V2, V3 // 4304e770 ++ VMODVU V1, V2, V3 // 4384e770 ++ XVDIVB X3, X2, X1 // 410ce074 ++ XVDIVH X3, X2, X1 // 418ce074 ++ XVDIVW X3, X2, X1 // 410ce174 ++ XVDIVV X3, X2, X1 // 418ce174 ++ XVDIVBU X3, X2, X1 // 410ce474 ++ XVDIVHU X3, X2, X1 // 418ce474 ++ XVDIVWU X3, X2, X1 // 410ce574 ++ XVDIVVU X3, X2, X1 // 418ce574 ++ XVMODB X3, X2, X1 // 410ce274 ++ XVMODH X3, X2, X1 // 418ce274 ++ XVMODW X3, X2, X1 // 410ce374 ++ XVMODV X3, X2, X1 // 418ce374 ++ XVMODBU X3, X2, X1 // 410ce674 ++ XVMODHU X3, X2, X1 // 418ce674 ++ XVMODWU X3, X2, X1 // 410ce774 ++ XVMODVU X3, X2, X1 // 418ce774 ++ + // MOVV C_DCON12_0, r + MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 + MOVV 
$0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index 95744e77a1..9164e9d59f 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -783,6 +783,40 @@ const ( + AXVMUHWU + AXVMUHVU + ++ // LSX and LASX integer div and mod instructions ++ AVDIVB ++ AVDIVH ++ AVDIVW ++ AVDIVV ++ AVDIVBU ++ AVDIVHU ++ AVDIVWU ++ AVDIVVU ++ AVMODB ++ AVMODH ++ AVMODW ++ AVMODV ++ AVMODBU ++ AVMODHU ++ AVMODWU ++ AVMODVU ++ AXVDIVB ++ AXVDIVH ++ AXVDIVW ++ AXVDIVV ++ AXVDIVBU ++ AXVDIVHU ++ AXVDIVWU ++ AXVDIVVU ++ AXVMODB ++ AXVMODH ++ AXVMODW ++ AXVMODV ++ AXVMODBU ++ AXVMODHU ++ AXVMODWU ++ AXVMODVU ++ + // LSX and LASX Bit-manipulation Instructions + AVANDB + AVORB +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index d0cd3a26fa..15a264c8e2 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -309,6 +309,38 @@ var Anames = []string{ + "XVMUHHU", + "XVMUHWU", + "XVMUHVU", ++ "VDIVB", ++ "VDIVH", ++ "VDIVW", ++ "VDIVV", ++ "VDIVBU", ++ "VDIVHU", ++ "VDIVWU", ++ "VDIVVU", ++ "VMODB", ++ "VMODH", ++ "VMODW", ++ "VMODV", ++ "VMODBU", ++ "VMODHU", ++ "VMODWU", ++ "VMODVU", ++ "XVDIVB", ++ "XVDIVH", ++ "XVDIVW", ++ "XVDIVV", ++ "XVDIVBU", ++ "XVDIVHU", ++ "XVDIVWU", ++ "XVDIVVU", ++ "XVMODB", ++ "XVMODH", ++ "XVMODW", ++ "XVMODV", ++ "XVMODBU", ++ "XVMODHU", ++ "XVMODWU", ++ "XVMODVU", + "VANDB", + "VORB", + "VXORB", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 7fb99f66e6..7a14137374 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -1601,6 +1601,22 @@ func buildop(ctxt *obj.Link) { + opset(AVMUHHU, r0) + opset(AVMUHWU, r0) + opset(AVMUHVU, r0) ++ opset(AVDIVB, r0) ++ opset(AVDIVH, r0) ++ opset(AVDIVW, r0) ++ opset(AVDIVV, r0) ++ opset(AVMODB, r0) ++ opset(AVMODH, r0) ++ opset(AVMODW, r0) ++ opset(AVMODV, r0) ++ opset(AVDIVBU, r0) ++ opset(AVDIVHU, r0) ++ opset(AVDIVWU, r0) ++ opset(AVDIVVU, r0) ++ opset(AVMODBU, r0) ++ opset(AVMODHU, r0) ++ opset(AVMODWU, r0) ++ opset(AVMODVU, r0) + + case AXVANDV: + opset(AXVORV, r0) +@@ -1638,6 +1654,22 @@ func buildop(ctxt *obj.Link) { + opset(AXVMUHHU, r0) + opset(AXVMUHWU, r0) + opset(AXVMUHVU, r0) ++ opset(AXVDIVB, r0) ++ opset(AXVDIVH, r0) ++ opset(AXVDIVW, r0) ++ opset(AXVDIVV, r0) ++ opset(AXVMODB, r0) ++ opset(AXVMODH, r0) ++ opset(AXVMODW, r0) ++ opset(AXVMODV, r0) ++ opset(AXVDIVBU, r0) ++ opset(AXVDIVHU, r0) ++ opset(AXVDIVWU, r0) ++ opset(AXVDIVVU, r0) ++ opset(AXVMODBU, r0) ++ opset(AXVMODHU, r0) ++ opset(AXVMODWU, r0) ++ opset(AXVMODVU, r0) + + case AVSLLB: + opset(AVSRLB, r0) +@@ -3109,6 +3141,70 @@ func (c *ctxt0) oprrr(a obj.As) uint32 { + return 0xe912 << 15 // xvmuh.wu + case AXVMUHVU: + return 0xe913 << 15 // xvmuh.du ++ case AVDIVB: ++ return 0xe1c0 << 15 // vdiv.b ++ case AVDIVH: ++ return 0xe1c1 << 15 // vdiv.h ++ case AVDIVW: ++ return 0xe1c2 << 15 // vdiv.w ++ case AVDIVV: ++ return 0xe1c3 << 15 // vdiv.d ++ case AVMODB: ++ return 0xe1c4 << 15 // vmod.b ++ case AVMODH: ++ return 0xe1c5 << 15 // vmod.h ++ case AVMODW: ++ return 0xe1c6 << 15 // vmod.w ++ case AVMODV: ++ return 0xe1c7 << 15 // vmod.d ++ case AVDIVBU: ++ return 0xe1c8 << 15 // vdiv.bu ++ case AVDIVHU: ++ return 0xe1c9 << 15 // vdiv.hu ++ case AVDIVWU: ++ return 0xe1ca << 15 // vdiv.wu ++ case AVDIVVU: ++ return 0xe1cb << 15 // vdiv.du ++ case 
AVMODBU: ++ return 0xe1cc << 15 // vmod.bu ++ case AVMODHU: ++ return 0xe1cd << 15 // vmod.hu ++ case AVMODWU: ++ return 0xe1ce << 15 // vmod.wu ++ case AVMODVU: ++ return 0xe1cf << 15 // vmod.du ++ case AXVDIVB: ++ return 0xe9c0 << 15 // xvdiv.b ++ case AXVDIVH: ++ return 0xe9c1 << 15 // xvdiv.h ++ case AXVDIVW: ++ return 0xe9c2 << 15 // xvdiv.w ++ case AXVDIVV: ++ return 0xe9c3 << 15 // xvdiv.d ++ case AXVMODB: ++ return 0xe9c4 << 15 // xvmod.b ++ case AXVMODH: ++ return 0xe9c5 << 15 // xvmod.h ++ case AXVMODW: ++ return 0xe9c6 << 15 // xvmod.w ++ case AXVMODV: ++ return 0xe9c7 << 15 // xvmod.d ++ case AXVDIVBU: ++ return 0xe9c8 << 15 // xvdiv.bu ++ case AXVDIVHU: ++ return 0xe9c9 << 15 // xvdiv.hu ++ case AXVDIVWU: ++ return 0xe9ca << 15 // xvdiv.wu ++ case AXVDIVVU: ++ return 0xe9cb << 15 // xvdiv.du ++ case AXVMODBU: ++ return 0xe9cc << 15 // xvmod.bu ++ case AXVMODHU: ++ return 0xe9cd << 15 // xvmod.hu ++ case AXVMODWU: ++ return 0xe9ce << 15 // xvmod.wu ++ case AXVMODVU: ++ return 0xe9cf << 15 // xvmod.du + } + + if a < 0 { +-- +2.38.1 + diff --git a/0034-cmd-internal-obj-loong64-add-V-XV-BITCLR-BITSET-BITR.patch b/0034-cmd-internal-obj-loong64-add-V-XV-BITCLR-BITSET-BITR.patch new file mode 100644 index 0000000000000000000000000000000000000000..2a2d4b13d8efbdfe73ea616c881cc4dc7968ad49 --- /dev/null +++ b/0034-cmd-internal-obj-loong64-add-V-XV-BITCLR-BITSET-BITR.patch @@ -0,0 +1,341 @@ +From 054df785d79675c02f6bd2ad3ace9f1ce5874e84 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Thu, 12 Dec 2024 10:54:00 +0800 +Subject: [PATCH 34/44] cmd/internal/obj/loong64: add + {V,XV}{BITCLR/BITSET/BITREV}[I].{B/H/W/D} instructions support + +Go asm syntax: + V{BITCLR/BITSET/BITREV}{B/H/W/V} $1, V2, V3 + XV{BITCLR/BITSET/BITREV}{B/H/W/V} $1, X2, X3 + V{BITCLR/BITSET/BITREV}{B/H/W/V} VK, VJ, VD + XV{BITCLR/BITSET/BITREV}{B/H/W/V} XK, XJ, XD + +Equivalent platform assembler syntax: + v{bitclr/bitset/bitrev}i.{b/h/w/d} v3, v2, $1 + xv{bitclr/bitset/bitrev}i.{b/h/w/d} x3, x2, $1 + v{bitclr/bitset/bitrev}.{b/h/w/d} vd, vj, vk + xv{bitclr/bitset/bitrev}.{b/h/w/d} xd, xj, xk + +Change-Id: Id44e6cb7c22d650bb6b4d9f6faee5dcda4edb24e +--- + .../asm/internal/asm/testdata/loong64enc1.s | 50 ++++++++ + src/cmd/internal/obj/loong64/a.out.go | 25 ++++ + src/cmd/internal/obj/loong64/anames.go | 24 ++++ + src/cmd/internal/obj/loong64/asm.go | 120 ++++++++++++++++++ + 4 files changed, 219 insertions(+) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index bbac6036cf..19070c89ef 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -809,6 +809,56 @@ lable2: + XVMODWU X3, X2, X1 // 410ce774 + XVMODVU X3, X2, X1 // 418ce774 + ++ // [X]{VBITCLR/VBITSET/VBITREV}{B,H,W,V} instructions ++ VBITCLRB V1, V2, V3 // 43040c71 ++ VBITCLRH V1, V2, V3 // 43840c71 ++ VBITCLRW V1, V2, V3 // 43040d71 ++ VBITCLRV V1, V2, V3 // 43840d71 ++ VBITSETB V1, V2, V3 // 43040e71 ++ VBITSETH V1, V2, V3 // 43840e71 ++ VBITSETW V1, V2, V3 // 43040f71 ++ VBITSETV V1, V2, V3 // 43840f71 ++ VBITREVB V1, V2, V3 // 43041071 ++ VBITREVH V1, V2, V3 // 43841071 ++ VBITREVW V1, V2, V3 // 43041171 ++ VBITREVV V1, V2, V3 // 43841171 ++ XVBITCLRB X3, X2, X1 // 410c0c75 ++ XVBITCLRH X3, X2, X1 // 418c0c75 ++ XVBITCLRW X3, X2, X1 // 410c0d75 ++ XVBITCLRV X3, X2, X1 // 418c0d75 ++ XVBITSETB X3, X2, X1 // 410c0e75 ++ XVBITSETH X3, X2, X1 // 418c0e75 ++ XVBITSETW X3, X2, X1 // 410c0f75 ++ XVBITSETV X3, X2, X1 // 418c0f75 ++ 
XVBITREVB X3, X2, X1 // 410c1075 ++ XVBITREVH X3, X2, X1 // 418c1075 ++ XVBITREVW X3, X2, X1 // 410c1175 ++ XVBITREVV X3, X2, X1 // 418c1175 ++ VBITCLRB $7, V2, V3 // 433c1073 ++ VBITCLRH $15, V2, V3 // 437c1073 ++ VBITCLRW $31, V2, V3 // 43fc1073 ++ VBITCLRV $63, V2, V3 // 43fc1173 ++ VBITSETB $7, V2, V3 // 433c1473 ++ VBITSETH $15, V2, V3 // 437c1473 ++ VBITSETW $31, V2, V3 // 43fc1473 ++ VBITSETV $63, V2, V3 // 43fc1573 ++ VBITREVB $7, V2, V3 // 433c1873 ++ VBITREVH $15, V2, V3 // 437c1873 ++ VBITREVW $31, V2, V3 // 43fc1873 ++ VBITREVV $63, V2, V3 // 43fc1973 ++ XVBITCLRB $7, X2, X1 // 413c1077 ++ XVBITCLRH $15, X2, X1 // 417c1077 ++ XVBITCLRW $31, X2, X1 // 41fc1077 ++ XVBITCLRV $63, X2, X1 // 41fc1177 ++ XVBITSETB $7, X2, X1 // 413c1477 ++ XVBITSETH $15, X2, X1 // 417c1477 ++ XVBITSETW $31, X2, X1 // 41fc1477 ++ XVBITSETV $63, X2, X1 // 41fc1577 ++ XVBITREVB $7, X2, X1 // 413c1877 ++ XVBITREVH $15, X2, X1 // 417c1877 ++ XVBITREVW $31, X2, X1 // 41fc1877 ++ XVBITREVV $63, X2, X1 // 41fc1977 ++ + // MOVV C_DCON12_0, r + MOVV $0x7a90000000000000, R4 // MOVV $8831558869273542656, R4 // 04a41e03 + MOVV $0xea90000000000000, R4 // MOVV $-1544734672188080128, R4 // 04a43a03 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index 9164e9d59f..1fadbc648a 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -881,6 +881,31 @@ const ( + AXVPCNTW + AXVPCNTV + ++ AVBITCLRB ++ AVBITCLRH ++ AVBITCLRW ++ AVBITCLRV ++ AVBITSETB ++ AVBITSETH ++ AVBITSETW ++ AVBITSETV ++ AVBITREVB ++ AVBITREVH ++ AVBITREVW ++ AVBITREVV ++ AXVBITCLRB ++ AXVBITCLRH ++ AXVBITCLRW ++ AXVBITCLRV ++ AXVBITSETB ++ AXVBITSETH ++ AXVBITSETW ++ AXVBITSETV ++ AXVBITREVB ++ AXVBITREVH ++ AXVBITREVW ++ AXVBITREVV ++ + // LSX and LASX floating point instructions + AVFSQRTF + AVFSQRTD +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index 15a264c8e2..aee0da0a6e 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -401,6 +401,30 @@ var Anames = []string{ + "XVPCNTH", + "XVPCNTW", + "XVPCNTV", ++ "VBITCLRB", ++ "VBITCLRH", ++ "VBITCLRW", ++ "VBITCLRV", ++ "VBITSETB", ++ "VBITSETH", ++ "VBITSETW", ++ "VBITSETV", ++ "VBITREVB", ++ "VBITREVH", ++ "VBITREVW", ++ "VBITREVV", ++ "XVBITCLRB", ++ "XVBITCLRH", ++ "XVBITCLRW", ++ "XVBITCLRV", ++ "XVBITSETB", ++ "XVBITSETH", ++ "XVBITSETW", ++ "XVBITSETV", ++ "XVBITREVB", ++ "XVBITREVH", ++ "XVBITREVW", ++ "XVBITREVV", + "VFSQRTF", + "VFSQRTD", + "VFRECIPF", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 7a14137374..657d32ae81 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -1675,41 +1675,65 @@ func buildop(ctxt *obj.Link) { + opset(AVSRLB, r0) + opset(AVSRAB, r0) + opset(AVROTRB, r0) ++ opset(AVBITCLRB, r0) ++ opset(AVBITSETB, r0) ++ opset(AVBITREVB, r0) + + case AXVSLLB: + opset(AXVSRLB, r0) + opset(AXVSRAB, r0) + opset(AXVROTRB, r0) ++ opset(AXVBITCLRB, r0) ++ opset(AXVBITSETB, r0) ++ opset(AXVBITREVB, r0) + + case AVSLLH: + opset(AVSRLH, r0) + opset(AVSRAH, r0) + opset(AVROTRH, r0) ++ opset(AVBITCLRH, r0) ++ opset(AVBITSETH, r0) ++ opset(AVBITREVH, r0) + + case AXVSLLH: + opset(AXVSRLH, r0) + opset(AXVSRAH, r0) + opset(AXVROTRH, r0) ++ opset(AXVBITCLRH, r0) ++ opset(AXVBITSETH, r0) ++ opset(AXVBITREVH, r0) + + case AVSLLW: + opset(AVSRLW, r0) + opset(AVSRAW, r0) + opset(AVROTRW, r0) ++ opset(AVBITCLRW, r0) ++ 
opset(AVBITSETW, r0) ++ opset(AVBITREVW, r0) + + case AXVSLLW: + opset(AXVSRLW, r0) + opset(AXVSRAW, r0) + opset(AXVROTRW, r0) ++ opset(AXVBITCLRW, r0) ++ opset(AXVBITSETW, r0) ++ opset(AXVBITREVW, r0) + + case AVSLLV: + opset(AVSRLV, r0) + opset(AVSRAV, r0) + opset(AVROTRV, r0) ++ opset(AVBITCLRV, r0) ++ opset(AVBITSETV, r0) ++ opset(AVBITREVV, r0) + + case AXVSLLV: + opset(AXVSRLV, r0) + opset(AXVSRAV, r0) + opset(AXVROTRV, r0) ++ opset(AXVBITCLRV, r0) ++ opset(AXVBITSETV, r0) ++ opset(AXVBITREVV, r0) + + case AVPCNTB: + opset(AVPCNTH, r0) +@@ -3205,6 +3229,54 @@ func (c *ctxt0) oprrr(a obj.As) uint32 { + return 0xe9ce << 15 // xvmod.wu + case AXVMODVU: + return 0xe9cf << 15 // xvmod.du ++ case AVBITCLRB: ++ return 0xe218 << 15 // vbitclr.b ++ case AVBITCLRH: ++ return 0xe219 << 15 // vbitclr.h ++ case AVBITCLRW: ++ return 0xe21a << 15 // vbitclr.w ++ case AVBITCLRV: ++ return 0xe21b << 15 // vbitclr.d ++ case AVBITSETB: ++ return 0xe21c << 15 // vbitset.b ++ case AVBITSETH: ++ return 0xe21d << 15 // vbitset.h ++ case AVBITSETW: ++ return 0xe21e << 15 // vbitset.w ++ case AVBITSETV: ++ return 0xe21f << 15 // vbitset.d ++ case AVBITREVB: ++ return 0xe220 << 15 // vbitrev.b ++ case AVBITREVH: ++ return 0xe221 << 15 // vbitrev.h ++ case AVBITREVW: ++ return 0xe222 << 15 // vbitrev.w ++ case AVBITREVV: ++ return 0xe223 << 15 // vbitrev.d ++ case AXVBITCLRB: ++ return 0xea18 << 15 // xvbitclr.b ++ case AXVBITCLRH: ++ return 0xea19 << 15 // xvbitclr.h ++ case AXVBITCLRW: ++ return 0xea1a << 15 // xvbitclr.w ++ case AXVBITCLRV: ++ return 0xea1b << 15 // xvbitclr.d ++ case AXVBITSETB: ++ return 0xea1c << 15 // xvbitset.b ++ case AXVBITSETH: ++ return 0xea1d << 15 // xvbitset.h ++ case AXVBITSETW: ++ return 0xea1e << 15 // xvbitset.w ++ case AXVBITSETV: ++ return 0xea1f << 15 // xvbitset.d ++ case AXVBITREVB: ++ return 0xea20 << 15 // xvbitrev.b ++ case AXVBITREVH: ++ return 0xea21 << 15 // xvbitrev.h ++ case AXVBITREVW: ++ return 0xea22 << 15 // xvbitrev.w ++ case AXVBITREVV: ++ return 0xea23 << 15 // xvbitrev.d + } + + if a < 0 { +@@ -3668,6 +3740,54 @@ func (c *ctxt0) opirr(a obj.As) uint32 { + return 0x1DCD<<18 | 0x1<<15 // xvsrai.w + case AXVSRAV: + return 0x1DCD<<18 | 0x1<<16 // xvsrai.d ++ case AVBITCLRB: ++ return 0x1CC4<<18 | 0x1<<13 // vbitclri.b ++ case AVBITCLRH: ++ return 0x1CC4<<18 | 0x1<<14 // vbitclri.h ++ case AVBITCLRW: ++ return 0x1CC4<<18 | 0x1<<15 // vbitclri.w ++ case AVBITCLRV: ++ return 0x1CC4<<18 | 0x1<<16 // vbitclri.d ++ case AVBITSETB: ++ return 0x1CC5<<18 | 0x1<<13 // vbitseti.b ++ case AVBITSETH: ++ return 0x1CC5<<18 | 0x1<<14 // vbitseti.h ++ case AVBITSETW: ++ return 0x1CC5<<18 | 0x1<<15 // vbitseti.w ++ case AVBITSETV: ++ return 0x1CC5<<18 | 0x1<<16 // vbitseti.d ++ case AVBITREVB: ++ return 0x1CC6<<18 | 0x1<<13 // vbitrevi.b ++ case AVBITREVH: ++ return 0x1CC6<<18 | 0x1<<14 // vbitrevi.h ++ case AVBITREVW: ++ return 0x1CC6<<18 | 0x1<<15 // vbitrevi.w ++ case AVBITREVV: ++ return 0x1CC6<<18 | 0x1<<16 // vbitrevi.d ++ case AXVBITCLRB: ++ return 0x1DC4<<18 | 0x1<<13 // xvbitclri.b ++ case AXVBITCLRH: ++ return 0x1DC4<<18 | 0x1<<14 // xvbitclri.h ++ case AXVBITCLRW: ++ return 0x1DC4<<18 | 0x1<<15 // xvbitclri.w ++ case AXVBITCLRV: ++ return 0x1DC4<<18 | 0x1<<16 // xvbitclri.d ++ case AXVBITSETB: ++ return 0x1DC5<<18 | 0x1<<13 // xvbitseti.b ++ case AXVBITSETH: ++ return 0x1DC5<<18 | 0x1<<14 // xvbitseti.h ++ case AXVBITSETW: ++ return 0x1DC5<<18 | 0x1<<15 // xvbitseti.w ++ case AXVBITSETV: ++ return 0x1DC5<<18 | 0x1<<16 // xvbitseti.d ++ case AXVBITREVB: ++ return 
0x1DC6<<18 | 0x1<<13 // xvbitrevi.b ++ case AXVBITREVH: ++ return 0x1DC6<<18 | 0x1<<14 // xvbitrevi.h ++ case AXVBITREVW: ++ return 0x1DC6<<18 | 0x1<<15 // xvbitrevi.w ++ case AXVBITREVV: ++ return 0x1DC6<<18 | 0x1<<16 // xvbitrevi.d + } + + if a < 0 { +-- +2.38.1 + diff --git a/0035-crypto-chacha20-add-loong64-SIMD-implementation.patch b/0035-crypto-chacha20-add-loong64-SIMD-implementation.patch new file mode 100644 index 0000000000000000000000000000000000000000..2c8eb2b11612f20bef6bf8f08996bedaa193b3b8 --- /dev/null +++ b/0035-crypto-chacha20-add-loong64-SIMD-implementation.patch @@ -0,0 +1,490 @@ +From d6bdc012b1c105a007d0fb5d7d1642f1a5653b1d Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Fri, 13 Dec 2024 17:09:31 +0800 +Subject: [PATCH 35/44] crypto/chacha20: add loong64 SIMD implementation +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The performance of chacha20 has been greatly improved. + +goos: linux +goarch: loong64 +pkg: golang.org/x/crypto/chacha20 +cpu: Loongson-3A6000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +ChaCha20/64 171.9n ± 0% 159.3n ± 0% -7.33% (p=0.000 n=20) +ChaCha20/256 592.2n ± 0% 142.8n ± 0% -75.89% (p=0.000 n=20) +ChaCha20/10x25 981.5n ± 0% 518.8n ± 0% -47.14% (p=0.000 n=20) +ChaCha20/4096 8.991µ ± 0% 1.732µ ± 0% -80.74% (p=0.000 n=20) +ChaCha20/100x40 10.651µ ± 0% 5.135µ ± 0% -51.79% (p=0.000 n=20) +ChaCha20/65536 143.43µ ± 0% 28.76µ ± 0% -79.95% (p=0.000 n=20) +ChaCha20/1000x65 146.17µ ± 0% 37.13µ ± 0% -74.60% (p=0.000 n=20) +geomean 5.721µ 1.962µ -65.70% + + | bench.old | bench.new | + | B/s | B/s vs base | +ChaCha20/64 355.1Mi ± 0% 383.1Mi ± 0% +7.89% (p=0.000 n=20) +ChaCha20/256 412.2Mi ± 0% 1710.2Mi ± 0% +314.86% (p=0.000 n=20) +ChaCha20/10x25 242.9Mi ± 0% 459.6Mi ± 0% +89.19% (p=0.000 n=20) +ChaCha20/4096 434.5Mi ± 0% 2255.8Mi ± 0% +419.22% (p=0.000 n=20) +ChaCha20/100x40 358.1Mi ± 0% 742.9Mi ± 0% +107.44% (p=0.000 n=20) +ChaCha20/65536 435.8Mi ± 0% 2173.2Mi ± 0% +398.72% (p=0.000 n=20) +ChaCha20/1000x65 424.1Mi ± 0% 1669.4Mi ± 0% +293.64% (p=0.000 n=20) +geomean 373.9Mi 1.065Gi +191.55% + +goos: linux +goarch: loong64 +pkg: golang.org/x/crypto/chacha20 +cpu: Loongson-3A5000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +ChaCha20/64 234.5n ± 0% 295.8n ± 0% +26.14% (p=0.000 n=20) +ChaCha20/256 782.0n ± 0% 274.6n ± 0% -64.88% (p=0.000 n=20) +ChaCha20/10x25 1340.0n ± 0% 752.7n ± 0% -43.83% (p=0.000 n=20) +ChaCha20/4096 11.744µ ± 0% 3.455µ ± 0% -70.58% (p=0.000 n=20) +ChaCha20/100x40 14.151µ ± 0% 7.435µ ± 0% -47.46% (p=0.000 n=20) +ChaCha20/65536 188.05µ ± 0% 54.33µ ± 0% -71.11% (p=0.000 n=20) +ChaCha20/1000x65 191.44µ ± 0% 66.29µ ± 0% -65.37% (p=0.000 n=20) +geomean 7.604µ 3.436µ -54.81% + + | bench.old | bench.new | + | B/s | B/s vs base | +ChaCha20/64 260.3Mi ± 0% 206.3Mi ± 0% -20.73% (p=0.000 n=20) +ChaCha20/256 312.2Mi ± 0% 888.9Mi ± 0% +184.75% (p=0.000 n=20) +ChaCha20/10x25 177.9Mi ± 0% 316.8Mi ± 0% +78.08% (p=0.000 n=20) +ChaCha20/4096 332.6Mi ± 0% 1130.8Mi ± 0% +239.95% (p=0.000 n=20) +ChaCha20/100x40 269.6Mi ± 0% 513.1Mi ± 0% +90.34% (p=0.000 n=20) +ChaCha20/65536 332.4Mi ± 0% 1150.5Mi ± 0% +246.16% (p=0.000 n=20) +ChaCha20/1000x65 323.8Mi ± 0% 935.2Mi ± 0% +188.81% (p=0.000 n=20) +geomean 281.3Mi 622.6Mi +121.31% + +Change-Id: Iab4934d78b845e3b248bd5d0a9a62e4e9c516831 +--- + .../x/crypto/chacha20/chacha_loong64.go | 17 + + .../x/crypto/chacha20/chacha_loong64.s | 374 ++++++++++++++++++ + .../x/crypto/chacha20/chacha_noasm.go | 2 +- + 3 files 
changed, 392 insertions(+), 1 deletion(-) + create mode 100644 src/vendor/golang.org/x/crypto/chacha20/chacha_loong64.go + create mode 100644 src/vendor/golang.org/x/crypto/chacha20/chacha_loong64.s + +diff --git a/src/vendor/golang.org/x/crypto/chacha20/chacha_loong64.go b/src/vendor/golang.org/x/crypto/chacha20/chacha_loong64.go +new file mode 100644 +index 0000000000..d0f5d909fc +--- /dev/null ++++ b/src/vendor/golang.org/x/crypto/chacha20/chacha_loong64.go +@@ -0,0 +1,17 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++//go:build gc && !purego ++ ++package chacha20 ++ ++const bufSize = 256 ++ ++//go:noescape ++func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) ++ ++func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) { ++ // add cpu.Loong64.HasLSX check TODO ++ xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter) ++} +diff --git a/src/vendor/golang.org/x/crypto/chacha20/chacha_loong64.s b/src/vendor/golang.org/x/crypto/chacha20/chacha_loong64.s +new file mode 100644 +index 0000000000..883c8d992a +--- /dev/null ++++ b/src/vendor/golang.org/x/crypto/chacha20/chacha_loong64.s +@@ -0,0 +1,374 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++// derived from chacha_arm64.s ++ ++//go:build gc && !purego ++ ++#include "textflag.h" ++ ++DATA ·constants+0x00(SB)/4, $0x61707865 ++DATA ·constants+0x04(SB)/4, $0x3320646e ++DATA ·constants+0x08(SB)/4, $0x79622d32 ++DATA ·constants+0x0c(SB)/4, $0x6b206574 ++GLOBL ·constants(SB), NOPTR|RODATA, $32 ++ ++DATA ·incRotMatrix+0x00(SB)/4, $0x00000000 ++DATA ·incRotMatrix+0x04(SB)/4, $0x00000001 ++DATA ·incRotMatrix+0x08(SB)/4, $0x00000002 ++DATA ·incRotMatrix+0x0c(SB)/4, $0x00000003 ++GLOBL ·incRotMatrix(SB), NOPTR|RODATA, $32 ++ ++#define NUM_ROUNDS 10 ++ ++// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) ++TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0 ++ MOVV dst+0(FP), R4 ++ MOVV src+24(FP), R5 ++ MOVV src_len+32(FP), R6 ++ MOVV key+48(FP), R7 ++ MOVV nonce+56(FP), R8 ++ MOVV counter+64(FP), R9 ++ ++ MOVV $·constants(SB), R10 ++ MOVV $·incRotMatrix(SB), R11 ++ ++ MOVW (R9), R12 ++ ++loop: ++ MOVV $NUM_ROUNDS, R15 ++ // load 4-32bit data from incRotMatrix added to counter ++ VMOVQ (R11), V30 ++ ++ // load contants ++ // VLDREPL.W $0, R10, V0 ++ WORD $0x30200140 ++ // VLDREPL.W $1, R10, V1 ++ WORD $0x30200541 ++ // VLDREPL.W $2, R10, V2 ++ WORD $0x30200942 ++ // VLDREPL.W $3, R10, V3 ++ WORD $0x30200d43 ++ ++ // load keys ++ // VLDREPL.W $0, R7, V4 ++ WORD $0x302000e4 ++ // VLDREPL.W $1, R7, V5 ++ WORD $0x302004e5 ++ // VLDREPL.W $2, R7, V6 ++ WORD $0x302008e6 ++ // VLDREPL.W $3, R7, V7 ++ WORD $0x30200ce7 ++ // VLDREPL.W $4, R7, V8 ++ WORD $0x302010e8 ++ // VLDREPL.W $5, R7, V9 ++ WORD $0x302014e9 ++ // VLDREPL.W $6, R7, V10 ++ WORD $0x302018ea ++ // VLDREPL.W $7, R7, V11 ++ WORD $0x30201ceb ++ ++ // load counter + nonce ++ // VLDREPL.W $0, R9, V12 ++ WORD $0x3020012c ++ ++ // VLDREPL.W $0, R8, V13 ++ WORD $0x3020010d ++ // VLDREPL.W $1, R8, V14 ++ WORD $0x3020050e ++ // VLDREPL.W $2, R8, V15 ++ WORD $0x3020090f ++ ++ // update counter ++ VADDW V30, V12, V12 ++ ++chacha: ++ // V0..V3 += V4..V7 ++ // V12..V15 <<<= ((V12..V15 XOR V0..V3), 16) ++ VADDW V0, V4, V0 ++ VADDW V1, V5, V1 ++ VADDW V2, V6, V2 ++ VADDW V3, V7, V3 ++ VXORV V12, V0, V12 
++ VXORV V13, V1, V13 ++ VXORV V14, V2, V14 ++ VXORV V15, V3, V15 ++ VROTRW $16, V12, V12 ++ VROTRW $16, V13, V13 ++ VROTRW $16, V14, V14 ++ VROTRW $16, V15, V15 ++ ++ // V8..V11 += V12..V15 ++ // V4..V7 <<<= ((V4..V7 XOR V8..V11), 12) ++ VADDW V8, V12, V8 ++ VADDW V9, V13, V9 ++ VADDW V10, V14, V10 ++ VADDW V11, V15, V11 ++ VXORV V4, V8, V4 ++ VXORV V5, V9, V5 ++ VXORV V6, V10, V6 ++ VXORV V7, V11, V7 ++ VROTRW $20, V4, V4 ++ VROTRW $20, V5, V5 ++ VROTRW $20, V6, V6 ++ VROTRW $20, V7, V7 ++ ++ // V0..V3 += V4..V7 ++ // V12..V15 <<<= ((V12..V15 XOR V0..V3), 8) ++ VADDW V0, V4, V0 ++ VADDW V1, V5, V1 ++ VADDW V2, V6, V2 ++ VADDW V3, V7, V3 ++ VXORV V12, V0, V12 ++ VXORV V13, V1, V13 ++ VXORV V14, V2, V14 ++ VXORV V15, V3, V15 ++ VROTRW $24, V12, V12 ++ VROTRW $24, V13, V13 ++ VROTRW $24, V14, V14 ++ VROTRW $24, V15, V15 ++ ++ // V8..V11 += V12..V15 ++ // V4..V7 <<<= ((V4..V7 XOR V8..V11), 7) ++ VADDW V12, V8, V8 ++ VADDW V13, V9, V9 ++ VADDW V14, V10, V10 ++ VADDW V15, V11, V11 ++ VXORV V4, V8, V4 ++ VXORV V5, V9, V5 ++ VXORV V6, V10, V6 ++ VXORV V7, V11, V7 ++ VROTRW $25, V4, V4 ++ VROTRW $25, V5, V5 ++ VROTRW $25, V6, V6 ++ VROTRW $25, V7, V7 ++ ++ // V0..V3 += V5..V7, V4 ++ // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16) ++ VADDW V0, V5, V0 ++ VADDW V1, V6, V1 ++ VADDW V2, V7, V2 ++ VADDW V3, V4, V3 ++ VXORV V15, V0, V15 ++ VXORV V12, V1, V12 ++ VXORV V13, V2, V13 ++ VXORV V14, V3, V14 ++ VROTRW $16, V15, V15 ++ VROTRW $16, V12, V12 ++ VROTRW $16, V13, V13 ++ VROTRW $16, V14, V14 ++ ++ // V10,V11,V8,V9 += V15,V12,V13,V14 ++ // V5,V6,V7,V4 <<<= ((V5,V6,V7,V4 XOR V10,V11,V8,V9), 12) ++ VADDW V10, V15, V10 ++ VADDW V11, V12, V11 ++ VADDW V8, V13, V8 ++ VADDW V9, V14, V9 ++ VXORV V5, V10, V5 ++ VXORV V6, V11, V6 ++ VXORV V7, V8, V7 ++ VXORV V4, V9, V4 ++ VROTRW $20, V5, V5 ++ VROTRW $20, V6, V6 ++ VROTRW $20, V7, V7 ++ VROTRW $20, V4, V4 ++ ++ // V0..V3 += V5..V7, V4 ++ // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 8) ++ VADDW V5, V0, V0 ++ VADDW V6, V1, V1 ++ VADDW V7, V2, V2 ++ VADDW V4, V3, V3 ++ VXORV V15, V0, V15 ++ VXORV V12, V1, V12 ++ VXORV V13, V2, V13 ++ VXORV V14, V3, V14 ++ VROTRW $24, V15, V15 ++ VROTRW $24, V12, V12 ++ VROTRW $24, V13, V13 ++ VROTRW $24, V14, V14 ++ ++ // V10,V11,V8,V9 += V15,V12,V13,V14 ++ // V5,V6,V7,V4 <<<= ((V5,V6,V7,V4 XOR V10,V11,V8,V9), 7) ++ VADDW V15, V10, V10 ++ VADDW V12, V11, V11 ++ VADDW V13, V8, V8 ++ VADDW V14, V9, V9 ++ VXORV V5, V10, V5 ++ VXORV V6, V11, V6 ++ VXORV V7, V8, V7 ++ VXORV V4, V9, V4 ++ VROTRW $25, V5, V5 ++ VROTRW $25, V6, V6 ++ VROTRW $25, V7, V7 ++ VROTRW $25, V4, V4 ++ ++ SUBV $1, R15 ++ BNE R15, R0, chacha ++ ++ // load origin contants ++ // VLDREPL.W $0, R10, V16 ++ WORD $0x30200150 ++ // VLDREPL.W $1, R10, V17 ++ WORD $0x30200551 ++ // VLDREPL.W $2, R10, V18 ++ WORD $0x30200952 ++ // VLDREPL.W $3, R10, V19 ++ WORD $0x30200d53 ++ ++ // load origin keys ++ // VLDREPL.W $0, R7, V20 ++ WORD $0x302000f4 ++ // VLDREPL.W $1, R7, V21 ++ WORD $0x302004f5 ++ // VLDREPL.W $2, R7, V22 ++ WORD $0x302008f6 ++ // VLDREPL.W $3, R7, V23 ++ WORD $0x30200cf7 ++ // VLDREPL.W $4, R7, V24 ++ WORD $0x302010f8 ++ // VLDREPL.W $5, R7, V25 ++ WORD $0x302014f9 ++ // VLDREPL.W $6, R7, V26 ++ WORD $0x302018fa ++ // VLDREPL.W $7, R7, V27 ++ WORD $0x30201cfb ++ ++ // add back the initial state to generate the key stream ++ VADDW V30, V12, V12 // update counter in advance to prevent V30 from being overwritten ++ VADDW V16, V0, V0 ++ VADDW V17, V1, V1 ++ VADDW V18, V2, V2 ++ VADDW V19, V3, V3 ++ ++ // load origin counter + nonce ++ // VLDREPL.W $0, R9, 
V28 ++ WORD $0x3020013c ++ // VLDREPL.W $0, R8, V29 ++ WORD $0x3020011d ++ // VLDREPL.W $1, R8, V30 ++ WORD $0x3020051e ++ // VLDREPL.W $2, R8, V31 ++ WORD $0x3020091f ++ ++ VADDW V20, V4, V4 ++ VADDW V21, V5, V5 ++ VADDW V22, V6, V6 ++ VADDW V23, V7, V7 ++ VADDW V24, V8, V8 ++ VADDW V25, V9, V9 ++ VADDW V26, V10, V10 ++ VADDW V27, V11, V11 ++ VADDW V28, V12, V12 ++ VADDW V29, V13, V13 ++ VADDW V30, V14, V14 ++ VADDW V31, V15, V15 ++ ++ // shuffle ++ VILVLW V0, V1, V16 ++ VILVHW V0, V1, V17 ++ VILVLW V2, V3, V18 ++ VILVHW V2, V3, V19 ++ VILVLW V4, V5 ,V20 ++ VILVHW V4, V5, V21 ++ VILVLW V6, V7, V22 ++ VILVHW V6, V7, V23 ++ VILVLW V8, V9, V24 ++ VILVHW V8, V9, V25 ++ VILVLW V10, V11, V26 ++ VILVHW V10, V11, V27 ++ VILVLW V12, V13, V28 ++ VILVHW V12, V13, V29 ++ VILVLW V14, V15, V30 ++ VILVHW V14, V15, V31 ++ VILVLV V16, V18, V0 ++ VILVHV V16, V18, V4 ++ VILVLV V17, V19, V8 ++ VILVHV V17, V19, V12 ++ ++ // load src data from R5 ++ VMOVQ 0(R5), V16 ++ VMOVQ 16(R5), V17 ++ VMOVQ 32(R5), V18 ++ VMOVQ 48(R5), V19 ++ ++ VILVLV V20, V22, V1 ++ VILVHV V20, V22, V5 ++ VILVLV V21, V23, V9 ++ VILVHV V21, V23, V13 ++ ++ VMOVQ 64(R5), V20 ++ VMOVQ 80(R5), V21 ++ VMOVQ 96(R5), V22 ++ VMOVQ 112(R5), V23 ++ ++ VILVLV V24, V26, V2 ++ VILVHV V24, V26, V6 ++ VILVLV V25, V27, V10 ++ VILVHV V25, V27, V14 ++ ++ VMOVQ 128(R5), V24 ++ VMOVQ 144(R5), V25 ++ VMOVQ 160(R5), V26 ++ VMOVQ 176(R5), V27 ++ ++ VILVLV V28, V30, V3 ++ VILVHV V28, V30, V7 ++ VILVLV V29, V31, V11 ++ VILVHV V29, V31, V15 ++ ++ VMOVQ 192(R5), V28 ++ VMOVQ 208(R5), V29 ++ VMOVQ 224(R5), V30 ++ VMOVQ 240(R5), V31 ++ ++ VXORV V0, V16, V16 ++ VXORV V1, V17, V17 ++ VXORV V2, V18, V18 ++ VXORV V3, V19, V19 ++ ++ VMOVQ V16, 0(R4) ++ VMOVQ V17, 16(R4) ++ VMOVQ V18, 32(R4) ++ VMOVQ V19, 48(R4) ++ ++ VXORV V4, V20, V20 ++ VXORV V5, V21, V21 ++ VXORV V6, V22, V22 ++ VXORV V7, V23, V23 ++ ++ VMOVQ V20, 64(R4) ++ VMOVQ V21, 80(R4) ++ VMOVQ V22, 96(R4) ++ VMOVQ V23, 112(R4) ++ ++ VXORV V8, V24, V24 ++ VXORV V9, V25, V25 ++ VXORV V10, V26, V26 ++ VXORV V11, V27, V27 ++ ++ VMOVQ V24, 128(R4) ++ VMOVQ V25, 144(R4) ++ VMOVQ V26, 160(R4) ++ VMOVQ V27, 176(R4) ++ ++ VXORV V12, V28, V28 ++ VXORV V13, V29, V29 ++ VXORV V14, V30, V30 ++ VXORV V15, V31, V31 ++ ++ VMOVQ V28, 192(R4) ++ VMOVQ V29, 208(R4) ++ VMOVQ V30, 224(R4) ++ VMOVQ V31, 240(R4) ++ ++ ADD $4, R12, R12 ++ MOVW R12, (R9) // update counter ++ ++ ADDV $256, R4, R4 ++ ADDV $256, R5, R5 ++ SUBV $256, R6, R6 ++ BNE R6, R0, loop ++ ++ RET +diff --git a/src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go b/src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go +index c709b72847..3853cc0e0b 100644 +--- a/src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go ++++ b/src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. 
+ +-//go:build (!arm64 && !s390x && !ppc64 && !ppc64le) || !gc || purego ++//go:build (!arm64 && !loong64 && !s390x && !ppc64 && !ppc64le) || !gc || purego + + package chacha20 + +-- +2.38.1 + diff --git a/0036-internal-bytealg-optimize-Count-String-in-loong64.patch b/0036-internal-bytealg-optimize-Count-String-in-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..3a513c22d1714506707ce1b4ebb7eeff0af434cd --- /dev/null +++ b/0036-internal-bytealg-optimize-Count-String-in-loong64.patch @@ -0,0 +1,268 @@ +From 1698704d825764d2cbdbbf2718c582cf45d66fb0 Mon Sep 17 00:00:00 2001 +From: Guoqi Chen +Date: Tue, 10 Dec 2024 21:06:28 +0800 +Subject: [PATCH 36/44] internal/bytealg: optimize Count{,String} in loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Benchmark on Loongson 3A6000 and 3A5000: + +goos: linux +goarch: loong64 +pkg: bytes +cpu: Loongson-3A6000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +CountSingle/10 12.81n ± 0% 10.74n ± 0% -16.16% (p=0.000 n=10) +CountSingle/32 33.135n ± 0% 8.007n ± 0% -75.84% (p=0.000 n=10) +CountSingle/4K 4057.0n ± 0% 207.5n ± 0% -94.89% (p=0.000 n=10) +CountSingle/4M 4161.7µ ± 0% 217.1µ ± 0% -94.78% (p=0.000 n=10) +CountSingle/64M 68.722m ± 0% 3.717m ± 11% -94.59% (p=0.000 n=10) +geomean 13.76µ 1.705µ -87.61% + + | bench.old | bench.new | + | B/s | B/s vs base | +CountSingle/10 744.4Mi ± 0% 887.8Mi ± 0% +19.26% (p=0.000 n=10) +CountSingle/32 921.0Mi ± 0% 3811.5Mi ± 0% +313.84% (p=0.000 n=10) +CountSingle/4K 962.7Mi ± 0% 18825.3Mi ± 0% +1855.40% (p=0.000 n=10) +CountSingle/4M 961.2Mi ± 0% 18425.4Mi ± 0% +1817.02% (p=0.000 n=10) +CountSingle/64M 931.3Mi ± 0% 17216.0Mi ± 10% +1748.62% (p=0.000 n=10) +geomean 900.1Mi 7.092Gi +706.88% + +goos: linux +goarch: loong64 +pkg: bytes +cpu: Loongson-3A5000-HV @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +CountSingle/10 14.03n ± 1% 14.82n ± 0% +5.67% (p=0.000 n=10) +CountSingle/32 36.23n ± 0% 11.61n ± 0% -67.95% (p=0.000 n=10) +CountSingle/4K 4367.0n ± 0% 323.5n ± 0% -92.59% (p=0.000 n=10) +CountSingle/4M 4538.6µ ± 0% 381.2µ ± 0% -91.60% (p=0.000 n=10) +CountSingle/64M 76.575m ± 22% 7.971m ± 0% -89.59% (p=0.000 n=10) +geomean 15.05µ 2.790µ -81.46% + + | bench.old | bench.new | + | B/s | B/s vs base | +CountSingle/10 680.0Mi ± 1% 643.7Mi ± 0% -5.34% (p=0.000 n=10) +CountSingle/32 842.2Mi ± 0% 2628.4Mi ± 0% +212.07% (p=0.000 n=10) +CountSingle/4K 894.5Mi ± 0% 12075.4Mi ± 0% +1249.95% (p=0.000 n=10) +CountSingle/4M 881.3Mi ± 0% 10492.9Mi ± 0% +1090.57% (p=0.000 n=10) +CountSingle/64M 835.8Mi ± 18% 8028.7Mi ± 0% +860.61% (p=0.000 n=10) +geomean 822.9Mi 4.334Gi +439.27% + +Change-Id: I0a45139965b3e5eb09ab22be75145302f88a1915 +--- + src/internal/bytealg/bytealg.go | 3 + + src/internal/bytealg/count_loong64.s | 110 ++++++++++++++++++-------- + src/internal/cpu/cpu.go | 1 + + src/internal/cpu/cpu_loong64.go | 1 + + src/internal/cpu/cpu_loong64_hwcap.go | 2 + + 5 files changed, 85 insertions(+), 32 deletions(-) + +diff --git a/src/internal/bytealg/bytealg.go b/src/internal/bytealg/bytealg.go +index 6b79a2e1fa..a5f71ce342 100644 +--- a/src/internal/bytealg/bytealg.go ++++ b/src/internal/bytealg/bytealg.go +@@ -18,6 +18,9 @@ const ( + offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX) + + offsetPPC64HasPOWER9 = unsafe.Offsetof(cpu.PPC64.IsPOWER9) ++ ++ offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX) ++ offsetLOONG64HasLASX = unsafe.Offsetof(cpu.Loong64.HasLASX) + ) + + // MaxLen is the 
maximum length of the string to be searched for (argument b) in Index. +diff --git a/src/internal/bytealg/count_loong64.s b/src/internal/bytealg/count_loong64.s +index db8ba2cb24..5c9dfeb0eb 100644 +--- a/src/internal/bytealg/count_loong64.s ++++ b/src/internal/bytealg/count_loong64.s +@@ -25,17 +25,81 @@ TEXT ·CountString(SB),NOSPLIT,$0-32 + // R5 = s_len + // R6 = byte to count + TEXT countbody<>(SB),NOSPLIT,$0 +- MOVV R0, R7 // count +- ADDV R4, R5 // end ++ MOVV R0, R7 // count ++ ++ // short path to handle 0-byte case ++ BEQ R5, done ++ ++ // jump directly to tail length < 4 ++ MOVV $4, R8 ++ BLT R5, R8, tail ++ ++ // jump directly to genericCountBody if length < 16 ++ MOVV $16, R8 ++ BLT R5, R8, genericCountBody ++ ++ // jump directly to lsxCountBody if length < 64 ++ MOVV $64, R8 ++ BLT R5, R8, lsxCountBody ++lasxCountBody: ++ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R8 ++ BEQ R8, lsxCountBody ++ MOVV $32, R9 ++ XVMOVQ R6, X0.B32 ++ ++ PCALIGN $16 ++lasxLoop: ++ XVMOVQ (R4), X1 ++ XVSEQB X0, X1, X2 ++ XVANDB $1, X2, X2 ++ XVPCNTV X2, X3 ++ XVMOVQ X3.V[0], R8 ++ ADDV R8, R7 ++ XVMOVQ X3.V[1], R8 ++ ADDV R8, R7 ++ XVMOVQ X3.V[2], R8 ++ ADDV R8, R7 ++ XVMOVQ X3.V[3], R8 ++ ADDV R8, R7 ++ ADDV $-32, R5 ++ ADDV $32, R4 ++ BGE R5, R9, lasxLoop ++ ++lsxCountBody: ++ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R8 ++ BEQ R8, genericCountBody ++ // jump directly to genericCountBody if length < 16 ++ MOVV $16, R9 ++ BLT R5, R9, genericCountBody ++ VMOVQ R6, V0.B16 ++ ++ PCALIGN $16 ++lsxLoop: ++ VMOVQ (R4), V1 ++ VSEQB V0, V1, V2 ++ VANDB $1, V2, V2 ++ VPCNTV V2, V3 ++ VMOVQ V3.V[0], R8 ++ ADDV R8, R7 ++ VMOVQ V3.V[1], R8 ++ ADDV R8, R7 ++ ADDV $-16, R5 ++ ADDV $16, R4 ++ BGE R5, R9, lsxLoop ++ ++ // Work with genericCountBody shorter than 16 bytes ++genericCountBody: ++ MOVV $4, R9 + MOVV $1, R17 + +-loop: +- ADDV $8, R4, R9 ++ PCALIGN $16 ++genericLoop: + BLT R5, R9, tail +- MOVV (R4), R8 ++ ADDV $-4, R5 ++ MOVWU (R4)(R5), R8 + + AND $0xff, R8, R10 +- WORD $0xcf210b // bstrpick.w r11, r8, 15, 8 ++ BSTRPICKW $15, R8, $8, R11 + XOR R6, R10, R10 + XOR R6, R11, R11 + MASKNEZ R10, R17, R12 +@@ -43,8 +107,8 @@ loop: + ADDV R7, R12, R7 + ADDV R7, R13, R7 + +- WORD $0xd7410a // bstrpick.w r10, r8, 23, 16 +- WORD $0xdf610b // bstrpick.w r11, r8, 31, 24 ++ BSTRPICKW $23, R8, $16, R10 ++ BSTRPICKW $31, R8, $24, R11 + XOR R6, R10, R10 + XOR R6, R11, R11 + MASKNEZ R10, R17, R12 +@@ -52,35 +116,17 @@ loop: + ADDV R7, R12, R7 + ADDV R7, R13, R7 + +- WORD $0xe7810a // bstrpick.w r10, r8, 39, 32 +- WORD $0xefa10b // bstrpick.w r11, r8, 47, 40 +- XOR R6, R10, R10 +- XOR R6, R11, R11 +- MASKNEZ R10, R17, R12 +- MASKNEZ R11, R17, R13 +- ADDV R7, R12, R7 +- ADDV R7, R13, R7 +- +- WORD $0xf7c10a // bstrpick.w r10, r8, 55, 48 +- WORD $0xffe10b // bstrpick.w r11, r8, 63, 56 +- XOR R6, R10, R10 +- XOR R6, R11, R11 +- MASKNEZ R10, R17, R12 +- MASKNEZ R11, R17, R13 +- ADDV R7, R12, R7 +- ADDV R7, R13, R7 +- +- MOVV R9, R4 +- JMP loop ++ JMP genericLoop + ++ // Work with tail shorter than 4 bytes ++ PCALIGN $16 + tail: +- BEQ R4, R5, done +- MOVBU (R4), R8 +- ADDV $1, R4 ++ BEQ R5, done ++ ADDV $-1, R5 ++ MOVBU (R4)(R5), R8 + BNE R6, R8, tail + ADDV $1, R7 + JMP tail +- + done: + MOVV R7, R4 + RET +diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go +index cd3db10523..2443b31fc8 100644 +--- a/src/internal/cpu/cpu.go ++++ b/src/internal/cpu/cpu.go +@@ -83,6 +83,7 @@ var ARM64 struct { + var Loong64 struct { + _ CacheLinePad + HasLSX bool // support 128-bit vector extension ++ 
HasLASX bool // support 256-bit vector extension + HasCRC32 bool // support CRC instruction + HasLAMCAS bool // support AMCAS[_DB].{B/H/W/D} + HasLAM_BH bool // support AM{SWAP/ADD}[_DB].{B/H} instruction +diff --git a/src/internal/cpu/cpu_loong64.go b/src/internal/cpu/cpu_loong64.go +index 92583d0bca..9a58ea251c 100644 +--- a/src/internal/cpu/cpu_loong64.go ++++ b/src/internal/cpu/cpu_loong64.go +@@ -27,6 +27,7 @@ func get_cpucfg(reg uint32) uint32 + func doinit() { + options = []option{ + {Name: "lsx", Feature: &Loong64.HasLSX}, ++ {Name: "lasx", Feature: &Loong64.HasLASX}, + {Name: "crc32", Feature: &Loong64.HasCRC32}, + {Name: "lamcas", Feature: &Loong64.HasLAMCAS}, + {Name: "lam_bh", Feature: &Loong64.HasLAM_BH}, +diff --git a/src/internal/cpu/cpu_loong64_hwcap.go b/src/internal/cpu/cpu_loong64_hwcap.go +index 58397adae8..6c6b8a81f2 100644 +--- a/src/internal/cpu/cpu_loong64_hwcap.go ++++ b/src/internal/cpu/cpu_loong64_hwcap.go +@@ -13,12 +13,14 @@ var HWCap uint + // HWCAP bits. These are exposed by the Linux kernel. + const ( + hwcap_LOONGARCH_LSX = 1 << 4 ++ hwcap_LOONGARCH_LASX = 1 << 5 + ) + + func hwcapInit() { + // TODO: Features that require kernel support like LSX and LASX can + // be detected here once needed in std library or by the compiler. + Loong64.HasLSX = hwcIsSet(HWCap, hwcap_LOONGARCH_LSX) ++ Loong64.HasLASX = hwcIsSet(HWCap, hwcap_LOONGARCH_LASX) + } + + func hwcIsSet(hwc uint, val uint) bool { +-- +2.38.1 + diff --git a/0037-cmd-internal-obj-cmd-asm-reclassify-32-bit-immediate.patch b/0037-cmd-internal-obj-cmd-asm-reclassify-32-bit-immediate.patch new file mode 100644 index 0000000000000000000000000000000000000000..ccad194868f90528439f26426f941baa445ffaca --- /dev/null +++ b/0037-cmd-internal-obj-cmd-asm-reclassify-32-bit-immediate.patch @@ -0,0 +1,690 @@ +From a713105842cd7b88dbb573980731062c218a8310 Mon Sep 17 00:00:00 2001 +From: limeidan +Date: Mon, 16 Dec 2024 16:31:37 +0800 +Subject: [PATCH 37/44] cmd/internal/obj, cmd/asm: reclassify 32-bit immediate + value + +Change-Id: If9fd257ca0837a8c8597889c4f5ed3d4edc602c1 +--- + .../asm/internal/asm/testdata/loong64enc1.s | 4 +- + .../asm/internal/asm/testdata/loong64enc2.s | 2 +- + src/cmd/internal/obj/loong64/a.out.go | 31 +- + src/cmd/internal/obj/loong64/asm.go | 376 +++++++----------- + src/cmd/internal/obj/loong64/cnames.go | 25 +- + 5 files changed, 186 insertions(+), 252 deletions(-) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index 19070c89ef..b40d86e596 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -107,8 +107,8 @@ lable2: + MOVV $4(R4), R5 // 8510c002 + MOVW $-1, R4 // 04fcff02 + MOVV $-1, R4 // 04fcff02 +- MOVW $1, R4 // 0404c002 +- MOVV $1, R4 // 0404c002 ++ MOVW $1, R4 // 04048003 ++ MOVV $1, R4 // 04048003 + ADD $-1, R4, R5 // 85fcbf02 + ADD $-1, R4 // 84fcbf02 + ADDV $-1, R4, R5 // 85fcff02 +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc2.s b/src/cmd/asm/internal/asm/testdata/loong64enc2.s +index ee3bad74b1..91aed4e2c7 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc2.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc2.s +@@ -12,7 +12,7 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0 + AND $-1, R4, R5 // 1efcbf0285f81400 + AND $-1, R4 // 1efcbf0284f81400 + MOVW $-1, F4 // 1efcbf02c4a71401 +- MOVW $1, F4 // 1e048002c4a71401 ++ MOVW $1, F4 // 1e048003c4a71401 + TEQ $4, R4, R5 // 8508005c04002a00 + TEQ $4, R4 // 0408005c04002a00 + TNE $4, R4, R5 
// 8508005804002a00 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index 1fadbc648a..f2d4c41d68 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -325,19 +325,26 @@ const ( + C_XREG + C_ARNG // Vn. + C_ELEM // Vn.[index] ++ + C_ZCON +- C_SCON // 12 bit signed +- C_UCON // 32 bit signed, low 12 bits 0 +- +- // When the immediate value is SCON, it can choose either the ADDCON implementation +- // or the ANDCON implementation, using ADD0CON/AND0CON to distinguish them, so that +- // the program can choose the implementation with fewer instructions. +- C_ADD0CON +- C_AND0CON +- +- C_ADDCON // -0x800 <= v < 0 +- C_ANDCON // 0 < v <= 0xFFF +- C_LCON // other 32 ++ C_U1CON // 1 bit unsigned constant ++ C_U2CON // 2 bit unsigned constant ++ C_U3CON // 3 bit unsigned constant ++ C_U4CON // 4 bit unsigned constant ++ C_U5CON // 5 bit unsigned constant ++ C_U6CON // 6 bit unsigned constant ++ C_U7CON // 7 bit unsigned constant ++ C_U8CON // 8 bit unsigned constant ++ C_S5CON // 5 bit signed constant ++ C_US12CON // same as C_S12CON, increase the priority of C_S12CON in special cases. ++ C_UU12CON // same as C_U12CON, increase the priority of C_U12CON in special cases. ++ C_S12CON // 12 bit signed constant, -0x800 < v <= 0x7ff ++ C_U12CON // 12 bit unsigned constant, 0 < v <= 0xfff ++ C_12CON // 12 bit signed constant, or 12 bit unsigned constant ++ C_U15CON // 15 bit unsigned constant ++ C_15CON20_0 // 15 bit unsigned constant, low 12 bits 0 ++ C_32CON20_0 // 32 bit signed, low 12 bits 0 ++ C_32CON // other 32 bit signed + + // 64 bit signed, lo32 bits 0, hi20 bits are not 0, hi12 bits can + // be obtained by sign extension of the hi20 bits. +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 657d32ae81..2480cf9382 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -51,12 +51,6 @@ const ( + // branchLoopHead marks loop entry. + // Used to insert padding for under-aligned loops. 
+ branchLoopHead +- immFiledSi5 // The encoding of the immediate field in the instruction is 5-bits +- immFiledUi3 // The encoding of the immediate field in the instruction is 3-bits +- immFiledUi4 // The encoding of the immediate field in the instruction is 4-bits +- immFiledUi5 // The encoding of the immediate field in the instruction is 5-bits +- immFiledUi6 // The encoding of the immediate field in the instruction is 6-bits +- immFiledUi8 // The encoding of the immediate field in the instruction is 8-bits + ) + + var optab = []Optab{ +@@ -94,45 +88,41 @@ var optab = []Optab{ + {ACMPEQF, C_FREG, C_FREG, C_NONE, C_FCCREG, C_NONE, 2, 4, 0, 0}, + {AVSEQB, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, + {AXVSEQB, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, +- {AVSEQB, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 13, 4, 0, immFiledSi5}, +- {AXVSEQB, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 13, 4, 0, immFiledSi5}, +- {AVSEQB, C_ADDCON, C_VREG, C_NONE, C_VREG, C_NONE, 13, 4, 0, immFiledSi5}, +- {AXVSEQB, C_ADDCON, C_XREG, C_NONE, C_XREG, C_NONE, 13, 4, 0, immFiledSi5}, ++ {AVSEQB, C_S5CON, C_VREG, C_NONE, C_VREG, C_NONE, 13, 4, 0, 0}, ++ {AXVSEQB, C_S5CON, C_XREG, C_NONE, C_XREG, C_NONE, 13, 4, 0, 0}, + + {AVANDV, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, + {AXVANDV, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, +- {AVANDB, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 14, 4, 0, immFiledUi8}, +- {AXVANDB, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 14, 4, 0, immFiledUi8}, +- {AVANDB, C_ADDCON, C_VREG, C_NONE, C_VREG, C_NONE, 14, 4, 0, immFiledUi8}, +- {AXVANDB, C_ADDCON, C_XREG, C_NONE, C_XREG, C_NONE, 14, 4, 0, immFiledUi8}, ++ {AVANDB, C_U8CON, C_VREG, C_NONE, C_VREG, C_NONE, 14, 4, 0, 0}, ++ {AXVANDB, C_U8CON, C_XREG, C_NONE, C_XREG, C_NONE, 14, 4, 0, 0}, + + {AVSLLB, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, + {AXVSLLB, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, +- {AVSLLB, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 29, 4, 0, immFiledUi3}, +- {AXVSLLB, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 29, 4, 0, immFiledUi3}, +- {AVSLLB, C_SCON, C_NONE, C_NONE, C_VREG, C_NONE, 29, 4, 0, immFiledUi3}, +- {AXVSLLB, C_SCON, C_NONE, C_NONE, C_XREG, C_NONE, 29, 4, 0, immFiledUi3}, ++ {AVSLLB, C_U3CON, C_VREG, C_NONE, C_VREG, C_NONE, 29, 4, 0, 0}, ++ {AXVSLLB, C_U3CON, C_XREG, C_NONE, C_XREG, C_NONE, 29, 4, 0, 0}, ++ {AVSLLB, C_U3CON, C_NONE, C_NONE, C_VREG, C_NONE, 29, 4, 0, 0}, ++ {AXVSLLB, C_U3CON, C_NONE, C_NONE, C_XREG, C_NONE, 29, 4, 0, 0}, + + {AVSLLH, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, + {AXVSLLH, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, +- {AVSLLH, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 31, 4, 0, immFiledUi4}, +- {AXVSLLH, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 31, 4, 0, immFiledUi4}, +- {AVSLLH, C_SCON, C_NONE, C_NONE, C_VREG, C_NONE, 31, 4, 0, immFiledUi4}, +- {AXVSLLH, C_SCON, C_NONE, C_NONE, C_XREG, C_NONE, 31, 4, 0, immFiledUi4}, ++ {AVSLLH, C_U4CON, C_VREG, C_NONE, C_VREG, C_NONE, 31, 4, 0, 0}, ++ {AXVSLLH, C_U4CON, C_XREG, C_NONE, C_XREG, C_NONE, 31, 4, 0, 0}, ++ {AVSLLH, C_U4CON, C_NONE, C_NONE, C_VREG, C_NONE, 31, 4, 0, 0}, ++ {AXVSLLH, C_U4CON, C_NONE, C_NONE, C_XREG, C_NONE, 31, 4, 0, 0}, + + {AVSLLW, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, + {AXVSLLW, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, +- {AVSLLW, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 32, 4, 0, immFiledUi5}, +- {AXVSLLW, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 32, 4, 0, immFiledUi5}, +- {AVSLLW, C_SCON, C_NONE, C_NONE, C_VREG, C_NONE, 
32, 4, 0, immFiledUi5}, +- {AXVSLLW, C_SCON, C_NONE, C_NONE, C_XREG, C_NONE, 32, 4, 0, immFiledUi5}, ++ {AVSLLW, C_U5CON, C_VREG, C_NONE, C_VREG, C_NONE, 32, 4, 0, 0}, ++ {AXVSLLW, C_U5CON, C_XREG, C_NONE, C_XREG, C_NONE, 32, 4, 0, 0}, ++ {AVSLLW, C_U5CON, C_NONE, C_NONE, C_VREG, C_NONE, 32, 4, 0, 0}, ++ {AXVSLLW, C_U5CON, C_NONE, C_NONE, C_XREG, C_NONE, 32, 4, 0, 0}, + + {AVSLLV, C_VREG, C_VREG, C_NONE, C_VREG, C_NONE, 2, 4, 0, 0}, + {AXVSLLV, C_XREG, C_XREG, C_NONE, C_XREG, C_NONE, 2, 4, 0, 0}, +- {AVSLLV, C_SCON, C_VREG, C_NONE, C_VREG, C_NONE, 33, 4, 0, immFiledUi6}, +- {AXVSLLV, C_SCON, C_XREG, C_NONE, C_XREG, C_NONE, 33, 4, 0, immFiledUi6}, +- {AVSLLV, C_SCON, C_NONE, C_NONE, C_VREG, C_NONE, 33, 4, 0, immFiledUi6}, +- {AXVSLLV, C_SCON, C_NONE, C_NONE, C_XREG, C_NONE, 33, 4, 0, immFiledUi6}, ++ {AVSLLV, C_U6CON, C_VREG, C_NONE, C_VREG, C_NONE, 33, 4, 0, 0}, ++ {AXVSLLV, C_U6CON, C_XREG, C_NONE, C_XREG, C_NONE, 33, 4, 0, 0}, ++ {AVSLLV, C_U6CON, C_NONE, C_NONE, C_VREG, C_NONE, 33, 4, 0, 0}, ++ {AXVSLLV, C_U6CON, C_NONE, C_NONE, C_XREG, C_NONE, 33, 4, 0, 0}, + + {ACLOW, C_REG, C_NONE, C_NONE, C_REG, C_NONE, 9, 4, 0, 0}, + {AABSF, C_FREG, C_NONE, C_NONE, C_FREG, C_NONE, 9, 4, 0, 0}, +@@ -229,48 +219,46 @@ var optab = []Optab{ + + {AMOVW, C_LACON, C_NONE, C_NONE, C_REG, C_NONE, 26, 12, REGSP, 0}, + {AMOVV, C_LACON, C_NONE, C_NONE, C_REG, C_NONE, 26, 12, REGSP, 0}, +- {AMOVW, C_ADDCON, C_NONE, C_NONE, C_REG, C_NONE, 3, 4, REGZERO, 0}, +- {AMOVV, C_ADDCON, C_NONE, C_NONE, C_REG, C_NONE, 3, 4, REGZERO, 0}, +- {AMOVW, C_ANDCON, C_NONE, C_NONE, C_REG, C_NONE, 3, 4, REGZERO, 0}, +- {AMOVV, C_ANDCON, C_NONE, C_NONE, C_REG, C_NONE, 3, 4, REGZERO, 0}, +- +- {AMOVW, C_UCON, C_NONE, C_NONE, C_REG, C_NONE, 24, 4, 0, 0}, +- {AMOVV, C_UCON, C_NONE, C_NONE, C_REG, C_NONE, 24, 4, 0, 0}, +- {AMOVW, C_LCON, C_NONE, C_NONE, C_REG, C_NONE, 19, 8, 0, NOTUSETMP}, +- {AMOVV, C_LCON, C_NONE, C_NONE, C_REG, C_NONE, 19, 8, 0, NOTUSETMP}, ++ {AMOVW, C_12CON, C_NONE, C_NONE, C_REG, C_NONE, 3, 4, REGZERO, 0}, ++ {AMOVV, C_12CON, C_NONE, C_NONE, C_REG, C_NONE, 3, 4, REGZERO, 0}, ++ ++ {AMOVW, C_32CON20_0, C_NONE, C_NONE, C_REG, C_NONE, 24, 4, 0, 0}, ++ {AMOVV, C_32CON20_0, C_NONE, C_NONE, C_REG, C_NONE, 24, 4, 0, 0}, ++ {AMOVW, C_32CON, C_NONE, C_NONE, C_REG, C_NONE, 19, 8, 0, NOTUSETMP}, ++ {AMOVV, C_32CON, C_NONE, C_NONE, C_REG, C_NONE, 19, 8, 0, NOTUSETMP}, + {AMOVV, C_DCON12_0, C_NONE, C_NONE, C_REG, C_NONE, 67, 4, 0, NOTUSETMP}, + {AMOVV, C_DCON12_20S, C_NONE, C_NONE, C_REG, C_NONE, 68, 8, 0, NOTUSETMP}, + {AMOVV, C_DCON32_12S, C_NONE, C_NONE, C_REG, C_NONE, 69, 12, 0, NOTUSETMP}, + {AMOVV, C_DCON, C_NONE, C_NONE, C_REG, C_NONE, 59, 16, 0, NOTUSETMP}, + +- {AADD, C_ADD0CON, C_REG, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, +- {AADD, C_ADD0CON, C_NONE, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, +- {AADD, C_ANDCON, C_REG, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, +- {AADD, C_ANDCON, C_NONE, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, +- +- {AADDV, C_ADD0CON, C_REG, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, +- {AADDV, C_ADD0CON, C_NONE, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, +- {AADDV, C_ANDCON, C_REG, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, +- {AADDV, C_ANDCON, C_NONE, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, +- +- {AAND, C_AND0CON, C_REG, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, +- {AAND, C_AND0CON, C_NONE, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, +- {AAND, C_ADDCON, C_REG, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, +- {AAND, C_ADDCON, C_NONE, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, +- +- {AADD, C_UCON, C_REG, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, +- {AADD, C_UCON, C_NONE, 
C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, +- {AADDV, C_UCON, C_REG, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, +- {AADDV, C_UCON, C_NONE, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, +- {AAND, C_UCON, C_REG, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, +- {AAND, C_UCON, C_NONE, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, +- +- {AADD, C_LCON, C_NONE, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, +- {AADDV, C_LCON, C_NONE, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, +- {AAND, C_LCON, C_NONE, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, +- {AADD, C_LCON, C_REG, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, +- {AADDV, C_LCON, C_REG, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, +- {AAND, C_LCON, C_REG, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, ++ {AADD, C_US12CON, C_REG, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, ++ {AADD, C_US12CON, C_NONE, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, ++ {AADD, C_U12CON, C_REG, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, ++ {AADD, C_U12CON, C_NONE, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, ++ ++ {AADDV, C_US12CON, C_REG, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, ++ {AADDV, C_US12CON, C_NONE, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, ++ {AADDV, C_U12CON, C_REG, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, ++ {AADDV, C_U12CON, C_NONE, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, ++ ++ {AAND, C_UU12CON, C_REG, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, ++ {AAND, C_UU12CON, C_NONE, C_NONE, C_REG, C_NONE, 4, 4, 0, 0}, ++ {AAND, C_S12CON, C_REG, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, ++ {AAND, C_S12CON, C_NONE, C_NONE, C_REG, C_NONE, 10, 8, 0, 0}, ++ ++ {AADD, C_32CON20_0, C_REG, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, ++ {AADD, C_32CON20_0, C_NONE, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, ++ {AADDV, C_32CON20_0, C_REG, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, ++ {AADDV, C_32CON20_0, C_NONE, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, ++ {AAND, C_32CON20_0, C_REG, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, ++ {AAND, C_32CON20_0, C_NONE, C_NONE, C_REG, C_NONE, 25, 8, 0, 0}, ++ ++ {AADD, C_32CON, C_NONE, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, ++ {AADDV, C_32CON, C_NONE, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, ++ {AAND, C_32CON, C_NONE, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, ++ {AADD, C_32CON, C_REG, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, ++ {AADDV, C_32CON, C_REG, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, ++ {AAND, C_32CON, C_REG, C_NONE, C_REG, C_NONE, 23, 12, 0, 0}, + + {AADDV, C_DCON, C_NONE, C_NONE, C_REG, C_NONE, 60, 20, 0, 0}, + {AADDV, C_DCON, C_REG, C_NONE, C_REG, C_NONE, 60, 20, 0, 0}, +@@ -289,18 +277,18 @@ var optab = []Optab{ + {AAND, C_DCON32_12S, C_NONE, C_NONE, C_REG, C_NONE, 72, 16, 0, 0}, + {AAND, C_DCON32_12S, C_REG, C_NONE, C_REG, C_NONE, 72, 16, 0, 0}, + +- {ASLL, C_SCON, C_REG, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, +- {ASLL, C_SCON, C_NONE, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, ++ {ASLL, C_U5CON, C_REG, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, ++ {ASLL, C_U5CON, C_NONE, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, + +- {ASLLV, C_SCON, C_REG, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, +- {ASLLV, C_SCON, C_NONE, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, ++ {ASLLV, C_U6CON, C_REG, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, ++ {ASLLV, C_U6CON, C_NONE, C_NONE, C_REG, C_NONE, 16, 4, 0, 0}, + +- {ABSTRPICKW, C_SCON, C_REG, C_SCON, C_REG, C_NONE, 17, 4, 0, 0}, +- {ABSTRPICKW, C_SCON, C_REG, C_ZCON, C_REG, C_NONE, 17, 4, 0, 0}, ++ {ABSTRPICKW, C_U6CON, C_REG, C_U6CON, C_REG, C_NONE, 17, 4, 0, 0}, ++ {ABSTRPICKW, C_U6CON, C_REG, C_ZCON, C_REG, C_NONE, 17, 4, 0, 0}, + {ABSTRPICKW, C_ZCON, C_REG, C_ZCON, C_REG, C_NONE, 17, 4, 0, 0}, + + {ASYSCALL, C_NONE, C_NONE, C_NONE, C_NONE, C_NONE, 5, 4, 0, 0}, +- {ASYSCALL, C_ANDCON, C_NONE, C_NONE, C_NONE, C_NONE, 5, 
4, 0, 0}, ++ {ASYSCALL, C_U15CON, C_NONE, C_NONE, C_NONE, C_NONE, 5, 4, 0, 0}, + + {ABEQ, C_REG, C_REG, C_NONE, C_BRAN, C_NONE, 6, 4, 0, 0}, + {ABEQ, C_REG, C_NONE, C_NONE, C_BRAN, C_NONE, 6, 4, 0, 0}, +@@ -348,8 +336,7 @@ var optab = []Optab{ + {AMOVV, C_FREG, C_NONE, C_NONE, C_FCCREG, C_NONE, 30, 4, 0, 0}, + {AMOVV, C_FCCREG, C_NONE, C_NONE, C_FREG, C_NONE, 30, 4, 0, 0}, + +- {AMOVW, C_ADDCON, C_NONE, C_NONE, C_FREG, C_NONE, 34, 8, 0, 0}, +- {AMOVW, C_ANDCON, C_NONE, C_NONE, C_FREG, C_NONE, 34, 8, 0, 0}, ++ {AMOVW, C_12CON, C_NONE, C_NONE, C_FREG, C_NONE, 34, 8, 0, 0}, + + {AMOVB, C_REG, C_NONE, C_NONE, C_TLS_IE, C_NONE, 56, 16, 0, 0}, + {AMOVW, C_REG, C_NONE, C_NONE, C_TLS_IE, C_NONE, 56, 16, 0, 0}, +@@ -363,13 +350,13 @@ var optab = []Optab{ + {AMOVBU, C_TLS_IE, C_NONE, C_NONE, C_REG, C_NONE, 57, 16, 0, 0}, + {AMOVWU, C_TLS_IE, C_NONE, C_NONE, C_REG, C_NONE, 57, 16, 0, 0}, + +- {AWORD, C_LCON, C_NONE, C_NONE, C_NONE, C_NONE, 38, 4, 0, 0}, ++ {AWORD, C_32CON, C_NONE, C_NONE, C_NONE, C_NONE, 38, 4, 0, 0}, + {AWORD, C_DCON, C_NONE, C_NONE, C_NONE, C_NONE, 61, 4, 0, 0}, + + {AMOVV, C_GOTADDR, C_NONE, C_NONE, C_REG, C_NONE, 65, 8, 0, 0}, + +- {ATEQ, C_SCON, C_REG, C_NONE, C_REG, C_NONE, 15, 8, 0, 0}, +- {ATEQ, C_SCON, C_NONE, C_NONE, C_REG, C_NONE, 15, 8, 0, 0}, ++ {ATEQ, C_US12CON, C_REG, C_NONE, C_REG, C_NONE, 15, 8, 0, 0}, ++ {ATEQ, C_US12CON, C_NONE, C_NONE, C_REG, C_NONE, 15, 8, 0, 0}, + + {ARDTIMELW, C_NONE, C_NONE, C_NONE, C_REG, C_REG, 62, 4, 0, 0}, + {AAMSWAPW, C_REG, C_NONE, C_NONE, C_ZOREG, C_REG, 66, 4, 0, 0}, +@@ -409,12 +396,12 @@ var optab = []Optab{ + + {AVMOVQ, C_ELEM, C_NONE, C_NONE, C_ARNG, C_NONE, 45, 4, 0, 0}, + +- {obj.APCALIGN, C_SCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, +- {obj.APCDATA, C_LCON, C_NONE, C_NONE, C_LCON, C_NONE, 0, 0, 0, 0}, ++ {obj.APCALIGN, C_U12CON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, ++ {obj.APCDATA, C_32CON, C_NONE, C_NONE, C_32CON, C_NONE, 0, 0, 0, 0}, + {obj.APCDATA, C_DCON, C_NONE, C_NONE, C_DCON, C_NONE, 0, 0, 0, 0}, +- {obj.AFUNCDATA, C_SCON, C_NONE, C_NONE, C_ADDR, C_NONE, 0, 0, 0, 0}, ++ {obj.AFUNCDATA, C_U12CON, C_NONE, C_NONE, C_ADDR, C_NONE, 0, 0, 0, 0}, + {obj.ANOP, C_NONE, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, +- {obj.ANOP, C_LCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, // nop variants, see #40689 ++ {obj.ANOP, C_32CON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, // nop variants, see #40689 + {obj.ANOP, C_DCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, // nop variants, see #40689 + {obj.ANOP, C_REG, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, + {obj.ANOP, C_FREG, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0}, +@@ -857,34 +844,35 @@ func (c *ctxt0) aclass(a *obj.Addr) int { + } + + if c.instoffset >= 0 { +- if c.instoffset == 0 { +- return C_ZCON +- } +- if c.instoffset <= 0x7ff { +- return C_SCON +- } +- if c.instoffset <= 0xfff { +- return C_ANDCON +- } +- if c.instoffset&0xfff == 0 && isuint32(uint64(c.instoffset)) { // && ((instoffset & (1<<31)) == 0) +- return C_UCON ++ sbits := bits.Len64(uint64(c.instoffset)) ++ switch { ++ case sbits <=8: ++ return C_ZCON + sbits ++ case sbits <= 12: ++ if c.instoffset <= 0x7ff { ++ return C_US12CON ++ } ++ return C_U12CON ++ case sbits <= 15: ++ if c.instoffset & 0xfff == 0 { ++ return C_15CON20_0 ++ } ++ return C_U15CON + } +- if isint32(c.instoffset) || isuint32(uint64(c.instoffset)) { +- return C_LCON ++ } else { ++ sbits := bits.Len64(uint64(^c.instoffset)) ++ switch { ++ case sbits < 5: ++ return C_S5CON ++ case sbits < 12: ++ return C_S12CON + } +- return 
C_LCON + } + +- if c.instoffset >= -0x800 { +- return C_ADDCON ++ if c.instoffset&0xfff == 0 { ++ return C_32CON20_0 + } +- if c.instoffset&0xfff == 0 && isint32(c.instoffset) { +- return C_UCON +- } +- if isint32(c.instoffset) { +- return C_LCON +- } +- return C_LCON ++ return C_32CON + + case obj.TYPE_BRANCH: + return C_BRAN +@@ -1130,10 +1118,11 @@ func (c *ctxt0) oplook(p *obj.Prog) *Optab { + + ops := oprange[p.As&obj.AMask] + c1 := &xcmp[a1] ++ c3 := &xcmp[a3] + c4 := &xcmp[a4] + for i := range ops { + op := &ops[i] +- if (int(op.reg) == a2) && int(op.from3) == a3 && c1[op.from1] && c4[op.to1] && (int(op.to2) == a5) { ++ if (int(op.reg) == a2) && c3[op.from3] && c1[op.from1] && c4[op.to1] && (int(op.to2) == a5) { + p.Optab = uint16(cap(optab) - cap(ops) + i + 1) + return op + } +@@ -1151,21 +1140,41 @@ func cmp(a int, b int) bool { + } + switch a { + case C_DCON: +- if b == C_LCON || b == C_DCON32_0 || +- b == C_DCON12_0 || b == C_DCON20S_0 || +- b == C_DCON12_20S || b == C_DCON12_12S || +- b == C_DCON20S_20 || b == C_DCON32_20 || +- b == C_DCON20S_12S || b == C_DCON32_12S || +- b == C_DCON12_32S || b == C_DCON20S_32 || +- b == C_DCON12_12U || b == C_DCON20S_12U || +- b == C_DCON32_12U { +- return true +- } +- fallthrough +- case C_LCON: +- if b == C_ZCON || b == C_SCON || b == C_UCON || b == C_ADDCON || b == C_ANDCON { +- return true +- } ++ return cmp(C_32CON, b) || cmp(C_DCON12_20S, b) || cmp(C_DCON32_12S, b) || b == C_DCON12_0 ++ case C_32CON: ++ return cmp(C_32CON20_0, b) || cmp(C_U15CON, b) || cmp(C_S12CON, b) ++ case C_32CON20_0: ++ return b == C_15CON20_0 || b == C_ZCON ++ case C_U15CON: ++ return cmp(C_U12CON, b) || b == C_15CON20_0 ++ case C_12CON: ++ return cmp(C_U12CON, b) || cmp(C_S12CON, b) ++ case C_UU12CON: ++ return cmp(C_U12CON, b) ++ case C_U12CON: ++ return cmp(C_U8CON, b) || b == C_US12CON ++ case C_U8CON: ++ return cmp(C_U7CON, b) ++ case C_U7CON: ++ return cmp(C_U6CON, b) ++ case C_U6CON: ++ return cmp(C_U5CON, b) ++ case C_U5CON: ++ return cmp(C_U4CON, b) ++ case C_U4CON: ++ return cmp(C_U3CON, b) ++ case C_U3CON: ++ return cmp(C_U2CON, b) ++ case C_U2CON: ++ return cmp(C_U1CON, b) ++ case C_U1CON: ++ return cmp(C_ZCON, b) ++ case C_US12CON: ++ return cmp(C_S12CON, b) ++ case C_S12CON: ++ return cmp(C_S5CON, b) || cmp(C_U8CON, b) || b == C_US12CON ++ case C_S5CON: ++ return cmp(C_ZCON, b) || cmp(C_U4CON, b) + + case C_DCON12_0: + +@@ -1183,62 +1192,20 @@ func cmp(a int, b int) bool { + return true + } + +- case C_ADD0CON: +- if b == C_ADDCON { +- return true +- } +- fallthrough +- +- case C_ADDCON: +- if b == C_ZCON || b == C_SCON { +- return true +- } +- +- case C_AND0CON: +- if b == C_ANDCON { +- return true +- } +- fallthrough +- +- case C_ANDCON: +- if b == C_ZCON || b == C_SCON { +- return true +- } +- +- case C_UCON: +- if b == C_ZCON { +- return true +- } +- +- case C_SCON: +- if b == C_ZCON { +- return true +- } +- + case C_LACON: +- if b == C_SACON { +- return true +- } ++ return b == C_SACON + + case C_LAUTO: +- if b == C_SAUTO { +- return true +- } ++ return b == C_SAUTO + + case C_REG: +- if b == C_ZCON { +- return true +- } ++ return b == C_ZCON + + case C_LOREG: +- if b == C_ZOREG || b == C_SOREG { +- return true +- } ++ return b == C_ZOREG || b == C_SOREG + + case C_SOREG: +- if b == C_ZOREG { +- return true +- } ++ return b == C_ZOREG + } + + return false +@@ -1881,7 +1848,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + r = int(o.param) + } + a := add +- if o.from1 == C_ANDCON { ++ if o.from1 == C_12CON && v > 0 { + a = AOR 
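++ // ORI is used here because it zero-extends its 12-bit immediate,
++ // while ADD would sign-extend it: a positive constant in the range
++ // 0x800-0xfff must not be materialized as a negative value.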
+ } + +@@ -2008,15 +1975,9 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + if r == 0 { + r = int(p.To.Reg) + } +- +- switch o.flag { +- case immFiledSi5: +- c.checkimmFiled(p, v, 5, true) +- o1 = OP_5IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) +- default: +- c.ctxt.Diag("Invalid immediate value type\n%v", p) +- } +- ++ ++ o1 = OP_5IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) ++ + case 14: // add $ui8,[r1],r2 + v := c.regoff(&p.From) + r := int(p.Reg) +@@ -2024,13 +1985,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + r = int(p.To.Reg) + } + +- switch o.flag { +- case immFiledUi8: +- c.checkimmFiled(p, v, 8, false) +- o1 = OP_8IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) +- default: +- c.ctxt.Diag("Invalid immediate value type\n%v", p) +- } ++ o1 = OP_8IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) + + case 15: // teq $c r,r + v := c.regoff(&p.From) +@@ -2185,13 +2140,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + r = int(p.To.Reg) + } + +- switch o.flag { +- case immFiledUi3: +- c.checkimmFiled(p, v, 3, false) +- o1 = OP_3IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) +- default: +- c.ctxt.Diag("Invalid immediate value type\n%v", p) +- } ++ o1 = OP_3IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) + + case 30: // mov gr/fr/fcc/fcsr, fr/fcc/fcsr/gr + a := c.specialFpMovInst(p.As, oclass(&p.From), oclass(&p.To)) +@@ -2204,13 +2153,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + r = int(p.To.Reg) + } + +- switch o.flag { +- case immFiledUi4: +- c.checkimmFiled(p, v, 4, false) +- o1 = OP_4IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) +- default: +- c.ctxt.Diag("Invalid immediate value type\n%v", p) +- } ++ o1 = OP_4IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) + + case 32: // add $ui5,[r1],r2 + v := c.regoff(&p.From) +@@ -2219,13 +2162,7 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + r = int(p.To.Reg) + } + +- switch o.flag { +- case immFiledUi5: +- c.checkimmFiled(p, v, 5, false) +- o1 = OP_5IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) +- default: +- c.ctxt.Diag("Invalid immediate value type\n%v", p) +- } ++ o1 = OP_5IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) + + case 33: // add $ui6,[r1],r2 + v := c.regoff(&p.From) +@@ -2234,18 +2171,12 @@ func (c *ctxt0) asmout(p *obj.Prog, o *Optab, out []uint32) { + r = int(p.To.Reg) + } + +- switch o.flag { +- case immFiledUi6: +- c.checkimmFiled(p, v, 6, false) +- o1 = OP_6IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) +- default: +- c.ctxt.Diag("Invalid immediate value type\n%v", p) +- } ++ o1 = OP_6IRR(c.opirr(p.As), uint32(v), uint32(r), uint32(p.To.Reg)) + + case 34: // mov $con,fr + v := c.regoff(&p.From) + a := AADDU +- if o.from1 == C_ANDCON { ++ if v > 0 { + a = AOR + } + a2 := c.specialFpMovInst(p.As, C_REG, oclass(&p.To)) +@@ -2702,21 +2633,6 @@ func (c *ctxt0) checkindex(p *obj.Prog, index uint32, mask uint32) { + } + } + +-// checkimmFiled checks whether the immediate value exceeds the valid encoding range +-func (c *ctxt0) checkimmFiled(p *obj.Prog, imm int32, bits uint8, isSigned bool) { +- if isSigned { +- bound := int32(1 << (bits - 1)) +- if imm < -bound || imm > bound { +- c.ctxt.Diag("signed immediate %v exceeds the %d-bit range: %v", imm, bits, p) +- } +- } else { +- mask := uint32(0xffffffff) << bits +- if uint32(imm) != (uint32(imm) & ^mask) { +- c.ctxt.Diag("unsigned immediate %v exceeds 
the %d-bit range: %v", imm, bits, p) +- } +- } +-} +- + func (c *ctxt0) vregoff(a *obj.Addr) int64 { + c.instoffset = 0 + c.aclass(a) +diff --git a/src/cmd/internal/obj/loong64/cnames.go b/src/cmd/internal/obj/loong64/cnames.go +index a2f04a22ee..1d38f1ee36 100644 +--- a/src/cmd/internal/obj/loong64/cnames.go ++++ b/src/cmd/internal/obj/loong64/cnames.go +@@ -14,13 +14,24 @@ var cnames0 = []string{ + "ARNG", + "ELEM", + "ZCON", +- "SCON", +- "UCON", +- "ADD0CON", +- "AND0CON", +- "ADDCON", +- "ANDCON", +- "LCON", ++ "U1CON", ++ "U2CON", ++ "U3CON", ++ "U4CON", ++ "U5CON", ++ "U6CON", ++ "U7CON", ++ "U8CON", ++ "S5CON", ++ "US12CON", ++ "UU12CON", ++ "S12CON", ++ "U12CON", ++ "12CON", ++ "U15CON", ++ "15CON20_0", ++ "32CON20_0", ++ "32CON", + "DCON20S_0", + "DCON12_0", + "DCON32_0", +-- +2.38.1 + diff --git a/0038-crypto-internal-poly1305-implement-function-update-i.patch b/0038-crypto-internal-poly1305-implement-function-update-i.patch new file mode 100644 index 0000000000000000000000000000000000000000..e18caf23e121c1bd51c48e07b0a441cfffe140d6 --- /dev/null +++ b/0038-crypto-internal-poly1305-implement-function-update-i.patch @@ -0,0 +1,298 @@ +From 9e01e315f3ea08fc01854bf8beb2cdeb9ff6dddc Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Thu, 19 Dec 2024 15:38:48 +0800 +Subject: [PATCH 38/44] crypto/internal/poly1305: implement function update in + assembly on loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +There is some improvement in performance on Loongson 3A5000 and 3A6000. + +goos: linux +goarch: loong64 +pkg: golang.org/x/crypto/internal/poly1305 +cpu: Loongson-3A5000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +64 122.8n ± 0% 101.2n ± 0% -17.59% (p=0.000 n=10) +1K 1152.0n ± 0% 779.4n ± 0% -32.34% (p=0.000 n=10) +2M 2.356m ± 0% 1.556m ± 0% -33.94% (p=0.000 n=10) +64Unaligned 122.7n ± 0% 102.5n ± 0% -16.46% (p=0.000 n=10) +1KUnaligned 1152.0n ± 0% 802.4n ± 0% -30.35% (p=0.000 n=10) +2MUnaligned 2.336m ± 0% 1.582m ± 0% -32.26% (p=0.000 n=10) +Write64 77.92n ± 0% 57.45n ± 0% -26.27% (p=0.000 n=10) +Write1K 1106.0n ± 0% 736.2n ± 0% -33.44% (p=0.000 n=10) +Write2M 2.356m ± 0% 1.562m ± 0% -33.69% (p=0.000 n=10) +Write64Unaligned 77.87n ± 0% 59.71n ± 0% -23.33% (p=0.000 n=10) +Write1KUnaligned 1106.0n ± 0% 749.5n ± 0% -32.23% (p=0.000 n=10) +Write2MUnaligned 2.335m ± 0% 1.580m ± 0% -32.34% (p=0.000 n=10) +geomean 6.373µ 4.530µ -28.93% + + | bench.old | bench.new | + | B/s | B/s vs base | +64 497.1Mi ± 0% 603.3Mi ± 0% +21.37% (p=0.000 n=10) +1K 847.6Mi ± 0% 1252.9Mi ± 0% +47.82% (p=0.000 n=10) +2M 849.0Mi ± 0% 1285.3Mi ± 0% +51.39% (p=0.000 n=10) +64Unaligned 497.4Mi ± 0% 595.5Mi ± 0% +19.73% (p=0.000 n=10) +1KUnaligned 847.6Mi ± 0% 1217.1Mi ± 0% +43.59% (p=0.000 n=10) +2MUnaligned 856.3Mi ± 0% 1264.0Mi ± 0% +47.61% (p=0.000 n=10) +Write64 783.3Mi ± 0% 1062.4Mi ± 0% +35.64% (p=0.000 n=10) +Write1K 882.8Mi ± 0% 1326.5Mi ± 0% +50.25% (p=0.000 n=10) +Write2M 849.0Mi ± 0% 1280.3Mi ± 0% +50.80% (p=0.000 n=10) +Write64Unaligned 783.8Mi ± 0% 1022.3Mi ± 0% +30.43% (p=0.000 n=10) +Write1KUnaligned 882.8Mi ± 0% 1303.0Mi ± 0% +47.59% (p=0.000 n=10) +Write2MUnaligned 856.5Mi ± 0% 1266.0Mi ± 0% +47.81% (p=0.000 n=10) +geomean 772.2Mi 1.061Gi +40.72% + +goos: linux +goarch: loong64 +pkg: golang.org/x/crypto/internal/poly1305 +cpu: Loongson-3A6000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +64 92.06n ± 0% 77.56n ± 0% -15.75% (p=0.000 n=10) +1K 998.4n ± 0% 683.0n ± 0% -31.59% (p=0.000 n=10) +2M 
1.978m ± 0% 1.323m ± 0% -33.11% (p=0.000 n=10) +64Unaligned 92.06n ± 0% 77.56n ± 0% -15.75% (p=0.000 n=10) +1KUnaligned 998.4n ± 0% 683.0n ± 0% -31.59% (p=0.000 n=10) +2MUnaligned 1.979m ± 0% 1.369m ± 0% -30.82% (p=0.000 n=10) +Write64 65.25n ± 0% 50.39n ± 0% -22.77% (p=0.000 n=10) +Write1K 970.7n ± 0% 656.8n ± 0% -32.34% (p=0.000 n=10) +Write2M 1.966m ± 0% 1.323m ± 0% -32.73% (p=0.000 n=10) +Write64Unaligned 65.24n ± 0% 50.37n ± 0% -22.79% (p=0.000 n=10) +Write1KUnaligned 970.8n ± 0% 656.8n ± 0% -32.34% (p=0.000 n=10) +Write2MUnaligned 1.966m ± 0% 1.368m ± 0% -30.42% (p=0.000 n=10) +geomean 5.319µ 3.834µ -27.93% + + | bench.old | bench.new | + | B/s | B/s vs base | +64 663.0Mi ± 0% 786.9Mi ± 0% +18.69% (p=0.000 n=10) +1K 978.1Mi ± 0% 1429.8Mi ± 0% +46.18% (p=0.000 n=10) +2M 1011.0Mi ± 0% 1511.4Mi ± 0% +49.50% (p=0.000 n=10) +64Unaligned 663.0Mi ± 0% 786.9Mi ± 0% +18.69% (p=0.000 n=10) +1KUnaligned 978.1Mi ± 0% 1429.8Mi ± 0% +46.18% (p=0.000 n=10) +2MUnaligned 1010.6Mi ± 0% 1460.9Mi ± 0% +44.56% (p=0.000 n=10) +Write64 935.4Mi ± 0% 1211.3Mi ± 0% +29.49% (p=0.000 n=10) +Write1K 1006.0Mi ± 0% 1486.9Mi ± 0% +47.81% (p=0.000 n=10) +Write2M 1017.3Mi ± 0% 1512.1Mi ± 0% +48.64% (p=0.000 n=10) +Write64Unaligned 935.5Mi ± 0% 1211.7Mi ± 0% +29.53% (p=0.000 n=10) +Write1KUnaligned 1005.9Mi ± 0% 1486.9Mi ± 0% +47.81% (p=0.000 n=10) +Write2MUnaligned 1017.1Mi ± 0% 1461.8Mi ± 0% +43.71% (p=0.000 n=10) +geomean 925.3Mi 1.254Gi +38.75% + +Change-Id: Iec990384a7be9a89a019c2b3b546d9fc59a2d58e +--- + .../x/crypto/internal/poly1305/mac_noasm.go | 2 +- + .../x/crypto/internal/poly1305/sum_loong64.go | 47 +++++++ + .../x/crypto/internal/poly1305/sum_loong64.s | 131 ++++++++++++++++++ + 3 files changed, 179 insertions(+), 1 deletion(-) + create mode 100644 src/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.go + create mode 100644 src/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.s + +diff --git a/src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go b/src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go +index bd896bdc76..8d99551fee 100644 +--- a/src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go ++++ b/src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build (!amd64 && !ppc64le && !ppc64 && !s390x) || !gc || purego ++//go:build (!amd64 && !loong64 && !ppc64le && !ppc64 && !s390x) || !gc || purego + + package poly1305 + +diff --git a/src/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.go b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.go +new file mode 100644 +index 0000000000..d4dc8f91ec +--- /dev/null ++++ b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.go +@@ -0,0 +1,47 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++//go:build gc && !purego ++ ++package poly1305 ++ ++//go:noescape ++func update(state *macState, msg []byte) ++ ++// mac is a wrapper for macGeneric that redirects calls that would have gone to ++// updateGeneric to update. ++// ++// Its Write and Sum methods are otherwise identical to the macGeneric ones, but ++// using function pointers would carry a major performance cost. 
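++//
++// As a rough sketch of the resulting fast path: a Write of 100 bytes on an
++// empty buffer hands the first 96 bytes (the largest multiple of TagSize)
++// to the assembly update in a single call, and buffers the remaining 4
++// bytes for the next Write or for Sum.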
++type mac struct{ macGeneric } ++ ++func (h *mac) Write(p []byte) (int, error) { ++ nn := len(p) ++ if h.offset > 0 { ++ n := copy(h.buffer[h.offset:], p) ++ if h.offset+n < TagSize { ++ h.offset += n ++ return nn, nil ++ } ++ p = p[n:] ++ h.offset = 0 ++ update(&h.macState, h.buffer[:]) ++ } ++ if n := len(p) - (len(p) % TagSize); n > 0 { ++ update(&h.macState, p[:n]) ++ p = p[n:] ++ } ++ if len(p) > 0 { ++ h.offset += copy(h.buffer[h.offset:], p) ++ } ++ return nn, nil ++} ++ ++func (h *mac) Sum(out *[16]byte) { ++ state := h.macState ++ if h.offset > 0 { ++ update(&state, h.buffer[:h.offset]) ++ } ++ finalize(out, &state.h, &state.s) ++} +diff --git a/src/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.s b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.s +new file mode 100644 +index 0000000000..baf0c95333 +--- /dev/null ++++ b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_loong64.s +@@ -0,0 +1,131 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. ++ ++//go:build gc && !purego ++ ++// func update(state *macState, msg []byte) ++TEXT ·update(SB), $0-32 ++ MOVV state+0(FP), R4 ++ MOVV msg_base+8(FP), R5 ++ MOVV msg_len+16(FP), R6 ++ MOVV $16, R7 ++ MOVV (R4), R8 // h0 ++ MOVV 8(R4), R9 // h1 ++ MOVV 16(R4), R10 // h2 ++ MOVV 24(R4), R11 // r0 ++ MOVV 32(R4), R12 // r1 ++ ++ BLT R6, R7, bytes_between_0_and_15 ++ ++loop: ++ MOVV (R5), R14 // msg[0:8] ++ MOVV 8(R5), R16 // msg[8:16] ++ ADDV R14, R8, R8 // h0 ++ ADDV R9, R16, R27 ++ SGTU R14, R8, R24 // h0.carry ++ SGTU R9, R27, R28 ++ ADDV R27, R24, R9 // h1 ++ SGTU R27, R9, R24 ++ OR R24, R28, R24 // h1.carry ++ ADDV $1, R24, R24 ++ ADDV R10, R24, R10 // h2 ++ ++ ADDV $16, R5, R5 // msg = msg[16:] ++ ++multiply: ++ MULV R8, R11, R13 // h0r0.lo ++ MULHVU R8, R11, R16 // h0r0.hi ++ MOVV R13, R14 ++ MOVV R16, R15 ++ MULV R9, R11, R13 // h1r0.lo ++ MULHVU R9, R11, R16 // h1r0.hi ++ ADDV R13, R15, R15 ++ SGTU R13, R15, R24 ++ ADDV R24, R16, R16 ++ MULV R10, R11, R25 ++ ADDV R16, R25, R25 ++ MULV R8, R12, R13 // h0r1.lo ++ MULHVU R8, R12, R16 // h0r1.hi ++ ADDV R13, R15, R15 ++ SGTU R13, R15, R24 ++ ADDV R24, R16, R16 ++ MOVV R16, R8 ++ MULV R10, R12, R26 // h2r1 ++ MULV R9, R12, R13 // h1r1.lo ++ MULHVU R9, R12, R16 // h1r1.hi ++ ADDV R13, R25, R25 ++ ADDV R16, R26, R27 ++ SGTU R13, R25, R24 ++ SGTU R16, R27, R28 ++ ADDV R27, R24, R26 ++ SGTU R27, R26, R24 ++ OR R24, R28, R24 ++ ADDV R8, R25, R25 ++ SGTU R8, R25, R24 ++ ADDV R24, R26, R26 ++ MOVV R14, R8 ++ MOVV R15, R9 ++ MOVV R25, R10 ++ MOVV R25, R14 ++ AND $3, R10, R10 ++ AND $-4, R14, R14 ++ ADDV R14, R8, R8 ++ ADDV R26, R9, R27 ++ SGTU R14, R8, R24 ++ SGTU R26, R27, R28 ++ ADDV R27, R24, R9 ++ SGTU R27, R9, R24 ++ OR R24, R28, R24 ++ ADDV R24, R10, R10 ++ SLLV $62, R26, R27 ++ SRLV $2, R25, R28 ++ SRLV $2, R26, R26 ++ OR R27, R28, R25 ++ ADDV R25, R8, R8 ++ ADDV R26, R9, R27 ++ SGTU R25, R8, R24 ++ SGTU R26, R27, R28 ++ ADDV R27, R24, R9 ++ SGTU R27, R9, R24 ++ OR R24, R28, R24 ++ ADDV R24, R10, R10 ++ ++ SUBV $16, R6, R6 ++ BGE R6, R7, loop ++ ++bytes_between_0_and_15: ++ BEQ R6, R0, done ++ MOVV $1, R14 ++ XOR R15, R15 ++ XOR R25, R25 ++ ADDV R6, R5, R5 ++ ++flush_buffer: ++ SRLV $56, R14, R24 ++ SLLV $8, R15, R28 ++ OR R24, R28, R15 ++ SLLV $8, R14, R14 ++ MOVBU -1(R5), R25 ++ XOR R25, R14, R14 ++ SUBV $1, R5, R5 ++ SUBV $1, R6, R6 ++ BNE R6, R0, flush_buffer ++ ++ ADDV R14, R8, R8 ++ SGTU R14, R8, R24 ++ ADDV R15, R9, R27 ++ SGTU R15, R27, 
R28 ++ ADDV R27, R24, R9 ++ SGTU R27, R9, R24 ++ OR R24, R28, R24 ++ ADDV R10, R24, R10 ++ ++ MOVV $16, R6 ++ JMP multiply ++ ++done: ++ MOVV R8, (R4) ++ MOVV R9, 8(R4) ++ MOVV R10, 16(R4) ++ RET +-- +2.38.1 + diff --git a/0039-runtime-optimize-the-implementation-of-memclrNoHeapP.patch b/0039-runtime-optimize-the-implementation-of-memclrNoHeapP.patch new file mode 100644 index 0000000000000000000000000000000000000000..289a1f5ac28f28b391aab13473136c3cdf82e036 --- /dev/null +++ b/0039-runtime-optimize-the-implementation-of-memclrNoHeapP.patch @@ -0,0 +1,374 @@ +From 0e94e34886a3632315e444c5fd0ba448239c500e Mon Sep 17 00:00:00 2001 +From: chenguoqi +Date: Tue, 31 Dec 2024 18:31:50 +0800 +Subject: [PATCH 39/44] runtime: optimize the implementation of + memclrNoHeapPointers on loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +goos: linux +goarch: loong64 +pkg: runtime +cpu: Loongson-3A6000 @ 2500.00MHz + │ bench.old │ bench.new │ + │ sec/op │ sec/op vs base │ +Memclr/5 2.456n ± 0% 3.202n ± 0% +30.37% (p=0.000 n=10) +Memclr/16 2.806n ± 0% 2.810n ± 1% +0.14% (p=0.002 n=10) +Memclr/64 5.053n ± 1% 5.045n ± 1% ~ (p=0.591 n=10) +Memclr/256 10.240n ± 0% 6.027n ± 0% -41.14% (p=0.000 n=10) +Memclr/4096 107.00n ± 0% 30.46n ± 0% -71.53% (p=0.000 n=10) +Memclr/65536 1676.0n ± 0% 431.3n ± 0% -74.26% (p=0.000 n=10) +Memclr/1M 52.52µ ± 0% 32.81µ ± 0% -37.54% (p=0.000 n=10) +Memclr/4M 210.0µ ± 0% 131.3µ ± 0% -37.48% (p=0.000 n=10) +Memclr/8M 420.0µ ± 0% 262.8µ ± 1% -37.43% (p=0.000 n=10) +Memclr/16M 846.7µ ± 0% 528.8µ ± 0% -37.55% (p=0.000 n=10) +Memclr/64M 3.388m ± 0% 2.180m ± 1% -35.66% (p=0.000 n=10) +MemclrUnaligned/0_5 4.382n ± 0% 4.006n ± 0% -8.59% (p=0.000 n=10) +MemclrUnaligned/0_16 4.600n ± 0% 4.204n ± 0% -8.60% (p=0.000 n=10) +MemclrUnaligned/0_64 5.604n ± 0% 5.005n ± 0% -10.69% (p=0.000 n=10) +MemclrUnaligned/0_256 10.340n ± 0% 6.808n ± 0% -34.16% (p=0.000 n=10) +MemclrUnaligned/0_4096 107.10n ± 0% 33.81n ± 0% -68.43% (p=0.000 n=10) +MemclrUnaligned/0_65536 1701.0n ± 0% 441.6n ± 0% -74.04% (p=0.000 n=10) +MemclrUnaligned/1_5 4.386n ± 0% 4.004n ± 0% -8.71% (p=0.000 n=10) +MemclrUnaligned/1_16 4.597n ± 0% 4.203n ± 0% -8.56% (p=0.000 n=10) +MemclrUnaligned/1_64 7.204n ± 0% 7.106n ± 0% -1.36% (p=0.000 n=10) +MemclrUnaligned/1_256 12.580n ± 0% 9.796n ± 0% -22.13% (p=0.000 n=10) +MemclrUnaligned/1_4096 115.60n ± 0% 38.63n ± 0% -66.58% (p=0.000 n=10) +MemclrUnaligned/1_65536 1709.0n ± 0% 446.5n ± 0% -73.87% (p=0.000 n=10) +MemclrUnaligned/4_5 4.386n ± 0% 4.005n ± 0% -8.69% (p=0.000 n=10) +MemclrUnaligned/4_16 4.597n ± 0% 4.203n ± 0% -8.57% (p=0.000 n=10) +MemclrUnaligned/4_64 7.204n ± 0% 7.104n ± 0% -1.39% (p=0.000 n=10) +MemclrUnaligned/4_256 12.58n ± 0% 10.66n ± 0% -15.22% (p=0.000 n=10) +MemclrUnaligned/4_4096 114.30n ± 0% 39.99n ± 0% -65.01% (p=0.000 n=10) +MemclrUnaligned/4_65536 1709.0n ± 0% 449.8n ± 0% -73.68% (p=0.000 n=10) +MemclrUnaligned/7_5 4.381n ± 0% 4.002n ± 0% -8.64% (p=0.000 n=10) +MemclrUnaligned/7_16 4.597n ± 0% 4.202n ± 0% -8.59% (p=0.000 n=10) +MemclrUnaligned/7_64 7.204n ± 0% 7.104n ± 0% -1.39% (p=0.000 n=10) +MemclrUnaligned/7_256 12.58n ± 0% 10.60n ± 0% -15.74% (p=0.000 n=10) +MemclrUnaligned/7_4096 115.50n ± 0% 39.75n ± 0% -65.58% (p=0.000 n=10) +MemclrUnaligned/7_65536 1709.0n ± 0% 447.1n ± 0% -73.84% (p=0.000 n=10) +MemclrUnaligned/0_1M 52.52µ ± 0% 32.80µ ± 0% -37.56% (p=0.000 n=10) +MemclrUnaligned/0_4M 210.0µ ± 0% 131.2µ ± 0% -37.53% (p=0.000 n=10) +MemclrUnaligned/0_8M 419.9µ ± 0% 262.5µ ± 0% -37.48% (p=0.000 n=10) 
+MemclrUnaligned/0_16M 845.0µ ± 0% 528.1µ ± 0% -37.51% (p=0.000 n=10) +MemclrUnaligned/0_64M 3.406m ± 0% 2.165m ± 1% -36.44% (p=0.000 n=10) +MemclrUnaligned/1_1M 52.53µ ± 0% 32.80µ ± 0% -37.55% (p=0.000 n=10) +MemclrUnaligned/1_4M 210.2µ ± 0% 131.3µ ± 0% -37.55% (p=0.000 n=10) +MemclrUnaligned/1_8M 419.9µ ± 0% 262.4µ ± 0% -37.50% (p=0.000 n=10) +MemclrUnaligned/1_16M 844.2µ ± 0% 528.0µ ± 0% -37.46% (p=0.000 n=10) +MemclrUnaligned/1_64M 3.369m ± 0% 2.161m ± 5% -35.84% (p=0.000 n=10) +MemclrUnaligned/4_1M 52.53µ ± 0% 32.80µ ± 0% -37.55% (p=0.000 n=10) +MemclrUnaligned/4_4M 210.2µ ± 0% 131.2µ ± 0% -37.59% (p=0.000 n=10) +MemclrUnaligned/4_8M 419.9µ ± 0% 262.4µ ± 0% -37.52% (p=0.000 n=10) +MemclrUnaligned/4_16M 844.5µ ± 0% 527.9µ ± 0% -37.49% (p=0.000 n=10) +MemclrUnaligned/4_64M 3.366m ± 0% 2.173m ± 0% -35.46% (p=0.000 n=10) +MemclrUnaligned/7_1M 52.52µ ± 0% 32.80µ ± 0% -37.55% (p=0.000 n=10) +MemclrUnaligned/7_4M 210.2µ ± 0% 131.5µ ± 0% -37.45% (p=0.000 n=10) +MemclrUnaligned/7_8M 419.9µ ± 0% 262.6µ ± 0% -37.47% (p=0.000 n=10) +MemclrUnaligned/7_16M 844.4µ ± 0% 529.0µ ± 0% -37.36% (p=0.000 n=10) +MemclrUnaligned/7_64M 3.372m ± 1% 2.201m ± 0% -34.72% (p=0.000 n=10) +MemclrRange/1K_2K 2703.0n ± 0% 948.1n ± 0% -64.93% (p=0.000 n=10) +MemclrRange/2K_8K 8.826µ ± 0% 2.458µ ± 0% -72.15% (p=0.000 n=10) +MemclrRange/4K_16K 8.325µ ± 0% 2.210µ ± 0% -73.45% (p=0.000 n=10) +MemclrRange/160K_228K 83.40µ ± 0% 31.27µ ± 0% -62.50% (p=0.000 n=10) +MemclrKnownSize1 0.4003n ± 0% 0.4002n ± 0% -0.02% (p=0.027 n=10) +MemclrKnownSize2 0.4003n ± 0% 0.4002n ± 0% -0.02% (p=0.000 n=10) +MemclrKnownSize4 0.4003n ± 0% 0.4002n ± 0% -0.02% (p=0.000 n=10) +MemclrKnownSize8 0.4003n ± 0% 0.4002n ± 0% -0.02% (p=0.000 n=10) +MemclrKnownSize16 0.4213n ± 1% 0.8007n ± 0% +90.03% (p=0.000 n=10) +MemclrKnownSize32 2.001n ± 0% 1.602n ± 0% -19.94% (p=0.000 n=10) +MemclrKnownSize64 2.010n ± 0% 2.402n ± 0% +19.47% (p=0.000 n=10) +MemclrKnownSize112 3.202n ± 0% 2.803n ± 0% -12.46% (p=0.000 n=10) +MemclrKnownSize128 3.442n ± 0% 3.236n ± 0% -6.00% (p=0.000 n=10) +MemclrKnownSize192 5.204n ± 0% 5.205n ± 0% ~ (p=0.279 n=10) +MemclrKnownSize248 6.301n ± 0% 6.299n ± 0% -0.03% (p=0.000 n=10) +MemclrKnownSize256 6.707n ± 0% 6.704n ± 0% -0.04% (p=0.018 n=10) +MemclrKnownSize512 13.610n ± 0% 6.989n ± 0% -48.65% (p=0.000 n=10) +MemclrKnownSize1024 26.420n ± 0% 8.458n ± 0% -67.99% (p=0.000 n=10) +MemclrKnownSize4096 103.30n ± 0% 28.02n ± 0% -72.88% (p=0.000 n=10) +MemclrKnownSize512KiB 26.28µ ± 0% 16.41µ ± 0% -37.53% (p=0.000 n=10) +geomean 624.0n 397.1n -36.37% + +Change-Id: I702b9c1991cf13f9338c189c5ef59cb2c6f279de +--- + src/runtime/cpuflags.go | 3 +- + src/runtime/memclr_loong64.s | 214 ++++++++++++++++++++++++----------- + 2 files changed, 152 insertions(+), 65 deletions(-) + +diff --git a/src/runtime/cpuflags.go b/src/runtime/cpuflags.go +index e81e50f5df..06424642c7 100644 +--- a/src/runtime/cpuflags.go ++++ b/src/runtime/cpuflags.go +@@ -20,7 +20,8 @@ const ( + + offsetMIPS64XHasMSA = unsafe.Offsetof(cpu.MIPS64X.HasMSA) + +- offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX) ++ offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX) ++ offsetLOONG64HasLASX = unsafe.Offsetof(cpu.Loong64.HasLASX) + ) + + var ( +diff --git a/src/runtime/memclr_loong64.s b/src/runtime/memclr_loong64.s +index 346b210c8d..0d0d9f0cbb 100644 +--- a/src/runtime/memclr_loong64.s ++++ b/src/runtime/memclr_loong64.s +@@ -11,6 +11,7 @@ + // R5: n + // R6: ptrend + // R7: tmp ++// R8: tmp + + // Algorithm: + // +@@ -38,44 +39,129 @@ + + // func 
memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr) + TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16 +- BEQ R5, clr_0 ++ // <=64 bytes, clear directly, not check aligned ++generic_small: + ADDV R4, R5, R6 ++ BEQ R4, R6, clr_0 ++ MOVV $2, R7 ++ BLT R5, R7, clr_1 ++ MOVV $3, R7 ++ BLT R5, R7, clr_2 ++ MOVV $4, R7 ++ BLT R5, R7, clr_3 ++ MOVV $5, R7 ++ BLT R5, R7, clr_4 ++ MOVV $8, R7 ++ BLT R5, R7, clr_5_7 ++ MOVV $9, R7 ++ BLT R5, R7, clr_8 ++ MOVV $17, R7 ++ BLT R5, R7, clr_9_16 ++ MOVV $33, R7 ++ BLT R5, R7, clr_17_32 ++ MOVV $65, R7 ++ BLT R5, R7, clr_33_64 + +-tail: +- // <=64 bytes, clear directly, not check aligned +- SGTU $2, R5, R7 +- BNE R7, clr_1 +- SGTU $3, R5, R7 +- BNE R7, clr_2 +- SGTU $4, R5, R7 +- BNE R7, clr_3 +- SGTU $5, R5, R7 +- BNE R7, clr_4 +- SGTU $8, R5, R7 +- BNE R7, clr_5through7 +- SGTU $9, R5, R7 +- BNE R7, clr_8 +- SGTU $17, R5, R7 +- BNE R7, clr_9through16 +- SGTU $33, R5, R7 +- BNE R7, clr_17through32 +- SGTU $65, R5, R7 +- BNE R7, clr_33through64 ++lasx_large: ++ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7 ++ BEQ R7, lsx_large ++ ++ // X0 = 0 ++ XVMOVQ R0, X0.V4 ++ ++ // check 32-byte alignment ++ AND $31, R4, R7 ++ BEQ R7, lasx_large_aligned ++ XVMOVQ X0, (R4) ++ SUBV R7, R4 ++ ADDV R7, R5 ++ SUBV $32, R5 // newn = n - (32 - (ptr & 31)) ++ ADDV $32, R4 // newptr = ptr + (32 - (ptr & 31)) ++ ++lasx_large_aligned: ++ MOVV $256, R8 ++ BLT R5, R8, lasx_small ++lasx_large_body: ++ XVMOVQ X0, 0(R4) ++ XVMOVQ X0, 32(R4) ++ XVMOVQ X0, 64(R4) ++ XVMOVQ X0, 96(R4) ++ XVMOVQ X0, 128(R4) ++ XVMOVQ X0, 160(R4) ++ XVMOVQ X0, 192(R4) ++ XVMOVQ X0, 224(R4) ++ SUBV $256, R5 ++ ADDV $256, R4 ++ BGE R5, R8, lasx_large_body ++ ++lasx_small: ++ MOVV $32, R8 ++ BLT R5, R8, generic_small ++lasx_small_body: ++ XVMOVQ X0, (R4) ++ SUBV $32, R5 ++ ADDV $32, R4 ++ BGE R5, R8, lasx_small_body ++lasx_tail: ++ JMP generic_small ++ ++lsx_large: ++ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7 ++ BEQ R7, generic_large ++ ++ // V0 = 0 ++ VMOVQ R0, V0.V2 + ++ // check 16-byte alignment ++ AND $15, R4, R7 ++ BEQ R7, lsx_large_aligned ++ VMOVQ V0, (R4) ++ SUBV R7, R4 ++ ADDV R7, R5 ++ SUBV $16, R5 // newn = n - (16 - (ptr & 15)) ++ ADDV $16, R4 // newptr = ptr + (16 - (ptr & 15)) ++ ++lsx_large_aligned: ++ MOVV $128, R8 ++ BLT R5, R8, lsx_small ++lsx_large_body: ++ VMOVQ V0, 0(R4) ++ VMOVQ V0, 16(R4) ++ VMOVQ V0, 32(R4) ++ VMOVQ V0, 48(R4) ++ VMOVQ V0, 64(R4) ++ VMOVQ V0, 80(R4) ++ VMOVQ V0, 96(R4) ++ VMOVQ V0, 112(R4) ++ SUBV $128, R5 ++ ADDV $128, R4 ++ BGE R5, R8, lsx_large_body ++ ++lsx_small: ++ MOVV $16, R8 ++ BLT R5, R8, generic_small ++lsx_small_body: ++ VMOVQ V0, (R4) ++ SUBV $16, R5 ++ ADDV $16, R4 ++ BGE R5, R8, lsx_small_body ++lsx_tail: ++ JMP generic_small ++ ++generic_large: + // n > 64 bytes, check aligned + AND $7, R4, R7 +- BEQ R7, body +- +-head: ++ BEQ R7, generic_large_aligned + MOVV R0, (R4) +- SUBV R7, R4 + ADDV R7, R5 +- ADDV $8, R4 // newptr = ptr + (8 - (ptr & 7)) ++ SUBV R7, R4 + SUBV $8, R5 // newn = n - (8 - (ptr & 7)) +- SGTU $65, R5, R7 +- BNE R7, clr_33through64 ++ ADDV $8, R4 // newptr = ptr + (8 - (ptr & 7)) + +-body: ++generic_large_aligned: ++ MOVV $65, R7 ++ BLT R5, R7, generic_small ++generic_large_body: + MOVV R0, (R4) + MOVV R0, 8(R4) + MOVV R0, 16(R4) +@@ -84,52 +170,52 @@ body: + MOVV R0, 40(R4) + MOVV R0, 48(R4) + MOVV R0, 56(R4) +- ADDV $-64, R5 ++ SUBV $64, R5 + ADDV $64, R4 +- SGTU $65, R5, R7 +- BEQ R7, body +- BEQ R5, clr_0 +- JMP tail ++ BGE R5, R7, generic_large_body ++generic_tail: ++ JMP generic_small + 
+-clr_0: ++clr_33_64: ++ MOVV R0, (R4) ++ MOVV R0, 8(R4) ++ MOVV R0, 16(R4) ++ MOVV R0, 24(R4) ++ MOVV R0, -32(R6) ++ MOVV R0, -24(R6) ++ MOVV R0, -16(R6) ++ MOVV R0, -8(R6) + RET +-clr_1: +- MOVB R0, (R4) ++ ++clr_17_32: ++ MOVV R0, (R4) ++ MOVV R0, 8(R4) ++ MOVV R0, -16(R6) ++ MOVV R0, -8(R6) + RET +-clr_2: +- MOVH R0, (R4) ++clr_9_16: ++ MOVV R0, (R4) ++ MOVV R0, -8(R6) + RET +-clr_3: +- MOVH R0, (R4) +- MOVB R0, 2(R4) ++clr_8: ++ MOVV R0, (R4) + RET +-clr_4: ++clr_5_7: + MOVW R0, (R4) ++ MOVW R0, -4(R6) + RET +-clr_5through7: ++clr_4: + MOVW R0, (R4) +- MOVW R0, -4(R6) + RET +-clr_8: +- MOVV R0, (R4) ++clr_3: ++ MOVH R0, (R4) ++ MOVB R0, 2(R4) + RET +-clr_9through16: +- MOVV R0, (R4) +- MOVV R0, -8(R6) ++clr_2: ++ MOVH R0, (R4) + RET +-clr_17through32: +- MOVV R0, (R4) +- MOVV R0, 8(R4) +- MOVV R0, -16(R6) +- MOVV R0, -8(R6) ++clr_1: ++ MOVB R0, (R4) + RET +-clr_33through64: +- MOVV R0, (R4) +- MOVV R0, 8(R4) +- MOVV R0, 16(R4) +- MOVV R0, 24(R4) +- MOVV R0, -32(R6) +- MOVV R0, -24(R6) +- MOVV R0, -16(R6) +- MOVV R0, -8(R6) ++clr_0: + RET +-- +2.38.1 + diff --git a/0040-runtime-race-add-the-implementation-of-atomic.-Or-An.patch b/0040-runtime-race-add-the-implementation-of-atomic.-Or-An.patch new file mode 100644 index 0000000000000000000000000000000000000000..39a261b130f8233e6a4ef7081b817fd0c5160f66 --- /dev/null +++ b/0040-runtime-race-add-the-implementation-of-atomic.-Or-An.patch @@ -0,0 +1,75 @@ +From 88b165cf7d4cb6a77f47d3c291d3ee7e1f13695e Mon Sep 17 00:00:00 2001 +From: Guoqi Chen +Date: Fri, 10 Jan 2025 10:31:47 +0800 +Subject: [PATCH 40/44] runtime/race: add the implementation of atomic.{Or,And} + on loong64 + +Change-Id: Ia4298a4d92fce210e3c743b2d5ce2b28b82d4971 +--- + src/runtime/race_loong64.s | 50 +++++++++++++++++++++++ + 2 files changed, 50 insertions(+) + +diff --git a/src/runtime/race_loong64.s b/src/runtime/race_loong64.s +index 04f264b21b..e6c11d44f7 100644 +--- a/src/runtime/race_loong64.s ++++ b/src/runtime/race_loong64.s +@@ -308,6 +308,56 @@ TEXT sync∕atomic·AddUintptr(SB), NOSPLIT, $0-24 + GO_ARGS + JMP sync∕atomic·AddInt64(SB) + ++// And ++TEXT sync∕atomic·AndInt32(SB), NOSPLIT, $0-20 ++ GO_ARGS ++ MOVV $__tsan_go_atomic32_fetch_and(SB), RCALL ++ JAL racecallatomic<>(SB) ++ RET ++ ++TEXT sync∕atomic·AndInt64(SB), NOSPLIT, $0-24 ++ GO_ARGS ++ MOVV $__tsan_go_atomic64_fetch_and(SB), RCALL ++ JAL racecallatomic<>(SB) ++ RET ++ ++TEXT sync∕atomic·AndUint32(SB), NOSPLIT, $0-20 ++ GO_ARGS ++ JMP sync∕atomic·AndInt32(SB) ++ ++TEXT sync∕atomic·AndUint64(SB), NOSPLIT, $0-24 ++ GO_ARGS ++ JMP sync∕atomic·AndInt64(SB) ++ ++TEXT sync∕atomic·AndUintptr(SB), NOSPLIT, $0-24 ++ GO_ARGS ++ JMP sync∕atomic·AndInt64(SB) ++ ++// Or ++TEXT sync∕atomic·OrInt32(SB), NOSPLIT, $0-20 ++ GO_ARGS ++ MOVV $__tsan_go_atomic32_fetch_or(SB), RCALL ++ JAL racecallatomic<>(SB) ++ RET ++ ++TEXT sync∕atomic·OrInt64(SB), NOSPLIT, $0-24 ++ GO_ARGS ++ MOVV $__tsan_go_atomic64_fetch_or(SB), RCALL ++ JAL racecallatomic<>(SB) ++ RET ++ ++TEXT sync∕atomic·OrUint32(SB), NOSPLIT, $0-20 ++ GO_ARGS ++ JMP sync∕atomic·OrInt32(SB) ++ ++TEXT sync∕atomic·OrUint64(SB), NOSPLIT, $0-24 ++ GO_ARGS ++ JMP sync∕atomic·OrInt64(SB) ++ ++TEXT sync∕atomic·OrUintptr(SB), NOSPLIT, $0-24 ++ GO_ARGS ++ JMP sync∕atomic·OrInt64(SB) ++ + // CompareAndSwap + TEXT sync∕atomic·CompareAndSwapInt32(SB), NOSPLIT, $0-17 + GO_ARGS +-- +2.38.1 + diff --git a/0041-cmd-internal-obj-loong64-add-F-MAXA-MINA-.-S-D-instr.patch b/0041-cmd-internal-obj-loong64-add-F-MAXA-MINA-.-S-D-instr.patch new file mode 100644 index 
0000000000000000000000000000000000000000..847b4b1f3f22a9f0cb249f5e573e8f6c950ac84f --- /dev/null +++ b/0041-cmd-internal-obj-loong64-add-F-MAXA-MINA-.-S-D-instr.patch @@ -0,0 +1,107 @@ +From e652e32e37bfd898af333a32b73cfde6ab2116fa Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Mon, 30 Dec 2024 10:08:58 +0800 +Subject: [PATCH 41/44] cmd/internal/obj/loong64: add F{MAXA/MINA}.{S/D} + instructions + +Go asm syntax: + F{MAXA/MINA}{F/D} FK, FJ, FD + +Equivalent platform assembler syntax: + f{maxa/mina}.{s/d} fd, fj, fk + +Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html + +Change-Id: I6790657d2f36bdf5e6818b6c0aaa48117e782b8d +--- + src/cmd/asm/internal/asm/testdata/loong64enc1.s | 9 +++++++++ + src/cmd/internal/obj/loong64/a.out.go | 6 ++++++ + src/cmd/internal/obj/loong64/anames.go | 4 ++++ + src/cmd/internal/obj/loong64/asm.go | 12 ++++++++++++ + 4 files changed, 31 insertions(+) + +diff --git a/src/cmd/asm/internal/asm/testdata/loong64enc1.s b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +index b40d86e596..32d3b3f0a2 100644 +--- a/src/cmd/asm/internal/asm/testdata/loong64enc1.s ++++ b/src/cmd/asm/internal/asm/testdata/loong64enc1.s +@@ -346,6 +346,15 @@ lable2: + FTINTVF F0, F1 // 01241b01 + FTINTVD F0, F1 // 01281b01 + ++ FMAXAF F4, F5, F6 // a6900c01 ++ FMAXAF F4, F5 // a5900c01 ++ FMAXAD F4, F5, F6 // a6100d01 ++ FMAXAD F4, F5 // a5100d01 ++ FMINAF F4, F5, F6 // a6900e01 ++ FMINAF F4, F5 // a5900e01 ++ FMINAD F4, F5, F6 // a6100f01 ++ FMINAD F4, F5 // a5100f01 ++ + FTINTRMWF F0, F2 // 02041a01 + FTINTRMWD F0, F2 // 02081a01 + FTINTRMVF F0, F2 // 02241a01 +diff --git a/src/cmd/internal/obj/loong64/a.out.go b/src/cmd/internal/obj/loong64/a.out.go +index f2d4c41d68..857ea649e7 100644 +--- a/src/cmd/internal/obj/loong64/a.out.go ++++ b/src/cmd/internal/obj/loong64/a.out.go +@@ -688,6 +688,12 @@ const ( + AFMAXF + AFMAXD + ++ // 3.2.1.4 ++ AFMAXAF ++ AFMAXAD ++ AFMINAF ++ AFMINAD ++ + // 3.2.1.7 + AFCOPYSGF + AFCOPYSGD +diff --git a/src/cmd/internal/obj/loong64/anames.go b/src/cmd/internal/obj/loong64/anames.go +index aee0da0a6e..d2acdf7042 100644 +--- a/src/cmd/internal/obj/loong64/anames.go ++++ b/src/cmd/internal/obj/loong64/anames.go +@@ -223,6 +223,10 @@ var Anames = []string{ + "FMIND", + "FMAXF", + "FMAXD", ++ "FMAXAF", ++ "FMAXAD", ++ "FMINAF", ++ "FMINAD", + "FCOPYSGF", + "FCOPYSGD", + "FSCALEBF", +diff --git a/src/cmd/internal/obj/loong64/asm.go b/src/cmd/internal/obj/loong64/asm.go +index 2480cf9382..31f5376f8e 100644 +--- a/src/cmd/internal/obj/loong64/asm.go ++++ b/src/cmd/internal/obj/loong64/asm.go +@@ -1347,6 +1347,10 @@ func buildop(ctxt *obj.Link) { + opset(AFCOPYSGD, r0) + opset(AFSCALEBF, r0) + opset(AFSCALEBD, r0) ++ opset(AFMAXAF, r0) ++ opset(AFMAXAD, r0) ++ opset(AFMINAF, r0) ++ opset(AFMINAD, r0) + + case AFMADDF: + opset(AFMADDD, r0) +@@ -2811,6 +2815,14 @@ func (c *ctxt0) oprrr(a obj.As) uint32 { + return 0x211 << 15 // fmax.s + case AFMAXD: + return 0x212 << 15 // fmax.d ++ case AFMAXAF: ++ return 0x219 << 15 // fmaxa.s ++ case AFMAXAD: ++ return 0x21a << 15 // fmaxa.d ++ case AFMINAF: ++ return 0x21d << 15 // fmina.s ++ case AFMINAD: ++ return 0x21e << 15 // fmina.d + case AFSCALEBF: + return 0x221 << 15 // fscaleb.s + case AFSCALEBD: +-- +2.38.1 + diff --git a/0042-math-implement-func-archExp-and-archExp2-in-assembly.patch b/0042-math-implement-func-archExp-and-archExp2-in-assembly.patch new file mode 100644 index 0000000000000000000000000000000000000000..a9303c24b03f58fa52421ec53e6a08ff1d6c7e5f --- /dev/null +++ 
b/0042-math-implement-func-archExp-and-archExp2-in-assembly.patch @@ -0,0 +1,358 @@ +From f463c4a1db9ac0e4be9d67bc53f4ddb8515232d3 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Tue, 31 Dec 2024 21:02:47 +0800 +Subject: [PATCH 42/44] math: implement func archExp and archExp2 in assembly + on loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +goos: linux +goarch: loong64 +pkg: math +cpu: Loongson-3A6000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +Exp 26.30n ± 0% 12.93n ± 0% -50.85% (p=0.000 n=10) +ExpGo 26.86n ± 0% 26.92n ± 0% +0.22% (p=0.000 n=10) +Expm1 16.76n ± 0% 16.75n ± 0% ~ (p=0.060 n=10) +Exp2 23.05n ± 0% 12.12n ± 0% -47.42% (p=0.000 n=10) +Exp2Go 23.41n ± 0% 23.47n ± 0% +0.28% (p=0.000 n=10) +geomean 22.97n 17.54n -23.64% + +goos: linux +goarch: loong64 +pkg: math/cmplx +cpu: Loongson-3A6000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +Exp 51.32n ± 0% 35.41n ± 0% -30.99% (p=0.000 n=10) + +goos: linux +goarch: loong64 +pkg: math +cpu: Loongson-3A5000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +Exp 50.27n ± 0% 48.75n ± 1% -3.01% (p=0.000 n=10) +ExpGo 50.72n ± 0% 50.44n ± 0% -0.55% (p=0.000 n=10) +Expm1 28.40n ± 0% 28.32n ± 0% ~ (p=0.360 n=10) +Exp2 50.09n ± 0% 21.49n ± 1% -57.10% (p=0.000 n=10) +Exp2Go 50.05n ± 0% 49.69n ± 0% -0.72% (p=0.000 n=10) +geomean 44.85n 37.52n -16.35% + +goos: linux +goarch: loong64 +pkg: math/cmplx +cpu: Loongson-3A5000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +Exp 88.56n ± 0% 67.29n ± 0% -24.03% (p=0.000 n=10) + +Change-Id: I89e456d26fc075d83335ee4a31227d2aface5714 +--- + src/math/exp2_asm.go | 2 +- + src/math/exp2_noasm.go | 2 +- + src/math/exp_asm.go | 2 +- + src/math/exp_loong64.s | 236 +++++++++++++++++++++++++++++++++++++++++ + src/math/exp_noasm.go | 2 +- + 5 files changed, 240 insertions(+), 4 deletions(-) + create mode 100644 src/math/exp_loong64.s + +diff --git a/src/math/exp2_asm.go b/src/math/exp2_asm.go +index c26b2c3fab..1e78759374 100644 +--- a/src/math/exp2_asm.go ++++ b/src/math/exp2_asm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build arm64 ++//go:build arm64 || loong64 + + package math + +diff --git a/src/math/exp2_noasm.go b/src/math/exp2_noasm.go +index c2b409329f..847138b622 100644 +--- a/src/math/exp2_noasm.go ++++ b/src/math/exp2_noasm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !arm64 ++//go:build !arm64 && !loong64 + + package math + +diff --git a/src/math/exp_asm.go b/src/math/exp_asm.go +index 424442845b..125529fca3 100644 +--- a/src/math/exp_asm.go ++++ b/src/math/exp_asm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build amd64 || arm64 || s390x ++//go:build amd64 || arm64 || loong64 || s390x + + package math + +diff --git a/src/math/exp_loong64.s b/src/math/exp_loong64.s +new file mode 100644 +index 0000000000..3d24214289 +--- /dev/null ++++ b/src/math/exp_loong64.s +@@ -0,0 +1,236 @@ ++// Copyright 2024 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
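++//
++// Both archExp and archExp2 below follow the same shape: filter out
++// NaN, overflow and underflow (and, for Exp, near-zero inputs),
++// reduce the argument to a small r, evaluate the P1..P5 polynomial
++// in t = r*r to get a correction c, reconstruct y, and scale the
++// result by 2**k via an inlined Ldexp.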
++ ++#include "textflag.h" ++ ++#define NearZero 0x3e30000000000000 // 2**-28 ++#define PosInf 0x7ff0000000000000 ++#define FracMask 0x000fffffffffffff ++#define C1 0x3cb0000000000000 // 2**-52 ++ ++DATA exprodata<>+0(SB)/8, $0.0 ++DATA exprodata<>+8(SB)/8, $0.5 ++DATA exprodata<>+16(SB)/8, $1.0 ++DATA exprodata<>+24(SB)/8, $2.0 ++DATA exprodata<>+32(SB)/8, $6.93147180369123816490e-01 // Ln2Hi ++DATA exprodata<>+40(SB)/8, $1.90821492927058770002e-10 // Ln2Lo ++DATA exprodata<>+48(SB)/8, $1.44269504088896338700e+00 // Log2e ++DATA exprodata<>+56(SB)/8, $7.09782712893383973096e+02 // Overflow ++DATA exprodata<>+64(SB)/8, $-7.45133219101941108420e+02 // Underflow ++DATA exprodata<>+72(SB)/8, $1.0239999999999999e+03 // Overflow2 ++DATA exprodata<>+80(SB)/8, $-1.0740e+03 // Underflow2 ++DATA exprodata<>+88(SB)/8, $3.7252902984619141e-09 // NearZero ++GLOBL exprodata<>+0(SB), NOPTR|RODATA, $96 ++ ++DATA expmultirodata<>+0(SB)/8, $1.66666666666666657415e-01 // P1 ++DATA expmultirodata<>+8(SB)/8, $-2.77777777770155933842e-03 // P2 ++DATA expmultirodata<>+16(SB)/8, $6.61375632143793436117e-05 // P3 ++DATA expmultirodata<>+24(SB)/8, $-1.65339022054652515390e-06 // P4 ++DATA expmultirodata<>+32(SB)/8, $4.13813679705723846039e-08 // P5 ++GLOBL expmultirodata<>+0(SB), NOPTR|RODATA, $40 ++ ++// Exp returns e**x, the base-e exponential of x. ++// This is an assembly implementation of the method used for function Exp in file exp.go. ++// ++// func Exp(x float64) float64 ++TEXT ·archExp(SB),$0-16 ++ MOVD x+0(FP), F0 // F0 = x ++ ++ MOVV $exprodata<>+0(SB), R10 ++ MOVD 56(R10), F1 // Overflow ++ MOVD 64(R10), F2 // Underflow ++ MOVD 88(R10), F3 // NearZero ++ MOVD 16(R10), F17 // 1.0 ++ ++ CMPEQD F0, F0, FCC0 ++ BFPF isNaN // x = NaN, return NaN ++ ++ CMPGTD F0, F1, FCC0 ++ BFPT overflow // x > Overflow, return PosInf ++ ++ CMPGTD F2, F0, FCC0 ++ BFPT underflow // x < Underflow, return 0 ++ ++ ABSD F0, F5 ++ CMPGTD F3, F5, FCC0 ++ BFPT nearzero // fabs(x) < NearZero, return 1 + x ++ ++ // argument reduction, x = k*ln2 + r, |r| <= 0.5*ln2 ++ // computed as r = hi - lo for extra precision. 
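++	// With k = int(Log2e*x ± 0.5) (rounded to nearest), the code
++	// below computes hi = x - float64(k)*Ln2Hi and lo = float64(k)*Ln2Lo,
++	// so r = hi - lo and e**x = 2**k * e**r; the 2**k factor is
++	// applied at the end through the inlined Ldexp.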
++ MOVD 0(R10), F5 // 0.0 ++ MOVD 8(R10), F3 // 0.5 ++ MOVD 48(R10), F2 // Log2e ++ CMPGTD F0, F5, FCC0 ++ BFPT add // x > 0 ++sub: ++ FMSUBD F3, F2, F0, F3 // Log2e*x - 0.5 ++ JMP 2(PC) ++add: ++ FMADDD F3, F2, F0, F3 // Log2e*x + 0.5 ++ ++ FTINTRZVD F3, F4 // float64 -> int64 ++ MOVV F4, R5 // R5 = int(k) ++ FFINTDV F4, F3 // int64 -> float64 ++ ++ MOVD 32(R10), F4 // F4 = Ln2Hi ++ MOVD 40(R10), F5 // F5 = Ln2Lo ++ FNMSUBD F0, F3, F4, F4 // F4 = hi = x - float64(int(k))*Ln2Hi ++ MULD F3, F5, F5 // F5 = lo = float64(int(k)) * Ln2Lo ++ SUBD F5, F4, F6 // F6 = r = hi - lo ++ MULD F6, F6, F7 // F7 = t = r * r ++ ++ // compute c ++ MOVV $expmultirodata<>+0(SB), R11 ++ MOVD 32(R11), F8 // F8 = P5 ++ MOVD 24(R11), F9 // F9 = P4 ++ FMADDD F9, F8, F7, F13 // P4+t*P5 ++ MOVD 16(R11), F10 // F10 = P3 ++ FMADDD F10, F13, F7, F13 // P3+t*(P4+t*P5) ++ MOVD 8(R11), F11 // F11 = P2 ++ FMADDD F11, F13, F7, F13 // P2+t*(P3+t*(P4+t*P5)) ++ MOVD 0(R11), F12 // F12 = P1 ++ FMADDD F12, F13, F7, F13 // P1+t*(P2+t*(P3+t*(P4+t*P5))) ++ FNMSUBD F6, F13, F7, F13 // F13 = c = r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5)))) ++ ++ // compute y ++ MOVD 24(R10), F14 // F14 = 2.0 ++ SUBD F13, F14, F14 // F14 = 2 - c ++ MULD F6, F13, F15 // F15 = r*c ++ DIVD F14, F15, F15 // F15 = (r*c)/(2-c) ++ SUBD F15, F5, F15 // F15 = lo-(r*c)/(2-c) ++ SUBD F4, F15, F15 // F15 = (lo-(r*c)/(2-c))-hi ++ SUBD F15, F17, F16 // F16 = y = 1-((lo-(r*c)/(2-c))-hi) ++ ++ // inline Ldexp(y, k), benefit: ++ // 1, no parameter pass overhead. ++ // 2, skip unnecessary checks for Inf/NaN/Zero ++ MOVV F16, R4 ++ MOVV $FracMask, R9 ++ AND R9, R4, R6 // fraction ++ SRLV $52, R4, R7 // exponent ++ ADDV R5, R7 // R5 = int(k) ++ MOVV $1, R12 ++ BGE R7, R12, normal ++ ADDV $52, R7 // denormal ++ MOVV $C1, R8 ++ MOVV R8, F17 // m = 2**-52 ++normal: ++ SLLV $52, R7 ++ OR R7, R6, R4 ++ MOVV R4, F0 ++ MULD F17, F0 // return m * x ++ MOVD F0, ret+8(FP) ++ RET ++nearzero: ++ ADDD F17, F0, F0 ++isNaN: ++ MOVD F0, ret+8(FP) ++ RET ++underflow: ++ MOVV R0, ret+8(FP) ++ RET ++overflow: ++ MOVV $PosInf, R4 ++ MOVV R4, ret+8(FP) ++ RET ++ ++ ++// Exp2 returns 2**x, the base-2 exponential of x. ++// This is an assembly implementation of the method used for function Exp2 in file exp.go. ++// ++// func Exp2(x float64) float64 ++TEXT ·archExp2(SB),$0-16 ++ MOVD x+0(FP), F0 // F0 = x ++ ++ MOVV $exprodata<>+0(SB), R10 ++ MOVD 72(R10), F1 // Overflow2 ++ MOVD 80(R10), F2 // Underflow2 ++ MOVD 88(R10), F3 // NearZero ++ ++ CMPEQD F0, F0, FCC0 ++ BFPF isNaN // x = NaN, return NaN ++ ++ CMPGTD F0, F1, FCC0 ++ BFPT overflow // x > Overflow, return PosInf ++ ++ CMPGTD F2, F0, FCC0 ++ BFPT underflow // x < Underflow, return 0 ++ ++ // argument reduction; x = r*lg(e) + k with |r| <= ln(2)/2 ++ // computed as r = hi - lo for extra precision. 
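++	// With k = int(x ± 0.5) and t = x - float64(k), the code below
++	// computes hi = t*Ln2Hi and lo = -t*Ln2Lo, so r = hi - lo ~= t*ln2
++	// and 2**x = 2**k * e**r, sharing the polynomial with archExp.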
++ MOVD 0(R10), F10 // 0.0 ++ MOVD 8(R10), F2 // 0.5 ++ CMPGTD F0, F10, FCC0 ++ BFPT add ++sub: ++ SUBD F2, F0, F3 // x - 0.5 ++ JMP 2(PC) ++add: ++ ADDD F2, F0, F3 // x + 0.5 ++ ++ FTINTRZVD F3, F4 // float64 -> int64 ++ MOVV F4, R5 // R5 = int(k) ++ FFINTDV F4, F3 // F3 = float64(int(k)) ++ ++ MOVD 32(R10), F4 // F4 = Ln2Hi ++ MOVD 40(R10), F5 // F5 = Ln2Lo ++ SUBD F3, F0, F3 // t = x - float64(int(k)) ++ MULD F3, F4 // F4 = hi = t * Ln2Hi ++ FNMSUBD F10, F3, F5, F5 // F5 = lo = -t * Ln2Lo ++ SUBD F5, F4, F6 // F6 = r = hi - lo ++ MULD F6, F6, F7 // F7 = t = r * r ++ ++ // compute c ++ MOVV $expmultirodata<>+0(SB), R11 ++ MOVD 32(R11), F8 // F8 = P5 ++ MOVD 24(R11), F9 // F9 = P4 ++ FMADDD F9, F8, F7, F13 // P4+t*P5 ++ MOVD 16(R11), F10 // F10 = P3 ++ FMADDD F10, F13, F7, F13 // P3+t*(P4+t*P5) ++ MOVD 8(R11), F11 // F11 = P2 ++ FMADDD F11, F13, F7, F13 // P2+t*(P3+t*(P4+t*P5)) ++ MOVD 0(R11), F12 // F12 = P1 ++ FMADDD F12, F13, F7, F13 // P1+t*(P2+t*(P3+t*(P4+t*P5))) ++ FNMSUBD F6, F13, F7, F13 // F13 = c = r - t*(P1+t*(P2+t*(P3+t*(P4+t*P5)))) ++ ++ // compute y ++ MOVD 24(R10), F14 // F14 = 2.0 ++ SUBD F13, F14, F14 // F14 = 2 - c ++ MULD F6, F13, F15 // F15 = r*c ++ DIVD F14, F15 // F15 = (r*c)/(2-c) ++ ++ MOVD 16(R10), F17 // 1.0 ++ SUBD F15, F5, F15 // lo-(r*c)/(2-c) ++ SUBD F4, F15, F15 // (lo-(r*c)/(2-c))-hi ++ SUBD F15, F17, F16 // F16 = y = 1-((lo-(r*c)/(2-c))-hi) ++ ++ // inline Ldexp(y, k), benefit: ++ // 1, no parameter pass overhead. ++ // 2, skip unnecessary checks for Inf/NaN/Zero ++ MOVV F16, R4 ++ MOVV $FracMask, R9 ++ SRLV $52, R4, R7 // exponent ++ AND R9, R4, R6 // fraction ++ ADDV R5, R7 // R5 = int(k) ++ MOVV $1, R12 ++ BGE R7, R12, normal ++ ++ ADDV $52, R7 // denormal ++ MOVV $C1, R8 ++ MOVV R8, F17 // m = 2**-52 ++normal: ++ SLLV $52, R7 ++ OR R7, R6, R4 ++ MOVV R4, F0 ++ MULD F17, F0 // return m * x ++isNaN: ++ MOVD F0, ret+8(FP) ++ RET ++underflow: ++ MOVV R0, ret+8(FP) ++ RET ++overflow: ++ MOVV $PosInf, R4 ++ MOVV R4, ret+8(FP) ++ RET +diff --git a/src/math/exp_noasm.go b/src/math/exp_noasm.go +index bd3f02412a..bf5e84b736 100644 +--- a/src/math/exp_noasm.go ++++ b/src/math/exp_noasm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. 
+ +-//go:build !amd64 && !arm64 && !s390x ++//go:build !amd64 && !arm64 && !loong64 && !s390x + + package math + +-- +2.38.1 + diff --git a/0043-math-implement-func-archLog-in-assembly-on-loong64.patch b/0043-math-implement-func-archLog-in-assembly-on-loong64.patch new file mode 100644 index 0000000000000000000000000000000000000000..f01c831480e7521902cef28d0a507e0a1dfbf614 --- /dev/null +++ b/0043-math-implement-func-archLog-in-assembly-on-loong64.patch @@ -0,0 +1,217 @@ +From 066bd3bf1a03e21cc27b463164461a56ce107d59 Mon Sep 17 00:00:00 2001 +From: Xiaolin Zhao +Date: Mon, 6 Jan 2025 15:40:06 +0800 +Subject: [PATCH 43/44] math: implement func archLog in assembly on loong64 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +goos: linux +goarch: loong64 +pkg: math +cpu: Loongson-3A6000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +Log 18.87n ± 0% 12.85n ± 0% -31.90% (p=0.000 n=10) +Logb 5.203n ± 0% 5.604n ± 0% +7.71% (p=0.000 n=10) +Log1p 16.78n ± 0% 16.78n ± 0% ~ (p=0.450 n=10) +Log10 20.47n ± 0% 13.59n ± 0% -33.61% (p=0.000 n=10) +Log2 6.804n ± 0% 8.805n ± 0% +29.40% (p=0.000 n=10) +geomean 11.81n 10.77n -8.82% + +goos: linux +goarch: loong64 +pkg: math +cpu: Loongson-3A5000 @ 2500.00MHz + | bench.old | bench.new | + | sec/op | sec/op vs base | +Log 28.28n ± 0% 24.95n ± 1% -11.78% (p=0.000 n=10) +Logb 7.609n ± 0% 7.207n ± 0% -5.29% (p=0.000 n=10) +Log1p 27.27n ± 0% 27.18n ± 1% ~ (p=0.078 n=10) +Log10 29.56n ± 0% 26.56n ± 0% -10.16% (p=0.000 n=10) +Log2 11.43n ± 0% 10.41n ± 0% -8.92% (p=0.000 n=10) +geomean 18.17n 16.83n -7.38% + +Change-Id: I42a17280874c28b31a3b5c75fc19ddac90c92f32 +--- + src/math/log_asm.go | 2 +- + src/math/log_loong64.s | 140 +++++++++++++++++++++++++++++++++++++++++ + src/math/log_stub.go | 2 +- + 3 files changed, 142 insertions(+), 2 deletions(-) + create mode 100644 src/math/log_loong64.s + +diff --git a/src/math/log_asm.go b/src/math/log_asm.go +index 848cce13b2..82372d1e64 100644 +--- a/src/math/log_asm.go ++++ b/src/math/log_asm.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build amd64 || s390x ++//go:build amd64 || loong64 || s390x + + package math + +diff --git a/src/math/log_loong64.s b/src/math/log_loong64.s +new file mode 100644 +index 0000000000..534295cb53 +--- /dev/null ++++ b/src/math/log_loong64.s +@@ -0,0 +1,140 @@ ++// Copyright 2025 The Go Authors. All rights reserved. ++// Use of this source code is governed by a BSD-style ++// license that can be found in the LICENSE file. 
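++//
++// The method matches Log in log.go: split x = 2**k * f1 with
++// sqrt(2)/2 <= f1 < sqrt(2), set f = f1 - 1 and s = f/(2+f); then
++// log(x) = k*ln2 + f - 0.5*f*f + s*(0.5*f*f + R(s*s)),
++// where R is the L1..L7 polynomial evaluated below.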
++ ++#include "textflag.h" ++ ++DATA logrodata<>+0(SB)/8, $0.5 ++DATA logrodata<>+8(SB)/8, $1.0 ++DATA logrodata<>+16(SB)/8, $2.0 ++DATA logrodata<>+24(SB)/8, $7.07106781186547524401e-01 // sqrt(2)/2 ++DATA logrodata<>+32(SB)/8, $6.93147180369123816490e-01 // Ln2Hi ++DATA logrodata<>+40(SB)/8, $1.90821492927058770002e-10 // Ln2Lo ++DATA logrodata<>+48(SB)/8, $6.666666666666735130e-01 // L1 ++DATA logrodata<>+56(SB)/8, $3.999999999940941908e-01 // L2 ++DATA logrodata<>+64(SB)/8, $2.857142874366239149e-01 // L3 ++DATA logrodata<>+72(SB)/8, $2.222219843214978396e-01 // L4 ++DATA logrodata<>+80(SB)/8, $1.818357216161805012e-01 // L5 ++DATA logrodata<>+88(SB)/8, $1.531383769920937332e-01 // L6 ++DATA logrodata<>+96(SB)/8, $1.479819860511658591e-01 // L7 ++DATA logrodata<>+104(SB)/8, $2.2250738585072014e-308 // 2**-1022 ++GLOBL logrodata<>+0(SB), NOPTR|RODATA, $112 ++ ++#define NaN 0x7FF8000000000001 ++#define NegInf 0xFFF0000000000000 ++#define PosInf 0x7FF0000000000000 ++ ++// func Log(x float64) float64 ++TEXT ·archLog(SB),NOSPLIT,$0 ++ // test bits for special cases ++ MOVD x+0(FP), F0 ++ MOVV x+0(FP), R4 ++ MOVV $logrodata<>+0(SB), R10 ++ FCLASSD F0, F4 ++ MOVV F4, R5 ++ AND $67, R5, R6 // NaN or +Inf ++ AND $544, R5, R7 // +0 or -0 ++ AND $28, R5, R8 // <0 ++ BNE R6, R0, isInfOrNaN ++ BNE R7, R0, isZero ++ BNE R8, R0, isNegative ++ ++ // reduce ++ // f1, ki := Frexp(x) FIXME ++ MOVD 104(R10), F4 ++ ABSD F0, F1 ++ CMPGED F1, F4, FCC0 ++ BFPT direct_return ++ MOVV $0x10000000000000, R5 // 1 << 52 ++ MULV R4, R5, R4 // R4 = y ++ MOVV $-52, R15 // R15 = ki (exp) ++ JMP 2(PC) ++direct_return: ++ MOVV $0, R15 // R15 = ki (exp) F0 = y ++ ++ MOVV $0x000FFFFFFFFFFFFF, R5 ++ AND R4, R5, R7 // x &^= mask << shift ++ MOVV $0x3FE0000000000000, R6 // (-1 + bias) << shift ++ OR R6, R7 // x |= (-1 + bias) << shift ++ MOVV R7, F2 // F2 = f1 ++ SRLV $52, R4 // x >> shift ++ AND $0x7FF, R4 // (x>>shift)&mask ++ SUBV $0x3FE, R4 // int((x>>shift)&mask) - bias + 1 ++ ADDV R4, R15, R4 // R4 = exp ++ ++ // if f1 < math.Sqrt2/2 { k -= 1; f1 *= 2 } ++ MOVD 0(R10), F10 // 0.5 ++ MOVD 8(R10), F3 // 1.0 ++ MOVD 16(R10), F4 // 2.0 ++ MOVD 24(R10), F0 // sqrt(2)/2 ++ CMPGED F2, F0, FCC0 // if f1 >= Sqrt2/2 ++ BFPT next ++ MULD F4, F2, F2 // f1 *= 2 ++ SUBV $1, R4, R4 ++next: ++ MOVV R4, F1 // k-- ++ FFINTDV F1, F1 // F1 = k ++ // f := f1 - 1 ++ SUBD F3, F2, F2 ++ ++ // compute ++ MOVD 96(R10), F17 // L7 ++ MOVD 80(R10), F15 // L5 ++ MOVD 64(R10), F13 // L3 ++ MOVD 48(R10), F11 // L1 ++ ADDD F4, F2, F3 // 2 + f ++ DIVD F3, F2, F4 // s := f / (2 + f) ++ MULD F4, F4, F5 // s2 := s * s ++ MULD F5, F5, F6 // s4 := s2 * s2 ++ // t1 := s2 * (L1 + s4*(L3+s4*(L5+s4*L7))) ++ MULD F17, F6, F7 // s4*L7 ++ ADDD F15, F7 // L5+s4*L7 ++ MULD F6, F7 // s4*(L5+s4*L7) ++ ADDD F13, F7 // L3+s4*(L5+s4*L7) ++ MULD F6, F7 // s4*(L3+s4*(L5+s4*L7)) ++ ADDD F11, F7 // L1 + s4*(L3+s4*(L5+s4*L7)) ++ MULD F5, F7 // s2 * (L1 + s4*(L3+s4*(L5+s4*L7))) ++ ++ MOVD 88(R10), F16 // L6 ++ MOVD 72(R10), F14 // L4 ++ MOVD 56(R10), F12 // L2 ++ // t2 := s4 * (L2 + s4*(L4+s4*L6)) ++ MULD F6, F16, F8 // s4*L6 ++ ADDD F14, F8 // L4+s4*L6 ++ MULD F6, F8 // s4*(L4+s4*L6) ++ ADDD F12, F8 // L2 + s4*(L4+s4*L6) ++ MULD F6, F8 // s4 * (L2 + s4*(L4+s4*L6)) ++ ++ // R := t1 + t2 ++ ADDD F7, F8 ++ ++ // hfsq := 0.5 * f * f ++ MULD F2, F2, F12 // f * f ++ MULD F10, F12, F9 // 0.5 * f * f ++ ++ // return k*Ln2Hi - ((hfsq - (s*(hfsq+R) + k*Ln2Lo)) - f) ++ MOVD 40(R10), F19 // Ln2Lo ++ MOVD 32(R10), F18 // Ln2Hi ++ // f9=hfsq, f1=k, f4=s, f8=R, f2=f ++ ADDD F9, F8, F10 // 
F10 = hfsq+R ++ MULD F1, F19, F11 // F11 = k*Ln2Lo ++ MULD F10, F4, F12 // F12 = s*(hfsq+R) ++ MULD F1, F18, F15 // F15 = k*Ln2Hi ++ ADDD F12, F11, F13 // F13 = s*(hfsq+R) + k*Ln2Lo ++ SUBD F13, F9, F14 // F14 = hfsq - (s*(hfsq+R) + k*Ln2Lo) ++ SUBD F2, F14, F14 // F14 = (hfsq - (s*(hfsq+R) + k*Ln2Lo)) - f ++ SUBD F14, F15, F0 ++ MOVD F0, ret+8(FP) ++ RET ++ ++isInfOrNaN: ++ MOVV R4, ret+8(FP) // +Inf or NaN, return x ++ RET ++isNegative: ++ MOVV $NaN, R4 ++ MOVV R4, ret+8(FP) // return NaN ++ RET ++isZero: ++ MOVV $NegInf, R4 ++ MOVV R4, ret+8(FP) // return -Inf ++ RET +diff --git a/src/math/log_stub.go b/src/math/log_stub.go +index d35992bf37..1dd4058435 100644 +--- a/src/math/log_stub.go ++++ b/src/math/log_stub.go +@@ -2,7 +2,7 @@ + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + +-//go:build !amd64 && !s390x ++//go:build !amd64 && !loong64 && !s390x + + package math + +-- +2.38.1 + diff --git a/0044-cmd-go-internal-work-allow-a-bunch-of-loong64-specif.patch b/0044-cmd-go-internal-work-allow-a-bunch-of-loong64-specif.patch new file mode 100644 index 0000000000000000000000000000000000000000..40422df348c4672d8c6836dd29ad3fe19709142c --- /dev/null +++ b/0044-cmd-go-internal-work-allow-a-bunch-of-loong64-specif.patch @@ -0,0 +1,126 @@ +From fc3470aafbb3facc619e4813eaf0ea10d5c7eda9 Mon Sep 17 00:00:00 2001 +From: WANG Xuerui +Date: Sun, 9 Feb 2025 18:57:49 +0800 +Subject: [PATCH 44/44] cmd/go/internal/work: allow a bunch of loong64-specific + flags + +Recognize and allow all LoongArch-specific CFLAGS as standardized +in the LoongArch Toolchain Conventions v1.1, and implemented in current +versions of GCC and Clang, to enable advanced cgo use cases on loong64. +These flags are also allowed for linker invocations in case of possible +LTO. + +See: https://github.com/loongson/la-toolchain-conventions/blob/releases/v1.1/LoongArch-toolchain-conventions-EN.adoc#list + +While at it, also add support for -mtls-dialect as some C programs +may benefit performance-wise from the optional TLSDESC usage. This flag +is not specific to loong64 though; it is available for amd64, arm, +arm64, loong64, riscv64 and x86. + +Fixes #71597. 
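+
+For example, after this change a cgo build on loong64 can pass the
+newly allowed options straight through (an illustrative invocation,
+not taken from the change itself):
+
+    CGO_CFLAGS="-mlasx -mtls-dialect=desc" go build ./...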
+ +Change-Id: I35d2507edb71fa324ae429a3ae3c739644a9cac1 +--- + src/cmd/go/internal/work/security.go | 13 ++++++++-- + src/cmd/go/internal/work/security_test.go | 31 +++++++++++++++++++++++ + 2 files changed, 42 insertions(+), 2 deletions(-) + +diff --git a/src/cmd/go/internal/work/security.go b/src/cmd/go/internal/work/security.go +index 50bfd0ab70..c3d62ddc23 100644 +--- a/src/cmd/go/internal/work/security.go ++++ b/src/cmd/go/internal/work/security.go +@@ -96,17 +96,21 @@ var validCompilerFlags = []*lazyregexp.Regexp{ + re(`-g([^@\-].*)?`), + re(`-m32`), + re(`-m64`), +- re(`-m(abi|arch|cpu|fpu|tune)=([^@\-].*)`), ++ re(`-m(abi|arch|cpu|fpu|simd|tls-dialect|tune)=([^@\-].*)`), + re(`-m(no-)?v?aes`), + re(`-marm`), + re(`-m(no-)?avx[0-9a-z]*`), + re(`-mcmodel=[0-9a-z-]+`), + re(`-mfloat-abi=([^@\-].*)`), ++ re(`-m(soft|single|double)-float`), + re(`-mfpmath=[0-9a-z,+]*`), + re(`-m(no-)?avx[0-9a-z.]*`), + re(`-m(no-)?ms-bitfields`), + re(`-m(no-)?stack-(.+)`), + re(`-mmacosx-(.+)`), ++ re(`-m(no-)?relax`), ++ re(`-m(no-)?strict-align`), ++ re(`-m(no-)?(lsx|lasx|frecipe|div32|lam-bh|lamcas|ld-seq-sa)`), + re(`-mios-simulator-version-min=(.+)`), + re(`-miphoneos-version-min=(.+)`), + re(`-mlarge-data-threshold=[0-9]+`), +@@ -166,8 +170,13 @@ var validLinkerFlags = []*lazyregexp.Regexp{ + re(`-flat_namespace`), + re(`-g([^@\-].*)?`), + re(`-headerpad_max_install_names`), +- re(`-m(abi|arch|cpu|fpu|tune)=([^@\-].*)`), ++ re(`-m(abi|arch|cpu|fpu|simd|tls-dialect|tune)=([^@\-].*)`), ++ re(`-mcmodel=[0-9a-z-]+`), + re(`-mfloat-abi=([^@\-].*)`), ++ re(`-m(soft|single|double)-float`), ++ re(`-m(no-)?relax`), ++ re(`-m(no-)?strict-align`), ++ re(`-m(no-)?(lsx|lasx|frecipe|div32|lam-bh|lamcas|ld-seq-sa)`), + re(`-mmacosx-(.+)`), + re(`-mios-simulator-version-min=(.+)`), + re(`-miphoneos-version-min=(.+)`), +diff --git a/src/cmd/go/internal/work/security_test.go b/src/cmd/go/internal/work/security_test.go +index 35af621764..48f98100a5 100644 +--- a/src/cmd/go/internal/work/security_test.go ++++ b/src/cmd/go/internal/work/security_test.go +@@ -50,10 +50,35 @@ var goodCompilerFlags = [][]string{ + {"-ftls-model=local-dynamic"}, + {"-g"}, + {"-ggdb"}, ++ {"-mabi=lp64d"}, + {"-march=souza"}, + {"-mcmodel=medium"}, + {"-mcpu=123"}, + {"-mfpu=123"}, ++ {"-mtls-dialect=gnu"}, ++ {"-mtls-dialect=gnu2"}, ++ {"-mtls-dialect=trad"}, ++ {"-mtls-dialect=desc"}, ++ {"-mtls-dialect=xyz"}, ++ {"-msimd=lasx"}, ++ {"-msimd=xyz"}, ++ {"-mdouble-float"}, ++ {"-mrelax"}, ++ {"-mstrict-align"}, ++ {"-mlsx"}, ++ {"-mlasx"}, ++ {"-mfrecipe"}, ++ {"-mlam-bh"}, ++ {"-mlamcas"}, ++ {"-mld-seq-sa"}, ++ {"-mno-relax"}, ++ {"-mno-strict-align"}, ++ {"-mno-lsx"}, ++ {"-mno-lasx"}, ++ {"-mno-frecipe"}, ++ {"-mno-lam-bh"}, ++ {"-mno-lamcas"}, ++ {"-mno-ld-seq-sa"}, + {"-mlarge-data-threshold=16"}, + {"-mtune=happybirthday"}, + {"-mstack-overflow"}, +@@ -96,7 +121,13 @@ var badCompilerFlags = [][]string{ + {"-march=@dawn"}, + {"-march=-dawn"}, + {"-mcmodel=@model"}, ++ {"-mfpu=@0"}, ++ {"-mfpu=-0"}, + {"-mlarge-data-threshold=@12"}, ++ {"-mtls-dialect=@gnu"}, ++ {"-mtls-dialect=-gnu"}, ++ {"-msimd=@none"}, ++ {"-msimd=-none"}, + {"-std=@c99"}, + {"-std=-c99"}, + {"-x@c"}, +-- +2.38.1 + diff --git a/golang.spec b/golang.spec index b3d2c41ba6c04a2bcbfae5b0e5decfe1743793d0..7bc84e369af3f698df833d4a8800ddbab3ab4497 100644 --- a/golang.spec +++ b/golang.spec @@ -1,4 +1,4 @@ -%define anolis_release 1 +%define anolis_release 2 # Disable debuginfo packages %global debug_package %{nil} @@ -42,7 +42,11 @@ %endif # Build golang shared objects 
for stdlib +%ifarch loongarch64 +%bcond_with shared +%else %bcond_without shared +%endif # Pre build std lib with -race enabled # Disabled due to 1.20 new cache usage, see 1.20 upstream release notes @@ -73,8 +77,50 @@ Source0: https://go.dev/dl/go%{go_api}%{?go_patch:.%{go_patch}}.src.tar.g # make possible to override default traceback level at build time by setting build tag rpm_crashtraceback Source1: anolis.go -# Exclude for temporary -ExcludeArch: loongarch64 +Patch1: 0001-cmd-link-internal-add-support-for-internal-linking-o.patch +Patch2: 0002-cmd-dist-internal-platform-enable-internal-linking-f.patch +Patch3: 0003-cmd-runtime-enable-race-detector-on-loong64.patch +Patch4: 0004-runtime-delete-on-register-ABI-fallback-path-for-rac.patch +Patch5: 0005-cmd-internal-obj-loong64-remove-unused-register-alia.patch +Patch6: 0006-internal-bytealg-optimize-IndexByte-and-IndexByteStr.patch +Patch7: 0007-internal-bytealg-optimize-memequal-and-memequal_varl.patch +Patch8: 0008-internal-bytealg-optimize-Index-and-IndexString-func.patch +Patch9: 0009-internal-bytealg-optimize-Count-and-CountString-func.patch +Patch10: 0010-internal-bytealg-adjust-the-format-of-assembly-files.patch +Patch11: 0011-cmd-internal-obj-loong64-optimize-immediate-loading.patch +Patch12: 0012-math-big-optimize-addVV-function-for-loong64.patch +Patch13: 0013-math-big-optimize-addVW-function-for-loong64.patch +Patch14: 0014-math-big-optimize-subVV-function-for-loong64.patch +Patch15: 0015-math-big-optimize-subVW-function-for-loong64.patch +Patch16: 0016-math-big-optimize-shlVU-function-for-loong64.patch +Patch17: 0017-math-big-optimize-shrVU-function-for-loong64.patch +Patch18: 0018-math-big-optimize-mulAddVWW-function-for-loong64.patch +Patch19: 0019-math-big-optimize-addMulVVW-function-for-loong64.patch +Patch20: 0020-cmd-compile-fold-constant-shift-with-extension-on-lo.patch +Patch21: 0021-test-codegen-fix-the-matching-instructions-inside-pl.patch +Patch22: 0022-cmd-compile-optimize-shifts-of-int32-and-uint32-on-l.patch +Patch23: 0023-cmd-compile-simplify-bounded-shift-on-loong64.patch +Patch24: 0024-runtime-use-ABIInternal-on-syscall-and-other-sys.stu.patch +Patch25: 0025-runtime-use-correct-memory-barrier-in-exitThread-fun.patch +Patch26: 0026-cmd-internal-obj-loong64-add-V-XV-SEQI-V-XV-.-AND-OR.patch +Patch27: 0027-cmd-internal-obj-loong64-add-V-XV-ADD-SUB-.-B-H-W-D-.patch +Patch28: 0028-cmd-internal-obj-loong64-add-V-XV-ILV-L-H-.-B-H-W-D-.patch +Patch29: 0029-cmd-internal-obj-loong64-add-V-XV-SLL-SRL-SRA-ROTR-I.patch +Patch30: 0030-cmd-internal-obj-loong64-add-V-XV-FSQRT-FRECIP-FRSQR.patch +Patch31: 0031-cmd-internal-obj-loong64-add-V-XV-NEG-B-H-W-V-instru.patch +Patch32: 0032-cmd-internal-obj-loong64-add-V-XV-MUL-B-H-W-V-and-V-.patch +Patch33: 0033-cmd-internal-obj-loong64-add-V-XV-DIV-B-H-W-V-U-and-.patch +Patch34: 0034-cmd-internal-obj-loong64-add-V-XV-BITCLR-BITSET-BITR.patch +Patch35: 0035-crypto-chacha20-add-loong64-SIMD-implementation.patch +Patch36: 0036-internal-bytealg-optimize-Count-String-in-loong64.patch +Patch37: 0037-cmd-internal-obj-cmd-asm-reclassify-32-bit-immediate.patch +Patch38: 0038-crypto-internal-poly1305-implement-function-update-i.patch +Patch39: 0039-runtime-optimize-the-implementation-of-memclrNoHeapP.patch +Patch40: 0040-runtime-race-add-the-implementation-of-atomic.-Or-An.patch +Patch41: 0041-cmd-internal-obj-loong64-add-F-MAXA-MINA-.-S-D-instr.patch +Patch42: 0042-math-implement-func-archExp-and-archExp2-in-assembly.patch +Patch43: 
0043-math-implement-func-archLog-in-assembly-on-loong64.patch +Patch44: 0044-cmd-go-internal-work-allow-a-bunch-of-loong64-specif.patch # The compiler is written in Go. Needs go(1.4+) compiler for build. %if %{with bootstrap} @@ -545,6 +591,13 @@ fi %files docs -f go-docs.list %changelog +* Mon Feb 24 2025 limeidan - 1.24.0-2 +- add internal linker support on loong64 +- optimize the internal/bytealg package on loong64 +- optimize the math/big package on loong64 +- add new instructions support on loong64 +- optimize memory operation function of runtime on loong64 + * Tue Feb 18 2025 gaochang - 1.24.0-1 - update to 1.24.0 @@ -554,7 +607,7 @@ fi * Wed Jul 10 2024 yangxinyu - 1.21.11-1 - update to 1.21.11 fix cve-2024-24789 -* Thu Mon 13 2024 chenguoqi - 1.21.10-2 +* Thu Jun 13 2024 chenguoqi - 1.21.10-2 - add buildmode={plugin,shared} support on linux/loong64 - asan and msan support on linux/loong64 - loong64 disassembler support diff --git a/race_linux_loong64.syso b/race_linux_loong64.syso index 6fdb3bad77751956e4c1ee6c0732ddcc3a7fc3dc..0d2b4946fbf31abc042ea4ee852785cb13cce5a6 100644 Binary files a/race_linux_loong64.syso and b/race_linux_loong64.syso differ